npm - akm-cli - Versions diffs - 0.7.0-rc1 → 0.7.0 - Mend

akm-cli 0.7.0-rc1 → 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (83) hide show

package/dist/src/cli.js +100 -16
package/dist/src/commands/config-cli.js +42 -0
package/dist/src/commands/history.js +78 -7
package/dist/src/commands/registry-search.js +69 -6
package/dist/src/commands/search.js +30 -3
package/dist/src/commands/show.js +29 -0
package/dist/src/commands/source-add.js +5 -1
package/dist/src/commands/source-manage.js +7 -1
package/dist/src/core/config.js +28 -0
package/dist/src/indexer/db-search.js +1 -0
package/dist/src/indexer/indexer.js +16 -2
package/dist/src/indexer/matchers.js +1 -1
package/dist/src/indexer/search-source.js +4 -2
package/dist/src/integrations/agent/profiles.js +1 -1
package/dist/src/integrations/agent/spawn.js +67 -16
package/dist/src/integrations/github.js +9 -3
package/dist/src/llm/embedders/remote.js +37 -3
package/dist/src/output/cli-hints.js +15 -2
package/dist/src/output/renderers.js +3 -1
package/dist/src/output/shapes.js +8 -1
package/dist/src/output/text.js +156 -3
package/dist/src/registry/build-index.js +5 -4
package/dist/src/registry/providers/static-index.js +3 -1
package/dist/src/setup/setup.js +9 -0
package/dist/src/wiki/wiki.js +54 -6
package/dist/src/workflows/runs.js +37 -3
package/dist/tests/architecture/agent-no-llm-sdk-guard.test.js +1 -1
package/dist/tests/bench/attribution.test.js +24 -23
package/dist/tests/bench/cleanup.js +31 -0
package/dist/tests/bench/cli.js +366 -31
package/dist/tests/bench/cli.test.js +282 -14
package/dist/tests/bench/corpus.js +3 -0
package/dist/tests/bench/corpus.test.js +10 -10
package/dist/tests/bench/doctor.js +525 -0
package/dist/tests/bench/driver.js +77 -22
package/dist/tests/bench/driver.test.js +142 -1
package/dist/tests/bench/environment.js +233 -0
package/dist/tests/bench/environment.test.js +199 -0
package/dist/tests/bench/evolve.js +67 -0
package/dist/tests/bench/evolve.test.js +12 -4
package/dist/tests/bench/failure-modes.test.js +52 -3
package/dist/tests/bench/feedback-integrity.test.js +3 -2
package/dist/tests/bench/leakage.test.js +105 -2
package/dist/tests/bench/learning-curve.test.js +3 -2
package/dist/tests/bench/metrics.js +102 -26
package/dist/tests/bench/metrics.test.js +10 -4
package/dist/tests/bench/opencode-config.js +194 -0
package/dist/tests/bench/opencode-config.test.js +370 -0
package/dist/tests/bench/report.js +73 -9
package/dist/tests/bench/report.test.js +59 -10
package/dist/tests/bench/run-config.js +355 -0
package/dist/tests/bench/run-config.test.js +298 -0
package/dist/tests/bench/run-curate-test.js +32 -0
package/dist/tests/bench/run-failing-tasks.js +56 -0
package/dist/tests/bench/run-full-bench.js +51 -0
package/dist/tests/bench/run-items36-targeted.js +69 -0
package/dist/tests/bench/run-nano-quick.js +42 -0
package/dist/tests/bench/run-waveg-targeted.js +62 -0
package/dist/tests/bench/runner.js +257 -94
package/dist/tests/bench/tmp.js +90 -0
package/dist/tests/bench/trajectory.js +2 -2
package/dist/tests/bench/verifier.js +6 -1
package/dist/tests/bench/workflow-spec.js +11 -24
package/dist/tests/bench/workflow-spec.test.js +1 -1
package/dist/tests/bench/workflow-trace.js +34 -0
package/dist/tests/cli-errors.test.js +1 -0
package/dist/tests/commands/history.test.js +195 -0
package/dist/tests/config.test.js +25 -0
package/dist/tests/e2e.test.js +23 -2
package/dist/tests/fixtures/stashes/load.js +1 -1
package/dist/tests/fixtures/stashes/load.test.js +11 -2
package/dist/tests/indexer.test.js +12 -1
package/dist/tests/output-baseline.test.js +2 -1
package/dist/tests/output-shapes-unit.test.js +3 -1
package/dist/tests/registry-build-index.test.js +17 -1
package/dist/tests/registry-providers/static-index.test.js +34 -0
package/dist/tests/registry-search.test.js +200 -0
package/dist/tests/remember-frontmatter.test.js +11 -13
package/dist/tests/source-qa-fixes.test.js +18 -0
package/dist/tests/source-registry.test.js +3 -3
package/dist/tests/source-source.test.js +61 -1
package/dist/tests/workflow-qa-fixes.test.js +18 -0
package/package.json +1 -1

package/dist/tests/bench/runner.js CHANGED Viewed

@@ -26,13 +26,89 @@ import { computeFixtureContentHash, loadFixtureStash } from "../fixtures/stashes
 import { registerCleanup } from "./cleanup";
 import { computeTaskCorpusHash, readTaskBody } from "./corpus";
 import { runOne } from "./driver";
+import { validateFixtureCorpus } from "./environment";
 import { aggregateCorpus, aggregateFailureModes, aggregatePerTask, aggregateTrajectory, classifyFailureMode, computeCorpusDelta, computePerAssetAttribution, computePerTaskDelta, computeSearchBridge, extractAssetLoads, extractGoldRanks, } from "./metrics";
 import { resolveGitBranch, resolveGitCommit } from "./report";
-import { benchMkdtemp } from "./tmp";
+import { benchMkdtemp, benchTmpRoot } from "./tmp";
 import { computeTrajectory } from "./trajectory";
 import { evaluateRunAgainstAllSpecs, } from "./workflow-evaluator";
 import { loadAllWorkflowSpecs } from "./workflow-spec";
 import { normalizeRunToTrace } from "./workflow-trace";
+/**
+ * Checkpoint write interval: write a partial file every N completed runs.
+ * The per-run worker checks `completedRuns % CHECKPOINT_INTERVAL === 0`, so a
+ * run count that is not a multiple of N leaves the last few runs out of the
+ * most recent partial file.
+ */
+const CHECKPOINT_INTERVAL = 5;
+/**
+ * Partial file max age before cleanup: 24 hours in milliseconds.
+ * `cleanupOldPartials` compares this against `now - stat.mtimeMs`.
+ */
+const PARTIAL_MAX_AGE_MS = 24 * 60 * 60 * 1000;
+/**
+ * Emit a one-line progress update to stderr after each (task, arm, seed)
+ * completes. Goes to stderr even when --json is passed so operators always
+ * have a heartbeat signal during long runs.
+ *
+ * Format: `[<completed>/<total>] <taskId> <arm> <outcome> <wallclockSeconds>s`
+ *
+ * @param completed Count of finished runs, printed as the numerator.
+ * @param total Expected number of runs, printed as the denominator.
+ * @param run Finished run; only `taskId`, `arm`, `outcome` and `wallclockMs`
+ *   are read.
+ */
+function emitProgress(completed, total, run) {
+    const secs = Math.round(run.wallclockMs / 1000); // whole seconds, rounded to nearest
+    process.stderr.write(`[${completed}/${total}] ${run.taskId} ${run.arm} ${run.outcome} ${secs}s\n`);
+}
+/**
+ * Write a partial checkpoint file under `${AKM_CACHE_DIR}/bench/`.
+ * The file contains the runs completed so far plus a `partial: true` marker
+ * and a `summary.total_runs_completed` counter. Old partial files (>24h)
+ * are not cleaned up here — that is done at startup via `cleanupOldPartials`.
+ *
+ * The directory is whatever `benchTmpRoot()` returns. The file name is
+ * derived from `timestamp` alone (with `:` and `.` replaced by `-`), so
+ * calls that pass the same timestamp overwrite the same file rather than
+ * creating a new one per checkpoint.
+ *
+ * Only a summary of each run is serialised: `taskId`, `arm`, `seed`,
+ * `model`, `outcome` and `wallclockMs`, written under snake_case keys.
+ *
+ * Any error raised while building or writing the file is swallowed; the
+ * function returns nothing.
+ *
+ * @param runs Array of run results accumulated so far.
+ * @param timestamp String used both in the file name and in the envelope.
+ */
+function writePartialCheckpoint(runs, timestamp) {
+    try {
+        const root = benchTmpRoot();
+        const filename = `bench-partial-${timestamp.replace(/[:.]/g, "-")}.json`;
+        const outPath = path.join(root, filename);
+        const envelope = {
+            partial: true,
+            summary: {
+                total_runs_completed: runs.length,
+            },
+            timestamp,
+            runs: runs.map((r) => ({
+                task_id: r.taskId,
+                arm: r.arm,
+                seed: r.seed,
+                model: r.model,
+                outcome: r.outcome,
+                wallclock_ms: r.wallclockMs,
+            })),
+        };
+        fs.writeFileSync(outPath, JSON.stringify(envelope, null, 2), "utf8");
+    }
+    catch {
+        // Checkpoint writes are best-effort — never abort a run for a write failure.
+    }
+}
+/**
+ * Remove partial checkpoint files older than 24 hours from the bench tmp root.
+ * Called once at the start of `runUtility` to reap orphans from prior crashed runs.
+ *
+ * Only directory entries whose name starts with `bench-partial-` are
+ * considered; age is measured from the file's mtime against
+ * `PARTIAL_MAX_AGE_MS`. All filesystem calls are synchronous. A failure on
+ * one entry is ignored and the scan continues; a failure to resolve or list
+ * the directory ends the cleanup silently.
+ */
+function cleanupOldPartials() {
+    try {
+        const root = benchTmpRoot();
+        const now = Date.now(); // sampled once so every entry is judged against the same instant
+        const entries = fs.readdirSync(root);
+        for (const entry of entries) {
+            if (!entry.startsWith("bench-partial-"))
+                continue;
+            const fullPath = path.join(root, entry);
+            try {
+                const stat = fs.statSync(fullPath);
+                if (now - stat.mtimeMs > PARTIAL_MAX_AGE_MS) {
+                    fs.unlinkSync(fullPath);
+                }
+            }
+            catch {
+                /* swallow per-file errors */
+            }
+        }
+    }
+    catch {
+        /* swallow — cleanup is best-effort */
+    }
+}
 /**
  * Default workflows directory. Can be overridden by callers (tests) via
  * `RunUtilityOptions.workflowsDir`. Specs in this directory are loaded ONCE
@@ -40,6 +116,16 @@ import { normalizeRunToTrace } from "./workflow-trace";
  * `applies_to` so we don't I/O in the hot loop.
  */
 const DEFAULT_WORKFLOWS_DIR = path.resolve(__dirname, "..", "fixtures", "bench", "workflows");
+/**
+ * Run `items` in batches of `n` concurrently, calling `fn` for each item.
+ * Batches are executed sequentially; within each batch all items run with
+ * `Promise.all`. This gives bounded concurrency without a full work-queue.
+ *
+ * A batch finishes only when its slowest item does, so the next batch does
+ * not start until every item of the current one has settled successfully.
+ * If any `fn` call rejects, `Promise.all` rejects straight away: the error
+ * propagates to the caller, later batches are never started, and the other
+ * promises of the failing batch are left running un-awaited.
+ *
+ * `n` must be a positive integer. With `n === 0` the index never advances
+ * and the loop does not terminate; with `NaN` the first slice is empty and
+ * the loop exits without calling `fn` at all.
+ * NOTE(review): the clamp in `runUtility` is built from `Math.min` /
+ * `Math.max`, which pass `NaN` through — confirm callers never supply a
+ * non-numeric `parallel`.
+ *
+ * @param items Array of work items.
+ * @param n Batch size (maximum number of concurrent `fn` calls).
+ * @param fn Async worker; invoked through `Array.prototype.map`, so it also
+ *   receives the batch-local index and the batch array as extra arguments.
+ * @returns Promise that resolves with no value once every batch completed.
+ */
+async function runInBatches(items, n, fn) {
+    for (let i = 0; i < items.length; i += n) {
+        await Promise.all(items.slice(i, i + n).map(fn));
+    }
+}
 /**
  * Run K seeds × len(arms) × len(tasks) and return the §13.3 report.
  *
@@ -47,6 +133,11 @@ const DEFAULT_WORKFLOWS_DIR = path.resolve(__dirname, "..", "fixtures", "bench",
  * every failure path into a RunResult, so the runner only has to worry
  * about its own infrastructure (stash materialisation, workspace copy).
  * Those failures are recorded as `harness_error` runs.
+ *
+ * When `options.parallel > 1`, work items are batched and run concurrently
+ * via `runInBatches`. The shared `warnings`, `goldRankRecords`, and
+ * `workflowChecks` arrays are updated atomically at the end of each item so
+ * no JS-level races occur (Node/Bun is single-threaded).
  */
 export async function runUtility(options) {
     const seedsPerArm = options.seedsPerArm ?? 5;
@@ -54,9 +145,39 @@ export async function runUtility(options) {
     const budgetWallMs = options.budgetWallMs ?? 120000;
     const slice = options.slice ?? "all";
     const materialiseStash = options.materialiseStash ?? true;
+    // Clamp parallel to [1, 8].
+    const parallel = Math.min(8, Math.max(1, options.parallel ?? 1));
+    if (parallel > 4 && !options.forceParallel) {
+        process.stderr.write(`bench: --parallel ${parallel} exceeds 4; high concurrency may overwhelm local providers. ` +
+            `Pass --force-parallel to suppress this warning.\n`);
+    }
+    // Clean up orphaned partial files from prior crashed runs (best-effort).
+    cleanupOldPartials();
     const grouped = new Map();
     const warnings = [];
+    // Validate all task stash references before starting any work. Missing
+    // fixtures produce harness_error at run time; better to surface them loudly
+    // at startup with the fixture name than to discover them per-seed mid-run.
+    if (materialiseStash && options.arms.includes("akm")) {
+        const { missing } = validateFixtureCorpus(options.tasks);
+        for (const [fixture, taskIds] of missing) {
+            const w = `fixture "${fixture}" missing MANIFEST.json — tasks will harness_error: ${taskIds.join(", ")}`;
+            process.stderr.write(`bench: WARNING: ${w}\n`);
+            warnings.push(w);
+        }
+    }
     const goldRankRecords = [];
+    // Progress tracking: compute total run count upfront so progress lines show
+    // `[7/40]` rather than an unbounded counter.
+    const armsForProgress = options.includeSynthetic
+        ? [...new Set([...options.arms, "synthetic"])]
+        : options.arms;
+    const totalRuns = options.tasks.length * armsForProgress.length * seedsPerArm;
+    let completedRuns = 0;
+    // Partial checkpoint accumulator: collects all RunResults as they land so
+    // we can write a partial envelope periodically without keeping duplicates.
+    const allCompletedRuns = [];
+    const runTimestamp = options.timestamp ?? new Date().toISOString();
     // #257: load workflow specs ONCE per runUtility call. Skipped when the
     // caller passes an empty `workflowsDir` string (test escape hatch). Errors
     // are surfaced as warnings — workflow evaluation is best-effort and a
@@ -89,7 +210,7 @@ export async function runUtility(options) {
         let stashError;
         if (options.arms.includes("akm") && materialiseStash && !overrideStashDir) {
             try {
-                stash = loadFixtureStash(task.stash, { skipIndex: true });
+                stash = loadFixtureStash(task.stash);
             }
             catch (err) {
                 stashError = err instanceof Error ? err.message : String(err);
@@ -121,99 +242,130 @@ export async function runUtility(options) {
                 return options.arms;
             return [...options.arms, "synthetic"];
         })();
-        try {
-            for (const arm of armsForTask) {
-                const armRuns = [];
-                taskRuns.set(arm, armRuns);
-                for (let seed = 0; seed < seedsPerArm; seed += 1) {
-                    // Resolve the stashDir we'll forward to the agent. The akm arm
-                    // always carries a stashDir so AKM_STASH_DIR is set in the child
-                    // env — this is how downstream tooling (and the trajectory parser
-                    // event-stream lookup) distinguishes arms. When the operator opted
-                    // out of fixture materialisation (tests, dry-run), we still pass a
-                    // stable placeholder so the env keys are wired correctly.
-                    let stashDir;
-                    if (arm === "akm") {
-                        // Resolution order (must match the issue #251 acceptance criteria):
-                        //   1. Per-task explicit override (used by `runMaskedCorpus` to
-                        //      point at a tmp stash with one asset removed). Highest
-                        //      priority because attribution correctness depends on this
-                        //      branch never being shadowed by the `__no-stash__`
-                        //      placeholder fallback.
-                        //   2. Per-(task, arm)-call `stashDirByFixture` override (Phase
-                        //      3 evolve persistence).
-                        //   3. Per-task materialised fixture stash from `loadFixtureStash`.
-                        //   4. `materialiseStash: false` placeholder so AKM_STASH_DIR is
-                        //      still wired into the child env.
-                        if (task.stashDirOverride)
-                            stashDir = task.stashDirOverride;
-                        else if (overrideStashDir)
-                            stashDir = overrideStashDir;
-                        else if (stash)
-                            stashDir = stash.stashDir;
-                        else if (!materialiseStash)
-                            stashDir = path.join(task.taskDir, "__no-stash__");
-                    }
-                    // Build the prompt-override (#267). The builder is invoked once
-                    // per (task, arm) — seeds share a prompt. `undefined` keeps the
-                    // driver's default prompt in play.
-                    //
-                    // #261: the synthetic arm has a scratch-notes prompt contract —
-                    // the model is told no AKM stash is available and instructed to
-                    // write/use its own procedural notes. When the caller does not
-                    // supply a `buildPrompt` override for the synthetic arm we fall
-                    // back to a built-in scratch-notes prompt so the contract is
-                    // honoured by every utility-track caller, not just `runEvolve`.
-                    let promptOverride = options.buildPrompt?.(task, arm);
-                    if (promptOverride === undefined && arm === "synthetic") {
-                        promptOverride = buildUtilitySyntheticPrompt(task.id);
-                    }
-                    const run = await runOneIsolated({
-                        task,
-                        arm,
-                        seed,
-                        model: options.model,
-                        stashDir,
-                        budgetTokens,
-                        budgetWallMs,
-                        spawn: options.spawn,
-                        warnings,
-                        ...(promptOverride !== undefined ? { prompt: promptOverride } : {}),
-                    });
-                    armRuns.push(run);
-                    // §6.7 search-pipeline bridge: only the akm arm consults the stash,
-                    // and we only attribute ranks for tasks with a gold ref. Both
-                    // guards mean noakm and gold-less runs are silently excluded.
-                    if (arm === "akm" && task.goldRef) {
-                        const searches = extractGoldRanks(run, task.goldRef);
-                        goldRankRecords.push({
-                            taskId: task.id,
-                            arm,
-                            seed,
-                            outcome: run.outcome,
-                            goldRef: task.goldRef,
-                            searches,
-                        });
-                    }
-                    // #257: evaluate the akm-arm run against every workflow spec. The
-                    // evaluator's `specApplies` filter handles applicability (arm,
-                    // domain, gold ref, repeated-failures threshold), so we hand it the
-                    // entire spec list and append whatever it returns. noakm/synthetic
-                    // arms are not evaluated — workflow specs target the akm arm.
-                    if (arm === "akm" && workflowSpecs.length > 0) {
-                        const trace = normalizeRunToTrace(run, { warnings });
-                        const runCtx = {
-                            arm: run.arm,
-                            taskId: run.taskId,
-                            seed: run.seed,
-                            outcome: run.outcome,
-                        };
-                        const taskMetadata = buildWorkflowTaskMetadata(task, trace);
-                        const checks = evaluateRunAgainstAllSpecs(trace, workflowSpecs, runCtx, taskMetadata);
-                        workflowChecks.push(...checks);
-                    }
-                }
+        const workItems = [];
+        for (const arm of armsForTask) {
+            taskRuns.set(arm, []);
+            for (let seed = 0; seed < seedsPerArm; seed += 1) {
+                workItems.push({ arm, seed });
+            }
+        }
+        // Per-run worker: resolves stash/prompt, executes runOneIsolated, then
+        // splices the result into the shared accumulators. Because Bun/Node is
+        // single-threaded these splices are race-free even across concurrent
+        // awaits — only one microtask runs at a time between yield points.
+        const runItem = async ({ arm, seed }) => {
+            // Resolve the stashDir we'll forward to the agent. The akm arm
+            // always carries a stashDir so AKM_STASH_DIR is set in the child
+            // env — this is how downstream tooling (and the trajectory parser
+            // event-stream lookup) distinguishes arms. When the operator opted
+            // out of fixture materialisation (tests, dry-run), we still pass a
+            // stable placeholder so the env keys are wired correctly.
+            let stashDir;
+            if (arm === "akm") {
+                // Resolution order (must match the issue #251 acceptance criteria):
+                //   1. Per-task explicit override (used by `runMaskedCorpus` to
+                //      point at a tmp stash with one asset removed). Highest
+                //      priority because attribution correctness depends on this
+                //      branch never being shadowed by the `__no-stash__`
+                //      placeholder fallback.
+                //   2. Per-(task, arm)-call `stashDirByFixture` override (Phase
+                //      3 evolve persistence).
+                //   3. Per-task materialised fixture stash from `loadFixtureStash`.
+                //   4. `materialiseStash: false` placeholder so AKM_STASH_DIR is
+                //      still wired into the child env.
+                if (task.stashDirOverride)
+                    stashDir = task.stashDirOverride;
+                else if (overrideStashDir)
+                    stashDir = overrideStashDir;
+                else if (stash)
+                    stashDir = stash.stashDir;
+                else if (!materialiseStash)
+                    stashDir = path.join(task.taskDir, "__no-stash__");
+            }
+            // Build the prompt-override (#267). The builder is invoked once
+            // per (task, arm) — seeds share a prompt. `undefined` keeps the
+            // driver's default prompt in play.
+            //
+            // #261: the synthetic arm has a scratch-notes prompt contract —
+            // the model is told no AKM stash is available and instructed to
+            // write/use its own procedural notes. When the caller does not
+            // supply a `buildPrompt` override for the synthetic arm we fall
+            // back to a built-in scratch-notes prompt so the contract is
+            // honoured by every utility-track caller, not just `runEvolve`.
+            let promptOverride = options.buildPrompt?.(task, arm);
+            if (promptOverride === undefined && arm === "synthetic") {
+                promptOverride = buildUtilitySyntheticPrompt(task.id);
+            }
+            // Collect per-run warnings separately and merge after the run so
+            // concurrent runs don't interleave partial warning sequences.
+            const runWarnings = [];
+            const run = await runOneIsolated({
+                task,
+                arm,
+                seed,
+                model: options.model,
+                stashDir,
+                budgetTokens,
+                budgetWallMs,
+                spawn: options.spawn,
+                warnings: runWarnings,
+                ...(promptOverride !== undefined ? { prompt: promptOverride } : {}),
+                ...(options.opencodeProviders ? { opencodeProviders: options.opencodeProviders } : {}),
+                ...(stash?.indexCacheHome ? { indexCacheHome: stash.indexCacheHome } : {}),
+            });
+            // Merge per-run warnings into the shared array.
+            if (runWarnings.length > 0)
+                warnings.push(...runWarnings);
+            taskRuns.get(arm)?.push(run);
+            // Emit a compact progress line to stderr (unconditional — even under
+            // --json so operators have a heartbeat during long runs).
+            completedRuns += 1;
+            emitProgress(completedRuns, totalRuns, run);
+            // Accumulate for partial checkpointing.
+            allCompletedRuns.push(run);
+            if (completedRuns % CHECKPOINT_INTERVAL === 0) {
+                writePartialCheckpoint(allCompletedRuns, runTimestamp);
             }
+            // §6.7 search-pipeline bridge: only the akm arm consults the stash,
+            // and we only attribute ranks for tasks with a gold ref. Both
+            // guards mean noakm and gold-less runs are silently excluded.
+            if (arm === "akm" && task.goldRef) {
+                const searches = extractGoldRanks(run, task.goldRef);
+                goldRankRecords.push({
+                    taskId: task.id,
+                    arm,
+                    seed,
+                    outcome: run.outcome,
+                    goldRef: task.goldRef,
+                    searches,
+                });
+            }
+            // #257: evaluate the akm-arm run against every workflow spec. The
+            // evaluator's `specApplies` filter handles applicability (arm,
+            // domain, gold ref, repeated-failures threshold), so we hand it the
+            // entire spec list and append whatever it returns. noakm/synthetic
+            // arms are not evaluated — workflow specs target the akm arm.
+            if (arm === "akm" && workflowSpecs.length > 0) {
+                const trace = normalizeRunToTrace(run, {
+                    warnings: runWarnings,
+                    harness: {
+                        agentStartedTs: run.startedAt,
+                        agentFinishedTs: run.finishedAt,
+                    },
+                });
+                const runCtx = {
+                    arm: run.arm,
+                    taskId: run.taskId,
+                    seed: run.seed,
+                    outcome: run.outcome,
+                };
+                const taskMetadata = buildWorkflowTaskMetadata(task, trace);
+                const checks = evaluateRunAgainstAllSpecs(trace, workflowSpecs, runCtx, taskMetadata);
+                workflowChecks.push(...checks);
+            }
+        };
+        try {
+            await runInBatches(workItems, parallel, runItem);
         }
         finally {
             // Deregister BEFORE running cleanup so a SIGINT arriving during this
@@ -289,6 +441,7 @@ async function runOneIsolated(args) {
             track: "utility",
             arm: args.arm,
             taskId: args.task.id,
+            taskTitle: args.task.title,
             workspace,
             model: args.model,
             seed: args.seed,
@@ -297,10 +450,13 @@ async function runOneIsolated(args) {
             verifier: args.task.verifier,
             taskDir: args.task.taskDir,
             ...(args.task.expectedMatch ? { expectedMatch: args.task.expectedMatch } : {}),
+            ...(args.task.akmKeywords ? { akmKeywords: args.task.akmKeywords } : {}),
             ...(args.stashDir ? { stashDir: args.stashDir } : {}),
             ...(args.spawn ? { spawn: args.spawn } : {}),
             ...(args.prompt !== undefined ? { prompt: args.prompt } : {}),
             warnings: args.warnings,
+            ...(args.opencodeProviders ? { opencodeProviders: args.opencodeProviders } : {}),
+            ...(args.indexCacheHome ? { indexCacheHome: args.indexCacheHome } : {}),
         };
         const result = await runOne(runOptions);
         // Splice in the trajectory metric. The driver always returns
@@ -532,5 +688,12 @@ function buildReport(args) {
     // we just collected. This is the §6.5 "free" diagnostic — it runs on every
     // utility invocation, no extra spawns.
     baseReport.perAsset = computePerAssetAttribution(baseReport);
+    // Stamp the optional baseline pass-rate map onto the report so the
+    // renderer surfaces a `vs base` column in markdown and a
+    // `baseline_by_task_id` field in JSON. Additive — when the caller did
+    // not pass a baseline the report shape is byte-identical to before.
+    if (args.options.baselineByTaskId) {
+        baseReport.baselineByTaskId = { ...args.options.baselineByTaskId };
+    }
     return baseReport;
 }

package/dist/tests/bench/tmp.js CHANGED Viewed

@@ -39,3 +39,93 @@ export function benchTmpRoot() {
 export function benchMkdtemp(prefix) {
     return fs.mkdtempSync(path.join(benchTmpRoot(), prefix));
 }
+// ── PID file ────────────────────────────────────────────────────────────────
+/**
+ * Absolute path to the bench PID file: `${AKM_CACHE_DIR}/bench/bench.pid`.
+ *
+ * Built by joining `bench.pid` onto the directory returned by
+ * `benchTmpRoot()`; this function itself does not touch the filesystem.
+ */
+export function benchPidPath() {
+    return path.join(benchTmpRoot(), "bench.pid");
+}
+/**
+ * Write `process.pid` to `bench.pid`.
+ *
+ * If a stale PID file exists and the referenced process is no longer running,
+ * it is removed with a warning before writing the new one.
+ *
+ * Returns a cleanup function that removes the PID file. Call it in a
+ * `finally` block so the file is removed on both clean exit and exceptions.
+ *
+ * The write is unconditional: an existing file is replaced whether or not
+ * the stale check fired. In particular, a file that cannot be read, a file
+ * whose content does not parse as a number, and a file naming a process that
+ * is still running are all overwritten without any message.
+ * NOTE(review): overwriting a live PID means two concurrent bench processes
+ * are not detected here — confirm that is intended.
+ *
+ * Every filesystem error (read, remove, write) is swallowed, so the returned
+ * cleanup function is produced even when nothing was written.
+ *
+ * @returns Zero-argument function that deletes `bench.pid` only if its
+ *   trimmed content still equals this process's PID; it never throws.
+ */
+export function writeBenchPid() {
+    const pidPath = benchPidPath();
+    // Check for an existing PID file and warn if stale.
+    if (fs.existsSync(pidPath)) {
+        let existingPid;
+        try {
+            const raw = fs.readFileSync(pidPath, "utf8").trim();
+            existingPid = Number.parseInt(raw, 10);
+        }
+        catch {
+            // Unreadable — treat as stale. `existingPid` stays undefined, so the
+            // warn-and-remove branch below is skipped and the later write simply
+            // replaces the file.
+        }
+        if (existingPid !== undefined && Number.isFinite(existingPid) && !isPidRunning(existingPid)) {
+            // Stale PID — warn and remove.
+            process.stderr.write(`bench: removing stale PID file for PID ${existingPid} (process not running)\n`);
+            try {
+                fs.rmSync(pidPath, { force: true });
+            }
+            catch {
+                /* best-effort */
+            }
+        }
+    }
+    try {
+        fs.writeFileSync(pidPath, String(process.pid), "utf8");
+    }
+    catch {
+        /* best-effort — PID file is diagnostic, not critical */
+    }
+    return () => {
+        try {
+            // Only remove if it still contains our PID (guard against races).
+            const current = fs.readFileSync(pidPath, "utf8").trim();
+            if (current === String(process.pid)) {
+                fs.rmSync(pidPath, { force: true });
+            }
+        }
+        catch {
+            /* best-effort */
+        }
+    };
+}
+/**
+ * Read the PID from `bench.pid`. Returns `undefined` when the file does not
+ * exist or cannot be parsed.
+ *
+ * The content is trimmed and parsed with `Number.parseInt(raw, 10)`, so a
+ * leading run of digits followed by other characters still yields a number.
+ * Zero and negative values are rejected and reported as `undefined`, as is
+ * any error raised while reading the file.
+ *
+ * @returns The positive integer PID stored in the file, or `undefined`.
+ */
+export function readBenchPid() {
+    const pidPath = benchPidPath();
+    if (!fs.existsSync(pidPath))
+        return undefined;
+    try {
+        const raw = fs.readFileSync(pidPath, "utf8").trim();
+        const n = Number.parseInt(raw, 10);
+        return Number.isFinite(n) && n > 0 ? n : undefined; // NaN, 0 and negatives all map to undefined
+    }
+    catch {
+        return undefined;
+    }
+}
+/**
+ * Return `true` when the process with the given PID is running on this host.
+ * Uses `process.kill(pid, 0)` — signal 0 is a no-op probe that throws ESRCH
+ * when the process does not exist and EPERM when it exists but is owned by
+ * another user (in which case it IS running).
+ *
+ * Any thrown error whose `code` is not `"EPERM"` — including errors that
+ * carry no `code` at all — is reported as "not running".
+ *
+ * NOTE(review): `pid` is forwarded to `process.kill` without a range check.
+ * On POSIX, zero and negative values address process groups rather than a
+ * single process, so the probe may succeed for them; `readBenchPid` filters
+ * to `n > 0`, but `writeBenchPid` passes its parsed value straight through —
+ * confirm callers only supply positive integers.
+ *
+ * @param pid Process ID to probe.
+ * @returns `true` if the probe succeeded or failed with EPERM, else `false`.
+ */
+export function isPidRunning(pid) {
+    try {
+        process.kill(pid, 0);
+        return true;
+    }
+    catch (err) {
+        const code = err.code;
+        // EPERM means the process exists but we don't have permission to signal it.
+        return code === "EPERM";
+    }
+}

package/dist/tests/bench/trajectory.js CHANGED Viewed

@@ -48,8 +48,8 @@ function computeCorrectAssetLoaded(task, runResult, opts) {
         return null;
     const ref = task.goldRef;
     // Search the events stream for any tool-call event that carries the ref.
-    // akm itself does not emit an event for `show`, but third parties might,
-    // and the field is forward-compatible.
+    // akm show emits an event to events.jsonl, so this path is the primary
+    // detection route when the structured event stream is available.
     for (const event of runResult.events) {
         const refField = event.ref;
         if (typeof refField === "string" && matchesRef(refField, ref))

package/dist/tests/bench/verifier.js CHANGED Viewed

@@ -79,7 +79,12 @@ export async function runVerifier(taskDir, workspace, kind, config) {
         return runProcess(["bash", script], workspace, resolveSpawn(config));
     }
     if (kind === "pytest") {
-        return runProcess(["pytest", "-q", "--tb=line"], workspace, resolveSpawn(config));
+        // Test files live at <taskDir>/tests/, not inside the workspace copy.
+        // Pass the absolute path so pytest discovers them while running with
+        // cwd=workspace (which lets relative paths like pathlib.Path("file.yml") work).
+        const testsDir = path.join(taskDir, "tests");
+        const testArgs = fs.existsSync(testsDir) ? [testsDir] : [];
+        return runProcess(["pytest", "-q", "--tb=line", ...testArgs], workspace, resolveSpawn(config));
     }
     if (kind === "regex") {
         const pattern = config?.expectedMatch;

package/dist/tests/bench/workflow-spec.js CHANGED Viewed

@@ -9,9 +9,8 @@
  *   - `loadWorkflowSpec(path, root?)` — parses + validates one file
  *   - `loadAllWorkflowSpecs(dir)` — walks a workflows directory
  *
- * Event names are validated against a HARDCODED set in this file. Once
- * #254 lands, #256 will reconcile by importing the source-of-truth set
- * from `workflow-trace.ts`. Until then this set is the contract.
+ * Event names are validated against `WORKFLOW_TRACE_EVENT_NAMES` imported from
+ * `workflow-trace.ts` — single source of truth, no dual-maintenance hazard.
  *
  * Asset refs (e.g. `gold_ref`) are validated via `parseAssetRef` from
  * `src/core/asset-ref.ts` — never reinvent ref validation.
@@ -20,31 +19,19 @@ import { readdirSync, readFileSync, statSync } from "node:fs";
 import path from "node:path";
 import { parse as parseYaml } from "yaml";
 import { parseAssetRef } from "../../src/core/asset-ref";
-// ── Event-name set (hardcoded; reconcile with #254 in wave 3) ──────────────
+import { WORKFLOW_TRACE_EVENT_NAMES } from "./workflow-trace";
+// ── Event-name set (derived from workflow-trace.ts — single source of truth) ─
 /**
- * Hardcoded event-name allowlist. Mirrors the WorkflowTraceEvent.kind set
- * specified in the #254 brief.
+ * Allowlist of known event names, derived from `WORKFLOW_TRACE_EVENT_NAMES` in
+ * `workflow-trace.ts`. Using the exported runtime Set eliminates the dual-
+ * maintenance hazard: add a new event type once in `workflow-trace.ts` and
+ * both the normalizer and the spec validator see it automatically.
  *
  * `first_workspace_write` is a synthetic marker (the first `workspace_write`
  * for a run) and is included so specs can talk about it directly.
  */
-export const KNOWN_EVENT_NAMES = Object.freeze([
-    "agent_started",
-    "akm_search",
-    "akm_show",
-    "akm_feedback",
-    "akm_reflect",
-    "akm_distill",
-    "akm_propose",
-    "akm_proposal_accept",
-    "workspace_read",
-    "workspace_write",
-    "test_run",
-    "verifier_run",
-    "agent_finished",
-    "first_workspace_write",
-]);
-const EVENT_NAME_SET = new Set(KNOWN_EVENT_NAMES);
+export const KNOWN_EVENT_NAMES = WORKFLOW_TRACE_EVENT_NAMES; // NOTE(review): this export was previously a frozen array and is now consumed as a Set (`.has`, spread) — confirm no other importer still relies on array methods such as `.join`, `.includes` or indexing.
+const EVENT_NAME_SET = KNOWN_EVENT_NAMES; // same object under the old local name, so `isKnownEvent` keeps calling `EVENT_NAME_SET.has`
 function isKnownEvent(name) {
     return typeof name === "string" && EVENT_NAME_SET.has(name);
 }
@@ -96,7 +83,7 @@ function requireNumber(obj, key, specPath) {
 }
 function validateEventName(name, specPath, where) {
     if (!isKnownEvent(name)) {
-        throw new WorkflowSpecError(`Unknown event name "${String(name)}" in ${where}. ` + `Allowed: ${KNOWN_EVENT_NAMES.join(", ")}`, specPath);
+        throw new WorkflowSpecError(`Unknown event name "${String(name)}" in ${where}. ` + `Allowed: ${[...KNOWN_EVENT_NAMES].join(", ")}`, specPath);
     }
     return name;
 }

package/dist/tests/bench/workflow-spec.test.js CHANGED Viewed

@@ -17,7 +17,7 @@ const REQUIRED_SPECS = [
     "akm-feedback-after-use",
     "akm-negative-feedback-on-failure",
     "akm-reflect-after-repeated-failure",
-    "akm-proposal-review-before-accept",
+    "akm-workflow-followed",
 ];
 // ── Scratch directory helpers ──────────────────────────────────────────────
 let scratch;