akm-cli 0.6.0 → 0.7.0-rc1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +66 -0
- package/dist/{cli.js → src/cli.js} +672 -29
- package/dist/{commands → src/commands}/config-cli.js +5 -4
- package/dist/src/commands/distill.js +283 -0
- package/dist/src/commands/events.js +108 -0
- package/dist/src/commands/history.js +120 -0
- package/dist/{commands → src/commands}/installed-stashes.js +28 -2
- package/dist/src/commands/proposal.js +119 -0
- package/dist/src/commands/propose.js +171 -0
- package/dist/src/commands/reflect.js +193 -0
- package/dist/{commands → src/commands}/registry-search.js +2 -1
- package/dist/{commands → src/commands}/remember.js +12 -0
- package/dist/{commands → src/commands}/search.js +74 -1
- package/dist/{commands → src/commands}/self-update.js +4 -3
- package/dist/{commands → src/commands}/show.js +67 -2
- package/dist/{core → src/core}/asset-ref.js +5 -5
- package/dist/{core → src/core}/asset-spec.js +12 -0
- package/dist/{core → src/core}/common.js +1 -1
- package/dist/{core → src/core}/config.js +175 -121
- package/dist/{core → src/core}/errors.js +4 -0
- package/dist/src/core/events.js +239 -0
- package/dist/src/core/lesson-lint.js +86 -0
- package/dist/src/core/proposals.js +406 -0
- package/dist/src/core/warn.js +72 -0
- package/dist/{core → src/core}/write-source.js +80 -5
- package/dist/{indexer → src/indexer}/db-search.js +119 -27
- package/dist/{indexer → src/indexer}/db.js +76 -23
- package/dist/{indexer → src/indexer}/file-context.js +0 -3
- package/dist/src/indexer/graph-boost.js +179 -0
- package/dist/src/indexer/graph-extraction.js +212 -0
- package/dist/{indexer → src/indexer}/indexer.js +73 -6
- package/dist/src/indexer/memory-inference.js +263 -0
- package/dist/{indexer → src/indexer}/metadata.js +114 -11
- package/dist/src/integrations/agent/config.js +292 -0
- package/dist/src/integrations/agent/detect.js +94 -0
- package/dist/src/integrations/agent/index.js +17 -0
- package/dist/src/integrations/agent/profiles.js +65 -0
- package/dist/src/integrations/agent/prompts.js +167 -0
- package/dist/src/integrations/agent/spawn.js +221 -0
- package/dist/{integrations → src/integrations}/lockfile.js +0 -26
- package/dist/{llm → src/llm}/client.js +33 -2
- package/dist/src/llm/feature-gate.js +108 -0
- package/dist/src/llm/graph-extract.js +107 -0
- package/dist/src/llm/index-passes.js +35 -0
- package/dist/src/llm/memory-infer.js +86 -0
- package/dist/{output → src/output}/renderers.js +60 -1
- package/dist/src/output/shapes.js +516 -0
- package/dist/{output → src/output}/text.js +447 -4
- package/dist/{registry → src/registry}/build-index.js +14 -4
- package/dist/{registry → src/registry}/factory.js +0 -8
- package/dist/{registry → src/registry}/providers/static-index.js +3 -2
- package/dist/{registry → src/registry}/resolve.js +68 -2
- package/dist/{setup → src/setup}/setup.js +43 -5
- package/dist/{sources → src/sources}/providers/git.js +7 -15
- package/dist/{wiki → src/wiki}/wiki.js +9 -11
- package/dist/tests/add-website-source.test.js +119 -0
- package/dist/tests/agent/agent-config-loader.test.js +70 -0
- package/dist/tests/agent/agent-config.test.js +221 -0
- package/dist/tests/agent/agent-detect.test.js +100 -0
- package/dist/tests/agent/agent-spawn.test.js +234 -0
- package/dist/tests/agent-output.test.js +186 -0
- package/dist/tests/architecture/agent-no-llm-sdk-guard.test.js +103 -0
- package/dist/tests/architecture/agent-spawn-seam.test.js +193 -0
- package/dist/tests/architecture/llm-stateless-seam.test.js +112 -0
- package/dist/tests/asset-ref.test.js +192 -0
- package/dist/tests/asset-registry.test.js +103 -0
- package/dist/tests/asset-spec.test.js +241 -0
- package/dist/tests/bench/attribution.test.js +995 -0
- package/dist/tests/bench/cleanup-sigint.test.js +83 -0
- package/dist/tests/bench/cleanup.js +203 -0
- package/dist/tests/bench/cleanup.test.js +166 -0
- package/dist/tests/bench/cli.js +683 -0
- package/dist/tests/bench/cli.test.js +177 -0
- package/dist/tests/bench/compare.test.js +556 -0
- package/dist/tests/bench/corpus.js +314 -0
- package/dist/tests/bench/corpus.test.js +258 -0
- package/dist/tests/bench/driver.js +346 -0
- package/dist/tests/bench/driver.test.js +443 -0
- package/dist/tests/bench/evolve-metrics.js +179 -0
- package/dist/tests/bench/evolve-metrics.test.js +187 -0
- package/dist/tests/bench/evolve.js +580 -0
- package/dist/tests/bench/evolve.test.js +616 -0
- package/dist/tests/bench/failure-modes.test.js +300 -0
- package/dist/tests/bench/feedback-integrity.test.js +456 -0
- package/dist/tests/bench/leakage.test.js +125 -0
- package/dist/tests/bench/learning-curve.test.js +133 -0
- package/dist/tests/bench/metrics.js +2319 -0
- package/dist/tests/bench/metrics.test.js +1144 -0
- package/dist/tests/bench/no-os-tmpdir-invariant.test.js +43 -0
- package/dist/tests/bench/report.js +1821 -0
- package/dist/tests/bench/report.test.js +989 -0
- package/dist/tests/bench/runner.js +536 -0
- package/dist/tests/bench/runner.test.js +958 -0
- package/dist/tests/bench/search-bridge.test.js +331 -0
- package/dist/tests/bench/tmp.js +41 -0
- package/dist/tests/bench/trajectory.js +116 -0
- package/dist/tests/bench/trajectory.test.js +127 -0
- package/dist/tests/bench/verifier.js +109 -0
- package/dist/tests/bench/verifier.test.js +118 -0
- package/dist/tests/bench/workflow-evaluator.js +557 -0
- package/dist/tests/bench/workflow-evaluator.test.js +421 -0
- package/dist/tests/bench/workflow-spec.js +358 -0
- package/dist/tests/bench/workflow-spec.test.js +363 -0
- package/dist/tests/bench/workflow-trace.js +438 -0
- package/dist/tests/bench/workflow-trace.test.js +254 -0
- package/dist/tests/benchmark-search-quality.js +536 -0
- package/dist/tests/benchmark-suite.js +1441 -0
- package/dist/tests/capture-cli.test.js +112 -0
- package/dist/tests/cli-errors.test.js +203 -0
- package/dist/tests/commands/events.test.js +370 -0
- package/dist/tests/commands/history.test.js +223 -0
- package/dist/tests/commands/import.test.js +103 -0
- package/dist/tests/commands/proposal-cli.test.js +209 -0
- package/dist/tests/commands/reflect-propose-cli.test.js +333 -0
- package/dist/tests/commands/remember.test.js +97 -0
- package/dist/tests/commands/scope-flags.test.js +300 -0
- package/dist/tests/commands/search.test.js +537 -0
- package/dist/tests/commands/show-indexer-parity.test.js +117 -0
- package/dist/tests/commands/show.test.js +294 -0
- package/dist/tests/common.test.js +266 -0
- package/dist/tests/completions.test.js +142 -0
- package/dist/tests/config-cli.test.js +193 -0
- package/dist/tests/config-llm-features.test.js +139 -0
- package/dist/tests/config.test.js +544 -0
- package/dist/tests/contracts/migration-baseline.test.js +43 -0
- package/dist/tests/contracts/reflect-propose-envelope.test.js +139 -0
- package/dist/tests/contracts/spec-helpers.js +46 -0
- package/dist/tests/contracts/v1-spec-section-11-proposal-queue.test.js +228 -0
- package/dist/tests/contracts/v1-spec-section-12-agent-config.test.js +56 -0
- package/dist/tests/contracts/v1-spec-section-13-lesson-type.test.js +34 -0
- package/dist/tests/contracts/v1-spec-section-14-llm-features.test.js +94 -0
- package/dist/tests/contracts/v1-spec-section-4-1-asset-types.test.js +39 -0
- package/dist/tests/contracts/v1-spec-section-4-2-quality-rules.test.js +44 -0
- package/dist/tests/contracts/v1-spec-section-5-configuration.test.js +47 -0
- package/dist/tests/contracts/v1-spec-section-6-orchestration.test.js +40 -0
- package/dist/tests/contracts/v1-spec-section-7-module-layout.test.js +58 -0
- package/dist/tests/contracts/v1-spec-section-8-extension-points.test.js +34 -0
- package/dist/tests/contracts/v1-spec-section-9-4-cli-surface.test.js +75 -0
- package/dist/tests/contracts/v1-spec-section-9-7-llm-agent-boundary.test.js +36 -0
- package/dist/tests/core/write-source.test.js +366 -0
- package/dist/tests/curate-command.test.js +87 -0
- package/dist/tests/db-scoring.test.js +201 -0
- package/dist/tests/db.test.js +654 -0
- package/dist/tests/distill-cli-flag.test.js +208 -0
- package/dist/tests/distill.test.js +515 -0
- package/dist/tests/docker-install.test.js +120 -0
- package/dist/tests/e2e.test.js +1398 -0
- package/dist/tests/embedder.test.js +340 -0
- package/dist/tests/embedding-model-config.test.js +379 -0
- package/dist/tests/feedback-command.test.js +172 -0
- package/dist/tests/file-context.test.js +552 -0
- package/dist/tests/fixtures/scripts/git/summarize-diff.js +9 -0
- package/dist/tests/fixtures/scripts/lint/eslint-check.js +7 -0
- package/dist/tests/fixtures/stashes/load.js +166 -0
- package/dist/tests/fixtures/stashes/load.test.js +88 -0
- package/dist/tests/fixtures/stashes/ranking-baseline/scripts/mem0-search.js +12 -0
- package/dist/tests/frontmatter.test.js +190 -0
- package/dist/tests/fts-field-weighting.test.js +254 -0
- package/dist/tests/fuzzy-search.test.js +230 -0
- package/dist/tests/git-provider-clone.test.js +45 -0
- package/dist/tests/github.test.js +161 -0
- package/dist/tests/graph-boost-ranking.test.js +305 -0
- package/dist/tests/graph-extraction.test.js +282 -0
- package/dist/tests/helpers/usage-events.js +8 -0
- package/dist/tests/index-pass-llm.test.js +161 -0
- package/dist/tests/indexer.test.js +559 -0
- package/dist/tests/info-command.test.js +166 -0
- package/dist/tests/init.test.js +69 -0
- package/dist/tests/install-script.test.js +246 -0
- package/dist/tests/integration/agent-real-profile.test.js +94 -0
- package/dist/tests/issue-36-repro.test.js +304 -0
- package/dist/tests/issues-191-194.test.js +160 -0
- package/dist/tests/lesson-lint.test.js +111 -0
- package/dist/tests/llm-client.test.js +115 -0
- package/dist/tests/llm-feature-gate.test.js +151 -0
- package/dist/tests/llm.test.js +139 -0
- package/dist/tests/lockfile.test.js +216 -0
- package/dist/tests/manifest.test.js +205 -0
- package/dist/tests/markdown.test.js +126 -0
- package/dist/tests/matchers-unit.test.js +189 -0
- package/dist/tests/memory-inference.test.js +299 -0
- package/dist/tests/merge-scoring.test.js +136 -0
- package/dist/tests/metadata.test.js +313 -0
- package/dist/tests/migration-help.test.js +89 -0
- package/dist/tests/origin-resolve.test.js +124 -0
- package/dist/tests/output-baseline.test.js +217 -0
- package/dist/tests/output-shapes-unit.test.js +476 -0
- package/dist/tests/parallel-search.test.js +272 -0
- package/dist/tests/parameter-metadata.test.js +365 -0
- package/dist/tests/paths.test.js +177 -0
- package/dist/tests/progressive-disclosure.test.js +280 -0
- package/dist/tests/proposals.test.js +279 -0
- package/dist/tests/proposed-quality.test.js +271 -0
- package/dist/tests/provider-registry.test.js +32 -0
- package/dist/tests/ranking-regression.test.js +548 -0
- package/dist/tests/reflect-propose.test.js +455 -0
- package/dist/tests/registry-build-index.test.js +378 -0
- package/dist/tests/registry-cli.test.js +290 -0
- package/dist/tests/registry-index-v2.test.js +430 -0
- package/dist/tests/registry-install.test.js +728 -0
- package/dist/tests/registry-providers/parity.test.js +189 -0
- package/dist/tests/registry-providers/skills-sh.test.js +309 -0
- package/dist/tests/registry-providers/static-index.test.js +204 -0
- package/dist/tests/registry-resolve.test.js +126 -0
- package/dist/tests/registry-search.test.js +723 -0
- package/dist/tests/remember-frontmatter.test.js +380 -0
- package/dist/tests/remember-unit.test.js +123 -0
- package/dist/tests/ripgrep-install.test.js +251 -0
- package/dist/tests/ripgrep-resolve.test.js +108 -0
- package/dist/tests/ripgrep.test.js +163 -0
- package/dist/tests/save-command.test.js +94 -0
- package/dist/tests/save-trust-qa-fixes.test.js +270 -0
- package/dist/tests/scoring-pipeline.test.js +648 -0
- package/dist/tests/search-include-proposed-cli.test.js +118 -0
- package/dist/tests/self-update.test.js +442 -0
- package/dist/tests/semantic-search-e2e.test.js +512 -0
- package/dist/tests/semantic-status.test.js +471 -0
- package/dist/tests/setup-run.integration.js +877 -0
- package/dist/tests/setup-wizard.test.js +198 -0
- package/dist/tests/setup.test.js +131 -0
- package/dist/tests/source-add.test.js +11 -0
- package/dist/tests/source-clone.test.js +254 -0
- package/dist/tests/source-manage.test.js +366 -0
- package/dist/tests/source-providers/filesystem.test.js +82 -0
- package/dist/tests/source-providers/git.test.js +252 -0
- package/dist/tests/source-providers/website.test.js +128 -0
- package/dist/tests/source-qa-fixes.test.js +268 -0
- package/dist/tests/source-registry.test.js +350 -0
- package/dist/tests/source-resolve.test.js +100 -0
- package/dist/tests/source-source.test.js +221 -0
- package/dist/tests/source.test.js +533 -0
- package/dist/tests/tar-utils-scan.test.js +73 -0
- package/dist/tests/toggle-components.test.js +73 -0
- package/dist/tests/usage-telemetry.test.js +265 -0
- package/dist/tests/utility-scoring.test.js +558 -0
- package/dist/tests/vault-load-error.test.js +78 -0
- package/dist/tests/vault-qa-fixes.test.js +194 -0
- package/dist/tests/vault.test.js +429 -0
- package/dist/tests/vector-search.test.js +608 -0
- package/dist/tests/walker.test.js +252 -0
- package/dist/tests/wave2-cluster-bc.test.js +228 -0
- package/dist/tests/wave2-cluster-d.test.js +180 -0
- package/dist/tests/wave2-cluster-e.test.js +179 -0
- package/dist/tests/wiki-qa-fixes.test.js +270 -0
- package/dist/tests/wiki.test.js +529 -0
- package/dist/tests/workflow-cli.test.js +271 -0
- package/dist/tests/workflow-markdown.test.js +171 -0
- package/dist/tests/workflow-path-escape.test.js +132 -0
- package/dist/tests/workflow-qa-fixes.test.js +377 -0
- package/dist/tests/workflows/indexer-rejection.test.js +213 -0
- package/docs/README.md +8 -0
- package/docs/migration/release-notes/0.7.0.md +244 -0
- package/package.json +2 -2
- package/dist/core/warn.js +0 -27
- package/dist/output/shapes.js +0 -212
- /package/dist/{commands → src/commands}/completions.js +0 -0
- /package/dist/{commands → src/commands}/curate.js +0 -0
- /package/dist/{commands → src/commands}/info.js +0 -0
- /package/dist/{commands → src/commands}/init.js +0 -0
- /package/dist/{commands → src/commands}/install-audit.js +0 -0
- /package/dist/{commands → src/commands}/migration-help.js +0 -0
- /package/dist/{commands → src/commands}/source-add.js +0 -0
- /package/dist/{commands → src/commands}/source-clone.js +0 -0
- /package/dist/{commands → src/commands}/source-manage.js +0 -0
- /package/dist/{commands → src/commands}/vault.js +0 -0
- /package/dist/{core → src/core}/asset-registry.js +0 -0
- /package/dist/{core → src/core}/frontmatter.js +0 -0
- /package/dist/{core → src/core}/markdown.js +0 -0
- /package/dist/{core → src/core}/paths.js +0 -0
- /package/dist/{indexer → src/indexer}/manifest.js +0 -0
- /package/dist/{indexer → src/indexer}/matchers.js +0 -0
- /package/dist/{indexer → src/indexer}/search-fields.js +0 -0
- /package/dist/{indexer → src/indexer}/search-source.js +0 -0
- /package/dist/{indexer → src/indexer}/semantic-status.js +0 -0
- /package/dist/{indexer → src/indexer}/usage-events.js +0 -0
- /package/dist/{indexer → src/indexer}/walker.js +0 -0
- /package/dist/{integrations → src/integrations}/github.js +0 -0
- /package/dist/{llm → src/llm}/embedder.js +0 -0
- /package/dist/{llm → src/llm}/embedders/cache.js +0 -0
- /package/dist/{llm → src/llm}/embedders/local.js +0 -0
- /package/dist/{llm → src/llm}/embedders/remote.js +0 -0
- /package/dist/{llm → src/llm}/embedders/types.js +0 -0
- /package/dist/{llm → src/llm}/metadata-enhance.js +0 -0
- /package/dist/{output → src/output}/cli-hints.js +0 -0
- /package/dist/{output → src/output}/context.js +0 -0
- /package/dist/{registry → src/registry}/create-provider-registry.js +0 -0
- /package/dist/{registry → src/registry}/origin-resolve.js +0 -0
- /package/dist/{registry → src/registry}/providers/index.js +0 -0
- /package/dist/{registry → src/registry}/providers/skills-sh.js +0 -0
- /package/dist/{registry → src/registry}/providers/types.js +0 -0
- /package/dist/{registry → src/registry}/types.js +0 -0
- /package/dist/{setup → src/setup}/detect.js +0 -0
- /package/dist/{setup → src/setup}/ripgrep-install.js +0 -0
- /package/dist/{setup → src/setup}/ripgrep-resolve.js +0 -0
- /package/dist/{setup → src/setup}/steps.js +0 -0
- /package/dist/{sources → src/sources}/include.js +0 -0
- /package/dist/{sources → src/sources}/provider-factory.js +0 -0
- /package/dist/{sources → src/sources}/provider.js +0 -0
- /package/dist/{sources → src/sources}/providers/filesystem.js +0 -0
- /package/dist/{sources → src/sources}/providers/index.js +0 -0
- /package/dist/{sources → src/sources}/providers/install-types.js +0 -0
- /package/dist/{sources → src/sources}/providers/npm.js +0 -0
- /package/dist/{sources → src/sources}/providers/provider-utils.js +0 -0
- /package/dist/{sources → src/sources}/providers/sync-from-ref.js +0 -0
- /package/dist/{sources → src/sources}/providers/tar-utils.js +0 -0
- /package/dist/{sources → src/sources}/providers/website.js +0 -0
- /package/dist/{sources → src/sources}/resolve.js +0 -0
- /package/dist/{sources → src/sources}/types.js +0 -0
- /package/dist/{templates → src/templates}/wiki-templates.js +0 -0
- /package/dist/{version.js → src/version.js} +0 -0
- /package/dist/{workflows → src/workflows}/authoring.js +0 -0
- /package/dist/{workflows → src/workflows}/cli.js +0 -0
- /package/dist/{workflows → src/workflows}/db.js +0 -0
- /package/dist/{workflows → src/workflows}/document-cache.js +0 -0
- /package/dist/{workflows → src/workflows}/parser.js +0 -0
- /package/dist/{workflows → src/workflows}/renderer.js +0 -0
- /package/dist/{workflows → src/workflows}/runs.js +0 -0
- /package/dist/{workflows → src/workflows}/schema.js +0 -0
- /package/dist/{workflows → src/workflows}/validator.js +0 -0
|
@@ -0,0 +1,536 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* akm-bench K-seed runner (spec §5 + §6).
|
|
3
|
+
*
|
|
4
|
+
* `runUtility(options)` is the single entry point used by both the CLI
|
|
5
|
+
* dispatcher (`tests/bench/cli.ts utility`) and unit tests. It expands the
|
|
6
|
+
* caller's `(tasks × arms × seeds)` cartesian product, calls `runOne` for
|
|
7
|
+
* each triple, splices the trajectory record back in, and returns a
|
|
8
|
+
* `UtilityRunReport` that `renderUtilityReport` can stamp into JSON +
|
|
9
|
+
* markdown.
|
|
10
|
+
*
|
|
11
|
+
* Per-(arm, seed) isolation:
|
|
12
|
+
* • Workspace: each (task, arm, seed) gets a fresh tmp dir seeded from the
|
|
13
|
+
* task's `workspace/` template so runs cannot pollute each other.
|
|
14
|
+
* • Stash: only the `akm` arm materialises a stash via `loadFixtureStash`.
|
|
15
|
+
* We materialise once per task (the stash content is identical across
|
|
16
|
+
* the K seeds) and reuse it.
|
|
17
|
+
*
|
|
18
|
+
* Cleanup: every tmp resource is wrapped in `try/finally`. We never leak
|
|
19
|
+
* tmp dirs even on harness exceptions.
|
|
20
|
+
*/
|
|
21
|
+
import { createHash } from "node:crypto";
|
|
22
|
+
import fs from "node:fs";
|
|
23
|
+
import path from "node:path";
|
|
24
|
+
import { warn } from "../../src/core/warn";
|
|
25
|
+
import { computeFixtureContentHash, loadFixtureStash } from "../fixtures/stashes/load";
|
|
26
|
+
import { registerCleanup } from "./cleanup";
|
|
27
|
+
import { computeTaskCorpusHash, readTaskBody } from "./corpus";
|
|
28
|
+
import { runOne } from "./driver";
|
|
29
|
+
import { aggregateCorpus, aggregateFailureModes, aggregatePerTask, aggregateTrajectory, classifyFailureMode, computeCorpusDelta, computePerAssetAttribution, computePerTaskDelta, computeSearchBridge, extractAssetLoads, extractGoldRanks, } from "./metrics";
|
|
30
|
+
import { resolveGitBranch, resolveGitCommit } from "./report";
|
|
31
|
+
import { benchMkdtemp } from "./tmp";
|
|
32
|
+
import { computeTrajectory } from "./trajectory";
|
|
33
|
+
import { evaluateRunAgainstAllSpecs, } from "./workflow-evaluator";
|
|
34
|
+
import { loadAllWorkflowSpecs } from "./workflow-spec";
|
|
35
|
+
import { normalizeRunToTrace } from "./workflow-trace";
|
|
36
|
+
/**
 * Workflow-spec directory used by `runUtility` when the caller does not set
 * `options.workflowsDir`: `../fixtures/bench/workflows`, resolved against the
 * directory this module lives in.
 */
const DEFAULT_WORKFLOWS_DIR = path.resolve(__dirname, "../fixtures/bench/workflows");
|
|
43
|
+
/**
 * Run every (task × arm × seed) combination through `runOneIsolated` and hand
 * the collected results to `buildReport`.
 *
 * Infrastructure problems this function can see are downgraded to warnings
 * instead of aborting the run: a failure to load workflow specs, or a failure
 * to load a task's fixture stash, is appended to the report's `warnings`.
 *
 * Options read here: `tasks`, `arms`, `model`, `spawn`, `seedsPerArm`
 * (default 5), `budgetTokens` (default 30000), `budgetWallMs` (default
 * 120000), `slice` (default "all"), `materialiseStash` (default true),
 * `workflowsDir`, `stashDirByFixture`, `includeSynthetic`, `buildPrompt`.
 *
 * @param options Run configuration (see the list above).
 * @returns The value produced by `buildReport` for the collected runs.
 */
export async function runUtility(options) {
    const seedsPerArm = options.seedsPerArm ?? 5;
    const budgetTokens = options.budgetTokens ?? 30000;
    const budgetWallMs = options.budgetWallMs ?? 120000;
    const slice = options.slice ?? "all";
    const materialiseStash = options.materialiseStash ?? true;

    // Accumulators shared by every run; all of them end up in `buildReport`.
    const grouped = new Map();
    const warnings = [];
    const goldRankRecords = [];
    const workflowChecks = [];

    // #257: workflow specs are read once, before any run starts. An empty
    // `workflowsDir` string skips loading altogether. A load failure is only
    // a warning: workflow evaluation is best-effort and must not take the
    // rest of the bench run down with it.
    const workflowSpecs = [];
    const workflowsDir = options.workflowsDir ?? DEFAULT_WORKFLOWS_DIR;
    if (workflowsDir.length > 0) {
        try {
            workflowSpecs.push(...loadAllWorkflowSpecs(workflowsDir));
        }
        catch (err) {
            const reason = err instanceof Error ? err.message : String(err);
            warnings.push(`workflow specs: failed to load from "${workflowsDir}": ${reason}`);
            warn(`[runUtility] workflow specs unavailable: ${reason}`);
        }
    }

    for (const task of options.tasks) {
        const runsByArm = new Map();
        grouped.set(task.id, runsByArm);

        // A caller-supplied directory for this task's fixture takes the place
        // of `loadFixtureStash`, so nothing is materialised when one exists.
        const fixtureOverrideDir = options.stashDirByFixture?.get(task.stash);

        // The fixture stash is loaded at most once per task and shared by all
        // seeds of the akm arm.
        let stash;
        const needsFixtureStash = options.arms.includes("akm") && materialiseStash && !fixtureOverrideDir;
        if (needsFixtureStash) {
            try {
                stash = loadFixtureStash(task.stash, { skipIndex: true });
            }
            catch (err) {
                const stashError = err instanceof Error ? err.message : String(err);
                warnings.push(`task ${task.id}: stash "${task.stash}" failed to load: ${stashError}`);
            }
        }

        // SIGINT/SIGTERM trap (#267): register the stash cleanup so a signal
        // arriving mid-run still removes what was just materialised.
        let deregisterStash = () => { };
        if (stash) {
            const liveStash = stash;
            deregisterStash = registerCleanup(() => {
                try {
                    liveStash.cleanup();
                }
                catch {
                    /* swallow */
                }
            });
        }

        // #261: `includeSynthetic` appends the synthetic arm unless the caller
        // already listed it. Without the flag the caller's array is used as-is.
        const wantsExtraSynthetic = options.includeSynthetic && !options.arms.includes("synthetic");
        const armsForTask = wantsExtraSynthetic ? [...options.arms, "synthetic"] : options.arms;

        // Stash directory forwarded to the run. Only the akm arm gets one.
        // Precedence (issue #251): task-level override, then the
        // `stashDirByFixture` override, then the materialised fixture stash,
        // then a `__no-stash__` placeholder when materialisation is disabled.
        const resolveStashDir = (arm) => {
            if (arm !== "akm") {
                return undefined;
            }
            if (task.stashDirOverride) {
                return task.stashDirOverride;
            }
            if (fixtureOverrideDir) {
                return fixtureOverrideDir;
            }
            if (stash) {
                return stash.stashDir;
            }
            if (!materialiseStash) {
                return path.join(task.taskDir, "__no-stash__");
            }
            return undefined;
        };

        try {
            for (const arm of armsForTask) {
                const armRuns = [];
                runsByArm.set(arm, armRuns);
                const isAkmArm = arm === "akm";

                for (let seed = 0; seed < seedsPerArm; seed += 1) {
                    const stashDir = resolveStashDir(arm);

                    // Prompt override (#267). `options.buildPrompt` is called
                    // on every seed iteration with the same (task, arm) pair;
                    // `undefined` leaves the prompt unset. #261: the synthetic
                    // arm falls back to the built-in scratch-notes prompt when
                    // the caller supplies none.
                    let promptOverride = options.buildPrompt?.(task, arm);
                    if (promptOverride === undefined && arm === "synthetic") {
                        promptOverride = buildUtilitySyntheticPrompt(task.id);
                    }

                    const isolatedArgs = {
                        task,
                        arm,
                        seed,
                        model: options.model,
                        stashDir,
                        budgetTokens,
                        budgetWallMs,
                        spawn: options.spawn,
                        warnings,
                    };
                    if (promptOverride !== undefined) {
                        isolatedArgs.prompt = promptOverride;
                    }
                    const run = await runOneIsolated(isolatedArgs);
                    armRuns.push(run);

                    // §6.7 search-pipeline bridge: gold ranks are recorded
                    // only for akm-arm runs of tasks that carry a gold ref.
                    if (isAkmArm && task.goldRef) {
                        const searches = extractGoldRanks(run, task.goldRef);
                        goldRankRecords.push({
                            taskId: task.id,
                            arm,
                            seed,
                            outcome: run.outcome,
                            goldRef: task.goldRef,
                            searches,
                        });
                    }

                    // #257: akm-arm runs are checked against the full spec
                    // list; other arms are not evaluated here.
                    if (isAkmArm && workflowSpecs.length > 0) {
                        const trace = normalizeRunToTrace(run, { warnings });
                        const runCtx = {
                            arm: run.arm,
                            taskId: run.taskId,
                            seed: run.seed,
                            outcome: run.outcome,
                        };
                        const taskMetadata = buildWorkflowTaskMetadata(task, trace);
                        workflowChecks.push(...evaluateRunAgainstAllSpecs(trace, workflowSpecs, runCtx, taskMetadata));
                    }
                }
            }
        }
        finally {
            // Deregister first so a signal arriving during this block does not
            // run the stash cleanup a second time.
            deregisterStash();
            stash?.cleanup();
        }
    }

    return buildReport({
        grouped,
        options,
        seedsPerArm,
        slice,
        warnings,
        goldRankRecords,
        workflowChecks,
    });
}
|
|
235
|
+
/**
 * Build the task metadata object passed to the workflow evaluator.
 *
 * The result always has a `flags` object with two booleans:
 * `search_has_relevant_result` (from `searchResultIncludesGoldRef`) and
 * `task_has_tests` (from `taskHasTests`). A `goldRef` key is added, ahead of
 * `flags`, only when `task.goldRef` is not `undefined`.
 */
function buildWorkflowTaskMetadata(task, trace) {
    const flags = {
        search_has_relevant_result: searchResultIncludesGoldRef(trace, task.goldRef),
        task_has_tests: taskHasTests(task),
    };
    const metadata = {};
    if (task.goldRef !== undefined) {
        metadata.goldRef = task.goldRef;
    }
    metadata.flags = flags;
    return metadata;
}
|
|
245
|
+
/**
 * Report whether any `akm_search` event in the trace lists `goldRef` among
 * its `resultRefs`.
 *
 * Returns `false` without touching the trace when `goldRef` is falsy. Events
 * of other types, and search events without `resultRefs`, never match.
 */
function searchResultIncludesGoldRef(trace, goldRef) {
    if (goldRef) {
        for (const event of trace.events) {
            if (event.type === "akm_search" && event.resultRefs?.includes(goldRef)) {
                return true;
            }
        }
    }
    return false;
}
|
|
256
|
+
/**
 * Decide whether a task ships tests.
 *
 * A task whose `verifier` is "pytest" counts immediately. Otherwise the task
 * has tests when `<taskDir>/tests` exists and contains at least one entry
 * whose name ends in `.py` or `.sh`. A missing or unreadable directory yields
 * `false`.
 */
function taskHasTests(task) {
    if (task.verifier === "pytest") {
        return true;
    }
    const testsDir = path.join(task.taskDir, "tests");
    if (fs.existsSync(testsDir)) {
        try {
            const testSuffixes = [".py", ".sh"];
            return fs
                .readdirSync(testsDir)
                .some((entry) => testSuffixes.some((suffix) => entry.endsWith(suffix)));
        }
        catch {
            // Listing failed (e.g. `tests` is not a directory): report no tests.
        }
    }
    return false;
}
|
|
269
|
+
/**
 * Run a single (task, arm, seed) triple inside its own throwaway workspace.
 *
 * A workspace directory is created with `benchMkdtemp`, populated by
 * `seedWorkspace` from the task directory, and passed to `runOne`. The result
 * is returned with three fields added: `trajectory`, `assetsLoaded`, and
 * `failureMode` (computed for the akm arm, `null` for every other arm). The
 * workspace is removed on the way out whether the run returned or threw.
 */
async function runOneIsolated(args) {
    const { task, arm } = args;
    const workspace = benchMkdtemp(`akm-bench-ws-${task.domain}-`);

    // Signal trap: make sure an external signal still removes the workspace.
    // The handler is deregistered in `finally` before the synchronous removal
    // so the directory is not removed twice.
    const deregisterWorkspace = registerCleanup(() => {
        try {
            fs.rmSync(workspace, { recursive: true, force: true });
        }
        catch {
            /* swallow */
        }
    });

    try {
        seedWorkspace(task.taskDir, workspace);

        // Optional keys are added only when set, in the same position they
        // would occupy in a single object literal (before `warnings`).
        const runOptions = {
            track: "utility",
            arm,
            taskId: task.id,
            workspace,
            model: args.model,
            seed: args.seed,
            budgetTokens: args.budgetTokens,
            budgetWallMs: args.budgetWallMs,
            verifier: task.verifier,
            taskDir: task.taskDir,
        };
        if (task.expectedMatch) {
            runOptions.expectedMatch = task.expectedMatch;
        }
        if (args.stashDir) {
            runOptions.stashDir = args.stashDir;
        }
        if (args.spawn) {
            runOptions.spawn = args.spawn;
        }
        if (args.prompt !== undefined) {
            runOptions.prompt = args.prompt;
        }
        runOptions.warnings = args.warnings;

        const result = await runOne(runOptions);

        // Post-processing on the finished run: trajectory metric first, then
        // the per-asset load list (spec §6.5).
        const trajectory = computeTrajectory({ goldRef: task.goldRef }, result, {
            warnings: args.warnings,
        });
        const assetsLoaded = extractAssetLoads(result);

        // Failure-mode label (§6.6) is computed for the akm arm only; other
        // arms carry `null`.
        let failureMode = null;
        if (arm === "akm") {
            failureMode = classifyFailureMode(task, { ...result, trajectory, assetsLoaded });
        }

        return { ...result, trajectory, assetsLoaded, failureMode };
    }
    finally {
        deregisterWorkspace();
        fs.rmSync(workspace, { recursive: true, force: true });
    }
}
|
|
326
|
+
/**
 * Populate a per-run directory from the task's `workspace/` template.
 *
 * When `<taskDir>/workspace` does not exist nothing is copied and `dest` is
 * left untouched, so the run starts from whatever `dest` already is.
 *
 * @param taskDir - task directory that may contain a `workspace/` folder.
 * @param dest - destination directory handed to `copyDirRecursive`.
 */
function seedWorkspace(taskDir, dest) {
    const template = path.join(taskDir, "workspace");
    if (fs.existsSync(template)) {
        copyDirRecursive(template, dest);
    }
}
|
|
337
|
+
/**
 * Build the default prompt for the synthetic arm (#261).
 *
 * The text tells the model that no AKM stash is available and asks it to
 * write a short scratchpad of its own skills and steps before solving the
 * task. The result is six lines joined with `\n`, the first of which names
 * the task.
 *
 * Exported for tests.
 *
 * @param taskId - identifier interpolated into the first line.
 * @returns the prompt text.
 */
export function buildUtilitySyntheticPrompt(taskId) {
    const header = `Task: ${taskId}`;
    const contract = [
        "Arm: synthetic (Bring Your Own Skills)",
        "No akm stash is available; AKM_STASH_DIR is intentionally absent. Before solving",
        "the task, write a short scratchpad of the skills and steps you intend to use,",
        "then proceed. Cite the scratchpad in your trace so the verifier can attribute",
        "the approach to your own reasoning rather than retrieved guidance.",
    ];
    return [header, ...contract].join("\n");
}
|
|
360
|
+
/**
 * Recursively copy the contents of `src` into `dest`.
 *
 * `dest` is created (including parents) if needed. Entries named `.gitkeep`
 * are skipped. Directories are descended into and regular files are copied;
 * any entry that is neither (as reported by its dirent) is ignored.
 *
 * @param src - directory to read.
 * @param dest - directory to create and fill.
 */
function copyDirRecursive(src, dest) {
    fs.mkdirSync(dest, { recursive: true });
    for (const dirent of fs.readdirSync(src, { withFileTypes: true })) {
        const { name } = dirent;
        if (name !== ".gitkeep") {
            const from = path.join(src, name);
            const to = path.join(dest, name);
            if (dirent.isDirectory()) {
                copyDirRecursive(from, to);
            }
            else if (dirent.isFile()) {
                fs.copyFileSync(from, to);
            }
        }
    }
}
|
|
374
|
+
/**
 * Assemble the utility report from the collected runs.
 *
 * Inputs read from `args`:
 *  - `options`: `tasks`, `includeSynthetic`, `model`, and optional `branch`,
 *    `commit`, `timestamp` overrides (git / current time are used otherwise).
 *  - `grouped`: looked up by task id, then by arm name (`"noakm"`, `"akm"`,
 *    `"synthetic"`) via `.get`, yielding that arm's runs.
 *  - `workflowChecks`, `goldRankRecords`, `slice`, `seedsPerArm`.
 *  - `warnings`: appended to when a fixture cannot be hashed, and embedded in
 *    the report.
 *
 * @param args - report inputs as listed above.
 * @returns the report object, with `perAsset` attached as the final step.
 */
function buildReport(args) {
    const { options } = args;
    const includeSynth = options.includeSynthetic === true;
    const tasks = [];
    const noakmPerTask = {};
    const akmPerTask = {};
    const synthPerTask = {};
    const akmRunsAll = [];
    const allRuns = [];
    // #257: index workflow checks by taskId so a per-task mean compliance can
    // be attached to each task entry below.
    const checksByTask = new Map();
    for (const check of args.workflowChecks) {
        const bucket = checksByTask.get(check.taskId);
        if (bucket) {
            bucket.push(check);
        }
        else {
            checksByTask.set(check.taskId, [check]);
        }
    }
    for (const task of options.tasks) {
        const taskRuns = args.grouped.get(task.id);
        const noakmRuns = taskRuns?.get("noakm") ?? [];
        const akmRuns = taskRuns?.get("akm") ?? [];
        // #261: synthetic-arm runs are only consulted when the caller opted in.
        // When not opted in, `synthPerTask[task.id]` stays unset so consumers
        // can tell "arm not run" apart from "arm ran with 0 passes".
        const synthRuns = includeSynth ? (taskRuns?.get("synthetic") ?? []) : [];
        const noakmMetrics = aggregatePerTask(noakmRuns);
        const akmMetrics = aggregatePerTask(akmRuns);
        const delta = computePerTaskDelta(noakmMetrics, akmMetrics);
        // Aggregate the synthetic arm once and reuse the same object for both
        // the per-task index and the task entry, exactly as noakm/akm do.
        const synthMetrics = includeSynth ? aggregatePerTask(synthRuns) : undefined;
        noakmPerTask[task.id] = noakmMetrics;
        akmPerTask[task.id] = akmMetrics;
        if (includeSynth) {
            synthPerTask[task.id] = synthMetrics;
        }
        akmRunsAll.push(...akmRuns);
        // Arm order is noakm, synthetic (when enabled), akm so the persisted
        // runs array is deterministic across reruns (#249). `synthRuns` is
        // empty when the arm is disabled, so it contributes nothing then.
        allRuns.push(...noakmRuns, ...synthRuns, ...akmRuns);
        // #257: per-task workflow compliance is the mean `score` over checks
        // whose status is not `not_applicable`. It stays undefined when there
        // are no such checks, so "not measured" differs from "measured at 0".
        const applicableChecks = (checksByTask.get(task.id) ?? []).filter((c) => c.status !== "not_applicable");
        let workflowCompliance;
        if (applicableChecks.length > 0) {
            const total = applicableChecks.reduce((sum, c) => sum + c.score, 0);
            workflowCompliance = total / applicableChecks.length;
        }
        tasks.push({
            id: task.id,
            noakm: noakmMetrics,
            akm: akmMetrics,
            delta,
            ...(includeSynth ? { synthetic: synthMetrics } : {}),
            ...(workflowCompliance !== undefined ? { workflowCompliance } : {}),
        });
    }
    const aggregateNoakm = aggregateCorpus(noakmPerTask);
    const aggregateAkm = aggregateCorpus(akmPerTask);
    const aggregateDelta = computeCorpusDelta(aggregateNoakm, aggregateAkm);
    // #261: the synthetic-arm aggregate is built only when the caller opted in.
    const aggregateSynth = includeSynth ? aggregateCorpus(synthPerTask) : undefined;
    const trajectoryAkm = aggregateTrajectory(akmRunsAll);
    // Failure-mode aggregate: only akm-arm runs carrying a truthy
    // `failureMode` contribute an entry.
    const failureEntries = [];
    for (const run of akmRunsAll) {
        if (run.failureMode) {
            failureEntries.push({ taskId: run.taskId, mode: run.failureMode });
        }
    }
    const failureModes = aggregateFailureModes(failureEntries);
    const domains = new Set(options.tasks.map((t) => t.domain)).size;
    const branch = options.branch ?? resolveGitBranch();
    const commit = options.commit ?? resolveGitCommit();
    const timestamp = options.timestamp ?? new Date().toISOString();
    // Search-pipeline bridge, computed once over the whole corpus.
    const searchBridge = computeSearchBridge({ goldRankRecords: args.goldRankRecords });
    // #250: stamp corpus + fixture identity into the report. `map` already
    // returns a fresh array, so sorting it does not disturb `options.tasks`.
    const selectedTaskIds = options.tasks.map((t) => t.id).sort();
    const taskBodies = new Map();
    for (const t of options.tasks) {
        taskBodies.set(t.id, readTaskBody(t.taskDir));
    }
    const taskCorpusHash = computeTaskCorpusHash(selectedTaskIds, taskBodies);
    const fixtureNames = [...new Set(options.tasks.map((t) => t.stash))].sort();
    const fixtures = {};
    for (const name of fixtureNames) {
        try {
            fixtures[name] = computeFixtureContentHash(name);
        }
        catch (err) {
            // A fixture that cannot be hashed is recorded as a warning and the
            // remaining fixtures are still stamped.
            args.warnings.push(`corpus stamp: cannot hash fixture "${name}": ${err instanceof Error ? err.message : String(err)}`);
        }
    }
    // Combined fixture hash over `<name>\0<hash>\0` records; stable because
    // `fixtureNames` is sorted. Fixtures that failed to hash contribute "".
    const combinedHash = createHash("sha256");
    for (const name of fixtureNames) {
        combinedHash.update(name);
        combinedHash.update("\0");
        combinedHash.update(fixtures[name] ?? "");
        combinedHash.update("\0");
    }
    const fixtureContentHash = combinedHash.digest("hex");
    const baseReport = {
        timestamp,
        branch,
        commit,
        model: options.model,
        corpus: {
            domains,
            tasks: options.tasks.length,
            slice: args.slice,
            seedsPerArm: args.seedsPerArm,
            selectedTaskIds,
            taskCorpusHash,
            fixtures,
            fixtureContentHash,
        },
        aggregateNoakm,
        aggregateAkm,
        aggregateDelta,
        ...(aggregateSynth ? { aggregateSynth } : {}),
        trajectoryAkm,
        failureModes,
        tasks,
        warnings: args.warnings,
        akmRuns: akmRunsAll,
        allRuns,
        taskMetadata: options.tasks,
        goldRankRecords: args.goldRankRecords,
        searchBridge,
        workflowChecks: args.workflowChecks,
    };
    // Per-asset attribution is post-processing over the report just built.
    baseReport.perAsset = computePerAssetAttribution(baseReport);
    return baseReport;
}
|