akm-cli 0.6.1 → 0.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +66 -0
- package/dist/{cli.js → src/cli.js} +712 -34
- package/dist/{commands → src/commands}/config-cli.js +47 -4
- package/dist/src/commands/distill.js +283 -0
- package/dist/src/commands/events.js +108 -0
- package/dist/src/commands/history.js +191 -0
- package/dist/{commands → src/commands}/installed-stashes.js +1 -1
- package/dist/src/commands/proposal.js +119 -0
- package/dist/src/commands/propose.js +171 -0
- package/dist/src/commands/reflect.js +193 -0
- package/dist/{commands → src/commands}/registry-search.js +71 -7
- package/dist/{commands → src/commands}/remember.js +12 -0
- package/dist/{commands → src/commands}/search.js +104 -4
- package/dist/{commands → src/commands}/self-update.js +4 -3
- package/dist/{commands → src/commands}/show.js +73 -0
- package/dist/{commands → src/commands}/source-add.js +5 -1
- package/dist/{commands → src/commands}/source-manage.js +7 -1
- package/dist/{core → src/core}/asset-ref.js +5 -5
- package/dist/{core → src/core}/asset-spec.js +12 -0
- package/dist/{core → src/core}/common.js +1 -1
- package/dist/{core → src/core}/config.js +203 -121
- package/dist/{core → src/core}/errors.js +4 -0
- package/dist/src/core/events.js +239 -0
- package/dist/src/core/lesson-lint.js +86 -0
- package/dist/src/core/proposals.js +406 -0
- package/dist/src/core/warn.js +72 -0
- package/dist/{core → src/core}/write-source.js +80 -5
- package/dist/{indexer → src/indexer}/db-search.js +114 -24
- package/dist/{indexer → src/indexer}/db.js +76 -23
- package/dist/{indexer → src/indexer}/file-context.js +0 -3
- package/dist/src/indexer/graph-boost.js +179 -0
- package/dist/src/indexer/graph-extraction.js +212 -0
- package/dist/{indexer → src/indexer}/indexer.js +88 -7
- package/dist/{indexer → src/indexer}/matchers.js +1 -1
- package/dist/src/indexer/memory-inference.js +263 -0
- package/dist/{indexer → src/indexer}/metadata.js +111 -3
- package/dist/{indexer → src/indexer}/search-source.js +4 -2
- package/dist/src/integrations/agent/config.js +292 -0
- package/dist/src/integrations/agent/detect.js +94 -0
- package/dist/src/integrations/agent/index.js +17 -0
- package/dist/src/integrations/agent/profiles.js +65 -0
- package/dist/src/integrations/agent/prompts.js +167 -0
- package/dist/src/integrations/agent/spawn.js +272 -0
- package/dist/{integrations → src/integrations}/github.js +9 -3
- package/dist/{integrations → src/integrations}/lockfile.js +0 -26
- package/dist/{llm → src/llm}/client.js +33 -2
- package/dist/{llm → src/llm}/embedders/remote.js +37 -3
- package/dist/src/llm/feature-gate.js +108 -0
- package/dist/src/llm/graph-extract.js +107 -0
- package/dist/src/llm/index-passes.js +35 -0
- package/dist/src/llm/memory-infer.js +86 -0
- package/dist/{output → src/output}/cli-hints.js +15 -2
- package/dist/{output → src/output}/renderers.js +63 -2
- package/dist/src/output/shapes.js +523 -0
- package/dist/src/output/text.js +1116 -0
- package/dist/{registry → src/registry}/build-index.js +19 -8
- package/dist/{registry → src/registry}/factory.js +0 -8
- package/dist/{registry → src/registry}/providers/static-index.js +6 -3
- package/dist/{registry → src/registry}/resolve.js +68 -2
- package/dist/{setup → src/setup}/setup.js +52 -5
- package/dist/{sources → src/sources}/providers/git.js +7 -15
- package/dist/{wiki → src/wiki}/wiki.js +54 -6
- package/dist/{workflows → src/workflows}/runs.js +37 -3
- package/dist/tests/add-website-source.test.js +119 -0
- package/dist/tests/agent/agent-config-loader.test.js +70 -0
- package/dist/tests/agent/agent-config.test.js +221 -0
- package/dist/tests/agent/agent-detect.test.js +100 -0
- package/dist/tests/agent/agent-spawn.test.js +234 -0
- package/dist/tests/agent-output.test.js +186 -0
- package/dist/tests/architecture/agent-no-llm-sdk-guard.test.js +103 -0
- package/dist/tests/architecture/agent-spawn-seam.test.js +193 -0
- package/dist/tests/architecture/llm-stateless-seam.test.js +112 -0
- package/dist/tests/asset-ref.test.js +192 -0
- package/dist/tests/asset-registry.test.js +103 -0
- package/dist/tests/asset-spec.test.js +241 -0
- package/dist/tests/bench/attribution.test.js +996 -0
- package/dist/tests/bench/cleanup-sigint.test.js +83 -0
- package/dist/tests/bench/cleanup.js +234 -0
- package/dist/tests/bench/cleanup.test.js +166 -0
- package/dist/tests/bench/cli.js +1018 -0
- package/dist/tests/bench/cli.test.js +445 -0
- package/dist/tests/bench/compare.test.js +556 -0
- package/dist/tests/bench/corpus.js +317 -0
- package/dist/tests/bench/corpus.test.js +258 -0
- package/dist/tests/bench/doctor.js +525 -0
- package/dist/tests/bench/driver.js +401 -0
- package/dist/tests/bench/driver.test.js +584 -0
- package/dist/tests/bench/environment.js +233 -0
- package/dist/tests/bench/environment.test.js +199 -0
- package/dist/tests/bench/evolve-metrics.js +179 -0
- package/dist/tests/bench/evolve-metrics.test.js +187 -0
- package/dist/tests/bench/evolve.js +647 -0
- package/dist/tests/bench/evolve.test.js +624 -0
- package/dist/tests/bench/failure-modes.test.js +349 -0
- package/dist/tests/bench/feedback-integrity.test.js +457 -0
- package/dist/tests/bench/leakage.test.js +228 -0
- package/dist/tests/bench/learning-curve.test.js +134 -0
- package/dist/tests/bench/metrics.js +2395 -0
- package/dist/tests/bench/metrics.test.js +1150 -0
- package/dist/tests/bench/no-os-tmpdir-invariant.test.js +43 -0
- package/dist/tests/bench/opencode-config.js +194 -0
- package/dist/tests/bench/opencode-config.test.js +370 -0
- package/dist/tests/bench/report.js +1885 -0
- package/dist/tests/bench/report.test.js +1038 -0
- package/dist/tests/bench/run-config.js +355 -0
- package/dist/tests/bench/run-config.test.js +298 -0
- package/dist/tests/bench/run-curate-test.js +32 -0
- package/dist/tests/bench/run-failing-tasks.js +56 -0
- package/dist/tests/bench/run-full-bench.js +51 -0
- package/dist/tests/bench/run-items36-targeted.js +69 -0
- package/dist/tests/bench/run-nano-quick.js +42 -0
- package/dist/tests/bench/run-waveg-targeted.js +62 -0
- package/dist/tests/bench/runner.js +699 -0
- package/dist/tests/bench/runner.test.js +958 -0
- package/dist/tests/bench/search-bridge.test.js +331 -0
- package/dist/tests/bench/tmp.js +131 -0
- package/dist/tests/bench/trajectory.js +116 -0
- package/dist/tests/bench/trajectory.test.js +127 -0
- package/dist/tests/bench/verifier.js +114 -0
- package/dist/tests/bench/verifier.test.js +118 -0
- package/dist/tests/bench/workflow-evaluator.js +557 -0
- package/dist/tests/bench/workflow-evaluator.test.js +421 -0
- package/dist/tests/bench/workflow-spec.js +345 -0
- package/dist/tests/bench/workflow-spec.test.js +363 -0
- package/dist/tests/bench/workflow-trace.js +472 -0
- package/dist/tests/bench/workflow-trace.test.js +254 -0
- package/dist/tests/benchmark-search-quality.js +536 -0
- package/dist/tests/benchmark-suite.js +1441 -0
- package/dist/tests/capture-cli.test.js +112 -0
- package/dist/tests/cli-errors.test.js +204 -0
- package/dist/tests/commands/events.test.js +370 -0
- package/dist/tests/commands/history.test.js +418 -0
- package/dist/tests/commands/import.test.js +103 -0
- package/dist/tests/commands/proposal-cli.test.js +209 -0
- package/dist/tests/commands/reflect-propose-cli.test.js +333 -0
- package/dist/tests/commands/remember.test.js +97 -0
- package/dist/tests/commands/scope-flags.test.js +300 -0
- package/dist/tests/commands/search.test.js +537 -0
- package/dist/tests/commands/show-indexer-parity.test.js +117 -0
- package/dist/tests/commands/show.test.js +294 -0
- package/dist/tests/common.test.js +266 -0
- package/dist/tests/completions.test.js +142 -0
- package/dist/tests/config-cli.test.js +193 -0
- package/dist/tests/config-llm-features.test.js +139 -0
- package/dist/tests/config.test.js +569 -0
- package/dist/tests/contracts/migration-baseline.test.js +43 -0
- package/dist/tests/contracts/reflect-propose-envelope.test.js +139 -0
- package/dist/tests/contracts/spec-helpers.js +46 -0
- package/dist/tests/contracts/v1-spec-section-11-proposal-queue.test.js +228 -0
- package/dist/tests/contracts/v1-spec-section-12-agent-config.test.js +56 -0
- package/dist/tests/contracts/v1-spec-section-13-lesson-type.test.js +34 -0
- package/dist/tests/contracts/v1-spec-section-14-llm-features.test.js +94 -0
- package/dist/tests/contracts/v1-spec-section-4-1-asset-types.test.js +39 -0
- package/dist/tests/contracts/v1-spec-section-4-2-quality-rules.test.js +44 -0
- package/dist/tests/contracts/v1-spec-section-5-configuration.test.js +47 -0
- package/dist/tests/contracts/v1-spec-section-6-orchestration.test.js +40 -0
- package/dist/tests/contracts/v1-spec-section-7-module-layout.test.js +58 -0
- package/dist/tests/contracts/v1-spec-section-8-extension-points.test.js +34 -0
- package/dist/tests/contracts/v1-spec-section-9-4-cli-surface.test.js +75 -0
- package/dist/tests/contracts/v1-spec-section-9-7-llm-agent-boundary.test.js +36 -0
- package/dist/tests/core/write-source.test.js +366 -0
- package/dist/tests/curate-command.test.js +87 -0
- package/dist/tests/db-scoring.test.js +201 -0
- package/dist/tests/db.test.js +654 -0
- package/dist/tests/distill-cli-flag.test.js +208 -0
- package/dist/tests/distill.test.js +515 -0
- package/dist/tests/docker-install.test.js +120 -0
- package/dist/tests/e2e.test.js +1419 -0
- package/dist/tests/embedder.test.js +340 -0
- package/dist/tests/embedding-model-config.test.js +379 -0
- package/dist/tests/feedback-command.test.js +172 -0
- package/dist/tests/file-context.test.js +552 -0
- package/dist/tests/fixtures/scripts/git/summarize-diff.js +9 -0
- package/dist/tests/fixtures/scripts/lint/eslint-check.js +7 -0
- package/dist/tests/fixtures/stashes/load.js +166 -0
- package/dist/tests/fixtures/stashes/load.test.js +97 -0
- package/dist/tests/fixtures/stashes/ranking-baseline/scripts/mem0-search.js +12 -0
- package/dist/tests/frontmatter.test.js +190 -0
- package/dist/tests/fts-field-weighting.test.js +254 -0
- package/dist/tests/fuzzy-search.test.js +230 -0
- package/dist/tests/git-provider-clone.test.js +45 -0
- package/dist/tests/github.test.js +161 -0
- package/dist/tests/graph-boost-ranking.test.js +305 -0
- package/dist/tests/graph-extraction.test.js +282 -0
- package/dist/tests/helpers/usage-events.js +8 -0
- package/dist/tests/index-pass-llm.test.js +161 -0
- package/dist/tests/indexer.test.js +570 -0
- package/dist/tests/info-command.test.js +166 -0
- package/dist/tests/init.test.js +69 -0
- package/dist/tests/install-script.test.js +246 -0
- package/dist/tests/integration/agent-real-profile.test.js +94 -0
- package/dist/tests/issue-36-repro.test.js +304 -0
- package/dist/tests/issues-191-194.test.js +160 -0
- package/dist/tests/lesson-lint.test.js +111 -0
- package/dist/tests/llm-client.test.js +115 -0
- package/dist/tests/llm-feature-gate.test.js +151 -0
- package/dist/tests/llm.test.js +139 -0
- package/dist/tests/lockfile.test.js +216 -0
- package/dist/tests/manifest.test.js +205 -0
- package/dist/tests/markdown.test.js +126 -0
- package/dist/tests/matchers-unit.test.js +189 -0
- package/dist/tests/memory-inference.test.js +299 -0
- package/dist/tests/merge-scoring.test.js +136 -0
- package/dist/tests/metadata.test.js +313 -0
- package/dist/tests/migration-help.test.js +89 -0
- package/dist/tests/origin-resolve.test.js +124 -0
- package/dist/tests/output-baseline.test.js +218 -0
- package/dist/tests/output-shapes-unit.test.js +478 -0
- package/dist/tests/parallel-search.test.js +272 -0
- package/dist/tests/parameter-metadata.test.js +365 -0
- package/dist/tests/paths.test.js +177 -0
- package/dist/tests/progressive-disclosure.test.js +280 -0
- package/dist/tests/proposals.test.js +279 -0
- package/dist/tests/proposed-quality.test.js +271 -0
- package/dist/tests/provider-registry.test.js +32 -0
- package/dist/tests/ranking-regression.test.js +548 -0
- package/dist/tests/reflect-propose.test.js +455 -0
- package/dist/tests/registry-build-index.test.js +394 -0
- package/dist/tests/registry-cli.test.js +290 -0
- package/dist/tests/registry-index-v2.test.js +430 -0
- package/dist/tests/registry-install.test.js +728 -0
- package/dist/tests/registry-providers/parity.test.js +189 -0
- package/dist/tests/registry-providers/skills-sh.test.js +309 -0
- package/dist/tests/registry-providers/static-index.test.js +238 -0
- package/dist/tests/registry-resolve.test.js +126 -0
- package/dist/tests/registry-search.test.js +923 -0
- package/dist/tests/remember-frontmatter.test.js +378 -0
- package/dist/tests/remember-unit.test.js +123 -0
- package/dist/tests/ripgrep-install.test.js +251 -0
- package/dist/tests/ripgrep-resolve.test.js +108 -0
- package/dist/tests/ripgrep.test.js +163 -0
- package/dist/tests/save-command.test.js +94 -0
- package/dist/tests/save-trust-qa-fixes.test.js +270 -0
- package/dist/tests/scoring-pipeline.test.js +648 -0
- package/dist/tests/search-include-proposed-cli.test.js +118 -0
- package/dist/tests/self-update.test.js +442 -0
- package/dist/tests/semantic-search-e2e.test.js +512 -0
- package/dist/tests/semantic-status.test.js +471 -0
- package/dist/tests/setup-run.integration.js +877 -0
- package/dist/tests/setup-wizard.test.js +198 -0
- package/dist/tests/setup.test.js +131 -0
- package/dist/tests/source-add.test.js +11 -0
- package/dist/tests/source-clone.test.js +254 -0
- package/dist/tests/source-manage.test.js +366 -0
- package/dist/tests/source-providers/filesystem.test.js +82 -0
- package/dist/tests/source-providers/git.test.js +252 -0
- package/dist/tests/source-providers/website.test.js +128 -0
- package/dist/tests/source-qa-fixes.test.js +286 -0
- package/dist/tests/source-registry.test.js +350 -0
- package/dist/tests/source-resolve.test.js +100 -0
- package/dist/tests/source-source.test.js +281 -0
- package/dist/tests/source.test.js +533 -0
- package/dist/tests/tar-utils-scan.test.js +73 -0
- package/dist/tests/toggle-components.test.js +73 -0
- package/dist/tests/usage-telemetry.test.js +265 -0
- package/dist/tests/utility-scoring.test.js +558 -0
- package/dist/tests/vault-load-error.test.js +78 -0
- package/dist/tests/vault-qa-fixes.test.js +194 -0
- package/dist/tests/vault.test.js +429 -0
- package/dist/tests/vector-search.test.js +608 -0
- package/dist/tests/walker.test.js +252 -0
- package/dist/tests/wave2-cluster-bc.test.js +228 -0
- package/dist/tests/wave2-cluster-d.test.js +180 -0
- package/dist/tests/wave2-cluster-e.test.js +179 -0
- package/dist/tests/wiki-qa-fixes.test.js +270 -0
- package/dist/tests/wiki.test.js +529 -0
- package/dist/tests/workflow-cli.test.js +271 -0
- package/dist/tests/workflow-markdown.test.js +171 -0
- package/dist/tests/workflow-path-escape.test.js +132 -0
- package/dist/tests/workflow-qa-fixes.test.js +395 -0
- package/dist/tests/workflows/indexer-rejection.test.js +213 -0
- package/docs/README.md +8 -0
- package/docs/migration/release-notes/0.7.0.md +244 -0
- package/package.json +2 -2
- package/dist/core/warn.js +0 -27
- package/dist/output/shapes.js +0 -212
- package/dist/output/text.js +0 -520
- /package/dist/{commands → src/commands}/completions.js +0 -0
- /package/dist/{commands → src/commands}/curate.js +0 -0
- /package/dist/{commands → src/commands}/info.js +0 -0
- /package/dist/{commands → src/commands}/init.js +0 -0
- /package/dist/{commands → src/commands}/install-audit.js +0 -0
- /package/dist/{commands → src/commands}/migration-help.js +0 -0
- /package/dist/{commands → src/commands}/source-clone.js +0 -0
- /package/dist/{commands → src/commands}/vault.js +0 -0
- /package/dist/{core → src/core}/asset-registry.js +0 -0
- /package/dist/{core → src/core}/frontmatter.js +0 -0
- /package/dist/{core → src/core}/markdown.js +0 -0
- /package/dist/{core → src/core}/paths.js +0 -0
- /package/dist/{indexer → src/indexer}/manifest.js +0 -0
- /package/dist/{indexer → src/indexer}/search-fields.js +0 -0
- /package/dist/{indexer → src/indexer}/semantic-status.js +0 -0
- /package/dist/{indexer → src/indexer}/usage-events.js +0 -0
- /package/dist/{indexer → src/indexer}/walker.js +0 -0
- /package/dist/{llm → src/llm}/embedder.js +0 -0
- /package/dist/{llm → src/llm}/embedders/cache.js +0 -0
- /package/dist/{llm → src/llm}/embedders/local.js +0 -0
- /package/dist/{llm → src/llm}/embedders/types.js +0 -0
- /package/dist/{llm → src/llm}/metadata-enhance.js +0 -0
- /package/dist/{output → src/output}/context.js +0 -0
- /package/dist/{registry → src/registry}/create-provider-registry.js +0 -0
- /package/dist/{registry → src/registry}/origin-resolve.js +0 -0
- /package/dist/{registry → src/registry}/providers/index.js +0 -0
- /package/dist/{registry → src/registry}/providers/skills-sh.js +0 -0
- /package/dist/{registry → src/registry}/providers/types.js +0 -0
- /package/dist/{registry → src/registry}/types.js +0 -0
- /package/dist/{setup → src/setup}/detect.js +0 -0
- /package/dist/{setup → src/setup}/ripgrep-install.js +0 -0
- /package/dist/{setup → src/setup}/ripgrep-resolve.js +0 -0
- /package/dist/{setup → src/setup}/steps.js +0 -0
- /package/dist/{sources → src/sources}/include.js +0 -0
- /package/dist/{sources → src/sources}/provider-factory.js +0 -0
- /package/dist/{sources → src/sources}/provider.js +0 -0
- /package/dist/{sources → src/sources}/providers/filesystem.js +0 -0
- /package/dist/{sources → src/sources}/providers/index.js +0 -0
- /package/dist/{sources → src/sources}/providers/install-types.js +0 -0
- /package/dist/{sources → src/sources}/providers/npm.js +0 -0
- /package/dist/{sources → src/sources}/providers/provider-utils.js +0 -0
- /package/dist/{sources → src/sources}/providers/sync-from-ref.js +0 -0
- /package/dist/{sources → src/sources}/providers/tar-utils.js +0 -0
- /package/dist/{sources → src/sources}/providers/website.js +0 -0
- /package/dist/{sources → src/sources}/resolve.js +0 -0
- /package/dist/{sources → src/sources}/types.js +0 -0
- /package/dist/{templates → src/templates}/wiki-templates.js +0 -0
- /package/dist/{version.js → src/version.js} +0 -0
- /package/dist/{workflows → src/workflows}/authoring.js +0 -0
- /package/dist/{workflows → src/workflows}/cli.js +0 -0
- /package/dist/{workflows → src/workflows}/db.js +0 -0
- /package/dist/{workflows → src/workflows}/document-cache.js +0 -0
- /package/dist/{workflows → src/workflows}/parser.js +0 -0
- /package/dist/{workflows → src/workflows}/renderer.js +0 -0
- /package/dist/{workflows → src/workflows}/schema.js +0 -0
- /package/dist/{workflows → src/workflows}/validator.js +0 -0
|
@@ -0,0 +1,647 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* akm-bench `evolve` — Track B longitudinal three-phase runner (spec §4 + §6.4).
|
|
3
|
+
*
|
|
4
|
+
* `runEvolve()` orchestrates three phases against a single eval-domain corpus:
|
|
5
|
+
*
|
|
6
|
+
* • Phase 1 (signal accumulation): run K seeds × tasks (train slice only)
|
|
7
|
+
* under the akm arm, then record `akm feedback <gold_ref> --positive` /
|
|
8
|
+
* `--negative` events per outcome.
|
|
9
|
+
* • Phase 2 (evolve): for every asset whose negative feedback crosses the
|
|
10
|
+
* threshold, invoke `akm distill` and `akm reflect`, validate every
|
|
11
|
+
* resulting proposal via `akm proposal show --json`, then accept or
|
|
12
|
+
* reject per lint outcome. After processing, rebuild the index.
|
|
13
|
+
* • Phase 3 (re-evaluate): run the eval slice under THREE arms — `pre` (the
|
|
14
|
+
* original un-evolved fixture), `post` (the evolved fixture), `synthetic`
|
|
15
|
+
* (no stash, scratchpad-only "Bring Your Own Skills" prompt).
|
|
16
|
+
*
|
|
17
|
+
* Leakage prevention (spec §7.4): before invoking distill we compute the set
|
|
18
|
+
* of eval-slice gold refs and pass it to `akm distill` via
|
|
19
|
+
* `--exclude-feedback-from <csv>` (#267). `akmDistill` filters those
|
|
20
|
+
* feedback events out of its LLM input before constructing the prompt.
|
|
21
|
+
* Refs in the exclusion list still see distillation run — but distillation
|
|
22
|
+
* runs from asset content alone, with no feedback signal that could have
|
|
23
|
+
* leaked from the eval slice. The proposal log + Phase 1 feedback stream
|
|
24
|
+
* are also filtered before computeProposalQualityMetrics ever sees them.
|
|
25
|
+
*
|
|
26
|
+
* Test seams: every external interaction is funnelled through one of three
|
|
27
|
+
* injectable seams (two functions and one boolean flag):
|
|
28
|
+
* - `spawn` — forwarded to `runOne` (drives the agent harness).
|
|
29
|
+
* - `akmCli(args, cwd, env)` — invoked for every `akm <verb>` subprocess.
|
|
30
|
+
* - `materialiseStash` — when false, `runUtility` doesn't touch
|
|
31
|
+
* fixtures/stashes/.
|
|
32
|
+
* Tests inject fakes; production wires the real `Bun.spawnSync` and the
|
|
33
|
+
* real `loadFixtureStash`.
|
|
34
|
+
*/
|
|
35
|
+
import path from "node:path";
|
|
36
|
+
import { loadFixtureStash } from "../fixtures/stashes/load";
|
|
37
|
+
import { registerCleanup } from "./cleanup";
|
|
38
|
+
import { computeLessonMetrics } from "./evolve-metrics";
|
|
39
|
+
import { computeFeedbackIntegrity, computeLongitudinalMetrics, computeProposalQualityMetrics, } from "./metrics";
|
|
40
|
+
import { runUtility } from "./runner";
|
|
41
|
+
import { benchMkdtemp } from "./tmp";
|
|
42
|
+
/**
|
|
43
|
+
* Drive the three-phase Track B runner.
|
|
44
|
+
*
|
|
45
|
+
* Pre: `tasks` is already filtered to one domain (or `all`). The runner
|
|
46
|
+
* partitions internally on `task.slice`.
|
|
47
|
+
*
|
|
48
|
+
* Sandboxing: at the start of every real run the runner materialises one
|
|
49
|
+
* dedicated tmp stash per fixture (the `evolveStash`) plus a fresh sibling
|
|
50
|
+
* snapshot per fixture (the `preStash`). Phase 1 + Phase 2 pin
|
|
51
|
+
* `AKM_STASH_DIR` to the appropriate `evolveStash` for every spawned `akm`
|
|
52
|
+
* invocation; Phase 3's pre arm uses `preStash`, the post arm uses
|
|
53
|
+
* `evolveStash`, and the synthetic arm uses no stash. The operator's real
|
|
54
|
+
* `process.env.AKM_STASH_DIR` is never read or written by `runEvolve`. All
|
|
55
|
+
* stashes are cleaned up in a top-level try/finally.
|
|
56
|
+
*/
|
|
57
|
+
export async function runEvolve(options) {
|
|
58
|
+
const seedsPerArm = options.seedsPerArm ?? 5;
|
|
59
|
+
const budgetTokens = options.budgetTokens ?? 30000;
|
|
60
|
+
const budgetWallMs = options.budgetWallMs ?? 120000;
|
|
61
|
+
const negativeThreshold = options.negativeThreshold ?? { absoluteCount: 2, ratio: 0.5 };
|
|
62
|
+
const materialiseStash = options.materialiseStash ?? true;
|
|
63
|
+
const akmCli = options.akmCli ?? defaultAkmCli;
|
|
64
|
+
const warnings = [];
|
|
65
|
+
const trainTasks = options.tasks.filter((t) => effectiveSlice(t) === "train");
|
|
66
|
+
const evalTasks = options.tasks.filter((t) => effectiveSlice(t) === "eval");
|
|
67
|
+
// Use the first task's domain (or "all") as the corpus label. The CLI
|
|
68
|
+
// already filtered to one domain; this is just for the report header.
|
|
69
|
+
const domain = uniqueDomain(options.tasks);
|
|
70
|
+
// ── Sandbox setup: per-fixture evolveStash + preStash. ───────────────────
|
|
71
|
+
// We materialise one tmp stash per unique `task.stash` so Phase 1
|
|
72
|
+
// accumulates feedback into the same on-disk stash that Phase 2 mutates,
|
|
73
|
+
// and that Phase 3's post arm reads back. The operator's real
|
|
74
|
+
// AKM_STASH_DIR is never touched. The pre arm gets a fresh snapshot of
|
|
75
|
+
// the same starting fixture (no Phase 2 mutations applied).
|
|
76
|
+
const fixtureNames = new Set();
|
|
77
|
+
for (const t of options.tasks)
|
|
78
|
+
fixtureNames.add(t.stash);
|
|
79
|
+
const evolveStashes = new Map();
|
|
80
|
+
const preStashes = new Map();
|
|
81
|
+
const evolveDirByFixture = new Map();
|
|
82
|
+
const preDirByFixture = new Map();
|
|
83
|
+
/** Per-fixture XDG_CACHE_HOME dirs allocated for evolve-stash indexing. */
|
|
84
|
+
const evolveCacheDirByFixture = new Map();
|
|
85
|
+
// SIGINT trap (#267): every per-fixture stash registers its cleanup with
|
|
86
|
+
// the shared registry so an external Ctrl-C reaps the tmp dirs even when
|
|
87
|
+
// the top-level try/finally never runs. We deregister in the matching
|
|
88
|
+
// finally block before invoking the synchronous cleanup so the handler
|
|
89
|
+
// doesn't double-fire.
|
|
90
|
+
const stashDeregistrations = [];
|
|
91
|
+
if (materialiseStash) {
|
|
92
|
+
for (const name of fixtureNames) {
|
|
93
|
+
try {
|
|
94
|
+
const evolved = loadFixtureStash(name, { skipIndex: false });
|
|
95
|
+
evolveStashes.set(name, evolved);
|
|
96
|
+
evolveDirByFixture.set(name, evolved.stashDir);
|
|
97
|
+
// Allocate a per-fixture cache dir for the evolve-stash re-index.
|
|
98
|
+
// `loadFixtureStash` used its own isolated XDG_CACHE_HOME; subsequent
|
|
99
|
+
// `akmCli` calls (feedback, distill, reflect) must look in the same
|
|
100
|
+
// cache. We allocate a fresh bench cache dir and pass it through
|
|
101
|
+
// `indexEvolveStash` + `envForRef` so the FTS5 DB is in a known place.
|
|
102
|
+
evolveCacheDirByFixture.set(name, benchMkdtemp(`akm-evolve-cache-${name}-`));
|
|
103
|
+
stashDeregistrations.push(registerCleanup(() => {
|
|
104
|
+
try {
|
|
105
|
+
evolved.cleanup();
|
|
106
|
+
}
|
|
107
|
+
catch {
|
|
108
|
+
/* swallow */
|
|
109
|
+
}
|
|
110
|
+
}));
|
|
111
|
+
}
|
|
112
|
+
catch (err) {
|
|
113
|
+
warnings.push(`evolve: failed to materialise evolve stash for fixture "${name}": ${err.message}`);
|
|
114
|
+
}
|
|
115
|
+
try {
|
|
116
|
+
const pre = loadFixtureStash(name, { skipIndex: false });
|
|
117
|
+
preStashes.set(name, pre);
|
|
118
|
+
preDirByFixture.set(name, pre.stashDir);
|
|
119
|
+
stashDeregistrations.push(registerCleanup(() => {
|
|
120
|
+
try {
|
|
121
|
+
pre.cleanup();
|
|
122
|
+
}
|
|
123
|
+
catch {
|
|
124
|
+
/* swallow */
|
|
125
|
+
}
|
|
126
|
+
}));
|
|
127
|
+
}
|
|
128
|
+
catch (err) {
|
|
129
|
+
warnings.push(`evolve: failed to materialise pre stash for fixture "${name}": ${err.message}`);
|
|
130
|
+
}
|
|
131
|
+
}
|
|
132
|
+
}
|
|
133
|
+
// Resolve the evolveStash dir for a given asset ref. We map ref → fixture
|
|
134
|
+
// by looking up which task's gold ref it matches; if no task owns it (or
|
|
135
|
+
// multiple do, which is unusual), we fall back to the first available
|
|
136
|
+
// evolveStash. The simple — and most common — case is a single fixture
|
|
137
|
+
// per `--tasks <domain>` invocation.
|
|
138
|
+
const refToFixture = new Map();
|
|
139
|
+
for (const t of options.tasks) {
|
|
140
|
+
if (t.goldRef)
|
|
141
|
+
refToFixture.set(t.goldRef, t.stash);
|
|
142
|
+
}
|
|
143
|
+
const fallbackEvolveDir = [...evolveDirByFixture.values()][0];
|
|
144
|
+
const fallbackEvolveCacheDir = [...evolveCacheDirByFixture.values()][0];
|
|
145
|
+
function envForRef(ref) {
|
|
146
|
+
const baseEnv = { ...process.env };
|
|
147
|
+
if (!materialiseStash) {
|
|
148
|
+
// Tests opt out of fixture materialisation entirely; we still strip
|
|
149
|
+
// the operator's AKM_STASH_DIR so the fake CLI sees a known sentinel.
|
|
150
|
+
delete baseEnv.AKM_STASH_DIR;
|
|
151
|
+
return baseEnv;
|
|
152
|
+
}
|
|
153
|
+
const fixture = ref ? refToFixture.get(ref) : undefined;
|
|
154
|
+
const dir = (fixture && evolveDirByFixture.get(fixture)) ?? fallbackEvolveDir;
|
|
155
|
+
const cacheDir = (fixture && evolveCacheDirByFixture.get(fixture)) ?? fallbackEvolveCacheDir;
|
|
156
|
+
if (dir)
|
|
157
|
+
baseEnv.AKM_STASH_DIR = dir;
|
|
158
|
+
else
|
|
159
|
+
delete baseEnv.AKM_STASH_DIR;
|
|
160
|
+
if (cacheDir)
|
|
161
|
+
baseEnv.XDG_CACHE_HOME = cacheDir;
|
|
162
|
+
return baseEnv;
|
|
163
|
+
}
|
|
164
|
+
// ── Phase 1 pre-flight: index each evolve stash in its dedicated cache. ───
|
|
165
|
+
// `loadFixtureStash` already ran `akm index` but used an isolated
|
|
166
|
+
// XDG_CACHE_HOME that subsequent `akmCli` calls (feedback, distill, reflect)
|
|
167
|
+
// cannot see. Re-running `akm index` here via `akmCli` with the same
|
|
168
|
+
// AKM_STASH_DIR + XDG_CACHE_HOME that `envForRef` will produce ensures the
|
|
169
|
+
// FTS5 database is populated where Phase 1 feedback will look.
|
|
170
|
+
// Non-zero exit adds a warning but does not abort — Phase 1 can still run
|
|
171
|
+
// with degraded feedback if the index step fails.
|
|
172
|
+
if (materialiseStash) {
|
|
173
|
+
const phase1Cwd = options.tasks[0]?.taskDir ?? process.cwd();
|
|
174
|
+
for (const [fixtureName, stashDir] of evolveDirByFixture) {
|
|
175
|
+
const cacheDir = evolveCacheDirByFixture.get(fixtureName);
|
|
176
|
+
if (!cacheDir)
|
|
177
|
+
continue;
|
|
178
|
+
try {
|
|
179
|
+
const result = await indexEvolveStash(stashDir, cacheDir, akmCli, phase1Cwd);
|
|
180
|
+
if (!result.ok) {
|
|
181
|
+
warnings.push(`evolve: pre-flight akm index failed for stash ${stashDir}: ${result.stderr.trim()}`);
|
|
182
|
+
}
|
|
183
|
+
}
|
|
184
|
+
catch (err) {
|
|
185
|
+
warnings.push(`evolve: pre-flight akm index threw for stash ${stashDir}: ${err.message}`);
|
|
186
|
+
}
|
|
187
|
+
}
|
|
188
|
+
}
|
|
189
|
+
let preReport;
|
|
190
|
+
let postReport;
|
|
191
|
+
let syntheticReport;
|
|
192
|
+
let phase1Report;
|
|
193
|
+
const feedbackLog = [];
|
|
194
|
+
const proposalLog = [];
|
|
195
|
+
try {
|
|
196
|
+
// ── Phase 1: accumulate signal on the train slice (akm arm only). ─────
|
|
197
|
+
phase1Report = await runUtility({
|
|
198
|
+
tasks: trainTasks,
|
|
199
|
+
arms: ["akm"],
|
|
200
|
+
model: options.model,
|
|
201
|
+
seedsPerArm,
|
|
202
|
+
budgetTokens,
|
|
203
|
+
budgetWallMs,
|
|
204
|
+
slice: "train",
|
|
205
|
+
...(options.spawn ? { spawn: options.spawn } : {}),
|
|
206
|
+
// We pre-materialised the per-fixture evolve stash above; tell the
|
|
207
|
+
// runner to forward those dirs and skip its own per-task materialise.
|
|
208
|
+
materialiseStash,
|
|
209
|
+
...(materialiseStash ? { stashDirByFixture: evolveDirByFixture } : {}),
|
|
210
|
+
...(options.timestamp ? { timestamp: options.timestamp } : {}),
|
|
211
|
+
...(options.branch ? { branch: options.branch } : {}),
|
|
212
|
+
...(options.commit ? { commit: options.commit } : {}),
|
|
213
|
+
...(options.opencodeProviders ? { opencodeProviders: options.opencodeProviders } : {}),
|
|
214
|
+
});
|
|
215
|
+
// Issue feedback events per (task, seed) outcome on the akm arm.
|
|
216
|
+
const feedbackByRef = new Map();
|
|
217
|
+
const phase1Cwd = options.tasks[0]?.taskDir ?? process.cwd();
|
|
218
|
+
for (const run of phase1Report.akmRuns ?? []) {
|
|
219
|
+
const taskMeta = options.tasks.find((t) => t.id === run.taskId);
|
|
220
|
+
const goldRef = taskMeta?.goldRef;
|
|
221
|
+
if (!goldRef)
|
|
222
|
+
continue;
|
|
223
|
+
if (run.outcome === "harness_error")
|
|
224
|
+
continue;
|
|
225
|
+
const signal = run.outcome === "pass" ? "positive" : "negative";
|
|
226
|
+
const args = ["feedback", goldRef, signal === "positive" ? "--positive" : "--negative"];
|
|
227
|
+
// Wrap in try/catch so a single throwing akmCli (e.g. subprocess
|
|
228
|
+
// crash) cannot leave `feedbackByRef` partially populated and let
|
|
229
|
+
// Phase 2 proceed on corrupt state.
|
|
230
|
+
try {
|
|
231
|
+
const cliResult = await akmCli(args, phase1Cwd, envForRef(goldRef));
|
|
232
|
+
feedbackLog.push({ taskId: run.taskId, seed: run.seed, goldRef, signal, ok: cliResult.exitCode === 0 });
|
|
233
|
+
if (cliResult.exitCode !== 0) {
|
|
234
|
+
warnings.push(`phase1: akm feedback for ${goldRef} (${signal}) failed: ${cliResult.stderr.trim()}`);
|
|
235
|
+
}
|
|
236
|
+
}
|
|
237
|
+
catch (err) {
|
|
238
|
+
feedbackLog.push({ taskId: run.taskId, seed: run.seed, goldRef, signal, ok: false });
|
|
239
|
+
warnings.push(`phase1.feedback_dispatch_failed: ${goldRef} ${err.message}`);
|
|
240
|
+
}
|
|
241
|
+
const counts = feedbackByRef.get(goldRef) ?? { positive: 0, negative: 0 };
|
|
242
|
+
if (signal === "positive")
|
|
243
|
+
counts.positive += 1;
|
|
244
|
+
else
|
|
245
|
+
counts.negative += 1;
|
|
246
|
+
feedbackByRef.set(goldRef, counts);
|
|
247
|
+
}
|
|
248
|
+
// ── Phase 2: evolve. ────────────────────────────────────────────────────
|
|
249
|
+
const evalGoldRefs = new Set();
|
|
250
|
+
for (const t of evalTasks) {
|
|
251
|
+
if (t.goldRef)
|
|
252
|
+
evalGoldRefs.add(t.goldRef);
|
|
253
|
+
}
|
|
254
|
+
const refsToEvolve = [];
|
|
255
|
+
for (const [ref, counts] of feedbackByRef.entries()) {
|
|
256
|
+
if (crossesNegativeThreshold(counts, negativeThreshold))
|
|
257
|
+
refsToEvolve.push(ref);
|
|
258
|
+
}
|
|
259
|
+
refsToEvolve.sort();
|
|
260
|
+
// §7.4 leakage prevention (#267): instead of hard-skipping refs that
|
|
261
|
+
// overlap eval-slice gold refs, we now pass the gold-ref set through
|
|
262
|
+
// `--exclude-feedback-from` (and the matching env var) so `akm distill`
|
|
263
|
+
// filters those events out of its LLM input. The behaviour collapses
|
|
264
|
+
// back to "no useful feedback shown" for refs that ARE the gold ref —
|
|
265
|
+
// distill then runs from asset content only, which is what we want.
|
|
266
|
+
const evalGoldRefList = [...evalGoldRefs].sort();
|
|
267
|
+
const excludeFeedbackCsv = evalGoldRefList.join(",");
|
|
268
|
+
for (const ref of refsToEvolve) {
|
|
269
|
+
// The env var fallback is the contract `akm distill` honours; it lets
|
|
270
|
+
// the bench keep working even if a hypothetical caller invokes
|
|
271
|
+
// distill via a wrapper that mangles flags.
|
|
272
|
+
const evolveEnv = {
|
|
273
|
+
...envForRef(ref),
|
|
274
|
+
AKM_BENCH_EXCLUDE_GOLD_REFS: excludeFeedbackCsv,
|
|
275
|
+
...(excludeFeedbackCsv ? { AKM_DISTILL_EXCLUDE_FEEDBACK_FROM: excludeFeedbackCsv } : {}),
|
|
276
|
+
};
|
|
277
|
+
// Pass the eval-gold list explicitly via the CLI flag so the contract
|
|
278
|
+
// is observable in test logs (the env var is a fallback for harnesses
|
|
279
|
+
// that strip flags). Reflect doesn't accept this flag — it's a distill
|
|
280
|
+
// concern only.
|
|
281
|
+
const distillArgs = ["distill", ref];
|
|
282
|
+
if (excludeFeedbackCsv) {
|
|
283
|
+
distillArgs.push("--exclude-feedback-from", excludeFeedbackCsv);
|
|
284
|
+
}
|
|
285
|
+
const distillResult = await akmCli(distillArgs, phase1Cwd, evolveEnv);
|
|
286
|
+
if (distillResult.exitCode !== 0) {
|
|
287
|
+
warnings.push(`phase2: akm distill ${ref} failed: ${distillResult.stderr.trim()}`);
|
|
288
|
+
}
|
|
289
|
+
else if (evalGoldRefs.has(ref) && excludeFeedbackCsv) {
|
|
290
|
+
// Per-ref leakage info — replaces the previous "skipped" message.
|
|
291
|
+
// Operator can audit which refs ran through the filter and confirm
|
|
292
|
+
// distillation didn't see leaked feedback.
|
|
293
|
+
warnings.push(`phase2: filtered eval-slice gold-ref feedback from distill input for ${ref} (--exclude-feedback-from ${excludeFeedbackCsv}).`);
|
|
294
|
+
}
|
|
295
|
+
const reflectResult = await akmCli(["reflect", ref], phase1Cwd, evolveEnv);
|
|
296
|
+
if (reflectResult.exitCode !== 0) {
|
|
297
|
+
// `reflect` requires `agent.default` to be configured — a missing
|
|
298
|
+
// config is non-fatal for the bench; we record and continue.
|
|
299
|
+
warnings.push(`phase2: akm reflect ${ref} skipped/failed: ${reflectResult.stderr.trim()}`);
|
|
300
|
+
}
|
|
301
|
+
}
|
|
302
|
+
// Walk the proposal queue per fixture (each evolveStash has its own
|
|
303
|
+
// proposal log on disk). When we materialised stashes we iterate every
|
|
304
|
+
// fixture that produced proposals; in the common single-fixture case
|
|
305
|
+
// this is one pass.
|
|
306
|
+
const proposalFixtures = materialiseStash ? [...evolveDirByFixture.keys()] : [undefined];
|
|
307
|
+
for (const fixtureName of proposalFixtures) {
|
|
308
|
+
const proposalEnv = { ...process.env };
|
|
309
|
+
if (materialiseStash && fixtureName) {
|
|
310
|
+
const dir = evolveDirByFixture.get(fixtureName);
|
|
311
|
+
if (dir)
|
|
312
|
+
proposalEnv.AKM_STASH_DIR = dir;
|
|
313
|
+
const cacheDir = evolveCacheDirByFixture.get(fixtureName);
|
|
314
|
+
if (cacheDir)
|
|
315
|
+
proposalEnv.XDG_CACHE_HOME = cacheDir;
|
|
316
|
+
}
|
|
317
|
+
else if (!materialiseStash) {
|
|
318
|
+
delete proposalEnv.AKM_STASH_DIR;
|
|
319
|
+
}
|
|
320
|
+
const listResult = await akmCli(["proposal", "list", "--json"], phase1Cwd, proposalEnv);
|
|
321
|
+
const proposals = parseProposalList(listResult.stdout);
|
|
322
|
+
for (const p of proposals) {
|
|
323
|
+
const showResult = await akmCli(["proposal", "show", p.id, "--json"], phase1Cwd, proposalEnv);
|
|
324
|
+
const lintInfo = parseProposalShow(showResult.stdout);
|
|
325
|
+
const lintPass = lintInfo.lintPass;
|
|
326
|
+
if (lintPass) {
|
|
327
|
+
const acceptResult = await akmCli(["proposal", "accept", p.id], phase1Cwd, proposalEnv);
|
|
328
|
+
proposalLog.push({
|
|
329
|
+
proposalId: p.id,
|
|
330
|
+
assetRef: p.assetRef,
|
|
331
|
+
kind: p.kind,
|
|
332
|
+
lintPass: true,
|
|
333
|
+
decision: acceptResult.exitCode === 0 ? "accept" : "reject",
|
|
334
|
+
...(acceptResult.exitCode === 0 ? {} : { rejectReason: `accept failed: ${acceptResult.stderr.trim()}` }),
|
|
335
|
+
});
|
|
336
|
+
}
|
|
337
|
+
else {
|
|
338
|
+
const reason = lintInfo.lintMessage ?? "lint failed";
|
|
339
|
+
const rejectResult = await akmCli(["proposal", "reject", p.id, "--reason", `lint failed: ${reason}`], phase1Cwd, proposalEnv);
|
|
340
|
+
proposalLog.push({
|
|
341
|
+
proposalId: p.id,
|
|
342
|
+
assetRef: p.assetRef,
|
|
343
|
+
kind: p.kind,
|
|
344
|
+
lintPass: false,
|
|
345
|
+
decision: "reject",
|
|
346
|
+
rejectReason: reason,
|
|
347
|
+
});
|
|
348
|
+
if (rejectResult.exitCode !== 0) {
|
|
349
|
+
warnings.push(`phase2: akm proposal reject ${p.id} failed: ${rejectResult.stderr.trim()}`);
|
|
350
|
+
}
|
|
351
|
+
}
|
|
352
|
+
}
|
|
353
|
+
// Rebuild the index so accepted lessons surface in Phase 3.
|
|
354
|
+
const indexResult = await akmCli(["index"], phase1Cwd, proposalEnv);
|
|
355
|
+
if (indexResult.exitCode !== 0) {
|
|
356
|
+
warnings.push(`phase2: akm index rebuild failed: ${indexResult.stderr.trim()}`);
|
|
357
|
+
}
|
|
358
|
+
}
|
|
359
|
+
// ── Phase 3: re-evaluate (eval slice). ─────────────────────────────────
|
|
360
|
+
// pre arm: fresh snapshot of the starting fixture (no Phase 2 mutations
|
|
361
|
+
// applied). post arm: the mutated evolveStash so accepted lessons reach
|
|
362
|
+
// the eval slice. synthetic arm: no stash.
|
|
363
|
+
preReport = await runUtility({
|
|
364
|
+
tasks: evalTasks,
|
|
365
|
+
arms: ["akm"],
|
|
366
|
+
model: options.model,
|
|
367
|
+
seedsPerArm,
|
|
368
|
+
budgetTokens,
|
|
369
|
+
budgetWallMs,
|
|
370
|
+
slice: "eval",
|
|
371
|
+
...(options.spawn ? { spawn: options.spawn } : {}),
|
|
372
|
+
materialiseStash,
|
|
373
|
+
...(materialiseStash ? { stashDirByFixture: preDirByFixture } : {}),
|
|
374
|
+
...(options.timestamp ? { timestamp: options.timestamp } : {}),
|
|
375
|
+
...(options.branch ? { branch: options.branch } : {}),
|
|
376
|
+
...(options.commit ? { commit: options.commit } : {}),
|
|
377
|
+
...(options.opencodeProviders ? { opencodeProviders: options.opencodeProviders } : {}),
|
|
378
|
+
});
|
|
379
|
+
postReport = await runUtility({
|
|
380
|
+
tasks: evalTasks,
|
|
381
|
+
arms: ["akm"],
|
|
382
|
+
model: options.model,
|
|
383
|
+
seedsPerArm,
|
|
384
|
+
budgetTokens,
|
|
385
|
+
budgetWallMs,
|
|
386
|
+
slice: "eval",
|
|
387
|
+
// Stamp arm metadata so spawn fakes can distinguish pre-vs-post via
|
|
388
|
+
// an env probe. We thread it via a fresh `spawn` wrapper when one
|
|
389
|
+
// was supplied.
|
|
390
|
+
materialiseStash,
|
|
391
|
+
...(materialiseStash ? { stashDirByFixture: evolveDirByFixture } : {}),
|
|
392
|
+
...(options.timestamp ? { timestamp: options.timestamp } : {}),
|
|
393
|
+
...(options.branch ? { branch: options.branch } : {}),
|
|
394
|
+
...(options.commit ? { commit: options.commit } : {}),
|
|
395
|
+
...(options.spawn ? { spawn: wrapSpawnWithArm(options.spawn, "post") } : {}),
|
|
396
|
+
...(options.opencodeProviders ? { opencodeProviders: options.opencodeProviders } : {}),
|
|
397
|
+
});
|
|
398
|
+
// synthetic: no stash. We pass a spawn wrapper that strips
|
|
399
|
+
// AKM_STASH_DIR and injects the "Bring Your Own Skills" tag so test
|
|
400
|
+
// fakes (and a future real harness) can branch. #267 — also forward a
|
|
401
|
+
// per-task scratchpad prompt via the runner's `buildPrompt` seam so the
|
|
402
|
+
// synthetic arm actually exercises the BYOS prompt path rather than
|
|
403
|
+
// relying on the noakm default.
|
|
404
|
+
syntheticReport = await runUtility({
|
|
405
|
+
tasks: evalTasks,
|
|
406
|
+
arms: ["akm"],
|
|
407
|
+
model: options.model,
|
|
408
|
+
seedsPerArm,
|
|
409
|
+
budgetTokens,
|
|
410
|
+
budgetWallMs,
|
|
411
|
+
slice: "eval",
|
|
412
|
+
materialiseStash: false,
|
|
413
|
+
buildPrompt: (task, _arm) => buildSyntheticPrompt(task.id),
|
|
414
|
+
...(options.timestamp ? { timestamp: options.timestamp } : {}),
|
|
415
|
+
...(options.branch ? { branch: options.branch } : {}),
|
|
416
|
+
...(options.commit ? { commit: options.commit } : {}),
|
|
417
|
+
...(options.spawn ? { spawn: wrapSpawnWithArm(options.spawn, "synthetic", undefined, true) } : {}),
|
|
418
|
+
...(options.opencodeProviders ? { opencodeProviders: options.opencodeProviders } : {}),
|
|
419
|
+
});
|
|
420
|
+
}
|
|
421
|
+
finally {
|
|
422
|
+
// Deregister BEFORE running cleanup so a SIGINT during teardown
|
|
423
|
+
// doesn't double-fire the cleanup fns (per cleanup.ts contract).
|
|
424
|
+
for (const deregister of stashDeregistrations)
|
|
425
|
+
deregister();
|
|
426
|
+
for (const s of evolveStashes.values()) {
|
|
427
|
+
try {
|
|
428
|
+
s.cleanup();
|
|
429
|
+
}
|
|
430
|
+
catch {
|
|
431
|
+
/* swallow — best-effort tmp cleanup */
|
|
432
|
+
}
|
|
433
|
+
}
|
|
434
|
+
for (const s of preStashes.values()) {
|
|
435
|
+
try {
|
|
436
|
+
s.cleanup();
|
|
437
|
+
}
|
|
438
|
+
catch {
|
|
439
|
+
/* swallow — best-effort tmp cleanup */
|
|
440
|
+
}
|
|
441
|
+
}
|
|
442
|
+
}
|
|
443
|
+
// ── Compute aggregates. ──────────────────────────────────────────────────
|
|
444
|
+
const proposalsMetrics = computeProposalQualityMetrics(proposalLog);
|
|
445
|
+
const longitudinal = computeLongitudinalMetrics(preReport, postReport, syntheticReport);
|
|
446
|
+
const feedbackIntegrity = computeFeedbackIntegrity({ phase1: phase1Report, feedbackLog });
|
|
447
|
+
// #264 — lesson quality + reuse metrics. The runner doesn't (yet) read
|
|
448
|
+
// accepted lesson bodies off disk or load verifier source text; we pass
|
|
449
|
+
// empty maps so the leakage check defaults to "low" until the read seam
|
|
450
|
+
// lands. Reuse + negative-transfer attribution work today off the
|
|
451
|
+
// pre/post arm `assetsLoaded` stream.
|
|
452
|
+
const lessons = computeLessonMetrics({
|
|
453
|
+
proposalLog,
|
|
454
|
+
feedbackLog,
|
|
455
|
+
preRuns: preReport.akmRuns ?? [],
|
|
456
|
+
postRuns: postReport.akmRuns ?? [],
|
|
457
|
+
});
|
|
458
|
+
return {
|
|
459
|
+
timestamp: options.timestamp ?? new Date().toISOString(),
|
|
460
|
+
branch: options.branch ?? preReport.branch,
|
|
461
|
+
commit: options.commit ?? preReport.commit,
|
|
462
|
+
model: options.model,
|
|
463
|
+
domain,
|
|
464
|
+
seedsPerArm,
|
|
465
|
+
feedbackLog,
|
|
466
|
+
proposalLog,
|
|
467
|
+
proposals: proposalsMetrics,
|
|
468
|
+
lessons,
|
|
469
|
+
longitudinal,
|
|
470
|
+
feedbackIntegrity,
|
|
471
|
+
phase1: phase1Report,
|
|
472
|
+
arms: { pre: preReport, post: postReport, synthetic: syntheticReport },
|
|
473
|
+
warnings: [
|
|
474
|
+
...warnings,
|
|
475
|
+
...phase1Report.warnings,
|
|
476
|
+
...preReport.warnings,
|
|
477
|
+
...postReport.warnings,
|
|
478
|
+
...syntheticReport.warnings,
|
|
479
|
+
],
|
|
480
|
+
};
|
|
481
|
+
}
|
|
482
|
+
/**
 * Default subprocess invoker for the akm CLI.
 *
 * Resolves `src/cli.ts` two directories above this module and executes
 * `bun run <cli.ts> ...args` synchronously through `Bun.spawnSync`, capturing
 * both output streams.
 *
 * @param {string[]} args - CLI arguments appended after the script path.
 * @param {string} cwd - Working directory for the child process.
 * @param {object} env - Extra environment entries; they are layered on top of
 *   `process.env`, so keys absent from `env` are still inherited.
 * @returns {Promise<{exitCode: number, stdout: string, stderr: string}>}
 *   Exit code (`-1` when the runtime reports none) and the decoded streams
 *   (empty string when a stream is missing).
 */
async function defaultAkmCli(args, cwd, env) {
    const entryPoint = path.resolve(__dirname, "..", "..", "src", "cli.ts");
    const childEnv = { ...process.env, ...env };
    const child = Bun.spawnSync({
        cmd: ["bun", "run", entryPoint, ...args],
        cwd,
        env: childEnv,
        stdout: "pipe",
        stderr: "pipe",
    });
    // Absent pipes decode to an empty string rather than throwing.
    const toText = (bytes) => (bytes ? new TextDecoder().decode(bytes) : "");
    const stdout = toText(child.stdout);
    const stderr = toText(child.stderr);
    return { exitCode: child.exitCode ?? -1, stdout, stderr };
}
|
|
499
|
+
/**
 * Decide whether a feedback tally crosses the negative threshold.
 *
 * Two independent triggers exist and either one is sufficient:
 *   1. the negative count is at least `threshold.absoluteCount`;
 *   2. the share of negatives among all feedback is strictly greater than
 *      `threshold.ratio`.
 * A tally with no feedback at all can only cross via the first trigger.
 *
 * @param {{positive: number, negative: number}} counts - Feedback tally.
 * @param {{absoluteCount: number, ratio: number}} threshold - Trigger limits.
 * @returns {boolean} `true` when either trigger fires.
 */
function crossesNegativeThreshold(counts, threshold) {
    const { positive, negative } = counts;
    if (negative >= threshold.absoluteCount) {
        return true;
    }
    const total = positive + negative;
    // Guard the division: zero feedback never satisfies the ratio trigger.
    return total !== 0 && negative / total > threshold.ratio;
}
|
|
513
|
+
/**
 * Best-effort train/eval partition for a task.
 *
 * An explicit truthy `task.slice` always wins. Otherwise the slice is derived
 * deterministically from `task.id` using a 32-bit multiply-by-31 rolling hash
 * over its UTF-16 code units: an even hash maps to "train", an odd one to
 * "eval".
 *
 * NOTE(review): this is an inlined fallback (avoids an import cycle with the
 * corpus module); it is not a SHA-1 digest — confirm it agrees with
 * `corpus.effectiveSlice` if the two are expected to partition identically.
 *
 * @param {{id: string, slice?: string}} task - Task descriptor.
 * @returns {string} The explicit slice, or "train" / "eval" from the hash.
 */
function effectiveSlice(task) {
    const { slice, id } = task;
    if (slice) {
        return slice;
    }
    const codeUnits = Array.from({ length: id.length }, (_, pos) => id.charCodeAt(pos));
    // `| 0` wraps the accumulator to a signed 32-bit integer on every step.
    const hash = codeUnits.reduce((acc, unit) => (acc * 31 + unit) | 0, 0);
    const isEven = Math.abs(hash) % 2 === 0;
    return isEven ? "train" : "eval";
}
|
|
524
|
+
/**
 * Collapse the tasks' `domain` fields into a single label.
 *
 * @param {Array<{domain?: string}>} tasks - Tasks to inspect.
 * @returns {string} The shared domain when every task carries the same
 *   non-nullish value; "all" when domains differ, the list is empty, or the
 *   single shared value is null/undefined.
 */
function uniqueDomain(tasks) {
    const domains = new Set();
    for (const task of tasks) {
        domains.add(task.domain);
    }
    if (domains.size !== 1) {
        return "all";
    }
    const [onlyDomain] = domains;
    return onlyDomain ?? "all";
}
|
|
530
|
+
/**
 * Wrap a spawn function so every child it launches is tagged with the
 * evolve arm it belongs to.
 *
 * The wrapper copies the caller's `opts.env` (the caller's object is never
 * mutated) and then:
 *   - sets `BENCH_EVOLVE_ARM` to `arm`;
 *   - sets `BENCH_EVOLVE_SCRATCHPAD` to "1" when `scratchpad` is true;
 *   - sets `AKM_STASH_DIR` to `stashDir` when one is supplied;
 *   - removes `AKM_STASH_DIR` entirely for the "synthetic" arm, which takes
 *     precedence over both an inherited value and `stashDir`.
 *
 * @param {Function} inner - Underlying spawn function, called as
 *   `inner(cmd, opts)`; its return value is passed through unchanged.
 * @param {string} arm - Arm label written to `BENCH_EVOLVE_ARM`.
 * @param {string} [stashDir] - Optional stash directory to expose.
 * @param {boolean} [scratchpad=false] - Whether to set the scratchpad flag.
 * @returns {Function} A `(cmd, opts)` spawn function with the env applied.
 */
function wrapSpawnWithArm(inner, arm, stashDir, scratchpad = false) {
    return (cmd, opts) => {
        // `opts` may be omitted by a caller; treat that as "no options"
        // instead of throwing on `opts.env`.
        const env = { ...(opts?.env ?? {}) };
        env.BENCH_EVOLVE_ARM = arm;
        if (scratchpad)
            env.BENCH_EVOLVE_SCRATCHPAD = "1";
        if (stashDir)
            env.AKM_STASH_DIR = stashDir;
        // The synthetic arm must run without any stash, so this runs last.
        if (arm === "synthetic")
            delete env.AKM_STASH_DIR;
        return inner(cmd, { ...opts, env });
    };
}
|
|
550
|
+
/**
 * Tolerant parser for `akm proposal list --json` stdout.
 *
 * Accepts either a bare JSON array or an object with a `proposals` array.
 * Anything else — blank or non-string output, invalid JSON, a `null` or
 * scalar payload — yields an empty list rather than throwing.
 *
 * Per entry:
 *   - `id` must be a string;
 *   - the asset ref is the first string among `target_ref`, `targetRef`, `ref`;
 *   - the kind is read from `kind` (or `source` when `kind` is not a string)
 *     and normalised: "lesson"/"distill" -> "lesson",
 *     "revision"/"reflect" -> "revision", anything else -> "unknown".
 * Entries that are not objects, or lack an id or asset ref, are dropped.
 *
 * @param {string} stdout - Raw stdout of the list command.
 * @returns {Array<{id: string, assetRef: string, kind: string}>}
 */
function parseProposalList(stdout) {
    if (typeof stdout !== "string" || !stdout.trim())
        return [];
    let parsed;
    try {
        parsed = JSON.parse(stdout);
    }
    catch {
        return [];
    }
    // `JSON.parse("null")` is valid and returns null; optional chaining keeps
    // that (and other scalars) on the tolerant "no proposals" path.
    const arr = Array.isArray(parsed)
        ? parsed
        : Array.isArray(parsed?.proposals)
            ? parsed.proposals
            : [];
    const out = [];
    for (const item of arr) {
        if (!item || typeof item !== "object")
            continue;
        const rec = item;
        const id = typeof rec.id === "string" ? rec.id : null;
        const assetRef = typeof rec.target_ref === "string"
            ? rec.target_ref
            : typeof rec.targetRef === "string"
                ? rec.targetRef
                : typeof rec.ref === "string"
                    ? rec.ref
                    : null;
        const kindRaw = typeof rec.kind === "string" ? rec.kind : typeof rec.source === "string" ? rec.source : "unknown";
        const kind = kindRaw === "lesson" || kindRaw === "distill"
            ? "lesson"
            : kindRaw === "revision" || kindRaw === "reflect"
                ? "revision"
                : "unknown";
        if (!id || !assetRef)
            continue;
        out.push({ id, assetRef, kind });
    }
    return out;
}
|
|
591
|
+
/**
 * Tolerant parser for `akm proposal show <id> --json` stdout that extracts
 * the lint verdict.
 *
 * Lint passes only when the payload carries `lint_pass === true`,
 * `lintPass === true`, or `lint.pass === true`. When `lint.issues` is a
 * non-empty array, the issues are joined with "; " into `lintMessage`
 * (strings as-is, objects via their `message`, otherwise their JSON form).
 *
 * Unusable output never throws: blank or non-string stdout, invalid JSON and
 * non-object payloads (e.g. `null`) all return `lintPass: false` together
 * with a `lintMessage` describing what was wrong.
 *
 * @param {string} stdout - Raw stdout of the show command.
 * @returns {{lintPass: boolean, lintMessage?: string}}
 */
function parseProposalShow(stdout) {
    if (typeof stdout !== "string" || !stdout.trim())
        return { lintPass: false, lintMessage: "empty proposal show output" };
    let parsed;
    try {
        parsed = JSON.parse(stdout);
    }
    catch (err) {
        return { lintPass: false, lintMessage: `proposal show: parse error (${err.message})` };
    }
    // Valid JSON need not be an object ("null", "42", ...); property access on
    // null would throw, so reject those payloads up front.
    if (parsed === null || typeof parsed !== "object")
        return { lintPass: false, lintMessage: "proposal show: expected a JSON object" };
    const lintRaw = parsed.lint;
    const hasLintObject = typeof lintRaw === "object" && lintRaw !== null;
    const lintPass = parsed.lint_pass === true ||
        parsed.lintPass === true ||
        (hasLintObject && lintRaw.pass === true);
    let lintMessage;
    if (hasLintObject) {
        const issues = lintRaw.issues;
        if (Array.isArray(issues) && issues.length > 0) {
            lintMessage = issues
                .map((i) => (typeof i === "string" ? i : (i?.message ?? JSON.stringify(i))))
                .join("; ");
        }
    }
    return { lintPass, ...(lintMessage ? { lintMessage } : {}) };
}
|
|
616
|
+
/**
 * Run `akm index` against an evolve stash so its index is written into the
 * given cache directory.
 *
 * The child environment is `process.env` with `AKM_STASH_DIR` and
 * `XDG_CACHE_HOME` overridden to the supplied directories; passing the same
 * pair that later CLI calls use lets those calls see the index built here.
 *
 * Exported for tests.
 *
 * @param {string} stashDir - Value for `AKM_STASH_DIR`.
 * @param {string} cacheDir - Value for `XDG_CACHE_HOME`.
 * @param {Function} akmCli - Invoker called as `akmCli(args, cwd, env)`,
 *   resolving to an object with `exitCode` and `stderr`.
 * @param {string} cwd - Working directory forwarded to `akmCli`.
 * @returns {Promise<{ok: boolean, stderr: string}>} `ok` is true only for
 *   exit code 0; `stderr` is forwarded in both the success and failure case.
 */
export async function indexEvolveStash(stashDir, cacheDir, akmCli, cwd) {
    const overrides = { AKM_STASH_DIR: stashDir, XDG_CACHE_HOME: cacheDir };
    const indexEnv = Object.assign({}, process.env, overrides);
    const { exitCode, stderr } = await akmCli(["index"], cwd, indexEnv);
    return { ok: exitCode === 0, stderr };
}
|
|
638
|
+
/**
 * Build the prompt used for the synthetic ("Bring Your Own Skills") arm.
 *
 * The prompt is five newline-separated lines: a `Task: <id>` header, the arm
 * label, and three lines of fixed guidance asking for a scratchpad first.
 * Exported so tests can assert the exact prompt text.
 *
 * @param {string} taskId - Task identifier interpolated into the header.
 * @returns {string} The complete prompt.
 */
export function buildSyntheticPrompt(taskId) {
    const header = `Task: ${taskId}`;
    const guidance = [
        "Arm: synthetic (Bring Your Own Skills)",
        "No akm stash is available. Before solving the task, write a short scratchpad of the skills",
        "and steps you intend to use, then proceed. Cite the scratchpad in your trace so the verifier",
        "can attribute the approach to your own reasoning rather than retrieved guidance.",
    ];
    return [header, ...guidance].join("\n");
}
|