akm-cli 0.7.0 → 0.7.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +8 -8
- package/dist/tests/add-website-source.test.js +0 -119
- package/dist/tests/agent/agent-config-loader.test.js +0 -70
- package/dist/tests/agent/agent-config.test.js +0 -221
- package/dist/tests/agent/agent-detect.test.js +0 -100
- package/dist/tests/agent/agent-spawn.test.js +0 -234
- package/dist/tests/agent-output.test.js +0 -186
- package/dist/tests/architecture/agent-no-llm-sdk-guard.test.js +0 -103
- package/dist/tests/architecture/agent-spawn-seam.test.js +0 -193
- package/dist/tests/architecture/llm-stateless-seam.test.js +0 -112
- package/dist/tests/asset-ref.test.js +0 -192
- package/dist/tests/asset-registry.test.js +0 -103
- package/dist/tests/asset-spec.test.js +0 -241
- package/dist/tests/bench/attribution.test.js +0 -996
- package/dist/tests/bench/cleanup-sigint.test.js +0 -83
- package/dist/tests/bench/cleanup.js +0 -234
- package/dist/tests/bench/cleanup.test.js +0 -166
- package/dist/tests/bench/cli.js +0 -1018
- package/dist/tests/bench/cli.test.js +0 -445
- package/dist/tests/bench/compare.test.js +0 -556
- package/dist/tests/bench/corpus.js +0 -317
- package/dist/tests/bench/corpus.test.js +0 -258
- package/dist/tests/bench/doctor.js +0 -525
- package/dist/tests/bench/driver.js +0 -401
- package/dist/tests/bench/driver.test.js +0 -584
- package/dist/tests/bench/environment.js +0 -233
- package/dist/tests/bench/environment.test.js +0 -199
- package/dist/tests/bench/evolve-metrics.js +0 -179
- package/dist/tests/bench/evolve-metrics.test.js +0 -187
- package/dist/tests/bench/evolve.js +0 -647
- package/dist/tests/bench/evolve.test.js +0 -624
- package/dist/tests/bench/failure-modes.test.js +0 -349
- package/dist/tests/bench/feedback-integrity.test.js +0 -457
- package/dist/tests/bench/leakage.test.js +0 -228
- package/dist/tests/bench/learning-curve.test.js +0 -134
- package/dist/tests/bench/metrics.js +0 -2395
- package/dist/tests/bench/metrics.test.js +0 -1150
- package/dist/tests/bench/no-os-tmpdir-invariant.test.js +0 -43
- package/dist/tests/bench/opencode-config.js +0 -194
- package/dist/tests/bench/opencode-config.test.js +0 -370
- package/dist/tests/bench/report.js +0 -1885
- package/dist/tests/bench/report.test.js +0 -1038
- package/dist/tests/bench/run-config.js +0 -355
- package/dist/tests/bench/run-config.test.js +0 -298
- package/dist/tests/bench/run-curate-test.js +0 -32
- package/dist/tests/bench/run-failing-tasks.js +0 -56
- package/dist/tests/bench/run-full-bench.js +0 -51
- package/dist/tests/bench/run-items36-targeted.js +0 -69
- package/dist/tests/bench/run-nano-quick.js +0 -42
- package/dist/tests/bench/run-waveg-targeted.js +0 -62
- package/dist/tests/bench/runner.js +0 -699
- package/dist/tests/bench/runner.test.js +0 -958
- package/dist/tests/bench/search-bridge.test.js +0 -331
- package/dist/tests/bench/tmp.js +0 -131
- package/dist/tests/bench/trajectory.js +0 -116
- package/dist/tests/bench/trajectory.test.js +0 -127
- package/dist/tests/bench/verifier.js +0 -114
- package/dist/tests/bench/verifier.test.js +0 -118
- package/dist/tests/bench/workflow-evaluator.js +0 -557
- package/dist/tests/bench/workflow-evaluator.test.js +0 -421
- package/dist/tests/bench/workflow-spec.js +0 -345
- package/dist/tests/bench/workflow-spec.test.js +0 -363
- package/dist/tests/bench/workflow-trace.js +0 -472
- package/dist/tests/bench/workflow-trace.test.js +0 -254
- package/dist/tests/benchmark-search-quality.js +0 -536
- package/dist/tests/benchmark-suite.js +0 -1441
- package/dist/tests/capture-cli.test.js +0 -112
- package/dist/tests/cli-errors.test.js +0 -204
- package/dist/tests/commands/events.test.js +0 -370
- package/dist/tests/commands/history.test.js +0 -418
- package/dist/tests/commands/import.test.js +0 -103
- package/dist/tests/commands/proposal-cli.test.js +0 -209
- package/dist/tests/commands/reflect-propose-cli.test.js +0 -333
- package/dist/tests/commands/remember.test.js +0 -97
- package/dist/tests/commands/scope-flags.test.js +0 -300
- package/dist/tests/commands/search.test.js +0 -537
- package/dist/tests/commands/show-indexer-parity.test.js +0 -117
- package/dist/tests/commands/show.test.js +0 -294
- package/dist/tests/common.test.js +0 -266
- package/dist/tests/completions.test.js +0 -142
- package/dist/tests/config-cli.test.js +0 -193
- package/dist/tests/config-llm-features.test.js +0 -139
- package/dist/tests/config.test.js +0 -569
- package/dist/tests/contracts/migration-baseline.test.js +0 -43
- package/dist/tests/contracts/reflect-propose-envelope.test.js +0 -139
- package/dist/tests/contracts/spec-helpers.js +0 -46
- package/dist/tests/contracts/v1-spec-section-11-proposal-queue.test.js +0 -228
- package/dist/tests/contracts/v1-spec-section-12-agent-config.test.js +0 -56
- package/dist/tests/contracts/v1-spec-section-13-lesson-type.test.js +0 -34
- package/dist/tests/contracts/v1-spec-section-14-llm-features.test.js +0 -94
- package/dist/tests/contracts/v1-spec-section-4-1-asset-types.test.js +0 -39
- package/dist/tests/contracts/v1-spec-section-4-2-quality-rules.test.js +0 -44
- package/dist/tests/contracts/v1-spec-section-5-configuration.test.js +0 -47
- package/dist/tests/contracts/v1-spec-section-6-orchestration.test.js +0 -40
- package/dist/tests/contracts/v1-spec-section-7-module-layout.test.js +0 -58
- package/dist/tests/contracts/v1-spec-section-8-extension-points.test.js +0 -34
- package/dist/tests/contracts/v1-spec-section-9-4-cli-surface.test.js +0 -75
- package/dist/tests/contracts/v1-spec-section-9-7-llm-agent-boundary.test.js +0 -36
- package/dist/tests/core/write-source.test.js +0 -366
- package/dist/tests/curate-command.test.js +0 -87
- package/dist/tests/db-scoring.test.js +0 -201
- package/dist/tests/db.test.js +0 -654
- package/dist/tests/distill-cli-flag.test.js +0 -208
- package/dist/tests/distill.test.js +0 -515
- package/dist/tests/docker-install.test.js +0 -120
- package/dist/tests/e2e.test.js +0 -1419
- package/dist/tests/embedder.test.js +0 -340
- package/dist/tests/embedding-model-config.test.js +0 -379
- package/dist/tests/feedback-command.test.js +0 -172
- package/dist/tests/file-context.test.js +0 -552
- package/dist/tests/fixtures/scripts/git/summarize-diff.js +0 -9
- package/dist/tests/fixtures/scripts/lint/eslint-check.js +0 -7
- package/dist/tests/fixtures/stashes/load.js +0 -166
- package/dist/tests/fixtures/stashes/load.test.js +0 -97
- package/dist/tests/fixtures/stashes/ranking-baseline/scripts/mem0-search.js +0 -12
- package/dist/tests/frontmatter.test.js +0 -190
- package/dist/tests/fts-field-weighting.test.js +0 -254
- package/dist/tests/fuzzy-search.test.js +0 -230
- package/dist/tests/git-provider-clone.test.js +0 -45
- package/dist/tests/github.test.js +0 -161
- package/dist/tests/graph-boost-ranking.test.js +0 -305
- package/dist/tests/graph-extraction.test.js +0 -282
- package/dist/tests/helpers/usage-events.js +0 -8
- package/dist/tests/index-pass-llm.test.js +0 -161
- package/dist/tests/indexer.test.js +0 -570
- package/dist/tests/info-command.test.js +0 -166
- package/dist/tests/init.test.js +0 -69
- package/dist/tests/install-script.test.js +0 -246
- package/dist/tests/integration/agent-real-profile.test.js +0 -94
- package/dist/tests/issue-36-repro.test.js +0 -304
- package/dist/tests/issues-191-194.test.js +0 -160
- package/dist/tests/lesson-lint.test.js +0 -111
- package/dist/tests/llm-client.test.js +0 -115
- package/dist/tests/llm-feature-gate.test.js +0 -151
- package/dist/tests/llm.test.js +0 -139
- package/dist/tests/lockfile.test.js +0 -216
- package/dist/tests/manifest.test.js +0 -205
- package/dist/tests/markdown.test.js +0 -126
- package/dist/tests/matchers-unit.test.js +0 -189
- package/dist/tests/memory-inference.test.js +0 -299
- package/dist/tests/merge-scoring.test.js +0 -136
- package/dist/tests/metadata.test.js +0 -313
- package/dist/tests/migration-help.test.js +0 -89
- package/dist/tests/origin-resolve.test.js +0 -124
- package/dist/tests/output-baseline.test.js +0 -218
- package/dist/tests/output-shapes-unit.test.js +0 -478
- package/dist/tests/parallel-search.test.js +0 -272
- package/dist/tests/parameter-metadata.test.js +0 -365
- package/dist/tests/paths.test.js +0 -177
- package/dist/tests/progressive-disclosure.test.js +0 -280
- package/dist/tests/proposals.test.js +0 -279
- package/dist/tests/proposed-quality.test.js +0 -271
- package/dist/tests/provider-registry.test.js +0 -32
- package/dist/tests/ranking-regression.test.js +0 -548
- package/dist/tests/reflect-propose.test.js +0 -455
- package/dist/tests/registry-build-index.test.js +0 -394
- package/dist/tests/registry-cli.test.js +0 -290
- package/dist/tests/registry-index-v2.test.js +0 -430
- package/dist/tests/registry-install.test.js +0 -728
- package/dist/tests/registry-providers/parity.test.js +0 -189
- package/dist/tests/registry-providers/skills-sh.test.js +0 -309
- package/dist/tests/registry-providers/static-index.test.js +0 -238
- package/dist/tests/registry-resolve.test.js +0 -126
- package/dist/tests/registry-search.test.js +0 -923
- package/dist/tests/remember-frontmatter.test.js +0 -378
- package/dist/tests/remember-unit.test.js +0 -123
- package/dist/tests/ripgrep-install.test.js +0 -251
- package/dist/tests/ripgrep-resolve.test.js +0 -108
- package/dist/tests/ripgrep.test.js +0 -163
- package/dist/tests/save-command.test.js +0 -94
- package/dist/tests/save-trust-qa-fixes.test.js +0 -270
- package/dist/tests/scoring-pipeline.test.js +0 -648
- package/dist/tests/search-include-proposed-cli.test.js +0 -118
- package/dist/tests/self-update.test.js +0 -442
- package/dist/tests/semantic-search-e2e.test.js +0 -512
- package/dist/tests/semantic-status.test.js +0 -471
- package/dist/tests/setup-run.integration.js +0 -877
- package/dist/tests/setup-wizard.test.js +0 -198
- package/dist/tests/setup.test.js +0 -131
- package/dist/tests/source-add.test.js +0 -11
- package/dist/tests/source-clone.test.js +0 -254
- package/dist/tests/source-manage.test.js +0 -366
- package/dist/tests/source-providers/filesystem.test.js +0 -82
- package/dist/tests/source-providers/git.test.js +0 -252
- package/dist/tests/source-providers/website.test.js +0 -128
- package/dist/tests/source-qa-fixes.test.js +0 -286
- package/dist/tests/source-registry.test.js +0 -350
- package/dist/tests/source-resolve.test.js +0 -100
- package/dist/tests/source-source.test.js +0 -281
- package/dist/tests/source.test.js +0 -533
- package/dist/tests/tar-utils-scan.test.js +0 -73
- package/dist/tests/toggle-components.test.js +0 -73
- package/dist/tests/usage-telemetry.test.js +0 -265
- package/dist/tests/utility-scoring.test.js +0 -558
- package/dist/tests/vault-load-error.test.js +0 -78
- package/dist/tests/vault-qa-fixes.test.js +0 -194
- package/dist/tests/vault.test.js +0 -429
- package/dist/tests/vector-search.test.js +0 -608
- package/dist/tests/walker.test.js +0 -252
- package/dist/tests/wave2-cluster-bc.test.js +0 -228
- package/dist/tests/wave2-cluster-d.test.js +0 -180
- package/dist/tests/wave2-cluster-e.test.js +0 -179
- package/dist/tests/wiki-qa-fixes.test.js +0 -270
- package/dist/tests/wiki.test.js +0 -529
- package/dist/tests/workflow-cli.test.js +0 -271
- package/dist/tests/workflow-markdown.test.js +0 -171
- package/dist/tests/workflow-path-escape.test.js +0 -132
- package/dist/tests/workflow-qa-fixes.test.js +0 -395
- package/dist/tests/workflows/indexer-rejection.test.js +0 -213
- /package/dist/{src/cli.js → cli.js} +0 -0
- /package/dist/{src/commands → commands}/completions.js +0 -0
- /package/dist/{src/commands → commands}/config-cli.js +0 -0
- /package/dist/{src/commands → commands}/curate.js +0 -0
- /package/dist/{src/commands → commands}/distill.js +0 -0
- /package/dist/{src/commands → commands}/events.js +0 -0
- /package/dist/{src/commands → commands}/history.js +0 -0
- /package/dist/{src/commands → commands}/info.js +0 -0
- /package/dist/{src/commands → commands}/init.js +0 -0
- /package/dist/{src/commands → commands}/install-audit.js +0 -0
- /package/dist/{src/commands → commands}/installed-stashes.js +0 -0
- /package/dist/{src/commands → commands}/migration-help.js +0 -0
- /package/dist/{src/commands → commands}/proposal.js +0 -0
- /package/dist/{src/commands → commands}/propose.js +0 -0
- /package/dist/{src/commands → commands}/reflect.js +0 -0
- /package/dist/{src/commands → commands}/registry-search.js +0 -0
- /package/dist/{src/commands → commands}/remember.js +0 -0
- /package/dist/{src/commands → commands}/search.js +0 -0
- /package/dist/{src/commands → commands}/self-update.js +0 -0
- /package/dist/{src/commands → commands}/show.js +0 -0
- /package/dist/{src/commands → commands}/source-add.js +0 -0
- /package/dist/{src/commands → commands}/source-clone.js +0 -0
- /package/dist/{src/commands → commands}/source-manage.js +0 -0
- /package/dist/{src/commands → commands}/vault.js +0 -0
- /package/dist/{src/core → core}/asset-ref.js +0 -0
- /package/dist/{src/core → core}/asset-registry.js +0 -0
- /package/dist/{src/core → core}/asset-spec.js +0 -0
- /package/dist/{src/core → core}/common.js +0 -0
- /package/dist/{src/core → core}/config.js +0 -0
- /package/dist/{src/core → core}/errors.js +0 -0
- /package/dist/{src/core → core}/events.js +0 -0
- /package/dist/{src/core → core}/frontmatter.js +0 -0
- /package/dist/{src/core → core}/lesson-lint.js +0 -0
- /package/dist/{src/core → core}/markdown.js +0 -0
- /package/dist/{src/core → core}/paths.js +0 -0
- /package/dist/{src/core → core}/proposals.js +0 -0
- /package/dist/{src/core → core}/warn.js +0 -0
- /package/dist/{src/core → core}/write-source.js +0 -0
- /package/dist/{src/indexer → indexer}/db-search.js +0 -0
- /package/dist/{src/indexer → indexer}/db.js +0 -0
- /package/dist/{src/indexer → indexer}/file-context.js +0 -0
- /package/dist/{src/indexer → indexer}/graph-boost.js +0 -0
- /package/dist/{src/indexer → indexer}/graph-extraction.js +0 -0
- /package/dist/{src/indexer → indexer}/indexer.js +0 -0
- /package/dist/{src/indexer → indexer}/manifest.js +0 -0
- /package/dist/{src/indexer → indexer}/matchers.js +0 -0
- /package/dist/{src/indexer → indexer}/memory-inference.js +0 -0
- /package/dist/{src/indexer → indexer}/metadata.js +0 -0
- /package/dist/{src/indexer → indexer}/search-fields.js +0 -0
- /package/dist/{src/indexer → indexer}/search-source.js +0 -0
- /package/dist/{src/indexer → indexer}/semantic-status.js +0 -0
- /package/dist/{src/indexer → indexer}/usage-events.js +0 -0
- /package/dist/{src/indexer → indexer}/walker.js +0 -0
- /package/dist/{src/integrations → integrations}/agent/config.js +0 -0
- /package/dist/{src/integrations → integrations}/agent/detect.js +0 -0
- /package/dist/{src/integrations → integrations}/agent/index.js +0 -0
- /package/dist/{src/integrations → integrations}/agent/profiles.js +0 -0
- /package/dist/{src/integrations → integrations}/agent/prompts.js +0 -0
- /package/dist/{src/integrations → integrations}/agent/spawn.js +0 -0
- /package/dist/{src/integrations → integrations}/github.js +0 -0
- /package/dist/{src/integrations → integrations}/lockfile.js +0 -0
- /package/dist/{src/llm → llm}/client.js +0 -0
- /package/dist/{src/llm → llm}/embedder.js +0 -0
- /package/dist/{src/llm → llm}/embedders/cache.js +0 -0
- /package/dist/{src/llm → llm}/embedders/local.js +0 -0
- /package/dist/{src/llm → llm}/embedders/remote.js +0 -0
- /package/dist/{src/llm → llm}/embedders/types.js +0 -0
- /package/dist/{src/llm → llm}/feature-gate.js +0 -0
- /package/dist/{src/llm → llm}/graph-extract.js +0 -0
- /package/dist/{src/llm → llm}/index-passes.js +0 -0
- /package/dist/{src/llm → llm}/memory-infer.js +0 -0
- /package/dist/{src/llm → llm}/metadata-enhance.js +0 -0
- /package/dist/{src/output → output}/cli-hints.js +0 -0
- /package/dist/{src/output → output}/context.js +0 -0
- /package/dist/{src/output → output}/renderers.js +0 -0
- /package/dist/{src/output → output}/shapes.js +0 -0
- /package/dist/{src/output → output}/text.js +0 -0
- /package/dist/{src/registry → registry}/build-index.js +0 -0
- /package/dist/{src/registry → registry}/create-provider-registry.js +0 -0
- /package/dist/{src/registry → registry}/factory.js +0 -0
- /package/dist/{src/registry → registry}/origin-resolve.js +0 -0
- /package/dist/{src/registry → registry}/providers/index.js +0 -0
- /package/dist/{src/registry → registry}/providers/skills-sh.js +0 -0
- /package/dist/{src/registry → registry}/providers/static-index.js +0 -0
- /package/dist/{src/registry → registry}/providers/types.js +0 -0
- /package/dist/{src/registry → registry}/resolve.js +0 -0
- /package/dist/{src/registry → registry}/types.js +0 -0
- /package/dist/{src/setup → setup}/detect.js +0 -0
- /package/dist/{src/setup → setup}/ripgrep-install.js +0 -0
- /package/dist/{src/setup → setup}/ripgrep-resolve.js +0 -0
- /package/dist/{src/setup → setup}/setup.js +0 -0
- /package/dist/{src/setup → setup}/steps.js +0 -0
- /package/dist/{src/sources → sources}/include.js +0 -0
- /package/dist/{src/sources → sources}/provider-factory.js +0 -0
- /package/dist/{src/sources → sources}/provider.js +0 -0
- /package/dist/{src/sources → sources}/providers/filesystem.js +0 -0
- /package/dist/{src/sources → sources}/providers/git.js +0 -0
- /package/dist/{src/sources → sources}/providers/index.js +0 -0
- /package/dist/{src/sources → sources}/providers/install-types.js +0 -0
- /package/dist/{src/sources → sources}/providers/npm.js +0 -0
- /package/dist/{src/sources → sources}/providers/provider-utils.js +0 -0
- /package/dist/{src/sources → sources}/providers/sync-from-ref.js +0 -0
- /package/dist/{src/sources → sources}/providers/tar-utils.js +0 -0
- /package/dist/{src/sources → sources}/providers/website.js +0 -0
- /package/dist/{src/sources → sources}/resolve.js +0 -0
- /package/dist/{src/sources → sources}/types.js +0 -0
- /package/dist/{src/templates → templates}/wiki-templates.js +0 -0
- /package/dist/{src/version.js → version.js} +0 -0
- /package/dist/{src/wiki → wiki}/wiki.js +0 -0
- /package/dist/{src/workflows → workflows}/authoring.js +0 -0
- /package/dist/{src/workflows → workflows}/cli.js +0 -0
- /package/dist/{src/workflows → workflows}/db.js +0 -0
- /package/dist/{src/workflows → workflows}/document-cache.js +0 -0
- /package/dist/{src/workflows → workflows}/parser.js +0 -0
- /package/dist/{src/workflows → workflows}/renderer.js +0 -0
- /package/dist/{src/workflows → workflows}/runs.js +0 -0
- /package/dist/{src/workflows → workflows}/schema.js +0 -0
- /package/dist/{src/workflows → workflows}/validator.js +0 -0
|
@@ -1,233 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* environment.ts — unified bench environment setup.
|
|
3
|
-
*
|
|
4
|
-
* `setupBenchEnvironment` is the single function that owns all per-run
|
|
5
|
-
* isolation: isolation dirs, opencode.json, akm config, FTS5 index. Both
|
|
6
|
-
* `runOne` (driver.ts) and the doctor's live-run check call this function,
|
|
7
|
-
* guaranteeing they produce identical environments.
|
|
8
|
-
*
|
|
9
|
-
* Key design decisions:
|
|
10
|
-
* - `BENCH_OPENCODE_INVARIANTS` (plugin:[], permission block) are always
|
|
11
|
-
* written — they are bench isolation invariants, not conditional on the
|
|
12
|
-
* provider path. No silent stub fallbacks.
|
|
13
|
-
* - `dryRun: true` skips the akm config and index writes. Unit tests set
|
|
14
|
-
* this so the setup path is exercised without spawning a real agent.
|
|
15
|
-
* - `validateFixtureCorpus` is called at bench startup to catch missing
|
|
16
|
-
* fixtures before any work items start, not per-task mid-run.
|
|
17
|
-
*/
|
|
18
|
-
import fs from "node:fs";
|
|
19
|
-
import path from "node:path";
|
|
20
|
-
import { buildIsolatedEnv, buildSanitizedEnvSource, createIsolationDirs } from "./driver";
|
|
21
|
-
import { BenchConfigError, selectProviderForModel } from "./opencode-config";
|
|
22
|
-
import { benchMkdtemp } from "./tmp";
|
|
23
|
-
// ── Bench isolation invariants ───────────────────────────────────────────────
|
|
24
|
-
/**
 * Top-level keys written unconditionally into every bench-generated
 * opencode.json. These are isolation invariants — never conditional on
 * provider resolution or model type.
 *
 *   - `plugin: []`  — prevents operator plugins (akm-opencode, etc.) from
 *                     running lifecycle hooks that override AKM_STASH_DIR,
 *                     warm indexes against the wrong stash, or prompt akm
 *                     setup wizards.
 *   - `permission`  — opencode in non-interactive (`opencode run`) mode
 *                     silently skips tool calls without explicit permission
 *                     grants.
 *
 * The object and its nested values are frozen: consumers spread it shallowly
 * into generated configs, so the nested `plugin` array and `permission`
 * object are shared by reference. Freezing guarantees that no consumer can
 * accidentally mutate the invariants for every subsequent run.
 */
export const BENCH_OPENCODE_INVARIANTS = Object.freeze({
    plugin: Object.freeze([]),
    permission: Object.freeze({
        bash: "allow",
        edit: "allow",
        write: "allow",
        read: "allow",
        webfetch: "allow",
    }),
});
|
|
45
|
-
// ── Built-in cloud prefixes ──────────────────────────────────────────────────
|
|
46
|
-
/**
 * Provider prefixes (the part of a model id before the first "/") that
 * opencode resolves through its built-in cloud-provider registry.
 *
 * A model carrying one of these prefixes needs no custom provider entry in
 * the bench providers JSON. Any other prefix requires `opencodeProviders`;
 * the harness refuses to run without it so that a run cannot silently fall
 * back to a cloud model and incur unexpected API charges.
 */
const builtinCloudPrefixNames = [
    "anthropic", "openai", "openrouter", "opencode",
    "google", "amazon", "azure", "vertex", "bedrock",
    "mistral", "groq", "together", "fireworks",
];
export const BUILTIN_CLOUD_PREFIXES = new Set(builtinCloudPrefixNames);
|
|
68
|
-
/**
 * Write `opencode.json` into `opencodeConfigDir` (file mode 0o600).
 *
 * The file always carries `$schema`, `model` and every key of
 * `BENCH_OPENCODE_INVARIANTS`. A `provider` block is added only when
 * `providers` is supplied and `selectProviderForModel` resolves the model.
 *
 * When `selectProviderForModel` raises a `BenchConfigError`:
 *   - if the model prefix (text before the first "/") is non-empty and not in
 *     `BUILTIN_CLOUD_PREFIXES`, a new `BenchConfigError` is thrown — writing
 *     the file without a provider block would let opencode fall back to cloud
 *     resolution, skewing results and incurring costs;
 *   - otherwise the file is written without a provider block and a warning
 *     is returned.
 *
 * @param {string} opencodeConfigDir directory that receives `opencode.json`
 * @param {string} model model id, e.g. "prefix/model-name"
 * @param {object} [providers] providers config handed to `selectProviderForModel`
 * @returns {{ providerKey: (string|undefined), warnings: string[] }}
 * @throws {BenchConfigError} for a non-built-in prefix missing from `providers`
 * @throws any non-`BenchConfigError` raised during provider selection, and
 *         any filesystem error from the write
 */
export function writeOpencodeJson(opencodeConfigDir, model, providers) {
    const warnings = [];
    let providerKey;
    let providerBlock;
    if (providers) {
        try {
            const { providerKey: resolvedKey, entry } = selectProviderForModel(providers, model);
            providerKey = resolvedKey;
            providerBlock = { [resolvedKey]: entry };
        }
        catch (err) {
            // Only the "model not resolvable" case is handled here.
            if (!(err instanceof BenchConfigError)) {
                throw err;
            }
            const [modelPrefix] = model.split("/");
            const stubAllowed = !modelPrefix || BUILTIN_CLOUD_PREFIXES.has(modelPrefix);
            if (!stubAllowed) {
                // Hard error rather than a fallback: see the doc comment above.
                throw new BenchConfigError(`model "${model}" uses local prefix "${modelPrefix}" but was not found in the providers config. ` +
                    `Add it to the providers file or use a built-in cloud model prefix.`, true);
            }
            warnings.push(`model "${model}" not found in providers config; writing stub (expected for built-in cloud models)`);
        }
    }
    const config = {
        $schema: "https://opencode.ai/config.json",
        model,
        ...BENCH_OPENCODE_INVARIANTS,
    };
    if (providerBlock) {
        config.provider = providerBlock;
    }
    const targetFile = path.join(opencodeConfigDir, "opencode.json");
    fs.writeFileSync(targetFile, JSON.stringify(config, null, 2), { mode: 0o600 });
    return { providerKey, warnings };
}
|
|
118
|
-
/**
 * Set up a complete bench run environment.
 *
 * Steps, in order:
 *   1. Refuse a model whose prefix is not in `BUILTIN_CLOUD_PREFIXES` when no
 *      `providers` config was supplied (avoids silent cloud fallback).
 *   2. Create the isolation dirs (`createIsolationDirs`) and the isolated
 *      env (`buildIsolatedEnv`).
 *   3. Write opencode.json with BENCH_OPENCODE_INVARIANTS + optional provider.
 *   4. Unless `dryRun` is set, and only when `stashDir` exists on disk:
 *      write `<configHome>/akm/config.json` containing `{ stashDir }`, then
 *      populate `<cacheHome>/akm` by copying the files found in
 *      `<indexCacheHome>/akm`, or by re-indexing when `indexCacheHome` is not
 *      given or the copy fails.
 *
 * If anything fails after the isolation dirs were created, they are removed
 * again before the error is re-thrown, so a failed setup does not leak
 * directories. A re-index that exits non-zero is reported through `warnings`
 * rather than thrown.
 *
 * @param {object} params
 * @param {string} params.model model id, e.g. "prefix/model-name"
 * @param {string} params.arm bench arm; "synthetic" never carries a stash
 * @param {string} [params.stashDir] stash directory (ignored for "synthetic")
 * @param {string} [params.indexCacheHome] dir whose `akm/` holds a pre-built index
 * @param {object} [params.providers] providers config for `writeOpencodeJson`
 * @param {boolean} [params.dryRun=false] skip akm config and index writes
 * @param {string[]} [params.warnings] array that receives warning messages
 * @returns {{ dirs: object, env: object, teardown: () => void }}
 * @throws {BenchConfigError} for model prefix / provider mismatches
 */
export function setupBenchEnvironment(params) {
    const { model, arm, stashDir: rawStashDir, indexCacheHome, providers, dryRun = false, warnings = [] } = params;
    // Synthetic arm must never carry a stash.
    const stashDir = arm === "synthetic" ? undefined : rawStashDir;
    // Safety: refuse to run local-provider models without a providers config.
    const modelParts = model.split("/");
    if (modelParts.length >= 2 && !BUILTIN_CLOUD_PREFIXES.has(modelParts[0]) && !providers) {
        throw new BenchConfigError(`model "${model}" uses custom provider prefix "${modelParts[0]}" — supply opencodeProviders to avoid silent fallback to a cloud model`, false);
    }
    const dirs = createIsolationDirs(stashDir);
    // Best-effort removal of the isolation root; shared by the returned
    // handle and by the failure path below.
    const teardown = () => {
        try {
            fs.rmSync(dirs.root, { recursive: true, force: true });
        }
        catch {
            /* deliberate best-effort cleanup: nothing useful to do on failure */
        }
    };
    try {
        const env = buildIsolatedEnv(dirs, model);
        // Synthetic arm must not carry AKM_STASH_DIR even if createIsolationDirs
        // somehow set it (recurrence guard for the #243 fixup pattern).
        if (arm === "synthetic") {
            delete env.AKM_STASH_DIR;
        }
        // Write opencode.json with invariants + optional provider block.
        const result = writeOpencodeJson(dirs.opencodeConfig, model, providers);
        for (const w of result.warnings) {
            warnings.push(w);
        }
        // Wire akm config and index only when a real stash is on disk.
        const stashOnDisk = stashDir ? fs.existsSync(stashDir) : false;
        if (stashDir && stashOnDisk && !dryRun) {
            // akm config: so `akm config get stashDir` returns the fixture path.
            const akmConfigDir = path.join(dirs.configHome, "akm");
            fs.mkdirSync(akmConfigDir, { recursive: true });
            fs.writeFileSync(path.join(akmConfigDir, "config.json"), JSON.stringify({ stashDir }), { mode: 0o600 });
            // Index: fast-path copy from pre-built cache; slow-path re-index.
            const destAkmDir = path.join(dirs.cacheHome, "akm");
            fs.mkdirSync(destAkmDir, { recursive: true });
            if (indexCacheHome) {
                const srcAkmDir = path.join(indexCacheHome, "akm");
                try {
                    for (const entry of fs.readdirSync(srcAkmDir)) {
                        fs.copyFileSync(path.join(srcAkmDir, entry), path.join(destAkmDir, entry));
                    }
                }
                catch (err) {
                    // `err` is not guaranteed to be an Error instance.
                    const reason = err instanceof Error ? err.message : String(err);
                    warnings.push(`index copy failed, falling back to re-index: ${reason}`);
                    _reindexOrWarn(stashDir, env, warnings);
                }
            }
            else {
                _reindexOrWarn(stashDir, env, warnings);
            }
        }
        return { dirs, env, teardown };
    }
    catch (err) {
        // Do not leak the isolation dirs when setup fails part-way through.
        teardown();
        throw err;
    }
}
/**
 * Re-index the stash and record a warning when the indexer did not exit
 * cleanly, so a missing index is visible to the caller instead of silent.
 */
function _reindexOrWarn(stashDir, env, warnings) {
    const outcome = _runAkmIndex(stashDir, env);
    if (outcome && outcome.exitCode !== 0) {
        const detail = outcome.stderr ? outcome.stderr.toString().trim() : "";
        warnings.push(`akm index exited with code ${outcome.exitCode}${detail ? `: ${detail}` : ""}`);
    }
}
/**
 * Run `index --full` of the akm CLI entry (`../../src/cli.ts` relative to this
 * module) through `bun run`, with `stashDir` as the working directory and
 * `env` layered over `buildSanitizedEnvSource()`.
 *
 * @returns the `Bun.spawnSync` result so callers can inspect `exitCode` and
 *          the piped `stdout` / `stderr`.
 */
function _runAkmIndex(stashDir, env) {
    const cliEntry = path.resolve(__dirname, "..", "..", "src", "cli.ts");
    return Bun.spawnSync({
        cmd: ["bun", "run", cliEntry, "index", "--full"],
        cwd: stashDir,
        env: { ...buildSanitizedEnvSource(), ...env },
        stdout: "pipe",
        stderr: "pipe",
    });
}
|
|
201
|
-
// ── validateFixtureCorpus ────────────────────────────────────────────────────
|
|
202
|
-
const FIXTURES_ROOT = path.resolve(__dirname, "..", "fixtures", "stashes");
|
|
203
|
-
/**
 * Check which stash fixtures referenced by `tasks` exist on disk, where
 * "exists" means `<FIXTURES_ROOT>/<fixture>/MANIFEST.json` is present.
 *
 * Intended to be called at bench startup, before any work items are created,
 * so that missing fixtures surface as named failures up front rather than as
 * a `harness_error` per seed at run time.
 *
 * @param {Iterable<{ stash: string, id: string }>} tasks
 * @returns {{ valid: Set<string>, missing: Map<string, string[]> }}
 *          `valid` holds fixture names that have a manifest; `missing` maps
 *          each fixture name without one to the ids of the tasks that use it.
 */
export function validateFixtureCorpus(tasks) {
    // Group task ids by the fixture they reference (first-seen order).
    const taskIdsByFixture = new Map();
    for (const { stash, id } of tasks) {
        const ids = taskIdsByFixture.get(stash);
        if (ids) {
            ids.push(id);
        }
        else {
            taskIdsByFixture.set(stash, [id]);
        }
    }
    const valid = new Set();
    const missing = new Map();
    taskIdsByFixture.forEach((taskIds, fixture) => {
        const hasManifest = fs.existsSync(path.join(FIXTURES_ROOT, fixture, "MANIFEST.json"));
        if (hasManifest) {
            valid.add(fixture);
        }
        else {
            missing.set(fixture, taskIds);
        }
    });
    return { valid, missing };
}
|
|
231
|
-
// Re-export from driver for consumers that previously imported from there.
|
|
232
|
-
export { buildIsolatedEnv, buildSanitizedEnvSource, createIsolationDirs } from "./driver";
|
|
233
|
-
export { benchMkdtemp };
|
|
@@ -1,199 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* Tests for environment.ts — writeOpencodeJson, validateFixtureCorpus,
|
|
3
|
-
* BENCH_OPENCODE_INVARIANTS, and setupBenchEnvironment (dryRun mode).
|
|
4
|
-
*/
|
|
5
|
-
import { afterAll, beforeAll, describe, expect, test } from "bun:test";
|
|
6
|
-
import fs from "node:fs";
|
|
7
|
-
import path from "node:path";
|
|
8
|
-
import { BENCH_OPENCODE_INVARIANTS, BUILTIN_CLOUD_PREFIXES, setupBenchEnvironment, validateFixtureCorpus, writeOpencodeJson, } from "./environment";
|
|
9
|
-
import { benchMkdtemp } from "./tmp";
|
|
10
|
-
// ── writeOpencodeJson ────────────────────────────────────────────────────────
|
|
11
|
-
describe("writeOpencodeJson", () => {
    // Scratch root shared by every case; each test works in its own subdirectory of it.
    let scratchRoot;

    /** Create a per-test directory under the scratch root and return its path. */
    const makeCaseDir = (name) => {
        const caseDir = path.join(scratchRoot, name);
        fs.mkdirSync(caseDir, { recursive: true });
        return caseDir;
    };

    /** Read and parse the `opencode.json` that was written into `caseDir`. */
    const readWrittenConfig = (caseDir) => {
        const raw = fs.readFileSync(path.join(caseDir, "opencode.json"), "utf8");
        return JSON.parse(raw);
    };

    /** Providers map that contains only an unrelated key, so lookups for other prefixes miss. */
    const unrelatedProviders = () => ({
        source: "/fake/providers.json",
        providers: { otherprov: {} },
    });

    beforeAll(() => {
        scratchRoot = benchMkdtemp("bench-env-test-");
    });

    afterAll(() => {
        fs.rmSync(scratchRoot, { recursive: true, force: true });
    });

    test("always writes plugin:[] and permission block (isolation invariants)", () => {
        const caseDir = makeCaseDir("invariants");
        writeOpencodeJson(caseDir, "anthropic/claude-opus-4-7");
        const written = readWrittenConfig(caseDir);
        expect(written.plugin).toEqual([]);
        expect(written.permission?.bash).toBe("allow");
        expect(written.permission?.edit).toBe("allow");
        expect(written.permission?.write).toBe("allow");
    });

    test("writes provider block when model resolves in providers map", () => {
        const caseDir = makeCaseDir("with-provider");
        const providerCatalog = {
            source: "/fake/providers.json",
            providers: { myprov: { npm: "@ai-sdk/openai-compatible", name: "My Provider" } },
        };
        const outcome = writeOpencodeJson(caseDir, "myprov/my-model", providerCatalog);
        expect(outcome.providerKey).toBe("myprov");
        expect(outcome.warnings).toHaveLength(0);
        const written = readWrittenConfig(caseDir);
        expect(written.provider?.myprov).toBeDefined();
        expect(written.model).toBe("myprov/my-model");
    });

    test("writes stub (no provider block) and returns warning for built-in cloud model not in providers map", () => {
        const caseDir = makeCaseDir("cloud-stub");
        const outcome = writeOpencodeJson(caseDir, "opencode/big-pickle", unrelatedProviders());
        expect(outcome.providerKey).toBeUndefined();
        expect(outcome.warnings.length).toBeGreaterThan(0);
        const written = readWrittenConfig(caseDir);
        expect(written.provider).toBeUndefined();
        // The isolation invariants must survive the stub path too.
        expect(written.plugin).toEqual([]);
    });

    test("throws BenchConfigError for local-prefix model not found in providers map", () => {
        const caseDir = makeCaseDir("local-prefix-missing");
        const providerCatalog = unrelatedProviders();
        // "shredder" is neither a built-in cloud prefix nor a key of the providers map.
        const attempt = () => writeOpencodeJson(caseDir, "shredder/qwen3.5-9b", providerCatalog);
        expect(attempt).toThrow(/local prefix/);
        // Only the throw is asserted here: the point is that the call fails loudly
        // instead of silently writing a cloud-fallback stub. Whether a partial
        // opencode.json exists afterwards is deliberately not checked.
    });

    test("writes provider block for local-prefix model that IS found in providers map", () => {
        const caseDir = makeCaseDir("local-prefix-found");
        const providerCatalog = {
            source: "/fake/providers.json",
            providers: { shredder: { npm: "@ai-sdk/openai-compatible", name: "Shredder" } },
        };
        const outcome = writeOpencodeJson(caseDir, "shredder/qwen3.5-9b", providerCatalog);
        expect(outcome.providerKey).toBe("shredder");
        expect(outcome.warnings).toHaveLength(0);
        const written = readWrittenConfig(caseDir);
        expect(written.provider?.shredder).toBeDefined();
        expect(written.model).toBe("shredder/qwen3.5-9b");
    });

    test("mode 0o600 (not world-readable)", () => {
        const caseDir = makeCaseDir("mode-check");
        writeOpencodeJson(caseDir, "anthropic/claude-opus-4-7");
        const { mode } = fs.statSync(path.join(caseDir, "opencode.json"));
        // Mask off the file-type bits so only the permission bits are compared.
        expect(mode & 0o777).toBe(0o600);
    });
});
|
|
92
|
-
// ── BENCH_OPENCODE_INVARIANTS ────────────────────────────────────────────────
|
|
93
|
-
describe("BENCH_OPENCODE_INVARIANTS", () => {
    test("plugin is an empty readonly array", () => {
        const pluginList = BENCH_OPENCODE_INVARIANTS.plugin;
        expect(pluginList).toEqual([]);
        // toEqual([]) alone would not pin down the container type, so check it explicitly.
        expect(Array.isArray(pluginList)).toBe(true);
    });

    test("permission.bash is 'allow'", () => {
        const bashPermission = BENCH_OPENCODE_INVARIANTS.permission.bash;
        expect(bashPermission).toBe("allow");
    });
});
|
|
102
|
-
// ── BUILTIN_CLOUD_PREFIXES ───────────────────────────────────────────────────
|
|
103
|
-
describe("BUILTIN_CLOUD_PREFIXES", () => {
    test("includes anthropic, openai, opencode", () => {
        for (const prefix of ["anthropic", "openai", "opencode"]) {
            expect(BUILTIN_CLOUD_PREFIXES.has(prefix)).toBe(true);
        }
    });

    test("does not include custom provider prefixes like 'shredder' or 'don'", () => {
        for (const prefix of ["shredder", "don"]) {
            expect(BUILTIN_CLOUD_PREFIXES.has(prefix)).toBe(false);
        }
    });
});
|
|
114
|
-
// ── validateFixtureCorpus ────────────────────────────────────────────────────
|
|
115
|
-
describe("validateFixtureCorpus", () => {
    /** Minimal task shape used by these cases: an id plus the fixture (stash) name it refers to. */
    const taskFor = (id, stash) => ({ id, stash });

    test("returns known fixtures as valid", () => {
        const report = validateFixtureCorpus([taskFor("az-cli/foo", "az-cli")]);
        expect(report.valid.has("az-cli")).toBe(true);
        expect(report.missing.size).toBe(0);
    });

    test("returns nonexistent fixture as missing with its task IDs", () => {
        const ghostTasks = [
            taskFor("ghost/task-1", "ghost-fixture"),
            taskFor("ghost/task-2", "ghost-fixture"),
        ];
        const report = validateFixtureCorpus(ghostTasks);
        expect(report.valid.has("ghost-fixture")).toBe(false);
        expect(report.missing.has("ghost-fixture")).toBe(true);
        // Every task that referenced the missing fixture is reported, in input order.
        expect(report.missing.get("ghost-fixture")).toEqual(["ghost/task-1", "ghost/task-2"]);
    });

    test("handles empty task list", () => {
        const report = validateFixtureCorpus([]);
        expect(report.valid.size).toBe(0);
        expect(report.missing.size).toBe(0);
    });

    test("deduplicates fixture names across tasks", () => {
        const sameFixtureTasks = [
            taskFor("az-cli/a", "az-cli"),
            taskFor("az-cli/b", "az-cli"),
            taskFor("az-cli/c", "az-cli"),
        ];
        const report = validateFixtureCorpus(sameFixtureTasks);
        expect(report.valid.size).toBe(1);
    });
});
|
|
147
|
-
// ── setupBenchEnvironment (dryRun) ───────────────────────────────────────────
|
|
148
|
-
describe("setupBenchEnvironment dryRun", () => {
    /**
     * Build a bench environment, hand it to `check`, and always tear it down
     * afterwards — even when an assertion inside `check` throws.
     */
    const withBenchEnv = (options, check) => {
        const benchEnv = setupBenchEnvironment(options);
        try {
            check(benchEnv);
        }
        finally {
            benchEnv.teardown();
        }
    };

    test("creates isolation dirs and writes opencode.json with invariants", () => {
        const options = { model: "anthropic/claude-opus-4-7", arm: "akm", dryRun: true };
        withBenchEnv(options, ({ dirs }) => {
            expect(fs.existsSync(dirs.cacheHome)).toBe(true);
            expect(fs.existsSync(dirs.configHome)).toBe(true);
            expect(fs.existsSync(dirs.opencodeConfig)).toBe(true);
            const configPath = path.join(dirs.opencodeConfig, "opencode.json");
            const written = JSON.parse(fs.readFileSync(configPath, "utf8"));
            expect(written.plugin).toEqual([]);
            expect(written.permission?.bash).toBe("allow");
        });
    });

    test("throws for custom provider prefix without providers config", () => {
        const attempt = () => setupBenchEnvironment({
            model: "shredder/qwen/qwen3.5-9b",
            arm: "akm",
            dryRun: true,
        });
        expect(attempt).toThrow(/custom provider prefix/);
    });

    test("synthetic arm never sets AKM_STASH_DIR", () => {
        const options = {
            model: "anthropic/claude-opus-4-7",
            arm: "synthetic",
            // A stash dir is supplied on purpose: the synthetic arm must not export it.
            stashDir: "/some/stash",
            dryRun: true,
        };
        withBenchEnv(options, (benchEnv) => {
            expect(benchEnv.env.AKM_STASH_DIR).toBeUndefined();
        });
    });

    test("teardown removes the isolation dirs", () => {
        const benchEnv = setupBenchEnvironment({
            model: "anthropic/claude-opus-4-7",
            arm: "akm",
            dryRun: true,
        });
        const isolationRoot = benchEnv.dirs.root;
        expect(fs.existsSync(isolationRoot)).toBe(true);
        benchEnv.teardown();
        expect(fs.existsSync(isolationRoot)).toBe(false);
    });
});
|
|
@@ -1,179 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* Track B lesson quality + reuse metrics (issue #264, spec §6.3 follow-up).
|
|
3
|
-
*
|
|
4
|
-
* `computeLessonMetrics` walks the evolve runner's proposal log and the
|
|
5
|
-
* Phase 3 pre/post arm `RunResult[]`s and emits one `LessonRecord` per
|
|
6
|
-
* lesson-kind proposal. The record captures:
|
|
7
|
-
*
|
|
8
|
-
* - `source_failures` — eval/train tasks whose negative feedback events
|
|
9
|
-
* targeted this asset ref (joined via the supplied `feedbackLog`).
|
|
10
|
-
* - `lint_pass` / `accepted` — verbatim from the proposal log entry.
|
|
11
|
-
* - `first_reused_on` / `reuse_count` / `reuse_pass_rate` — how often the
|
|
12
|
-
* accepted lesson's ref appeared in post-arm runs' `assetsLoaded`, and
|
|
13
|
-
* the pass-rate among those reuses.
|
|
14
|
-
* - `negative_transfer_count` — count of distinct tasks for which some seed
* PASSED in pre but FAILED (or exceeded budget) in post AND that post run
* loaded this lesson's ref. Spec §6.4 negative-transfer attribution.
|
|
17
|
-
* - `leakage_risk` — `"high"` when any verbatim 4-token-or-longer phrase
|
|
18
|
-
* in the supplied verifier source(s) appears verbatim in the lesson
|
|
19
|
-
* body; `"medium"` for 3-token leakage; `"low"` otherwise. Mirrors the
|
|
20
|
-
* Wave 3 `leakage.test.ts` philosophy: structural fragments are red
|
|
21
|
-
* flags, lone tokens are not.
|
|
22
|
-
*
|
|
23
|
-
* The function is pure: no disk I/O, no subprocess. Callers (the evolve
|
|
24
|
-
* runner) thread lesson bodies + verifier sources through optional maps so
|
|
25
|
-
* the leakage check is fully deterministic and testable with mock inputs.
|
|
26
|
-
*/
|
|
27
|
-
/**
 * Build one record per lesson-kind proposal plus run-level aggregate rates.
 *
 * Inputs read from `input`:
 *  - `proposalLog` (required) — only entries whose `kind` is `"lesson"` are used.
 *  - `feedbackLog`, `preRuns`, `postRuns` — treated as empty when null/undefined.
 *  - `lessonBodies`, `verifierSources` — maps keyed by asset ref; treated as
 *    empty objects when null/undefined and forwarded to `classifyLeakageRisk`.
 *
 * Reuse and negative-transfer figures are only gathered for proposals whose
 * `decision` is `"accept"`; every other proposal reports zero reuse.
 *
 * @param {object} input - Evolve-runner outputs described above.
 * @returns {object} `{ lessons, lessons_created_count, lessons_accepted_count,
 *   proposal_lint_pass_rate, proposal_acceptance_rate, lesson_reuse_rate,
 *   lesson_reuse_success_rate, lesson_negative_transfer_count }`, where
 *   `lessons` is sorted by `ref` and every rate is 0 when its denominator is 0.
 */
export function computeLessonMetrics(input) {
    const lessonProposals = input.proposalLog.filter((proposal) => proposal.kind === "lesson");
    const feedbackLog = input.feedbackLog ?? [];
    const preRuns = input.preRuns ?? [];
    const postRuns = input.postRuns ?? [];
    const lessonBodies = input.lessonBodies ?? {};
    const verifierSources = input.verifierSources ?? {};

    // taskId -> (seed -> outcome) for the pre arm, so each post run can look up
    // its own pre-arm result directly.
    const preOutcomeBySeedByTask = new Map();
    for (const run of preRuns) {
        if (!preOutcomeBySeedByTask.has(run.taskId)) {
            preOutcomeBySeedByTask.set(run.taskId, new Map());
        }
        preOutcomeBySeedByTask.get(run.taskId).set(run.seed, run.outcome);
    }

    // goldRef -> set of task ids that produced a negative feedback event for it.
    const failedTasksByRef = new Map();
    for (const event of feedbackLog) {
        if (event.signal === "negative") {
            const taskIds = failedTasksByRef.get(event.goldRef) ?? new Set();
            taskIds.add(event.taskId);
            failedTasksByRef.set(event.goldRef, taskIds);
        }
    }

    // A post run regressed when it failed (or ran out of budget) while the
    // pre-arm run for the same task and seed passed.
    const regressedFromPre = (run) => {
        const failedInPost = run.outcome === "fail" || run.outcome === "budget_exceeded";
        return failedInPost && preOutcomeBySeedByTask.get(run.taskId)?.get(run.seed) === "pass";
    };

    const toRecord = (proposal) => {
        const ref = proposal.assetRef;
        const accepted = proposal.decision === "accept";
        const sourceFailures = Array.from(failedTasksByRef.get(ref) ?? []).sort();

        let firstReusedOn = null;
        let reuseCount = 0;
        let reusePassCount = 0;
        // A Set so a task regressing on several seeds is counted once.
        const regressedTaskIds = new Set();

        // Non-accepted lessons never scan the post arm at all.
        for (const run of accepted ? postRuns : []) {
            if (!run.assetsLoaded?.includes(ref)) {
                continue;
            }
            if (firstReusedOn === null) {
                firstReusedOn = run.taskId;
            }
            reuseCount += 1;
            if (run.outcome === "pass") {
                reusePassCount += 1;
            }
            if (regressedFromPre(run)) {
                regressedTaskIds.add(run.taskId);
            }
        }

        return {
            ref,
            source_failures: sourceFailures,
            lint_pass: proposal.lintPass,
            accepted,
            first_reused_on: firstReusedOn,
            reuse_count: reuseCount,
            reuse_pass_rate: reuseCount === 0 ? 0 : reusePassCount / reuseCount,
            negative_transfer_count: regressedTaskIds.size,
            leakage_risk: classifyLeakageRisk(lessonBodies[ref], verifierSources[ref]),
        };
    };

    const records = lessonProposals.map((proposal) => toRecord(proposal));
    records.sort((left, right) => left.ref.localeCompare(right.ref));

    // Single pass over the sorted records to collect every aggregate numerator.
    let acceptedCount = 0;
    let lintPassedCount = 0;
    let reusedAcceptedCount = 0;
    let reusePassRateSum = 0;
    let negativeTransferTotal = 0;
    for (const record of records) {
        negativeTransferTotal += record.negative_transfer_count;
        if (record.lint_pass) {
            lintPassedCount += 1;
        }
        if (!record.accepted) {
            continue;
        }
        acceptedCount += 1;
        if (record.reuse_count > 0) {
            reusedAcceptedCount += 1;
            reusePassRateSum += record.reuse_pass_rate;
        }
    }

    const createdCount = records.length;
    const ratio = (numerator, denominator) => (denominator === 0 ? 0 : numerator / denominator);
    return {
        lessons: records,
        lessons_created_count: createdCount,
        lessons_accepted_count: acceptedCount,
        proposal_lint_pass_rate: ratio(lintPassedCount, createdCount),
        proposal_acceptance_rate: ratio(acceptedCount, createdCount),
        lesson_reuse_rate: ratio(reusedAcceptedCount, acceptedCount),
        // Mean of the per-lesson pass rates, taken over accepted lessons that were reused.
        lesson_reuse_success_rate: ratio(reusePassRateSum, reusedAcceptedCount),
        lesson_negative_transfer_count: negativeTransferTotal,
    };
}
|
|
126
|
-
/**
 * Rate how much of the verifier text a lesson body repeats word-for-word.
 *
 * Both sides are run through `tokenize`, and the body is re-joined with single
 * spaces (padded at both ends) so phrase matching is done on whole tokens.
 *
 * @param {string} body - Lesson body text; a falsy value yields `"low"`.
 * @param {Iterable<string>} verifierSources - Verifier source texts; a falsy or
 *   zero-length value yields `"low"`.
 * @returns {"high"|"medium"|"low"} `"high"` as soon as any source shares a run
 *   of 4 consecutive tokens with the body; otherwise `"medium"` if any source
 *   shares a run of 3; otherwise `"low"`.
 */
export function classifyLeakageRisk(body, verifierSources) {
    const nothingToCompare = !body || !verifierSources || verifierSources.length === 0;
    if (nothingToCompare) {
        return "low";
    }

    const bodyTokens = tokenize(body);
    if (bodyTokens.length === 0) {
        return "low";
    }
    const paddedBody = ` ${bodyTokens.join(" ")} `;

    let risk = "low";
    for (const source of verifierSources) {
        const sourceTokens = tokenize(source);
        // Sources with fewer than three tokens cannot produce even a medium hit.
        if (sourceTokens.length >= 3) {
            if (containsNGram(paddedBody, sourceTokens, 4)) {
                return "high";
            }
            // Once a 3-token hit is recorded, later sources are only checked for 4-token hits.
            if (risk === "low" && containsNGram(paddedBody, sourceTokens, 3)) {
                risk = "medium";
            }
        }
    }
    return risk;
}
|
|
157
|
-
/**
 * Report whether any run of `n` consecutive `tokens` occurs in `bodyJoined`.
 *
 * Each candidate run is joined with single spaces and wrapped in a leading and
 * trailing space before the substring search, so `bodyJoined` must be
 * space-separated and space-padded in the same way for whole-token matching.
 *
 * @param {string} bodyJoined - Space-joined, space-padded body tokens.
 * @param {string[]} tokens - Tokens to take windows from.
 * @param {number} n - Window size in tokens.
 * @returns {boolean} `true` on the first matching window; `false` when there
 *   are fewer than `n` tokens or no window matches.
 */
function containsNGram(bodyJoined, tokens, n) {
    const total = tokens.length;
    if (n > total) {
        return false;
    }
    let start = 0;
    while (start + n <= total) {
        const windowText = tokens.slice(start, start + n).join(" ");
        if (bodyJoined.includes(` ${windowText} `)) {
            return true;
        }
        start += 1;
    }
    return false;
}
|
|
173
|
-
/**
 * Lowercase `text` and return its maximal runs of `a-z`, `0-9` and `_`, in order.
 * Every other character acts as a separator; text with no such run yields `[]`.
 */
function tokenize(text) {
    const runs = text.toLowerCase().match(/[a-z0-9_]+/g);
    // `match` with the global flag yields null, not an empty array, when nothing matches.
    return runs === null ? [] : runs;
}
|