akm-cli 0.7.0-rc1 → 0.7.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{src/cli.js → cli.js} +100 -16
- package/dist/{src/commands → commands}/config-cli.js +42 -0
- package/dist/{src/commands → commands}/history.js +78 -7
- package/dist/{src/commands → commands}/registry-search.js +69 -6
- package/dist/{src/commands → commands}/search.js +30 -3
- package/dist/{src/commands → commands}/show.js +29 -0
- package/dist/{src/commands → commands}/source-add.js +5 -1
- package/dist/{src/commands → commands}/source-manage.js +7 -1
- package/dist/{src/core → core}/config.js +28 -0
- package/dist/{src/indexer → indexer}/db-search.js +1 -0
- package/dist/{src/indexer → indexer}/indexer.js +16 -2
- package/dist/{src/indexer → indexer}/matchers.js +1 -1
- package/dist/{src/indexer → indexer}/search-source.js +4 -2
- package/dist/{src/integrations → integrations}/agent/profiles.js +1 -1
- package/dist/{src/integrations → integrations}/agent/spawn.js +67 -16
- package/dist/{src/integrations → integrations}/github.js +9 -3
- package/dist/{src/llm → llm}/embedders/remote.js +37 -3
- package/dist/{src/output → output}/cli-hints.js +15 -2
- package/dist/{src/output → output}/renderers.js +3 -1
- package/dist/{src/output → output}/shapes.js +8 -1
- package/dist/{src/output → output}/text.js +156 -3
- package/dist/{src/registry → registry}/build-index.js +5 -4
- package/dist/{src/registry → registry}/providers/static-index.js +3 -1
- package/dist/{src/setup → setup}/setup.js +9 -0
- package/dist/{src/wiki → wiki}/wiki.js +54 -6
- package/dist/{src/workflows → workflows}/runs.js +37 -3
- package/package.json +8 -8
- package/dist/tests/add-website-source.test.js +0 -119
- package/dist/tests/agent/agent-config-loader.test.js +0 -70
- package/dist/tests/agent/agent-config.test.js +0 -221
- package/dist/tests/agent/agent-detect.test.js +0 -100
- package/dist/tests/agent/agent-spawn.test.js +0 -234
- package/dist/tests/agent-output.test.js +0 -186
- package/dist/tests/architecture/agent-no-llm-sdk-guard.test.js +0 -103
- package/dist/tests/architecture/agent-spawn-seam.test.js +0 -193
- package/dist/tests/architecture/llm-stateless-seam.test.js +0 -112
- package/dist/tests/asset-ref.test.js +0 -192
- package/dist/tests/asset-registry.test.js +0 -103
- package/dist/tests/asset-spec.test.js +0 -241
- package/dist/tests/bench/attribution.test.js +0 -995
- package/dist/tests/bench/cleanup-sigint.test.js +0 -83
- package/dist/tests/bench/cleanup.js +0 -203
- package/dist/tests/bench/cleanup.test.js +0 -166
- package/dist/tests/bench/cli.js +0 -683
- package/dist/tests/bench/cli.test.js +0 -177
- package/dist/tests/bench/compare.test.js +0 -556
- package/dist/tests/bench/corpus.js +0 -314
- package/dist/tests/bench/corpus.test.js +0 -258
- package/dist/tests/bench/driver.js +0 -346
- package/dist/tests/bench/driver.test.js +0 -443
- package/dist/tests/bench/evolve-metrics.js +0 -179
- package/dist/tests/bench/evolve-metrics.test.js +0 -187
- package/dist/tests/bench/evolve.js +0 -580
- package/dist/tests/bench/evolve.test.js +0 -616
- package/dist/tests/bench/failure-modes.test.js +0 -300
- package/dist/tests/bench/feedback-integrity.test.js +0 -456
- package/dist/tests/bench/leakage.test.js +0 -125
- package/dist/tests/bench/learning-curve.test.js +0 -133
- package/dist/tests/bench/metrics.js +0 -2319
- package/dist/tests/bench/metrics.test.js +0 -1144
- package/dist/tests/bench/no-os-tmpdir-invariant.test.js +0 -43
- package/dist/tests/bench/report.js +0 -1821
- package/dist/tests/bench/report.test.js +0 -989
- package/dist/tests/bench/runner.js +0 -536
- package/dist/tests/bench/runner.test.js +0 -958
- package/dist/tests/bench/search-bridge.test.js +0 -331
- package/dist/tests/bench/tmp.js +0 -41
- package/dist/tests/bench/trajectory.js +0 -116
- package/dist/tests/bench/trajectory.test.js +0 -127
- package/dist/tests/bench/verifier.js +0 -109
- package/dist/tests/bench/verifier.test.js +0 -118
- package/dist/tests/bench/workflow-evaluator.js +0 -557
- package/dist/tests/bench/workflow-evaluator.test.js +0 -421
- package/dist/tests/bench/workflow-spec.js +0 -358
- package/dist/tests/bench/workflow-spec.test.js +0 -363
- package/dist/tests/bench/workflow-trace.js +0 -438
- package/dist/tests/bench/workflow-trace.test.js +0 -254
- package/dist/tests/benchmark-search-quality.js +0 -536
- package/dist/tests/benchmark-suite.js +0 -1441
- package/dist/tests/capture-cli.test.js +0 -112
- package/dist/tests/cli-errors.test.js +0 -203
- package/dist/tests/commands/events.test.js +0 -370
- package/dist/tests/commands/history.test.js +0 -223
- package/dist/tests/commands/import.test.js +0 -103
- package/dist/tests/commands/proposal-cli.test.js +0 -209
- package/dist/tests/commands/reflect-propose-cli.test.js +0 -333
- package/dist/tests/commands/remember.test.js +0 -97
- package/dist/tests/commands/scope-flags.test.js +0 -300
- package/dist/tests/commands/search.test.js +0 -537
- package/dist/tests/commands/show-indexer-parity.test.js +0 -117
- package/dist/tests/commands/show.test.js +0 -294
- package/dist/tests/common.test.js +0 -266
- package/dist/tests/completions.test.js +0 -142
- package/dist/tests/config-cli.test.js +0 -193
- package/dist/tests/config-llm-features.test.js +0 -139
- package/dist/tests/config.test.js +0 -544
- package/dist/tests/contracts/migration-baseline.test.js +0 -43
- package/dist/tests/contracts/reflect-propose-envelope.test.js +0 -139
- package/dist/tests/contracts/spec-helpers.js +0 -46
- package/dist/tests/contracts/v1-spec-section-11-proposal-queue.test.js +0 -228
- package/dist/tests/contracts/v1-spec-section-12-agent-config.test.js +0 -56
- package/dist/tests/contracts/v1-spec-section-13-lesson-type.test.js +0 -34
- package/dist/tests/contracts/v1-spec-section-14-llm-features.test.js +0 -94
- package/dist/tests/contracts/v1-spec-section-4-1-asset-types.test.js +0 -39
- package/dist/tests/contracts/v1-spec-section-4-2-quality-rules.test.js +0 -44
- package/dist/tests/contracts/v1-spec-section-5-configuration.test.js +0 -47
- package/dist/tests/contracts/v1-spec-section-6-orchestration.test.js +0 -40
- package/dist/tests/contracts/v1-spec-section-7-module-layout.test.js +0 -58
- package/dist/tests/contracts/v1-spec-section-8-extension-points.test.js +0 -34
- package/dist/tests/contracts/v1-spec-section-9-4-cli-surface.test.js +0 -75
- package/dist/tests/contracts/v1-spec-section-9-7-llm-agent-boundary.test.js +0 -36
- package/dist/tests/core/write-source.test.js +0 -366
- package/dist/tests/curate-command.test.js +0 -87
- package/dist/tests/db-scoring.test.js +0 -201
- package/dist/tests/db.test.js +0 -654
- package/dist/tests/distill-cli-flag.test.js +0 -208
- package/dist/tests/distill.test.js +0 -515
- package/dist/tests/docker-install.test.js +0 -120
- package/dist/tests/e2e.test.js +0 -1398
- package/dist/tests/embedder.test.js +0 -340
- package/dist/tests/embedding-model-config.test.js +0 -379
- package/dist/tests/feedback-command.test.js +0 -172
- package/dist/tests/file-context.test.js +0 -552
- package/dist/tests/fixtures/scripts/git/summarize-diff.js +0 -9
- package/dist/tests/fixtures/scripts/lint/eslint-check.js +0 -7
- package/dist/tests/fixtures/stashes/load.js +0 -166
- package/dist/tests/fixtures/stashes/load.test.js +0 -88
- package/dist/tests/fixtures/stashes/ranking-baseline/scripts/mem0-search.js +0 -12
- package/dist/tests/frontmatter.test.js +0 -190
- package/dist/tests/fts-field-weighting.test.js +0 -254
- package/dist/tests/fuzzy-search.test.js +0 -230
- package/dist/tests/git-provider-clone.test.js +0 -45
- package/dist/tests/github.test.js +0 -161
- package/dist/tests/graph-boost-ranking.test.js +0 -305
- package/dist/tests/graph-extraction.test.js +0 -282
- package/dist/tests/helpers/usage-events.js +0 -8
- package/dist/tests/index-pass-llm.test.js +0 -161
- package/dist/tests/indexer.test.js +0 -559
- package/dist/tests/info-command.test.js +0 -166
- package/dist/tests/init.test.js +0 -69
- package/dist/tests/install-script.test.js +0 -246
- package/dist/tests/integration/agent-real-profile.test.js +0 -94
- package/dist/tests/issue-36-repro.test.js +0 -304
- package/dist/tests/issues-191-194.test.js +0 -160
- package/dist/tests/lesson-lint.test.js +0 -111
- package/dist/tests/llm-client.test.js +0 -115
- package/dist/tests/llm-feature-gate.test.js +0 -151
- package/dist/tests/llm.test.js +0 -139
- package/dist/tests/lockfile.test.js +0 -216
- package/dist/tests/manifest.test.js +0 -205
- package/dist/tests/markdown.test.js +0 -126
- package/dist/tests/matchers-unit.test.js +0 -189
- package/dist/tests/memory-inference.test.js +0 -299
- package/dist/tests/merge-scoring.test.js +0 -136
- package/dist/tests/metadata.test.js +0 -313
- package/dist/tests/migration-help.test.js +0 -89
- package/dist/tests/origin-resolve.test.js +0 -124
- package/dist/tests/output-baseline.test.js +0 -217
- package/dist/tests/output-shapes-unit.test.js +0 -476
- package/dist/tests/parallel-search.test.js +0 -272
- package/dist/tests/parameter-metadata.test.js +0 -365
- package/dist/tests/paths.test.js +0 -177
- package/dist/tests/progressive-disclosure.test.js +0 -280
- package/dist/tests/proposals.test.js +0 -279
- package/dist/tests/proposed-quality.test.js +0 -271
- package/dist/tests/provider-registry.test.js +0 -32
- package/dist/tests/ranking-regression.test.js +0 -548
- package/dist/tests/reflect-propose.test.js +0 -455
- package/dist/tests/registry-build-index.test.js +0 -378
- package/dist/tests/registry-cli.test.js +0 -290
- package/dist/tests/registry-index-v2.test.js +0 -430
- package/dist/tests/registry-install.test.js +0 -728
- package/dist/tests/registry-providers/parity.test.js +0 -189
- package/dist/tests/registry-providers/skills-sh.test.js +0 -309
- package/dist/tests/registry-providers/static-index.test.js +0 -204
- package/dist/tests/registry-resolve.test.js +0 -126
- package/dist/tests/registry-search.test.js +0 -723
- package/dist/tests/remember-frontmatter.test.js +0 -380
- package/dist/tests/remember-unit.test.js +0 -123
- package/dist/tests/ripgrep-install.test.js +0 -251
- package/dist/tests/ripgrep-resolve.test.js +0 -108
- package/dist/tests/ripgrep.test.js +0 -163
- package/dist/tests/save-command.test.js +0 -94
- package/dist/tests/save-trust-qa-fixes.test.js +0 -270
- package/dist/tests/scoring-pipeline.test.js +0 -648
- package/dist/tests/search-include-proposed-cli.test.js +0 -118
- package/dist/tests/self-update.test.js +0 -442
- package/dist/tests/semantic-search-e2e.test.js +0 -512
- package/dist/tests/semantic-status.test.js +0 -471
- package/dist/tests/setup-run.integration.js +0 -877
- package/dist/tests/setup-wizard.test.js +0 -198
- package/dist/tests/setup.test.js +0 -131
- package/dist/tests/source-add.test.js +0 -11
- package/dist/tests/source-clone.test.js +0 -254
- package/dist/tests/source-manage.test.js +0 -366
- package/dist/tests/source-providers/filesystem.test.js +0 -82
- package/dist/tests/source-providers/git.test.js +0 -252
- package/dist/tests/source-providers/website.test.js +0 -128
- package/dist/tests/source-qa-fixes.test.js +0 -268
- package/dist/tests/source-registry.test.js +0 -350
- package/dist/tests/source-resolve.test.js +0 -100
- package/dist/tests/source-source.test.js +0 -221
- package/dist/tests/source.test.js +0 -533
- package/dist/tests/tar-utils-scan.test.js +0 -73
- package/dist/tests/toggle-components.test.js +0 -73
- package/dist/tests/usage-telemetry.test.js +0 -265
- package/dist/tests/utility-scoring.test.js +0 -558
- package/dist/tests/vault-load-error.test.js +0 -78
- package/dist/tests/vault-qa-fixes.test.js +0 -194
- package/dist/tests/vault.test.js +0 -429
- package/dist/tests/vector-search.test.js +0 -608
- package/dist/tests/walker.test.js +0 -252
- package/dist/tests/wave2-cluster-bc.test.js +0 -228
- package/dist/tests/wave2-cluster-d.test.js +0 -180
- package/dist/tests/wave2-cluster-e.test.js +0 -179
- package/dist/tests/wiki-qa-fixes.test.js +0 -270
- package/dist/tests/wiki.test.js +0 -529
- package/dist/tests/workflow-cli.test.js +0 -271
- package/dist/tests/workflow-markdown.test.js +0 -171
- package/dist/tests/workflow-path-escape.test.js +0 -132
- package/dist/tests/workflow-qa-fixes.test.js +0 -377
- package/dist/tests/workflows/indexer-rejection.test.js +0 -213
- /package/dist/{src/commands → commands}/completions.js +0 -0
- /package/dist/{src/commands → commands}/curate.js +0 -0
- /package/dist/{src/commands → commands}/distill.js +0 -0
- /package/dist/{src/commands → commands}/events.js +0 -0
- /package/dist/{src/commands → commands}/info.js +0 -0
- /package/dist/{src/commands → commands}/init.js +0 -0
- /package/dist/{src/commands → commands}/install-audit.js +0 -0
- /package/dist/{src/commands → commands}/installed-stashes.js +0 -0
- /package/dist/{src/commands → commands}/migration-help.js +0 -0
- /package/dist/{src/commands → commands}/proposal.js +0 -0
- /package/dist/{src/commands → commands}/propose.js +0 -0
- /package/dist/{src/commands → commands}/reflect.js +0 -0
- /package/dist/{src/commands → commands}/remember.js +0 -0
- /package/dist/{src/commands → commands}/self-update.js +0 -0
- /package/dist/{src/commands → commands}/source-clone.js +0 -0
- /package/dist/{src/commands → commands}/vault.js +0 -0
- /package/dist/{src/core → core}/asset-ref.js +0 -0
- /package/dist/{src/core → core}/asset-registry.js +0 -0
- /package/dist/{src/core → core}/asset-spec.js +0 -0
- /package/dist/{src/core → core}/common.js +0 -0
- /package/dist/{src/core → core}/errors.js +0 -0
- /package/dist/{src/core → core}/events.js +0 -0
- /package/dist/{src/core → core}/frontmatter.js +0 -0
- /package/dist/{src/core → core}/lesson-lint.js +0 -0
- /package/dist/{src/core → core}/markdown.js +0 -0
- /package/dist/{src/core → core}/paths.js +0 -0
- /package/dist/{src/core → core}/proposals.js +0 -0
- /package/dist/{src/core → core}/warn.js +0 -0
- /package/dist/{src/core → core}/write-source.js +0 -0
- /package/dist/{src/indexer → indexer}/db.js +0 -0
- /package/dist/{src/indexer → indexer}/file-context.js +0 -0
- /package/dist/{src/indexer → indexer}/graph-boost.js +0 -0
- /package/dist/{src/indexer → indexer}/graph-extraction.js +0 -0
- /package/dist/{src/indexer → indexer}/manifest.js +0 -0
- /package/dist/{src/indexer → indexer}/memory-inference.js +0 -0
- /package/dist/{src/indexer → indexer}/metadata.js +0 -0
- /package/dist/{src/indexer → indexer}/search-fields.js +0 -0
- /package/dist/{src/indexer → indexer}/semantic-status.js +0 -0
- /package/dist/{src/indexer → indexer}/usage-events.js +0 -0
- /package/dist/{src/indexer → indexer}/walker.js +0 -0
- /package/dist/{src/integrations → integrations}/agent/config.js +0 -0
- /package/dist/{src/integrations → integrations}/agent/detect.js +0 -0
- /package/dist/{src/integrations → integrations}/agent/index.js +0 -0
- /package/dist/{src/integrations → integrations}/agent/prompts.js +0 -0
- /package/dist/{src/integrations → integrations}/lockfile.js +0 -0
- /package/dist/{src/llm → llm}/client.js +0 -0
- /package/dist/{src/llm → llm}/embedder.js +0 -0
- /package/dist/{src/llm → llm}/embedders/cache.js +0 -0
- /package/dist/{src/llm → llm}/embedders/local.js +0 -0
- /package/dist/{src/llm → llm}/embedders/types.js +0 -0
- /package/dist/{src/llm → llm}/feature-gate.js +0 -0
- /package/dist/{src/llm → llm}/graph-extract.js +0 -0
- /package/dist/{src/llm → llm}/index-passes.js +0 -0
- /package/dist/{src/llm → llm}/memory-infer.js +0 -0
- /package/dist/{src/llm → llm}/metadata-enhance.js +0 -0
- /package/dist/{src/output → output}/context.js +0 -0
- /package/dist/{src/registry → registry}/create-provider-registry.js +0 -0
- /package/dist/{src/registry → registry}/factory.js +0 -0
- /package/dist/{src/registry → registry}/origin-resolve.js +0 -0
- /package/dist/{src/registry → registry}/providers/index.js +0 -0
- /package/dist/{src/registry → registry}/providers/skills-sh.js +0 -0
- /package/dist/{src/registry → registry}/providers/types.js +0 -0
- /package/dist/{src/registry → registry}/resolve.js +0 -0
- /package/dist/{src/registry → registry}/types.js +0 -0
- /package/dist/{src/setup → setup}/detect.js +0 -0
- /package/dist/{src/setup → setup}/ripgrep-install.js +0 -0
- /package/dist/{src/setup → setup}/ripgrep-resolve.js +0 -0
- /package/dist/{src/setup → setup}/steps.js +0 -0
- /package/dist/{src/sources → sources}/include.js +0 -0
- /package/dist/{src/sources → sources}/provider-factory.js +0 -0
- /package/dist/{src/sources → sources}/provider.js +0 -0
- /package/dist/{src/sources → sources}/providers/filesystem.js +0 -0
- /package/dist/{src/sources → sources}/providers/git.js +0 -0
- /package/dist/{src/sources → sources}/providers/index.js +0 -0
- /package/dist/{src/sources → sources}/providers/install-types.js +0 -0
- /package/dist/{src/sources → sources}/providers/npm.js +0 -0
- /package/dist/{src/sources → sources}/providers/provider-utils.js +0 -0
- /package/dist/{src/sources → sources}/providers/sync-from-ref.js +0 -0
- /package/dist/{src/sources → sources}/providers/tar-utils.js +0 -0
- /package/dist/{src/sources → sources}/providers/website.js +0 -0
- /package/dist/{src/sources → sources}/resolve.js +0 -0
- /package/dist/{src/sources → sources}/types.js +0 -0
- /package/dist/{src/templates → templates}/wiki-templates.js +0 -0
- /package/dist/{src/version.js → version.js} +0 -0
- /package/dist/{src/workflows → workflows}/authoring.js +0 -0
- /package/dist/{src/workflows → workflows}/cli.js +0 -0
- /package/dist/{src/workflows → workflows}/db.js +0 -0
- /package/dist/{src/workflows → workflows}/document-cache.js +0 -0
- /package/dist/{src/workflows → workflows}/parser.js +0 -0
- /package/dist/{src/workflows → workflows}/renderer.js +0 -0
- /package/dist/{src/workflows → workflows}/schema.js +0 -0
- /package/dist/{src/workflows → workflows}/validator.js +0 -0
|
@@ -1,346 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* akm-bench driver — `runOne(options)` executes a single (task, arm, seed)
|
|
3
|
-
* triple end-to-end and returns a v1 RunResult envelope.
|
|
4
|
-
*
|
|
5
|
-
* See `docs/technical/benchmark.md` §5.2 for the locked schema and §7.1/§7.2
|
|
6
|
-
* for the isolation/budget rules. The shapes here are the v1 contract that
|
|
7
|
-
* #238/#239/#240/#243 will extend without breaking.
|
|
8
|
-
*
|
|
9
|
-
* Design notes:
|
|
10
|
-
* • The driver invokes opencode through `runAgent` with the built-in
|
|
11
|
-
* `opencode` profile. No new harness abstraction.
|
|
12
|
-
* • Per-run isolation: every run gets fresh tmpdirs for `XDG_CACHE_HOME`,
|
|
13
|
-
* `XDG_CONFIG_HOME`, `OPENCODE_CONFIG`, and (when `stashDir` is provided)
|
|
14
|
-
* `AKM_STASH_DIR`. The operator's personal opencode/akm config is NEVER
|
|
15
|
-
* read or written.
|
|
16
|
-
* • Hard budgets: `budgetWallMs` is enforced via `runAgent`'s timeout. A
|
|
17
|
-
* timeout produces `outcome: "budget_exceeded"`, which is a distinct
|
|
18
|
-
* state from `fail` so cost regressions stay visible.
|
|
19
|
-
* • This issue (#236) does not need a real opencode call to work end-to-end.
|
|
20
|
-
* The harness shape, isolation, and result envelope must be correct and
|
|
21
|
-
* unit-testable with an injected fake spawn.
|
|
22
|
-
*/
|
|
23
|
-
import fs from "node:fs";
|
|
24
|
-
import path from "node:path";
|
|
25
|
-
import { BUILTIN_AGENT_PROFILE_NAMES, getBuiltinAgentProfile } from "../../src/integrations/agent/profiles";
|
|
26
|
-
import { runAgent } from "../../src/integrations/agent/spawn";
|
|
27
|
-
import { benchMkdtemp } from "./tmp";
|
|
28
|
-
import { runVerifier } from "./verifier";
|
|
29
|
-
/**
 * Operator-config env names that MUST NOT leak into per-run children.
 *
 * `buildIsolatedEnv` pins each of these names explicitly (`AKM_STASH_DIR`
 * only when a stash dir is supplied). Within this file the list itself is
 * consumed solely by the `_ISOLATED_ENV_NAMES` test export.
 */
const ISOLATED_ENV_NAMES = ["OPENCODE_CONFIG", "AKM_STASH_DIR", "XDG_CACHE_HOME", "XDG_CONFIG_HOME"];
|
|
31
|
-
/**
 * Operator-env names that are removed from the env source before the bench
 * driver hands it to `runAgent`. They belong to the operator's *interactive*
 * environment and must not reach a bench-arm child:
 *
 *  • `OPENCODE_API_KEY` / `ANTHROPIC_API_KEY` — real-money credentials. The
 *    opencode profile lists `OPENCODE_API_KEY` in `envPassthrough`, so
 *    without explicit scrubbing the operator's key would be forwarded into
 *    every (task × arm × seed) child. Bench credentials must come from the
 *    bench's own config surface, never by inheritance.
 *  • `AKM_CONFIG_DIR` — points akm at the operator's stash config. Leaking
 *    it would defeat the per-run isolation tmpdirs that
 *    `createIsolationDirs` materialises and let bench runs read the
 *    operator's writable config.
 *
 * Recurrence guard for #271 (mirrors the #243/#251 fixup pattern of pinning
 * isolation behaviour with regression tests).
 */
const SCRUBBED_OPERATOR_ENV_NAMES = ["OPENCODE_API_KEY", "ANTHROPIC_API_KEY", "AKM_CONFIG_DIR"];
/**
 * Build the `envSource` passed to `runAgent`.
 *
 * @param {Record<string, string | undefined>} [source] Environment to copy;
 *   falls back to `process.env` when nullish.
 * @returns {Record<string, string | undefined>} A shallow copy of the source
 *   with every name in `SCRUBBED_OPERATOR_ENV_NAMES` removed, so profile-level
 *   passthrough (`profile.envPassthrough`) cannot drag operator credentials or
 *   config-dir hints into the child. The copy is independent of the source:
 *   mutating it never touches the real `process.env`.
 */
export function buildSanitizedEnvSource(source) {
    const sanitized = Object.assign({}, source ?? process.env);
    SCRUBBED_OPERATOR_ENV_NAMES.forEach((scrubbedName) => {
        delete sanitized[scrubbedName];
    });
    return sanitized;
}
|
|
69
|
-
/**
 * Materialise the per-run isolation directory tree.
 *
 * Obtains a root directory from `benchMkdtemp("akm-bench-run-")` and creates
 * the `cache`, `config` and `opencode-config` sub-directories beneath it.
 *
 * @param {string | undefined} stashDir Stash directory to carry along; it is
 *   recorded verbatim as `akmStashDir` and is not created or validated here.
 * @returns {{ root: string, cacheHome: string, configHome: string,
 *   opencodeConfig: string, akmStashDir: string | undefined }}
 */
export function createIsolationDirs(stashDir) {
    const root = benchMkdtemp("akm-bench-run-");
    const [cacheHome, configHome, opencodeConfig] = ["cache", "config", "opencode-config"].map((leaf) => path.join(root, leaf));
    for (const dir of [cacheHome, configHome, opencodeConfig]) {
        fs.mkdirSync(dir, { recursive: true });
    }
    return { root, cacheHome, configHome, opencodeConfig, akmStashDir: stashDir };
}
|
|
85
|
-
/**
 * Build the env overrides passed to `runAgent`, pinning the XDG / OPENCODE
 * locations to the run's isolation directories.
 *
 * @param {{ cacheHome: string, configHome: string, opencodeConfig: string,
 *   akmStashDir?: string }} dirs Directories from `createIsolationDirs`.
 * @param {string} model Value exported as `BENCH_OPENCODE_MODEL`.
 * @returns {Record<string, string>} Env map; `AKM_STASH_DIR` is present only
 *   when `dirs.akmStashDir` is truthy.
 */
export function buildIsolatedEnv(dirs, model) {
    const { cacheHome, configHome, opencodeConfig, akmStashDir } = dirs;
    return {
        XDG_CACHE_HOME: cacheHome,
        XDG_CONFIG_HOME: configHome,
        OPENCODE_CONFIG: opencodeConfig,
        BENCH_OPENCODE_MODEL: model,
        ...(akmStashDir ? { AKM_STASH_DIR: akmStashDir } : {}),
    };
}
|
|
97
|
-
/**
 * Remove `AKM_STASH_DIR` from a child env object.
 *
 * Used on the synthetic-arm spawn path (#261): a synthetic-arm child must
 * NEVER inherit a stash, even if the harness built its env from a wider
 * copy such as `{ ...process.env, ...env }`. Recurrence guard for the #243
 * fixup pattern.
 *
 * @param {Record<string, string | undefined>} env Env map, mutated in place.
 * @returns {Record<string, string | undefined>} The same `env` object, to
 *   allow chaining.
 */
export function stripAkmStashDir(env) {
    const stashKey = "AKM_STASH_DIR";
    delete env[stashKey];
    return env;
}
|
|
110
|
-
/**
 * Patterns tried, in order, for each token direction.
 *
 *  1. Adjacent keyword forms: `input_tokens: 12`, `input tokens=12`,
 *     `tokens-input 12`, ...
 *  2. A `tokens:` / `tokens=` prefix followed later on the same line by
 *     `input=12` / `output=34`, i.e. `tokens: input=1234 output=5678`.
 *
 * None of the expressions carry the `g`/`y` flag, so they hold no
 * `lastIndex` state and are safe to share across calls.
 */
const TOKEN_COUNT_PATTERNS = {
    input: [
        /(?:input[_\s-]?tokens?|tokens?[_\s-]?input)[\s:=]+(\d+)/i,
        /\btokens?\s*[:=][^\n]*?\binput[\s:=]+(\d+)/i,
    ],
    output: [
        /(?:output[_\s-]?tokens?|tokens?[_\s-]?output)[\s:=]+(\d+)/i,
        /\btokens?\s*[:=][^\n]*?\boutput[\s:=]+(\d+)/i,
    ],
};
/** Return the first count matched by `patterns` in `text`, or `null` if none match. */
function findTokenCount(text, patterns) {
    for (const pattern of patterns) {
        const match = pattern.exec(text);
        if (match) {
            return Number.parseInt(match[1], 10);
        }
    }
    return null;
}
/**
 * Best-effort token-usage parser for opencode stdout.
 *
 * Returns numeric token counts AND a measurement status so callers can
 * distinguish a real zero (`"parsed"`, a field legitimately 0) from an
 * unparseable / absent report (`"missing"`, both fields default to 0 but
 * downstream aggregation MUST skip the run rather than treat that 0 as
 * measured).
 *
 * The harness never emits `"unsupported"` from this parser — that label is
 * stamped on results from arms that don't run a token-reporting agent
 * (e.g. the synthetic arm), and is set by the caller, not here.
 *
 * @param {string} stdout Captured agent stdout. A non-string value is treated
 *   as empty output instead of throwing.
 * @returns {{ input: number, output: number, measurement: "parsed" | "missing" }}
 *   `"parsed"` when at least one direction was found (the other defaults to
 *   0); `"missing"` when neither was found.
 */
export function parseTokenUsage(stdout) {
    // Guard against an absent stdout so the parser stays best-effort.
    const text = typeof stdout === "string" ? stdout : "";
    const input = findTokenCount(text, TOKEN_COUNT_PATTERNS.input);
    const output = findTokenCount(text, TOKEN_COUNT_PATTERNS.output);
    if (input === null && output === null) {
        return { input: 0, output: 0, measurement: "missing" };
    }
    return {
        input: input ?? 0,
        output: output ?? 0,
        measurement: "parsed",
    };
}
|
|
136
|
-
/**
|
|
137
|
-
* Maximum bytes read from events.jsonl per run. A runaway agent producing
|
|
138
|
-
* GBs of structured-log output would otherwise OOM the bench. Trajectory
|
|
139
|
-
* parsing operates on the prefix; a warning is appended when the cap is
|
|
140
|
-
* hit so the report surfaces the truncation.
|
|
141
|
-
*/
|
|
142
|
-
export const EVENTS_READ_CAP_BYTES = 16 * 1024 * 1024;
|
|
143
|
-
/**
|
|
144
|
-
* Read the events.jsonl file produced by this run, if any. The path is
|
|
145
|
-
* `<XDG_CACHE_HOME>/akm/events.jsonl` per `src/core/events.ts`.
|
|
146
|
-
*
|
|
147
|
-
* Caps the number of bytes read at `EVENTS_READ_CAP_BYTES` (16 MiB). When the
|
|
148
|
-
* file is larger, the prefix is parsed and a warning is appended to
|
|
149
|
-
* `opts.warnings` (when supplied). The trailing partial line after a
|
|
150
|
-
* truncation is dropped, since `JSON.parse` would reject it anyway.
|
|
151
|
-
*/
|
|
152
|
-
export function readRunEvents(cacheHome, opts) {
|
|
153
|
-
const eventsPath = path.join(cacheHome, "akm", "events.jsonl");
|
|
154
|
-
if (!fs.existsSync(eventsPath))
|
|
155
|
-
return [];
|
|
156
|
-
// Read up to the cap. We open the file rather than `readFileSync` so we
|
|
157
|
-
// don't allocate an arbitrarily large buffer just to throw most of it away.
|
|
158
|
-
let totalSize = 0;
|
|
159
|
-
try {
|
|
160
|
-
totalSize = fs.statSync(eventsPath).size;
|
|
161
|
-
}
|
|
162
|
-
catch {
|
|
163
|
-
return [];
|
|
164
|
-
}
|
|
165
|
-
const cap = EVENTS_READ_CAP_BYTES;
|
|
166
|
-
const truncated = totalSize > cap;
|
|
167
|
-
let text;
|
|
168
|
-
if (truncated) {
|
|
169
|
-
const buf = Buffer.alloc(cap);
|
|
170
|
-
const fd = fs.openSync(eventsPath, "r");
|
|
171
|
-
try {
|
|
172
|
-
fs.readSync(fd, buf, 0, cap, 0);
|
|
173
|
-
}
|
|
174
|
-
finally {
|
|
175
|
-
fs.closeSync(fd);
|
|
176
|
-
}
|
|
177
|
-
text = buf.toString("utf8");
|
|
178
|
-
// Drop the partial trailing line so we don't try to parse half a record.
|
|
179
|
-
const lastNl = text.lastIndexOf("\n");
|
|
180
|
-
if (lastNl !== -1)
|
|
181
|
-
text = text.slice(0, lastNl);
|
|
182
|
-
if (opts?.warnings) {
|
|
183
|
-
opts.warnings.push(`events.jsonl truncated: ${totalSize} bytes exceeds ${cap}-byte cap; trajectory computed from the prefix.`);
|
|
184
|
-
}
|
|
185
|
-
}
|
|
186
|
-
else {
|
|
187
|
-
text = fs.readFileSync(eventsPath, "utf8");
|
|
188
|
-
}
|
|
189
|
-
const out = [];
|
|
190
|
-
let id = 0;
|
|
191
|
-
for (const line of text.split("\n")) {
|
|
192
|
-
const trimmed = line.trim();
|
|
193
|
-
if (!trimmed)
|
|
194
|
-
continue;
|
|
195
|
-
try {
|
|
196
|
-
const parsed = JSON.parse(trimmed);
|
|
197
|
-
out.push({ ...parsed, id: parsed.id ?? id });
|
|
198
|
-
id += 1;
|
|
199
|
-
}
|
|
200
|
-
catch {
|
|
201
|
-
// Skip malformed lines — events stream is best-effort upstream.
|
|
202
|
-
}
|
|
203
|
-
}
|
|
204
|
-
return out;
|
|
205
|
-
}
|
|
206
|
-
/**
 * Default prompt forwarded to opencode when the caller omits one.
 *
 * @param {{ taskId: string, arm: string, workspace: string }} options
 * @returns {string} Newline-joined `Task` / `Arm` / `Workspace` header lines,
 *   followed by an akm usage hint only when `options.arm` is `"akm"`.
 */
function defaultPrompt(options) {
    const lines = [
        `Task: ${options.taskId}`,
        `Arm: ${options.arm}`,
        `Workspace: ${options.workspace}`,
    ];
    if (options.arm === "akm") {
        lines.push("An akm stash is configured via AKM_STASH_DIR. Use `akm search` and `akm show` to find relevant assets before acting.");
    }
    return lines.join("\n");
}
|
|
219
|
-
/**
 * Run a single (task, arm, seed) and return the v1 RunResult envelope.
 *
 * The function never throws on infrastructure failures — every error path
 * is captured into the returned RunResult with a stable outcome value:
 *
 *  • `"harness_error"` — profile lookup failed, isolation dirs could not be
 *    created, the agent could not be spawned / its output not parsed, the
 *    verifier runtime is missing (exit code 127), or any step threw.
 *  • `"budget_exceeded"` — the agent timed out, or a *parsed* token total
 *    exceeded `options.budgetTokens`.
 *  • `"pass"` / `"fail"` — decided solely by the verifier's exit code.
 *
 * @param {object} options Run description. Fields read here: `taskId`,
 *   `arm`, `seed`, `model`, `stashDir`, `prompt`, `workspace`,
 *   `budgetWallMs`, `budgetTokens`, `taskDir`, `verifier`, `expectedMatch`,
 *   and the optional `spawn` (injected spawn implementation) and `warnings`
 *   (array that collects non-fatal warnings).
 * @returns {Promise<object>} The RunResult envelope (`schemaVersion: 1`).
 */
export async function runOne(options) {
    // Stamp a baseline result; we mutate fields below as the run progresses.
    const result = {
        schemaVersion: 1,
        taskId: options.taskId,
        arm: options.arm,
        seed: options.seed,
        model: options.model,
        outcome: "harness_error",
        tokens: { input: 0, output: 0 },
        tokenMeasurement: "missing",
        wallclockMs: 0,
        trajectory: { correctAssetLoaded: null, feedbackRecorded: null },
        events: [],
        verifierStdout: "",
        verifierExitCode: -1,
        assetsLoaded: [],
    };
    // Look up the built-in opencode profile defensively. The lookup is a pure
    // map read today, but wrapping it preserves the doc-comment guarantee that
    // runOne never throws on infrastructure failures even if the registry
    // shape changes. A missing/throwing profile becomes harness_error.
    let profile;
    try {
        profile = getBuiltinAgentProfile("opencode");
    }
    catch (err) {
        result.verifierStdout = `harness: getBuiltinAgentProfile("opencode") threw: ${err instanceof Error ? err.message : String(err)}`;
        return result;
    }
    if (!profile) {
        result.verifierStdout = `harness: built-in agent profile "opencode" missing; available: ${BUILTIN_AGENT_PROFILE_NAMES.join(", ")}`;
        return result;
    }
    // #261: synthetic-arm runs MUST NOT carry AKM_STASH_DIR. We refuse to
    // forward a stashDir for the synthetic arm even when the caller mistakenly
    // supplies one, and we explicitly delete the key from the built env so the
    // operator's real AKM_STASH_DIR can never leak in through any parent-env
    // inheritance the harness happens to do downstream. Recurrence guard for
    // the #243 fixup pattern.
    const stashDir = options.arm === "synthetic" ? undefined : options.stashDir;
    // Declared outside the try so `finally` can clean up only what was
    // actually created.
    let dirs;
    try {
        // Creating the isolation tree touches the filesystem and may throw;
        // it lives inside the try so the failure becomes a harness_error.
        dirs = createIsolationDirs(stashDir);
        const env = buildIsolatedEnv(dirs, options.model);
        if (options.arm === "synthetic") {
            stripAkmStashDir(env);
        }
        const agentResult = await runAgent(profile, options.prompt ?? defaultPrompt(options), {
            env,
            // #271: scrub operator credentials + config-dir hints from the env
            // source BEFORE profile.envPassthrough copies them into the child.
            // Without this, OPENCODE_API_KEY (in opencode's passthrough list) and
            // AKM_CONFIG_DIR (read by akm at startup) would leak the operator's
            // interactive environment into every bench child.
            envSource: buildSanitizedEnvSource(),
            cwd: options.workspace,
            timeoutMs: options.budgetWallMs,
            stdio: "captured",
            ...(options.spawn ? { spawn: options.spawn } : {}),
        });
        result.wallclockMs = agentResult.durationMs;
        const parsed = parseTokenUsage(agentResult.stdout);
        result.tokens = { input: parsed.input, output: parsed.output };
        result.tokenMeasurement = parsed.measurement;
        result.events = readRunEvents(dirs.cacheHome, { warnings: options.warnings });
        if (!agentResult.ok) {
            if (agentResult.reason === "timeout") {
                result.outcome = "budget_exceeded";
                return result;
            }
            // spawn_failed / parse_error mean the harness itself broke; the
            // verifier never saw a meaningful workspace.
            if (agentResult.reason === "spawn_failed" || agentResult.reason === "parse_error") {
                result.outcome = "harness_error";
                return result;
            }
            // non_zero_exit from the agent: intentionally falls through to the
            // verifier path. Per spec §5.3 ("deterministic verifiers, never LLM"),
            // the agent is the system under test, not the judge — its exit code
            // does not gate verification. The verifier always runs against
            // whatever workspace state the agent left behind, even on a crash.
        }
        // Token-budget enforcement is best-effort: only mark `budget_exceeded`
        // if measurement was actually parsed (issue #252) AND the total exceeds
        // the cap. A `"missing"` / `"unsupported"` measurement MUST NOT silently
        // mask a budget overrun as a pass — it leaves the verifier to decide.
        if (result.tokenMeasurement === "parsed") {
            const totalTokens = result.tokens.input + result.tokens.output;
            if (totalTokens > options.budgetTokens) {
                result.outcome = "budget_exceeded";
                return result;
            }
        }
        const verifierResult = await runVerifier(options.taskDir, options.workspace, options.verifier, {
            agentStdout: agentResult.stdout,
            expectedMatch: options.expectedMatch,
            ...(options.spawn ? { spawn: options.spawn } : {}),
        });
        result.verifierStdout = verifierResult.stdout;
        result.verifierExitCode = verifierResult.exitCode;
        if (verifierResult.exitCode === 127) {
            // Missing runtime (e.g. pytest not on PATH) — not the agent's fault.
            result.outcome = "harness_error";
        }
        else {
            result.outcome = verifierResult.exitCode === 0 ? "pass" : "fail";
        }
        return result;
    }
    catch (err) {
        // Any unexpected throw (isolation setup, runAgent, event reading,
        // verifier) is infrastructure failure, not a verdict on the agent.
        // Fields populated before the throw are kept for diagnosis.
        result.outcome = "harness_error";
        result.verifierStdout = `harness: run aborted by unexpected error: ${err instanceof Error ? err.message : String(err)}`;
        return result;
    }
    finally {
        // Always tear down the isolation tmpdir. Events were copied into
        // `result.events` before this point, so deletion is safe. A cleanup
        // failure must not replace the result with a thrown error, so it is
        // downgraded to a warning.
        if (dirs) {
            try {
                fs.rmSync(dirs.root, { recursive: true, force: true });
            }
            catch (err) {
                options.warnings?.push(`failed to remove isolation dir ${dirs.root}: ${err instanceof Error ? err.message : String(err)}`);
            }
        }
    }
}
|
|
340
|
-
/**
 * Exposed for the unit test that asserts operator env never leaks.
 * This is the same array instance as `ISOLATED_ENV_NAMES`, not a copy.
 */
export const _ISOLATED_ENV_NAMES = ISOLATED_ENV_NAMES;
/**
 * Exposed for the #271 regression test that asserts operator credentials +
 * `AKM_CONFIG_DIR` never reach a bench-arm child via profile.envPassthrough.
 * This is the same array instance as `SCRUBBED_OPERATOR_ENV_NAMES`, not a copy.
 */
export const _SCRUBBED_OPERATOR_ENV_NAMES = SCRUBBED_OPERATOR_ENV_NAMES;
|