akm-cli 0.7.0 → 0.7.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +8 -8
- package/dist/tests/add-website-source.test.js +0 -119
- package/dist/tests/agent/agent-config-loader.test.js +0 -70
- package/dist/tests/agent/agent-config.test.js +0 -221
- package/dist/tests/agent/agent-detect.test.js +0 -100
- package/dist/tests/agent/agent-spawn.test.js +0 -234
- package/dist/tests/agent-output.test.js +0 -186
- package/dist/tests/architecture/agent-no-llm-sdk-guard.test.js +0 -103
- package/dist/tests/architecture/agent-spawn-seam.test.js +0 -193
- package/dist/tests/architecture/llm-stateless-seam.test.js +0 -112
- package/dist/tests/asset-ref.test.js +0 -192
- package/dist/tests/asset-registry.test.js +0 -103
- package/dist/tests/asset-spec.test.js +0 -241
- package/dist/tests/bench/attribution.test.js +0 -996
- package/dist/tests/bench/cleanup-sigint.test.js +0 -83
- package/dist/tests/bench/cleanup.js +0 -234
- package/dist/tests/bench/cleanup.test.js +0 -166
- package/dist/tests/bench/cli.js +0 -1018
- package/dist/tests/bench/cli.test.js +0 -445
- package/dist/tests/bench/compare.test.js +0 -556
- package/dist/tests/bench/corpus.js +0 -317
- package/dist/tests/bench/corpus.test.js +0 -258
- package/dist/tests/bench/doctor.js +0 -525
- package/dist/tests/bench/driver.js +0 -401
- package/dist/tests/bench/driver.test.js +0 -584
- package/dist/tests/bench/environment.js +0 -233
- package/dist/tests/bench/environment.test.js +0 -199
- package/dist/tests/bench/evolve-metrics.js +0 -179
- package/dist/tests/bench/evolve-metrics.test.js +0 -187
- package/dist/tests/bench/evolve.js +0 -647
- package/dist/tests/bench/evolve.test.js +0 -624
- package/dist/tests/bench/failure-modes.test.js +0 -349
- package/dist/tests/bench/feedback-integrity.test.js +0 -457
- package/dist/tests/bench/leakage.test.js +0 -228
- package/dist/tests/bench/learning-curve.test.js +0 -134
- package/dist/tests/bench/metrics.js +0 -2395
- package/dist/tests/bench/metrics.test.js +0 -1150
- package/dist/tests/bench/no-os-tmpdir-invariant.test.js +0 -43
- package/dist/tests/bench/opencode-config.js +0 -194
- package/dist/tests/bench/opencode-config.test.js +0 -370
- package/dist/tests/bench/report.js +0 -1885
- package/dist/tests/bench/report.test.js +0 -1038
- package/dist/tests/bench/run-config.js +0 -355
- package/dist/tests/bench/run-config.test.js +0 -298
- package/dist/tests/bench/run-curate-test.js +0 -32
- package/dist/tests/bench/run-failing-tasks.js +0 -56
- package/dist/tests/bench/run-full-bench.js +0 -51
- package/dist/tests/bench/run-items36-targeted.js +0 -69
- package/dist/tests/bench/run-nano-quick.js +0 -42
- package/dist/tests/bench/run-waveg-targeted.js +0 -62
- package/dist/tests/bench/runner.js +0 -699
- package/dist/tests/bench/runner.test.js +0 -958
- package/dist/tests/bench/search-bridge.test.js +0 -331
- package/dist/tests/bench/tmp.js +0 -131
- package/dist/tests/bench/trajectory.js +0 -116
- package/dist/tests/bench/trajectory.test.js +0 -127
- package/dist/tests/bench/verifier.js +0 -114
- package/dist/tests/bench/verifier.test.js +0 -118
- package/dist/tests/bench/workflow-evaluator.js +0 -557
- package/dist/tests/bench/workflow-evaluator.test.js +0 -421
- package/dist/tests/bench/workflow-spec.js +0 -345
- package/dist/tests/bench/workflow-spec.test.js +0 -363
- package/dist/tests/bench/workflow-trace.js +0 -472
- package/dist/tests/bench/workflow-trace.test.js +0 -254
- package/dist/tests/benchmark-search-quality.js +0 -536
- package/dist/tests/benchmark-suite.js +0 -1441
- package/dist/tests/capture-cli.test.js +0 -112
- package/dist/tests/cli-errors.test.js +0 -204
- package/dist/tests/commands/events.test.js +0 -370
- package/dist/tests/commands/history.test.js +0 -418
- package/dist/tests/commands/import.test.js +0 -103
- package/dist/tests/commands/proposal-cli.test.js +0 -209
- package/dist/tests/commands/reflect-propose-cli.test.js +0 -333
- package/dist/tests/commands/remember.test.js +0 -97
- package/dist/tests/commands/scope-flags.test.js +0 -300
- package/dist/tests/commands/search.test.js +0 -537
- package/dist/tests/commands/show-indexer-parity.test.js +0 -117
- package/dist/tests/commands/show.test.js +0 -294
- package/dist/tests/common.test.js +0 -266
- package/dist/tests/completions.test.js +0 -142
- package/dist/tests/config-cli.test.js +0 -193
- package/dist/tests/config-llm-features.test.js +0 -139
- package/dist/tests/config.test.js +0 -569
- package/dist/tests/contracts/migration-baseline.test.js +0 -43
- package/dist/tests/contracts/reflect-propose-envelope.test.js +0 -139
- package/dist/tests/contracts/spec-helpers.js +0 -46
- package/dist/tests/contracts/v1-spec-section-11-proposal-queue.test.js +0 -228
- package/dist/tests/contracts/v1-spec-section-12-agent-config.test.js +0 -56
- package/dist/tests/contracts/v1-spec-section-13-lesson-type.test.js +0 -34
- package/dist/tests/contracts/v1-spec-section-14-llm-features.test.js +0 -94
- package/dist/tests/contracts/v1-spec-section-4-1-asset-types.test.js +0 -39
- package/dist/tests/contracts/v1-spec-section-4-2-quality-rules.test.js +0 -44
- package/dist/tests/contracts/v1-spec-section-5-configuration.test.js +0 -47
- package/dist/tests/contracts/v1-spec-section-6-orchestration.test.js +0 -40
- package/dist/tests/contracts/v1-spec-section-7-module-layout.test.js +0 -58
- package/dist/tests/contracts/v1-spec-section-8-extension-points.test.js +0 -34
- package/dist/tests/contracts/v1-spec-section-9-4-cli-surface.test.js +0 -75
- package/dist/tests/contracts/v1-spec-section-9-7-llm-agent-boundary.test.js +0 -36
- package/dist/tests/core/write-source.test.js +0 -366
- package/dist/tests/curate-command.test.js +0 -87
- package/dist/tests/db-scoring.test.js +0 -201
- package/dist/tests/db.test.js +0 -654
- package/dist/tests/distill-cli-flag.test.js +0 -208
- package/dist/tests/distill.test.js +0 -515
- package/dist/tests/docker-install.test.js +0 -120
- package/dist/tests/e2e.test.js +0 -1419
- package/dist/tests/embedder.test.js +0 -340
- package/dist/tests/embedding-model-config.test.js +0 -379
- package/dist/tests/feedback-command.test.js +0 -172
- package/dist/tests/file-context.test.js +0 -552
- package/dist/tests/fixtures/scripts/git/summarize-diff.js +0 -9
- package/dist/tests/fixtures/scripts/lint/eslint-check.js +0 -7
- package/dist/tests/fixtures/stashes/load.js +0 -166
- package/dist/tests/fixtures/stashes/load.test.js +0 -97
- package/dist/tests/fixtures/stashes/ranking-baseline/scripts/mem0-search.js +0 -12
- package/dist/tests/frontmatter.test.js +0 -190
- package/dist/tests/fts-field-weighting.test.js +0 -254
- package/dist/tests/fuzzy-search.test.js +0 -230
- package/dist/tests/git-provider-clone.test.js +0 -45
- package/dist/tests/github.test.js +0 -161
- package/dist/tests/graph-boost-ranking.test.js +0 -305
- package/dist/tests/graph-extraction.test.js +0 -282
- package/dist/tests/helpers/usage-events.js +0 -8
- package/dist/tests/index-pass-llm.test.js +0 -161
- package/dist/tests/indexer.test.js +0 -570
- package/dist/tests/info-command.test.js +0 -166
- package/dist/tests/init.test.js +0 -69
- package/dist/tests/install-script.test.js +0 -246
- package/dist/tests/integration/agent-real-profile.test.js +0 -94
- package/dist/tests/issue-36-repro.test.js +0 -304
- package/dist/tests/issues-191-194.test.js +0 -160
- package/dist/tests/lesson-lint.test.js +0 -111
- package/dist/tests/llm-client.test.js +0 -115
- package/dist/tests/llm-feature-gate.test.js +0 -151
- package/dist/tests/llm.test.js +0 -139
- package/dist/tests/lockfile.test.js +0 -216
- package/dist/tests/manifest.test.js +0 -205
- package/dist/tests/markdown.test.js +0 -126
- package/dist/tests/matchers-unit.test.js +0 -189
- package/dist/tests/memory-inference.test.js +0 -299
- package/dist/tests/merge-scoring.test.js +0 -136
- package/dist/tests/metadata.test.js +0 -313
- package/dist/tests/migration-help.test.js +0 -89
- package/dist/tests/origin-resolve.test.js +0 -124
- package/dist/tests/output-baseline.test.js +0 -218
- package/dist/tests/output-shapes-unit.test.js +0 -478
- package/dist/tests/parallel-search.test.js +0 -272
- package/dist/tests/parameter-metadata.test.js +0 -365
- package/dist/tests/paths.test.js +0 -177
- package/dist/tests/progressive-disclosure.test.js +0 -280
- package/dist/tests/proposals.test.js +0 -279
- package/dist/tests/proposed-quality.test.js +0 -271
- package/dist/tests/provider-registry.test.js +0 -32
- package/dist/tests/ranking-regression.test.js +0 -548
- package/dist/tests/reflect-propose.test.js +0 -455
- package/dist/tests/registry-build-index.test.js +0 -394
- package/dist/tests/registry-cli.test.js +0 -290
- package/dist/tests/registry-index-v2.test.js +0 -430
- package/dist/tests/registry-install.test.js +0 -728
- package/dist/tests/registry-providers/parity.test.js +0 -189
- package/dist/tests/registry-providers/skills-sh.test.js +0 -309
- package/dist/tests/registry-providers/static-index.test.js +0 -238
- package/dist/tests/registry-resolve.test.js +0 -126
- package/dist/tests/registry-search.test.js +0 -923
- package/dist/tests/remember-frontmatter.test.js +0 -378
- package/dist/tests/remember-unit.test.js +0 -123
- package/dist/tests/ripgrep-install.test.js +0 -251
- package/dist/tests/ripgrep-resolve.test.js +0 -108
- package/dist/tests/ripgrep.test.js +0 -163
- package/dist/tests/save-command.test.js +0 -94
- package/dist/tests/save-trust-qa-fixes.test.js +0 -270
- package/dist/tests/scoring-pipeline.test.js +0 -648
- package/dist/tests/search-include-proposed-cli.test.js +0 -118
- package/dist/tests/self-update.test.js +0 -442
- package/dist/tests/semantic-search-e2e.test.js +0 -512
- package/dist/tests/semantic-status.test.js +0 -471
- package/dist/tests/setup-run.integration.js +0 -877
- package/dist/tests/setup-wizard.test.js +0 -198
- package/dist/tests/setup.test.js +0 -131
- package/dist/tests/source-add.test.js +0 -11
- package/dist/tests/source-clone.test.js +0 -254
- package/dist/tests/source-manage.test.js +0 -366
- package/dist/tests/source-providers/filesystem.test.js +0 -82
- package/dist/tests/source-providers/git.test.js +0 -252
- package/dist/tests/source-providers/website.test.js +0 -128
- package/dist/tests/source-qa-fixes.test.js +0 -286
- package/dist/tests/source-registry.test.js +0 -350
- package/dist/tests/source-resolve.test.js +0 -100
- package/dist/tests/source-source.test.js +0 -281
- package/dist/tests/source.test.js +0 -533
- package/dist/tests/tar-utils-scan.test.js +0 -73
- package/dist/tests/toggle-components.test.js +0 -73
- package/dist/tests/usage-telemetry.test.js +0 -265
- package/dist/tests/utility-scoring.test.js +0 -558
- package/dist/tests/vault-load-error.test.js +0 -78
- package/dist/tests/vault-qa-fixes.test.js +0 -194
- package/dist/tests/vault.test.js +0 -429
- package/dist/tests/vector-search.test.js +0 -608
- package/dist/tests/walker.test.js +0 -252
- package/dist/tests/wave2-cluster-bc.test.js +0 -228
- package/dist/tests/wave2-cluster-d.test.js +0 -180
- package/dist/tests/wave2-cluster-e.test.js +0 -179
- package/dist/tests/wiki-qa-fixes.test.js +0 -270
- package/dist/tests/wiki.test.js +0 -529
- package/dist/tests/workflow-cli.test.js +0 -271
- package/dist/tests/workflow-markdown.test.js +0 -171
- package/dist/tests/workflow-path-escape.test.js +0 -132
- package/dist/tests/workflow-qa-fixes.test.js +0 -395
- package/dist/tests/workflows/indexer-rejection.test.js +0 -213
- /package/dist/{src/cli.js → cli.js} +0 -0
- /package/dist/{src/commands → commands}/completions.js +0 -0
- /package/dist/{src/commands → commands}/config-cli.js +0 -0
- /package/dist/{src/commands → commands}/curate.js +0 -0
- /package/dist/{src/commands → commands}/distill.js +0 -0
- /package/dist/{src/commands → commands}/events.js +0 -0
- /package/dist/{src/commands → commands}/history.js +0 -0
- /package/dist/{src/commands → commands}/info.js +0 -0
- /package/dist/{src/commands → commands}/init.js +0 -0
- /package/dist/{src/commands → commands}/install-audit.js +0 -0
- /package/dist/{src/commands → commands}/installed-stashes.js +0 -0
- /package/dist/{src/commands → commands}/migration-help.js +0 -0
- /package/dist/{src/commands → commands}/proposal.js +0 -0
- /package/dist/{src/commands → commands}/propose.js +0 -0
- /package/dist/{src/commands → commands}/reflect.js +0 -0
- /package/dist/{src/commands → commands}/registry-search.js +0 -0
- /package/dist/{src/commands → commands}/remember.js +0 -0
- /package/dist/{src/commands → commands}/search.js +0 -0
- /package/dist/{src/commands → commands}/self-update.js +0 -0
- /package/dist/{src/commands → commands}/show.js +0 -0
- /package/dist/{src/commands → commands}/source-add.js +0 -0
- /package/dist/{src/commands → commands}/source-clone.js +0 -0
- /package/dist/{src/commands → commands}/source-manage.js +0 -0
- /package/dist/{src/commands → commands}/vault.js +0 -0
- /package/dist/{src/core → core}/asset-ref.js +0 -0
- /package/dist/{src/core → core}/asset-registry.js +0 -0
- /package/dist/{src/core → core}/asset-spec.js +0 -0
- /package/dist/{src/core → core}/common.js +0 -0
- /package/dist/{src/core → core}/config.js +0 -0
- /package/dist/{src/core → core}/errors.js +0 -0
- /package/dist/{src/core → core}/events.js +0 -0
- /package/dist/{src/core → core}/frontmatter.js +0 -0
- /package/dist/{src/core → core}/lesson-lint.js +0 -0
- /package/dist/{src/core → core}/markdown.js +0 -0
- /package/dist/{src/core → core}/paths.js +0 -0
- /package/dist/{src/core → core}/proposals.js +0 -0
- /package/dist/{src/core → core}/warn.js +0 -0
- /package/dist/{src/core → core}/write-source.js +0 -0
- /package/dist/{src/indexer → indexer}/db-search.js +0 -0
- /package/dist/{src/indexer → indexer}/db.js +0 -0
- /package/dist/{src/indexer → indexer}/file-context.js +0 -0
- /package/dist/{src/indexer → indexer}/graph-boost.js +0 -0
- /package/dist/{src/indexer → indexer}/graph-extraction.js +0 -0
- /package/dist/{src/indexer → indexer}/indexer.js +0 -0
- /package/dist/{src/indexer → indexer}/manifest.js +0 -0
- /package/dist/{src/indexer → indexer}/matchers.js +0 -0
- /package/dist/{src/indexer → indexer}/memory-inference.js +0 -0
- /package/dist/{src/indexer → indexer}/metadata.js +0 -0
- /package/dist/{src/indexer → indexer}/search-fields.js +0 -0
- /package/dist/{src/indexer → indexer}/search-source.js +0 -0
- /package/dist/{src/indexer → indexer}/semantic-status.js +0 -0
- /package/dist/{src/indexer → indexer}/usage-events.js +0 -0
- /package/dist/{src/indexer → indexer}/walker.js +0 -0
- /package/dist/{src/integrations → integrations}/agent/config.js +0 -0
- /package/dist/{src/integrations → integrations}/agent/detect.js +0 -0
- /package/dist/{src/integrations → integrations}/agent/index.js +0 -0
- /package/dist/{src/integrations → integrations}/agent/profiles.js +0 -0
- /package/dist/{src/integrations → integrations}/agent/prompts.js +0 -0
- /package/dist/{src/integrations → integrations}/agent/spawn.js +0 -0
- /package/dist/{src/integrations → integrations}/github.js +0 -0
- /package/dist/{src/integrations → integrations}/lockfile.js +0 -0
- /package/dist/{src/llm → llm}/client.js +0 -0
- /package/dist/{src/llm → llm}/embedder.js +0 -0
- /package/dist/{src/llm → llm}/embedders/cache.js +0 -0
- /package/dist/{src/llm → llm}/embedders/local.js +0 -0
- /package/dist/{src/llm → llm}/embedders/remote.js +0 -0
- /package/dist/{src/llm → llm}/embedders/types.js +0 -0
- /package/dist/{src/llm → llm}/feature-gate.js +0 -0
- /package/dist/{src/llm → llm}/graph-extract.js +0 -0
- /package/dist/{src/llm → llm}/index-passes.js +0 -0
- /package/dist/{src/llm → llm}/memory-infer.js +0 -0
- /package/dist/{src/llm → llm}/metadata-enhance.js +0 -0
- /package/dist/{src/output → output}/cli-hints.js +0 -0
- /package/dist/{src/output → output}/context.js +0 -0
- /package/dist/{src/output → output}/renderers.js +0 -0
- /package/dist/{src/output → output}/shapes.js +0 -0
- /package/dist/{src/output → output}/text.js +0 -0
- /package/dist/{src/registry → registry}/build-index.js +0 -0
- /package/dist/{src/registry → registry}/create-provider-registry.js +0 -0
- /package/dist/{src/registry → registry}/factory.js +0 -0
- /package/dist/{src/registry → registry}/origin-resolve.js +0 -0
- /package/dist/{src/registry → registry}/providers/index.js +0 -0
- /package/dist/{src/registry → registry}/providers/skills-sh.js +0 -0
- /package/dist/{src/registry → registry}/providers/static-index.js +0 -0
- /package/dist/{src/registry → registry}/providers/types.js +0 -0
- /package/dist/{src/registry → registry}/resolve.js +0 -0
- /package/dist/{src/registry → registry}/types.js +0 -0
- /package/dist/{src/setup → setup}/detect.js +0 -0
- /package/dist/{src/setup → setup}/ripgrep-install.js +0 -0
- /package/dist/{src/setup → setup}/ripgrep-resolve.js +0 -0
- /package/dist/{src/setup → setup}/setup.js +0 -0
- /package/dist/{src/setup → setup}/steps.js +0 -0
- /package/dist/{src/sources → sources}/include.js +0 -0
- /package/dist/{src/sources → sources}/provider-factory.js +0 -0
- /package/dist/{src/sources → sources}/provider.js +0 -0
- /package/dist/{src/sources → sources}/providers/filesystem.js +0 -0
- /package/dist/{src/sources → sources}/providers/git.js +0 -0
- /package/dist/{src/sources → sources}/providers/index.js +0 -0
- /package/dist/{src/sources → sources}/providers/install-types.js +0 -0
- /package/dist/{src/sources → sources}/providers/npm.js +0 -0
- /package/dist/{src/sources → sources}/providers/provider-utils.js +0 -0
- /package/dist/{src/sources → sources}/providers/sync-from-ref.js +0 -0
- /package/dist/{src/sources → sources}/providers/tar-utils.js +0 -0
- /package/dist/{src/sources → sources}/providers/website.js +0 -0
- /package/dist/{src/sources → sources}/resolve.js +0 -0
- /package/dist/{src/sources → sources}/types.js +0 -0
- /package/dist/{src/templates → templates}/wiki-templates.js +0 -0
- /package/dist/{src/version.js → version.js} +0 -0
- /package/dist/{src/wiki → wiki}/wiki.js +0 -0
- /package/dist/{src/workflows → workflows}/authoring.js +0 -0
- /package/dist/{src/workflows → workflows}/cli.js +0 -0
- /package/dist/{src/workflows → workflows}/db.js +0 -0
- /package/dist/{src/workflows → workflows}/document-cache.js +0 -0
- /package/dist/{src/workflows → workflows}/parser.js +0 -0
- /package/dist/{src/workflows → workflows}/renderer.js +0 -0
- /package/dist/{src/workflows → workflows}/runs.js +0 -0
- /package/dist/{src/workflows → workflows}/schema.js +0 -0
- /package/dist/{src/workflows → workflows}/validator.js +0 -0
|
@@ -1,556 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* Unit tests for the `bench compare` subcommand (#239).
|
|
3
|
-
*
|
|
4
|
-
* Covers:
|
|
5
|
-
* • happy-path comparison: deltas + sign markers correct.
|
|
6
|
-
* • model-mismatch refusal: both models named in the message.
|
|
7
|
-
* • missing fixture-content hash on either side: proceeds with a warning.
|
|
8
|
-
* • markdown output is byte-stable across two calls with identical input.
|
|
9
|
-
* • CLI driver: invalid input file (missing path / malformed JSON) → exit 2.
|
|
10
|
-
* • CLI driver: refusal → exit 1; success → exit 0.
|
|
11
|
-
*/
|
|
12
|
-
import { describe, expect, test } from "bun:test";
|
|
13
|
-
import fs from "node:fs";
|
|
14
|
-
import path from "node:path";
|
|
15
|
-
import { runCompareCli } from "./cli";
|
|
16
|
-
import { compareReports } from "./metrics";
|
|
17
|
-
import { renderCompareMarkdown } from "./report";
|
|
18
|
-
import { benchMkdtemp } from "./tmp";
|
|
19
|
-
const MODEL = "anthropic/claude-opus-4-7";
|
|
20
|
-
function makeReport(overrides = {}) {
|
|
21
|
-
return {
|
|
22
|
-
schemaVersion: 1,
|
|
23
|
-
track: "utility",
|
|
24
|
-
branch: "release/0.7.0",
|
|
25
|
-
commit: "deadbee",
|
|
26
|
-
timestamp: "2026-04-27T12:00:00Z",
|
|
27
|
-
agent: { harness: "opencode", model: MODEL },
|
|
28
|
-
corpus: { domains: 2, tasks: 2, slice: "all", seedsPerArm: 5 },
|
|
29
|
-
aggregate: {
|
|
30
|
-
noakm: { pass_rate: 0.4, tokens_per_pass: 18000, wallclock_ms: 41000 },
|
|
31
|
-
akm: { pass_rate: 0.6, tokens_per_pass: 14000, wallclock_ms: 36000 },
|
|
32
|
-
delta: { pass_rate: 0.2, tokens_per_pass: -4000, wallclock_ms: -5000 },
|
|
33
|
-
},
|
|
34
|
-
tasks: [
|
|
35
|
-
{
|
|
36
|
-
id: "domain-a/task-1",
|
|
37
|
-
akm: {
|
|
38
|
-
pass_rate: 0.6,
|
|
39
|
-
pass_at_1: 1,
|
|
40
|
-
tokens_per_pass: 13000,
|
|
41
|
-
wallclock_ms: 35000,
|
|
42
|
-
pass_rate_stdev: 0.1,
|
|
43
|
-
budget_exceeded_count: 0,
|
|
44
|
-
harness_error_count: 0,
|
|
45
|
-
count: 5,
|
|
46
|
-
},
|
|
47
|
-
},
|
|
48
|
-
{
|
|
49
|
-
id: "domain-b/task-2",
|
|
50
|
-
akm: {
|
|
51
|
-
pass_rate: 0.6,
|
|
52
|
-
pass_at_1: 1,
|
|
53
|
-
tokens_per_pass: 15000,
|
|
54
|
-
wallclock_ms: 37000,
|
|
55
|
-
pass_rate_stdev: 0.2,
|
|
56
|
-
budget_exceeded_count: 0,
|
|
57
|
-
harness_error_count: 0,
|
|
58
|
-
count: 5,
|
|
59
|
-
},
|
|
60
|
-
},
|
|
61
|
-
],
|
|
62
|
-
warnings: [],
|
|
63
|
-
...overrides,
|
|
64
|
-
};
|
|
65
|
-
}
|
|
66
|
-
describe("compareReports — happy path", () => {
|
|
67
|
-
test("computes aggregate delta with correct sign markers", () => {
|
|
68
|
-
const base = makeReport();
|
|
69
|
-
// Current improves pass_rate by +0.2, reduces tokens by 1000, slower by 1000ms.
|
|
70
|
-
const current = makeReport({
|
|
71
|
-
aggregate: {
|
|
72
|
-
noakm: { pass_rate: 0.4, tokens_per_pass: 18000, wallclock_ms: 41000 },
|
|
73
|
-
akm: { pass_rate: 0.8, tokens_per_pass: 13000, wallclock_ms: 37000 },
|
|
74
|
-
delta: { pass_rate: 0.4, tokens_per_pass: -5000, wallclock_ms: -4000 },
|
|
75
|
-
},
|
|
76
|
-
});
|
|
77
|
-
const result = compareReports(base, current);
|
|
78
|
-
if (!result.ok)
|
|
79
|
-
throw new Error("expected ok=true");
|
|
80
|
-
expect(result.aggregate.passRateDelta).toBeCloseTo(0.2);
|
|
81
|
-
expect(result.aggregate.passRateSign).toBe("improve");
|
|
82
|
-
expect(result.aggregate.tokensPerPassDelta).toBeCloseTo(-1000);
|
|
83
|
-
expect(result.aggregate.tokensPerPassSign).toBe("improve"); // lower tokens = better
|
|
84
|
-
expect(result.aggregate.wallclockMsDelta).toBeCloseTo(1000);
|
|
85
|
-
expect(result.aggregate.wallclockMsSign).toBe("regress"); // higher wallclock = worse
|
|
86
|
-
expect(result.perTask.length).toBe(2);
|
|
87
|
-
});
|
|
88
|
-
test("flat sign for tiny pass-rate jitter", () => {
|
|
89
|
-
const base = makeReport();
|
|
90
|
-
const current = makeReport({
|
|
91
|
-
aggregate: {
|
|
92
|
-
noakm: { pass_rate: 0.4, tokens_per_pass: 18000, wallclock_ms: 41000 },
|
|
93
|
-
akm: { pass_rate: 0.602, tokens_per_pass: 14000, wallclock_ms: 36000 },
|
|
94
|
-
delta: { pass_rate: 0.202, tokens_per_pass: -4000, wallclock_ms: -5000 },
|
|
95
|
-
},
|
|
96
|
-
});
|
|
97
|
-
const result = compareReports(base, current);
|
|
98
|
-
if (!result.ok)
|
|
99
|
-
throw new Error("expected ok=true");
|
|
100
|
-
// 0.602 − 0.6 = 0.002 < 0.005 tolerance
|
|
101
|
-
expect(result.aggregate.passRateSign).toBe("flat");
|
|
102
|
-
});
|
|
103
|
-
test("per-task row carries baseMetrics + currentMetrics + signMarker", () => {
|
|
104
|
-
const base = makeReport();
|
|
105
|
-
const current = makeReport({
|
|
106
|
-
tasks: [
|
|
107
|
-
{
|
|
108
|
-
id: "domain-a/task-1",
|
|
109
|
-
akm: {
|
|
110
|
-
pass_rate: 0.8,
|
|
111
|
-
pass_at_1: 1,
|
|
112
|
-
tokens_per_pass: 12000,
|
|
113
|
-
wallclock_ms: 33000,
|
|
114
|
-
pass_rate_stdev: 0.05,
|
|
115
|
-
budget_exceeded_count: 0,
|
|
116
|
-
harness_error_count: 0,
|
|
117
|
-
count: 5,
|
|
118
|
-
},
|
|
119
|
-
},
|
|
120
|
-
{
|
|
121
|
-
id: "domain-b/task-2",
|
|
122
|
-
akm: {
|
|
123
|
-
pass_rate: 0.4,
|
|
124
|
-
pass_at_1: 0,
|
|
125
|
-
tokens_per_pass: 16000,
|
|
126
|
-
wallclock_ms: 38000,
|
|
127
|
-
pass_rate_stdev: 0.3,
|
|
128
|
-
budget_exceeded_count: 1,
|
|
129
|
-
harness_error_count: 0,
|
|
130
|
-
count: 5,
|
|
131
|
-
},
|
|
132
|
-
},
|
|
133
|
-
],
|
|
134
|
-
});
|
|
135
|
-
const result = compareReports(base, current);
|
|
136
|
-
if (!result.ok)
|
|
137
|
-
throw new Error("expected ok=true");
|
|
138
|
-
const row1 = result.perTask.find((r) => r.id === "domain-a/task-1");
|
|
139
|
-
const row2 = result.perTask.find((r) => r.id === "domain-b/task-2");
|
|
140
|
-
expect(row1?.signMarker).toBe("improve");
|
|
141
|
-
expect(row1?.delta.passRate).toBeCloseTo(0.2);
|
|
142
|
-
expect(row1?.baseMetrics?.pass_rate_stdev).toBeCloseTo(0.1);
|
|
143
|
-
expect(row1?.currentMetrics?.pass_rate_stdev).toBeCloseTo(0.05);
|
|
144
|
-
expect(row2?.signMarker).toBe("regress");
|
|
145
|
-
expect(row2?.delta.passRate).toBeCloseTo(-0.2);
|
|
146
|
-
});
|
|
147
|
-
});
|
|
148
|
-
describe("compareReports — refusal cases", () => {
|
|
149
|
-
test("model mismatch: ok=false with both models named", () => {
|
|
150
|
-
const base = makeReport();
|
|
151
|
-
const current = makeReport({ agent: { harness: "opencode", model: "anthropic/claude-sonnet-4-5" } });
|
|
152
|
-
const result = compareReports(base, current);
|
|
153
|
-
expect(result.ok).toBe(false);
|
|
154
|
-
if (result.ok)
|
|
155
|
-
return;
|
|
156
|
-
expect(result.reason).toBe("model_mismatch");
|
|
157
|
-
expect(result.baseModel).toBe(MODEL);
|
|
158
|
-
expect(result.currentModel).toBe("anthropic/claude-sonnet-4-5");
|
|
159
|
-
expect(result.message).toContain(MODEL);
|
|
160
|
-
expect(result.message).toContain("anthropic/claude-sonnet-4-5");
|
|
161
|
-
});
|
|
162
|
-
test("schema mismatch: refuses non-v1 envelopes", () => {
|
|
163
|
-
const base = makeReport({ schemaVersion: 2 });
|
|
164
|
-
const current = makeReport();
|
|
165
|
-
const result = compareReports(base, current);
|
|
166
|
-
expect(result.ok).toBe(false);
|
|
167
|
-
if (result.ok)
|
|
168
|
-
return;
|
|
169
|
-
expect(result.reason).toBe("schema_mismatch");
|
|
170
|
-
});
|
|
171
|
-
test("track mismatch: refuses non-utility tracks", () => {
|
|
172
|
-
const base = makeReport({ track: "evolve" });
|
|
173
|
-
const current = makeReport();
|
|
174
|
-
const result = compareReports(base, current);
|
|
175
|
-
expect(result.ok).toBe(false);
|
|
176
|
-
if (result.ok)
|
|
177
|
-
return;
|
|
178
|
-
expect(result.reason).toBe("track_mismatch");
|
|
179
|
-
});
|
|
180
|
-
test("hash mismatch: refuses with both hashes named", () => {
|
|
181
|
-
const base = makeReport({
|
|
182
|
-
corpus: { domains: 2, tasks: 2, slice: "all", seedsPerArm: 5, fixtureContentHash: "abc123" },
|
|
183
|
-
});
|
|
184
|
-
const current = makeReport({
|
|
185
|
-
corpus: { domains: 2, tasks: 2, slice: "all", seedsPerArm: 5, fixtureContentHash: "def456" },
|
|
186
|
-
});
|
|
187
|
-
const result = compareReports(base, current);
|
|
188
|
-
expect(result.ok).toBe(false);
|
|
189
|
-
if (result.ok)
|
|
190
|
-
return;
|
|
191
|
-
expect(result.reason).toBe("hash_mismatch");
|
|
192
|
-
expect(result.message).toContain("abc123");
|
|
193
|
-
expect(result.message).toContain("def456");
|
|
194
|
-
});
|
|
195
|
-
});
|
|
196
|
-
describe("compareReports — fixture-hash warnings", () => {
|
|
197
|
-
test("missing hash on base: proceeds with warning", () => {
|
|
198
|
-
const base = makeReport(); // no fixtureContentHash
|
|
199
|
-
const current = makeReport({
|
|
200
|
-
corpus: { domains: 2, tasks: 2, slice: "all", seedsPerArm: 5, fixtureContentHash: "abc123" },
|
|
201
|
-
});
|
|
202
|
-
const result = compareReports(base, current);
|
|
203
|
-
expect(result.ok).toBe(true);
|
|
204
|
-
if (!result.ok)
|
|
205
|
-
return;
|
|
206
|
-
expect(result.warnings.some((w) => w.includes("base") && w.includes("fixtureContentHash"))).toBe(true);
|
|
207
|
-
});
|
|
208
|
-
test("missing hash on current: proceeds with warning", () => {
|
|
209
|
-
const base = makeReport({
|
|
210
|
-
corpus: { domains: 2, tasks: 2, slice: "all", seedsPerArm: 5, fixtureContentHash: "abc123" },
|
|
211
|
-
});
|
|
212
|
-
const current = makeReport(); // no fixtureContentHash
|
|
213
|
-
const result = compareReports(base, current);
|
|
214
|
-
expect(result.ok).toBe(true);
|
|
215
|
-
if (!result.ok)
|
|
216
|
-
return;
|
|
217
|
-
expect(result.warnings.some((w) => w.includes("current") && w.includes("fixtureContentHash"))).toBe(true);
|
|
218
|
-
});
|
|
219
|
-
test("missing on both: two fixture warnings (#250 also adds two corpus warnings)", () => {
|
|
220
|
-
const base = makeReport();
|
|
221
|
-
const current = makeReport();
|
|
222
|
-
const result = compareReports(base, current);
|
|
223
|
-
expect(result.ok).toBe(true);
|
|
224
|
-
if (!result.ok)
|
|
225
|
-
return;
|
|
226
|
-
expect(result.warnings.filter((w) => w.includes("fixtureContentHash")).length).toBe(2);
|
|
227
|
-
});
|
|
228
|
-
test("matching fixture hash: no fixture warnings", () => {
|
|
229
|
-
const base = makeReport({
|
|
230
|
-
corpus: { domains: 2, tasks: 2, slice: "all", seedsPerArm: 5, fixtureContentHash: "abc123" },
|
|
231
|
-
});
|
|
232
|
-
const current = makeReport({
|
|
233
|
-
corpus: { domains: 2, tasks: 2, slice: "all", seedsPerArm: 5, fixtureContentHash: "abc123" },
|
|
234
|
-
});
|
|
235
|
-
const result = compareReports(base, current);
|
|
236
|
-
expect(result.ok).toBe(true);
|
|
237
|
-
if (!result.ok)
|
|
238
|
-
return;
|
|
239
|
-
expect(result.warnings.filter((w) => w.includes("fixtureContentHash")).length).toBe(0);
|
|
240
|
-
});
|
|
241
|
-
});
|
|
242
|
-
describe("compareReports — corpus identity (#250)", () => {
|
|
243
|
-
function withCorpusIdentity(taskCorpusHash, selectedTaskIds, fixtureContentHash) {
|
|
244
|
-
return makeReport({
|
|
245
|
-
corpus: {
|
|
246
|
-
domains: 2,
|
|
247
|
-
tasks: selectedTaskIds.length,
|
|
248
|
-
slice: "all",
|
|
249
|
-
seedsPerArm: 5,
|
|
250
|
-
taskCorpusHash,
|
|
251
|
-
selectedTaskIds,
|
|
252
|
-
...(fixtureContentHash ? { fixtureContentHash } : {}),
|
|
253
|
-
},
|
|
254
|
-
});
|
|
255
|
-
}
|
|
256
|
-
test("matching corpus + fixture hashes: ok=true, no warnings", () => {
|
|
257
|
-
const base = withCorpusIdentity("tc-a", ["a/one", "b/two"], "fh-a");
|
|
258
|
-
const current = withCorpusIdentity("tc-a", ["a/one", "b/two"], "fh-a");
|
|
259
|
-
const result = compareReports(base, current);
|
|
260
|
-
expect(result.ok).toBe(true);
|
|
261
|
-
if (!result.ok)
|
|
262
|
-
return;
|
|
263
|
-
expect(result.warnings.length).toBe(0);
|
|
264
|
-
});
|
|
265
|
-
test("taskCorpusHash mismatch: refuses by default", () => {
|
|
266
|
-
const base = withCorpusIdentity("tc-a", ["a/one", "b/two"]);
|
|
267
|
-
const current = withCorpusIdentity("tc-b", ["a/one", "b/two"]);
|
|
268
|
-
const result = compareReports(base, current);
|
|
269
|
-
expect(result.ok).toBe(false);
|
|
270
|
-
if (result.ok)
|
|
271
|
-
return;
|
|
272
|
-
expect(result.reason).toBe("corpus_mismatch");
|
|
273
|
-
expect(result.message).toContain("tc-a");
|
|
274
|
-
expect(result.message).toContain("tc-b");
|
|
275
|
-
expect(result.baseTaskCorpusHash).toBe("tc-a");
|
|
276
|
-
expect(result.currentTaskCorpusHash).toBe("tc-b");
|
|
277
|
-
});
|
|
278
|
-
test("selectedTaskIds differ but hashes both present and matching: still ok", () => {
|
|
279
|
-
// Defensive: in practice two reports with identical taskCorpusHash should
|
|
280
|
-
// also share IDs; but if a producer ever forgets to align them, the hash
|
|
281
|
-
// dominates so we don't false-positive.
|
|
282
|
-
const base = withCorpusIdentity("tc-a", ["a/one", "b/two"]);
|
|
283
|
-
const current = withCorpusIdentity("tc-a", ["a/one", "b/two", "c/three"]);
|
|
284
|
-
const result = compareReports(base, current);
|
|
285
|
-
expect(result.ok).toBe(true);
|
|
286
|
-
});
|
|
287
|
-
// With allowCorpusMismatch set, a task-corpus hash difference is expected to
// come back as ok=true plus a warning that mentions the CLI flag name.
test("allowCorpusMismatch converts refusal to warning", () => {
    const baseline = withCorpusIdentity("tc-a", ["a/one", "b/two"]);
    const candidate = withCorpusIdentity("tc-b", ["a/one", "b/two"]);
    const options = { allowCorpusMismatch: true };
    const outcome = compareReports(baseline, candidate, options);
    expect(outcome.ok).toBe(true);
    if (outcome.ok) {
        const mentionsFlag = (line) => line.includes("--allow-corpus-mismatch");
        expect(outcome.warnings.some((line) => mentionsFlag(line))).toBe(true);
    }
});
|
|
296
|
-
// A base report built without a taskCorpusHash (plain makeReport()) must not
// be refused; instead a warning naming "base" and "taskCorpusHash" is expected.
test("legacy report (missing taskCorpusHash) gets a warning, not refusal", () => {
    const legacyBase = makeReport(); // no taskCorpusHash
    const candidate = withCorpusIdentity("tc-a", ["a/one", "b/two"]);
    const outcome = compareReports(legacyBase, candidate);
    expect(outcome.ok).toBe(true);
    if (outcome.ok) {
        const flagsLegacyBase = outcome.warnings.some((line) => line.includes("base") && line.includes("taskCorpusHash"));
        expect(flagsLegacyBase).toBe(true);
    }
});
|
|
305
|
-
// Both reports come from plain makeReport(); the comparison should still
// succeed, with one warning per side for each of the two missing hash fields.
test("legacy on both sides: two warnings, still ok", () => {
    const outcome = compareReports(makeReport(), makeReport());
    expect(outcome.ok).toBe(true);
    if (outcome.ok) {
        const countMentioning = (needle) => outcome.warnings.filter((line) => line.includes(needle)).length;
        // 2 missing taskCorpusHash + 2 missing fixtureContentHash warnings.
        expect(countMentioning("taskCorpusHash")).toBe(2);
        expect(countMentioning("fixtureContentHash")).toBe(2);
    }
});
|
|
316
|
-
// Same task-corpus hash but different fixture hashes ("fh-a" vs "fh-b"):
// without any override the comparison is refused with reason "hash_mismatch".
test("fixture-content hash mismatch: refuses by default (existing behaviour)", () => {
    const baseline = withCorpusIdentity("tc-a", ["a/one", "b/two"], "fh-a");
    const candidate = withCorpusIdentity("tc-a", ["a/one", "b/two"], "fh-b");
    const outcome = compareReports(baseline, candidate);
    expect(outcome.ok).toBe(false);
    if (!outcome.ok) {
        expect(outcome.reason).toBe("hash_mismatch");
    }
});
|
|
325
|
-
// With allowFixtureMismatch set, differing fixture hashes are expected to
// yield ok=true and a warning that mentions the CLI flag name.
test("allowFixtureMismatch converts fixture-hash refusal to warning", () => {
    const baseline = withCorpusIdentity("tc-a", ["a/one", "b/two"], "fh-a");
    const candidate = withCorpusIdentity("tc-a", ["a/one", "b/two"], "fh-b");
    const options = { allowFixtureMismatch: true };
    const outcome = compareReports(baseline, candidate, options);
    expect(outcome.ok).toBe(true);
    if (outcome.ok) {
        const mentionsFlag = (line) => line.includes("--allow-fixture-mismatch");
        expect(outcome.warnings.some((line) => mentionsFlag(line))).toBe(true);
    }
});
|
|
334
|
-
// Refusal precedence: when the task-corpus hash AND the fixture hash both
// differ and neither override flag is set, the corpus refusal is reported.
test("corpus mismatch is checked before fixture mismatch (refusal precedence)", () => {
    const baseline = withCorpusIdentity("tc-a", ["a/one"], "fh-a");
    const candidate = withCorpusIdentity("tc-b", ["a/one"], "fh-b");
    const outcome = compareReports(baseline, candidate);
    expect(outcome.ok).toBe(false);
    if (!outcome.ok) {
        expect(outcome.reason).toBe("corpus_mismatch");
    }
});
|
|
344
|
-
});
|
|
345
|
-
describe("renderCompareMarkdown determinism", () => {
|
|
346
|
-
// Determinism check: comparing the same pair of reports twice and rendering
// each comparison must give exactly the same markdown string.
test("byte-stable across two calls with identical input", () => {
    const baseline = makeReport();
    const candidate = makeReport();
    const comparisons = [
        compareReports(baseline, candidate),
        compareReports(baseline, candidate),
    ];
    const [firstMarkdown, secondMarkdown] = comparisons.map((comparison) => renderCompareMarkdown(comparison));
    expect(firstMarkdown).toBe(secondMarkdown);
});
|
|
353
|
-
// Renders a successful comparison where the current report overrides the
// aggregate numbers, then checks the markdown for its headings and key cells.
test("contains aggregate header and per-task table", () => {
    const improvedAggregate = {
        noakm: { pass_rate: 0.4, tokens_per_pass: 18000, wallclock_ms: 41000 },
        akm: { pass_rate: 0.8, tokens_per_pass: 13000, wallclock_ms: 36000 },
        delta: { pass_rate: 0.4, tokens_per_pass: -5000, wallclock_ms: -5000 },
    };
    const baseline = makeReport();
    const candidate = makeReport({ aggregate: improvedAggregate });
    const markdown = renderCompareMarkdown(compareReports(baseline, candidate));
    const expectedFragments = [
        "# akm-bench compare",
        "## Aggregate",
        "## Per-task",
        "pass_rate",
        "+0.20", // pass_rate delta
        "▲", // improve glyph
    ];
    for (const fragment of expectedFragments) {
        expect(markdown).toContain(fragment);
    }
});
|
|
370
|
-
// A refused comparison (here: the current report names a different model)
// must render as an error block mentioning both models, with no diff body.
test("refusal renders as a single error block, not a diff table", () => {
    const otherModel = "anthropic/claude-sonnet-4-5";
    const baseline = makeReport();
    const candidate = makeReport({ agent: { harness: "opencode", model: "anthropic/claude-sonnet-4-5" } });
    const markdown = renderCompareMarkdown(compareReports(baseline, candidate));
    for (const fragment of ["refused", "model_mismatch", MODEL, otherModel]) {
        expect(markdown).toContain(fragment);
    }
    expect(markdown).not.toContain("## Aggregate"); // no diff body
});
|
|
380
|
-
});
|
|
381
|
-
// ── CLI driver ────────────────────────────────────────────────────────────
|
|
382
|
-
describe("runCompareCli", () => {
|
|
383
|
-
/**
 * Writes two report JSON files into a fresh temp directory, invokes `cb`
 * with their paths, and removes the directory afterwards (even if `cb` throws).
 *
 * @param cb      Called synchronously with `{ basePath, currentPath, tmp }`.
 * @param base    Report serialised to base.json; `makeReport()` when nullish.
 * @param current Report serialised to current.json; `makeReport()` when nullish.
 */
function withTmpFiles(cb, base, current) {
    const tmp = benchMkdtemp("bench-compare-");
    try {
        // Each entry is [file name, report override]; base is written first.
        const fixtures = [
            ["base.json", base],
            ["current.json", current],
        ];
        const [basePath, currentPath] = fixtures.map(([fileName, report]) => {
            const filePath = path.join(tmp, fileName);
            fs.writeFileSync(filePath, JSON.stringify(report ?? makeReport()));
            return filePath;
        });
        cb({ basePath, currentPath, tmp });
    }
    finally {
        // Always clean up the temp directory, whatever the callback did.
        fs.rmSync(tmp, { recursive: true, force: true });
    }
}
|
|
396
|
-
// Happy path for the markdown (non-JSON) mode: two default reports compare
// successfully, so the CLI exits 0 and the rendered markdown goes to stdout.
test("happy path: exit 0, markdown to stdout", () => {
    withTmpFiles(({ basePath, currentPath }) => {
        const result = runCompareCli({ basePath, currentPath, json: false });
        expect(result.exitCode).toBe(0);
        expect(result.stdout).toContain("# akm-bench compare");
        // The metric rows belong to the markdown body, so they must be
        // asserted on stdout (the stream this test is about), not stderr.
        expect(result.stdout).toContain("pass_rate");
    });
});
|
|
404
|
-
// JSON mode on two default reports: exit code 0 and stdout must parse as
// JSON whose `ok` field is true.
test("happy path with --json: exit 0, structured JSON to stdout", () => {
    withTmpFiles((files) => {
        const cli = runCompareCli({ basePath: files.basePath, currentPath: files.currentPath, json: true });
        expect(cli.exitCode).toBe(0);
        const payload = JSON.parse(cli.stdout);
        expect(payload.ok).toBe(true);
    });
});
|
|
412
|
-
// Reports produced with different models: the CLI must exit 1 and explain
// the refusal on stderr.
test("model mismatch: exit 1 + clear stderr", () => {
    const baseReport = makeReport();
    const currentReport = makeReport({ agent: { harness: "opencode", model: "anthropic/claude-sonnet-4-5" } });
    const assertRefused = ({ basePath, currentPath }) => {
        const cli = runCompareCli({ basePath, currentPath, json: false });
        expect(cli.exitCode).toBe(1);
        expect(cli.stderr).toContain("different models");
    };
    withTmpFiles(assertRefused, baseReport, currentReport);
});
|
|
419
|
-
// Reports whose corpus blocks differ only in fixtureContentHash: the CLI
// must exit 1 and mention "fixture-content" on stderr.
test("hash mismatch: exit 1", () => {
    // Builds a report with the given fixture hash; other corpus fields fixed.
    const reportWithFixtureHash = (fixtureContentHash) => makeReport({ corpus: { domains: 2, tasks: 2, slice: "all", seedsPerArm: 5, fixtureContentHash } });
    const baseReport = reportWithFixtureHash("h1");
    const currentReport = reportWithFixtureHash("h2");
    withTmpFiles(({ basePath, currentPath }) => {
        const cli = runCompareCli({ basePath, currentPath, json: false });
        expect(cli.exitCode).toBe(1);
        expect(cli.stderr).toContain("fixture-content");
    }, baseReport, currentReport);
});
|
|
426
|
-
// An unparseable --base file is a usage/input error: exit code 2 and a
// "malformed JSON" message on stderr. Temp files are managed by hand here
// because the base file must hold invalid JSON.
test("malformed JSON in --base: exit 2", () => {
    const scratchDir = benchMkdtemp("bench-compare-bad-");
    try {
        const brokenBasePath = path.join(scratchDir, "base.json");
        const validCurrentPath = path.join(scratchDir, "current.json");
        fs.writeFileSync(brokenBasePath, "{ not valid json");
        fs.writeFileSync(validCurrentPath, JSON.stringify(makeReport()));
        const cli = runCompareCli({ basePath: brokenBasePath, currentPath: validCurrentPath, json: false });
        expect(cli.exitCode).toBe(2);
        expect(cli.stderr).toContain("malformed JSON");
    }
    finally {
        // Remove the scratch directory regardless of assertion outcome.
        fs.rmSync(scratchDir, { recursive: true, force: true });
    }
});
|
|
441
|
-
// The runs[] field (#249) is additive: compare does not use it, but it must
// NOT reject reports that include it. This confirms the new key stays
// forward-compatible with the existing aggregate-based diff path.
test("round-trip: reports with persisted runs[] (#249) compare cleanly", () => {
    const persistedRun = {
        task_id: "domain-a/task-1",
        arm: "akm",
        seed: 0,
        model: MODEL,
        outcome: "pass",
        tokens: { input: 1, output: 2 },
        wallclock_ms: 100,
        verifier_exit_code: 0,
        trajectory: { correct_asset_loaded: true, feedback_recorded: false },
        assets_loaded: ["skill:foo"],
        failure_mode: null,
    };
    const reportWithRuns = {
        ...makeReport(),
        runs: [persistedRun],
    };
    // The same report is used on both sides of the comparison.
    withTmpFiles(({ basePath, currentPath }) => {
        const cli = runCompareCli({ basePath, currentPath, json: true });
        expect(cli.exitCode).toBe(0);
        const payload = JSON.parse(cli.stdout);
        expect(payload.ok).toBe(true);
    }, reportWithRuns, reportWithRuns);
});
|
|
470
|
-
// Reports whose corpus blocks differ only in taskCorpusHash ("tc1" vs "tc2"):
// without an override the CLI must exit 1 and mention "task corpora".
test("corpus mismatch: exit 1 (#250)", () => {
    // Builds a report with the given task-corpus hash; other fields fixed.
    const reportWithCorpusHash = (taskCorpusHash) => makeReport({
        corpus: {
            domains: 2,
            tasks: 2,
            slice: "all",
            seedsPerArm: 5,
            taskCorpusHash,
            selectedTaskIds: ["a/one", "b/two"],
        },
    });
    const baseReport = reportWithCorpusHash("tc1");
    const currentReport = reportWithCorpusHash("tc2");
    withTmpFiles(({ basePath, currentPath }) => {
        const cli = runCompareCli({ basePath, currentPath, json: false });
        expect(cli.exitCode).toBe(1);
        expect(cli.stderr).toContain("task corpora");
    }, baseReport, currentReport);
});
|
|
495
|
-
// Same mismatching task-corpus hashes as the refusal case, but with
// allowCorpusMismatch set: the CLI must exit 0 and emit a warning on stderr.
test("corpus mismatch with --allow-corpus-mismatch: exit 0 + warning (#250)", () => {
    // Builds a report with the given task-corpus hash; other fields fixed.
    const reportWithCorpusHash = (taskCorpusHash) => makeReport({
        corpus: {
            domains: 2,
            tasks: 2,
            slice: "all",
            seedsPerArm: 5,
            taskCorpusHash,
            selectedTaskIds: ["a/one", "b/two"],
        },
    });
    const baseReport = reportWithCorpusHash("tc1");
    const currentReport = reportWithCorpusHash("tc2");
    withTmpFiles(({ basePath, currentPath }) => {
        const cliOptions = {
            basePath,
            currentPath,
            json: false,
            allowCorpusMismatch: true,
        };
        const cli = runCompareCli(cliOptions);
        expect(cli.exitCode).toBe(0);
        expect(cli.stderr).toContain("warning");
        expect(cli.stderr).toContain("task corpus");
    }, baseReport, currentReport);
});
|
|
526
|
-
// Mismatching fixtureContentHash values with allowFixtureMismatch set: the
// CLI must exit 0 and emit a "fixture-content" warning on stderr.
test("fixture mismatch with --allow-fixture-mismatch: exit 0 + warning (#250)", () => {
    // Builds a report with the given fixture hash; other corpus fields fixed.
    const reportWithFixtureHash = (fixtureContentHash) => makeReport({ corpus: { domains: 2, tasks: 2, slice: "all", seedsPerArm: 5, fixtureContentHash } });
    const baseReport = reportWithFixtureHash("h1");
    const currentReport = reportWithFixtureHash("h2");
    withTmpFiles(({ basePath, currentPath }) => {
        const cliOptions = {
            basePath,
            currentPath,
            json: false,
            allowFixtureMismatch: true,
        };
        const cli = runCompareCli(cliOptions);
        expect(cli.exitCode).toBe(0);
        expect(cli.stderr).toContain("warning");
        expect(cli.stderr).toContain("fixture-content");
    }, baseReport, currentReport);
});
|
|
539
|
-
// Pointing --base at a path that was never created is an input error: exit
// code 2 and a "cannot read --base" message on stderr.
test("missing --base file: exit 2", () => {
    const scratchDir = benchMkdtemp("bench-compare-missing-");
    try {
        const validCurrentPath = path.join(scratchDir, "current.json");
        fs.writeFileSync(validCurrentPath, JSON.stringify(makeReport()));
        // "nope.json" is deliberately never written.
        const absentBasePath = path.join(scratchDir, "nope.json");
        const cli = runCompareCli({
            basePath: absentBasePath,
            currentPath: validCurrentPath,
            json: false,
        });
        expect(cli.exitCode).toBe(2);
        expect(cli.stderr).toContain("cannot read --base");
    }
    finally {
        // Remove the scratch directory regardless of assertion outcome.
        fs.rmSync(scratchDir, { recursive: true, force: true });
    }
});
|
|
556
|
-
});
|