akm-cli 0.7.0 → 0.7.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +8 -0
- package/dist/{src/cli.js → cli.js} +22 -8
- package/dist/{src/commands → commands}/installed-stashes.js +1 -1
- package/dist/{src/commands → commands}/source-add.js +1 -1
- package/dist/{src/core → core}/common.js +16 -1
- package/dist/{src/core → core}/config.js +5 -2
- package/dist/{src/indexer → indexer}/db-search.js +16 -1
- package/dist/{src/indexer → indexer}/graph-extraction.js +5 -3
- package/dist/{src/indexer → indexer}/indexer.js +27 -11
- package/dist/{src/indexer → indexer}/memory-inference.js +47 -58
- package/dist/{src/indexer → indexer}/search-source.js +1 -1
- package/dist/{src/llm → llm}/client.js +61 -1
- package/dist/{src/llm → llm}/embedder.js +8 -5
- package/dist/{src/llm → llm}/embedders/local.js +8 -2
- package/dist/{src/llm → llm}/embedders/remote.js +4 -2
- package/dist/{src/llm → llm}/graph-extract.js +4 -4
- package/dist/llm/memory-infer.js +114 -0
- package/dist/{src/llm → llm}/metadata-enhance.js +2 -2
- package/dist/{src/output → output}/cli-hints.js +2 -0
- package/dist/{src/setup → setup}/setup.js +30 -20
- package/dist/sources/providers/website.js +27 -0
- package/dist/{src/sources/providers/website.js → sources/website-ingest.js} +38 -51
- package/docs/README.md +7 -0
- package/docs/migration/release-notes/0.7.0.md +14 -0
- package/package.json +11 -8
- package/dist/src/llm/memory-infer.js +0 -86
- package/dist/tests/add-website-source.test.js +0 -119
- package/dist/tests/agent/agent-config-loader.test.js +0 -70
- package/dist/tests/agent/agent-config.test.js +0 -221
- package/dist/tests/agent/agent-detect.test.js +0 -100
- package/dist/tests/agent/agent-spawn.test.js +0 -234
- package/dist/tests/agent-output.test.js +0 -186
- package/dist/tests/architecture/agent-no-llm-sdk-guard.test.js +0 -103
- package/dist/tests/architecture/agent-spawn-seam.test.js +0 -193
- package/dist/tests/architecture/llm-stateless-seam.test.js +0 -112
- package/dist/tests/asset-ref.test.js +0 -192
- package/dist/tests/asset-registry.test.js +0 -103
- package/dist/tests/asset-spec.test.js +0 -241
- package/dist/tests/bench/attribution.test.js +0 -996
- package/dist/tests/bench/cleanup-sigint.test.js +0 -83
- package/dist/tests/bench/cleanup.js +0 -234
- package/dist/tests/bench/cleanup.test.js +0 -166
- package/dist/tests/bench/cli.js +0 -1018
- package/dist/tests/bench/cli.test.js +0 -445
- package/dist/tests/bench/compare.test.js +0 -556
- package/dist/tests/bench/corpus.js +0 -317
- package/dist/tests/bench/corpus.test.js +0 -258
- package/dist/tests/bench/doctor.js +0 -525
- package/dist/tests/bench/driver.js +0 -401
- package/dist/tests/bench/driver.test.js +0 -584
- package/dist/tests/bench/environment.js +0 -233
- package/dist/tests/bench/environment.test.js +0 -199
- package/dist/tests/bench/evolve-metrics.js +0 -179
- package/dist/tests/bench/evolve-metrics.test.js +0 -187
- package/dist/tests/bench/evolve.js +0 -647
- package/dist/tests/bench/evolve.test.js +0 -624
- package/dist/tests/bench/failure-modes.test.js +0 -349
- package/dist/tests/bench/feedback-integrity.test.js +0 -457
- package/dist/tests/bench/leakage.test.js +0 -228
- package/dist/tests/bench/learning-curve.test.js +0 -134
- package/dist/tests/bench/metrics.js +0 -2395
- package/dist/tests/bench/metrics.test.js +0 -1150
- package/dist/tests/bench/no-os-tmpdir-invariant.test.js +0 -43
- package/dist/tests/bench/opencode-config.js +0 -194
- package/dist/tests/bench/opencode-config.test.js +0 -370
- package/dist/tests/bench/report.js +0 -1885
- package/dist/tests/bench/report.test.js +0 -1038
- package/dist/tests/bench/run-config.js +0 -355
- package/dist/tests/bench/run-config.test.js +0 -298
- package/dist/tests/bench/run-curate-test.js +0 -32
- package/dist/tests/bench/run-failing-tasks.js +0 -56
- package/dist/tests/bench/run-full-bench.js +0 -51
- package/dist/tests/bench/run-items36-targeted.js +0 -69
- package/dist/tests/bench/run-nano-quick.js +0 -42
- package/dist/tests/bench/run-waveg-targeted.js +0 -62
- package/dist/tests/bench/runner.js +0 -699
- package/dist/tests/bench/runner.test.js +0 -958
- package/dist/tests/bench/search-bridge.test.js +0 -331
- package/dist/tests/bench/tmp.js +0 -131
- package/dist/tests/bench/trajectory.js +0 -116
- package/dist/tests/bench/trajectory.test.js +0 -127
- package/dist/tests/bench/verifier.js +0 -114
- package/dist/tests/bench/verifier.test.js +0 -118
- package/dist/tests/bench/workflow-evaluator.js +0 -557
- package/dist/tests/bench/workflow-evaluator.test.js +0 -421
- package/dist/tests/bench/workflow-spec.js +0 -345
- package/dist/tests/bench/workflow-spec.test.js +0 -363
- package/dist/tests/bench/workflow-trace.js +0 -472
- package/dist/tests/bench/workflow-trace.test.js +0 -254
- package/dist/tests/benchmark-search-quality.js +0 -536
- package/dist/tests/benchmark-suite.js +0 -1441
- package/dist/tests/capture-cli.test.js +0 -112
- package/dist/tests/cli-errors.test.js +0 -204
- package/dist/tests/commands/events.test.js +0 -370
- package/dist/tests/commands/history.test.js +0 -418
- package/dist/tests/commands/import.test.js +0 -103
- package/dist/tests/commands/proposal-cli.test.js +0 -209
- package/dist/tests/commands/reflect-propose-cli.test.js +0 -333
- package/dist/tests/commands/remember.test.js +0 -97
- package/dist/tests/commands/scope-flags.test.js +0 -300
- package/dist/tests/commands/search.test.js +0 -537
- package/dist/tests/commands/show-indexer-parity.test.js +0 -117
- package/dist/tests/commands/show.test.js +0 -294
- package/dist/tests/common.test.js +0 -266
- package/dist/tests/completions.test.js +0 -142
- package/dist/tests/config-cli.test.js +0 -193
- package/dist/tests/config-llm-features.test.js +0 -139
- package/dist/tests/config.test.js +0 -569
- package/dist/tests/contracts/migration-baseline.test.js +0 -43
- package/dist/tests/contracts/reflect-propose-envelope.test.js +0 -139
- package/dist/tests/contracts/spec-helpers.js +0 -46
- package/dist/tests/contracts/v1-spec-section-11-proposal-queue.test.js +0 -228
- package/dist/tests/contracts/v1-spec-section-12-agent-config.test.js +0 -56
- package/dist/tests/contracts/v1-spec-section-13-lesson-type.test.js +0 -34
- package/dist/tests/contracts/v1-spec-section-14-llm-features.test.js +0 -94
- package/dist/tests/contracts/v1-spec-section-4-1-asset-types.test.js +0 -39
- package/dist/tests/contracts/v1-spec-section-4-2-quality-rules.test.js +0 -44
- package/dist/tests/contracts/v1-spec-section-5-configuration.test.js +0 -47
- package/dist/tests/contracts/v1-spec-section-6-orchestration.test.js +0 -40
- package/dist/tests/contracts/v1-spec-section-7-module-layout.test.js +0 -58
- package/dist/tests/contracts/v1-spec-section-8-extension-points.test.js +0 -34
- package/dist/tests/contracts/v1-spec-section-9-4-cli-surface.test.js +0 -75
- package/dist/tests/contracts/v1-spec-section-9-7-llm-agent-boundary.test.js +0 -36
- package/dist/tests/core/write-source.test.js +0 -366
- package/dist/tests/curate-command.test.js +0 -87
- package/dist/tests/db-scoring.test.js +0 -201
- package/dist/tests/db.test.js +0 -654
- package/dist/tests/distill-cli-flag.test.js +0 -208
- package/dist/tests/distill.test.js +0 -515
- package/dist/tests/docker-install.test.js +0 -120
- package/dist/tests/e2e.test.js +0 -1419
- package/dist/tests/embedder.test.js +0 -340
- package/dist/tests/embedding-model-config.test.js +0 -379
- package/dist/tests/feedback-command.test.js +0 -172
- package/dist/tests/file-context.test.js +0 -552
- package/dist/tests/fixtures/scripts/git/summarize-diff.js +0 -9
- package/dist/tests/fixtures/scripts/lint/eslint-check.js +0 -7
- package/dist/tests/fixtures/stashes/load.js +0 -166
- package/dist/tests/fixtures/stashes/load.test.js +0 -97
- package/dist/tests/fixtures/stashes/ranking-baseline/scripts/mem0-search.js +0 -12
- package/dist/tests/frontmatter.test.js +0 -190
- package/dist/tests/fts-field-weighting.test.js +0 -254
- package/dist/tests/fuzzy-search.test.js +0 -230
- package/dist/tests/git-provider-clone.test.js +0 -45
- package/dist/tests/github.test.js +0 -161
- package/dist/tests/graph-boost-ranking.test.js +0 -305
- package/dist/tests/graph-extraction.test.js +0 -282
- package/dist/tests/helpers/usage-events.js +0 -8
- package/dist/tests/index-pass-llm.test.js +0 -161
- package/dist/tests/indexer.test.js +0 -570
- package/dist/tests/info-command.test.js +0 -166
- package/dist/tests/init.test.js +0 -69
- package/dist/tests/install-script.test.js +0 -246
- package/dist/tests/integration/agent-real-profile.test.js +0 -94
- package/dist/tests/issue-36-repro.test.js +0 -304
- package/dist/tests/issues-191-194.test.js +0 -160
- package/dist/tests/lesson-lint.test.js +0 -111
- package/dist/tests/llm-client.test.js +0 -115
- package/dist/tests/llm-feature-gate.test.js +0 -151
- package/dist/tests/llm.test.js +0 -139
- package/dist/tests/lockfile.test.js +0 -216
- package/dist/tests/manifest.test.js +0 -205
- package/dist/tests/markdown.test.js +0 -126
- package/dist/tests/matchers-unit.test.js +0 -189
- package/dist/tests/memory-inference.test.js +0 -299
- package/dist/tests/merge-scoring.test.js +0 -136
- package/dist/tests/metadata.test.js +0 -313
- package/dist/tests/migration-help.test.js +0 -89
- package/dist/tests/origin-resolve.test.js +0 -124
- package/dist/tests/output-baseline.test.js +0 -218
- package/dist/tests/output-shapes-unit.test.js +0 -478
- package/dist/tests/parallel-search.test.js +0 -272
- package/dist/tests/parameter-metadata.test.js +0 -365
- package/dist/tests/paths.test.js +0 -177
- package/dist/tests/progressive-disclosure.test.js +0 -280
- package/dist/tests/proposals.test.js +0 -279
- package/dist/tests/proposed-quality.test.js +0 -271
- package/dist/tests/provider-registry.test.js +0 -32
- package/dist/tests/ranking-regression.test.js +0 -548
- package/dist/tests/reflect-propose.test.js +0 -455
- package/dist/tests/registry-build-index.test.js +0 -394
- package/dist/tests/registry-cli.test.js +0 -290
- package/dist/tests/registry-index-v2.test.js +0 -430
- package/dist/tests/registry-install.test.js +0 -728
- package/dist/tests/registry-providers/parity.test.js +0 -189
- package/dist/tests/registry-providers/skills-sh.test.js +0 -309
- package/dist/tests/registry-providers/static-index.test.js +0 -238
- package/dist/tests/registry-resolve.test.js +0 -126
- package/dist/tests/registry-search.test.js +0 -923
- package/dist/tests/remember-frontmatter.test.js +0 -378
- package/dist/tests/remember-unit.test.js +0 -123
- package/dist/tests/ripgrep-install.test.js +0 -251
- package/dist/tests/ripgrep-resolve.test.js +0 -108
- package/dist/tests/ripgrep.test.js +0 -163
- package/dist/tests/save-command.test.js +0 -94
- package/dist/tests/save-trust-qa-fixes.test.js +0 -270
- package/dist/tests/scoring-pipeline.test.js +0 -648
- package/dist/tests/search-include-proposed-cli.test.js +0 -118
- package/dist/tests/self-update.test.js +0 -442
- package/dist/tests/semantic-search-e2e.test.js +0 -512
- package/dist/tests/semantic-status.test.js +0 -471
- package/dist/tests/setup-run.integration.js +0 -877
- package/dist/tests/setup-wizard.test.js +0 -198
- package/dist/tests/setup.test.js +0 -131
- package/dist/tests/source-add.test.js +0 -11
- package/dist/tests/source-clone.test.js +0 -254
- package/dist/tests/source-manage.test.js +0 -366
- package/dist/tests/source-providers/filesystem.test.js +0 -82
- package/dist/tests/source-providers/git.test.js +0 -252
- package/dist/tests/source-providers/website.test.js +0 -128
- package/dist/tests/source-qa-fixes.test.js +0 -286
- package/dist/tests/source-registry.test.js +0 -350
- package/dist/tests/source-resolve.test.js +0 -100
- package/dist/tests/source-source.test.js +0 -281
- package/dist/tests/source.test.js +0 -533
- package/dist/tests/tar-utils-scan.test.js +0 -73
- package/dist/tests/toggle-components.test.js +0 -73
- package/dist/tests/usage-telemetry.test.js +0 -265
- package/dist/tests/utility-scoring.test.js +0 -558
- package/dist/tests/vault-load-error.test.js +0 -78
- package/dist/tests/vault-qa-fixes.test.js +0 -194
- package/dist/tests/vault.test.js +0 -429
- package/dist/tests/vector-search.test.js +0 -608
- package/dist/tests/walker.test.js +0 -252
- package/dist/tests/wave2-cluster-bc.test.js +0 -228
- package/dist/tests/wave2-cluster-d.test.js +0 -180
- package/dist/tests/wave2-cluster-e.test.js +0 -179
- package/dist/tests/wiki-qa-fixes.test.js +0 -270
- package/dist/tests/wiki.test.js +0 -529
- package/dist/tests/workflow-cli.test.js +0 -271
- package/dist/tests/workflow-markdown.test.js +0 -171
- package/dist/tests/workflow-path-escape.test.js +0 -132
- package/dist/tests/workflow-qa-fixes.test.js +0 -395
- package/dist/tests/workflows/indexer-rejection.test.js +0 -213
- /package/dist/{src/commands → commands}/completions.js +0 -0
- /package/dist/{src/commands → commands}/config-cli.js +0 -0
- /package/dist/{src/commands → commands}/curate.js +0 -0
- /package/dist/{src/commands → commands}/distill.js +0 -0
- /package/dist/{src/commands → commands}/events.js +0 -0
- /package/dist/{src/commands → commands}/history.js +0 -0
- /package/dist/{src/commands → commands}/info.js +0 -0
- /package/dist/{src/commands → commands}/init.js +0 -0
- /package/dist/{src/commands → commands}/install-audit.js +0 -0
- /package/dist/{src/commands → commands}/migration-help.js +0 -0
- /package/dist/{src/commands → commands}/proposal.js +0 -0
- /package/dist/{src/commands → commands}/propose.js +0 -0
- /package/dist/{src/commands → commands}/reflect.js +0 -0
- /package/dist/{src/commands → commands}/registry-search.js +0 -0
- /package/dist/{src/commands → commands}/remember.js +0 -0
- /package/dist/{src/commands → commands}/search.js +0 -0
- /package/dist/{src/commands → commands}/self-update.js +0 -0
- /package/dist/{src/commands → commands}/show.js +0 -0
- /package/dist/{src/commands → commands}/source-clone.js +0 -0
- /package/dist/{src/commands → commands}/source-manage.js +0 -0
- /package/dist/{src/commands → commands}/vault.js +0 -0
- /package/dist/{src/core → core}/asset-ref.js +0 -0
- /package/dist/{src/core → core}/asset-registry.js +0 -0
- /package/dist/{src/core → core}/asset-spec.js +0 -0
- /package/dist/{src/core → core}/errors.js +0 -0
- /package/dist/{src/core → core}/events.js +0 -0
- /package/dist/{src/core → core}/frontmatter.js +0 -0
- /package/dist/{src/core → core}/lesson-lint.js +0 -0
- /package/dist/{src/core → core}/markdown.js +0 -0
- /package/dist/{src/core → core}/paths.js +0 -0
- /package/dist/{src/core → core}/proposals.js +0 -0
- /package/dist/{src/core → core}/warn.js +0 -0
- /package/dist/{src/core → core}/write-source.js +0 -0
- /package/dist/{src/indexer → indexer}/db.js +0 -0
- /package/dist/{src/indexer → indexer}/file-context.js +0 -0
- /package/dist/{src/indexer → indexer}/graph-boost.js +0 -0
- /package/dist/{src/indexer → indexer}/manifest.js +0 -0
- /package/dist/{src/indexer → indexer}/matchers.js +0 -0
- /package/dist/{src/indexer → indexer}/metadata.js +0 -0
- /package/dist/{src/indexer → indexer}/search-fields.js +0 -0
- /package/dist/{src/indexer → indexer}/semantic-status.js +0 -0
- /package/dist/{src/indexer → indexer}/usage-events.js +0 -0
- /package/dist/{src/indexer → indexer}/walker.js +0 -0
- /package/dist/{src/integrations → integrations}/agent/config.js +0 -0
- /package/dist/{src/integrations → integrations}/agent/detect.js +0 -0
- /package/dist/{src/integrations → integrations}/agent/index.js +0 -0
- /package/dist/{src/integrations → integrations}/agent/profiles.js +0 -0
- /package/dist/{src/integrations → integrations}/agent/prompts.js +0 -0
- /package/dist/{src/integrations → integrations}/agent/spawn.js +0 -0
- /package/dist/{src/integrations → integrations}/github.js +0 -0
- /package/dist/{src/integrations → integrations}/lockfile.js +0 -0
- /package/dist/{src/llm → llm}/embedders/cache.js +0 -0
- /package/dist/{src/llm → llm}/embedders/types.js +0 -0
- /package/dist/{src/llm → llm}/feature-gate.js +0 -0
- /package/dist/{src/llm → llm}/index-passes.js +0 -0
- /package/dist/{src/output → output}/context.js +0 -0
- /package/dist/{src/output → output}/renderers.js +0 -0
- /package/dist/{src/output → output}/shapes.js +0 -0
- /package/dist/{src/output → output}/text.js +0 -0
- /package/dist/{src/registry → registry}/build-index.js +0 -0
- /package/dist/{src/registry → registry}/create-provider-registry.js +0 -0
- /package/dist/{src/registry → registry}/factory.js +0 -0
- /package/dist/{src/registry → registry}/origin-resolve.js +0 -0
- /package/dist/{src/registry → registry}/providers/index.js +0 -0
- /package/dist/{src/registry → registry}/providers/skills-sh.js +0 -0
- /package/dist/{src/registry → registry}/providers/static-index.js +0 -0
- /package/dist/{src/registry → registry}/providers/types.js +0 -0
- /package/dist/{src/registry → registry}/resolve.js +0 -0
- /package/dist/{src/registry → registry}/types.js +0 -0
- /package/dist/{src/setup → setup}/detect.js +0 -0
- /package/dist/{src/setup → setup}/ripgrep-install.js +0 -0
- /package/dist/{src/setup → setup}/ripgrep-resolve.js +0 -0
- /package/dist/{src/setup → setup}/steps.js +0 -0
- /package/dist/{src/sources → sources}/include.js +0 -0
- /package/dist/{src/sources → sources}/provider-factory.js +0 -0
- /package/dist/{src/sources → sources}/provider.js +0 -0
- /package/dist/{src/sources → sources}/providers/filesystem.js +0 -0
- /package/dist/{src/sources → sources}/providers/git.js +0 -0
- /package/dist/{src/sources → sources}/providers/index.js +0 -0
- /package/dist/{src/sources → sources}/providers/install-types.js +0 -0
- /package/dist/{src/sources → sources}/providers/npm.js +0 -0
- /package/dist/{src/sources → sources}/providers/provider-utils.js +0 -0
- /package/dist/{src/sources → sources}/providers/sync-from-ref.js +0 -0
- /package/dist/{src/sources → sources}/providers/tar-utils.js +0 -0
- /package/dist/{src/sources → sources}/resolve.js +0 -0
- /package/dist/{src/sources → sources}/types.js +0 -0
- /package/dist/{src/templates → templates}/wiki-templates.js +0 -0
- /package/dist/{src/version.js → version.js} +0 -0
- /package/dist/{src/wiki → wiki}/wiki.js +0 -0
- /package/dist/{src/workflows → workflows}/authoring.js +0 -0
- /package/dist/{src/workflows → workflows}/cli.js +0 -0
- /package/dist/{src/workflows → workflows}/db.js +0 -0
- /package/dist/{src/workflows → workflows}/document-cache.js +0 -0
- /package/dist/{src/workflows → workflows}/parser.js +0 -0
- /package/dist/{src/workflows → workflows}/renderer.js +0 -0
- /package/dist/{src/workflows → workflows}/runs.js +0 -0
- /package/dist/{src/workflows → workflows}/schema.js +0 -0
- /package/dist/{src/workflows → workflows}/validator.js +0 -0
|
@@ -1,1885 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* akm-bench report rendering (spec §13.3).
|
|
3
|
-
*
|
|
4
|
-
* Two report flavours coexist:
|
|
5
|
-
*
|
|
6
|
-
* • `renderJsonReport` / `renderMarkdownSummary` — the simple v1 envelope
|
|
7
|
-
* introduced in #236. Kept for backward-compat with the empty-corpus
|
|
8
|
-
* skeleton path; not used by the populated `utility` flow.
|
|
9
|
-
*
|
|
10
|
-
* • `renderUtilityReport` — the §13.3 shape, including per-task breakdown,
|
|
11
|
-
* per-arm and corpus-wide aggregates, akm−noakm deltas, and the
|
|
12
|
-
* trajectory subsection. This is what `bench utility` writes when the
|
|
13
|
-
* corpus has tasks.
|
|
14
|
-
*/
|
|
15
|
-
import { execSync } from "node:child_process";
|
|
16
|
-
import { aggregateAkmOverhead, aggregateByMemoryAbility, aggregateByTaskFamily, computeAkmOverhead, computeAssetRegressionCandidates, computeCorpusCoverage, computeDomainAggregates, computeNegativeTransfer, computeWorkflowReliability, histogramKeys, } from "./metrics";
|
|
17
|
-
/**
|
|
18
|
-
* Pretty-print a 2-space-indented JSON envelope. The shape is the v1
|
|
19
|
-
* contract — `bench compare` reads it and refuses to diff across mismatched
|
|
20
|
-
* `model` fields.
|
|
21
|
-
*/
|
|
22
|
-
export function renderJsonReport(input) {
|
|
23
|
-
const envelope = {
|
|
24
|
-
schemaVersion: 1,
|
|
25
|
-
timestamp: input.timestamp,
|
|
26
|
-
branch: input.branch,
|
|
27
|
-
commit: input.commit,
|
|
28
|
-
track: input.track,
|
|
29
|
-
agent: { harness: "opencode", model: input.model },
|
|
30
|
-
aggregate: input.arms,
|
|
31
|
-
};
|
|
32
|
-
return JSON.stringify(envelope, null, 2);
|
|
33
|
-
}
|
|
34
|
-
/**
|
|
35
|
-
* 5-ish-line markdown summary for stderr / PR descriptions. Used by the
|
|
36
|
-
* empty-corpus skeleton path.
|
|
37
|
-
*/
|
|
38
|
-
export function renderMarkdownSummary(input) {
|
|
39
|
-
const lines = [];
|
|
40
|
-
lines.push(`# akm-bench (${input.track}) — ${input.model}`);
|
|
41
|
-
lines.push(`branch \`${input.branch}\` @ \`${input.commit}\` — ${input.timestamp}`);
|
|
42
|
-
for (const [arm, agg] of Object.entries(input.arms)) {
|
|
43
|
-
lines.push(`- **${arm}**: pass_rate=${agg.passRate.toFixed(2)}, tokens_per_pass=${agg.tokensPerPass.toFixed(0)}, wallclock_ms=${agg.wallclockMs.toFixed(0)}, budget_exceeded=${agg.budgetExceeded}`);
|
|
44
|
-
}
|
|
45
|
-
return lines.join("\n");
|
|
46
|
-
}
|
|
47
|
-
/**
|
|
48
|
-
* Project a RunResult onto its compact serialised form for the §13.3 JSON
|
|
49
|
-
* envelope (#249). Mirrors the field list in the issue body.
|
|
50
|
-
*
|
|
51
|
-
* Token-shape seam: `tokens` is spread verbatim from `result.tokens` so when
|
|
52
|
-
* #252 adds a `measurement` field the renderer doesn't need a code change.
|
|
53
|
-
* Do NOT hardcode `{input, output}` projections here.
|
|
54
|
-
*/
|
|
55
|
-
export function serializeRunForReport(result) {
|
|
56
|
-
return {
|
|
57
|
-
task_id: result.taskId,
|
|
58
|
-
arm: result.arm,
|
|
59
|
-
seed: result.seed,
|
|
60
|
-
model: result.model,
|
|
61
|
-
outcome: result.outcome,
|
|
62
|
-
// TODO(#252): when RunResult.tokens grows a `measurement` key, this spread
|
|
63
|
-
// carries it forward without a renderer change.
|
|
64
|
-
tokens: { ...result.tokens },
|
|
65
|
-
wallclock_ms: result.wallclockMs,
|
|
66
|
-
verifier_exit_code: result.verifierExitCode,
|
|
67
|
-
trajectory: {
|
|
68
|
-
correct_asset_loaded: result.trajectory.correctAssetLoaded,
|
|
69
|
-
feedback_recorded: result.trajectory.feedbackRecorded,
|
|
70
|
-
},
|
|
71
|
-
assets_loaded: [...(result.assetsLoaded ?? [])],
|
|
72
|
-
failure_mode: result.failureMode ?? null,
|
|
73
|
-
};
|
|
74
|
-
}
|
|
75
|
-
/**
|
|
76
|
-
* Stamp a utility run into both the §13.3 JSON envelope and a markdown
|
|
77
|
-
* summary. Callers wire stdout/stderr separately.
|
|
78
|
-
*
|
|
79
|
-
* Determinism: given identical input the function is byte-stable. Markdown
|
|
80
|
-
* does not embed `timestamp` in the body table (only in the header), so
|
|
81
|
-
* snapshot tests are stable across reruns.
|
|
82
|
-
*/
|
|
83
|
-
export function renderUtilityReport(input) {
|
|
84
|
-
const json = buildUtilityJson(input);
|
|
85
|
-
const markdown = buildUtilityMarkdown(input);
|
|
86
|
-
return { json, markdown };
|
|
87
|
-
}
|
|
88
|
-
function buildUtilityJson(input) {
|
|
89
|
-
const includeSynth = input.aggregateSynth !== undefined;
|
|
90
|
-
const tasks = input.tasks.map((t) => ({
|
|
91
|
-
id: t.id,
|
|
92
|
-
noakm: serialisePerTaskMetrics(t.noakm),
|
|
93
|
-
akm: serialisePerTaskMetrics(t.akm),
|
|
94
|
-
delta: serialiseDelta(t.delta),
|
|
95
|
-
// #261: per-task synthetic block is emitted ONLY when the runner opted
|
|
96
|
-
// into the synthetic arm AND this task carries a synthetic aggregate.
|
|
97
|
-
// When the arm was not run we leave the key absent — a missing arm is
|
|
98
|
-
// not a zero-pass arm.
|
|
99
|
-
...(includeSynth && t.synthetic ? { synthetic: serialisePerTaskMetrics(t.synthetic) } : {}),
|
|
100
|
-
}));
|
|
101
|
-
// Negative-transfer + domain-level diagnostics (#260). Pure post-processing
|
|
102
|
-
// off `input.tasks` and `input.akmRuns` — runner.ts is intentionally
|
|
103
|
-
// untouched so this slots in alongside the per-task entries that already
|
|
104
|
-
// carry both arms via UtilityReportTaskEntry.
|
|
105
|
-
const negativeTransfer = computeNegativeTransfer(input.tasks);
|
|
106
|
-
const domainDeltas = computeDomainAggregates(input.tasks);
|
|
107
|
-
const assetRegressionCandidates = computeAssetRegressionCandidates(negativeTransfer.topRegressedTasks.map((r) => r.taskId), input.akmRuns ?? []);
|
|
108
|
-
// Token-measurement coverage (issue #252). Folds the corpus-wide picture so
|
|
109
|
-
// operators can tell at a glance whether token economics are reliable. The
|
|
110
|
-
// warning string mirrors what we add to `warnings[]` in markdown output.
|
|
111
|
-
const tokenMeasurement = summariseTokenMeasurement(input);
|
|
112
|
-
const warnings = [...input.warnings];
|
|
113
|
-
if (tokenMeasurement.warning)
|
|
114
|
-
warnings.push(tokenMeasurement.warning);
|
|
115
|
-
const envelope = {
|
|
116
|
-
schemaVersion: 1,
|
|
117
|
-
track: "utility",
|
|
118
|
-
branch: input.branch,
|
|
119
|
-
commit: input.commit,
|
|
120
|
-
timestamp: input.timestamp,
|
|
121
|
-
agent: { harness: "opencode", model: input.model },
|
|
122
|
-
corpus: input.corpus,
|
|
123
|
-
aggregate: {
|
|
124
|
-
noakm: serialiseCorpus(input.aggregateNoakm),
|
|
125
|
-
akm: serialiseCorpus(input.aggregateAkm),
|
|
126
|
-
delta: serialiseDelta(input.aggregateDelta),
|
|
127
|
-
// #261: synthetic aggregate is emitted ONLY when includeSynthetic
|
|
128
|
-
// was set on the runner. Absent otherwise — byte-identical to the
|
|
129
|
-
// pre-#261 envelope.
|
|
130
|
-
...(input.aggregateSynth ? { synthetic: serialiseCorpus(input.aggregateSynth) } : {}),
|
|
131
|
-
// #261: akm_over_synthetic_lift = passRate(akm) - passRate(synthetic).
|
|
132
|
-
// Only computed when the synthetic arm ran. Positive => AKM beats the
|
|
133
|
-
// synthetic-notes baseline; non-positive flags AKM is not adding value
|
|
134
|
-
// beyond what the model can synthesise on its own.
|
|
135
|
-
...(input.aggregateSynth
|
|
136
|
-
? { akm_over_synthetic_lift: input.aggregateAkm.passRate - input.aggregateSynth.passRate }
|
|
137
|
-
: {}),
|
|
138
|
-
},
|
|
139
|
-
trajectory: {
|
|
140
|
-
akm: {
|
|
141
|
-
correct_asset_loaded: input.trajectoryAkm.correctAssetLoaded,
|
|
142
|
-
feedback_recorded: input.trajectoryAkm.feedbackRecorded,
|
|
143
|
-
},
|
|
144
|
-
},
|
|
145
|
-
failure_modes: {
|
|
146
|
-
by_label: input.failureModes.byLabel,
|
|
147
|
-
by_task: input.failureModes.byTask,
|
|
148
|
-
},
|
|
149
|
-
token_measurement: {
|
|
150
|
-
total_runs: tokenMeasurement.totalRuns,
|
|
151
|
-
runs_with_measured_tokens: tokenMeasurement.measuredRuns,
|
|
152
|
-
runs_missing_measurement: tokenMeasurement.missingRuns,
|
|
153
|
-
runs_unsupported_measurement: tokenMeasurement.unsupportedRuns,
|
|
154
|
-
coverage: tokenMeasurement.coverage,
|
|
155
|
-
reliable: tokenMeasurement.reliable,
|
|
156
|
-
},
|
|
157
|
-
tasks,
|
|
158
|
-
negative_transfer_count: negativeTransfer.count,
|
|
159
|
-
negative_transfer_severity: negativeTransfer.severity,
|
|
160
|
-
top_regressed_tasks: negativeTransfer.topRegressedTasks.map((r) => ({
|
|
161
|
-
task_id: r.taskId,
|
|
162
|
-
domain: r.domain,
|
|
163
|
-
noakm_pass_rate: r.noakmPassRate,
|
|
164
|
-
akm_pass_rate: r.akmPassRate,
|
|
165
|
-
delta: r.delta,
|
|
166
|
-
severity: r.severity,
|
|
167
|
-
})),
|
|
168
|
-
domain_level_deltas: domainDeltas.map(serialiseDomainAggregate),
|
|
169
|
-
asset_regression_candidates: assetRegressionCandidates.map(serialiseAssetRegressionCandidate),
|
|
170
|
-
corpus_coverage: buildCorpusCoverageBlock(input),
|
|
171
|
-
workflow: buildWorkflowAggregate(input.workflowChecks ?? []),
|
|
172
|
-
warnings,
|
|
173
|
-
...(input.searchBridge ? { searchBridge: serialiseSearchBridge(input.searchBridge) } : {}),
|
|
174
|
-
};
|
|
175
|
-
// Compact raw runs[] — additive top-level key (#249). One row per
|
|
176
|
-
// (task, arm, seed) execution; both noakm and akm. Older artefacts that
|
|
177
|
-
// pre-date this field stay valid because we only emit it when the runner
|
|
178
|
-
// actually populated `allRuns`.
|
|
179
|
-
if (input.allRuns) {
|
|
180
|
-
envelope.runs = input.allRuns.map(serializeRunForReport);
|
|
181
|
-
}
|
|
182
|
-
// Baseline pass-rate map — additive top-level key. Emitted only when the
|
|
183
|
-
// caller supplied a baseline through `loadBenchRunConfig`; legacy reports
|
|
184
|
-
// stay byte-identical without it.
|
|
185
|
-
if (input.baselineByTaskId) {
|
|
186
|
-
envelope.baseline_by_task_id = { ...input.baselineByTaskId };
|
|
187
|
-
}
|
|
188
|
-
// Per-asset attribution is an additive top-level key (§6.5). Emit it only
|
|
189
|
-
// when the runner populated it so older code paths (e.g. the empty-corpus
|
|
190
|
-
// skeleton) don't gain the key spuriously.
|
|
191
|
-
if (input.perAsset) {
|
|
192
|
-
envelope.perAsset = {
|
|
193
|
-
total_akm_runs: input.perAsset.totalAkmRuns,
|
|
194
|
-
rows: input.perAsset.rows.map((r) => ({
|
|
195
|
-
asset_ref: r.assetRef,
|
|
196
|
-
load_count: r.loadCount,
|
|
197
|
-
load_count_passing: r.loadCountPassing,
|
|
198
|
-
load_count_failing: r.loadCountFailing,
|
|
199
|
-
load_pass_rate: r.loadPassRate,
|
|
200
|
-
})),
|
|
201
|
-
};
|
|
202
|
-
}
|
|
203
|
-
// AKM overhead + tool-use efficiency block (#263). Computed from the akm-
|
|
204
|
-
// arm RunResults attached to the report; missing akmRuns yields an empty
|
|
205
|
-
// aggregate so the key shape stays stable.
|
|
206
|
-
envelope.akm_overhead = buildAkmOverheadBlock(input);
|
|
207
|
-
return envelope;
|
|
208
|
-
}
|
|
209
|
-
// ── AKM overhead block (#263) ──────────────────────────────────────────────
|
|
210
|
-
/**
|
|
211
|
-
* Build the §13.3 `akm_overhead` block from the akm-arm RunResults and (when
|
|
212
|
-
* supplied) per-task metadata. `taskMetadata` lets us split irrelevant from
|
|
213
|
-
* relevant asset loads and compute time-to-first-correct-asset; without it
|
|
214
|
-
* those fields surface as `null` rather than misleading zeros.
|
|
215
|
-
*/
|
|
216
|
-
function buildAkmOverheadBlock(input) {
|
|
217
|
-
const akmRuns = input.akmRuns ?? [];
|
|
218
|
-
const meta = new Map();
|
|
219
|
-
for (const t of input.taskMetadata ?? []) {
|
|
220
|
-
meta.set(t.id, { goldRef: t.goldRef, expectedTransferFrom: t.expectedTransferFrom });
|
|
221
|
-
}
|
|
222
|
-
const perRun = computeAkmOverhead(akmRuns, { taskMetadata: meta });
|
|
223
|
-
const aggregate = aggregateAkmOverhead(perRun, akmRuns);
|
|
224
|
-
return {
|
|
225
|
-
per_run: perRun.map(serialiseAkmOverheadPerRun),
|
|
226
|
-
aggregate: serialiseAkmOverheadAggregate(aggregate),
|
|
227
|
-
};
|
|
228
|
-
}
|
|
229
|
-
function serialiseAkmOverheadPerRun(row) {
|
|
230
|
-
return {
|
|
231
|
-
task_id: row.taskId,
|
|
232
|
-
arm: row.arm,
|
|
233
|
-
seed: row.seed,
|
|
234
|
-
outcome: row.outcome,
|
|
235
|
-
search_count: row.searchCount,
|
|
236
|
-
show_count: row.showCount,
|
|
237
|
-
feedback_count: row.feedbackCount,
|
|
238
|
-
positive_feedback_count: row.positiveFeedbackCount,
|
|
239
|
-
negative_feedback_count: row.negativeFeedbackCount,
|
|
240
|
-
total_tool_calls: row.totalToolCalls,
|
|
241
|
-
assets_loaded_count: row.assetsLoadedCount,
|
|
242
|
-
irrelevant_assets_loaded_count: row.irrelevantAssetsLoadedCount,
|
|
243
|
-
time_to_first_search_ms: row.timeToFirstSearchMs,
|
|
244
|
-
time_to_first_correct_asset_ms: row.timeToFirstCorrectAssetMs,
|
|
245
|
-
context_bytes_loaded: row.contextBytesLoaded,
|
|
246
|
-
asset_bytes_loaded: row.assetBytesLoaded,
|
|
247
|
-
};
|
|
248
|
-
}
|
|
249
|
-
/** Snake-case wire shape for the aggregate row of the `akm_overhead` block. */
function serialiseAkmOverheadAggregate(agg) {
    const {
        totalRuns, passingRuns,
        meanSearchCount, meanShowCount, meanFeedbackCount, meanToolCalls,
        meanAssetsLoaded, meanIrrelevantAssetsLoaded,
        meanTimeToFirstSearchMs, meanTimeToFirstCorrectAssetMs,
        meanContextBytesLoaded, meanAssetBytesLoaded,
        totalToolCalls, toolCallsPerSuccess, costPerSuccess,
        searchEngagementRate, showEngagementRate, feedbackEngagementRate,
        searchToShowRatio, meanPositiveFeedbackCount, meanNegativeFeedbackCount,
    } = agg;
    return {
        total_runs: totalRuns,
        passing_runs: passingRuns,
        mean_search_count: meanSearchCount,
        mean_show_count: meanShowCount,
        mean_feedback_count: meanFeedbackCount,
        mean_tool_calls: meanToolCalls,
        mean_assets_loaded: meanAssetsLoaded,
        mean_irrelevant_assets_loaded: meanIrrelevantAssetsLoaded,
        mean_time_to_first_search_ms: meanTimeToFirstSearchMs,
        mean_time_to_first_correct_asset_ms: meanTimeToFirstCorrectAssetMs,
        mean_context_bytes_loaded: meanContextBytesLoaded,
        mean_asset_bytes_loaded: meanAssetBytesLoaded,
        total_tool_calls: totalToolCalls,
        tool_calls_per_success: toolCallsPerSuccess,
        cost_per_success: costPerSuccess,
        search_engagement_rate: searchEngagementRate,
        show_engagement_rate: showEngagementRate,
        feedback_engagement_rate: feedbackEngagementRate,
        search_to_show_ratio: searchToShowRatio,
        mean_positive_feedback_count: meanPositiveFeedbackCount,
        mean_negative_feedback_count: meanNegativeFeedbackCount,
    };
}
|
|
274
|
-
/**
 * Render the §13.3 AKM overhead summary as a compact markdown section (#263).
 * Skipped entirely when the corpus had no akm-arm runs so the report stays
 * tight on the no-akm code path.
 */
export function renderAkmOverheadSection(input) {
    const akmRuns = input.akmRuns ?? [];
    if (akmRuns.length === 0)
        return "";
    // Same metadata indexing as buildAkmOverheadBlock: per-task gold refs
    // enable the irrelevant-load / first-correct-asset columns.
    const taskMetadata = new Map((input.taskMetadata ?? []).map((t) => [
        t.id,
        { goldRef: t.goldRef, expectedTransferFrom: t.expectedTransferFrom },
    ]));
    const perRun = computeAkmOverhead(akmRuns, { taskMetadata });
    const agg = aggregateAkmOverhead(perRun, akmRuns);
    const toolCallsPerSuccess = agg.toolCallsPerSuccess === null ? "n/a" : formatMean(agg.toolCallsPerSuccess);
    const costPerSuccess = agg.costPerSuccess === null ? "n/a" : formatMean(agg.costPerSuccess);
    return [
        "## AKM overhead",
        "",
        `- runs: ${agg.totalRuns} (${agg.passingRuns} passed)`,
        `- tool calls: search=${formatMean(agg.meanSearchCount)} show=${formatMean(agg.meanShowCount)} feedback=${formatMean(agg.meanFeedbackCount)} (mean per run)`,
        `- total tool calls: ${agg.totalToolCalls} (mean ${formatMean(agg.meanToolCalls)} per run)`,
        `- tool_calls_per_success: ${toolCallsPerSuccess}`,
        `- assets loaded (mean unique per run): ${formatMean(agg.meanAssetsLoaded)}`,
        `- irrelevant assets loaded (mean per tagged run): ${formatNullableMean(agg.meanIrrelevantAssetsLoaded)}`,
        `- time_to_first_search: ${formatNullableMs(agg.meanTimeToFirstSearchMs)}`,
        `- time_to_first_correct_asset: ${formatNullableMs(agg.meanTimeToFirstCorrectAssetMs)}`,
        `- context_bytes_loaded: ${formatNullableBytes(agg.meanContextBytesLoaded)}`,
        `- asset_bytes_loaded: ${formatNullableBytes(agg.meanAssetBytesLoaded)}`,
        `- cost_per_success: ${costPerSuccess} tokens`,
    ].join("\n");
}
|
|
305
|
-
/** Two-decimal string for a mean; callers guarantee `value` is a number. */
function formatMean(value) {
    return value.toFixed(2);
}
|
|
308
|
-
/** Like formatMean, but renders `null` (no data) as "n/a". */
function formatNullableMean(value) {
    if (value === null) {
        return "n/a";
    }
    return value.toFixed(2);
}
|
|
311
|
-
/** Millisecond duration rounded to an integer; `null` renders as "n/a". */
function formatNullableMs(value) {
    if (value === null) {
        return "n/a";
    }
    return `${Math.round(value)}ms`;
}
|
|
314
|
-
/** Byte count rounded to an integer; `null` renders as "n/a". */
function formatNullableBytes(value) {
    if (value === null) {
        return "n/a";
    }
    return `${Math.round(value)} bytes`;
}
|
|
317
|
-
/**
 * §6.7 envelope. We expose `null` for percentiles that fell into the missing
 * bucket so JSON consumers don't choke on `Infinity` (see percentileForJson
 * for the exact null / "missing" / number mapping).
 */
function serialiseSearchBridge(s) {
    return {
        runs_observed: s.runsObserved,
        searches_observed: s.searchesObserved,
        gold_rank_distribution: s.goldRankDistribution,
        gold_rank_p50: percentileForJson(s.goldRankP50),
        gold_rank_p90: percentileForJson(s.goldRankP90),
        gold_at_rank_1: s.goldAtRank1,
        gold_missing: s.goldMissing,
        pass_rate_by_rank: s.passRateByRank.map(({ rank, passRate, runCount }) => ({
            rank,
            pass_rate: passRate,
            run_count: runCount,
        })),
    };
}
|
|
337
|
-
/**
 * JSON-safe percentile: `null` passes through, non-finite values (the
 * "gold never surfaced" bucket) become the string "missing", finite
 * numbers are returned unchanged.
 */
function percentileForJson(value) {
    if (value === null)
        return null;
    return Number.isFinite(value) ? value : "missing";
}
|
|
344
|
-
/** Snake-case wire shape for one corpus-level metrics row. */
function serialiseCorpus(c) {
    const { passRate, tokensPerPass, tokensPerRun, wallclockMs } = c;
    return {
        pass_rate: passRate,
        tokens_per_pass: tokensPerPass,
        tokens_per_run: tokensPerRun,
        wallclock_ms: wallclockMs,
    };
}
|
|
352
|
-
/** Snake-case wire shape for an akm − noakm delta row (same keys as corpus). */
function serialiseDelta(d) {
    const { passRate, tokensPerPass, tokensPerRun, wallclockMs } = d;
    return {
        pass_rate: passRate,
        tokens_per_pass: tokensPerPass,
        tokens_per_run: tokensPerRun,
        wallclock_ms: wallclockMs,
    };
}
|
|
360
|
-
/** Snake-case wire shape for one row of `domain_level_deltas` (#260). */
function serialiseDomainAggregate(row) {
    const {
        domain, taskCount, regressionCount,
        passRateNoakm, passRateAkm, passRateDelta,
        tokensPerPassDelta, wallclockMsDelta,
    } = row;
    return {
        domain,
        task_count: taskCount,
        regression_count: regressionCount,
        pass_rate_noakm: passRateNoakm,
        pass_rate_akm: passRateAkm,
        pass_rate_delta: passRateDelta,
        tokens_per_pass_delta: tokensPerPassDelta,
        wallclock_ms_delta: wallclockMsDelta,
    };
}
|
|
373
|
-
// ── Corpus coverage block (#262) ───────────────────────────────────────────
/**
 * Build the §13.3 `corpus_coverage` block from a UtilityRunReport (#262).
 * Folds three pieces:
 * - `coverage`: counts per `memory_ability` (closed set + `untagged`) and
 *   `task_family`. Operators see at a glance which abilities the corpus
 *   covers and which are missing.
 * - `by_memory_ability` / `by_task_family`: per-category aggregates of pass
 *   rate, akm − noakm delta, negative transfer count, and (when supplied)
 *   workflow-compliance mean.
 *
 * When the runner did not plumb `taskMetadata` (legacy code paths) we emit a
 * skeleton block with zero counts so JSON consumers don't see the key flicker
 * in and out depending on the runner version.
 */
function buildCorpusCoverageBlock(input) {
    const taskMetadata = input.taskMetadata ?? [];
    const metaById = new Map(taskMetadata.map((m) => [m.id, m]));
    // Pair each task's per-arm results with its (optional) coverage tags.
    // Optional fields are only set when present so downstream aggregators can
    // distinguish "untagged" from "tagged with a falsy value".
    const tagEntries = input.tasks.map((t) => {
        const meta = metaById.get(t.id);
        const entry = { id: t.id, noakm: t.noakm, akm: t.akm };
        if (meta?.memoryAbility)
            entry.memoryAbility = meta.memoryAbility;
        if (meta?.taskFamily)
            entry.taskFamily = meta.taskFamily;
        if (meta?.workflowFocus)
            entry.workflowFocus = meta.workflowFocus;
        if (typeof t.workflowCompliance === "number" && Number.isFinite(t.workflowCompliance)) {
            entry.workflowCompliance = t.workflowCompliance;
        }
        return entry;
    });
    return {
        coverage: computeCorpusCoverage(taskMetadata),
        by_memory_ability: aggregateByMemoryAbility(tagEntries).map(serialiseCategoryRow),
        by_task_family: aggregateByTaskFamily(tagEntries).map(serialiseCategoryRow),
    };
}
|
|
420
|
-
/** Snake-case wire shape for one `by_memory_ability` / `by_task_family` row. */
function serialiseCategoryRow(row) {
    const {
        category, taskCount,
        passRateNoakm, passRateAkm, passRateDelta,
        negativeTransferCount, workflowCompliance,
    } = row;
    return {
        category,
        task_count: taskCount,
        pass_rate_noakm: passRateNoakm,
        pass_rate_akm: passRateAkm,
        pass_rate_delta: passRateDelta,
        negative_transfer_count: negativeTransferCount,
        workflow_compliance: workflowCompliance,
    };
}
|
|
431
|
-
/** Snake-case wire shape for one row of `asset_regression_candidates` (#260). */
function serialiseAssetRegressionCandidate(row) {
    const { assetRef, regressedTaskCount, regressedTaskIds, totalLoadCount } = row;
    return {
        asset_ref: assetRef,
        regressed_task_count: regressedTaskCount,
        regressed_task_ids: regressedTaskIds,
        total_load_count: totalLoadCount,
    };
}
|
|
440
|
-
/** Snake-case wire shape for one task's per-arm metrics. */
function serialisePerTaskMetrics(m) {
    const {
        passRate, passAt1, tokensPerPass, tokensPerRun, wallclockMs,
        passRateStdev, budgetExceededCount, harnessErrorCount,
        count, runsWithMeasuredTokens,
    } = m;
    return {
        pass_rate: passRate,
        pass_at_1: passAt1,
        tokens_per_pass: tokensPerPass,
        tokens_per_run: tokensPerRun,
        wallclock_ms: wallclockMs,
        pass_rate_stdev: passRateStdev,
        budget_exceeded_count: budgetExceededCount,
        harness_error_count: harnessErrorCount,
        count,
        runs_with_measured_tokens: runsWithMeasuredTokens,
    };
}
|
|
454
|
-
/**
 * Summarise token-measurement quality across the akm-arm runs (issue #252).
 * A run with no explicit `tokenMeasurement` is treated as "parsed".
 * Returns counts, a coverage ratio (`null` when there are no runs), a
 * `reliable` flag, and a human-readable warning when coverage is partial.
 */
function summariseTokenMeasurement(input) {
    const runs = input.akmRuns ?? [];
    const counts = { parsed: 0, missing: 0, unsupported: 0 };
    for (const run of runs) {
        switch (run.tokenMeasurement ?? "parsed") {
            case "parsed":
                counts.parsed += 1;
                break;
            case "missing":
                counts.missing += 1;
                break;
            case "unsupported":
                counts.unsupported += 1;
                break;
            default:
                // Unknown measurement kinds are ignored, matching the
                // original if/else chain's behaviour.
                break;
        }
    }
    const total = runs.length;
    const reliable = total > 0 && counts.missing === 0 && counts.unsupported === 0;
    let warning = null;
    if (total > 0 && !reliable) {
        const parts = [];
        if (counts.missing > 0)
            parts.push(`${counts.missing} missing`);
        if (counts.unsupported > 0)
            parts.push(`${counts.unsupported} unsupported`);
        warning =
            `token measurement unreliable: ${parts.join(", ")} of ${total} akm-arm runs lack parsed token usage; ` +
            `tokens_per_pass and token-budget signals reflect only the ${counts.parsed} measured runs.`;
    }
    return {
        totalRuns: total,
        measuredRuns: counts.parsed,
        missingRuns: counts.missing,
        unsupportedRuns: counts.unsupported,
        coverage: total === 0 ? null : counts.parsed / total,
        reliable,
        warning,
    };
}
|
|
492
|
-
/**
 * Assemble the full utility-run markdown report, section by section, in a
 * fixed order: header → aggregate table → trajectory → per-task pass rates →
 * optional diagnostic sections → token measurement → warnings. Output is
 * deliberately deterministic (sorted per-task rows, no conditional churn) so
 * reports diff cleanly between runs.
 */
function buildUtilityMarkdown(input) {
    const lines = [];
    // Report header: model, branch/commit provenance, and corpus shape.
    lines.push(`# akm-bench utility — ${input.model}`);
    lines.push("");
    lines.push(`branch \`${input.branch}\` @ \`${input.commit}\` — ${input.timestamp}`);
    lines.push(`corpus: ${input.corpus.tasks} tasks across ${input.corpus.domains} domains (slice=${input.corpus.slice}, seedsPerArm=${input.corpus.seedsPerArm})`);
    lines.push("");
    lines.push("## Aggregate");
    lines.push("");
    lines.push("| arm | pass_rate | tokens_per_pass | wallclock_ms |");
    lines.push("|-----|-----------|-----------------|--------------|");
    lines.push(corpusRow("noakm", input.aggregateNoakm));
    // #261: synthetic row sits between noakm and akm so the columns read
    // baseline → synthetic → akm in the natural progression. Only rendered
    // when the runner opted into the synthetic arm.
    if (input.aggregateSynth) {
        lines.push(corpusRow("synthetic", input.aggregateSynth));
    }
    lines.push(corpusRow("akm", input.aggregateAkm));
    lines.push(deltaRow(input.aggregateDelta));
    // #261: akm_over_synthetic_lift summary line. When AKM does not beat the
    // synthetic baseline (lift <= 0) we surface a warning marker so operators
    // cannot miss the regression. Otherwise we render the lift as an
    // informative line.
    if (input.aggregateSynth) {
        const lift = input.aggregateAkm.passRate - input.aggregateSynth.passRate;
        lines.push("");
        if (lift <= 0) {
            lines.push(`:warning: **akm_over_synthetic_lift = ${signedFixed(lift, 2)}** — AKM did not beat the synthetic-notes baseline.`);
        }
        else {
            lines.push(`**akm_over_synthetic_lift: ${signedFixed(lift, 2)}**`);
        }
    }
    lines.push("");
    lines.push("## Trajectory (akm)");
    lines.push("");
    lines.push(`- correct_asset_loaded: ${formatPercent(input.trajectoryAkm.correctAssetLoaded)}`);
    lines.push(`- feedback_recorded: ${formatPercent(input.trajectoryAkm.feedbackRecorded)}`);
    // Per-run trajectory detail: when allRuns is present emit a compact table
    // so operators can distinguish null (harness error — no events captured)
    // from false (agent ran, behaviour not observed) from true (confirmed).
    // Symbols: "—" = null, "✗" = false, "✓" = true.
    const akmRuns = (input.allRuns ?? []).filter((r) => r.arm === "akm");
    if (akmRuns.length > 0) {
        lines.push("");
        lines.push("| task | seed | correct_asset_loaded | feedback_recorded |");
        lines.push("|------|------|----------------------|-------------------|");
        for (const r of akmRuns) {
            lines.push(`| ${r.taskId} | ${r.seed} | ${formatTrajBool(r.trajectory.correctAssetLoaded)} | ${formatTrajBool(r.trajectory.feedbackRecorded)} |`);
        }
    }
    lines.push("");
    lines.push("## Per-task pass rates");
    lines.push("");
    // #261: synthetic column is rendered only when the synthetic arm ran.
    // The default header/row stays identical to the pre-#261 output.
    // Baseline column is rendered only when `baselineByTaskId` was supplied
    // by the caller; legacy reports without it produce byte-identical output.
    const includeSynthCol = input.aggregateSynth !== undefined;
    const baselineMap = input.baselineByTaskId;
    const includeBaselineCol = baselineMap !== undefined;
    const baseColHeader = includeBaselineCol ? " baseline | vs base |" : "";
    const baseColSep = includeBaselineCol ? "----------|---------|" : "";
    if (includeSynthCol) {
        lines.push(`| task | noakm | synthetic | akm | delta |${baseColHeader}`);
        lines.push(`|------|-------|-----------|-----|-------|${baseColSep}`);
    }
    else {
        lines.push(`| task | noakm | akm | delta |${baseColHeader}`);
        lines.push(`|------|-------|-----|-------|${baseColSep}`);
    }
    // Sort tasks alphabetically for byte-stable markdown output.
    const sorted = [...input.tasks].sort((a, b) => a.id.localeCompare(b.id));
    for (const t of sorted) {
        lines.push(taskRow(t, includeSynthCol, baselineMap));
    }
    // Corpus-coverage section (#262). Renders only when at least one task was
    // tagged with a `memory_ability`; without tags the section adds no signal
    // and would just churn snapshots.
    const coverageSection = renderCorpusCoverageSection(input);
    if (coverageSection.length > 0) {
        lines.push("");
        lines.push(coverageSection);
    }
    // Negative-transfer + domain diagnostics (#260). The section stays quiet
    // ("none") when no regressions were observed so green corpora don't fill
    // the report with empty subheaders.
    const negativeTransferSection = renderNegativeTransferSection(input);
    lines.push("");
    lines.push(negativeTransferSection);
    // Failure-mode breakdown (§6.6). Appended near the bottom so the headline
    // pass-rate / trajectory tables stay visually anchored at the top.
    const failureSection = renderFailureModeBreakdown(input);
    if (failureSection.length > 0) {
        lines.push("");
        lines.push(failureSection);
    }
    // §6.7 search → outcome bridge, only when bridge metrics were computed.
    if (input.searchBridge) {
        lines.push("");
        lines.push(renderSearchBridgeTable(input.searchBridge));
    }
    // #257: workflow compliance section. `renderWorkflowComplianceSection`
    // returns "" when there are no checks, so we only push the blank-line
    // separator when there's actually content to render.
    const workflowSection = renderWorkflowComplianceSection(input);
    if (workflowSection.length > 0) {
        lines.push("");
        lines.push(workflowSection);
    }
    // AKM overhead + tool-use efficiency (#263). Skipped when the corpus had
    // no akm-arm runs so the report stays compact on the no-akm path.
    const overheadSection = renderAkmOverheadSection(input);
    if (overheadSection.length > 0) {
        lines.push("");
        lines.push(overheadSection);
    }
    // Token-measurement section (issue #252). Always rendered when there are
    // akm-arm runs to report on, so operators can tell whether tokens economics
    // are trustworthy without scrolling to the warnings block.
    const tokenSummary = summariseTokenMeasurement(input);
    if (tokenSummary.totalRuns > 0) {
        lines.push("");
        lines.push("## Token measurement (akm)");
        lines.push("");
        const cov = tokenSummary.coverage === null ? "n/a" : `${(tokenSummary.coverage * 100).toFixed(1)}%`;
        lines.push(`- runs: ${tokenSummary.totalRuns} total, ${tokenSummary.measuredRuns} measured, ${tokenSummary.missingRuns} missing, ${tokenSummary.unsupportedRuns} unsupported`);
        lines.push(`- coverage: ${cov} (${tokenSummary.reliable ? "reliable" : "unreliable — see warning below"})`);
    }
    // Caller-supplied warnings plus the token-measurement warning (if any)
    // land in one shared Warnings section at the very bottom.
    const warnings = [...input.warnings];
    if (tokenSummary.warning)
        warnings.push(tokenSummary.warning);
    if (warnings.length > 0) {
        lines.push("");
        lines.push("## Warnings");
        lines.push("");
        for (const w of warnings)
            lines.push(`- ${w}`);
    }
    return lines.join("\n");
}
|
|
633
|
-
// ── Search-pipeline bridge (§6.7) markdown ─────────────────────────────────
/**
 * Render the §6.7 search-pipeline bridge as a markdown section.
 *
 * When the corpus has no gold-ref tasks (or simply no `akm search`
 * invocations), the section collapses to a single "(N/A)" sentence so the
 * report stays compact.
 */
export function renderSearchBridgeTable(metrics) {
    const lines = ["## Search → outcome bridge", ""];
    if (metrics.searchesObserved === 0 && metrics.runsObserved === 0) {
        lines.push("(no gold-ref tasks in corpus; bridge metrics N/A)");
        return lines.join("\n");
    }
    // Gold-rank histogram over the fixed key set.
    lines.push("| rank | count |", "|------|-------|");
    for (const key of histogramKeys()) {
        lines.push(`| ${key} | ${metrics.goldRankDistribution[key] ?? 0} |`);
    }
    lines.push("");
    // Single summary line: percentiles plus rank-1 / missing rates.
    lines.push(`p50=${formatRank(metrics.goldRankP50)}, p90=${formatRank(metrics.goldRankP90)}, gold_at_rank_1=${formatPercent(metrics.goldAtRank1)}, gold_missing=${formatPercent(metrics.goldMissing)}`);
    lines.push("");
    // pass_rate_by_rank table, with a placeholder row when no searches ran.
    lines.push("| rank | pass_rate | run_count |", "|------|-----------|-----------|");
    if (metrics.passRateByRank.length === 0) {
        lines.push("| (no runs with `akm search` invocations) | — | 0 |");
    }
    else {
        for (const entry of metrics.passRateByRank) {
            lines.push(`| ${entry.rank} | ${entry.passRate.toFixed(2)} | ${entry.runCount} |`);
        }
    }
    return lines.join("\n");
}
|
|
675
|
-
/** One-decimal rank; `null` → "n/a", non-finite (gold not found) → "missing". */
function formatRank(value) {
    if (value === null)
        return "n/a";
    return Number.isFinite(value) ? value.toFixed(1) : "missing";
}
|
|
682
|
-
/** One aggregate-table row for an arm; tokens_per_pass `null` → "n/a". */
function corpusRow(arm, c) {
    const cells = [
        arm,
        c.passRate.toFixed(2),
        c.tokensPerPass === null ? "n/a" : c.tokensPerPass.toFixed(0),
        c.wallclockMs.toFixed(0),
    ];
    return `| ${cells.join(" | ")} |`;
}
|
|
686
|
-
/** The bold delta row of the aggregate table; each cell carries a +/- sign. */
function deltaRow(d) {
    const pass = signed(d.passRate.toFixed(2));
    const tokens = d.tokensPerPass === null ? "n/a" : signed(d.tokensPerPass.toFixed(0));
    const wall = signed(d.wallclockMs.toFixed(0));
    return `| **delta** | ${pass} | ${tokens} | ${wall} |`;
}
|
|
690
|
-
/** One per-task pass-rate row; optional synthetic and baseline columns. */
function taskRow(t, includeSynthetic = false, baselineByTaskId) {
    // Baseline-delta cell is rendered only when a baseline map is provided
    // AND this task has an entry. Tasks without a baseline entry get an empty
    // pair of cells so columns stay aligned.
    let baselineCells = "";
    if (baselineByTaskId) {
        const base = baselineByTaskId[t.id];
        baselineCells =
            base === undefined
                ? " n/a | n/a |"
                : ` ${base.toFixed(2)} | ${signed((t.akm.passRate - base).toFixed(2))} |`;
    }
    const noakm = t.noakm.passRate.toFixed(2);
    const akm = t.akm.passRate.toFixed(2);
    const delta = signed(t.delta.passRate.toFixed(2));
    if (!includeSynthetic) {
        return `| ${t.id} | ${noakm} | ${akm} | ${delta} |${baselineCells}`;
    }
    // #261: render the synthetic-arm pass-rate when present; "n/a" when the
    // arm did not run for this task. A missing arm is NOT a zero-pass arm —
    // a 0.00 cell would be misleading because the model never tried.
    const synth = t.synthetic ? t.synthetic.passRate.toFixed(2) : "n/a";
    return `| ${t.id} | ${noakm} | ${synth} | ${akm} | ${delta} |${baselineCells}`;
}
|
|
714
|
-
/**
 * Prefix a non-negative numeric string with "+"; zero-like strings are
 * rendered unsigned. Fix: `toFixed` on a tiny negative value (e.g. -0.001)
 * yields "-0.00", which the old check let through as a signed negative zero —
 * inconsistent with signedFixed's explicit "never emit a misleading +0.00 or
 * -0.00" policy. Negative-zero strings now normalise to their unsigned form.
 */
function signed(text) {
    // Matches "0", "0.0", "0.00", and their "-" variants.
    if (/^-?0(?:\.0+)?$/.test(text))
        return text.startsWith("-") ? text.slice(1) : text;
    if (text.startsWith("-"))
        return text;
    return `+${text}`;
}
|
|
721
|
-
/** Fraction → "NN.N%" string; `null` (no data) renders as "n/a". */
function formatPercent(value) {
    return value === null ? "n/a" : `${(value * 100).toFixed(1)}%`;
}
|
|
726
|
-
/**
 * Render a `boolean | null` trajectory field for markdown tables.
 *
 * Three-state semantics:
 * - `null` → `"—"` — no trajectory data (harness error; events.jsonl not captured).
 * - `false` → `"✗"` — agent ran but the behaviour was not observed.
 * - `true` → `"✓"` — behaviour confirmed.
 */
export function formatTrajBool(value) {
    return value === null ? "—" : value ? "✓" : "✗";
}
|
|
739
|
-
// ── Compare rendering (§8) ─────────────────────────────────────────────────
/**
 * Render a CompareResult as a deterministic markdown diff.
 *
 * Determinism: no timestamps, no run IDs, no git SHAs in the body — the diff
 * is a pure function of the two inputs' aggregated numbers and per-task
 * tables. Per-task rows are sorted alphabetically (already done by
 * `compareReports`, but re-asserted here defensively).
 *
 * Refusal cases (model mismatch, hash mismatch, schema/track issues) render
 * as a single error block instead of a diff table — there's nothing
 * actionable to show, and the operator's recovery path is in the message.
 */
export function renderCompareMarkdown(result) {
    return result.ok ? renderCompareSuccess(result) : renderCompareFailure(result);
}
|
|
758
|
-
/**
 * Refusal block for a failed compare: headline + message, plus
 * reason-specific detail (models, fixture hashes, or corpus task-id diff).
 */
function renderCompareFailure(result) {
    const out = [`# akm-bench compare — refused (${result.reason})`, "", result.message];
    if (result.reason === "model_mismatch" &&
        result.baseModel !== undefined &&
        result.currentModel !== undefined) {
        out.push("", `- base model: \`${result.baseModel}\``, `- current model: \`${result.currentModel}\``);
    }
    if (result.reason === "hash_mismatch" &&
        result.baseFixtureContentHash !== undefined &&
        result.currentFixtureContentHash !== undefined) {
        out.push("", `- base fixture hash: \`${String(result.baseFixtureContentHash)}\``, `- current fixture hash: \`${String(result.currentFixtureContentHash)}\``);
        if (result.affectedFixtures && result.affectedFixtures.length > 0) {
            out.push("", "affected fixtures:");
            for (const fixture of result.affectedFixtures)
                out.push(`- ${fixture}`);
        }
    }
    if (result.reason === "corpus_mismatch") {
        if (result.baseTaskCorpusHash !== undefined || result.currentTaskCorpusHash !== undefined) {
            out.push("", `- base taskCorpusHash: \`${String(result.baseTaskCorpusHash ?? "n/a")}\``, `- current taskCorpusHash: \`${String(result.currentTaskCorpusHash ?? "n/a")}\``);
        }
        if (result.baseSelectedTaskIds && result.currentSelectedTaskIds) {
            // Symmetric-difference listing: tasks present on only one side.
            const inBase = new Set(result.baseSelectedTaskIds);
            const inCurrent = new Set(result.currentSelectedTaskIds);
            const onlyCurrent = result.currentSelectedTaskIds.filter((id) => !inBase.has(id)).sort();
            const onlyBase = result.baseSelectedTaskIds.filter((id) => !inCurrent.has(id)).sort();
            if (onlyCurrent.length > 0) {
                out.push("", "only in current:");
                for (const id of onlyCurrent)
                    out.push(`- ${id}`);
            }
            if (onlyBase.length > 0) {
                out.push("", "only in base:");
                for (const id of onlyBase)
                    out.push(`- ${id}`);
            }
        }
    }
    return out.join("\n");
}
|
|
808
|
-
/**
 * Successful compare: optional fixture-hash line, aggregate delta table,
 * alphabetically sorted per-task table, and a trailing Warnings section
 * when the compare produced any.
 */
function renderCompareSuccess(result) {
    const lines = [`# akm-bench compare — \`${result.currentModel}\``, ""];
    if (result.baseFixtureContentHash !== null || result.currentFixtureContentHash !== null) {
        const base = result.baseFixtureContentHash === null ? "n/a" : `\`${result.baseFixtureContentHash}\``;
        const current = result.currentFixtureContentHash === null ? "n/a" : `\`${result.currentFixtureContentHash}\``;
        lines.push(`fixture-content hash: base=${base}, current=${current}`, "");
    }
    lines.push(
        "## Aggregate (akm arm, current − base)",
        "",
        "| metric | delta | direction |",
        "|--------|-------|-----------|",
        `| pass_rate | ${signedFixed(result.aggregate.passRateDelta, 2)} | ${signGlyph(result.aggregate.passRateSign)} |`,
        `| tokens_per_pass | ${nullableSignedFixed(result.aggregate.tokensPerPassDelta, 0)} | ${signGlyph(result.aggregate.tokensPerPassSign)} |`,
        `| wallclock_ms | ${signedFixed(result.aggregate.wallclockMsDelta, 0)} | ${signGlyph(result.aggregate.wallclockMsSign)} |`,
        "",
        "## Per-task (akm arm)",
        "",
        "| task | base pass_rate | current pass_rate | delta | dir | base stdev | current stdev |",
        "|------|----------------|-------------------|-------|-----|------------|---------------|",
    );
    // Re-sort defensively for byte-stable output.
    const sortedRows = [...result.perTask].sort((a, b) => a.id.localeCompare(b.id));
    for (const row of sortedRows)
        lines.push(perTaskCompareRow(row));
    if (result.warnings.length > 0) {
        lines.push("", "## Warnings", "");
        for (const w of result.warnings)
            lines.push(`- ${w}`);
    }
    return lines.join("\n");
}
|
|
842
|
-
/** One per-task compare row; missing metrics on either side render "n/a". */
function perTaskCompareRow(row) {
    const { baseMetrics, currentMetrics } = row;
    const baseRate = baseMetrics === null ? "n/a" : baseMetrics.pass_rate.toFixed(2);
    const currentRate = currentMetrics === null ? "n/a" : currentMetrics.pass_rate.toFixed(2);
    const delta = row.delta.passRate === null ? "n/a" : signedFixed(row.delta.passRate, 2);
    const baseStdev = baseMetrics === null ? "n/a" : baseMetrics.pass_rate_stdev.toFixed(2);
    const currentStdev = currentMetrics === null ? "n/a" : currentMetrics.pass_rate_stdev.toFixed(2);
    // Tasks present on only one side get an italic presence marker.
    const idCell = row.presence === "both" ? row.id : `${row.id} _(${row.presence})_`;
    return `| ${idCell} | ${baseRate} | ${currentRate} | ${delta} | ${signGlyph(row.signMarker)} | ${baseStdev} | ${currentStdev} |`;
}
|
|
852
|
-
/** Direction glyph: "improve" → ▲, "regress" → ▼, anything else → ▬. */
function signGlyph(sign) {
    switch (sign) {
        case "improve":
            return "▲";
        case "regress":
            return "▼";
        default:
            return "▬";
    }
}
|
|
859
|
-
/**
 * `toFixed` with an explicit sign. Numerical zero — and negative values
 * that round to "-0"/"-0.00" — render as an unsigned zero so deterministic
 * output never shows a misleading "+0.00" or "-0.00".
 */
function signedFixed(value, digits) {
    const fixed = value.toFixed(digits);
    if (fixed === "-0" || /^-0\.0+$/.test(fixed)) {
        return (0).toFixed(digits);
    }
    if (value > 0) {
        return `+${fixed}`;
    }
    return fixed;
}
|
|
869
|
-
/** signedFixed that tolerates `null` (renders "n/a"). */
function nullableSignedFixed(value, digits) {
    return value === null ? "n/a" : signedFixed(value, digits);
}
|
|
874
|
-
// ── Attribution table rendering (§6.5) ─────────────────────────────────────
/**
 * Threshold for the "highly loaded" slice — assets with a load count at or
 * above this fraction of the per-table maximum get bucketed into the "well
 * used and working" / "well used and not working" callout sections.
 * Relative (not absolute) so it scales with corpus size.
 */
const HIGH_LOAD_THRESHOLD = 0.5;
/**
 * Threshold for "working" pass-rate. An asset is "working" if its
 * load_pass_rate is at or above this; "not working" if below.
 */
const WORKING_PASS_RATE_THRESHOLD = 0.5;
|
|
886
|
-
/**
 * Render a per-asset attribution table as markdown. Sort order matches
 * `computePerAssetAttribution` (load count desc, pass rate desc, ref asc).
 *
 * The output has three sections:
 *  1. Full sorted table.
 *  2. "Well-used and working" callout — high load, high pass_rate.
 *  3. "Well-used and not working" callout — high load, low pass_rate.
 *
 * The two callouts are the actionable slices: the first is what curation
 * should preserve, the second is what should be improved or removed.
 */
export function renderAttributionTable(attr) {
    const out = [
        "## Per-asset attribution",
        "",
        `Total akm-arm runs aggregated: ${attr.totalAkmRuns}`,
        "",
    ];
    if (attr.rows.length === 0) {
        out.push("_No assets were loaded by the agent during akm-arm runs._");
        return out.join("\n");
    }
    out.push("| asset_ref | load_count | load_count_passing | load_count_failing | load_pass_rate |");
    out.push("|-----------|------------|--------------------|--------------------|----------------|");
    for (const row of attr.rows) {
        out.push(`| \`${row.assetRef}\` | ${row.loadCount} | ${row.loadCountPassing} | ${row.loadCountFailing} | ${formatRate(row.loadPassRate)} |`);
    }
    // Slice callouts. The high-load cutoff is relative to the top-loaded
    // asset's count so this scales whether the corpus has 5 or 500 total
    // runs.
    const topLoad = attr.rows[0]?.loadCount ?? 0;
    const cutoff = Math.max(1, Math.ceil(topLoad * HIGH_LOAD_THRESHOLD));
    const hot = attr.rows.filter((r) => r.loadCount >= cutoff);
    const working = hot.filter((r) => (r.loadPassRate ?? 0) >= WORKING_PASS_RATE_THRESHOLD);
    const notWorking = hot.filter((r) => (r.loadPassRate ?? 0) < WORKING_PASS_RATE_THRESHOLD);
    // Shared renderer for the two callout sections.
    const pushCallout = (title, slice) => {
        out.push("", title, "");
        if (slice.length === 0) {
            out.push("_None._");
            return;
        }
        for (const r of slice) {
            out.push(`- \`${r.assetRef}\` (load_count=${r.loadCount}, load_pass_rate=${formatRate(r.loadPassRate)})`);
        }
    };
    pushCallout("### Well-used and working", working);
    pushCallout("### Well-used and NOT working", notWorking);
    return out.join("\n");
}
|
|
945
|
-
/**
 * Render a 0..1 rate as a percentage with one decimal ("50.0%");
 * `null` renders as "n/a".
 */
function formatRate(value) {
    return value === null ? "n/a" : `${(value * 100).toFixed(1)}%`;
}
|
|
950
|
-
// ── Failure-mode breakdown (§6.6) ──────────────────────────────────────────
|
|
951
|
-
/**
 * Render the §6.6 "Failure modes" markdown section. Lines are sorted by
 * descending count (ties broken alphabetically by label so output is
 * byte-stable). Each line:
 *
 *   `<label> — <count> (<percent>% of failed runs)`
 *
 * Returns an empty string when no failed runs exist (caller decides whether
 * to append a blank section header).
 */
export function renderFailureModeBreakdown(report) {
    const counted = Object.entries(report.failureModes.byLabel);
    const totalFailures = counted.reduce((sum, [, n]) => sum + n, 0);
    // No labels at all, or all labels carry a zero count — nothing to say.
    if (counted.length === 0 || totalFailures === 0)
        return "";
    // Descending count, alphabetical tie-break for deterministic output.
    const ordered = [...counted].sort(([labelA, countA], [labelB, countB]) => countB - countA || labelA.localeCompare(labelB));
    const out = ["## Failure modes", ""];
    for (const [label, n] of ordered) {
        const pct = ((n / totalFailures) * 100).toFixed(1);
        out.push(`- ${label} — ${n} (${pct}% of failed runs)`);
    }
    return out.join("\n");
}
|
|
981
|
-
// ── Workflow compliance aggregation (#257) ─────────────────────────────────
|
|
982
|
-
/**
 * Top-violation entry with enough detail to identify which (task, seed)
 * caused each occurrence. The `evidence` array is capped at
 * `MAX_VIOLATION_EVIDENCE` per code so a pathological corpus cannot blow up
 * the report. The cap applies only to retained evidence objects; the
 * reported `count` is still the true occurrence count.
 */
const MAX_VIOLATION_EVIDENCE = 10;
/**
 * Maximum number of top-violation entries to surface in JSON / markdown.
 * Operators care about the head of the distribution; the long tail is
 * recoverable from `workflowChecks` if needed.
 */
const MAX_TOP_VIOLATIONS = 10;
|
|
995
|
-
/**
 * Map a workflow check `status` onto the public pass/partial/fail bucket.
 * `not_applicable` (and any unknown status) returns `null`, excluding the
 * check from the aggregate counts. `harness_error` is bucketed as `fail`
 * so corrupt traces are visibly counted against compliance.
 */
function bucketWorkflowStatus(status) {
    switch (status) {
        case "pass":
            return "pass";
        case "partial":
            return "partial";
        case "fail":
        case "harness_error":
            return "fail";
        default:
            return null; // not_applicable
    }
}
|
|
1012
|
-
/**
 * Compute the §257 `workflow` block from a flat list of `WorkflowCheckResult`.
 * Empty input yields an empty (zero-filled) aggregate so JSON consumers
 * always see the same shape.
 *
 * Fix: removed the dead `runHasApplicable` Set — it was populated but never
 * read anywhere in this function.
 *
 * @param {Array} checks - Flat list of workflow check results; each carries
 *   `status`, `score`, `violations`, `taskId`, `arm`, `seed`, `workflowId`.
 * @returns {object} Aggregate with corpus-wide rates, per-workflow rates,
 *   capped top-violation evidence, a task×workflow cross-tab, and the #258
 *   reliability block.
 */
function buildWorkflowAggregate(checks) {
    // #258: Compute reliability up front so all early-return paths share the
    // same shape. Reliability tolerates empty input (`groups === 0`).
    const reliabilityResult = computeWorkflowReliability(checks);
    const reliability = {
        by_workflow: reliabilityResult.byWorkflow,
        corpus: reliabilityResult.corpus,
    };
    const empty = {
        total_checks: checks.length,
        applicable_checks: 0,
        overall_compliance: 0,
        strict_pass_rate: 0,
        partial_pass_rate: 0,
        fail_rate: 0,
        violation_count: 0,
        by_workflow: {},
        top_violations: [],
        cross_tab: [],
        reliability,
    };
    if (checks.length === 0)
        return empty;
    // Bucket counts (corpus-wide) and accumulate per-spec / per-violation /
    // cross-tab in a single pass.
    let strict = 0;
    let partial = 0;
    let fail = 0;
    let scoreSum = 0;
    let applicable = 0;
    let violationCount = 0;
    const perSpecAcc = new Map();
    const violationAcc = new Map();
    const crossTabAcc = new Map();
    // We need each (task_outcome, run) bucketed against the WORST workflow
    // outcome that run produced — otherwise a run with one passing and one
    // failing spec gets double-counted across cross-tab rows. Reduce per-run.
    const runWorstOutcome = new Map();
    for (const c of checks) {
        const bucket = bucketWorkflowStatus(c.status);
        const runKey = `${c.taskId}::${c.arm}::${c.seed}`;
        // Per-spec: include `not_applicable` in the spec's `count` column
        // (operators want to see whether the spec ever fired) but exclude
        // it from rate denominators.
        const specEntry = perSpecAcc.get(c.workflowId) ?? {
            count: 0,
            scoreSum: 0,
            pass: 0,
            partial: 0,
            fail: 0,
            violationCount: 0,
        };
        specEntry.count += 1;
        if (bucket !== null) {
            specEntry.scoreSum += c.score;
            specEntry[bucket] += 1;
        }
        specEntry.violationCount += c.violations.length;
        perSpecAcc.set(c.workflowId, specEntry);
        // Non-applicable checks contribute nothing below this point.
        if (bucket === null)
            continue;
        applicable += 1;
        scoreSum += c.score;
        violationCount += c.violations.length;
        if (bucket === "pass")
            strict += 1;
        else if (bucket === "partial")
            partial += 1;
        else
            fail += 1;
        // Per-violation evidence collection. Cap evidence per code so one noisy
        // failure mode cannot dominate the section.
        for (const v of c.violations) {
            const list = violationAcc.get(v.code) ?? [];
            if (list.length < MAX_VIOLATION_EVIDENCE) {
                const ev = {
                    task_id: c.taskId,
                    arm: c.arm,
                    seed: c.seed,
                    workflow_id: c.workflowId,
                };
                // Optional fields are only attached when present so the JSON
                // stays compact and key-stable.
                if (v.message)
                    ev.message = v.message;
                if (v.expected !== undefined)
                    ev.expected = v.expected;
                if (v.observed !== undefined)
                    ev.observed = v.observed;
                list.push(ev);
            }
            violationAcc.set(v.code, list);
        }
        // Cross-tab bookkeeping: keep the WORST workflow outcome per run so we
        // get one cell per run (not per (run × spec)).
        const taskOutcome = readCheckTaskOutcome(c) ?? "unknown";
        const worst = runWorstOutcome.get(runKey);
        if (!worst) {
            runWorstOutcome.set(runKey, { taskOutcome, workflowOutcome: bucket });
        }
        else if (severityRank(bucket) > severityRank(worst.workflowOutcome)) {
            worst.workflowOutcome = bucket;
        }
    }
    // Reduce runWorstOutcome into the public cross_tab rows. We always emit
    // entries for `pass` and `fail` task outcomes so the table shape is
    // stable; additional outcomes ("budget_exceeded", "harness_error",
    // "unknown") only appear when at least one run carried them.
    const stableOutcomes = ["pass", "fail"];
    for (const [, entry] of runWorstOutcome) {
        if (!stableOutcomes.includes(entry.taskOutcome) && entry.taskOutcome !== "unknown") {
            stableOutcomes.push(entry.taskOutcome);
        }
    }
    for (const [, entry] of runWorstOutcome) {
        const counts = crossTabAcc.get(entry.taskOutcome) ?? { pass: 0, partial: 0, fail: 0 };
        counts[entry.workflowOutcome] += 1;
        crossTabAcc.set(entry.taskOutcome, counts);
    }
    const cross_tab = [];
    for (const outcome of stableOutcomes) {
        const counts = crossTabAcc.get(outcome) ?? { pass: 0, partial: 0, fail: 0 };
        cross_tab.push({
            task_outcome: outcome,
            pass: counts.pass,
            partial: counts.partial,
            fail: counts.fail,
            total: counts.pass + counts.partial + counts.fail,
        });
    }
    // Append "unknown" row only if any run actually carried it.
    if (crossTabAcc.has("unknown")) {
        const counts = crossTabAcc.get("unknown") ?? { pass: 0, partial: 0, fail: 0 };
        cross_tab.push({
            task_outcome: "unknown",
            pass: counts.pass,
            partial: counts.partial,
            fail: counts.fail,
            total: counts.pass + counts.partial + counts.fail,
        });
    }
    if (applicable === 0) {
        // Every check was `not_applicable`. Surface a non-empty `by_workflow`
        // (so operators see which specs ran) but leave the rate fields zeroed.
        const by_workflow = {};
        for (const [id, e] of perSpecAcc) {
            by_workflow[id] = {
                workflow_id: id,
                count: e.count,
                score: 0,
                pass_rate: 0,
                partial_rate: 0,
                fail_rate: 0,
                violation_count: e.violationCount,
            };
        }
        return {
            total_checks: checks.length,
            applicable_checks: 0,
            overall_compliance: 0,
            strict_pass_rate: 0,
            partial_pass_rate: 0,
            fail_rate: 0,
            violation_count: 0,
            by_workflow,
            top_violations: [],
            cross_tab,
            reliability,
        };
    }
    const by_workflow = {};
    for (const [id, e] of perSpecAcc) {
        // Rates are over the checks that were applicable for THIS spec;
        // `count` still includes not_applicable occurrences.
        const applicableForSpec = e.pass + e.partial + e.fail;
        const score = applicableForSpec === 0 ? 0 : e.scoreSum / applicableForSpec;
        const passRate = applicableForSpec === 0 ? 0 : e.pass / applicableForSpec;
        const partialRate = applicableForSpec === 0 ? 0 : e.partial / applicableForSpec;
        const failRate = applicableForSpec === 0 ? 0 : e.fail / applicableForSpec;
        by_workflow[id] = {
            workflow_id: id,
            count: e.count,
            score,
            pass_rate: passRate,
            partial_rate: partialRate,
            fail_rate: failRate,
            violation_count: e.violationCount,
        };
    }
    // Top-violation list: sort by count desc, tie-break alphabetically by
    // code so rendering is byte-stable.
    const top_violations = [];
    for (const [code, evidence] of violationAcc) {
        top_violations.push({
            code,
            count: evidence.length, // bounded; raw count below for accuracy
            evidence,
        });
    }
    // Recount: `evidence.length` is capped at MAX_VIOLATION_EVIDENCE; we want
    // the true count for sorting/reporting. Re-derive by scanning checks
    // again — cheap, and keeps the single-pass loop above simple.
    const trueCounts = new Map();
    for (const c of checks) {
        if (bucketWorkflowStatus(c.status) === null)
            continue;
        for (const v of c.violations) {
            trueCounts.set(v.code, (trueCounts.get(v.code) ?? 0) + 1);
        }
    }
    for (const tv of top_violations) {
        tv.count = trueCounts.get(tv.code) ?? tv.count;
    }
    top_violations.sort((a, b) => {
        if (b.count !== a.count)
            return b.count - a.count;
        return a.code.localeCompare(b.code);
    });
    const trimmedViolations = top_violations.slice(0, MAX_TOP_VIOLATIONS);
    return {
        total_checks: checks.length,
        applicable_checks: applicable,
        overall_compliance: scoreSum / applicable,
        strict_pass_rate: strict / applicable,
        partial_pass_rate: partial / applicable,
        fail_rate: fail / applicable,
        violation_count: violationCount,
        by_workflow,
        top_violations: trimmedViolations,
        cross_tab,
        reliability,
    };
}
|
|
1250
|
-
/**
 * Severity rank for cross-tab "WORST workflow outcome per run" reduction.
 * fail (2) > partial (1) > pass / anything else (0).
 */
function severityRank(b) {
    const ranks = { fail: 2, partial: 1 };
    return ranks[b] ?? 0;
}
|
|
1261
|
-
/**
 * Recover the task-level outcome that produced a check, when available.
 * The check shape does not carry it directly; the runner stashes it on a
 * non-public side-channel field. Returns `undefined` when no task outcome
 * was attached (older callers, hand-written tests).
 */
function readCheckTaskOutcome(c) {
    const { taskOutcome } = c;
    if (typeof taskOutcome !== "string")
        return undefined;
    return taskOutcome;
}
|
|
1270
|
-
/**
 * Render the §257 `## Workflow compliance` markdown section. Returns "" when
 * there are no checks so the report stays compact for runs without
 * applicable workflow specs.
 *
 * @param {object} input - Carries an optional `workflowChecks` array; a
 *   missing/undefined array is treated as "no checks".
 * @returns {string} Markdown section text, or "" when there is nothing to say.
 */
export function renderWorkflowComplianceSection(input) {
    const checks = input.workflowChecks ?? [];
    const agg = buildWorkflowAggregate(checks);
    if (agg.total_checks === 0)
        return "";
    const lines = [];
    lines.push("## Workflow compliance");
    lines.push("");
    // All checks were not_applicable: say so, and list which specs loaded so
    // operators can tell "spec missing" from "spec never matched".
    if (agg.applicable_checks === 0) {
        lines.push("_No workflow specs applied to this corpus._");
        if (Object.keys(agg.by_workflow).length > 0) {
            lines.push("");
            lines.push(`Loaded specs (none matched the run): ${Object.keys(agg.by_workflow).sort().join(", ")}`);
        }
        return lines.join("\n");
    }
    // Headline rates on one line, then the per-workflow breakdown table.
    lines.push(`overall_compliance=${agg.overall_compliance.toFixed(2)}, ` +
        `strict_pass_rate=${agg.strict_pass_rate.toFixed(2)}, ` +
        `partial_pass_rate=${agg.partial_pass_rate.toFixed(2)}, ` +
        `fail_rate=${agg.fail_rate.toFixed(2)}, ` +
        `violations=${agg.violation_count}`);
    lines.push("");
    lines.push("### By workflow");
    lines.push("");
    lines.push("| workflow_id | applicable | score | pass | partial | fail | violations |");
    lines.push("|-------------|-----------:|------:|-----:|--------:|-----:|-----------:|");
    // Alphabetical by workflow_id so the table is byte-stable across runs.
    const sortedSpecs = Object.values(agg.by_workflow).sort((a, b) => a.workflow_id.localeCompare(b.workflow_id));
    for (const spec of sortedSpecs) {
        lines.push(`| ${spec.workflow_id} | ${spec.count} | ${spec.score.toFixed(2)} | ${spec.pass_rate.toFixed(2)} | ${spec.partial_rate.toFixed(2)} | ${spec.fail_rate.toFixed(2)} | ${spec.violation_count} |`);
    }
    if (agg.top_violations.length > 0) {
        lines.push("");
        lines.push("### Top violations");
        lines.push("");
        lines.push("| code | count |");
        lines.push("|------|------:|");
        for (const tv of agg.top_violations) {
            lines.push(`| ${tv.code} | ${tv.count} |`);
        }
        // Surface the first evidence pointer per top-violation so operators can
        // jump to a concrete (task, seed) without parsing the JSON envelope.
        lines.push("");
        lines.push("### Violation evidence");
        lines.push("");
        lines.push("| code | task | seed | workflow | observed |");
        lines.push("|------|------|-----:|----------|----------|");
        for (const tv of agg.top_violations) {
            for (const ev of tv.evidence) {
                // Prefer the structured `observed` value; fall back to the
                // free-text message, then to an empty cell.
                const observed = ev.observed ?? ev.message ?? "";
                lines.push(`| ${tv.code} | ${ev.task_id} | ${ev.seed} | ${ev.workflow_id} | ${truncateCell(observed)} |`);
            }
        }
    }
    if (agg.cross_tab.length > 0) {
        lines.push("");
        lines.push("### Task outcome × workflow outcome");
        lines.push("");
        lines.push("| task_outcome | wf_pass | wf_partial | wf_fail | total |");
        lines.push("|--------------|--------:|-----------:|--------:|------:|");
        for (const row of agg.cross_tab) {
            lines.push(`| ${row.task_outcome} | ${row.pass} | ${row.partial} | ${row.fail} | ${row.total} |`);
        }
    }
    // #258: Reliability sub-section. Skip when no group contributed (all
    // checks were `not_applicable` or input was empty).
    const reliability = agg.reliability;
    if (reliability.corpus.groups > 0) {
        lines.push("");
        lines.push("### Reliability (pass@k / pass^k)");
        lines.push("");
        lines.push(`corpus pass@k=${reliability.corpus.pass_at_k.toFixed(2)}, ` +
            `pass^k=${reliability.corpus.pass_all_k.toFixed(2)} ` +
            `(over ${reliability.corpus.groups} workflow×task groups, ${reliability.corpus.tasks} distinct tasks)`);
        lines.push("");
        lines.push("| workflow_id | tasks | k | pass@k | pass^k |");
        lines.push("|-------------|------:|--:|-------:|-------:|");
        const sortedReliability = Object.values(reliability.by_workflow).sort((a, b) => a.workflow_id.localeCompare(b.workflow_id));
        for (const row of sortedReliability) {
            lines.push(`| ${row.workflow_id} | ${row.tasks} | ${row.k} | ${row.pass_at_k.toFixed(2)} | ${row.pass_all_k.toFixed(2)} |`);
        }
        // Inconsistency callout: workflows where the agent CAN comply
        // (pass@k high) but does not RELIABLY comply (pass^k materially lower).
        // Threshold: pass@k ≥ 0.5 AND (pass@k − pass^k) ≥ 0.25.
        const INCONSISTENCY_GAP = 0.25;
        const PASS_AT_K_FLOOR = 0.5;
        const inconsistent = sortedReliability.filter((r) => r.pass_at_k >= PASS_AT_K_FLOOR && r.pass_at_k - r.pass_all_k >= INCONSISTENCY_GAP);
        if (inconsistent.length > 0) {
            lines.push("");
            lines.push("**Inconsistent workflows** (high pass@k but low pass^k — agent can comply but does not reliably):");
            lines.push("");
            for (const row of inconsistent) {
                lines.push(`- \`${row.workflow_id}\`: pass@k=${row.pass_at_k.toFixed(2)} vs pass^k=${row.pass_all_k.toFixed(2)} (gap ${(row.pass_at_k - row.pass_all_k).toFixed(2)})`);
            }
        }
    }
    return lines.join("\n");
}
|
|
1372
|
-
/**
 * Trim a single cell so the markdown table stays scannable. We keep the
 * head 80 chars and append `…` when clamped, escaping `|` so the cell
 * cannot break the table.
 *
 * Fix: a naive `s.slice(0, 80)` can cut a surrogate pair in half, emitting
 * a lone surrogate (an invalid character) into the report. Drop a dangling
 * lead surrogate at the cut point before appending the ellipsis.
 */
function truncateCell(s) {
    if (s.length <= 80)
        return s.replace(/\|/g, "\\|");
    let head = s.slice(0, 80);
    // If the cut landed inside a surrogate pair, the last unit is a lead
    // surrogate; remove it so the output stays valid Unicode.
    if (/[\uD800-\uDBFF]$/.test(head))
        head = head.slice(0, -1);
    return `${head.replace(/\|/g, "\\|")}…`;
}
|
|
1381
|
-
// ── Negative-transfer + domain diagnostics markdown (#260) ─────────────────
/**
 * Render the §260 negative-transfer section. Stays quiet when no
 * regressions exist — emits a single `## Negative transfer\n\nnone` block so
 * the report remains scannable for green corpora. When regressions exist,
 * renders headline counts, the top-regressed-task table, the per-domain
 * delta table, and the asset-regression-candidate table.
 *
 * @param {object} input - Carries `tasks` (per-task results) and an
 *   optional `akmRuns` array used for asset-candidate attribution.
 * @returns {string} The markdown section (never empty).
 */
export function renderNegativeTransferSection(input) {
    const negativeTransfer = computeNegativeTransfer(input.tasks);
    const lines = ["## Negative transfer", ""];
    if (negativeTransfer.count === 0) {
        lines.push("none");
        return lines.join("\n");
    }
    lines.push(`count=${negativeTransfer.count}, severity=${negativeTransfer.severity.toFixed(2)} (sum of noakm − akm pass rate over regressed tasks)`);
    lines.push("");
    lines.push("### Top regressed tasks");
    lines.push("");
    lines.push("| task | domain | noakm | akm | delta |");
    lines.push("|------|--------|-------|-----|-------|");
    for (const row of negativeTransfer.topRegressedTasks) {
        lines.push(`| ${row.taskId} | ${row.domain} | ${row.noakmPassRate.toFixed(2)} | ${row.akmPassRate.toFixed(2)} | ${signed(row.delta.toFixed(2))} |`);
    }
    // Per-domain rollup — only rendered when the aggregator produced rows.
    const domainRows = computeDomainAggregates(input.tasks);
    if (domainRows.length > 0) {
        lines.push("");
        lines.push("### Domain-level deltas");
        lines.push("");
        lines.push("| domain | tasks | regressions | noakm pass | akm pass | delta | tokens delta | wallclock delta (ms) |");
        lines.push("|--------|-------|-------------|------------|----------|-------|--------------|----------------------|");
        for (const row of domainRows) {
            // tokens-per-pass delta can be null (e.g. no passing runs to
            // normalise against) — render as "n/a" rather than NaN.
            const tppDelta = row.tokensPerPassDelta === null ? "n/a" : signed(row.tokensPerPassDelta.toFixed(0));
            lines.push(`| ${row.domain} | ${row.taskCount} | ${row.regressionCount} | ${row.passRateNoakm.toFixed(2)} | ${row.passRateAkm.toFixed(2)} | ${signed(row.passRateDelta.toFixed(2))} | ${tppDelta} | ${signed(row.wallclockMsDelta.toFixed(0))} |`);
        }
    }
    // Which assets were loaded during the regressed tasks' akm runs — the
    // curation targets most likely implicated in the regression.
    const candidates = computeAssetRegressionCandidates(negativeTransfer.topRegressedTasks.map((r) => r.taskId), input.akmRuns ?? []);
    if (candidates.length > 0) {
        lines.push("");
        lines.push("### Asset regression candidates");
        lines.push("");
        lines.push("| asset_ref | regressed tasks | total loads |");
        lines.push("|-----------|-----------------|-------------|");
        for (const row of candidates) {
            lines.push(`| \`${row.assetRef}\` | ${row.regressedTaskCount} | ${row.totalLoadCount} |`);
        }
    }
    return lines.join("\n");
}
|
|
1430
|
-
// ── Corpus-coverage markdown (#262) ────────────────────────────────────────
/**
 * Render the §13.3 corpus_coverage markdown section (#262). Returns "" when
 * no task carries a `memory_ability` tag — at that point the section adds
 * no signal and only churns markdown snapshots.
 *
 * Sections rendered:
 * - Coverage counts per memory-ability label (closed set + `untagged`).
 * - Per-memory-ability pass-rate / akm − noakm delta / negative-transfer
 *   counts, plus workflow compliance when at least one task supplied it.
 * - A compact `## Task families` rollup when ≥ 2 families are tagged.
 *
 * @param {object} input - Forwarded verbatim to `buildCorpusCoverageBlock`.
 * @returns {string} Markdown section text, or "" when nothing is tagged.
 */
export function renderCorpusCoverageSection(input) {
    const block = buildCorpusCoverageBlock(input);
    // "Tagged" means at least one non-`untagged` ability has a nonzero count.
    const taggedAbility = Object.entries(block.coverage.memoryAbilityCounts).some(([k, v]) => k !== "untagged" && v > 0);
    if (!taggedAbility)
        return "";
    const lines = [];
    lines.push("## Corpus coverage");
    lines.push("");
    lines.push("| memory_ability | tasks |");
    lines.push("|----------------|-------|");
    // Sort keys: known abilities alphabetically, `untagged` last.
    const counts = block.coverage.memoryAbilityCounts;
    const knownKeys = Object.keys(counts)
        .filter((k) => k !== "untagged")
        .sort();
    for (const k of knownKeys)
        lines.push(`| ${k} | ${counts[k]} |`);
    // Only show the `untagged` row when it is non-empty.
    if ((counts.untagged ?? 0) > 0)
        lines.push(`| untagged | ${counts.untagged} |`);
    if (block.by_memory_ability.length > 0) {
        lines.push("");
        lines.push("### By memory_ability");
        lines.push("");
        // The compliance column is rendered only when at least one row has a
        // non-null value, so the table stays narrow otherwise.
        const anyCompliance = block.by_memory_ability.some((r) => r.workflow_compliance !== null);
        if (anyCompliance) {
            lines.push("| memory_ability | tasks | noakm | akm | delta | neg.transfer | workflow_compliance |");
            lines.push("|----------------|-------|-------|-----|-------|--------------|---------------------|");
        }
        else {
            lines.push("| memory_ability | tasks | noakm | akm | delta | neg.transfer |");
            lines.push("|----------------|-------|-------|-----|-------|--------------|");
        }
        for (const row of block.by_memory_ability) {
            const base = `| ${row.category} | ${row.task_count} | ${row.pass_rate_noakm.toFixed(2)} | ${row.pass_rate_akm.toFixed(2)} | ${signed(row.pass_rate_delta.toFixed(2))} | ${row.negative_transfer_count} |`;
            if (anyCompliance) {
                // Rows without compliance data render "n/a" in the extra cell.
                const wc = row.workflow_compliance === null ? "n/a" : row.workflow_compliance.toFixed(2);
                lines.push(`${base} ${wc} |`);
            }
            else {
                lines.push(base);
            }
        }
    }
    // Family rollup only earns its space when there is more than one family.
    const families = block.by_task_family;
    if (families.length >= 2) {
        lines.push("");
        lines.push("### By task_family");
        lines.push("");
        lines.push("| task_family | tasks | noakm | akm | delta |");
        lines.push("|-------------|-------|-------|-----|-------|");
        for (const row of families) {
            lines.push(`| ${row.category} | ${row.task_count} | ${row.pass_rate_noakm.toFixed(2)} | ${row.pass_rate_akm.toFixed(2)} | ${signed(row.pass_rate_delta.toFixed(2))} |`);
        }
    }
    return lines.join("\n");
}
|
|
1498
|
-
// ── Git helpers ────────────────────────────────────────────────────────────
|
|
1499
|
-
/**
 * Resolve `git rev-parse --abbrev-ref HEAD`. Falls back to `"unknown"` if
 * git is unavailable or the cwd is not a repo. Tests inject `cwd` to point
 * at a tmp non-repo to exercise the fallback.
 *
 * @param {string} [cwd] - Directory to run git in; defaults to the process cwd.
 * @returns {string} Current branch name, or "unknown" on any failure.
 */
export function resolveGitBranch(cwd) {
    return tryGit(["rev-parse", "--abbrev-ref", "HEAD"], cwd);
}
|
|
1507
|
-
/**
 * Resolve `git rev-parse --short HEAD`. Same fallback rules as
 * `resolveGitBranch`.
 *
 * @param {string} [cwd] - Directory to run git in; defaults to the process cwd.
 * @returns {string} Short commit hash, or "unknown" on any failure.
 */
export function resolveGitCommit(cwd) {
    return tryGit(["rev-parse", "--short", "HEAD"], cwd);
}
|
|
1514
|
-
/**
 * Run a git subcommand and return its trimmed stdout. Any failure —
 * missing git binary, non-repo cwd, nonzero exit — collapses to the
 * sentinel string "unknown", as does empty output.
 */
function tryGit(args, cwd) {
    const options = {
        cwd: cwd ?? process.cwd(),
        stdio: ["ignore", "pipe", "ignore"],
        encoding: "utf8",
    };
    try {
        const stdout = execSync(`git ${args.join(" ")}`, options);
        const trimmed = stdout.trim();
        return trimmed === "" ? "unknown" : trimmed;
    }
    catch {
        return "unknown";
    }
}
|
|
1527
|
-
/**
 * Threshold below which the markdown summary prepends a warning marker
 * and the JSON envelope's `warnings[]` carries a structured
 * `feedback_agreement_below_threshold` entry. Track B's headline numbers
 * (`improvement_slope`, `over_synthetic_lift`) are unreliable when
 * Phase 1 feedback disagrees with run outcomes more than 20% of the
 * time. Spec §6.8. Compared against
 * `input.feedbackIntegrity.aggregate.feedback_agreement` in `buildEvolveJson`.
 */
export const FEEDBACK_AGREEMENT_WARNING_THRESHOLD = 0.8;
|
|
1536
|
-
/**
 * Render an evolve run as the §6.3+§6.4 JSON envelope plus a markdown
 * summary. Mirrors `renderUtilityReport` — the caller decides where each
 * artefact goes (stdout/stderr).
 */
export function renderEvolveReport(input) {
    return {
        json: buildEvolveJson(input),
        markdown: buildEvolveMarkdown(input),
    };
}
/**
 * Build the evolve-run JSON envelope (§6.3+§6.4): three per-arm utility
 * envelopes, the longitudinal comparison, proposal metrics, and optional
 * lessons / learning-curve / search-bridge / feedback-integrity sections.
 * Optional sections are spread in conditionally so absent inputs produce
 * no key at all (rather than a key set to `undefined`).
 */
function buildEvolveJson(input) {
    // For each arm we re-render the §13.3 utility envelope so downstream
    // consumers can treat each arm exactly like a `bench utility` artefact.
    const armEnvelope = (r) => buildUtilityJson(r);
    // §6.8 — derive an additive `warnings[]` entry when the headline
    // feedback_agreement falls below the trust threshold.
    const augmentedWarnings = [...input.warnings]; // copy — never mutate the caller's array
    if (input.feedbackIntegrity) {
        const agreement = input.feedbackIntegrity.aggregate.feedback_agreement;
        if (agreement < FEEDBACK_AGREEMENT_WARNING_THRESHOLD) {
            augmentedWarnings.push(`feedback_agreement_below_threshold: ${agreement.toFixed(2)} < ${FEEDBACK_AGREEMENT_WARNING_THRESHOLD.toFixed(2)} — Track B headline numbers (improvement_slope, over_synthetic_lift) may be unreliable until AGENTS.md guidance for \`akm feedback\` is tightened.`);
        }
    }
    return {
        schemaVersion: 1,
        track: "evolve",
        branch: input.branch,
        commit: input.commit,
        timestamp: input.timestamp,
        agent: { harness: "opencode", model: input.model },
        corpus: {
            domain: input.domain,
            seedsPerArm: input.seedsPerArm,
        },
        // Proposal counters are re-keyed to snake_case for the envelope.
        proposals: {
            total_proposals: input.proposals.totalProposals,
            total_accepted: input.proposals.totalAccepted,
            acceptance_rate: input.proposals.acceptanceRate,
            lint_pass_rate: input.proposals.lintPassRate,
            rows: input.proposals.rows.map((r) => ({
                asset_ref: r.assetRef,
                proposal_count: r.proposalCount,
                lint_pass_count: r.lintPassCount,
                accepted_count: r.acceptedCount,
            })),
        },
        // #264 — optional lessons block.
        ...(input.lessons ? { lessons: serialiseLessons(input.lessons) } : {}),
        longitudinal: {
            improvement_slope: input.longitudinal.improvementSlope,
            over_synthetic_lift: input.longitudinal.overSyntheticLift,
            degradation_count: input.longitudinal.degradationCount,
            pre_pass_rate: input.longitudinal.prePassRate,
            post_pass_rate: input.longitudinal.postPassRate,
            synthetic_pass_rate: input.longitudinal.syntheticPassRate,
            degradations: input.longitudinal.degradations.map((d) => ({
                task_id: d.taskId,
                pre_pass_rate: d.prePassRate,
                post_pass_rate: d.postPassRate,
                delta: d.delta,
                failure_mode: d.failureMode,
            })),
        },
        // §6.4 (issue #265) — optional learning-curve block.
        ...(input.learningCurve ? { learning: serialiseLearningCurve(input.learningCurve) } : {}),
        arms: {
            pre: armEnvelope(input.arms.pre),
            post: armEnvelope(input.arms.post),
            synthetic: armEnvelope(input.arms.synthetic),
        },
        // Per-asset load stats come from the post arm only; an empty shape
        // (rather than a missing key) keeps consumers schema-stable.
        perAsset: input.arms.post.perAsset
            ? {
                total_akm_runs: input.arms.post.perAsset.totalAkmRuns,
                rows: input.arms.post.perAsset.rows.map((r) => ({
                    asset_ref: r.assetRef,
                    load_count: r.loadCount,
                    load_count_passing: r.loadCountPassing,
                    load_count_failing: r.loadCountFailing,
                    load_pass_rate: r.loadPassRate,
                })),
            }
            : { total_akm_runs: 0, rows: [] },
        failure_modes: {
            by_label: input.arms.post.failureModes.byLabel,
            by_task: input.arms.post.failureModes.byTask,
        },
        ...(input.arms.post.searchBridge ? { searchBridge: serialiseSearchBridge(input.arms.post.searchBridge) } : {}),
        ...(input.feedbackIntegrity ? { feedback_integrity: serialiseFeedbackIntegrity(input.feedbackIntegrity) } : {}),
        warnings: augmentedWarnings,
    };
}
/**
 * #264 — flatten the LessonMetrics envelope into its JSON shape. The
 * aggregate counters sit alongside `lessons[]` so consumers can read the
 * headline numbers without walking every row.
 */
function serialiseLessons(metrics) {
    // Project each lesson onto exactly the published fields (drops any
    // internal extras carried on the in-memory rows).
    const rows = metrics.lessons.map(({ ref, source_failures, lint_pass, accepted, first_reused_on, reuse_count, reuse_pass_rate, negative_transfer_count, leakage_risk }) => ({
        ref,
        source_failures,
        lint_pass,
        accepted,
        first_reused_on,
        reuse_count,
        reuse_pass_rate,
        negative_transfer_count,
        leakage_risk,
    }));
    return {
        lessons_created_count: metrics.lessons_created_count,
        lessons_accepted_count: metrics.lessons_accepted_count,
        proposal_lint_pass_rate: metrics.proposal_lint_pass_rate,
        proposal_acceptance_rate: metrics.proposal_acceptance_rate,
        lesson_reuse_rate: metrics.lesson_reuse_rate,
        lesson_reuse_success_rate: metrics.lesson_reuse_success_rate,
        lesson_negative_transfer_count: metrics.lesson_negative_transfer_count,
        lessons: rows,
    };
}
/**
 * §6.4 (issue #265) — flatten a `LearningCurve` into its JSON envelope:
 * an `episodes[]` block with per-episode rows plus the headline
 * `learning_slope` and `time_to_improvement`. `pass_rate_by_episode` is a
 * flat copy for tools that plot without re-projecting the rows.
 */
function serialiseLearningCurve(curve) {
    const episodes = curve.episodes.map(({ episode_index, pass_rate, delta_from_previous_episode, cumulative_feedback_events, cumulative_proposals_created, cumulative_proposals_accepted, cumulative_lessons_created, lesson_reuse_rate }) => ({
        episode_index,
        pass_rate,
        delta_from_previous_episode,
        cumulative_feedback_events,
        cumulative_proposals_created,
        cumulative_proposals_accepted,
        cumulative_lessons_created,
        lesson_reuse_rate,
    }));
    return {
        episodes,
        // Defensive copy — callers must not share the source array.
        pass_rate_by_episode: [...curve.pass_rate_by_episode],
        learning_slope: curve.learning_slope,
        time_to_improvement: curve.time_to_improvement,
    };
}
/**
 * §6.4 (issue #265) — render a compact "Learning curve" markdown table:
 * one row per episode plus the headline slope and time-to-improvement.
 */
export function renderLearningCurveSection(curve) {
    const tti = curve.time_to_improvement === null ? "n/a" : String(curve.time_to_improvement);
    const out = [
        "## Learning curve",
        "",
        `learning_slope=${signedFixed(curve.learning_slope, 3)}, time_to_improvement=${tti}`,
        "",
    ];
    if (curve.episodes.length === 0) {
        out.push("_No episodes recorded._");
        return out.join("\n");
    }
    out.push("| episode | pass_rate | Δ prev | feedback | proposals | accepted | lessons | reuse |");
    out.push("|--------:|----------:|-------:|---------:|----------:|---------:|--------:|------:|");
    for (const ep of curve.episodes) {
        // Episode 0 has no reuse baseline — rendered "n/a" rather than 0.00.
        const reuse = ep.lesson_reuse_rate === null ? "n/a" : ep.lesson_reuse_rate.toFixed(2);
        out.push(`| ${ep.episode_index} | ${ep.pass_rate.toFixed(2)} | ${signedFixed(ep.delta_from_previous_episode, 2)} | ${ep.cumulative_feedback_events} | ${ep.cumulative_proposals_created} | ${ep.cumulative_proposals_accepted} | ${ep.cumulative_lessons_created} | ${reuse} |`);
    }
    return out.join("\n");
}
/** §6.8 — project the FeedbackIntegrityMetrics envelope onto its JSON shape. */
function serialiseFeedbackIntegrity(metrics) {
    const agg = metrics.aggregate;
    const aggregate = {
        truePositive: agg.truePositive,
        falsePositive: agg.falsePositive,
        trueNegative: agg.trueNegative,
        falseNegative: agg.falseNegative,
        feedback_agreement: agg.feedback_agreement,
        false_positive_rate: agg.false_positive_rate,
        false_negative_rate: agg.false_negative_rate,
        feedback_coverage: agg.feedback_coverage,
    };
    // Keep only the published per-asset fields.
    const perAsset = metrics.perAsset.map(({ ref, truePositive, falsePositive, trueNegative, falseNegative, feedback_agreement, false_positive_rate, false_negative_rate }) => ({
        ref,
        truePositive,
        falsePositive,
        trueNegative,
        falseNegative,
        feedback_agreement,
        false_positive_rate,
        false_negative_rate,
    }));
    return { aggregate, perAsset };
}
/**
 * Render the #264 lessons block — aggregate counters followed by one row
 * per lesson. Exported for tests so the rendered shape can be asserted
 * directly without going through `renderEvolveReport`.
 */
export function renderLessonsTable(metrics) {
    const out = [
        "## Lessons",
        "",
        `created=${metrics.lessons_created_count}, accepted=${metrics.lessons_accepted_count}, reuse_rate=${metrics.lesson_reuse_rate.toFixed(2)}, reuse_success_rate=${metrics.lesson_reuse_success_rate.toFixed(2)}, negative_transfer=${metrics.lesson_negative_transfer_count}`,
        "",
    ];
    if (metrics.lessons.length === 0) {
        out.push("_No lessons generated._");
        return out.join("\n");
    }
    out.push("| ref | accepted | lint | reuse | reuse_pass | first_reused_on | neg_transfer | leakage |");
    out.push("|-----|----------|------|-------|------------|-----------------|--------------|---------|");
    for (const l of metrics.lessons) {
        const accepted = l.accepted ? "yes" : "no";
        const lint = l.lint_pass ? "pass" : "fail";
        out.push(`| \`${l.ref}\` | ${accepted} | ${lint} | ${l.reuse_count} | ${l.reuse_pass_rate.toFixed(2)} | ${l.first_reused_on ?? "n/a"} | ${l.negative_transfer_count} | ${l.leakage_risk} |`);
    }
    return out.join("\n");
}
/**
 * Render the §6.8 confusion-matrix table — aggregate 2×2 followed by a
 * per-asset breakdown. Used by `renderEvolveReport`'s markdown body and
 * exported for tests.
 */
export function renderFeedbackIntegrityTable(metrics) {
    // Per-asset rates can be null (no feedback events for that asset);
    // aggregate rates are always numeric and formatted directly.
    const fmt = (v) => (v === null ? "n/a" : v.toFixed(2));
    const agg = metrics.aggregate;
    const out = [
        "## Feedback-signal integrity",
        "",
        "| | run passed | run failed |",
        "|--------------|-----------:|-----------:|",
        `| feedback + | ${agg.truePositive} (TP) | ${agg.falsePositive} (FP) |`,
        `| feedback - | ${agg.falseNegative} (FN) | ${agg.trueNegative} (TN) |`,
        "",
        "| metric | value |",
        "|--------|-------|",
        `| feedback_agreement | ${agg.feedback_agreement.toFixed(2)} |`,
        `| false_positive_rate | ${agg.false_positive_rate.toFixed(2)} |`,
        `| false_negative_rate | ${agg.false_negative_rate.toFixed(2)} |`,
        `| feedback_coverage | ${agg.feedback_coverage.toFixed(2)} |`,
        "",
    ];
    if (metrics.perAsset.length === 0) {
        out.push("_No feedback events recorded._");
        return out.join("\n");
    }
    out.push("| ref | TP | FP | TN | FN | agreement | FP rate | FN rate |");
    out.push("|-----|----|----|----|----|-----------|---------|---------|");
    for (const row of metrics.perAsset) {
        out.push(`| \`${row.ref}\` | ${row.truePositive} | ${row.falsePositive} | ${row.trueNegative} | ${row.falseNegative} | ${fmt(row.feedback_agreement)} | ${fmt(row.false_positive_rate)} | ${fmt(row.false_negative_rate)} |`);
    }
    return out.join("\n");
}
/** Format a 0–1 rate to two decimals, rendering `null` as "n/a". */
function formatNullableRate(value) {
    return value === null ? "n/a" : value.toFixed(2);
}
/**
 * Build the human-readable markdown summary for an evolve run: header,
 * §6.8 warning marker (when applicable), headline numbers, longitudinal
 * table (+ degradations), proposals, optional lessons block, the per-task
 * pre→post→synthetic comparison, optional feedback-integrity and
 * learning-curve sections, and finally the caller-supplied warnings.
 */
function buildEvolveMarkdown(input) {
    const lines = [];
    lines.push(`# akm-bench evolve — ${input.model}`);
    lines.push("");
    lines.push(`branch \`${input.branch}\` @ \`${input.commit}\` — ${input.timestamp}`);
    lines.push(`corpus: domain=\`${input.domain}\`, seedsPerArm=${input.seedsPerArm}`);
    lines.push("");
    // §6.8 warning marker — prepended above the headline so operators can't
    // miss it. We also still surface the structured warning in `warnings[]`.
    if (input.feedbackIntegrity &&
        input.feedbackIntegrity.aggregate.feedback_agreement < FEEDBACK_AGREEMENT_WARNING_THRESHOLD) {
        lines.push(`:warning: feedback_agreement = ${input.feedbackIntegrity.aggregate.feedback_agreement.toFixed(2)} — Track B headline numbers (improvement_slope, over_synthetic_lift) may be unreliable until AGENTS.md guidance for \`akm feedback\` is tightened.`);
        lines.push("");
    }
    // Headline: improvement_slope.
    lines.push(`**improvement_slope: ${signedFixed(input.longitudinal.improvementSlope, 2)}** (post=${input.longitudinal.postPassRate.toFixed(2)}, pre=${input.longitudinal.prePassRate.toFixed(2)})`);
    // Second line: real feedback_agreement (per #244), or placeholder when
    // metrics not supplied.
    if (input.feedbackIntegrity) {
        lines.push(`**feedback_agreement: ${input.feedbackIntegrity.aggregate.feedback_agreement.toFixed(2)}** (coverage=${input.feedbackIntegrity.aggregate.feedback_coverage.toFixed(2)})`);
    }
    else {
        lines.push("_feedback_agreement: pending (#244)_");
    }
    lines.push("");
    lines.push("## Longitudinal");
    lines.push("");
    lines.push("| metric | value |");
    lines.push("|--------|-------|");
    lines.push(`| improvement_slope | ${signedFixed(input.longitudinal.improvementSlope, 2)} |`);
    lines.push(`| over_synthetic_lift | ${signedFixed(input.longitudinal.overSyntheticLift, 2)} |`);
    lines.push(`| degradation_count | ${input.longitudinal.degradationCount} |`);
    lines.push(`| pre_pass_rate | ${input.longitudinal.prePassRate.toFixed(2)} |`);
    lines.push(`| post_pass_rate | ${input.longitudinal.postPassRate.toFixed(2)} |`);
    lines.push(`| synthetic_pass_rate | ${input.longitudinal.syntheticPassRate.toFixed(2)} |`);
    lines.push("");
    // Degradations sub-table only when at least one task regressed.
    if (input.longitudinal.degradations.length > 0) {
        lines.push("### Degradations");
        lines.push("");
        lines.push("| task | pre | post | delta | failure_mode |");
        lines.push("|------|-----|------|-------|--------------|");
        for (const d of input.longitudinal.degradations) {
            lines.push(`| ${d.taskId} | ${d.prePassRate.toFixed(2)} | ${d.postPassRate.toFixed(2)} | ${signedFixed(d.delta, 2)} | ${d.failureMode ?? "n/a"} |`);
        }
        lines.push("");
    }
    lines.push("## Proposals");
    lines.push("");
    lines.push(`acceptance_rate=${input.proposals.acceptanceRate.toFixed(2)}, lint_pass_rate=${input.proposals.lintPassRate.toFixed(2)}, total=${input.proposals.totalProposals}`);
    lines.push("");
    if (input.proposals.rows.length > 0) {
        lines.push("| asset_ref | proposals | lint_pass | accepted |");
        lines.push("|-----------|-----------|-----------|----------|");
        for (const row of input.proposals.rows) {
            lines.push(`| \`${row.assetRef}\` | ${row.proposalCount} | ${row.lintPassCount} | ${row.acceptedCount} |`);
        }
        lines.push("");
    }
    else {
        lines.push("_No proposals generated._");
        lines.push("");
    }
    // #264 — lessons block delegates to the exported table renderer.
    if (input.lessons) {
        lines.push(renderLessonsTable(input.lessons));
        lines.push("");
    }
    lines.push("## Per-task pre → post → synthetic");
    lines.push("");
    lines.push("| task | pre | post | synthetic | post − pre |");
    lines.push("|------|-----|------|-----------|------------|");
    // Index each arm's tasks by id so the union can be walked in one pass;
    // a task missing from an arm renders "n/a" in that column.
    const preTasks = new Map();
    for (const t of input.arms.pre.tasks)
        preTasks.set(t.id, t);
    const postTasks = new Map();
    for (const t of input.arms.post.tasks)
        postTasks.set(t.id, t);
    const synthTasks = new Map();
    for (const t of input.arms.synthetic.tasks)
        synthTasks.set(t.id, t);
    const allIds = new Set([...preTasks.keys(), ...postTasks.keys(), ...synthTasks.keys()]);
    for (const id of [...allIds].sort()) {
        const pre = preTasks.get(id)?.akm.passRate;
        const post = postTasks.get(id)?.akm.passRate;
        const synth = synthTasks.get(id)?.akm.passRate;
        // Delta only meaningful when the task ran in both pre and post arms.
        const delta = pre !== undefined && post !== undefined ? signedFixed(post - pre, 2) : "n/a";
        lines.push(`| ${id} | ${pre === undefined ? "n/a" : pre.toFixed(2)} | ${post === undefined ? "n/a" : post.toFixed(2)} | ${synth === undefined ? "n/a" : synth.toFixed(2)} | ${delta} |`);
    }
    if (input.feedbackIntegrity) {
        lines.push("");
        lines.push(renderFeedbackIntegrityTable(input.feedbackIntegrity));
    }
    if (input.learningCurve) {
        lines.push("");
        lines.push(renderLearningCurveSection(input.learningCurve));
    }
    // NOTE(review): the markdown lists only the caller-supplied warnings;
    // the §6.8 threshold warning is surfaced via the :warning: marker above
    // and via the JSON envelope's augmented `warnings[]`.
    if (input.warnings.length > 0) {
        lines.push("");
        lines.push("## Warnings");
        lines.push("");
        for (const w of input.warnings)
            lines.push(`- ${w}`);
    }
    return lines.join("\n");
}