akm-cli 0.6.1 → 0.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +66 -0
- package/dist/{cli.js → src/cli.js} +712 -34
- package/dist/{commands → src/commands}/config-cli.js +47 -4
- package/dist/src/commands/distill.js +283 -0
- package/dist/src/commands/events.js +108 -0
- package/dist/src/commands/history.js +191 -0
- package/dist/{commands → src/commands}/installed-stashes.js +1 -1
- package/dist/src/commands/proposal.js +119 -0
- package/dist/src/commands/propose.js +171 -0
- package/dist/src/commands/reflect.js +193 -0
- package/dist/{commands → src/commands}/registry-search.js +71 -7
- package/dist/{commands → src/commands}/remember.js +12 -0
- package/dist/{commands → src/commands}/search.js +104 -4
- package/dist/{commands → src/commands}/self-update.js +4 -3
- package/dist/{commands → src/commands}/show.js +73 -0
- package/dist/{commands → src/commands}/source-add.js +5 -1
- package/dist/{commands → src/commands}/source-manage.js +7 -1
- package/dist/{core → src/core}/asset-ref.js +5 -5
- package/dist/{core → src/core}/asset-spec.js +12 -0
- package/dist/{core → src/core}/common.js +1 -1
- package/dist/{core → src/core}/config.js +203 -121
- package/dist/{core → src/core}/errors.js +4 -0
- package/dist/src/core/events.js +239 -0
- package/dist/src/core/lesson-lint.js +86 -0
- package/dist/src/core/proposals.js +406 -0
- package/dist/src/core/warn.js +72 -0
- package/dist/{core → src/core}/write-source.js +80 -5
- package/dist/{indexer → src/indexer}/db-search.js +114 -24
- package/dist/{indexer → src/indexer}/db.js +76 -23
- package/dist/{indexer → src/indexer}/file-context.js +0 -3
- package/dist/src/indexer/graph-boost.js +179 -0
- package/dist/src/indexer/graph-extraction.js +212 -0
- package/dist/{indexer → src/indexer}/indexer.js +88 -7
- package/dist/{indexer → src/indexer}/matchers.js +1 -1
- package/dist/src/indexer/memory-inference.js +263 -0
- package/dist/{indexer → src/indexer}/metadata.js +111 -3
- package/dist/{indexer → src/indexer}/search-source.js +4 -2
- package/dist/src/integrations/agent/config.js +292 -0
- package/dist/src/integrations/agent/detect.js +94 -0
- package/dist/src/integrations/agent/index.js +17 -0
- package/dist/src/integrations/agent/profiles.js +65 -0
- package/dist/src/integrations/agent/prompts.js +167 -0
- package/dist/src/integrations/agent/spawn.js +272 -0
- package/dist/{integrations → src/integrations}/github.js +9 -3
- package/dist/{integrations → src/integrations}/lockfile.js +0 -26
- package/dist/{llm → src/llm}/client.js +33 -2
- package/dist/{llm → src/llm}/embedders/remote.js +37 -3
- package/dist/src/llm/feature-gate.js +108 -0
- package/dist/src/llm/graph-extract.js +107 -0
- package/dist/src/llm/index-passes.js +35 -0
- package/dist/src/llm/memory-infer.js +86 -0
- package/dist/{output → src/output}/cli-hints.js +15 -2
- package/dist/{output → src/output}/renderers.js +63 -2
- package/dist/src/output/shapes.js +523 -0
- package/dist/src/output/text.js +1116 -0
- package/dist/{registry → src/registry}/build-index.js +19 -8
- package/dist/{registry → src/registry}/factory.js +0 -8
- package/dist/{registry → src/registry}/providers/static-index.js +6 -3
- package/dist/{registry → src/registry}/resolve.js +68 -2
- package/dist/{setup → src/setup}/setup.js +52 -5
- package/dist/{sources → src/sources}/providers/git.js +7 -15
- package/dist/{wiki → src/wiki}/wiki.js +54 -6
- package/dist/{workflows → src/workflows}/runs.js +37 -3
- package/dist/tests/add-website-source.test.js +119 -0
- package/dist/tests/agent/agent-config-loader.test.js +70 -0
- package/dist/tests/agent/agent-config.test.js +221 -0
- package/dist/tests/agent/agent-detect.test.js +100 -0
- package/dist/tests/agent/agent-spawn.test.js +234 -0
- package/dist/tests/agent-output.test.js +186 -0
- package/dist/tests/architecture/agent-no-llm-sdk-guard.test.js +103 -0
- package/dist/tests/architecture/agent-spawn-seam.test.js +193 -0
- package/dist/tests/architecture/llm-stateless-seam.test.js +112 -0
- package/dist/tests/asset-ref.test.js +192 -0
- package/dist/tests/asset-registry.test.js +103 -0
- package/dist/tests/asset-spec.test.js +241 -0
- package/dist/tests/bench/attribution.test.js +996 -0
- package/dist/tests/bench/cleanup-sigint.test.js +83 -0
- package/dist/tests/bench/cleanup.js +234 -0
- package/dist/tests/bench/cleanup.test.js +166 -0
- package/dist/tests/bench/cli.js +1018 -0
- package/dist/tests/bench/cli.test.js +445 -0
- package/dist/tests/bench/compare.test.js +556 -0
- package/dist/tests/bench/corpus.js +317 -0
- package/dist/tests/bench/corpus.test.js +258 -0
- package/dist/tests/bench/doctor.js +525 -0
- package/dist/tests/bench/driver.js +401 -0
- package/dist/tests/bench/driver.test.js +584 -0
- package/dist/tests/bench/environment.js +233 -0
- package/dist/tests/bench/environment.test.js +199 -0
- package/dist/tests/bench/evolve-metrics.js +179 -0
- package/dist/tests/bench/evolve-metrics.test.js +187 -0
- package/dist/tests/bench/evolve.js +647 -0
- package/dist/tests/bench/evolve.test.js +624 -0
- package/dist/tests/bench/failure-modes.test.js +349 -0
- package/dist/tests/bench/feedback-integrity.test.js +457 -0
- package/dist/tests/bench/leakage.test.js +228 -0
- package/dist/tests/bench/learning-curve.test.js +134 -0
- package/dist/tests/bench/metrics.js +2395 -0
- package/dist/tests/bench/metrics.test.js +1150 -0
- package/dist/tests/bench/no-os-tmpdir-invariant.test.js +43 -0
- package/dist/tests/bench/opencode-config.js +194 -0
- package/dist/tests/bench/opencode-config.test.js +370 -0
- package/dist/tests/bench/report.js +1885 -0
- package/dist/tests/bench/report.test.js +1038 -0
- package/dist/tests/bench/run-config.js +355 -0
- package/dist/tests/bench/run-config.test.js +298 -0
- package/dist/tests/bench/run-curate-test.js +32 -0
- package/dist/tests/bench/run-failing-tasks.js +56 -0
- package/dist/tests/bench/run-full-bench.js +51 -0
- package/dist/tests/bench/run-items36-targeted.js +69 -0
- package/dist/tests/bench/run-nano-quick.js +42 -0
- package/dist/tests/bench/run-waveg-targeted.js +62 -0
- package/dist/tests/bench/runner.js +699 -0
- package/dist/tests/bench/runner.test.js +958 -0
- package/dist/tests/bench/search-bridge.test.js +331 -0
- package/dist/tests/bench/tmp.js +131 -0
- package/dist/tests/bench/trajectory.js +116 -0
- package/dist/tests/bench/trajectory.test.js +127 -0
- package/dist/tests/bench/verifier.js +114 -0
- package/dist/tests/bench/verifier.test.js +118 -0
- package/dist/tests/bench/workflow-evaluator.js +557 -0
- package/dist/tests/bench/workflow-evaluator.test.js +421 -0
- package/dist/tests/bench/workflow-spec.js +345 -0
- package/dist/tests/bench/workflow-spec.test.js +363 -0
- package/dist/tests/bench/workflow-trace.js +472 -0
- package/dist/tests/bench/workflow-trace.test.js +254 -0
- package/dist/tests/benchmark-search-quality.js +536 -0
- package/dist/tests/benchmark-suite.js +1441 -0
- package/dist/tests/capture-cli.test.js +112 -0
- package/dist/tests/cli-errors.test.js +204 -0
- package/dist/tests/commands/events.test.js +370 -0
- package/dist/tests/commands/history.test.js +418 -0
- package/dist/tests/commands/import.test.js +103 -0
- package/dist/tests/commands/proposal-cli.test.js +209 -0
- package/dist/tests/commands/reflect-propose-cli.test.js +333 -0
- package/dist/tests/commands/remember.test.js +97 -0
- package/dist/tests/commands/scope-flags.test.js +300 -0
- package/dist/tests/commands/search.test.js +537 -0
- package/dist/tests/commands/show-indexer-parity.test.js +117 -0
- package/dist/tests/commands/show.test.js +294 -0
- package/dist/tests/common.test.js +266 -0
- package/dist/tests/completions.test.js +142 -0
- package/dist/tests/config-cli.test.js +193 -0
- package/dist/tests/config-llm-features.test.js +139 -0
- package/dist/tests/config.test.js +569 -0
- package/dist/tests/contracts/migration-baseline.test.js +43 -0
- package/dist/tests/contracts/reflect-propose-envelope.test.js +139 -0
- package/dist/tests/contracts/spec-helpers.js +46 -0
- package/dist/tests/contracts/v1-spec-section-11-proposal-queue.test.js +228 -0
- package/dist/tests/contracts/v1-spec-section-12-agent-config.test.js +56 -0
- package/dist/tests/contracts/v1-spec-section-13-lesson-type.test.js +34 -0
- package/dist/tests/contracts/v1-spec-section-14-llm-features.test.js +94 -0
- package/dist/tests/contracts/v1-spec-section-4-1-asset-types.test.js +39 -0
- package/dist/tests/contracts/v1-spec-section-4-2-quality-rules.test.js +44 -0
- package/dist/tests/contracts/v1-spec-section-5-configuration.test.js +47 -0
- package/dist/tests/contracts/v1-spec-section-6-orchestration.test.js +40 -0
- package/dist/tests/contracts/v1-spec-section-7-module-layout.test.js +58 -0
- package/dist/tests/contracts/v1-spec-section-8-extension-points.test.js +34 -0
- package/dist/tests/contracts/v1-spec-section-9-4-cli-surface.test.js +75 -0
- package/dist/tests/contracts/v1-spec-section-9-7-llm-agent-boundary.test.js +36 -0
- package/dist/tests/core/write-source.test.js +366 -0
- package/dist/tests/curate-command.test.js +87 -0
- package/dist/tests/db-scoring.test.js +201 -0
- package/dist/tests/db.test.js +654 -0
- package/dist/tests/distill-cli-flag.test.js +208 -0
- package/dist/tests/distill.test.js +515 -0
- package/dist/tests/docker-install.test.js +120 -0
- package/dist/tests/e2e.test.js +1419 -0
- package/dist/tests/embedder.test.js +340 -0
- package/dist/tests/embedding-model-config.test.js +379 -0
- package/dist/tests/feedback-command.test.js +172 -0
- package/dist/tests/file-context.test.js +552 -0
- package/dist/tests/fixtures/scripts/git/summarize-diff.js +9 -0
- package/dist/tests/fixtures/scripts/lint/eslint-check.js +7 -0
- package/dist/tests/fixtures/stashes/load.js +166 -0
- package/dist/tests/fixtures/stashes/load.test.js +97 -0
- package/dist/tests/fixtures/stashes/ranking-baseline/scripts/mem0-search.js +12 -0
- package/dist/tests/frontmatter.test.js +190 -0
- package/dist/tests/fts-field-weighting.test.js +254 -0
- package/dist/tests/fuzzy-search.test.js +230 -0
- package/dist/tests/git-provider-clone.test.js +45 -0
- package/dist/tests/github.test.js +161 -0
- package/dist/tests/graph-boost-ranking.test.js +305 -0
- package/dist/tests/graph-extraction.test.js +282 -0
- package/dist/tests/helpers/usage-events.js +8 -0
- package/dist/tests/index-pass-llm.test.js +161 -0
- package/dist/tests/indexer.test.js +570 -0
- package/dist/tests/info-command.test.js +166 -0
- package/dist/tests/init.test.js +69 -0
- package/dist/tests/install-script.test.js +246 -0
- package/dist/tests/integration/agent-real-profile.test.js +94 -0
- package/dist/tests/issue-36-repro.test.js +304 -0
- package/dist/tests/issues-191-194.test.js +160 -0
- package/dist/tests/lesson-lint.test.js +111 -0
- package/dist/tests/llm-client.test.js +115 -0
- package/dist/tests/llm-feature-gate.test.js +151 -0
- package/dist/tests/llm.test.js +139 -0
- package/dist/tests/lockfile.test.js +216 -0
- package/dist/tests/manifest.test.js +205 -0
- package/dist/tests/markdown.test.js +126 -0
- package/dist/tests/matchers-unit.test.js +189 -0
- package/dist/tests/memory-inference.test.js +299 -0
- package/dist/tests/merge-scoring.test.js +136 -0
- package/dist/tests/metadata.test.js +313 -0
- package/dist/tests/migration-help.test.js +89 -0
- package/dist/tests/origin-resolve.test.js +124 -0
- package/dist/tests/output-baseline.test.js +218 -0
- package/dist/tests/output-shapes-unit.test.js +478 -0
- package/dist/tests/parallel-search.test.js +272 -0
- package/dist/tests/parameter-metadata.test.js +365 -0
- package/dist/tests/paths.test.js +177 -0
- package/dist/tests/progressive-disclosure.test.js +280 -0
- package/dist/tests/proposals.test.js +279 -0
- package/dist/tests/proposed-quality.test.js +271 -0
- package/dist/tests/provider-registry.test.js +32 -0
- package/dist/tests/ranking-regression.test.js +548 -0
- package/dist/tests/reflect-propose.test.js +455 -0
- package/dist/tests/registry-build-index.test.js +394 -0
- package/dist/tests/registry-cli.test.js +290 -0
- package/dist/tests/registry-index-v2.test.js +430 -0
- package/dist/tests/registry-install.test.js +728 -0
- package/dist/tests/registry-providers/parity.test.js +189 -0
- package/dist/tests/registry-providers/skills-sh.test.js +309 -0
- package/dist/tests/registry-providers/static-index.test.js +238 -0
- package/dist/tests/registry-resolve.test.js +126 -0
- package/dist/tests/registry-search.test.js +923 -0
- package/dist/tests/remember-frontmatter.test.js +378 -0
- package/dist/tests/remember-unit.test.js +123 -0
- package/dist/tests/ripgrep-install.test.js +251 -0
- package/dist/tests/ripgrep-resolve.test.js +108 -0
- package/dist/tests/ripgrep.test.js +163 -0
- package/dist/tests/save-command.test.js +94 -0
- package/dist/tests/save-trust-qa-fixes.test.js +270 -0
- package/dist/tests/scoring-pipeline.test.js +648 -0
- package/dist/tests/search-include-proposed-cli.test.js +118 -0
- package/dist/tests/self-update.test.js +442 -0
- package/dist/tests/semantic-search-e2e.test.js +512 -0
- package/dist/tests/semantic-status.test.js +471 -0
- package/dist/tests/setup-run.integration.js +877 -0
- package/dist/tests/setup-wizard.test.js +198 -0
- package/dist/tests/setup.test.js +131 -0
- package/dist/tests/source-add.test.js +11 -0
- package/dist/tests/source-clone.test.js +254 -0
- package/dist/tests/source-manage.test.js +366 -0
- package/dist/tests/source-providers/filesystem.test.js +82 -0
- package/dist/tests/source-providers/git.test.js +252 -0
- package/dist/tests/source-providers/website.test.js +128 -0
- package/dist/tests/source-qa-fixes.test.js +286 -0
- package/dist/tests/source-registry.test.js +350 -0
- package/dist/tests/source-resolve.test.js +100 -0
- package/dist/tests/source-source.test.js +281 -0
- package/dist/tests/source.test.js +533 -0
- package/dist/tests/tar-utils-scan.test.js +73 -0
- package/dist/tests/toggle-components.test.js +73 -0
- package/dist/tests/usage-telemetry.test.js +265 -0
- package/dist/tests/utility-scoring.test.js +558 -0
- package/dist/tests/vault-load-error.test.js +78 -0
- package/dist/tests/vault-qa-fixes.test.js +194 -0
- package/dist/tests/vault.test.js +429 -0
- package/dist/tests/vector-search.test.js +608 -0
- package/dist/tests/walker.test.js +252 -0
- package/dist/tests/wave2-cluster-bc.test.js +228 -0
- package/dist/tests/wave2-cluster-d.test.js +180 -0
- package/dist/tests/wave2-cluster-e.test.js +179 -0
- package/dist/tests/wiki-qa-fixes.test.js +270 -0
- package/dist/tests/wiki.test.js +529 -0
- package/dist/tests/workflow-cli.test.js +271 -0
- package/dist/tests/workflow-markdown.test.js +171 -0
- package/dist/tests/workflow-path-escape.test.js +132 -0
- package/dist/tests/workflow-qa-fixes.test.js +395 -0
- package/dist/tests/workflows/indexer-rejection.test.js +213 -0
- package/docs/README.md +8 -0
- package/docs/migration/release-notes/0.7.0.md +244 -0
- package/package.json +2 -2
- package/dist/core/warn.js +0 -27
- package/dist/output/shapes.js +0 -212
- package/dist/output/text.js +0 -520
- /package/dist/{commands → src/commands}/completions.js +0 -0
- /package/dist/{commands → src/commands}/curate.js +0 -0
- /package/dist/{commands → src/commands}/info.js +0 -0
- /package/dist/{commands → src/commands}/init.js +0 -0
- /package/dist/{commands → src/commands}/install-audit.js +0 -0
- /package/dist/{commands → src/commands}/migration-help.js +0 -0
- /package/dist/{commands → src/commands}/source-clone.js +0 -0
- /package/dist/{commands → src/commands}/vault.js +0 -0
- /package/dist/{core → src/core}/asset-registry.js +0 -0
- /package/dist/{core → src/core}/frontmatter.js +0 -0
- /package/dist/{core → src/core}/markdown.js +0 -0
- /package/dist/{core → src/core}/paths.js +0 -0
- /package/dist/{indexer → src/indexer}/manifest.js +0 -0
- /package/dist/{indexer → src/indexer}/search-fields.js +0 -0
- /package/dist/{indexer → src/indexer}/semantic-status.js +0 -0
- /package/dist/{indexer → src/indexer}/usage-events.js +0 -0
- /package/dist/{indexer → src/indexer}/walker.js +0 -0
- /package/dist/{llm → src/llm}/embedder.js +0 -0
- /package/dist/{llm → src/llm}/embedders/cache.js +0 -0
- /package/dist/{llm → src/llm}/embedders/local.js +0 -0
- /package/dist/{llm → src/llm}/embedders/types.js +0 -0
- /package/dist/{llm → src/llm}/metadata-enhance.js +0 -0
- /package/dist/{output → src/output}/context.js +0 -0
- /package/dist/{registry → src/registry}/create-provider-registry.js +0 -0
- /package/dist/{registry → src/registry}/origin-resolve.js +0 -0
- /package/dist/{registry → src/registry}/providers/index.js +0 -0
- /package/dist/{registry → src/registry}/providers/skills-sh.js +0 -0
- /package/dist/{registry → src/registry}/providers/types.js +0 -0
- /package/dist/{registry → src/registry}/types.js +0 -0
- /package/dist/{setup → src/setup}/detect.js +0 -0
- /package/dist/{setup → src/setup}/ripgrep-install.js +0 -0
- /package/dist/{setup → src/setup}/ripgrep-resolve.js +0 -0
- /package/dist/{setup → src/setup}/steps.js +0 -0
- /package/dist/{sources → src/sources}/include.js +0 -0
- /package/dist/{sources → src/sources}/provider-factory.js +0 -0
- /package/dist/{sources → src/sources}/provider.js +0 -0
- /package/dist/{sources → src/sources}/providers/filesystem.js +0 -0
- /package/dist/{sources → src/sources}/providers/index.js +0 -0
- /package/dist/{sources → src/sources}/providers/install-types.js +0 -0
- /package/dist/{sources → src/sources}/providers/npm.js +0 -0
- /package/dist/{sources → src/sources}/providers/provider-utils.js +0 -0
- /package/dist/{sources → src/sources}/providers/sync-from-ref.js +0 -0
- /package/dist/{sources → src/sources}/providers/tar-utils.js +0 -0
- /package/dist/{sources → src/sources}/providers/website.js +0 -0
- /package/dist/{sources → src/sources}/resolve.js +0 -0
- /package/dist/{sources → src/sources}/types.js +0 -0
- /package/dist/{templates → src/templates}/wiki-templates.js +0 -0
- /package/dist/{version.js → src/version.js} +0 -0
- /package/dist/{workflows → src/workflows}/authoring.js +0 -0
- /package/dist/{workflows → src/workflows}/cli.js +0 -0
- /package/dist/{workflows → src/workflows}/db.js +0 -0
- /package/dist/{workflows → src/workflows}/document-cache.js +0 -0
- /package/dist/{workflows → src/workflows}/parser.js +0 -0
- /package/dist/{workflows → src/workflows}/renderer.js +0 -0
- /package/dist/{workflows → src/workflows}/schema.js +0 -0
- /package/dist/{workflows → src/workflows}/validator.js +0 -0
|
@@ -0,0 +1,2395 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* akm-bench metrics (spec §6).
|
|
3
|
+
*
|
|
4
|
+
* Outcome metrics (§6.1) and trajectory metrics (§6.2). Both are pure
|
|
5
|
+
* functions over `RunResult[]` slices so the runner can compose them
|
|
6
|
+
* however it likes. The §6.3+ catalog (proposal-quality, longitudinal,
|
|
7
|
+
* attribution, failure-mode taxonomy) lands in #239/#240/#243.
|
|
8
|
+
*
|
|
9
|
+
* The failure-mode taxonomy classifier (§6.6) lives in this file
|
|
10
|
+
* (`classifyFailureMode`).
|
|
11
|
+
*
|
|
12
|
+
* Search-pipeline bridge metrics (§6.7) are below: they tie the synthetic
|
|
13
|
+
* MRR/Recall@K view in `tests/benchmark-suite.ts` to real-task pass rate
|
|
14
|
+
* by logging gold-rank-of-search per `akm search` invocation and slicing
|
|
15
|
+
* pass-rate by the rank of the agent's *chosen* search.
|
|
16
|
+
*/
|
|
17
|
+
import fs from "node:fs";
|
|
18
|
+
import path from "node:path";
|
|
19
|
+
import { safeRealpath } from "../../src/core/common";
|
|
20
|
+
import { MEMORY_ABILITY_VALUES } from "./corpus";
|
|
21
|
+
import { serializeRunForReport } from "./report";
|
|
22
|
+
import { benchMkdtemp } from "./tmp";
|
|
23
|
+
import { normalizeRunToTrace } from "./workflow-trace";
|
|
24
|
+
/**
 * Aggregate outcome metrics over a flat list of RunResults.
 *
 * Aggregation across multiple arms is the caller's job — pass each arm's
 * slice in separately. Backward-compatible v1 contract: an empty slice
 * yields an all-zero envelope (`tokensPerPass` is 0 here, not null); the
 * richer per-task / corpus shapes below subsume this.
 */
export function computeOutcomeAggregate(results) {
    const total = results.length;
    if (total === 0) {
        return { passRate: 0, tokensPerPass: 0, wallclockMs: 0, budgetExceeded: 0, runsWithMeasuredTokens: 0 };
    }
    // Older artefacts lack `tokenMeasurement`; treat them as "parsed" (pre-#252).
    const hasMeasuredTokens = (run) => (run.tokenMeasurement ?? "parsed") === "parsed";
    let passCount = 0;
    let overBudgetCount = 0;
    let measuredPassTokens = 0;
    let measuredPassCount = 0;
    let measuredRunCount = 0;
    let wallclockSum = 0;
    for (const run of results) {
        wallclockSum += run.wallclockMs;
        const measured = hasMeasuredTokens(run);
        if (measured) {
            measuredRunCount += 1;
        }
        switch (run.outcome) {
            case "pass":
                passCount += 1;
                // Tokens only count toward the mean when actually measured
                // (issue #252) — an unmeasured `0` would silently understate cost.
                if (measured) {
                    measuredPassCount += 1;
                    measuredPassTokens += run.tokens.input + run.tokens.output;
                }
                break;
            case "budget_exceeded":
                overBudgetCount += 1;
                break;
            default:
                break;
        }
    }
    return {
        passRate: passCount / total,
        tokensPerPass: measuredPassCount === 0 ? 0 : measuredPassTokens / measuredPassCount,
        wallclockMs: wallclockSum / total,
        budgetExceeded: overBudgetCount,
        runsWithMeasuredTokens: measuredRunCount,
    };
}
|
|
67
|
+
/**
 * Whether a run's token counts were actually measured. Artefacts predating
 * `tokenMeasurement` are treated as `"parsed"` for backward compatibility —
 * pre-#252 reports always returned numeric zero, and rejecting them entirely
 * would break compare/attribute over historical runs.
 */
function isMeasured(r) {
    const measurement = r.tokenMeasurement ?? "parsed";
    return measurement === "parsed";
}
|
|
75
|
+
/**
 * Collapse K seed runs of one (task, arm) pair into PerTaskMetrics. An empty
 * input yields a zeroed envelope — callers decide whether to skip or render.
 */
export function aggregatePerTask(results) {
    const count = results.length;
    if (count === 0) {
        return {
            passRate: 0,
            passAt1: 0,
            tokensPerPass: null,
            wallclockMs: 0,
            passRateStdev: 0,
            budgetExceededCount: 0,
            harnessErrorCount: 0,
            count: 0,
            runsWithMeasuredTokens: 0,
            tokensPerRun: null,
        };
    }
    // Older artefacts lack `tokenMeasurement`; treat them as "parsed" (pre-#252).
    const tokensMeasured = (run) => (run.tokenMeasurement ?? "parsed") === "parsed";
    const tokenTotal = (run) => run.tokens.input + run.tokens.output;
    let passCount = 0;
    let measuredPassCount = 0;
    let measuredPassTokens = 0;
    let measuredRunCount = 0;
    let measuredRunTokens = 0;
    let wallclockSum = 0;
    let overBudget = 0;
    let harnessErrors = 0;
    // Fixed-iteration pass/fail buffer feeding the stdev computed below.
    const passSamples = [];
    for (const run of results) {
        wallclockSum += run.wallclockMs;
        if (tokensMeasured(run)) {
            measuredRunCount += 1;
            measuredRunTokens += tokenTotal(run);
        }
        const passed = run.outcome === "pass";
        passSamples.push(passed ? 1 : 0);
        if (passed) {
            passCount += 1;
            // Only measured passes feed `tokensPerPass` (issue #252): a pass
            // with missing measurement raises `passRate` but never the token
            // mean — preserving "tokens per measured pass" semantics.
            if (tokensMeasured(run)) {
                measuredPassCount += 1;
                measuredPassTokens += tokenTotal(run);
            }
        }
        else if (run.outcome === "budget_exceeded") {
            overBudget += 1;
        }
        else if (run.outcome === "harness_error") {
            harnessErrors += 1;
        }
    }
    // Sample stdev (Bessel's correction, n-1 denominator); 0 when n <= 1.
    let passRateStdev = 0;
    if (count > 1) {
        const mean = passCount / count;
        const sumSq = passSamples.reduce((acc, v) => acc + (v - mean) ** 2, 0);
        passRateStdev = Math.sqrt(sumSq / (count - 1));
    }
    const firstSeed = results.find((run) => run.seed === 0) ?? results[0];
    return {
        passRate: passCount / count,
        passAt1: firstSeed && firstSeed.outcome === "pass" ? 1 : 0,
        tokensPerPass: measuredPassCount === 0 ? null : measuredPassTokens / measuredPassCount,
        wallclockMs: wallclockSum / count,
        passRateStdev,
        budgetExceededCount: overBudget,
        harnessErrorCount: harnessErrors,
        count,
        runsWithMeasuredTokens: measuredRunCount,
        tokensPerRun: measuredRunCount === 0 ? null : measuredRunTokens / measuredRunCount,
    };
}
|
|
146
|
+
/** Sample standard deviation. Returns 0 for length <= 1 (no spread to measure). */
function stdev(values) {
    const n = values.length;
    if (n <= 1) {
        return 0;
    }
    let sum = 0;
    for (const v of values) {
        sum += v;
    }
    const mean = sum / n;
    let sumSq = 0;
    for (const v of values) {
        sumSq += (v - mean) ** 2;
    }
    // n-1 denominator: sample (not population) stdev, per Bessel's correction.
    return Math.sqrt(sumSq / (n - 1));
}
|
|
155
|
+
/**
 * Unweighted mean across per-task metrics. Each task contributes once,
 * regardless of how many seeds it ran (K is already collapsed by
 * `aggregatePerTask`).
 *
 * Tasks whose `tokensPerPass` / `tokensPerRun` is `null` (no passes / no
 * measured runs) are dropped from the corresponding mean; the mean itself is
 * `null` when no task contributed.
 */
export function aggregateCorpus(perTask) {
    const tasks = Object.values(perTask);
    if (tasks.length === 0) {
        return { passRate: 0, tokensPerPass: null, wallclockMs: 0, tokensPerRun: null };
    }
    const mean = (nums) => nums.reduce((a, b) => a + b, 0) / nums.length;
    const meanOrNull = (nums) => (nums.length === 0 ? null : mean(nums));
    return {
        passRate: mean(tasks.map((t) => t.passRate)),
        tokensPerPass: meanOrNull(tasks.map((t) => t.tokensPerPass).filter((v) => v !== null)),
        wallclockMs: mean(tasks.map((t) => t.wallclockMs)),
        tokensPerRun: meanOrNull(tasks.map((t) => t.tokensPerRun).filter((v) => v !== null)),
    };
}
|
|
175
|
+
/**
 * Compute the akm − noakm delta. Negative `tokensPerPass`/`wallclockMs` mean
 * akm was cheaper / faster; positive means it cost more. Pass-rate uses the
 * opposite convention (positive = akm wins). Token deltas are `null` when
 * either side is `null`.
 */
export function computeCorpusDelta(noakm, akm) {
    const diffOrNull = (akmValue, noakmValue) => (akmValue === null || noakmValue === null ? null : akmValue - noakmValue);
    return {
        passRate: akm.passRate - noakm.passRate,
        tokensPerPass: diffOrNull(akm.tokensPerPass, noakm.tokensPerPass),
        wallclockMs: akm.wallclockMs - noakm.wallclockMs,
        tokensPerRun: diffOrNull(akm.tokensPerRun, noakm.tokensPerRun),
    };
}
|
|
188
|
+
/** Per-task akm − noakm delta with the same null-safety as the corpus delta. */
export function computePerTaskDelta(noakm, akm) {
    const passRate = akm.passRate - noakm.passRate;
    const wallclockMs = akm.wallclockMs - noakm.wallclockMs;
    // Token deltas stay null unless both arms produced a measurable value.
    let tokensPerPass = null;
    if (akm.tokensPerPass !== null && noakm.tokensPerPass !== null) {
        tokensPerPass = akm.tokensPerPass - noakm.tokensPerPass;
    }
    let tokensPerRun = null;
    if (akm.tokensPerRun !== null && noakm.tokensPerRun !== null) {
        tokensPerRun = akm.tokensPerRun - noakm.tokensPerRun;
    }
    return { passRate, tokensPerPass, wallclockMs, tokensPerRun };
}
|
|
197
|
+
/**
 * Extract the domain prefix from a task ID. The corpus convention is
 * `<domain>/<task-name>`; the split is on the first `/`. IDs with no slash —
 * or a leading slash, which would yield an empty domain — fall back to the
 * literal `unknown` bucket so they aggregate predictably rather than
 * producing per-task domains-of-one.
 */
export function domainOfTaskId(taskId) {
    const slash = taskId.indexOf("/");
    return slash > 0 ? taskId.slice(0, slash) : "unknown";
}
|
|
209
|
+
/**
 * Compute the negative-transfer aggregate over a set of per-task entries
 * (one entry per task; both arms already aggregated into PerTaskMetrics).
 *
 * A task is "regressed" only when `akm.passRate < noakm.passRate`; ties
 * (equal pass rate, including 0=0) are NOT regressions. `topRegressedTasks`
 * is sorted by `severity` descending then `taskId` ascending so output is
 * deterministic.
 */
export function computeNegativeTransfer(tasks) {
    // Local copy of domainOfTaskId: prefix before the first "/", else "unknown".
    const domainOf = (id) => {
        const slash = id.indexOf("/");
        return slash > 0 ? id.slice(0, slash) : "unknown";
    };
    const regressions = tasks
        .map((task) => {
            const delta = task.akm.passRate - task.noakm.passRate;
            return {
                taskId: task.id,
                domain: domainOf(task.id),
                noakmPassRate: task.noakm.passRate,
                akmPassRate: task.akm.passRate,
                delta,
                severity: -delta,
            };
        })
        .filter((row) => row.delta < 0);
    regressions.sort((a, b) => b.severity - a.severity || a.taskId.localeCompare(b.taskId));
    let severity = 0;
    for (const row of regressions) {
        severity += row.severity;
    }
    return { count: regressions.length, severity, topRegressedTasks: regressions };
}
|
|
241
|
+
/**
 * Compute per-domain aggregates over a set of per-task entries. Each task
 * contributes once to its domain (K seeds already collapsed). Output rows
 * are sorted by `domain` ascending so JSON / markdown are byte-stable.
 *
 * Domain extraction mirrors `domainOfTaskId` (split on first `/`).
 */
export function computeDomainAggregates(tasks) {
    // Local copy of domainOfTaskId: prefix before the first "/", else "unknown".
    const domainOf = (id) => {
        const slash = id.indexOf("/");
        return slash > 0 ? id.slice(0, slash) : "unknown";
    };
    // Group tasks by domain, preserving encounter order within each bucket.
    const byDomain = new Map();
    for (const task of tasks) {
        const domain = domainOf(task.id);
        const bucket = byDomain.get(domain);
        if (bucket) {
            bucket.push(task);
        }
        else {
            byDomain.set(domain, [task]);
        }
    }
    const meanOrNull = (nums) => (nums.length === 0 ? null : nums.reduce((a, b) => a + b, 0) / nums.length);
    const rows = [];
    for (const [domain, group] of byDomain) {
        const n = group.length;
        let noakmPassSum = 0;
        let akmPassSum = 0;
        let noakmWallSum = 0;
        let akmWallSum = 0;
        let regressionCount = 0;
        const noakmTpp = [];
        const akmTpp = [];
        for (const task of group) {
            noakmPassSum += task.noakm.passRate;
            akmPassSum += task.akm.passRate;
            noakmWallSum += task.noakm.wallclockMs;
            akmWallSum += task.akm.wallclockMs;
            if (task.akm.passRate < task.noakm.passRate) {
                regressionCount += 1;
            }
            // tokensPerPass is null when a task never passed in that arm;
            // such tasks are dropped from that arm's token mean.
            if (task.noakm.tokensPerPass !== null) {
                noakmTpp.push(task.noakm.tokensPerPass);
            }
            if (task.akm.tokensPerPass !== null) {
                akmTpp.push(task.akm.tokensPerPass);
            }
        }
        const passRateNoakm = noakmPassSum / n;
        const passRateAkm = akmPassSum / n;
        const meanNoakmTpp = meanOrNull(noakmTpp);
        const meanAkmTpp = meanOrNull(akmTpp);
        rows.push({
            domain,
            taskCount: n,
            regressionCount,
            passRateNoakm,
            passRateAkm,
            passRateDelta: passRateAkm - passRateNoakm,
            tokensPerPassDelta: meanNoakmTpp === null || meanAkmTpp === null ? null : meanAkmTpp - meanNoakmTpp,
            wallclockMsDelta: akmWallSum / n - noakmWallSum / n,
        });
    }
    rows.sort((a, b) => a.domain.localeCompare(b.domain));
    return rows;
}
|
|
300
|
+
/**
 * Compute asset-regression-candidate rows (#260). Walks the AKM-arm runs,
 * keeps only those whose `taskId` is in `regressedTaskIds`, and tallies how
 * often each loaded asset shows up. `regressedTaskCount` (distinct task IDs
 * touched) is the primary sort key — assets that hurt many tasks are more
 * actionable than assets that flooded one task across seeds.
 *
 * Sort: regressedTaskCount desc, totalLoadCount desc, assetRef asc.
 *
 * @param regressedTaskIds - IDs of tasks that regressed under the akm arm.
 * @param akmRuns - akm-arm RunResults (each with `taskId` and `assetsLoaded`).
 * @returns candidate rows, or `[]` when no task regressed.
 */
export function computeAssetRegressionCandidates(regressedTaskIds, akmRuns) {
    const regressedSet = new Set(regressedTaskIds);
    if (regressedSet.size === 0) {
        return [];
    }
    const taskIdsByAsset = new Map();
    const totalLoadByAsset = new Map();
    for (const run of akmRuns) {
        if (!regressedSet.has(run.taskId)) {
            continue;
        }
        for (const ref of run.assetsLoaded ?? []) {
            if (!taskIdsByAsset.has(ref)) {
                taskIdsByAsset.set(ref, new Set());
            }
            taskIdsByAsset.get(ref).add(run.taskId);
            totalLoadByAsset.set(ref, (totalLoadByAsset.get(ref) ?? 0) + 1);
        }
    }
    const rows = [...taskIdsByAsset.entries()].map(([assetRef, taskIds]) => ({
        assetRef,
        regressedTaskCount: taskIds.size,
        regressedTaskIds: [...taskIds].sort(),
        totalLoadCount: totalLoadByAsset.get(assetRef) ?? 0,
    }));
    // `||` falls through on a zero difference, mirroring the tiered compare.
    rows.sort((a, b) => (b.regressedTaskCount - a.regressedTaskCount) ||
        (b.totalLoadCount - a.totalLoadCount) ||
        a.assetRef.localeCompare(b.assetRef));
    return rows;
}
|
|
347
|
+
// ── Per-asset attribution (§6.5) ───────────────────────────────────────────
/**
 * (This comment documents `extractAssetLoads`, defined below the two
 * constants it relies on.)
 *
 * Extract the unique asset refs an agent loaded during a run by scanning
 * `events[]` and `verifierStdout` for `akm show <ref>` invocations.
 *
 * Detection strategy (all heuristic, all conservative):
 *   1. `event.eventType === "show"` with `event.ref` (forward-compat — akm
 *      itself does not currently emit `show` events).
 *   2. Substring match on `akm show <ref>` in stdout. The ref shape is
 *      `[origin//]type:name` per the v1 contract; we accept word-boundary
 *      terminators after the name.
 *   3. Tool-call JSON `{"args":["show","<ref>"]}` — the form opencode logs
 *      when the agent invokes the akm CLI as a tool. We extract refs that
 *      look like asset refs from the args array entries adjacent to "show".
 *
 * Returns refs in first-seen order, deduplicated. Bounded scan: stdout is
 * truncated at 16 MiB (the same cap the trajectory parser uses) to keep
 * runaway agents from OOMing the bench.
 */
const ASSET_LOAD_STDOUT_SCAN_CAP = 16 * 1024 * 1024;
// Asset ref grammar: optional `origin//` prefix, type:name, where type and
// name are lowercase letters, digits, `_`, `-`. We deliberately do NOT match
// `://` schemes (those are install locators, not asset refs). The character
// class is intentionally tight so we don't mis-pickup arbitrary words after
// `akm show`. The `name` segment is restricted to `[A-Za-z0-9_-]+` (no `/`,
// no `.`) — the v1 grammar in src/core/asset-ref.ts permits `/` and `.` in
// names (e.g. `script:db/migrate/run.sh`), but the masker treats names as
// untrusted input and rejects any traversal-shaped value, so the bench-side
// scanner does not need (or want) to extract such refs from agent stdout.
// Limiting the regex here is defense-in-depth against a prompt-injected
// agent emitting `akm show "skill:../../etc"` and us pulling that ref into
// the masking flow.
//
// NOTE: the `/g` flag makes `.test()`/`.exec()` stateful (lastIndex). This
// constant is currently only a documentation seam (see the `void` below) —
// if it is ever used for matching, prefer `String.prototype.matchAll` or
// reset `lastIndex` between calls.
const ASSET_REF_PATTERN = /(?:[a-z0-9_-]+\/\/)?[a-z][a-z0-9_-]*:[A-Za-z0-9_-]+/g;
|
|
380
|
+
/**
 * Extract the unique asset refs an agent loaded during a run (see the
 * detection-strategy notes above `ASSET_LOAD_STDOUT_SCAN_CAP`).
 *
 * @param runResult - run record with optional `events[]` and `verifierStdout`.
 * @returns asset refs in first-seen order, deduplicated.
 */
export function extractAssetLoads(runResult) {
    const seen = new Set();
    const out = [];
    // Dedupe while preserving first-seen order; ignores empty refs.
    const push = (ref) => {
        if (!ref || seen.has(ref))
            return;
        seen.add(ref);
        out.push(ref);
    };
    // 1. Events stream. `show` events are forward-compat — akm itself does
    // not currently emit them. Rows rehydrated from a persisted §13.3
    // envelope may carry no `events` field at all (the serializer drops the
    // stream), so default to an empty list instead of throwing.
    for (const event of runResult.events ?? []) {
        if (event.eventType !== "show")
            continue;
        if (typeof event.ref === "string")
            push(event.ref);
        const meta = event.metadata;
        if (meta && typeof meta === "object" && typeof meta.ref === "string")
            push(meta.ref);
    }
    // 2 & 3. Stdout scanning. Bound the scan so a runaway agent stdout cannot
    // OOM the bench. Truncation is silent — the trajectory parser already
    // surfaces a warning for the same data on its own scan.
    let haystack = runResult.verifierStdout || "";
    if (haystack.length > ASSET_LOAD_STDOUT_SCAN_CAP) {
        haystack = haystack.slice(0, ASSET_LOAD_STDOUT_SCAN_CAP);
    }
    // 2. `akm show <ref>` literal form. Accept optional quoting around the
    // ref so shell traces like `akm show "skill:foo"` work too.
    const literalRe = /akm\s+show\s+["']?((?:[a-z0-9_-]+\/\/)?[a-z][a-z0-9_-]*:[A-Za-z0-9_-]+)["']?/g;
    for (const literalMatch of haystack.matchAll(literalRe)) {
        push(literalMatch[1]);
    }
    // 3. Tool-call JSON form: `"args":[..., "show", "<ref>", ...]`. A second
    // cheap pass keeps the pattern simple.
    const toolCallRe = /"show"\s*,\s*"((?:[a-z0-9_-]+\/\/)?[a-z][a-z0-9_-]*:[A-Za-z0-9_-]+)"/g;
    for (const toolCallMatch of haystack.matchAll(toolCallRe)) {
        push(toolCallMatch[1]);
    }
    return out;
}
|
|
425
|
+
// Suppress the unused warning for `ASSET_REF_PATTERN` above. The constant is
// retained as the documentation seam called out by the #251 review addenda,
// even though `extractAssetLoads` uses inline regexes for its two scan forms.
// (`void expr` evaluates and discards the expression — a conventional way to
// mark a binding as intentionally referenced.)
void ASSET_REF_PATTERN;
|
|
429
|
+
/**
 * Anchored variant of `ASSET_REF_PATTERN` for whole-string validation.
 *
 * Used by `materialiseMaskedStash` (#251) to gate every asset ref BEFORE we
 * touch the filesystem. The base `ASSET_REF_PATTERN` is `/g`-flagged for
 * scanning agent stdout; we re-anchor here so a hostile string like
 * `skill:foo/../../etc` is rejected as a whole even though the regex would
 * happily match a `skill:foo` substring under `/g`.
 *
 * Rejects `..`, absolute paths, drive letters, null bytes, `/`, `\`, and
 * anything else outside the v1 ref grammar (mirrors src/core/asset-ref.ts).
 */
const ASSET_REF_ANCHORED = /^(?:[a-z0-9_-]+\/\/)?[a-z][a-z0-9_-]*:[A-Za-z0-9_-]+$/;
/**
 * Reject hostile asset refs before they reach any `fs.rmSync` call. The ref
 * comes from agent stdout (untrusted; the agent could be prompt-injected),
 * so the whole string must satisfy the anchored v1 grammar, with an explicit
 * null-byte pre-check as defense in depth.
 *
 * @param ref - candidate asset ref string (may be empty/null-ish).
 * @returns true only for refs matching the anchored grammar.
 */
function isSafeAssetRef(ref) {
    const plausible = Boolean(ref) && !ref.includes("\0");
    return plausible ? ASSET_REF_ANCHORED.test(ref) : false;
}
|
|
456
|
+
/**
 * Aggregate per-asset load + pass counts across all akm-arm runs in a report.
 *
 * Sort order (stable, deterministic):
 *   1. loadCount descending (most-used first)
 *   2. loadPassRate descending (working assets above broken ones at the same load count)
 *   3. assetRef ascending (alphabetical tiebreak)
 *
 * Only `arm === "akm"` runs contribute. The `noakm` arm has no stash and
 * cannot load assets, so including it would zero-bias the rates.
 *
 * @param report - UtilityRunReport (akm-arm runs read via `collectAkmRuns`).
 * @returns `{ rows, totalAkmRuns }`.
 */
export function computePerAssetAttribution(report) {
    // The §13.3 task entry doesn't carry RunResults — we read them from the
    // shared akm-arm runs collection stamped onto `report.akmRuns`.
    const akmRuns = collectAkmRuns(report);
    const totalAkmRuns = akmRuns.length;
    const passCounts = new Map();
    const failCounts = new Map();
    for (const run of akmRuns) {
        const tally = run.outcome === "pass" ? passCounts : failCounts;
        for (const ref of run.assetsLoaded ?? []) {
            tally.set(ref, (tally.get(ref) ?? 0) + 1);
        }
    }
    // Union of refs seen on either side, in first-seen order.
    const allRefs = new Set([...passCounts.keys(), ...failCounts.keys()]);
    const rows = [...allRefs].map((assetRef) => {
        const loadCountPassing = passCounts.get(assetRef) ?? 0;
        const loadCountFailing = failCounts.get(assetRef) ?? 0;
        const loadCount = loadCountPassing + loadCountFailing;
        return {
            assetRef,
            loadCountPassing,
            loadCountFailing,
            loadCount,
            loadPassRate: loadCount === 0 ? null : loadCountPassing / loadCount,
        };
    });
    // Null pass rates sort below any real rate (mapped to -1 for comparison).
    rows.sort((a, b) => (b.loadCount - a.loadCount) ||
        ((b.loadPassRate ?? -1) - (a.loadPassRate ?? -1)) ||
        a.assetRef.localeCompare(b.assetRef));
    return { rows, totalAkmRuns };
}
|
|
507
|
+
/**
 * Pull the akm-arm RunResults out of a UtilityRunReport. The runner stamps
 * them into the optional `akmRuns` field on the report so attribution can
 * post-process them without re-running.
 *
 * @param report - UtilityRunReport (the `akmRuns` field may be absent).
 * @returns the stamped array as-is, or `[]` when missing / not an array.
 */
function collectAkmRuns(report) {
    return Array.isArray(report.akmRuns) ? report.akmRuns : [];
}
|
|
517
|
+
// ── runs[] serialisation (#249) ────────────────────────────────────────────
/**
 * Project a list of RunResults onto the compact `runs[]` rows persisted
 * inside the §13.3 JSON envelope (#249). One row per (task, arm, seed)
 * triple; the renderer walks the input order verbatim, which the runner
 * already builds deterministically (per-task block, noakm before akm,
 * seeds in ascending order).
 *
 * Aggregate metrics (per-task, trajectory, failure-mode, search-bridge,
 * attribution) MUST be recomputable from these rows + task metadata. This
 * helper is the canonical projection — keep it in lockstep with the field
 * list in the issue body.
 *
 * @param runs - RunResults in runner order.
 * @returns one serialized row per run, same order.
 */
export function aggregateRunsForReport(runs) {
    // Explicit arrow keeps `serializeRunForReport` a single-argument call.
    return runs.map((run) => serializeRunForReport(run));
}
|
|
533
|
+
/**
 * Hydrate a persisted `runs[]` row back into the `RunResult` shape that
 * downstream metrics helpers (`computePerAssetAttribution`, `aggregateCorpus`,
 * etc.) expect. Used by `bench attribute` / `bench compare` when they read a
 * §13.3 envelope from disk: the persisted row carries a compact subset, but
 * it carries everything those helpers need.
 *
 * Fields the row deliberately does NOT carry are filled with safe defaults:
 *   • `events: []` — events.jsonl is not persisted; downstream attribution
 *     only consults `assetsLoaded` and `verifierStdout`.
 *   • `verifierStdout: ""` — full stdout is intentionally omitted from the
 *     envelope (#249 acceptance criterion). `assetsLoaded` already carries
 *     the post-hoc extraction the agent run produced.
 *   • `schemaVersion: 1` — the report schema implies it.
 *
 * Tokens are passed through as-is so a future `measurement` field added by
 * #252 lands on the rehydrated row automatically. TODO(#252): keep this
 * spread.
 *
 * @param row - a compact RunRecordSerialized row read from disk.
 * @returns a RunResult-shaped object safe to feed to the metrics helpers.
 */
export function rehydrateRunFromSerialized(row) {
    // The compact row uses a permissive Record shape for tokens (see
    // RunRecordSerialized). Coerce defensively so older / sparse artefacts
    // hydrate cleanly: a row missing `tokens`, `trajectory`, or
    // `assets_loaded` entirely must not throw (previously `row.tokens.input`
    // would TypeError on such rows).
    const tok = row.tokens ?? {};
    const traj = row.trajectory ?? {};
    return {
        schemaVersion: 1,
        taskId: row.task_id,
        arm: row.arm,
        seed: row.seed,
        model: row.model,
        outcome: row.outcome,
        tokens: {
            ...tok,
            input: typeof tok.input === "number" ? tok.input : 0,
            output: typeof tok.output === "number" ? tok.output : 0,
        },
        wallclockMs: row.wallclock_ms,
        trajectory: {
            // Absent flags hydrate to null ("unknown") rather than undefined:
            // aggregateTrajectory treats any non-null value as a known sample,
            // so leaking `undefined` here would miscount sparse rows.
            correctAssetLoaded: traj.correct_asset_loaded ?? null,
            feedbackRecorded: traj.feedback_recorded ?? null,
        },
        events: [],
        verifierStdout: "",
        verifierExitCode: row.verifier_exit_code,
        assetsLoaded: [...(row.assets_loaded ?? [])],
        failureMode: (row.failure_mode ?? null),
    };
}
|
|
581
|
+
/**
 * Pick the top-N most-loaded assets from a base report and re-run the corpus
 * with each one masked from its source stash (leave-one-out ablation).
 * Returns a marginal-contribution row per masked asset.
 *
 * @param opts - option bag:
 *   • `baseReport`   — the unmasked UtilityRunReport to attribute against.
 *   • `fixturesRoot` — optional fixture-stash root; defaults to
 *                      `<module dir>/../fixtures/stashes`.
 *   • `topN`         — how many assets to mask (default 5, floor 1; clamped
 *                      internally to the unique-asset count to avoid
 *                      surprising no-op runs).
 *   • `runUtility`   — injected runner invoked once per masked asset.
 *   • `baseOptions`  — options forwarded verbatim into each `runUtility` call.
 * @returns `{ baseReport, attributions, runsPerformed, maskingStrategy,
 *          maskedRefs }`, where each attribution row carries
 *          `marginalContribution = basePassRate - maskedPassRate`.
 *
 * Cost: N * (tasks × arms × seedsPerArm) re-runs. Operators clamp N before
 * calling — but we also clamp internally (see `clamped` below).
 *
 * Source-fixture safety: every masked re-run materialises a fresh tmp copy
 * of the fixture stash, deletes the masked asset's files there, and points
 * the re-run at the tmp dir. The shipped fixture in `tests/fixtures/stashes/`
 * is NEVER mutated.
 */
export async function runMaskedCorpus(opts) {
    const baseReport = opts.baseReport;
    const fixturesRoot = opts.fixturesRoot ?? path.resolve(__dirname, "..", "fixtures", "stashes");
    // Top-N selection reuses the load-count ordering from per-asset attribution.
    const attribution = computePerAssetAttribution(baseReport);
    const desired = Math.max(1, opts.topN ?? 5);
    const clamped = Math.min(desired, attribution.rows.length);
    const baseAkmPassRate = baseReport.aggregateAkm.passRate;
    const top = attribution.rows.slice(0, clamped);
    const attributions = [];
    const maskedRefs = [];
    for (const row of top) {
        const maskedTasks = [];
        const tmpDirs = [];
        try {
            for (const baseTask of baseReport.taskMetadata ?? []) {
                const maskedStashDir = materialiseMaskedStash(fixturesRoot, baseTask.stash, row.assetRef);
                if (maskedStashDir)
                    tmpDirs.push(maskedStashDir);
                // Issue #251: forward the masked stashDir via the explicit
                // `stashDirOverride` field on the cloned TaskMetadata. We MUST NOT
                // mutate `baseTask.stash` (the fixture name) — the runner uses that
                // to call `loadFixtureStash`, and overloading it breaks the
                // `__no-stash__` resolution branch in runner.ts. The runner's AKM-arm
                // branch checks `task.stashDirOverride` first.
                //
                // When `materialiseMaskedStash` returned `null` (asset not present in
                // this fixture, or hostile ref shape rejected by the validator), we
                // intentionally leave both fields untouched. The runner falls back to
                // the normal materialisation flow against the unchanged source
                // fixture — so the re-run still happens, but the result mirrors the
                // base. This is a meaningful diagnostic (the ref didn't bind in this
                // fixture) and is the same accounting `cost-accounting`-style tests
                // assert against.
                if (maskedStashDir) {
                    maskedTasks.push({ ...baseTask, stashDirOverride: maskedStashDir });
                }
                else {
                    maskedTasks.push({ ...baseTask });
                }
            }
            const maskedReport = await opts.runUtility({
                ...opts.baseOptions,
                tasks: maskedTasks,
                // The masked stash already has the correct content on disk, and the
                // runner now resolves it via `task.stashDirOverride`. We still pass
                // `materialiseStash: false` so the runner does not call
                // `loadFixtureStash` against the (unmasked) named fixture — that
                // would waste work and risk re-indexing the source dir.
                materialiseStash: false,
            });
            const maskedPassRate = maskedReport.aggregateAkm.passRate;
            attributions.push({
                assetRef: row.assetRef,
                basePassRate: baseAkmPassRate,
                maskedPassRate,
                marginalContribution: baseAkmPassRate - maskedPassRate,
            });
            maskedRefs.push(row.assetRef);
        }
        finally {
            // Cleanup runs in BOTH success and failure paths (acceptance criterion).
            // Best-effort: a tmpfs failure here is swallowed by the `try/catch` below
            // and the host OS reaps the tmp dir on reboot.
            for (const dir of tmpDirs) {
                try {
                    fs.rmSync(dir, { recursive: true, force: true });
                }
                catch {
                    // Best-effort cleanup; tmpfs cleanup will handle leaks.
                }
            }
        }
    }
    return {
        baseReport,
        attributions,
        runsPerformed: clamped,
        maskingStrategy: "leave-one-out",
        maskedRefs,
    };
}
|
|
676
|
+
/**
 * Copy a fixture stash into a fresh tmp dir, delete every file matching the
 * masked asset ref, and return the tmp dir path. Returns `null` if the named
 * asset is not present in the fixture, or if either the stash name or the
 * asset ref fails validation (we still re-run, but the result will mirror
 * the base — which is itself a meaningful diagnostic).
 *
 * The masking heuristic:
 *   1. Walk `<stash>/*<...>/.stash.json` files.
 *   2. For each entry whose `name` + `type` matches the asset ref, drop the
 *      entry and delete its `filename` if present.
 *   3. Rewrite the `.stash.json` with the trimmed entries (or remove it if
 *      it is now empty).
 *
 * @param fixturesRoot - root directory holding the named fixture stashes.
 * @param stashName - fixture name from task YAML (validated for containment).
 * @param assetRef - `[origin//]type:name` ref extracted from agent stdout
 *   (untrusted — validated against the anchored grammar before any fs work).
 * @returns tmp dir path of the masked copy, or `null` when skipped.
 */
export function materialiseMaskedStash(fixturesRoot, stashName, assetRef) {
    // #271: validate stashName containment BEFORE touching the filesystem.
    // `stashName` originates from a task YAML which, while authored, is part
    // of the fixture corpus the bench loads; a fixture with `stash: "../../etc"`
    // would otherwise resolve outside `fixturesRoot` and let masking edits or
    // copies escape the bench sandbox. path.relative gives the cleanest
    // containment check (handles `..` AND absolute path injection in one go).
    const fixturesRootResolved = path.resolve(fixturesRoot);
    const sourceDir = path.resolve(fixturesRootResolved, stashName);
    const rel = path.relative(fixturesRootResolved, sourceDir);
    if (rel.startsWith("..") || path.isAbsolute(rel))
        return null;
    if (!fs.existsSync(path.join(sourceDir, "MANIFEST.json")))
        return null;
    // Issue #251 review addendum: validate the WHOLE ref against the anchored
    // grammar before we touch the filesystem. The downstream `isSafeAssetNameSegment`
    // + `isPathContained` checks are still applied — this is defense in depth.
    if (!isSafeAssetRef(assetRef))
        return null;
    // `isSafeAssetRef` enforces the anchored v1 grammar, which requires a `:`
    // separator — so `indexOf` cannot return -1 here. (An earlier revision
    // carried an unreachable `colonIdx < 0` fallback; it has been removed.)
    const colonIdx = assetRef.indexOf(":");
    const typeWithOrigin = assetRef.slice(0, colonIdx);
    const name = assetRef.slice(colonIdx + 1);
    const type = typeWithOrigin.includes("//") ? (typeWithOrigin.split("//")[1] ?? typeWithOrigin) : typeWithOrigin;
    // SECURITY: the asset ref originates from agent stdout (untrusted; the
    // agent could be prompt-injected). The masking heuristic below will
    // `fs.rmSync` files under the tmp stash dir whose names are derived from
    // `name`. A traversal-shaped name (`../etc`, `/abs/path`, `..\\..`) would
    // escape the tmp root and delete arbitrary disk content. Reject those
    // shapes BEFORE we materialise — and re-validate after path-resolving
    // each candidate. Mirrors src/core/asset-ref.ts validateName().
    if (!isSafeAssetNameSegment(name))
        return null;
    const tmpRoot = benchMkdtemp(`akm-bench-masked-${stashName}-`);
    copyDirRecursive(sourceDir, tmpRoot);
    // Walk every .stash.json under the tmp root and edit in place.
    walkStashJsonFiles(tmpRoot, (jsonPath) => {
        let raw;
        try {
            raw = fs.readFileSync(jsonPath, "utf8");
        }
        catch {
            return; // unreadable file — leave it untouched
        }
        let parsed;
        try {
            parsed = JSON.parse(raw);
        }
        catch {
            return; // malformed JSON — leave it untouched
        }
        const entries = parsed.entries ?? [];
        const kept = [];
        const jsonDir = path.dirname(jsonPath);
        for (const entry of entries) {
            if (entry.type === type && entry.name === name) {
                // Remove the entry's content file(s). The on-disk `filename` is read
                // from the fixture .stash.json (trusted) but the value still passes
                // through path.relative containment so a malicious fixture can't use
                // this path to escape either.
                const filename = entry.filename;
                if (typeof filename === "string" && isSafeAssetNameSegment(filename)) {
                    const target = path.resolve(jsonDir, filename);
                    if (isPathContained(tmpRoot, target)) {
                        try {
                            fs.rmSync(target, { force: true });
                        }
                        catch {
                            // ignore — best-effort masking
                        }
                    }
                }
                // Some fixtures keep a per-asset directory (e.g. skills/<name>/SKILL.md).
                const dirCandidate = path.resolve(jsonDir, name);
                if (isPathContained(tmpRoot, dirCandidate) &&
                    fs.existsSync(dirCandidate) &&
                    fs.statSync(dirCandidate).isDirectory()) {
                    try {
                        fs.rmSync(dirCandidate, { recursive: true, force: true });
                    }
                    catch {
                        // ignore — best-effort masking
                    }
                }
                continue; // entry dropped
            }
            kept.push(entry);
        }
        if (kept.length === entries.length)
            return; // nothing changed
        if (kept.length === 0) {
            try {
                fs.rmSync(jsonPath, { force: true });
            }
            catch {
                // ignore — best-effort masking
            }
        }
        else {
            fs.writeFileSync(jsonPath, `${JSON.stringify({ ...parsed, entries: kept }, null, 2)}\n`);
        }
    });
    return tmpRoot;
}
|
|
799
|
+
/**
 * Reject any segment that could escape the tmp stash root when used as a
 * relative path component:
 *   - empty string
 *   - any `/` or `\` (path separators)
 *   - the exact `.` or `..` segments
 *   - a `C:`-style Windows drive prefix
 *   - any null byte
 *
 * Mirrors src/core/asset-ref.ts validateName(), but returns a boolean
 * (callers map this to "skip" rather than "throw").
 *
 * @param value - candidate path segment.
 * @returns true when the segment is safe to join under the tmp root.
 */
function isSafeAssetNameSegment(value) {
    if (!value) {
        return false;
    }
    // Every condition here is an independent escape vector; any one rejects.
    const hazards = [
        value.includes("\0"),
        value.includes("/"),
        value.includes("\\"),
        value === ".",
        value === "..",
        /^[A-Za-z]:/.test(value),
    ];
    return !hazards.some(Boolean);
}
|
|
824
|
+
/**
 * After resolving a target path, confirm it lives under `root`. Defense in
 * depth: even if a traversal-shaped name slipped past the segment check,
 * this catches escapes via symlinks or odd `path.join` semantics.
 *
 * #271: aligned with `isWithin` in `src/core/common.ts` — both inputs go
 * through `safeRealpath` so a symlink inside `root` that points outside
 * cannot fool the `path.relative` containment check. The shared helper
 * also handles not-yet-existing children (walks up to the closest existing
 * ancestor and resolves symlinks there) so we keep the existing semantics
 * for `target` paths the masking heuristic is about to create.
 *
 * @param root - directory the target must live under.
 * @param target - candidate path (root itself counts as contained).
 * @returns true when `target` resolves inside (or equal to) `root`.
 */
export function isPathContained(root, target) {
    const rel = path.relative(safeRealpath(root), safeRealpath(target));
    // Empty relative path means target IS root; otherwise it must neither
    // climb out (`..` prefix) nor land on a different drive (absolute rel).
    return rel === "" || (!rel.startsWith("..") && !path.isAbsolute(rel));
}
|
|
848
|
+
/**
 * Depth-first walk of `root`, invoking `visit` with the absolute path of
 * every `.stash.json` file found. Unreadable directories are skipped
 * silently (best-effort, matching the masking heuristic's tolerance).
 *
 * @param root - directory to walk.
 * @param visit - callback receiving each `.stash.json` absolute path.
 */
function walkStashJsonFiles(root, visit) {
    const pending = [root];
    while (pending.length > 0) {
        const dir = pending.pop();
        if (!dir)
            continue;
        let dirents;
        try {
            dirents = fs.readdirSync(dir, { withFileTypes: true });
        }
        catch {
            continue; // unreadable dir — skip
        }
        for (const dirent of dirents) {
            const childPath = path.join(dir, dirent.name);
            if (dirent.isDirectory()) {
                pending.push(childPath);
            }
            else if (dirent.isFile() && dirent.name === ".stash.json") {
                visit(childPath);
            }
        }
    }
}
|
|
870
|
+
/**
 * Recursively copy the contents of `src` into `dest`, creating `dest` (and
 * intermediates) as needed. Only regular files and directories are copied;
 * symlinks and other entry kinds are skipped.
 *
 * @param src - existing source directory.
 * @param dest - destination directory (created if missing).
 */
function copyDirRecursive(src, dest) {
    fs.mkdirSync(dest, { recursive: true });
    for (const dirent of fs.readdirSync(src, { withFileTypes: true })) {
        const from = path.join(src, dirent.name);
        const to = path.join(dest, dirent.name);
        if (dirent.isDirectory()) {
            copyDirRecursive(from, to);
        }
        else if (dirent.isFile()) {
            fs.copyFileSync(from, to);
        }
    }
}
|
|
882
|
+
/**
 * Aggregate trajectory booleans across a bag of runs.
 *
 * @param results - RunResults carrying a `trajectory` record each.
 * @returns `correctAssetLoaded` as the loaded-fraction among runs where the
 *   flag is known (non-null), or null when no run knows; `feedbackRecorded`
 *   as the fraction of ALL runs with the flag strictly `true`.
 */
export function aggregateTrajectory(results) {
    if (results.length === 0) {
        return { correctAssetLoaded: null, feedbackRecorded: 0 };
    }
    // Only non-null flags count as "known" samples for the asset-load rate.
    const known = results.filter((r) => r.trajectory.correctAssetLoaded !== null);
    const loaded = known.filter((r) => r.trajectory.correctAssetLoaded).length;
    const feedback = results.filter((r) => r.trajectory.feedbackRecorded === true).length;
    return {
        correctAssetLoaded: known.length === 0 ? null : loaded / known.length,
        feedbackRecorded: feedback / results.length,
    };
}
|
|
904
|
+
/**
 * Sign threshold below which a delta is rendered as `flat`. `pass_rate` is
 * normalised to `[0, 1]`, so a 0.005 (0.5pp) tolerance keeps tiny K-seed
 * sampling jitter from looking like a regression.
 */
const PASS_RATE_FLAT_TOLERANCE = 0.005;
/** `tokens_per_pass` and `wallclock_ms` use raw counts; 0 is the only "flat". */
const COUNT_FLAT_TOLERANCE = 0;
/**
 * Classify a pass-rate delta as "improve" / "regress" / "flat".
 * Higher is better; null (no data) renders as flat.
 */
function classifyPassRate(delta) {
    if (delta === null || Math.abs(delta) <= PASS_RATE_FLAT_TOLERANCE) {
        return "flat";
    }
    return delta > 0 ? "improve" : "regress";
}
/**
 * Classify a raw-count delta as "improve" / "regress" / "flat".
 * `lowerIsBetter` flips the sign convention (tokens / wallclock go down on
 * improvement); null (no data) renders as flat.
 */
function classifyCount(delta, lowerIsBetter) {
    if (delta === null || Math.abs(delta) <= COUNT_FLAT_TOLERANCE) {
        return "flat";
    }
    const improved = lowerIsBetter ? delta < 0 : delta > 0;
    return improved ? "improve" : "regress";
}
|
|
928
|
+
/** Read the agent model name from a report, defaulting to "<unknown>". */
function readModel(r) {
    const model = r.agent?.model;
    return model ?? "<unknown>";
}
|
|
931
|
+
/** Read `corpus.fixtureContentHash` from a report; null when absent. */
function readFixtureHash(r) {
    // `??` normalises both undefined and null to null, matching the
    // original explicit undefined/null check.
    return r.corpus?.fixtureContentHash ?? null;
}
|
|
935
|
+
/** Read `corpus.taskCorpusHash` from a report; null when absent. */
function readTaskCorpusHash(r) {
    // `??` normalises both undefined and null to null, matching the
    // original explicit undefined/null check.
    return r.corpus?.taskCorpusHash ?? null;
}
|
|
939
|
+
/** Read `corpus.selectedTaskIds` from a report; null unless it is an array. */
function readSelectedTaskIds(r) {
    const ids = r.corpus?.selectedTaskIds;
    if (Array.isArray(ids)) {
        return ids;
    }
    return null;
}
|
|
943
|
+
/**
 * Order-insensitive equality for two string arrays: equal length and the
 * same multiset of elements (duplicates matter). Inputs are not mutated —
 * each side is copied before sorting.
 */
function arraysEqualIgnoringOrder(a, b) {
    if (a.length !== b.length) {
        return false;
    }
    const sortedA = [...a].sort();
    const sortedB = [...b].sort();
    return sortedA.every((value, i) => value === sortedB[i]);
}
|
|
953
|
+
/**
 * Pull the akm-arm aggregate metrics out of a report with defaults for
 * missing fields: pass_rate 0, tokens_per_pass null, wallclock_ms 0.
 *
 * NOTE: deliberately uses `??` per field (not destructuring defaults) so a
 * stored `null` also falls back — destructuring defaults only cover
 * `undefined`.
 */
function akmAgg(r) {
    const agg = r.aggregate?.akm ?? {};
    const pass_rate = agg.pass_rate ?? 0;
    const tokens_per_pass = agg.tokens_per_pass ?? null;
    const wallclock_ms = agg.wallclock_ms ?? 0;
    return { pass_rate, tokens_per_pass, wallclock_ms };
}
|
|
961
|
+
/**
 * Diff two parsed UtilityRunReport JSONs.
 *
 * Refusal cases:
 *   • Either side missing `schemaVersion: 1` or `track: "utility"` →
 *     `schema_mismatch` / `track_mismatch`.
 *   • `agent.model` differs → `model_mismatch`.
 *   • Both sides report a `corpus.fixtureContentHash` and they differ →
 *     `hash_mismatch`. Missing hash on either side proceeds with a warning
 *     (Wave A may add it; older reports won't have it).
 *
 * On success the per-task table includes rows for every task in either side,
 * plus aggregate deltas computed against the akm arm only (the noakm arm is
 * the control — its delta is meaningless). `pass_rate` is in `[0, 1]`,
 * higher is better; `tokens_per_pass` and `wallclock_ms` are counts, lower
 * is better.
 *
 * @param {object} base - baseline report (parsed JSON).
 * @param {object} current - candidate report (parsed JSON).
 * @param {{ allowCorpusMismatch?: boolean, allowFixtureMismatch?: boolean }} [options]
 *   opt-in overrides for the corpus / fixture pin checks; when set, the
 *   corresponding refusal degrades to a warning.
 * @returns {object} `{ ok: true, … }` diff payload, or `{ ok: false, reason, message, … }`.
 */
export function compareReports(base, current, options = {}) {
  // Schema-version gate.
  if (base.schemaVersion !== 1 || current.schemaVersion !== 1) {
    return {
      ok: false,
      reason: "schema_mismatch",
      message: `compare requires schemaVersion=1 on both sides; got base=${String(base.schemaVersion)}, current=${String(current.schemaVersion)}`,
    };
  }
  // Track gate. Cross-track diffs are nonsensical.
  if (base.track !== "utility" || current.track !== "utility") {
    return {
      ok: false,
      reason: "track_mismatch",
      message: `compare only supports track="utility"; got base="${String(base.track)}", current="${String(current.track)}"`,
    };
  }
  const baseModel = readModel(base);
  const currentModel = readModel(current);
  if (baseModel !== currentModel) {
    return {
      ok: false,
      reason: "model_mismatch",
      message: `cannot compare across different models: base="${baseModel}", current="${currentModel}". Rerun on the same model.`,
      baseModel,
      currentModel,
    };
  }
  const baseHash = readFixtureHash(base);
  const currentHash = readFixtureHash(current);
  const warnings = [];
  // #250 — task corpus hash + selected task IDs. Refused unless either side
  // is legacy (missing the hash) or the operator passed
  // `allowCorpusMismatch`. Legacy reports (no taskCorpusHash) degrade to a
  // warning so older artefacts can still be diffed.
  const baseTaskHash = readTaskCorpusHash(base);
  const currentTaskHash = readTaskCorpusHash(current);
  const baseIds = readSelectedTaskIds(base);
  const currentIds = readSelectedTaskIds(current);
  if (baseTaskHash !== null && currentTaskHash !== null && baseTaskHash !== currentTaskHash) {
    if (!options.allowCorpusMismatch) {
      return {
        ok: false,
        reason: "corpus_mismatch",
        message: `cannot compare across different task corpora: base taskCorpusHash="${baseTaskHash}", current="${currentTaskHash}". Rerun against the same task selection or pass --allow-corpus-mismatch to override.`,
        baseModel,
        currentModel,
        baseTaskCorpusHash: baseTaskHash,
        currentTaskCorpusHash: currentTaskHash,
        // Include the ID lists only when the reports actually carry them.
        ...(baseIds ? { baseSelectedTaskIds: baseIds } : {}),
        ...(currentIds ? { currentSelectedTaskIds: currentIds } : {}),
      };
    }
    warnings.push(`task corpus hashes differ (base="${baseTaskHash}", current="${currentTaskHash}") — diff requested via --allow-corpus-mismatch`);
  }
  else if (baseTaskHash === null &&
    currentTaskHash === null &&
    baseIds !== null &&
    currentIds !== null &&
    !arraysEqualIgnoringOrder(baseIds, currentIds)) {
    // Both sides legacy (no taskCorpusHash) but both expose selectedTaskIds
    // and they differ. We can still detect a mismatched corpus from the ID
    // list alone — refuse unless the operator opted in.
    if (!options.allowCorpusMismatch) {
      return {
        ok: false,
        reason: "corpus_mismatch",
        message: `cannot compare across different selected task IDs. Rerun against the same task selection or pass --allow-corpus-mismatch to override.`,
        baseModel,
        currentModel,
        baseSelectedTaskIds: baseIds,
        currentSelectedTaskIds: currentIds,
      };
    }
    warnings.push("selected task IDs differ — diff requested via --allow-corpus-mismatch");
  }
  // Legacy-report warnings are emitted even when the corpus checks passed.
  if (baseTaskHash === null)
    warnings.push("base report has no corpus.taskCorpusHash; proceeding without task-corpus-pin check");
  if (currentTaskHash === null)
    warnings.push("current report has no corpus.taskCorpusHash; proceeding without task-corpus-pin check");
  if (baseHash !== null && currentHash !== null && baseHash !== currentHash) {
    if (!options.allowFixtureMismatch) {
      return {
        ok: false,
        reason: "hash_mismatch",
        message: `cannot compare across different fixture-content hashes: base="${baseHash}", current="${currentHash}". Rerun against matching fixtures or pass --allow-fixture-mismatch to override.`,
        baseModel,
        currentModel,
        baseFixtureContentHash: baseHash,
        currentFixtureContentHash: currentHash,
      };
    }
    warnings.push(`fixture-content hashes differ (base="${baseHash}", current="${currentHash}") — diff requested via --allow-fixture-mismatch`);
  }
  if (baseHash === null)
    warnings.push("base report has no corpus.fixtureContentHash; proceeding without fixture-pin check");
  if (currentHash === null)
    warnings.push("current report has no corpus.fixtureContentHash; proceeding without fixture-pin check");
  // Aggregate (akm arm is the one that matters — noakm is the control).
  const ba = akmAgg(base);
  const ca = akmAgg(current);
  const passRateDelta = ca.pass_rate - ba.pass_rate;
  // tokens_per_pass is null when either side never passed — delta is then undefined.
  const tokensPerPassDelta = ba.tokens_per_pass === null || ca.tokens_per_pass === null ? null : ca.tokens_per_pass - ba.tokens_per_pass;
  const wallclockMsDelta = ca.wallclock_ms - ba.wallclock_ms;
  const aggregate = {
    passRateDelta,
    passRateSign: classifyPassRate(passRateDelta),
    tokensPerPassDelta,
    tokensPerPassSign: classifyCount(tokensPerPassDelta, true),
    wallclockMsDelta,
    wallclockMsSign: classifyCount(wallclockMsDelta, true),
  };
  // Per-task rows. Outer-join on task id.
  const baseTasks = new Map();
  for (const t of base.tasks ?? [])
    baseTasks.set(t.id, t);
  const currentTasks = new Map();
  for (const t of current.tasks ?? [])
    currentTasks.set(t.id, t);
  const allIds = new Set();
  for (const id of baseTasks.keys())
    allIds.add(id);
  for (const id of currentTasks.keys())
    allIds.add(id);
  const perTask = [];
  for (const id of [...allIds].sort()) {
    const b = baseTasks.get(id);
    const c = currentTasks.get(id);
    const bM = b?.akm ?? null;
    const cM = c?.akm ?? null;
    const presence = b !== undefined && c !== undefined ? "both" : b !== undefined ? "base-only" : "current-only";
    // Per-metric deltas are only defined when both sides carry akm metrics.
    const passRateDelta_ = bM !== null && cM !== null ? cM.pass_rate - bM.pass_rate : null;
    const tokensPerPassDelta_ = bM !== null && cM !== null && bM.tokens_per_pass !== null && cM.tokens_per_pass !== null
      ? cM.tokens_per_pass - bM.tokens_per_pass
      : null;
    const wallclockMsDelta_ = bM !== null && cM !== null ? cM.wallclock_ms - bM.wallclock_ms : null;
    perTask.push({
      id,
      presence,
      baseMetrics: bM,
      currentMetrics: cM,
      delta: { passRate: passRateDelta_, tokensPerPass: tokensPerPassDelta_, wallclockMs: wallclockMsDelta_ },
      signMarker: classifyPassRate(passRateDelta_),
    });
  }
  return {
    ok: true,
    baseModel,
    currentModel,
    baseFixtureContentHash: baseHash,
    currentFixtureContentHash: currentHash,
    warnings,
    aggregate,
    perTask,
  };
}
|
|
1134
|
+
/** Maximum rank at which the gold ref still counts as "found"; > this is `search_low_rank`. */
const SEARCH_RANK_CUTOFF = 5;
/** Cap on the number of characters of `verifierStdout` we substring-scan. Mirrors trajectory.ts. */
const FAILURE_MODE_STDOUT_SCAN_CAP = 16 * 1024 * 1024; // 16 MiB
|
|
1138
|
+
/**
 * Classify a single failed run into one of the §6.6 labels. Pure function —
 * consults `runResult.trajectory.correctAssetLoaded` first (trajectory data
 * is authoritative when present), then falls back to string-matching
 * `runResult.events[]` and `runResult.verifierStdout`. Never calls an LLM,
 * never touches the filesystem.
 *
 * Decision tree (priority order — first match wins):
 *  1. Run not failed (`pass`, `budget_exceeded`, `harness_error`) → `null`.
 *  2. `trajectory.correctAssetLoaded === true` → the agent loaded the gold
 *     asset but still failed. This is `loaded_ignored` (agent wrote from
 *     memory instead of applying asset content). This short-circuit fixes
 *     the 2026-05-03 baseline bug where 24/25 `search_no_gold` labels were
 *     wrong because the classifier didn't consult trajectory data.
 *  3. No `akm search` call in the trace:
 *     a. If task has no `goldRef` (so `correctAssetLoaded` is always null)
 *        → `no_events` (trajectory metric undefined; cannot distinguish
 *        "agent ran but events absent" from "agent never ran").
 *     b. Otherwise → `no_search`.
 *  4. Search ran, no goldRef → `unrelated_bug`.
 *  5. Search ran; gold ref absent from results → `search_no_gold`.
 *     (Only reachable when `correctAssetLoaded` is false or null, since
 *     true is handled in step 2.)
 *  6. Gold ref present at rank > 5 → `search_low_rank`.
 *  7. `akm show` invoked on a non-gold ref AND gold ref never loaded
 *     → `loaded_wrong`.
 *  8. Gold ref loaded; verifier output suggests the action contradicts the
 *     asset's guidance → `loaded_ignored`.
 *  9. Gold ref loaded and apparently followed → `followed_wrong`.
 *  10. Default → `unrelated_bug`.
 *
 * @param {object} taskMeta - task metadata; only `goldRef` is read here.
 * @param {object} runResult - run record; reads `outcome`, `trajectory`,
 *   `events` and `verifierStdout`.
 * @returns {string|null} a §6.6 failure-mode label, or `null` when the run
 *   did not fail.
 */
export function classifyFailureMode(taskMeta, runResult) {
  if (runResult.outcome !== "fail")
    return null;
  const goldRef = taskMeta.goldRef;
  const correctAssetLoaded = runResult.trajectory?.correctAssetLoaded;
  // 1. Trajectory short-circuit: if events data confirms the gold asset was
  //    loaded, the failure must be compliance-related, not discovery-related.
  //    Return `loaded_ignored` immediately without scanning stdout.
  if (correctAssetLoaded === true) {
    return "loaded_ignored";
  }
  const trace = collectTrace(runResult);
  // 2. no_search / no_events — no `akm search` invocation anywhere in the trace.
  if (!hasAkmSearch(trace, runResult)) {
    // When there is no goldRef, correctAssetLoaded is always null (the metric
    // is undefined). We cannot tell whether the agent genuinely didn't search
    // or whether events data was simply absent. Use `no_events` to surface
    // this ambiguity rather than conflating it with `no_search`.
    if (!goldRef) {
      return "no_events";
    }
    return "no_search";
  }
  // Without a gold ref the search-based and load-based checks are undefined.
  // We can only distinguish "no_search" / "no_events" from everything else.
  if (!goldRef) {
    return "unrelated_bug";
  }
  const searchRank = findGoldSearchRank(trace, goldRef);
  // 3. search_no_gold — search ran (precondition above) but gold ref absent.
  //    Only reachable when correctAssetLoaded is false or null (trajectory
  //    data indicates gold was not loaded), because true is handled above.
  if (searchRank === null) {
    return "search_no_gold";
  }
  // 4. search_low_rank — present but below the cutoff.
  if (searchRank > SEARCH_RANK_CUTOFF) {
    return "search_low_rank";
  }
  const goldLoaded = hasAkmShow(trace, runResult, goldRef);
  const otherRefLoaded = hasAkmShowOtherRef(trace, runResult, goldRef);
  // 5. loaded_wrong — agent showed a non-gold ref AND never loaded the gold.
  if (otherRefLoaded && !goldLoaded) {
    return "loaded_wrong";
  }
  // The remaining branches all assume the gold was loaded.
  if (!goldLoaded) {
    // Gold ref was found in search at an acceptable rank, but the agent
    // never loaded anything (gold or otherwise) before failing. The taxonomy
    // table has no row for "found but never opened" — treat as unrelated_bug.
    return "unrelated_bug";
  }
  // 6. loaded_ignored — verifier diagnostic indicates the action contradicts
  //    the loaded asset. Conservative heuristic: look for explicit "ignored"
  //    or "not applied" markers in the verifier stdout. Without an LLM we
  //    cannot detect subtler contradictions, so this branch only fires when
  //    the verifier itself flagged the contradiction.
  if (verifierIndicatesIgnored(runResult.verifierStdout)) {
    return "loaded_ignored";
  }
  // 7. followed_wrong — gold loaded, apparently followed, verifier still
  //    failed. The §6.6 spec maps this to "the asset itself is wrong".
  return "followed_wrong";
}
|
|
1233
|
+
/**
 * Build a `FailureModeAggregate` from a list of `{ taskId, mode }` pairs:
 * a global count per label plus a per-task label histogram.
 */
export function aggregateFailureModes(entries) {
  const byLabel = {};
  const byTask = {};
  for (const { taskId, mode } of entries) {
    byLabel[mode] = (byLabel[mode] ?? 0) + 1;
    if (byTask[taskId] === undefined) {
      byTask[taskId] = {};
    }
    const taskSlot = byTask[taskId];
    taskSlot[mode] = (taskSlot[mode] ?? 0) + 1;
  }
  return { byLabel, byTask };
}
|
|
1245
|
+
// ── Failure-mode classifier helpers ────────────────────────────────────────
|
|
1246
|
+
/**
 * Scan string shared by the failure-mode helpers, built once per classify
 * call. Stdout is capped per the trajectory-parser convention so runaway
 * agents can't OOM the bench.
 */
function collectTrace(runResult) {
  const stdout = runResult.verifierStdout ?? "";
  // slice() is a no-op copy when the stdout is already under the cap.
  return stdout.slice(0, FAILURE_MODE_STDOUT_SCAN_CAP);
}
|
|
1257
|
+
/** Does the trace contain any `akm search` invocation (CLI form OR event)? */
function hasAkmSearch(trace, runResult) {
  // CLI tool-call form, e.g. `akm search "deploy homelab"`.
  if (/\bakm\s+search\b/.test(trace)) {
    return true;
  }
  // Tool-call JSON form, e.g. `"args":["search","..."]`.
  if (trace.includes(`"search"`) && /["']search["']/.test(trace)) {
    return true;
  }
  // Event-stream form (search verbs aren't currently emitted but the field
  // is forward-compatible — see core/events.ts).
  return runResult.events.some(
    (event) => event.eventType === "search" || event.eventType === "search_invoked",
  );
}
|
|
1273
|
+
/**
 * Find the 1-based rank of `goldRef` in the search results captured in the
 * trace, or `null` when it never shows up. The scan is restricted to the
 * text after the first `akm search` invocation so we don't pick up
 * `akm show` output; when the invocation can't be located (caller already
 * verified a search ran, so the regexes disagree), the whole trace is
 * scanned instead.
 */
function findGoldSearchRank(trace, goldRef) {
  const invocation = trace.match(/\bakm\s+search\b/);
  const haystack =
    invocation && invocation.index !== undefined ? trace.slice(invocation.index) : trace;
  return findRefRankInText(haystack, goldRef);
}
|
|
1295
|
+
/**
 * Best-effort rank of `goldRef` inside `text`: numbered-list form first,
 * then a JSON `"results":[...]` array, then bare substring presence (which
 * is scored as rank 1 — best case for the agent — so the caller's
 * `search_low_rank` rule doesn't false-positive).
 */
function findRefRankInText(text, goldRef) {
  // Numbered list: lines of the form `<rank>. <ref>` or `<rank>) <ref>`.
  for (const hit of text.matchAll(/^\s*(\d{1,3})[.)]\s+([^\s]+)/gm)) {
    if (refsMatch(hit[2], goldRef)) {
      return Number.parseInt(hit[1], 10);
    }
  }
  // JSON array form: `"results":["a","b","skill:foo"]`. Rank is estimated by
  // splitting on commas inside the brackets. Best-effort.
  const jsonMatch = text.match(/"results"\s*:\s*\[([^\]]+)\]/);
  if (jsonMatch) {
    const items = jsonMatch[1].split(",").map((s) => s.trim().replace(/^["']|["']$/g, ""));
    const idx = items.findIndex((item) => refsMatch(item, goldRef));
    if (idx >= 0) {
      return idx + 1;
    }
  }
  // Bare substring presence — assume rank 1.
  const escaped = goldRef.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
  if (new RegExp(`\\b${escaped}\\b`).test(text)) {
    return 1;
  }
  return null;
}
|
|
1326
|
+
/** True when `candidate` is `goldRef` or a strict ref-extension thereof. */
function refsMatch(candidate, goldRef) {
  return (
    candidate === goldRef ||
    candidate.endsWith(`//${goldRef}`) ||
    candidate.startsWith(`${goldRef}/`)
  );
}
|
|
1336
|
+
/** Did the agent invoke `akm show <goldRef>` at any point? */
function hasAkmShow(trace, runResult, goldRef) {
  const escaped = goldRef.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
  // CLI form, exact ref. Also matches origin-prefixed variants like
  // `akm show team//skill:foo` because the `[\w-]+//` prefix is optional.
  const cliRe = new RegExp(`\\bakm\\s+show\\s+["']?(?:[\\w-]+//)?${escaped}(?:\\b|\\W)`);
  if (cliRe.test(trace)) {
    return true;
  }
  // Tool-call JSON form: `"args":["show","skill:foo"]`.
  if (trace.includes(`"show"`) && trace.includes(goldRef)) {
    return true;
  }
  // Event-stream form. Only "show"/"load"/"tool_call" eventTypes count; a
  // `feedback` event mentioning the ref doesn't mean the agent loaded it
  // during this run.
  const isLoadEvent = (eventType) =>
    eventType === "show" || eventType === "load" || eventType === "tool_call";
  for (const event of runResult.events) {
    if (typeof event.ref === "string" && refsMatch(event.ref, goldRef) && isLoadEvent(event.eventType)) {
      return true;
    }
    const meta = event.metadata;
    if (meta && typeof meta === "object") {
      const candidate = meta.ref;
      if (typeof candidate === "string" && refsMatch(candidate, goldRef) && isLoadEvent(event.eventType)) {
        return true;
      }
    }
  }
  return false;
}
|
|
1366
|
+
/** Did the agent invoke `akm show <ref>` for some ref OTHER than `goldRef`? */
function hasAkmShowOtherRef(trace, runResult, goldRef) {
  // CLI form: capture the ref argument and reject when it matches the gold.
  for (const hit of trace.matchAll(/\bakm\s+show\s+["']?([^\s"'`]+)/g)) {
    if (!refsMatch(hit[1], goldRef)) {
      return true;
    }
  }
  // Tool-call JSON form: `"args":["show","..."]`. Best-effort scan.
  for (const hit of trace.matchAll(/\["show",\s*"([^"]+)"/g)) {
    if (!refsMatch(hit[1], goldRef)) {
      return true;
    }
  }
  // Event-stream form — only show/load/tool_call events count.
  for (const event of runResult.events) {
    const { eventType } = event;
    if (eventType !== "show" && eventType !== "load" && eventType !== "tool_call") {
      continue;
    }
    if (typeof event.ref === "string" && !refsMatch(event.ref, goldRef)) {
      return true;
    }
    const meta = event.metadata;
    if (meta && typeof meta === "object" && typeof meta.ref === "string" && !refsMatch(meta.ref, goldRef)) {
      return true;
    }
  }
  return false;
}
|
|
1402
|
+
/**
 * Conservative heuristic for the `loaded_ignored` branch. Without an LLM we
 * cannot reliably decide whether an arbitrary action contradicts arbitrary
 * asset content; we only fire when the verifier's own diagnostic explicitly
 * flags the gold-asset guidance as ignored.
 *
 * The verifier stdout strings are deterministic — they come from
 * `runVerifier` and the per-task `verify.sh` scripts. Tasks that want to
 * surface this label should emit one of the agreed-upon markers below.
 */
function verifierIndicatesIgnored(verifierStdout) {
  if (!verifierStdout) {
    return false;
  }
  const lower = verifierStdout.toLowerCase();
  const markers = [
    "ignored gold guidance",
    "guidance ignored",
    "did not follow loaded asset",
    "contradicts loaded asset",
  ];
  return markers.some((marker) => lower.includes(marker));
}
|
|
1421
|
+
/** Cap on the number of result refs we extract per `akm search` invocation (spec top-10 cutoff). */
const TOP_K = 10;
|
|
1423
|
+
/**
 * Extract the gold rank for every `akm search` invocation in a run.
 *
 * The parser scans `runResult.verifierStdout` (which carries the captured
 * agent stdout including its tool-call trace) for `akm search` commands
 * and the result lists that follow them. The first 10 hits are considered;
 * if the gold ref appears, `rankOfGold` is its 1-based position, else
 * `null`.
 *
 * Pure function: never reads from disk and never mutates inputs. When
 * `goldRef` is undefined the function returns `[]` — we only attribute
 * ranks for tasks that actually have a gold asset.
 *
 * @param {object} runResult - run record; only `verifierStdout` is read.
 * @param {string|undefined} goldRef - the task's gold asset ref, if any.
 * @returns {Array<{query: string, results: string[], rankOfGold: number|null}>}
 *   one entry per `akm search` invocation found, in trace order.
 */
export function extractGoldRanks(runResult, goldRef) {
  if (!goldRef)
    return [];
  const haystack = runResult.verifierStdout;
  if (!haystack)
    return [];
  const events = [];
  // Walk the stdout linearly. A search invocation looks like
  //   `akm search "<query>"` or `akm search <query>`
  // and the subsequent block carries the result list. A new `akm` command
  // (or end of stdout) terminates the previous search's result block.
  const lines = haystack.split(/\r?\n/);
  // The search block currently being accumulated, or null between blocks.
  let active = null;
  // Regex for an `akm search` invocation. Captures the rest of the line
  // after `search ` so we can pick up the query whether it's quoted or not.
  const searchInvocationRe = /\bakm\s+search\s+(.+?)(?:\s+--|$)/;
  // A different `akm <verb>` (not `search`) terminates the active block.
  const akmInvocationRe = /\bakm\s+(\w+)/;
  for (const rawLine of lines) {
    const line = rawLine.trim();
    if (!line)
      continue;
    const searchMatch = line.match(searchInvocationRe);
    if (searchMatch) {
      // Flush any active block before starting a new one.
      if (active) {
        active.rankOfGold = computeRank(active.results, goldRef);
        events.push(active);
      }
      const query = stripQuotes(searchMatch[1].trim());
      active = { query, results: [], rankOfGold: null };
      // Some traces inline the JSON result on the same line — try to extract.
      collectRefsFromLine(line, active.results);
      continue;
    }
    if (!active)
      continue;
    // A non-search akm invocation closes the active search block.
    const akmMatch = line.match(akmInvocationRe);
    if (akmMatch && akmMatch[1] !== "search") {
      active.rankOfGold = computeRank(active.results, goldRef);
      events.push(active);
      active = null;
      continue;
    }
    collectRefsFromLine(line, active.results);
  }
  // Flush the trailing block when stdout ended mid-search.
  if (active) {
    active.rankOfGold = computeRank(active.results, goldRef);
    events.push(active);
  }
  return events;
}
|
|
1489
|
+
/**
 * Trim one matching pair of surrounding single or double quotes from a
 * query string; anything else (mismatched or absent quotes) is returned
 * unchanged.
 */
function stripQuotes(s) {
  const wrapped =
    s.length >= 2 &&
    ((s.startsWith('"') && s.endsWith('"')) || (s.startsWith("'") && s.endsWith("'")));
  return wrapped ? s.slice(1, -1) : s;
}
|
|
1500
|
+
/**
 * Pull asset refs from a single line into `out`. Matches both the JSON form
 * `"ref":"<ref>"` (multiple per line possible) and the plain-text form
 * `ref: <ref>` (only when the line starts with `ref:`, to avoid picking up
 * every `:` in arbitrary stdout). Collection stops at TOP_K results to
 * mirror the spec's top-10 cutoff.
 */
function collectRefsFromLine(line, out) {
  if (out.length >= TOP_K)
    return;
  // JSON form: `"ref":"skill:foo"` or `"ref": "skill:foo"`.
  for (const hit of line.matchAll(/"ref"\s*:\s*"([^"]+)"/g)) {
    if (out.length >= TOP_K)
      return;
    out.push(hit[1]);
  }
  // Plain text form: `ref: skill:foo` at the start of the (trimmed) line.
  const textHit = line.match(/^ref:\s*([^\s,]+)/);
  if (textHit && out.length < TOP_K) {
    out.push(textHit[1]);
  }
}
|
|
1527
|
+
/**
 * 1-based rank of `goldRef` in `results`, or `null` if absent within the
 * top 10. Matching is prefix-tolerant via `matchesGold`, so
 * `team//skill:foo` counts as `skill:foo` (mirrors trajectory parser).
 */
function computeRank(results, goldRef) {
  const limit = Math.min(results.length, TOP_K);
  for (let rank = 1; rank <= limit; rank += 1) {
    if (matchesGold(results[rank - 1], goldRef)) {
      return rank;
    }
  }
  return null;
}
|
|
1540
|
+
/** Prefix/suffix-tolerant ref equality: exact, `…//<gold>`, or `<gold>/…`. */
function matchesGold(candidate, gold) {
  if (candidate === gold) {
    return true;
  }
  return candidate.endsWith(`//${gold}`) || candidate.startsWith(`${gold}/`);
}
|
|
1549
|
+
/**
 * Aggregate gold-rank records across all akm-arm runs in the corpus.
 *
 * The function operates on `report.goldRankRecords`, which the runner
 * populates per (task, arm, seed). When the corpus has no gold-ref tasks
 * at all (every record list is empty), every metric collapses to a zero
 * envelope and the `passRateByRank` table is empty — the renderer turns
 * that into a single "(N/A)" sentence.
 *
 * @param {object} report - parsed report; only `goldRankRecords` is read.
 * @returns {object} distribution, p50/p90, rank-1 and missing shares, the
 *   `passRateByRank` table, and observation counts.
 */
export function computeSearchBridge(report) {
  const records = report.goldRankRecords ?? [];
  // Histogram + percentile inputs across every search.
  const histogram = emptyHistogram();
  const allRanks = [];
  let totalSearches = 0;
  for (const rec of records) {
    for (const ev of rec.searches) {
      totalSearches += 1;
      allRanks.push(ev.rankOfGold);
      // A null rank means the gold ref never appeared — bucketed as "missing".
      const bucket = ev.rankOfGold === null ? "missing" : String(ev.rankOfGold);
      histogram[bucket] = (histogram[bucket] ?? 0) + 1;
    }
  }
  // With zero searches, shares default to 0 and percentiles to null.
  const goldAtRank1 = totalSearches === 0 ? 0 : (histogram["1"] ?? 0) / totalSearches;
  const goldMissing = totalSearches === 0 ? 0 : (histogram.missing ?? 0) / totalSearches;
  const goldRankP50 = totalSearches === 0 ? null : percentile(allRanks, 50);
  const goldRankP90 = totalSearches === 0 ? null : percentile(allRanks, 90);
  // pass_rate_by_rank — split runs by the rank in *the search the agent
  // actually ran*. We use the last `akm search` of the run (or "missing"
  // when no search at all happened, or "missing" when the agent searched
  // but gold wasn't in the top 10 in that final search). Runs without any
  // `akm search` invocation are dropped from this slice — `pass_rate_by_rank`
  // only describes what happened given a search.
  const passRateBuckets = new Map();
  for (const rec of records) {
    if (rec.searches.length === 0)
      continue;
    const chosen = rec.searches[rec.searches.length - 1];
    const bucket = chosen.rankOfGold === null ? "missing" : String(chosen.rankOfGold);
    const slot = passRateBuckets.get(bucket) ?? { passes: 0, total: 0 };
    slot.total += 1;
    if (rec.outcome === "pass")
      slot.passes += 1;
    passRateBuckets.set(bucket, slot);
  }
  // Emit rows in the canonical histogram-key order, skipping empty buckets.
  const passRateByRank = [];
  for (const rank of histogramKeys()) {
    const slot = passRateBuckets.get(rank);
    if (!slot)
      continue;
    passRateByRank.push({
      rank,
      passRate: slot.total === 0 ? 0 : slot.passes / slot.total,
      runCount: slot.total,
    });
  }
  return {
    goldRankDistribution: histogram,
    goldRankP50,
    goldRankP90,
    goldAtRank1,
    goldMissing,
    passRateByRank,
    runsObserved: records.length,
    searchesObserved: totalSearches,
  };
}
|
|
1616
|
+
/** Ordered keys used for both the histogram and the pass_rate_by_rank table. */
export function histogramKeys() {
    // Ranks 1..10 as strings, then the sentinel bucket for "gold not in
    // the top 10".
    const keys = [];
    for (let rank = 1; rank <= 10; rank += 1) {
        keys.push(String(rank));
    }
    keys.push("missing");
    return keys;
}
|
|
1620
|
+
// Build a zero-initialised histogram so renderers never see absent keys.
function emptyHistogram() {
    return Object.fromEntries(histogramKeys().map((key) => [key, 0]));
}
|
|
1626
|
+
/**
 * Nearest-rank percentile over a list of ranks. `null` ranks are treated
 * as `Infinity` so the missing bucket pushes percentiles up correctly.
 * Returns `Infinity` when the percentile lands in the missing region; the
 * renderer surfaces that as the literal `"missing"` token so downstream
 * JSON consumers don't choke on `Infinity`. Returns `NaN` for empty input.
 */
function percentile(ranks, p) {
    if (ranks.length === 0) {
        return Number.NaN;
    }
    // Map null → Infinity, then sort numerically ascending.
    const sorted = ranks
        .map((r) => (r === null ? Number.POSITIVE_INFINITY : r))
        .sort((a, b) => a - b);
    const n = sorted.length;
    // Nearest-rank index: ceil(p/100 * N) - 1, clamped into [0, N-1].
    // (Avoids interpolating between Infinity and a finite value.)
    let idx = Math.ceil((p / 100) * n) - 1;
    if (idx < 0) {
        idx = 0;
    }
    else if (idx > n - 1) {
        idx = n - 1;
    }
    return sorted[idx];
}
|
|
1642
|
+
/**
 * Aggregate proposal-quality metrics from the evolve runner's proposal log.
 * Pure function — does not touch disk and does not invoke any subprocess.
 *
 * Returns per-asset rows (sorted by asset ref for stable output) plus
 * corpus-wide totals and rates; rates are 0 for an empty log.
 */
export function computeProposalQualityMetrics(proposalLog) {
    const perRef = new Map();
    let acceptedTotal = 0;
    let lintPassTotal = 0;
    for (const { assetRef, lintPass, decision } of proposalLog) {
        const row = perRef.get(assetRef)
            ?? { assetRef, proposalCount: 0, lintPassCount: 0, acceptedCount: 0 };
        perRef.set(assetRef, row);
        row.proposalCount += 1;
        if (lintPass) {
            row.lintPassCount += 1;
            lintPassTotal += 1;
        }
        if (decision === "accept") {
            row.acceptedCount += 1;
            acceptedTotal += 1;
        }
    }
    const rows = [...perRef.values()].sort((a, b) => a.assetRef.localeCompare(b.assetRef));
    const totalProposals = proposalLog.length;
    return {
        rows,
        totalProposals,
        totalAccepted: acceptedTotal,
        acceptanceRate: totalProposals === 0 ? 0 : acceptedTotal / totalProposals,
        lintPassRate: totalProposals === 0 ? 0 : lintPassTotal / totalProposals,
    };
}
|
|
1676
|
+
/**
 * Compute longitudinal metrics from three §13.3 utility-shaped reports. Each
 * input report is expected to share the same eval-slice corpus, with one arm
 * driving the akm side: `pre` = pre-evolve stash, `post` = evolved stash,
 * `synthetic` = no-stash scratchpad arm.
 *
 * The "arm" we read off each report is `aggregateAkm.passRate` — the runners
 * produce the akm arm for all three (synthetic is just the akm arm with a
 * stripped stashDir; pre/post differ by stash content). `seedsPerArm` for
 * the degradation threshold is taken from the post report's corpus envelope.
 */
export function computeLongitudinalMetrics(preReport, postReport, syntheticReport) {
    const prePassRate = preReport.aggregateAkm.passRate;
    const postPassRate = postReport.aggregateAkm.passRate;
    const syntheticPassRate = syntheticReport.aggregateAkm.passRate;
    // One seed flipping pass→fail moves a task's rate by 1/seedsPerArm;
    // only drops strictly larger than that count as degradations.
    const seedsPerArm = Math.max(1, postReport.corpus.seedsPerArm);
    const oneSeedFraction = 1 / seedsPerArm;
    // Per-task degradation: join pre and post on task id.
    const preTasks = new Map(preReport.tasks.map((t) => [t.id, t]));
    const postTasks = new Map(postReport.tasks.map((t) => [t.id, t]));
    // Index post failure-mode labels by task id (one mode per task — first
    // failed run wins; matches the §6.6 by-task aggregate's natural ordering).
    const postFailureByTask = {};
    for (const [taskId, byMode] of Object.entries(postReport.failureModes?.byTask ?? {})) {
        const labels = Object.keys(byMode);
        if (labels.length > 0) {
            postFailureByTask[taskId] = labels[0];
        }
    }
    const degradations = [];
    // Union of task ids, sorted lexicographically for deterministic output;
    // only ids present in BOTH reports can be compared.
    const allIds = [...new Set([...preTasks.keys(), ...postTasks.keys()])].sort();
    for (const id of allIds) {
        const pre = preTasks.get(id);
        const post = postTasks.get(id);
        if (!pre || !post) {
            continue;
        }
        const preRate = pre.akm.passRate;
        const postRate = post.akm.passRate;
        if (preRate - postRate > oneSeedFraction) {
            degradations.push({
                taskId: id,
                prePassRate: preRate,
                postPassRate: postRate,
                delta: postRate - preRate,
                failureMode: postFailureByTask[id] ?? null,
            });
        }
    }
    return {
        improvementSlope: postPassRate - prePassRate,
        overSyntheticLift: postPassRate - syntheticPassRate,
        degradationCount: degradations.length,
        degradations,
        prePassRate,
        postPassRate,
        syntheticPassRate,
    };
}
|
|
1743
|
+
/** Threshold above `pass_rate[0]` that defines "improvement" for §6.4. */
export const LEARNING_IMPROVEMENT_THRESHOLD = 0.05;
/**
 * Compute the §6.4 learning-curve block from a list of episode records.
 *
 * Episodes are re-sorted by `episode_index` and their
 * `delta_from_previous_episode` fields are recomputed, so the output
 * contract holds regardless of what the caller stamped on the input.
 */
export function computeLearningCurve(episodes) {
    // Defensive ordering: sort a copy, never the caller's array.
    const ordered = [...episodes].sort((a, b) => a.episode_index - b.episode_index);
    const normalised = ordered.map((ep, i) => ({
        ...ep,
        delta_from_previous_episode: i === 0 ? 0 : ep.pass_rate - ordered[i - 1].pass_rate,
    }));
    const passRateByEpisode = normalised.map((ep) => ep.pass_rate);
    const n = normalised.length;
    // Least-squares slope of pass_rate over episode_index:
    // sum((xi - x̄)(yi - ȳ)) / sum((xi - x̄)²). Fewer than two episodes (or a
    // zero denominator) yields 0 — "no observable trend" — rather than NaN.
    let learningSlope = 0;
    if (n >= 2) {
        const xs = normalised.map((ep) => ep.episode_index);
        const xMean = xs.reduce((acc, v) => acc + v, 0) / n;
        const yMean = passRateByEpisode.reduce((acc, v) => acc + v, 0) / n;
        let numerator = 0;
        let denominator = 0;
        xs.forEach((x, i) => {
            const dx = x - xMean;
            numerator += dx * (passRateByEpisode[i] - yMean);
            denominator += dx * dx;
        });
        learningSlope = denominator === 0 ? 0 : numerator / denominator;
    }
    // time_to_improvement: smallest episode_index strictly greater than
    // `pass_rate[0] + threshold`. Episode 0 itself is excluded — improvement
    // is only meaningful relative to baseline.
    let timeToImprovement = null;
    if (n >= 2) {
        const baseline = passRateByEpisode[0];
        const hit = normalised
            .slice(1)
            .find((ep) => ep.pass_rate > baseline + LEARNING_IMPROVEMENT_THRESHOLD);
        if (hit !== undefined) {
            timeToImprovement = hit.episode_index;
        }
    }
    return {
        episodes: normalised,
        pass_rate_by_episode: passRateByEpisode,
        learning_slope: learningSlope,
        time_to_improvement: timeToImprovement,
    };
}
|
|
1795
|
+
/**
 * Compute the §6.8 feedback-signal integrity confusion matrix.
 *
 * Pure function — does not touch disk and does not invoke any subprocess.
 * The join is by `(taskId, seed)` so that a feedback event is attributed
 * to the run that produced it, NOT to a later run that happens to touch
 * the same gold ref. This matters when the same gold ref appears across
 * multiple Phase 1 runs (e.g. multiple seeds, or two tasks sharing a
 * skill); the per-asset row aggregates across all runs that referenced it
 * in feedback, but each individual feedback event's matrix cell is
 * decided by its own run's outcome.
 *
 * NaN-safety: a per-asset row with zero feedback events (cannot happen via
 * this function — every row is derived from at least one feedback entry —
 * but defensive against future callers passing curated subsets) emits all
 * three rates as `null`. `false_positive_rate` is `null` when `FP+TN===0`
 * even if the row has `FN+TP>0`, and vice versa.
 */
export function computeFeedbackIntegrity(input) {
    const akmRuns = input.phase1.akmRuns ?? [];
    // Build a (taskId, seed) → outcome lookup so every feedback event
    // resolves in O(1). When two runs share the same key (shouldn't happen
    // — runner emits unique seeds per task — but defensive) the first
    // wins.
    const runOutcomeByKey = new Map();
    for (const r of akmRuns) {
        const key = `${r.taskId}::${r.seed}`;
        if (!runOutcomeByKey.has(key))
            runOutcomeByKey.set(key, r.outcome);
    }
    // Per-gold-ref confusion-matrix cells, plus corpus-wide totals.
    const perRef = new Map();
    let aggTP = 0;
    let aggFP = 0;
    let aggTN = 0;
    let aggFN = 0;
    // Track which (taskId, seed) keys had any feedback dispatched (ok or
    // not), for the coverage denominator. We count an attempted dispatch as
    // covered — if `ok===false`, the operator wanted feedback but the CLI
    // failed; that's still a covered run for the purpose of §6.8 (and is
    // surfaced in the warnings list elsewhere).
    const coveredKeys = new Set();
    for (const fb of input.feedbackLog) {
        const key = `${fb.taskId}::${fb.seed}`;
        coveredKeys.add(key);
        if (!fb.ok)
            continue; // failed dispatches don't label a matrix cell.
        const outcome = runOutcomeByKey.get(key);
        if (outcome === undefined)
            continue; // run not found — defensive, drop.
        // harness_error runs are not labelled (the bench skips dispatching
        // feedback for them; if a fake test injects one, we drop it from the
        // matrix to avoid mislabelling).
        if (outcome === "harness_error")
            continue;
        const passed = outcome === "pass";
        // Lazily create the per-ref row on first labelled event for this ref.
        let row = perRef.get(fb.goldRef);
        if (!row) {
            row = { truePositive: 0, falsePositive: 0, trueNegative: 0, falseNegative: 0 };
            perRef.set(fb.goldRef, row);
        }
        // Matrix cell = (feedback polarity) × (run outcome). Signals other
        // than "positive"/"negative" fall through without labelling a cell.
        if (fb.signal === "positive" && passed) {
            row.truePositive += 1;
            aggTP += 1;
        }
        else if (fb.signal === "positive" && !passed) {
            row.falsePositive += 1;
            aggFP += 1;
        }
        else if (fb.signal === "negative" && !passed) {
            row.trueNegative += 1;
            aggTN += 1;
        }
        else if (fb.signal === "negative" && passed) {
            row.falseNegative += 1;
            aggFN += 1;
        }
    }
    const aggTotal = aggTP + aggFP + aggTN + aggFN;
    const totalPhase1Runs = akmRuns.length;
    // Aggregate rates use 0 (not null) on empty denominators — the
    // corpus-wide block always renders as numbers.
    const aggregate = {
        truePositive: aggTP,
        falsePositive: aggFP,
        trueNegative: aggTN,
        falseNegative: aggFN,
        feedback_agreement: aggTotal === 0 ? 0 : (aggTP + aggTN) / aggTotal,
        false_positive_rate: aggFP + aggTN === 0 ? 0 : aggFP / (aggFP + aggTN),
        false_negative_rate: aggFN + aggTP === 0 ? 0 : aggFN / (aggFN + aggTP),
        feedback_coverage: totalPhase1Runs === 0 ? 0 : coveredKeys.size / totalPhase1Runs,
    };
    // Per-asset rows, sorted by ref for byte-stable JSON; rates here are
    // null on empty denominators (see the NaN-safety note above).
    const perAsset = [];
    for (const [ref, row] of [...perRef.entries()].sort((a, b) => a[0].localeCompare(b[0]))) {
        const total = row.truePositive + row.falsePositive + row.trueNegative + row.falseNegative;
        const fpDenom = row.falsePositive + row.trueNegative;
        const fnDenom = row.falseNegative + row.truePositive;
        perAsset.push({
            ref,
            truePositive: row.truePositive,
            falsePositive: row.falsePositive,
            trueNegative: row.trueNegative,
            falseNegative: row.falseNegative,
            feedback_agreement: total === 0 ? null : (row.truePositive + row.trueNegative) / total,
            false_positive_rate: fpDenom === 0 ? null : row.falsePositive / fpDenom,
            false_negative_rate: fnDenom === 0 ? null : row.falseNegative / fnDenom,
        });
    }
    return { aggregate, perAsset };
}
|
|
1902
|
+
// Group per-task entries by pickKey(entry) and emit one aggregate row per
// group (pass rates, negative-transfer count, mean workflow compliance).
// Entries whose key is falsy are skipped; rows come back sorted by category.
function aggregateByKey(entries, pickKey) {
    const groups = new Map();
    for (const entry of entries) {
        const key = pickKey(entry);
        if (!key) {
            continue;
        }
        const group = groups.get(key);
        if (group) {
            group.push(entry);
        }
        else {
            groups.set(key, [entry]);
        }
    }
    const rows = [];
    for (const [category, group] of groups) {
        const n = group.length;
        let noakmSum = 0;
        let akmSum = 0;
        let regressions = 0;
        let complianceSum = 0;
        let complianceCount = 0;
        for (const task of group) {
            noakmSum += task.noakm.passRate;
            akmSum += task.akm.passRate;
            // Negative transfer = akm arm strictly worse than the baseline arm.
            if (task.akm.passRate < task.noakm.passRate) {
                regressions += 1;
            }
            // Only finite numeric compliance scores enter the mean.
            if (typeof task.workflowCompliance === "number" && Number.isFinite(task.workflowCompliance)) {
                complianceSum += task.workflowCompliance;
                complianceCount += 1;
            }
        }
        rows.push({
            category,
            taskCount: n,
            passRateNoakm: noakmSum / n,
            passRateAkm: akmSum / n,
            passRateDelta: akmSum / n - noakmSum / n,
            negativeTransferCount: regressions,
            workflowCompliance: complianceCount === 0 ? null : complianceSum / complianceCount,
        });
    }
    rows.sort((a, b) => a.category.localeCompare(b.category));
    return rows;
}
|
|
1946
|
+
/**
 * Aggregate per-task entries by `memoryAbility` (#262). Tasks without the
 * tag are skipped, so only explicitly covered categories appear in the
 * report. Rows come back sorted by category for byte-stable JSON.
 *
 * The closed set of memory-ability values is exported as
 * {@link MEMORY_ABILITY_VALUES} from `corpus.ts`.
 */
export function aggregateByMemoryAbility(entries) {
    return aggregateByKey(entries, (task) => task.memoryAbility);
}
|
|
1957
|
+
/**
 * Aggregate per-task entries by `taskFamily` (#262). Untagged tasks are
 * skipped. `taskFamily` follows the `<domain>/<short-name>` grammar —
 * tasks sharing a family are expected to transfer knowledge between each
 * other. Rows come back sorted by category for byte-stable JSON.
 */
export function aggregateByTaskFamily(entries) {
    return aggregateByKey(entries, (task) => task.taskFamily);
}
|
|
1966
|
+
/**
 * Count corpus tasks per memory-ability tag and per task family.
 *
 * Every known ability is pre-seeded with 0 so the JSON shape is stable even
 * when a category has no tasks; family counts only gain an "untagged" key
 * when at least one task actually lacks a family tag.
 */
export function computeCorpusCoverage(tasks) {
    const memoryAbilityCounts = { untagged: 0 };
    for (const ability of MEMORY_ABILITY_VALUES) {
        memoryAbilityCounts[ability] = 0;
    }
    const taskFamilyCounts = {};
    let familyUntagged = 0;
    for (const task of tasks) {
        if (task.memoryAbility) {
            memoryAbilityCounts[task.memoryAbility] = (memoryAbilityCounts[task.memoryAbility] ?? 0) + 1;
        }
        else {
            memoryAbilityCounts.untagged += 1;
        }
        if (task.taskFamily) {
            taskFamilyCounts[task.taskFamily] = (taskFamilyCounts[task.taskFamily] ?? 0) + 1;
        }
        else {
            familyUntagged += 1;
        }
    }
    if (familyUntagged > 0) {
        taskFamilyCounts.untagged = familyUntagged;
    }
    return {
        totalTasks: tasks.length,
        memoryAbilityCounts,
        taskFamilyCounts,
    };
}
|
|
1997
|
+
/**
 * Verb counts considered "AKM tool calls" for `totalToolCalls`. We
 * deliberately keep this list small — each verb folded in MUST be a
 * user-initiated CLI invocation, not a background bookkeeping event.
 * Adding new verbs here is additive and changes only `totalToolCalls`.
 */
export const AKM_TOOL_CALL_TYPES = new Set(["akm_search", "akm_show", "akm_feedback"]);
|
|
2008
|
+
/**
 * Compute per-run AKM overhead records by replaying #254's normalised trace.
 *
 * Pure function: never mutates `runs` and never reads disk. The optional
 * `taskMetadata` lookup is used only to label loads as relevant / irrelevant
 * and to compute `timeToFirstCorrectAssetMs`.
 *
 * Returned array length matches `runs.length`; element order matches input
 * order. Runs whose trace contains no AKM events still produce a record
 * with all counts at zero and timings at `null`.
 */
export function computeAkmOverhead(runs, options = {}) {
    // One record per run, in input order.
    return runs.map((run) => perRun(run, options.taskMetadata));
}
|
|
2026
|
+
/**
 * Build the AKM-overhead record for one run by replaying its normalised
 * trace: counts akm_search / akm_show / akm_feedback events, derives timing
 * offsets from the earliest trace timestamp, and — when `taskMetadata` is
 * supplied — labels loaded assets as relevant or irrelevant against the
 * task's gold ref and expected-transfer refs.
 */
function perRun(run, taskMetadata) {
    const trace = normalizeRunToTrace(run);
    const events = trace.events;
    let searchCount = 0;
    let showCount = 0;
    let feedbackCount = 0;
    let positiveFeedbackCount = 0;
    let negativeFeedbackCount = 0;
    // De-duplicated refs passed to `akm show` — drives assetsLoadedCount
    // and the relevance labelling below.
    const uniqueShowRefs = new Set();
    for (const ev of events) {
        if (ev.type === "akm_search")
            searchCount += 1;
        else if (ev.type === "akm_show") {
            showCount += 1;
            // Only non-empty string refs are countable as loaded assets.
            if (typeof ev.assetRef === "string" && ev.assetRef.length > 0) {
                uniqueShowRefs.add(ev.assetRef);
            }
        }
        else if (ev.type === "akm_feedback") {
            feedbackCount += 1;
            // Polarity is carried in args as "--positive" or "--negative".
            // Events sourced from events.jsonl also have args populated by
            // normalizeRunToTrace. Absence of both flags is treated as unknown
            // (contributes to feedbackCount but not to either polarity counter).
            if (ev.args?.includes("--positive"))
                positiveFeedbackCount += 1;
            else if (ev.args?.includes("--negative"))
                negativeFeedbackCount += 1;
        }
    }
    const totalToolCalls = searchCount + showCount + feedbackCount;
    // Run-start anchor: earliest parseable ts in the trace. We use the trace
    // (not RunResult.events directly) so harness lifecycle markers, when
    // supplied, can serve as the anchor for stdout-derived events that lack a
    // native ts.
    const runStartMs = earliestEventMs(events);
    const timeToFirstSearchMs = computeFirstEventOffsetMs(events, runStartMs, (ev) => ev.type === "akm_search");
    // Resolve task metadata once. Missing metadata means we can't judge
    // relevance — emit null counts rather than zero.
    const meta = taskMetadata?.get(run.taskId);
    const goldRef = meta?.goldRef;
    const transferFrom = meta?.expectedTransferFrom ?? [];
    // "Relevant" = the gold ref plus every expected-transfer ref.
    const knownRelevant = new Set();
    if (typeof goldRef === "string" && goldRef.length > 0)
        knownRelevant.add(goldRef);
    for (const r of transferFrom) {
        if (typeof r === "string" && r.length > 0)
            knownRelevant.add(r);
    }
    let irrelevantAssetsLoadedCount;
    if (!meta) {
        // No metadata: cannot tell relevant from irrelevant. Surface null.
        irrelevantAssetsLoadedCount = null;
    }
    else {
        // Count unique loaded refs not in the relevant set.
        let count = 0;
        for (const ref of uniqueShowRefs) {
            if (!knownRelevant.has(ref))
                count += 1;
        }
        irrelevantAssetsLoadedCount = count;
    }
    // Offset of the first `akm show` on the gold ref; null when the task has
    // no gold ref or it was never loaded.
    let timeToFirstCorrectAssetMs = null;
    if (typeof goldRef === "string" && goldRef.length > 0) {
        timeToFirstCorrectAssetMs = computeFirstEventOffsetMs(events, runStartMs, (ev) => ev.type === "akm_show" && ev.assetRef === goldRef);
    }
    return {
        taskId: run.taskId,
        arm: run.arm,
        seed: run.seed,
        outcome: run.outcome,
        searchCount,
        showCount,
        feedbackCount,
        positiveFeedbackCount,
        negativeFeedbackCount,
        totalToolCalls,
        assetsLoadedCount: uniqueShowRefs.size,
        irrelevantAssetsLoadedCount,
        timeToFirstSearchMs,
        timeToFirstCorrectAssetMs,
        // Byte sizes are not yet wired through the trace (#254 does not capture
        // payload sizes). Callers MUST treat null as "unavailable", not zero.
        contextBytesLoaded: null,
        assetBytesLoaded: null,
    };
}
|
|
2113
|
+
/**
 * Aggregate per-run AKM overhead records into the corpus-wide block (#263).
 *
 * Pure: never mutates `perRunRows`. When `perRunRows` is empty, returns a
 * zero/null envelope so callers can render a "no AKM activity" section
 * without branching. `passingRuns === 0` always implies
 * `toolCallsPerSuccess === null` and `costPerSuccess === null`.
 *
 * Fix: the (taskId, arm, seed) join key against `rawRuns` is now built with
 * "::" separators. Bare concatenation allowed collisions — e.g.
 * (taskId "a", arm "b1", seed 2) and (taskId "ab", arm "1", seed 2) both
 * produced "ab12" — which could pick up the wrong run's token measurement
 * and silently corrupt `costPerSuccess`. This also matches the delimited
 * key style used by computeFeedbackIntegrity. (The first parameter is
 * renamed from `perRun`, which shadowed the module-level `perRun` helper;
 * JS parameter names are not part of the call interface.)
 */
export function aggregateAkmOverhead(perRunRows, rawRuns = []) {
    const n = perRunRows.length;
    if (n === 0) {
        // Stable empty envelope — every field present, rates zeroed,
        // unavailable means null.
        return {
            totalRuns: 0,
            passingRuns: 0,
            meanSearchCount: 0,
            meanShowCount: 0,
            meanFeedbackCount: 0,
            meanToolCalls: 0,
            meanAssetsLoaded: 0,
            meanIrrelevantAssetsLoaded: null,
            meanTimeToFirstSearchMs: null,
            meanTimeToFirstCorrectAssetMs: null,
            meanContextBytesLoaded: null,
            meanAssetBytesLoaded: null,
            totalToolCalls: 0,
            toolCallsPerSuccess: null,
            costPerSuccess: null,
            searchEngagementRate: 0,
            showEngagementRate: 0,
            feedbackEngagementRate: 0,
            searchToShowRatio: null,
            meanPositiveFeedbackCount: 0,
            meanNegativeFeedbackCount: 0,
        };
    }
    // Collision-free (taskId, arm, seed) join key — see the doc comment.
    const runKey = (r) => `${r.taskId}::${r.arm}::${r.seed}`;
    let searchSum = 0;
    let showSum = 0;
    let feedbackSum = 0;
    let toolCallsSum = 0;
    let assetsSum = 0;
    let irrelevantSum = 0;
    let irrelevantCount = 0;
    let firstSearchSum = 0;
    let firstSearchCount = 0;
    let firstCorrectSum = 0;
    let firstCorrectCount = 0;
    let contextBytesSum = 0;
    let contextBytesCount = 0;
    let assetBytesSum = 0;
    let assetBytesCount = 0;
    // Build a quick lookup for token measurement off `rawRuns` so the cost-
    // per-success calc can honour the parsed/missing/unsupported distinction
    // without forcing the caller to project tokens onto AkmOverheadPerRun.
    const rawByKey = new Map();
    for (const r of rawRuns) {
        rawByKey.set(runKey(r), r);
    }
    let passingRuns = 0;
    let parsedPassTokenSum = 0;
    let parsedPassCount = 0;
    let anyPassMissingMeasurement = false;
    let searchEngagedRuns = 0;
    let showEngagedRuns = 0;
    let feedbackEngagedRuns = 0;
    let positiveFeedbackSum = 0;
    let negativeFeedbackSum = 0;
    for (const row of perRunRows) {
        searchSum += row.searchCount;
        showSum += row.showCount;
        feedbackSum += row.feedbackCount;
        toolCallsSum += row.totalToolCalls;
        assetsSum += row.assetsLoadedCount;
        // Engagement = at least one invocation of the verb in the run.
        if (row.searchCount > 0)
            searchEngagedRuns += 1;
        if (row.showCount > 0)
            showEngagedRuns += 1;
        if (row.feedbackCount > 0)
            feedbackEngagedRuns += 1;
        positiveFeedbackSum += row.positiveFeedbackCount;
        negativeFeedbackSum += row.negativeFeedbackCount;
        // null-valued per-run fields mean "unavailable" and are excluded
        // from both numerator and denominator of their mean.
        if (row.irrelevantAssetsLoadedCount !== null) {
            irrelevantSum += row.irrelevantAssetsLoadedCount;
            irrelevantCount += 1;
        }
        if (row.timeToFirstSearchMs !== null) {
            firstSearchSum += row.timeToFirstSearchMs;
            firstSearchCount += 1;
        }
        if (row.timeToFirstCorrectAssetMs !== null) {
            firstCorrectSum += row.timeToFirstCorrectAssetMs;
            firstCorrectCount += 1;
        }
        if (row.contextBytesLoaded !== null) {
            contextBytesSum += row.contextBytesLoaded;
            contextBytesCount += 1;
        }
        if (row.assetBytesLoaded !== null) {
            assetBytesSum += row.assetBytesLoaded;
            assetBytesCount += 1;
        }
        if (row.outcome === "pass") {
            passingRuns += 1;
            const raw = rawByKey.get(runKey(row));
            // Treat absent tokenMeasurement as `parsed` for backward compat with
            // older artefacts (mirrors `isMeasured` behaviour above).
            const measurement = raw?.tokenMeasurement ?? "parsed";
            if (raw && measurement === "parsed") {
                parsedPassTokenSum += raw.tokens.input + raw.tokens.output;
                parsedPassCount += 1;
            }
            else {
                // Either a non-parsed measurement or no matching raw run was
                // supplied — cannot honour cost-per-success.
                anyPassMissingMeasurement = true;
            }
        }
    }
    const toolCallsPerSuccess = passingRuns === 0 ? null : toolCallsSum / passingRuns;
    // Cost-per-success: null unless EVERY passing run has parsed measurement.
    // Mixed measurement statuses cannot be averaged honestly (issue #252).
    const costPerSuccess = passingRuns === 0 || anyPassMissingMeasurement || parsedPassCount === 0
        ? null
        : parsedPassTokenSum / parsedPassCount;
    const searchToShowRatio = searchSum === 0 ? null : showSum / searchSum;
    return {
        totalRuns: n,
        passingRuns,
        meanSearchCount: searchSum / n,
        meanShowCount: showSum / n,
        meanFeedbackCount: feedbackSum / n,
        meanToolCalls: toolCallsSum / n,
        meanAssetsLoaded: assetsSum / n,
        meanIrrelevantAssetsLoaded: irrelevantCount === 0 ? null : irrelevantSum / irrelevantCount,
        meanTimeToFirstSearchMs: firstSearchCount === 0 ? null : firstSearchSum / firstSearchCount,
        meanTimeToFirstCorrectAssetMs: firstCorrectCount === 0 ? null : firstCorrectSum / firstCorrectCount,
        meanContextBytesLoaded: contextBytesCount === 0 ? null : contextBytesSum / contextBytesCount,
        meanAssetBytesLoaded: assetBytesCount === 0 ? null : assetBytesSum / assetBytesCount,
        totalToolCalls: toolCallsSum,
        toolCallsPerSuccess,
        costPerSuccess,
        searchEngagementRate: searchEngagedRuns / n,
        showEngagementRate: showEngagedRuns / n,
        feedbackEngagementRate: feedbackEngagedRuns / n,
        searchToShowRatio,
        meanPositiveFeedbackCount: positiveFeedbackSum / n,
        meanNegativeFeedbackCount: negativeFeedbackSum / n,
    };
}
|
|
2263
|
+
/**
 * Bucket a workflow check status onto pass / non-pass for reliability.
 *
 * Reliability is a strict pass-or-not metric (issue #258). Anything other
 * than `pass` (including `partial`, `fail`, `harness_error`) counts as a
 * non-pass. `not_applicable` returns `null` so the caller can skip the
 * entire (task, seed) pair — it never contributes to either numerator or
 * denominator.
 */
function bucketReliabilityStatus(status) {
    switch (status) {
        case "not_applicable":
            return null;
        case "pass":
            return "pass";
        default:
            return "non_pass";
    }
}
|
|
2279
|
+
/**
 * Compute workflow reliability metrics (`pass@k` and `pass^k`) per workflow
 * and corpus-wide from a flat list of `WorkflowCheckResult`.
 *
 * Methodology (per #258 review addendum):
 *   1. Drop `not_applicable` checks entirely.
 *   2. Collapse seeds for each `(workflow_id, task_id)` group into the set
 *      of observed statuses.
 *   3. `pass_at_k` per task = 1 when at least one seed is `pass`, else 0.
 *   4. `pass_all_k` per task = 1 when every seed is `pass`, else 0.
 *   5. Each per-workflow row averages over that workflow's task set.
 *   6. The corpus rollup weighs every (workflow, task) group equally.
 *
 * Pure: `checks` is never mutated. Empty input yields a stable zero shape.
 */
export function computeWorkflowReliability(checks) {
  // Map<workflowId, Map<taskId, status[]>>; Map preserves insertion order,
  // so deterministic input produces deterministic output.
  const grouped = new Map();
  for (const check of checks) {
    if (bucketReliabilityStatus(check.status) === null) {
      continue;
    }
    if (!grouped.has(check.workflowId)) {
      grouped.set(check.workflowId, new Map());
    }
    const perTask = grouped.get(check.workflowId);
    const statuses = perTask.get(check.taskId);
    if (statuses === undefined) {
      perTask.set(check.taskId, [check.status]);
    } else {
      statuses.push(check.status);
    }
  }
  const byWorkflow = {};
  let corpusAnyPass = 0;
  let corpusAllPass = 0;
  let corpusGroups = 0;
  const corpusTasks = new Set();
  for (const [workflowId, perTask] of grouped) {
    let anyPassCount = 0;
    let allPassCount = 0;
    let seedsMax = 0;
    for (const [taskId, statuses] of perTask) {
      seedsMax = Math.max(seedsMax, statuses.length);
      const anyPass = statuses.some((s) => s === "pass");
      const allPass = statuses.every((s) => s === "pass");
      if (anyPass) {
        anyPassCount += 1;
        corpusAnyPass += 1;
      }
      if (allPass) {
        allPassCount += 1;
        corpusAllPass += 1;
      }
      corpusGroups += 1;
      corpusTasks.add(taskId);
    }
    const tasks = perTask.size;
    byWorkflow[workflowId] = {
      workflow_id: workflowId,
      pass_at_k: tasks === 0 ? 0 : anyPassCount / tasks,
      pass_all_k: tasks === 0 ? 0 : allPassCount / tasks,
      tasks,
      k: seedsMax,
    };
  }
  return {
    byWorkflow,
    corpus: {
      pass_at_k: corpusGroups === 0 ? 0 : corpusAnyPass / corpusGroups,
      pass_all_k: corpusGroups === 0 ? 0 : corpusAllPass / corpusGroups,
      groups: corpusGroups,
      tasks: corpusTasks.size,
    },
  };
}
|
|
2353
|
+
/** Earliest parseable ts (ms epoch) among events; null when none. */
function earliestEventMs(events) {
  let best = null;
  for (const event of events) {
    const ms = parseTsToMs(event.ts);
    if (ms !== null && (best === null || ms < best)) {
      best = ms;
    }
  }
  return best;
}
|
|
2365
|
+
/**
 * Offset (ms) of the first event matching `predicate` relative to
 * `runStartMs`. Returns `null` when `runStartMs` is null, when no matching
 * event has a parseable ts, or when the offset would be negative — a clock
 * inversion we refuse to silently coerce to zero.
 */
function computeFirstEventOffsetMs(events, runStartMs, predicate) {
  if (runStartMs === null) {
    return null;
  }
  // First event that both matches the predicate and carries a parseable ts;
  // matching events with unparseable timestamps are skipped, same as a loop
  // with `continue`.
  const match = events.find((ev) => predicate(ev) && parseTsToMs(ev.ts) !== null);
  if (match === undefined) {
    return null;
  }
  const offset = parseTsToMs(match.ts) - runStartMs;
  return offset < 0 ? null : offset;
}
|
|
2387
|
+
/** Parse an ISO ts to ms-epoch; null when missing or unparseable. */
function parseTsToMs(ts) {
  if (typeof ts !== "string" || ts === "") {
    return null;
  }
  const parsed = Date.parse(ts);
  return Number.isNaN(parsed) ? null : parsed;
}
|