@tangle-network/agent-eval 0.31.0 → 0.32.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +79 -0
- package/dist/benchmarks/index.d.ts +2 -2
- package/dist/{chunk-Y2CPBYKH.js → chunk-B73G44OH.js} +2 -2
- package/dist/{chunk-75ZREHD7.js → chunk-DTEJNZYK.js} +2 -1
- package/dist/chunk-DTEJNZYK.js.map +1 -0
- package/dist/{chunk-XEL6UP7C.js → chunk-S4Y5VXMS.js} +2 -2
- package/dist/{chunk-WSI4K3WB.js → chunk-ZN2CMQIW.js} +53 -1
- package/dist/chunk-ZN2CMQIW.js.map +1 -0
- package/dist/{control-BFpqHFV2.d.ts → control-p2ns7elI.d.ts} +1 -1
- package/dist/control.d.ts +2 -2
- package/dist/control.js +2 -2
- package/dist/{index-TVjRYWRm.d.ts → index-BTqhGHJT.d.ts} +1 -1
- package/dist/index.d.ts +235 -13
- package/dist/index.js +296 -47
- package/dist/index.js.map +1 -1
- package/dist/meta-eval/index.d.ts +2 -2
- package/dist/openapi.json +1 -1
- package/dist/optimization.d.ts +3 -3
- package/dist/optimization.js +3 -3
- package/dist/{release-report-C8r4Vben.d.ts → release-report-DLWbBPtH.d.ts} +2 -2
- package/dist/reporting.d.ts +4 -4
- package/dist/{researcher-BmgJ_901.d.ts → researcher-BRHa5Jxo.d.ts} +8 -2
- package/dist/rl.d.ts +5 -5
- package/dist/rl.js +1 -1
- package/dist/{rubric-predictive-validity-Bm-CbN46.d.ts → rubric-predictive-validity-CMHypZ_M.d.ts} +1 -1
- package/dist/{run-record-nYf9x2hU.d.ts → run-record-BfX5y68A.d.ts} +42 -1
- package/dist/{summary-report-dir7A-eQ.d.ts → summary-report-D7AQS7eB.d.ts} +1 -1
- package/package.json +1 -1
- package/dist/chunk-75ZREHD7.js.map +0 -1
- package/dist/chunk-WSI4K3WB.js.map +0 -1
- package/dist/{chunk-Y2CPBYKH.js.map → chunk-B73G44OH.js.map} +0 -0
- package/dist/{chunk-XEL6UP7C.js.map → chunk-S4Y5VXMS.js.map} +0 -0
package/CHANGELOG.md
CHANGED
|
@@ -1,5 +1,84 @@
|
|
|
1
1
|
# Changelog
|
|
2
2
|
|
|
3
|
+
## 0.31.1 — 2026-05-20
|
|
4
|
+
|
|
5
|
+
### Republish of 0.31.0 — dist drift fix
|
|
6
|
+
|
|
7
|
+
The `v0.31.0` tag's npm tarball shipped a stale `dist/` — `JudgeScoresRecord`
|
|
8
|
+
was missing from `dist/index.d.ts` and the `recordOutcome.judgeScores`
|
|
9
|
+
propagation never made it into `dist/index.js`, even though the source on
|
|
10
|
+
the tagged commit had both. Consumers that bumped to `^0.31.0` got a
|
|
11
|
+
typecheck failure on `RunOutcome.judgeScores` (since the type wasn't
|
|
12
|
+
re-exported) and a silent drop on the wire (since the campaign runner
|
|
13
|
+
didn't carry the field through).
|
|
14
|
+
|
|
15
|
+
Cause: a build artifact picked up by the publish workflow predated the
|
|
16
|
+
source merge. The retag forces a clean `pnpm build` and republish; this
|
|
17
|
+
patch carries no source change beyond the version bump.
|
|
18
|
+
|
|
19
|
+
Verified after this tag: `dist/index.d.ts` contains `JudgeScoresRecord`,
|
|
20
|
+
`dist/index.js` propagates `outcome.judgeScores` end-to-end via
|
|
21
|
+
`recordOutcome.judgeScores`, and a downstream `pnpm install
|
|
22
|
+
@tangle-network/agent-eval@0.31.1` type-checks cleanly against the shape
|
|
23
|
+
documented in 0.31.0.
|
|
24
|
+
|
|
25
|
+
## 0.31.0 — 2026-05-20
|
|
26
|
+
|
|
27
|
+
### `JudgeScoresRecord` on `RunRecord.outcome` — substrate-blessed ensemble shape
|
|
28
|
+
|
|
29
|
+
Multi-judge consumers (forge-chat in agent-builder, and four sibling
|
|
30
|
+
product agents on the same trajectory) compute per-judge per-dimension
|
|
31
|
+
scores per cell, then collapse to a single composite for the gate. The
|
|
32
|
+
substrate's `RunOutcome` only had a slot for the composite plus a free
|
|
33
|
+
`raw: Record<string, number>` bag. Consumers were either dropping the
|
|
34
|
+
breakdown on the floor or smuggling it through stringly-typed `raw`
|
|
35
|
+
keys like `judge_kimi_helpfulness` — neither survives a corpus-IRR run
|
|
36
|
+
(0.27.2's `corpusInterRaterAgreement` expects structured per-judge
|
|
37
|
+
per-dim records, not parsed strings).
|
|
38
|
+
|
|
39
|
+
This release ships the typed slot so every product agent speaks the
|
|
40
|
+
same shape, and the inter-rater primitives consume it without a
|
|
41
|
+
per-consumer adapter.
|
|
42
|
+
|
|
43
|
+
### Added
|
|
44
|
+
|
|
45
|
+
- **`JudgeScoresRecord`** (`src/run-record.ts`) — `perJudge[judgeId][dim]`
|
|
46
|
+
is the canonical store; `perDimMean` and `composite` are precomputed
|
|
47
|
+
projections so reporters and IRR primitives don't repeat the
|
|
48
|
+
aggregation; `failedJudges?: string[]` records dead-judge ids
|
|
49
|
+
explicitly (no inferring partial-failure from missing keys);
|
|
50
|
+
`notes?: string` carries panel prose.
|
|
51
|
+
- **`RunOutcome.judgeScores?: JudgeScoresRecord`** — optional. Single-
|
|
52
|
+
judge or scalar-only runs leave it unset; ensemble runs populate it.
|
|
53
|
+
- **`CampaignRunOutcome.judgeScores?: JudgeScoresRecord`** — runners
|
|
54
|
+
return it on the per-cell outcome; `runEvalCampaign` threads it onto
|
|
55
|
+
the resulting `RunRecord.outcome.judgeScores` without coercion.
|
|
56
|
+
|
|
57
|
+
### Validator extended
|
|
58
|
+
|
|
59
|
+
`validateRunRecord` validates `outcome.judgeScores` when present.
|
|
60
|
+
Every `perJudge[judge][dim]` and every `perDimMean[dim]` and the
|
|
61
|
+
`composite` must be finite numbers — the NaN-as-silent-zero bug class
|
|
62
|
+
banned by `CLAUDE.md` cannot pass the boundary. `failedJudges` must be
|
|
63
|
+
an array of non-empty strings; `notes` must be a string. Round-trip
|
|
64
|
+
tested in `tests/run-record.test.ts`.
|
|
65
|
+
|
|
66
|
+
### Fail-loud contract
|
|
67
|
+
|
|
68
|
+
A judge that throws lands in `failedJudges` by id, not a silent zero
|
|
69
|
+
in `perJudge`. The composite is computed over surviving judges only;
|
|
70
|
+
the partial-failure signal is preserved through to the gate.
|
|
71
|
+
`tests/eval-campaign.test.ts` covers the four shapes (full, partial,
|
|
72
|
+
missing, with notes) plus an explicit fail-loud case where one judge
|
|
73
|
+
throws and the run record carries `failedJudges: ['glm-5.1@...']`.
|
|
74
|
+
|
|
75
|
+
### Consumer contract
|
|
76
|
+
|
|
77
|
+
`tests/consumer-contract.test.ts` pins `JudgeScoresRecord` as a
|
|
78
|
+
type-level export at the root entry. The 0.30.0 surface is preserved —
|
|
79
|
+
the new field is additive on `RunOutcome` and the new type is a new
|
|
80
|
+
export, so existing consumers stay green.
|
|
81
|
+
|
|
3
82
|
## 0.29.0 — 2026-05-19
|
|
4
83
|
|
|
5
84
|
### Analyst kinds + cross-run findings context
|
|
@@ -1,3 +1,3 @@
|
|
|
1
|
-
export { B as BENCHMARK_SPLIT_SEED, a as BenchmarkAdapter, b as BenchmarkDatasetItem, c as BenchmarkEvaluation, d as deterministicSplit, e as routing } from '../index-TVjRYWRm.js';
|
|
2
|
-
import '../run-record-nYf9x2hU.js';
|
|
1
|
+
export { B as BENCHMARK_SPLIT_SEED, a as BenchmarkAdapter, b as BenchmarkDatasetItem, c as BenchmarkEvaluation, d as deterministicSplit, e as routing } from '../index-BTqhGHJT.js';
|
|
2
|
+
import '../run-record-BfX5y68A.js';
|
|
3
3
|
import '../errors-mje_cKOs.js';
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import {
|
|
2
2
|
validateRunRecord
|
|
3
|
-
} from "./chunk-WSI4K3WB.js";
|
|
3
|
+
} from "./chunk-ZN2CMQIW.js";
|
|
4
4
|
import {
|
|
5
5
|
pairedBootstrap,
|
|
6
6
|
pairedWilcoxon
|
|
@@ -1409,4 +1409,4 @@ export {
|
|
|
1409
1409
|
CallbackResearcher,
|
|
1410
1410
|
NoopResearcher
|
|
1411
1411
|
};
|
|
1412
|
-
//# sourceMappingURL=chunk-Y2CPBYKH.js.map
|
|
1412
|
+
//# sourceMappingURL=chunk-B73G44OH.js.map
|
|
@@ -202,6 +202,7 @@ async function runEvalCampaign(opts) {
|
|
|
202
202
|
};
|
|
203
203
|
if (splitTag === "holdout") recordOutcome.holdoutScore = outcome.score;
|
|
204
204
|
else recordOutcome.searchScore = outcome.score;
|
|
205
|
+
if (outcome.judgeScores !== void 0) recordOutcome.judgeScores = outcome.judgeScores;
|
|
205
206
|
const record = {
|
|
206
207
|
runId,
|
|
207
208
|
experimentId: opts.campaignId,
|
|
@@ -284,4 +285,4 @@ function defaultRunId(params) {
|
|
|
284
285
|
export {
|
|
285
286
|
runEvalCampaign
|
|
286
287
|
};
|
|
287
|
-
//# sourceMappingURL=chunk-75ZREHD7.js.map
|
|
288
|
+
//# sourceMappingURL=chunk-DTEJNZYK.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"sources":["../src/eval-campaign.ts"],"sourcesContent":["/**\n * EvalCampaign — opinionated matrix runner that wires the four\n * capture-integrity directives by construction.\n *\n * The canonical benchmark shape — matrix runner → for each\n * (variant, scenario, seed) → start a TraceEmitter → call LLMs → end the\n * run → analyze — has a bug class at the integration boundary: raw\n * events not captured, route silently wrong, integrity not asserted,\n * analyst never run. The directives in `SKILL.md § Capture integrity`\n * are the mitigations.\n *\n * `EvalCampaign` is the structural fix — consumers don't wire the\n * integrity surface themselves; the campaign owns it. Specifically:\n *\n * - calls `assertLlmRoute` once at preflight before any work runs\n * - constructs a per-run `TraceStore` and `RawProviderSink` via factories\n * - constructs the `TraceEmitter` with `onRunComplete: [analyst hook]`\n * - hands the runner an `LlmClientOptions` pre-wired with the sink and\n * trace context — the runner can't accidentally call an LLM without\n * capturing the raw HTTP envelope\n * - calls `assertRunCaptured` after every `endRun` and routes failures\n * through a configurable policy (`throw` / `mark_failed` / `log`)\n * - assembles per-run `RunRecord`s and runs `researchReport` at the end\n * so the campaign artifact is launch-decision-grade by default\n * - embeds the campaign fingerprint (a SHA-256 over the canonicalised\n * run set) and optional `preregistrationHash` in the report\n *\n * The runner contract is intentionally narrow: produce a `CampaignRunOutcome`\n * given a fully-wired `CampaignRunContext`. Everything orchestration-shaped\n * lives in the campaign. 
This is the inversion-of-control point — consumers\n * stop writing matrix runners and start writing scenario-runners.\n *\n * Out of scope for v1 (tracked in `docs/research-report-methodology.md`):\n *\n * - Distributed/cluster execution (concurrency is local async)\n * - Adaptive sampling / sequential interim looks\n * - Resume from partial state across crashes\n * - LLM-call retry beyond what `LlmClient` already does\n */\n\nimport { assertLlmRoute, type LlmClientOptions, type LlmRouteRequirements } from './llm-client'\nimport { canonicalize, hashJson } from './pre-registration'\nimport type {\n JudgeScoresRecord,\n RunJudgeMetadata,\n RunOutcome,\n RunRecord,\n RunSplitTag,\n RunTokenUsage,\n} from './run-record'\nimport { type ResearchReport, type ResearchReportOptions, researchReport } from './summary-report'\nimport type { RunCompleteHook } from './trace/emitter'\nimport { TraceEmitter } from './trace/emitter'\nimport {\n assertRunCaptured,\n RunIntegrityError,\n type RunIntegrityExpectations,\n type RunIntegrityReport,\n} from './trace/integrity'\nimport { FileSystemRawProviderSink, type RawProviderSink } from './trace/raw-provider-sink'\nimport type { TraceStore } from './trace/store'\n\n// ── Public types ─────────────────────────────────────────────────────────\n\nexport interface CampaignVariant<V> {\n id: string\n payload: V\n}\n\nexport interface CampaignScenario {\n scenarioId: string\n /** Free-form metadata propagated to runs and reports. */\n tags?: Record<string, string>\n}\n\nexport interface CampaignRunContext<V> {\n /** Stable run id. The campaign generates this; the runner does not. */\n runId: string\n /** Logical experiment id (campaignId by default; overridable per-run via opts). 
*/\n experimentId: string\n variant: V\n variantId: string\n scenarioId: string\n scenarioTags: Record<string, string>\n seed: number\n splitTag: RunSplitTag\n /**\n * The TraceEmitter for this run, with `onRunComplete` hooks pre-wired\n * (analyst auto-execution if configured, plus integrity check). The\n * runner MUST call `emitter.startRun` before doing any work and either\n * `emitter.endRun` or `emitter.abortRun` before returning.\n */\n emitter: TraceEmitter\n store: TraceStore\n rawSink: RawProviderSink\n /**\n * Pre-wired LLM client options — `rawSink` and `traceContext` are populated\n * so any `callLlm(req, ctx.llmOpts)` automatically captures raw HTTP. The\n * runner can spread additional fields if needed.\n */\n llmOpts: LlmClientOptions\n}\n\nexport interface CampaignRunOutcome {\n /** Did the run pass? Mirrors `RunOutcome.pass` semantics. */\n pass: boolean\n /** Score for the run on its split. Maps to `searchScore` or `holdoutScore`. */\n score: number\n /** Mandatory cost in USD. Use 0 + raw.cost_unknown=1 only if truly unknown. */\n costUsd: number\n tokenUsage: RunTokenUsage\n /** Snapshot model id (e.g. `claude-sonnet-4-6@2025-04-15`). */\n model: string\n /** sha256 of the effective prompt sent to the model. */\n promptHash: string\n /** sha256 of the effective config (model, temperature, tools, judges, splits). */\n configHash: string\n /** Optional extra numeric metrics to land in `outcome.raw`. */\n raw?: Record<string, number>\n /** Optional failure-taxonomy tag if the run failed. */\n failureMode?: string\n /** Optional judge metadata when a judge was used. 
*/\n judgeMetadata?: RunJudgeMetadata\n /**\n * Optional per-judge / per-dim breakdown for ensemble-judged runs.\n * Propagated to `outcome.judgeScores` on the resulting `RunRecord`.\n * Single-judge or scalar-only runs leave this unset.\n */\n judgeScores?: JudgeScoresRecord\n}\n\nexport type CampaignRunner<V> = (ctx: CampaignRunContext<V>) => Promise<CampaignRunOutcome>\n\nexport type CampaignIntegrityPolicy = 'throw' | 'mark_failed' | 'log'\n\nexport interface EvalCampaignOptions<V> {\n /**\n * Stable id for the campaign. Used as the default `experimentId` on\n * every run, and folded into the campaign fingerprint.\n */\n campaignId: string\n variants: CampaignVariant<V>[]\n scenarios: CampaignScenario[]\n /** Default `[0, 1, 2]`. */\n seeds?: number[]\n /** Default `'holdout'` — the split that anchors a launch decision. */\n splitTag?: RunSplitTag\n /** Git SHA the campaign is run against. Mandatory; `RunRecord` rejects unset. */\n commitSha: string\n /**\n * LLM client config. Augmented per-run with `rawSink` and `traceContext`\n * before being passed to the runner. The campaign asserts this config\n * matches `routeRequirements` once at preflight.\n */\n llmOpts: LlmClientOptions\n /**\n * Default `{ requireExplicitBaseUrl: true, requireAuth: true }` — fail\n * loud if the campaign would silently fall back to the public router or\n * run unauthenticated. Override with an empty object to disable.\n */\n routeRequirements?: LlmRouteRequirements\n /**\n * Per-run TraceStore factory. Common shape: a fresh store per run keyed\n * on `runId`. Implementations that share a store across the campaign\n * are valid — the campaign only writes through `emitter`.\n */\n storeFactory: (params: CampaignFactoryParams) => TraceStore\n /**\n * Per-run RawProviderSink factory. Defaults to `FileSystemRawProviderSink`\n * rooted at `${workDir}/raw-events/${runId}` if `workDir` is supplied;\n * otherwise required. 
Forensic capture is non-negotiable in a campaign\n * run — pass `NoopRawProviderSink` explicitly if you want to opt out.\n */\n rawSinkFactory?: (params: CampaignFactoryParams) => RawProviderSink\n /**\n * Filesystem root for default `rawSinkFactory`. Ignored if\n * `rawSinkFactory` is supplied.\n */\n workDir?: string\n /**\n * Extra `onRunComplete` hooks the campaign appends (after its own\n * integrity-check hook). Pass `traceAnalystOnRunComplete(...)` here.\n */\n onRunComplete?: RunCompleteHook[]\n /**\n * Per-run integrity expectations. Defaults to:\n * `{ llmSpansMin: 1, requireRawCoverageOfLlmSpans: true, requireOutcome: true }`.\n * Override (e.g. `{ llmSpansMin: 0 }`) for runs that don't call LLMs.\n */\n integrity?: RunIntegrityExpectations\n /** Behaviour when integrity fails. Default `'mark_failed'`. */\n onIntegrityFailure?: CampaignIntegrityPolicy\n /**\n * Per-run runner. Receives a fully-wired context; produces an outcome\n * the campaign converts into a `RunRecord`.\n */\n runner: CampaignRunner<V>\n /**\n * If set, the campaign computes `researchReport` at the end. `comparator`\n * is a `variantId`. Other fields are forwarded verbatim.\n */\n report?: { comparator?: string } & Omit<\n ResearchReportOptions,\n 'comparator' | 'preregistrationHash' | 'generatedAt'\n >\n /**\n * Hash of a signed `HypothesisManifest` (see `pre-registration.ts`).\n * Embedded in the campaign fingerprint and the research report.\n */\n preregistrationHash?: string\n /** Local concurrency. Default `1` (sequential). */\n concurrency?: number\n /**\n * Override the time source. Tests pass a mock to make wallMs deterministic.\n */\n now?: () => number\n /** Override the runId generator. Tests pin this. 
*/\n runId?: (params: CampaignFactoryParams) => string\n}\n\nexport interface CampaignFactoryParams {\n campaignId: string\n runId: string\n variantId: string\n scenarioId: string\n seed: number\n}\n\nexport interface FailedRun {\n runId: string\n variantId: string\n scenarioId: string\n seed: number\n reason: string\n error?: string\n}\n\nexport interface EvalCampaignResult {\n campaignId: string\n /** SHA-256 over canonicalised `(variantIds, scenarioIds, seeds, comparator, splitTag, baseUrl, provider, preregistrationHash)`. */\n campaignFingerprint: string\n preregistrationHash: string | null\n /** Successful runs only. Failed runs land in `failedRuns`. */\n runs: RunRecord[]\n /** Integrity reports for every successful run. */\n integrityReports: RunIntegrityReport[]\n failedRuns: FailedRun[]\n /** Computed when `report` is set on options. */\n report?: ResearchReport\n startedAt: string\n endedAt: string\n}\n\n// ── Implementation ───────────────────────────────────────────────────────\n\nconst DEFAULT_INTEGRITY: RunIntegrityExpectations = {\n llmSpansMin: 1,\n requireRawCoverageOfLlmSpans: true,\n requireOutcome: true,\n}\n\nconst DEFAULT_ROUTE: LlmRouteRequirements = {\n requireExplicitBaseUrl: true,\n requireAuth: true,\n}\n\nexport async function runEvalCampaign<V>(\n opts: EvalCampaignOptions<V>,\n): Promise<EvalCampaignResult> {\n // ── Preflight ──────────────────────────────────────────────────────\n assertLlmRoute(opts.llmOpts, opts.routeRequirements ?? 
DEFAULT_ROUTE)\n\n if (opts.variants.length === 0) {\n throw new Error('runEvalCampaign: variants must be non-empty.')\n }\n if (opts.scenarios.length === 0) {\n throw new Error('runEvalCampaign: scenarios must be non-empty.')\n }\n const variantIds = new Set<string>()\n for (const v of opts.variants) {\n if (variantIds.has(v.id)) {\n throw new Error(`runEvalCampaign: duplicate variant id \"${v.id}\".`)\n }\n variantIds.add(v.id)\n }\n const scenarioIds = new Set<string>()\n for (const s of opts.scenarios) {\n if (scenarioIds.has(s.scenarioId)) {\n throw new Error(`runEvalCampaign: duplicate scenarioId \"${s.scenarioId}\".`)\n }\n scenarioIds.add(s.scenarioId)\n }\n if (opts.report?.comparator && !variantIds.has(opts.report.comparator)) {\n throw new Error(\n `runEvalCampaign: report.comparator \"${opts.report.comparator}\" is not a configured variantId.`,\n )\n }\n if (!opts.commitSha) {\n throw new Error('runEvalCampaign: commitSha is required (every RunRecord needs it).')\n }\n\n const seeds = opts.seeds ?? [0, 1, 2]\n const splitTag: RunSplitTag = opts.splitTag ?? 'holdout'\n const concurrency = Math.max(1, opts.concurrency ?? 1)\n const integrity = { ...DEFAULT_INTEGRITY, ...(opts.integrity ?? {}) }\n const onIntegrityFailure: CampaignIntegrityPolicy = opts.onIntegrityFailure ?? 'mark_failed'\n const now = opts.now ?? (() => Date.now())\n const baseUrl = (opts.llmOpts.baseUrl ?? '').replace(/\\/+$/, '')\n const provider = opts.llmOpts.provider ?? null\n const preregistrationHash = opts.preregistrationHash ?? null\n\n const rawSinkFactory = opts.rawSinkFactory ?? 
defaultRawSinkFactory(opts.workDir)\n\n // ── Fingerprint ────────────────────────────────────────────────────\n const campaignFingerprint = await hashJson(\n canonicalize({\n campaignId: opts.campaignId,\n variants: opts.variants.map((v) => v.id).sort(),\n scenarios: opts.scenarios.map((s) => s.scenarioId).sort(),\n seeds: [...seeds].sort((a, b) => a - b),\n splitTag,\n comparator: opts.report?.comparator ?? null,\n baseUrl,\n provider,\n preregistrationHash,\n }),\n )\n\n // ── Plan the matrix ────────────────────────────────────────────────\n type Cell = { variant: CampaignVariant<V>; scenario: CampaignScenario; seed: number }\n const cells: Cell[] = []\n for (const variant of opts.variants) {\n for (const scenario of opts.scenarios) {\n for (const seed of seeds) {\n cells.push({ variant, scenario, seed })\n }\n }\n }\n\n const startedAt = new Date(now()).toISOString()\n const runs: RunRecord[] = []\n const integrityReports: RunIntegrityReport[] = []\n const failedRuns: FailedRun[] = []\n\n // ── Execute (bounded-concurrency worker pool) ──────────────────────\n let cursor = 0\n async function worker(): Promise<void> {\n while (true) {\n const i = cursor++\n if (i >= cells.length) return\n const cell = cells[i]!\n try {\n const result = await runOneCell(cell)\n runs.push(result.record)\n integrityReports.push(result.integrity)\n } catch (err) {\n if (err instanceof CellExecutionError) {\n failedRuns.push(err.failed)\n if (err.integrity) integrityReports.push(err.integrity)\n } else {\n // Genuine bug — not a runner failure, not an integrity failure.\n // Surface it; don't silently mask.\n throw err\n }\n }\n }\n }\n\n async function runOneCell(\n cell: Cell,\n ): Promise<{ record: RunRecord; integrity: RunIntegrityReport }> {\n const runId = (opts.runId ?? 
defaultRunId)({\n campaignId: opts.campaignId,\n runId: '', // unused by default generator\n variantId: cell.variant.id,\n scenarioId: cell.scenario.scenarioId,\n seed: cell.seed,\n })\n const factoryParams: CampaignFactoryParams = {\n campaignId: opts.campaignId,\n runId,\n variantId: cell.variant.id,\n scenarioId: cell.scenario.scenarioId,\n seed: cell.seed,\n }\n const store = opts.storeFactory(factoryParams)\n const rawSink = rawSinkFactory(factoryParams)\n\n const emitter = new TraceEmitter(store, {\n runId,\n now: opts.now,\n onRunComplete: opts.onRunComplete,\n })\n\n const llmOpts: LlmClientOptions = {\n ...opts.llmOpts,\n rawSink,\n traceContext: { runId },\n }\n\n const ctx: CampaignRunContext<V> = {\n runId,\n experimentId: opts.campaignId,\n variant: cell.variant.payload,\n variantId: cell.variant.id,\n scenarioId: cell.scenario.scenarioId,\n scenarioTags: cell.scenario.tags ?? {},\n seed: cell.seed,\n splitTag,\n emitter,\n store,\n rawSink,\n llmOpts,\n }\n\n const wallStart = now()\n let outcome: CampaignRunOutcome\n try {\n outcome = await opts.runner(ctx)\n } catch (err) {\n const message = err instanceof Error ? 
err.message : String(err)\n // The runner threw mid-execution; give it a chance to have aborted.\n try {\n await emitter.abortRun(message)\n } catch {\n // Already aborted/ended; ignore.\n }\n throw new CellExecutionError({\n runId,\n variantId: cell.variant.id,\n scenarioId: cell.scenario.scenarioId,\n seed: cell.seed,\n reason: 'runner_threw',\n error: message,\n })\n }\n const wallMs = now() - wallStart\n\n const integrityReport = await assertRunCaptured(store, runId, { ...integrity, rawSink })\n if (!integrityReport.ok) {\n switch (onIntegrityFailure) {\n case 'throw':\n throw new RunIntegrityError(integrityReport)\n case 'mark_failed':\n throw new CellExecutionError(\n {\n runId,\n variantId: cell.variant.id,\n scenarioId: cell.scenario.scenarioId,\n seed: cell.seed,\n reason: 'integrity_failed',\n error: integrityReport.issues.map((i) => i.code).join(', '),\n },\n integrityReport,\n )\n case 'log':\n // Caller wants the run admitted with a flagged report; fall through.\n break\n }\n }\n\n const recordOutcome: RunOutcome = {\n raw: outcome.raw ?? 
{},\n }\n if (splitTag === 'holdout') recordOutcome.holdoutScore = outcome.score\n else recordOutcome.searchScore = outcome.score\n if (outcome.judgeScores !== undefined) recordOutcome.judgeScores = outcome.judgeScores\n\n const record: RunRecord = {\n runId,\n experimentId: opts.campaignId,\n candidateId: cell.variant.id,\n seed: cell.seed,\n model: outcome.model,\n promptHash: outcome.promptHash,\n configHash: outcome.configHash,\n commitSha: opts.commitSha,\n wallMs,\n costUsd: outcome.costUsd,\n tokenUsage: outcome.tokenUsage,\n judgeMetadata: outcome.judgeMetadata,\n outcome: recordOutcome,\n failureMode: outcome.failureMode,\n splitTag,\n scenarioId: cell.scenario.scenarioId,\n }\n return { record, integrity: integrityReport }\n }\n\n const workers = Array.from({ length: Math.min(concurrency, cells.length) }, () => worker())\n await Promise.all(workers)\n\n // ── Optional research report ───────────────────────────────────────\n let report: ResearchReport | undefined\n if (opts.report) {\n const reportOpts: ResearchReportOptions = {\n ...opts.report,\n comparator: opts.report.comparator,\n split: splitTag === 'dev' ? 'search' : splitTag,\n generatedAt: new Date(now()).toISOString(),\n preregistrationHash: preregistrationHash ?? 
undefined,\n }\n report = await researchReport(runs, reportOpts)\n }\n\n const endedAt = new Date(now()).toISOString()\n\n return {\n campaignId: opts.campaignId,\n campaignFingerprint,\n preregistrationHash,\n runs,\n integrityReports,\n failedRuns,\n report,\n startedAt,\n endedAt,\n }\n}\n\n// ── Internal ─────────────────────────────────────────────────────────────\n\nclass CellExecutionError extends Error {\n readonly failed: FailedRun\n readonly integrity?: RunIntegrityReport\n constructor(failed: FailedRun, integrity?: RunIntegrityReport) {\n super(`cell ${failed.variantId}/${failed.scenarioId}@${failed.seed} failed: ${failed.reason}`)\n this.failed = failed\n this.integrity = integrity\n }\n}\n\nfunction defaultRawSinkFactory(workDir: string | undefined) {\n return (params: CampaignFactoryParams): RawProviderSink => {\n if (!workDir) {\n throw new Error(\n 'runEvalCampaign: rawSinkFactory not supplied and workDir not set. Pass either to enable raw provider capture, or pass `new NoopRawProviderSink()` via rawSinkFactory to opt out explicitly.',\n )\n }\n return new FileSystemRawProviderSink({\n dir: `${workDir}/raw-events/${params.runId}`,\n })\n }\n}\n\nfunction defaultRunId(params: CampaignFactoryParams): string {\n // Stable across re-runs: fingerprint of (campaignId, variantId, scenarioId, seed).\n // Caller can override via opts.runId for non-deterministic IDs.\n const base = `${params.campaignId}::${params.variantId}::${params.scenarioId}::${params.seed}`\n // Lightweight hex: we don't need crypto-grade here, just stability + uniqueness.\n let h1 = 0x811c9dc5\n let h2 = 0x12345678\n for (let i = 0; i < base.length; i++) {\n const c = base.charCodeAt(i)\n h1 = Math.imul(h1 ^ c, 0x01000193) >>> 0\n h2 = Math.imul(h2 ^ c, 0x9e3779b1) >>> 0\n }\n return `run-${h1.toString(16).padStart(8, '0')}${h2.toString(16).padStart(8, 
'0')}`\n}\n"],"mappings":";;;;;;;;;;;;;;;;;;;;;;AA+PA,IAAM,oBAA8C;AAAA,EAClD,aAAa;AAAA,EACb,8BAA8B;AAAA,EAC9B,gBAAgB;AAClB;AAEA,IAAM,gBAAsC;AAAA,EAC1C,wBAAwB;AAAA,EACxB,aAAa;AACf;AAEA,eAAsB,gBACpB,MAC6B;AAE7B,iBAAe,KAAK,SAAS,KAAK,qBAAqB,aAAa;AAEpE,MAAI,KAAK,SAAS,WAAW,GAAG;AAC9B,UAAM,IAAI,MAAM,8CAA8C;AAAA,EAChE;AACA,MAAI,KAAK,UAAU,WAAW,GAAG;AAC/B,UAAM,IAAI,MAAM,+CAA+C;AAAA,EACjE;AACA,QAAM,aAAa,oBAAI,IAAY;AACnC,aAAW,KAAK,KAAK,UAAU;AAC7B,QAAI,WAAW,IAAI,EAAE,EAAE,GAAG;AACxB,YAAM,IAAI,MAAM,0CAA0C,EAAE,EAAE,IAAI;AAAA,IACpE;AACA,eAAW,IAAI,EAAE,EAAE;AAAA,EACrB;AACA,QAAM,cAAc,oBAAI,IAAY;AACpC,aAAW,KAAK,KAAK,WAAW;AAC9B,QAAI,YAAY,IAAI,EAAE,UAAU,GAAG;AACjC,YAAM,IAAI,MAAM,0CAA0C,EAAE,UAAU,IAAI;AAAA,IAC5E;AACA,gBAAY,IAAI,EAAE,UAAU;AAAA,EAC9B;AACA,MAAI,KAAK,QAAQ,cAAc,CAAC,WAAW,IAAI,KAAK,OAAO,UAAU,GAAG;AACtE,UAAM,IAAI;AAAA,MACR,uCAAuC,KAAK,OAAO,UAAU;AAAA,IAC/D;AAAA,EACF;AACA,MAAI,CAAC,KAAK,WAAW;AACnB,UAAM,IAAI,MAAM,oEAAoE;AAAA,EACtF;AAEA,QAAM,QAAQ,KAAK,SAAS,CAAC,GAAG,GAAG,CAAC;AACpC,QAAM,WAAwB,KAAK,YAAY;AAC/C,QAAM,cAAc,KAAK,IAAI,GAAG,KAAK,eAAe,CAAC;AACrD,QAAM,YAAY,EAAE,GAAG,mBAAmB,GAAI,KAAK,aAAa,CAAC,EAAG;AACpE,QAAM,qBAA8C,KAAK,sBAAsB;AAC/E,QAAM,MAAM,KAAK,QAAQ,MAAM,KAAK,IAAI;AACxC,QAAM,WAAW,KAAK,QAAQ,WAAW,IAAI,QAAQ,QAAQ,EAAE;AAC/D,QAAM,WAAW,KAAK,QAAQ,YAAY;AAC1C,QAAM,sBAAsB,KAAK,uBAAuB;AAExD,QAAM,iBAAiB,KAAK,kBAAkB,sBAAsB,KAAK,OAAO;AAGhF,QAAM,sBAAsB,MAAM;AAAA,IAChC,aAAa;AAAA,MACX,YAAY,KAAK;AAAA,MACjB,UAAU,KAAK,SAAS,IAAI,CAAC,MAAM,EAAE,EAAE,EAAE,KAAK;AAAA,MAC9C,WAAW,KAAK,UAAU,IAAI,CAAC,MAAM,EAAE,UAAU,EAAE,KAAK;AAAA,MACxD,OAAO,CAAC,GAAG,KAAK,EAAE,KAAK,CAAC,GAAG,MAAM,IAAI,CAAC;AAAA,MACtC;AAAA,MACA,YAAY,KAAK,QAAQ,cAAc;AAAA,MACvC;AAAA,MACA;AAAA,MACA;AAAA,IACF,CAAC;AAAA,EACH;AAIA,QAAM,QAAgB,CAAC;AACvB,aAAW,WAAW,KAAK,UAAU;AACnC,eAAW,YAAY,KAAK,WAAW;AACrC,iBAAW,QAAQ,OAAO;AACxB,cAAM,KAAK,EAAE,SAAS,UAAU,KAAK,CAAC;AAAA,MACxC;AAAA,IACF;AAAA,EACF;AAEA,QAAM,YAAY,IAAI,KAAK,IAAI,CAAC,EAAE,YAAY;AAC9C,QAAM,OAAoB,CAAC;AAC3B,QAAM,mBAAyC,CAAC;AAChD,QAAM,aAA0B,CAAC;AAGjC,MAAI,SAAS;AACb,iBAAe,SAAwB;AACrC,W
AAO,MAAM;AACX,YAAM,IAAI;AACV,UAAI,KAAK,MAAM,OAAQ;AACvB,YAAM,OAAO,MAAM,CAAC;AACpB,UAAI;AACF,cAAM,SAAS,MAAM,WAAW,IAAI;AACpC,aAAK,KAAK,OAAO,MAAM;AACvB,yBAAiB,KAAK,OAAO,SAAS;AAAA,MACxC,SAAS,KAAK;AACZ,YAAI,eAAe,oBAAoB;AACrC,qBAAW,KAAK,IAAI,MAAM;AAC1B,cAAI,IAAI,UAAW,kBAAiB,KAAK,IAAI,SAAS;AAAA,QACxD,OAAO;AAGL,gBAAM;AAAA,QACR;AAAA,MACF;AAAA,IACF;AAAA,EACF;AAEA,iBAAe,WACb,MAC+D;AAC/D,UAAM,SAAS,KAAK,SAAS,cAAc;AAAA,MACzC,YAAY,KAAK;AAAA,MACjB,OAAO;AAAA;AAAA,MACP,WAAW,KAAK,QAAQ;AAAA,MACxB,YAAY,KAAK,SAAS;AAAA,MAC1B,MAAM,KAAK;AAAA,IACb,CAAC;AACD,UAAM,gBAAuC;AAAA,MAC3C,YAAY,KAAK;AAAA,MACjB;AAAA,MACA,WAAW,KAAK,QAAQ;AAAA,MACxB,YAAY,KAAK,SAAS;AAAA,MAC1B,MAAM,KAAK;AAAA,IACb;AACA,UAAM,QAAQ,KAAK,aAAa,aAAa;AAC7C,UAAM,UAAU,eAAe,aAAa;AAE5C,UAAM,UAAU,IAAI,aAAa,OAAO;AAAA,MACtC;AAAA,MACA,KAAK,KAAK;AAAA,MACV,eAAe,KAAK;AAAA,IACtB,CAAC;AAED,UAAM,UAA4B;AAAA,MAChC,GAAG,KAAK;AAAA,MACR;AAAA,MACA,cAAc,EAAE,MAAM;AAAA,IACxB;AAEA,UAAM,MAA6B;AAAA,MACjC;AAAA,MACA,cAAc,KAAK;AAAA,MACnB,SAAS,KAAK,QAAQ;AAAA,MACtB,WAAW,KAAK,QAAQ;AAAA,MACxB,YAAY,KAAK,SAAS;AAAA,MAC1B,cAAc,KAAK,SAAS,QAAQ,CAAC;AAAA,MACrC,MAAM,KAAK;AAAA,MACX;AAAA,MACA;AAAA,MACA;AAAA,MACA;AAAA,MACA;AAAA,IACF;AAEA,UAAM,YAAY,IAAI;AACtB,QAAI;AACJ,QAAI;AACF,gBAAU,MAAM,KAAK,OAAO,GAAG;AAAA,IACjC,SAAS,KAAK;AACZ,YAAM,UAAU,eAAe,QAAQ,IAAI,UAAU,OAAO,GAAG;AAE/D,UAAI;AACF,cAAM,QAAQ,SAAS,OAAO;AAAA,MAChC,QAAQ;AAAA,MAER;AACA,YAAM,IAAI,mBAAmB;AAAA,QAC3B;AAAA,QACA,WAAW,KAAK,QAAQ;AAAA,QACxB,YAAY,KAAK,SAAS;AAAA,QAC1B,MAAM,KAAK;AAAA,QACX,QAAQ;AAAA,QACR,OAAO;AAAA,MACT,CAAC;AAAA,IACH;AACA,UAAM,SAAS,IAAI,IAAI;AAEvB,UAAM,kBAAkB,MAAM,kBAAkB,OAAO,OAAO,EAAE,GAAG,WAAW,QAAQ,CAAC;AACvF,QAAI,CAAC,gBAAgB,IAAI;AACvB,cAAQ,oBAAoB;AAAA,QAC1B,KAAK;AACH,gBAAM,IAAI,kBAAkB,eAAe;AAAA,QAC7C,KAAK;AACH,gBAAM,IAAI;AAAA,YACR;AAAA,cACE;AAAA,cACA,WAAW,KAAK,QAAQ;AAAA,cACxB,YAAY,KAAK,SAAS;AAAA,cAC1B,MAAM,KAAK;AAAA,cACX,QAAQ;AAAA,cACR,OAAO,gBAAgB,OAAO,IAAI,CAAC,MAAM,EAAE,IAAI,EAAE,KAAK,IAAI;AAAA,YAC5D;AAAA,YACA;AAAA,UACF;AAAA,QACF,KAAK;AAEH;AAAA,MACJ;AAAA,IACF;AAEA,UAAM,gBAA4B;AAAA,MAChC,KAAK,QAAQ,OAA
O,CAAC;AAAA,IACvB;AACA,QAAI,aAAa,UAAW,eAAc,eAAe,QAAQ;AAAA,QAC5D,eAAc,cAAc,QAAQ;AACzC,QAAI,QAAQ,gBAAgB,OAAW,eAAc,cAAc,QAAQ;AAE3E,UAAM,SAAoB;AAAA,MACxB;AAAA,MACA,cAAc,KAAK;AAAA,MACnB,aAAa,KAAK,QAAQ;AAAA,MAC1B,MAAM,KAAK;AAAA,MACX,OAAO,QAAQ;AAAA,MACf,YAAY,QAAQ;AAAA,MACpB,YAAY,QAAQ;AAAA,MACpB,WAAW,KAAK;AAAA,MAChB;AAAA,MACA,SAAS,QAAQ;AAAA,MACjB,YAAY,QAAQ;AAAA,MACpB,eAAe,QAAQ;AAAA,MACvB,SAAS;AAAA,MACT,aAAa,QAAQ;AAAA,MACrB;AAAA,MACA,YAAY,KAAK,SAAS;AAAA,IAC5B;AACA,WAAO,EAAE,QAAQ,WAAW,gBAAgB;AAAA,EAC9C;AAEA,QAAM,UAAU,MAAM,KAAK,EAAE,QAAQ,KAAK,IAAI,aAAa,MAAM,MAAM,EAAE,GAAG,MAAM,OAAO,CAAC;AAC1F,QAAM,QAAQ,IAAI,OAAO;AAGzB,MAAI;AACJ,MAAI,KAAK,QAAQ;AACf,UAAM,aAAoC;AAAA,MACxC,GAAG,KAAK;AAAA,MACR,YAAY,KAAK,OAAO;AAAA,MACxB,OAAO,aAAa,QAAQ,WAAW;AAAA,MACvC,aAAa,IAAI,KAAK,IAAI,CAAC,EAAE,YAAY;AAAA,MACzC,qBAAqB,uBAAuB;AAAA,IAC9C;AACA,aAAS,MAAM,eAAe,MAAM,UAAU;AAAA,EAChD;AAEA,QAAM,UAAU,IAAI,KAAK,IAAI,CAAC,EAAE,YAAY;AAE5C,SAAO;AAAA,IACL,YAAY,KAAK;AAAA,IACjB;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,EACF;AACF;AAIA,IAAM,qBAAN,cAAiC,MAAM;AAAA,EAC5B;AAAA,EACA;AAAA,EACT,YAAY,QAAmB,WAAgC;AAC7D,UAAM,QAAQ,OAAO,SAAS,IAAI,OAAO,UAAU,IAAI,OAAO,IAAI,YAAY,OAAO,MAAM,EAAE;AAC7F,SAAK,SAAS;AACd,SAAK,YAAY;AAAA,EACnB;AACF;AAEA,SAAS,sBAAsB,SAA6B;AAC1D,SAAO,CAAC,WAAmD;AACzD,QAAI,CAAC,SAAS;AACZ,YAAM,IAAI;AAAA,QACR;AAAA,MACF;AAAA,IACF;AACA,WAAO,IAAI,0BAA0B;AAAA,MACnC,KAAK,GAAG,OAAO,eAAe,OAAO,KAAK;AAAA,IAC5C,CAAC;AAAA,EACH;AACF;AAEA,SAAS,aAAa,QAAuC;AAG3D,QAAM,OAAO,GAAG,OAAO,UAAU,KAAK,OAAO,SAAS,KAAK,OAAO,UAAU,KAAK,OAAO,IAAI;AAE5F,MAAI,KAAK;AACT,MAAI,KAAK;AACT,WAAS,IAAI,GAAG,IAAI,KAAK,QAAQ,KAAK;AACpC,UAAM,IAAI,KAAK,WAAW,CAAC;AAC3B,SAAK,KAAK,KAAK,KAAK,GAAG,QAAU,MAAM;AACvC,SAAK,KAAK,KAAK,KAAK,GAAG,UAAU,MAAM;AAAA,EACzC;AACA,SAAO,OAAO,GAAG,SAAS,EAAE,EAAE,SAAS,GAAG,GAAG,CAAC,GAAG,GAAG,SAAS,EAAE,EAAE,SAAS,GAAG,GAAG,CAAC;AACnF;","names":[]}
|
|
@@ -4,7 +4,7 @@ import {
|
|
|
4
4
|
} from "./chunk-NCRFYPS3.js";
|
|
5
5
|
import {
|
|
6
6
|
validateRunRecord
|
|
7
|
-
} from "./chunk-
|
|
7
|
+
} from "./chunk-ZN2CMQIW.js";
|
|
8
8
|
import {
|
|
9
9
|
TraceEmitter
|
|
10
10
|
} from "./chunk-TVVP3ZZQ.js";
|
|
@@ -610,4 +610,4 @@ export {
|
|
|
610
610
|
runProposeReviewAsControlLoop,
|
|
611
611
|
controlFailureClassFromVerification
|
|
612
612
|
};
|
|
613
|
-
//# sourceMappingURL=chunk-
|
|
613
|
+
//# sourceMappingURL=chunk-S4Y5VXMS.js.map
|
|
@@ -99,6 +99,9 @@ function validateRunRecord(input) {
|
|
|
99
99
|
for (const [k, v] of Object.entries(raw)) {
|
|
100
100
|
expectFiniteNumber(v, `outcome.raw.${k}`);
|
|
101
101
|
}
|
|
102
|
+
if (outRec.judgeScores !== void 0) {
|
|
103
|
+
validateJudgeScores(outRec.judgeScores, "outcome.judgeScores");
|
|
104
|
+
}
|
|
102
105
|
if (obj.failureMode !== void 0) expectString(obj.failureMode, "failureMode");
|
|
103
106
|
if (typeof obj.splitTag !== "string" || !SPLIT_TAGS.includes(obj.splitTag)) {
|
|
104
107
|
throw new RunRecordValidationError(
|
|
@@ -138,6 +141,55 @@ function expectFiniteNumber(value, path) {
|
|
|
138
141
|
throw new RunRecordValidationError(`expected finite number`, path);
|
|
139
142
|
}
|
|
140
143
|
}
|
|
144
|
+
function validateJudgeScores(value, path) {
|
|
145
|
+
if (value === null || typeof value !== "object") {
|
|
146
|
+
throw new RunRecordValidationError("judgeScores must be an object", path);
|
|
147
|
+
}
|
|
148
|
+
const rec = value;
|
|
149
|
+
const perJudge = rec.perJudge;
|
|
150
|
+
if (perJudge === null || typeof perJudge !== "object") {
|
|
151
|
+
throw new RunRecordValidationError("perJudge must be an object", `${path}.perJudge`);
|
|
152
|
+
}
|
|
153
|
+
for (const [judgeId, dims] of Object.entries(perJudge)) {
|
|
154
|
+
if (dims === null || typeof dims !== "object") {
|
|
155
|
+
throw new RunRecordValidationError(
|
|
156
|
+
"per-judge entry must be an object of dimension scores",
|
|
157
|
+
`${path}.perJudge.${judgeId}`
|
|
158
|
+
);
|
|
159
|
+
}
|
|
160
|
+
for (const [dim, score] of Object.entries(dims)) {
|
|
161
|
+
expectFiniteNumber(score, `${path}.perJudge.${judgeId}.${dim}`);
|
|
162
|
+
}
|
|
163
|
+
}
|
|
164
|
+
const perDimMean = rec.perDimMean;
|
|
165
|
+
if (perDimMean === null || typeof perDimMean !== "object") {
|
|
166
|
+
throw new RunRecordValidationError("perDimMean must be an object", `${path}.perDimMean`);
|
|
167
|
+
}
|
|
168
|
+
for (const [dim, mean] of Object.entries(perDimMean)) {
|
|
169
|
+
expectFiniteNumber(mean, `${path}.perDimMean.${dim}`);
|
|
170
|
+
}
|
|
171
|
+
expectFiniteNumber(rec.composite, `${path}.composite`);
|
|
172
|
+
if (rec.failedJudges !== void 0) {
|
|
173
|
+
if (!Array.isArray(rec.failedJudges)) {
|
|
174
|
+
throw new RunRecordValidationError(
|
|
175
|
+
"failedJudges must be an array of strings",
|
|
176
|
+
`${path}.failedJudges`
|
|
177
|
+
);
|
|
178
|
+
}
|
|
179
|
+
for (let i = 0; i < rec.failedJudges.length; i++) {
|
|
180
|
+
const id = rec.failedJudges[i];
|
|
181
|
+
if (typeof id !== "string" || id.length === 0) {
|
|
182
|
+
throw new RunRecordValidationError(
|
|
183
|
+
"failedJudges entry must be a non-empty string",
|
|
184
|
+
`${path}.failedJudges[${i}]`
|
|
185
|
+
);
|
|
186
|
+
}
|
|
187
|
+
}
|
|
188
|
+
}
|
|
189
|
+
if (rec.notes !== void 0 && typeof rec.notes !== "string") {
|
|
190
|
+
throw new RunRecordValidationError("notes must be a string", `${path}.notes`);
|
|
191
|
+
}
|
|
192
|
+
}
|
|
141
193
|
function modelHasSnapshot(model) {
|
|
142
194
|
if (model.includes("@")) return true;
|
|
143
195
|
if (/-\d{8}$/.test(model)) return true;
|
|
@@ -153,4 +205,4 @@ export {
|
|
|
153
205
|
parseRunRecordSafe,
|
|
154
206
|
roundTripRunRecord
|
|
155
207
|
};
|
|
156
|
-
//# sourceMappingURL=chunk-
|
|
208
|
+
//# sourceMappingURL=chunk-ZN2CMQIW.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"sources":["../src/run-record.ts"],"sourcesContent":["/**\n * Paper-grade RunRecord schema + runtime validator.\n *\n * Every run that participates in a promotion gate, paper table, or\n * researcher loop SHOULD be recorded as a `RunRecord`. The mandatory\n * fields are exactly those the paper \"Two Loops, Three Roles\" requires\n * for reproducibility: who/what/when/cost/seed/hash, plus the search vs\n * holdout split tag and either a `searchScore` or a `holdoutScore`.\n *\n * This is intentionally NOT a replacement for the rich `Run` /\n * `ProposeReviewReport` / `ScenarioResult` types already in the\n * package. Those are runtime structures with full provenance. A\n * `RunRecord` is the analysis-time projection — the JSON-friendly\n * row you'd put in a parquet file or paste into a notebook.\n *\n * Validate at the boundary:\n *\n * const rec = validateRunRecord(rawJson) // throws on missing\n * const ok = isRunRecord(rawJson) // boolean check\n * const rec = parseRunRecordSafe(rawJson) // { ok, value | error }\n *\n * The validator runs in pure TS — zod is intentionally NOT a\n * dependency. Round-trip tested in `tests/run-record.test.ts`.\n */\n\n/** Search/dev/holdout split tag. 'search' is the paper-grade alias for the\n * combined train+test pool that the optimizer is allowed to read. */\nexport type RunSplitTag = 'search' | 'dev' | 'holdout'\n\nexport interface RunTokenUsage {\n input: number\n output: number\n cached?: number\n}\n\nexport interface RunJudgeMetadata {\n model: string\n promptVersion: string\n /** [0,1] confidence the judge declared. Constant judge confidence\n * across many runs is a fallback signal (see `canary.ts`). */\n confidence: number\n /** True if the judge degraded to a fallback path (rules-only,\n * prior-call cache, etc.). The canary uses this to alert. 
*/\n fallback: boolean\n}\n\n/**\n * Per-judge / per-dimension breakdown for runs scored by an ensemble of\n * judges over a multi-dimensional rubric.\n *\n * The collapsed `outcome.searchScore` / `holdoutScore` carries the\n * composite the gate uses. The full breakdown belongs here so consumers\n * can answer \"which judge disagreed?\", \"which dimension dragged the\n * composite down?\", and \"did half the panel fail?\" without re-running.\n *\n * `perJudge[judgeId][dim]` is the canonical source; `perDimMean` and\n * `composite` are convenience projections — derivable but precomputed so\n * downstream IRR primitives (`interRaterReliability`,\n * `corpusInterRaterAgreement`) and reporters don't pay the same\n * aggregation twice.\n *\n * Fail-loud discipline: judges that errored out land in `failedJudges`\n * by id. A missing key in `perJudge` is ambiguous (silent zero vs not\n * run); the explicit list makes a partial-failure recorded as such.\n */\nexport interface JudgeScoresRecord {\n /** Per-judge per-dimension scores. `{ \"kimi-k2.6\": { helpfulness: 0.8, clarity: 0.7 }, ... }`. */\n perJudge: Record<string, Record<string, number>>\n /** Per-dim mean across judges. Convenience — derivable from `perJudge`. */\n perDimMean: Record<string, number>\n /** Composite mean across all dims and judges. Mirrors the score\n * the gate sees on `outcome.searchScore` / `holdoutScore`. */\n composite: number\n /** Judges that errored or returned an unparseable verdict. Recorded\n * by id (e.g. `['glm-5.1']`) so a partial-failure case is explicit,\n * not inferred from missing keys in `perJudge`. */\n failedJudges?: string[]\n /** Free-form notes the judges emitted (joined across judges or\n * first-judge only — consumer's choice). */\n notes?: string\n}\n\nexport interface RunOutcome {\n /** Score on the search/optimization split. Optional because a\n * holdout-only evaluation only fills `holdoutScore`. */\n searchScore?: number\n /** Score on the held-out split. 
Optional because a search-only run\n * only fills `searchScore`. At least one must be present. */\n holdoutScore?: number\n /** Bag of any other metric the run produced — judge dimensions,\n * pass/fail counters, latency stats, etc. Numeric only — keeps\n * reporters honest. */\n raw: Record<string, number>\n /** Per-judge / per-dim breakdown. Consumers writing ensemble\n * judgements populate this; substrate primitives like\n * `interRaterReliability` and `corpusInterRaterAgreement` accept\n * these records as input. Optional — single-judge or scalar-only\n * runs leave it unset. */\n judgeScores?: JudgeScoresRecord\n}\n\n/**\n * Mandatory paper-grade fields for a single evaluation run. Optional\n * fields are extension points; mandatory fields throw if missing.\n *\n * Hash discipline:\n * - `promptHash` is the sha256 of the EFFECTIVE prompt sent to the\n * model (after any steering bundle merge).\n * - `configHash` is the sha256 of the effective run config (model,\n * temperature, tools, judges, splits). The pair (promptHash,\n * configHash) uniquely identifies an experimental cell.\n *\n * Model snapshot discipline:\n * - `model` MUST encode a snapshot version. Bare aliases like\n * `claude-sonnet-4` or `gpt-4o` are banned — they remap silently.\n * Use `claude-sonnet-4-6@2025-04-15` or `gpt-4o-2024-11-20`.\n */\nexport interface RunRecord {\n /** UUID for the run. */\n runId: string\n /** Logical experiment grouping (a treatment vs a baseline within\n * the same sweep should share `experimentId`). */\n experimentId: string\n /** Stable identifier for the candidate (variant) being run. The\n * promotion gate compares two `candidateId`s on matched items. */\n candidateId: string\n /** RNG seed for the run. Always recorded — silent re-seeding is\n * the most common cause of non-reproducible numbers. */\n seed: number\n /** Model identifier WITH snapshot version. */\n model: string\n /** sha256 of the effective prompt (post-steering). 
*/\n promptHash: string\n /** sha256 of the effective config. */\n configHash: string\n /** Git SHA the harness was run from. */\n commitSha: string\n /** End-to-end wall-clock duration in milliseconds. */\n wallMs: number\n /** Time spent queued before execution started, if known. */\n queueMs?: number\n /** Total USD cost. Mandatory — runs without a cost number are\n * unbounded by definition and must not be admitted into the gate. */\n costUsd: number\n /** Token usage breakdown. */\n tokenUsage: RunTokenUsage\n /** Judge-side metadata, if a judge was used. */\n judgeMetadata?: RunJudgeMetadata\n /** Per-split scores + raw bag. */\n outcome: RunOutcome\n /** Categorical failure tag, when the run failed and the harness\n * classified it. Free-form string; standard tags live in\n * `failure-taxonomy.ts`. */\n failureMode?: string\n /** Which split this run was drawn from. */\n splitTag: RunSplitTag\n /**\n * Stable scenario identifier the run was scored against. Optional for\n * backwards compatibility, but **strongly recommended**: every primitive\n * that pairs runs by scenario (preferences, paired stats, BT tournament)\n * keys on this. The campaign artifact populates it canonically; legacy\n * runs without it fall back to inference from `outcome.raw.scenario_id`\n * or `experimentId`.\n */\n scenarioId?: string\n}\n\n// ── Validation ───────────────────────────────────────────────────────\n\nconst MANDATORY_TOP_LEVEL = [\n 'runId',\n 'experimentId',\n 'candidateId',\n 'seed',\n 'model',\n 'promptHash',\n 'configHash',\n 'commitSha',\n 'wallMs',\n 'costUsd',\n 'tokenUsage',\n 'outcome',\n 'splitTag',\n] as const\n\nimport { ValidationError } from './errors'\n\nconst SPLIT_TAGS: ReadonlyArray<RunSplitTag> = ['search', 'dev', 'holdout']\n\nexport class RunRecordValidationError extends ValidationError {\n readonly path: string\n constructor(message: string, path = '') {\n super(path ? 
`${message} (at ${path})` : message)\n this.path = path\n }\n}\n\n/**\n * Strict validator. Throws `RunRecordValidationError` on the first\n * missing or wrongly-typed field. Returns the input cast to\n * `RunRecord` on success — the validator does not coerce.\n */\nexport function validateRunRecord(input: unknown): RunRecord {\n if (input === null || typeof input !== 'object') {\n throw new RunRecordValidationError('expected object')\n }\n const obj = input as Record<string, unknown>\n\n for (const key of MANDATORY_TOP_LEVEL) {\n if (!(key in obj)) {\n throw new RunRecordValidationError(`missing mandatory field \"${key}\"`)\n }\n }\n\n expectString(obj.runId, 'runId')\n expectString(obj.experimentId, 'experimentId')\n expectString(obj.candidateId, 'candidateId')\n expectFiniteNumber(obj.seed, 'seed')\n expectString(obj.model, 'model')\n expectString(obj.promptHash, 'promptHash')\n expectString(obj.configHash, 'configHash')\n expectString(obj.commitSha, 'commitSha')\n expectFiniteNumber(obj.wallMs, 'wallMs')\n if (obj.queueMs !== undefined) expectFiniteNumber(obj.queueMs, 'queueMs')\n expectFiniteNumber(obj.costUsd, 'costUsd')\n\n // Snapshot discipline: bare model aliases are not paper-grade.\n if (!modelHasSnapshot(obj.model as string)) {\n throw new RunRecordValidationError(\n `model \"${obj.model}\" lacks a snapshot version (use 'name@YYYY-MM-DD' or 'name-YYYYMMDD')`,\n 'model',\n )\n }\n\n // Token usage.\n const tu = obj.tokenUsage\n if (tu === null || typeof tu !== 'object') {\n throw new RunRecordValidationError('tokenUsage must be an object', 'tokenUsage')\n }\n const tuRec = tu as Record<string, unknown>\n expectFiniteNumber(tuRec.input, 'tokenUsage.input')\n expectFiniteNumber(tuRec.output, 'tokenUsage.output')\n if (tuRec.cached !== undefined) expectFiniteNumber(tuRec.cached, 'tokenUsage.cached')\n\n // Judge metadata, optional.\n if (obj.judgeMetadata !== undefined) {\n const jm = obj.judgeMetadata\n if (jm === null || typeof jm !== 'object') {\n throw 
new RunRecordValidationError('judgeMetadata must be an object', 'judgeMetadata')\n }\n const jmRec = jm as Record<string, unknown>\n expectString(jmRec.model, 'judgeMetadata.model')\n expectString(jmRec.promptVersion, 'judgeMetadata.promptVersion')\n expectFiniteNumber(jmRec.confidence, 'judgeMetadata.confidence')\n if (typeof jmRec.fallback !== 'boolean') {\n throw new RunRecordValidationError(\n 'judgeMetadata.fallback must be boolean',\n 'judgeMetadata.fallback',\n )\n }\n }\n\n // Outcome.\n const out = obj.outcome\n if (out === null || typeof out !== 'object') {\n throw new RunRecordValidationError('outcome must be an object', 'outcome')\n }\n const outRec = out as Record<string, unknown>\n if (outRec.searchScore !== undefined)\n expectFiniteNumber(outRec.searchScore, 'outcome.searchScore')\n if (outRec.holdoutScore !== undefined)\n expectFiniteNumber(outRec.holdoutScore, 'outcome.holdoutScore')\n if (outRec.searchScore === undefined && outRec.holdoutScore === undefined) {\n throw new RunRecordValidationError(\n 'outcome must define searchScore or holdoutScore (or both)',\n 'outcome',\n )\n }\n const raw = outRec.raw\n if (raw === null || typeof raw !== 'object') {\n throw new RunRecordValidationError('outcome.raw must be an object', 'outcome.raw')\n }\n for (const [k, v] of Object.entries(raw as Record<string, unknown>)) {\n expectFiniteNumber(v, `outcome.raw.${k}`)\n }\n\n // Per-judge / per-dim breakdown, optional.\n if (outRec.judgeScores !== undefined) {\n validateJudgeScores(outRec.judgeScores, 'outcome.judgeScores')\n }\n\n // Failure mode optional.\n if (obj.failureMode !== undefined) expectString(obj.failureMode, 'failureMode')\n\n // Split tag.\n if (typeof obj.splitTag !== 'string' || !SPLIT_TAGS.includes(obj.splitTag as RunSplitTag)) {\n throw new RunRecordValidationError(\n `splitTag must be one of ${SPLIT_TAGS.join(', ')}, got ${String(obj.splitTag)}`,\n 'splitTag',\n )\n }\n\n return input as RunRecord\n}\n\n/** Boolean validator — convenience 
for filtering arrays. */\nexport function isRunRecord(input: unknown): input is RunRecord {\n try {\n validateRunRecord(input)\n return true\n } catch {\n return false\n }\n}\n\n/** Non-throwing validator — returns a discriminated union. */\nexport function parseRunRecordSafe(\n input: unknown,\n): { ok: true; value: RunRecord } | { ok: false; error: RunRecordValidationError } {\n try {\n return { ok: true, value: validateRunRecord(input) }\n } catch (e) {\n if (e instanceof RunRecordValidationError) return { ok: false, error: e }\n throw e\n }\n}\n\n/** Round-trip helper — `JSON.parse(JSON.stringify(record))` then validate. */\nexport function roundTripRunRecord(record: RunRecord): RunRecord {\n const json = JSON.stringify(record)\n return validateRunRecord(JSON.parse(json))\n}\n\n// ── Internals ────────────────────────────────────────────────────────\n\nfunction expectString(value: unknown, path: string): void {\n if (typeof value !== 'string' || value.length === 0) {\n throw new RunRecordValidationError(`expected non-empty string`, path)\n }\n}\n\nfunction expectFiniteNumber(value: unknown, path: string): void {\n if (typeof value !== 'number' || !Number.isFinite(value)) {\n throw new RunRecordValidationError(`expected finite number`, path)\n }\n}\n\nfunction validateJudgeScores(value: unknown, path: string): void {\n if (value === null || typeof value !== 'object') {\n throw new RunRecordValidationError('judgeScores must be an object', path)\n }\n const rec = value as Record<string, unknown>\n\n const perJudge = rec.perJudge\n if (perJudge === null || typeof perJudge !== 'object') {\n throw new RunRecordValidationError('perJudge must be an object', `${path}.perJudge`)\n }\n for (const [judgeId, dims] of Object.entries(perJudge as Record<string, unknown>)) {\n if (dims === null || typeof dims !== 'object') {\n throw new RunRecordValidationError(\n 'per-judge entry must be an object of dimension scores',\n `${path}.perJudge.${judgeId}`,\n )\n }\n for (const 
[dim, score] of Object.entries(dims as Record<string, unknown>)) {\n expectFiniteNumber(score, `${path}.perJudge.${judgeId}.${dim}`)\n }\n }\n\n const perDimMean = rec.perDimMean\n if (perDimMean === null || typeof perDimMean !== 'object') {\n throw new RunRecordValidationError('perDimMean must be an object', `${path}.perDimMean`)\n }\n for (const [dim, mean] of Object.entries(perDimMean as Record<string, unknown>)) {\n expectFiniteNumber(mean, `${path}.perDimMean.${dim}`)\n }\n\n expectFiniteNumber(rec.composite, `${path}.composite`)\n\n if (rec.failedJudges !== undefined) {\n if (!Array.isArray(rec.failedJudges)) {\n throw new RunRecordValidationError(\n 'failedJudges must be an array of strings',\n `${path}.failedJudges`,\n )\n }\n for (let i = 0; i < rec.failedJudges.length; i++) {\n const id = rec.failedJudges[i]\n if (typeof id !== 'string' || id.length === 0) {\n throw new RunRecordValidationError(\n 'failedJudges entry must be a non-empty string',\n `${path}.failedJudges[${i}]`,\n )\n }\n }\n }\n\n if (rec.notes !== undefined && typeof rec.notes !== 'string') {\n throw new RunRecordValidationError('notes must be a string', `${path}.notes`)\n }\n}\n\n/**\n * Heuristic snapshot check. 
Accepts:\n * - `name@YYYY-MM-DD` (Anthropic style: `claude-sonnet-4-6@2025-04-15`)\n * - `name-YYYYMMDD` (OpenAI style: `gpt-4o-2024-11-20`)\n * - `name@<arbitrary-token>` (allow opaque snapshots like `@v3`)\n * - explicit `:date-...` Vertex-style tags\n *\n * Rejects bare aliases like `claude-sonnet-4` or `gpt-4o` that remap\n * silently as providers ship new snapshots.\n */\nfunction modelHasSnapshot(model: string): boolean {\n if (model.includes('@')) return true\n if (/-\\d{8}$/.test(model)) return true\n if (/-\\d{4}-\\d{2}-\\d{2}$/.test(model)) return true\n if (/:date-/.test(model)) return true\n return false\n}\n"],"mappings":";;;;;AAyKA,IAAM,sBAAsB;AAAA,EAC1B;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AACF;AAIA,IAAM,aAAyC,CAAC,UAAU,OAAO,SAAS;AAEnE,IAAM,2BAAN,cAAuC,gBAAgB;AAAA,EACnD;AAAA,EACT,YAAY,SAAiB,OAAO,IAAI;AACtC,UAAM,OAAO,GAAG,OAAO,QAAQ,IAAI,MAAM,OAAO;AAChD,SAAK,OAAO;AAAA,EACd;AACF;AAOO,SAAS,kBAAkB,OAA2B;AAC3D,MAAI,UAAU,QAAQ,OAAO,UAAU,UAAU;AAC/C,UAAM,IAAI,yBAAyB,iBAAiB;AAAA,EACtD;AACA,QAAM,MAAM;AAEZ,aAAW,OAAO,qBAAqB;AACrC,QAAI,EAAE,OAAO,MAAM;AACjB,YAAM,IAAI,yBAAyB,4BAA4B,GAAG,GAAG;AAAA,IACvE;AAAA,EACF;AAEA,eAAa,IAAI,OAAO,OAAO;AAC/B,eAAa,IAAI,cAAc,cAAc;AAC7C,eAAa,IAAI,aAAa,aAAa;AAC3C,qBAAmB,IAAI,MAAM,MAAM;AACnC,eAAa,IAAI,OAAO,OAAO;AAC/B,eAAa,IAAI,YAAY,YAAY;AACzC,eAAa,IAAI,YAAY,YAAY;AACzC,eAAa,IAAI,WAAW,WAAW;AACvC,qBAAmB,IAAI,QAAQ,QAAQ;AACvC,MAAI,IAAI,YAAY,OAAW,oBAAmB,IAAI,SAAS,SAAS;AACxE,qBAAmB,IAAI,SAAS,SAAS;AAGzC,MAAI,CAAC,iBAAiB,IAAI,KAAe,GAAG;AAC1C,UAAM,IAAI;AAAA,MACR,UAAU,IAAI,KAAK;AAAA,MACnB;AAAA,IACF;AAAA,EACF;AAGA,QAAM,KAAK,IAAI;AACf,MAAI,OAAO,QAAQ,OAAO,OAAO,UAAU;AACzC,UAAM,IAAI,yBAAyB,gCAAgC,YAAY;AAAA,EACjF;AACA,QAAM,QAAQ;AACd,qBAAmB,MAAM,OAAO,kBAAkB;AAClD,qBAAmB,MAAM,QAAQ,mBAAmB;AACpD,MAAI,MAAM,WAAW,OAAW,oBAAmB,MAAM,QAAQ,mBAAmB;AAGpF,MAAI,IAAI,kBAAkB,QAAW;AACnC,UAAM,KAAK,IAAI;AACf,QAAI,OAAO,QAAQ,OAAO,OAAO,UAAU;AACzC,YAAM,IAAI,yBAAyB,mCAAmC,eAAe;AAAA,IACvF;AACA,UAAM,Q
AAQ;AACd,iBAAa,MAAM,OAAO,qBAAqB;AAC/C,iBAAa,MAAM,eAAe,6BAA6B;AAC/D,uBAAmB,MAAM,YAAY,0BAA0B;AAC/D,QAAI,OAAO,MAAM,aAAa,WAAW;AACvC,YAAM,IAAI;AAAA,QACR;AAAA,QACA;AAAA,MACF;AAAA,IACF;AAAA,EACF;AAGA,QAAM,MAAM,IAAI;AAChB,MAAI,QAAQ,QAAQ,OAAO,QAAQ,UAAU;AAC3C,UAAM,IAAI,yBAAyB,6BAA6B,SAAS;AAAA,EAC3E;AACA,QAAM,SAAS;AACf,MAAI,OAAO,gBAAgB;AACzB,uBAAmB,OAAO,aAAa,qBAAqB;AAC9D,MAAI,OAAO,iBAAiB;AAC1B,uBAAmB,OAAO,cAAc,sBAAsB;AAChE,MAAI,OAAO,gBAAgB,UAAa,OAAO,iBAAiB,QAAW;AACzE,UAAM,IAAI;AAAA,MACR;AAAA,MACA;AAAA,IACF;AAAA,EACF;AACA,QAAM,MAAM,OAAO;AACnB,MAAI,QAAQ,QAAQ,OAAO,QAAQ,UAAU;AAC3C,UAAM,IAAI,yBAAyB,iCAAiC,aAAa;AAAA,EACnF;AACA,aAAW,CAAC,GAAG,CAAC,KAAK,OAAO,QAAQ,GAA8B,GAAG;AACnE,uBAAmB,GAAG,eAAe,CAAC,EAAE;AAAA,EAC1C;AAGA,MAAI,OAAO,gBAAgB,QAAW;AACpC,wBAAoB,OAAO,aAAa,qBAAqB;AAAA,EAC/D;AAGA,MAAI,IAAI,gBAAgB,OAAW,cAAa,IAAI,aAAa,aAAa;AAG9E,MAAI,OAAO,IAAI,aAAa,YAAY,CAAC,WAAW,SAAS,IAAI,QAAuB,GAAG;AACzF,UAAM,IAAI;AAAA,MACR,2BAA2B,WAAW,KAAK,IAAI,CAAC,SAAS,OAAO,IAAI,QAAQ,CAAC;AAAA,MAC7E;AAAA,IACF;AAAA,EACF;AAEA,SAAO;AACT;AAGO,SAAS,YAAY,OAAoC;AAC9D,MAAI;AACF,sBAAkB,KAAK;AACvB,WAAO;AAAA,EACT,QAAQ;AACN,WAAO;AAAA,EACT;AACF;AAGO,SAAS,mBACd,OACiF;AACjF,MAAI;AACF,WAAO,EAAE,IAAI,MAAM,OAAO,kBAAkB,KAAK,EAAE;AAAA,EACrD,SAAS,GAAG;AACV,QAAI,aAAa,yBAA0B,QAAO,EAAE,IAAI,OAAO,OAAO,EAAE;AACxE,UAAM;AAAA,EACR;AACF;AAGO,SAAS,mBAAmB,QAA8B;AAC/D,QAAM,OAAO,KAAK,UAAU,MAAM;AAClC,SAAO,kBAAkB,KAAK,MAAM,IAAI,CAAC;AAC3C;AAIA,SAAS,aAAa,OAAgB,MAAoB;AACxD,MAAI,OAAO,UAAU,YAAY,MAAM,WAAW,GAAG;AACnD,UAAM,IAAI,yBAAyB,6BAA6B,IAAI;AAAA,EACtE;AACF;AAEA,SAAS,mBAAmB,OAAgB,MAAoB;AAC9D,MAAI,OAAO,UAAU,YAAY,CAAC,OAAO,SAAS,KAAK,GAAG;AACxD,UAAM,IAAI,yBAAyB,0BAA0B,IAAI;AAAA,EACnE;AACF;AAEA,SAAS,oBAAoB,OAAgB,MAAoB;AAC/D,MAAI,UAAU,QAAQ,OAAO,UAAU,UAAU;AAC/C,UAAM,IAAI,yBAAyB,iCAAiC,IAAI;AAAA,EAC1E;AACA,QAAM,MAAM;AAEZ,QAAM,WAAW,IAAI;AACrB,MAAI,aAAa,QAAQ,OAAO,aAAa,UAAU;AACrD,UAAM,IAAI,yBAAyB,8BAA8B,GAAG,IAAI,WAAW;AAAA,EACrF;AACA,aAAW,CAAC,SAAS,IAAI,KAAK,OAAO,QAAQ,QAAmC,GAAG;AACjF,QAAI,SAAS,QAAQ,OAAO,SAAS,UAAU;AAC7C,YAAM,IAAI;AAAA,QACR;AAAA,QACA,GAA
G,IAAI,aAAa,OAAO;AAAA,MAC7B;AAAA,IACF;AACA,eAAW,CAAC,KAAK,KAAK,KAAK,OAAO,QAAQ,IAA+B,GAAG;AAC1E,yBAAmB,OAAO,GAAG,IAAI,aAAa,OAAO,IAAI,GAAG,EAAE;AAAA,IAChE;AAAA,EACF;AAEA,QAAM,aAAa,IAAI;AACvB,MAAI,eAAe,QAAQ,OAAO,eAAe,UAAU;AACzD,UAAM,IAAI,yBAAyB,gCAAgC,GAAG,IAAI,aAAa;AAAA,EACzF;AACA,aAAW,CAAC,KAAK,IAAI,KAAK,OAAO,QAAQ,UAAqC,GAAG;AAC/E,uBAAmB,MAAM,GAAG,IAAI,eAAe,GAAG,EAAE;AAAA,EACtD;AAEA,qBAAmB,IAAI,WAAW,GAAG,IAAI,YAAY;AAErD,MAAI,IAAI,iBAAiB,QAAW;AAClC,QAAI,CAAC,MAAM,QAAQ,IAAI,YAAY,GAAG;AACpC,YAAM,IAAI;AAAA,QACR;AAAA,QACA,GAAG,IAAI;AAAA,MACT;AAAA,IACF;AACA,aAAS,IAAI,GAAG,IAAI,IAAI,aAAa,QAAQ,KAAK;AAChD,YAAM,KAAK,IAAI,aAAa,CAAC;AAC7B,UAAI,OAAO,OAAO,YAAY,GAAG,WAAW,GAAG;AAC7C,cAAM,IAAI;AAAA,UACR;AAAA,UACA,GAAG,IAAI,iBAAiB,CAAC;AAAA,QAC3B;AAAA,MACF;AAAA,IACF;AAAA,EACF;AAEA,MAAI,IAAI,UAAU,UAAa,OAAO,IAAI,UAAU,UAAU;AAC5D,UAAM,IAAI,yBAAyB,0BAA0B,GAAG,IAAI,QAAQ;AAAA,EAC9E;AACF;AAYA,SAAS,iBAAiB,OAAwB;AAChD,MAAI,MAAM,SAAS,GAAG,EAAG,QAAO;AAChC,MAAI,UAAU,KAAK,KAAK,EAAG,QAAO;AAClC,MAAI,sBAAsB,KAAK,KAAK,EAAG,QAAO;AAC9C,MAAI,SAAS,KAAK,KAAK,EAAG,QAAO;AACjC,SAAO;AACT;","names":[]}
|
|
@@ -2,7 +2,7 @@ import { b as FeedbackLabel, p as ProposedSideEffect } from './feedback-trajecto
|
|
|
2
2
|
import { C as ControlEvalResult, a as ControlRunResult, h as ControlRuntimeConfig } from './control-runtime-BZ_lVLYW.js';
|
|
3
3
|
import { T as TraceEmitter } from './emitter-DP_cSSiw.js';
|
|
4
4
|
import { T as TraceStore, F as FailureClass } from './store-Db2Bv8Cf.js';
|
|
5
|
-
import { a as RunSplitTag, b as RunTokenUsage, R as RunRecord } from './run-record-
|
|
5
|
+
import { a as RunSplitTag, b as RunTokenUsage, R as RunRecord } from './run-record-BfX5y68A.js';
|
|
6
6
|
|
|
7
7
|
interface ActionExecutionPolicy {
|
|
8
8
|
allowedTypes?: string[];
|
package/dist/control.d.ts
CHANGED
|
@@ -1,8 +1,8 @@
|
|
|
1
|
-
export { A as ActionExecutionPolicy, a as ActionPolicyDecision, C as ControlRunToRunRecordOptions, e as ProposeReviewConfig, f as ProposeReviewControlAction, g as ProposeReviewControlConfig, h as ProposeReviewControlResult, i as ProposeReviewControlState, j as ProposeReviewReport, p as RunEvidenceMetadata, s as controlRunToRunRecord, u as evaluateActionPolicy, x as runProposeReview, y as runProposeReviewAsControlLoop, z as scoreFromEvals } from './control-
|
|
1
|
+
export { A as ActionExecutionPolicy, a as ActionPolicyDecision, C as ControlRunToRunRecordOptions, e as ProposeReviewConfig, f as ProposeReviewControlAction, g as ProposeReviewControlConfig, h as ProposeReviewControlResult, i as ProposeReviewControlState, j as ProposeReviewReport, p as RunEvidenceMetadata, s as controlRunToRunRecord, u as evaluateActionPolicy, x as runProposeReview, y as runProposeReviewAsControlLoop, z as scoreFromEvals } from './control-p2ns7elI.js';
|
|
2
2
|
export { c as ControlActionFailureMode, d as ControlActionOutcome, e as ControlBudget, f as ControlContext, g as ControlDecision, C as ControlEvalResult, a as ControlRunResult, h as ControlRuntimeConfig, i as ControlRuntimeError, j as ControlSeverity, b as ControlStep, k as ControlStopPolicies, S as StopDecision, l as allCriticalPassed, o as objectiveEval, r as runAgentControlLoop, s as stopOnNoProgress, m as stopOnRepeatedAction, n as subjectiveEval } from './control-runtime-BZ_lVLYW.js';
|
|
3
3
|
import './feedback-trajectory-iATEAHmc.js';
|
|
4
4
|
import './dataset-ueRVTUoY.js';
|
|
5
5
|
import './errors-mje_cKOs.js';
|
|
6
6
|
import './emitter-DP_cSSiw.js';
|
|
7
7
|
import './store-Db2Bv8Cf.js';
|
|
8
|
-
import './run-record-
|
|
8
|
+
import './run-record-BfX5y68A.js';
|
package/dist/control.js
CHANGED
|
@@ -4,7 +4,7 @@ import {
|
|
|
4
4
|
runProposeReview,
|
|
5
5
|
runProposeReviewAsControlLoop,
|
|
6
6
|
scoreFromEvals
|
|
7
|
-
} from "./chunk-
|
|
7
|
+
} from "./chunk-S4Y5VXMS.js";
|
|
8
8
|
import {
|
|
9
9
|
allCriticalPassed,
|
|
10
10
|
objectiveEval,
|
|
@@ -13,7 +13,7 @@ import {
|
|
|
13
13
|
stopOnRepeatedAction,
|
|
14
14
|
subjectiveEval
|
|
15
15
|
} from "./chunk-NCRFYPS3.js";
|
|
16
|
-
import "./chunk-
|
|
16
|
+
import "./chunk-ZN2CMQIW.js";
|
|
17
17
|
import "./chunk-TVVP3ZZQ.js";
|
|
18
18
|
import "./chunk-QYJT52YW.js";
|
|
19
19
|
import "./chunk-PZ5AY32C.js";
|