@tangle-network/agent-eval 0.59.1 → 0.60.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/adapters/http.d.ts +1 -1
- package/dist/adapters/http.js +1 -1
- package/dist/adapters/langchain.d.ts +1 -1
- package/dist/adapters/langchain.js +1 -1
- package/dist/adapters/otel.d.ts +2 -2
- package/dist/adapters/otel.js +1 -1
- package/dist/benchmarks/index.js +2 -2
- package/dist/builder-eval/index.js +1 -1
- package/dist/campaign/index.d.ts +7 -3
- package/dist/campaign/index.js +21 -16
- package/dist/campaign/index.js.map +1 -1
- package/dist/{chunk-MHQPVHXU.js → chunk-6QDKWHLS.js} +2 -2
- package/dist/{chunk-N4SBKEPJ.js → chunk-GBHRUAOF.js} +106 -1
- package/dist/chunk-GBHRUAOF.js.map +1 -0
- package/dist/{chunk-JB4UWIM6.js → chunk-LBSXXH56.js} +265 -14
- package/dist/chunk-LBSXXH56.js.map +1 -0
- package/dist/{chunk-74Y2EMNH.js → chunk-NOPYCRNG.js} +6 -5
- package/dist/{chunk-74Y2EMNH.js.map → chunk-NOPYCRNG.js.map} +1 -1
- package/dist/chunk-PZ5AY32C.js +10 -0
- package/dist/chunk-SHTXZ4O2.js +113 -0
- package/dist/chunk-SHTXZ4O2.js.map +1 -0
- package/dist/cli.js +1 -1
- package/dist/contract/index.d.ts +42 -10
- package/dist/contract/index.js +55 -15
- package/dist/contract/index.js.map +1 -1
- package/dist/control.js +1 -1
- package/dist/governance/index.js +1 -1
- package/dist/hosted/index.d.ts +2 -2
- package/dist/hosted/index.js +1 -1
- package/dist/{index-D2nT6_KT.d.ts → index-BIkvdkSU.d.ts} +1 -1
- package/dist/index.js +8 -8
- package/dist/knowledge/index.js +1 -1
- package/dist/matrix/index.js +1 -1
- package/dist/meta-eval/index.js +1 -1
- package/dist/multishot/index.js +1 -1
- package/dist/openapi.json +1 -1
- package/dist/pipelines/index.js +1 -1
- package/dist/prm/index.js +1 -1
- package/dist/{run-improvement-loop-BhfdjrMY.d.ts → provenance-BM8vmMBa.d.ts} +205 -3
- package/dist/reporting.js +1 -1
- package/dist/rl.d.ts +1 -1
- package/dist/rl.js +1 -1
- package/dist/{run-campaign-ZURVWMMI.js → run-campaign-5XENUKRF.js} +3 -3
- package/dist/telemetry/file.js +1 -1
- package/dist/telemetry/index.js +1 -1
- package/dist/traces.js +1 -1
- package/dist/{types-BgrxOJSf.d.ts → types-VCIXx_yo.d.ts} +32 -4
- package/dist/wire/index.js +1 -1
- package/package.json +25 -12
- package/dist/chunk-JB4UWIM6.js.map +0 -1
- package/dist/chunk-N4SBKEPJ.js.map +0 -1
- package/dist/chunk-NSBPE2FW.js +0 -17
- package/dist/chunk-ZWEQJIM6.js +0 -220
- package/dist/chunk-ZWEQJIM6.js.map +0 -1
- /package/dist/{chunk-MHQPVHXU.js.map → chunk-6QDKWHLS.js.map} +0 -0
- /package/dist/{chunk-NSBPE2FW.js.map → chunk-PZ5AY32C.js.map} +0 -0
- /package/dist/{run-campaign-ZURVWMMI.js.map → run-campaign-5XENUKRF.js.map} +0 -0
package/dist/adapters/http.d.ts
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { S as Scenario, D as DispatchFn, b as DispatchContext } from '../types-
|
|
1
|
+
import { S as Scenario, D as DispatchFn, b as DispatchContext } from '../types-VCIXx_yo.js';
|
|
2
2
|
|
|
3
3
|
/**
|
|
4
4
|
* # `@tangle-network/agent-eval/adapters/http` — distributed Dispatch over HTTP.
|
package/dist/adapters/http.js
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { S as Scenario, J as JudgeScore, D as DispatchFn, a as JudgeConfig } from '../types-
|
|
1
|
+
import { S as Scenario, J as JudgeScore, D as DispatchFn, a as JudgeConfig } from '../types-VCIXx_yo.js';
|
|
2
2
|
|
|
3
3
|
/**
|
|
4
4
|
* # `@tangle-network/agent-eval/adapters/langchain` — wrap any LangChain
|
package/dist/adapters/otel.d.ts
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
import { T as TraceSpanEvent, H as HostedClient } from '../index-
|
|
2
|
-
import '../types-
|
|
1
|
+
import { T as TraceSpanEvent, H as HostedClient } from '../index-BIkvdkSU.js';
|
|
2
|
+
import '../types-VCIXx_yo.js';
|
|
3
3
|
import '../summary-report-DLxh4yWk.js';
|
|
4
4
|
import '../run-record-etiCMsUq.js';
|
|
5
5
|
import '../errors-mje_cKOs.js';
|
package/dist/adapters/otel.js
CHANGED
package/dist/benchmarks/index.js
CHANGED
package/dist/campaign/index.d.ts
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
|
-
export { C as CampaignStorage, D as DefaultProductionGateOptions, E as EvolutionaryDriverOptions, G as GepaDriverConstraints,
|
|
2
|
-
import { L as LabeledScenarioStore, c as LabeledScenarioWrite, d as LabeledScenarioSampleArgs, e as LabeledScenarioRecord, f as LabelTrust, C as CodeSurface } from '../types-
|
|
3
|
-
export { g as CampaignAggregates, h as CampaignArtifactWriter, i as CampaignCellResult, j as CampaignCostMeter, k as CampaignResult, l as CampaignTraceWriter, b as DispatchContext, D as DispatchFn, G as Gate, m as GateContext, n as GateDecision, o as GateResult, p as GenerationCandidate, q as GenerationRecord, I as ImprovementDriver, r as JudgeAggregate, a as JudgeConfig, s as JudgeDimension, J as JudgeScore, t as LabeledScenarioSource, M as MutableSurface, u as Mutator, O as OptimizerConfig, P as ProposeContext, R as RedactionStatus, S as Scenario,
|
|
1
|
+
export { B as BuildLoopProvenanceArgs, C as CampaignStorage, D as DefaultProductionGateOptions, E as EmitLoopProvenanceArgs, a as EmitLoopProvenanceResult, b as EvolutionaryDriverOptions, G as GepaDriverConstraints, c as GepaDriverOptions, H as HeldOutGateOptions, L as LoopProvenanceBackend, d as LoopProvenanceCandidate, e as LoopProvenanceRecord, O as OpenAutoPrOptions, f as OpenAutoPrResult, R as RunCampaignOptions, g as RunEvalOptions, h as RunImprovementLoopOptions, i as RunImprovementLoopResult, j as RunOptimizationOptions, k as RunOptimizationResult, l as buildLoopProvenanceRecord, m as composeGate, n as countSentenceEdits, o as defaultProductionGate, p as defaultRenderDiff, q as emitLoopProvenance, r as evolutionaryDriver, s as extractH2Sections, t as fsCampaignStorage, u as gepaDriver, v as heldOutGate, w as inMemoryCampaignStorage, x as loopProvenanceSpans, y as openAutoPr, z as provenanceRecordPath, A as provenanceSpansPath, F as runCampaign, I as runEval, J as runImprovementLoop, K as runOptimization, M as surfaceContentHash, N as surfaceHash } from '../provenance-BM8vmMBa.js';
|
|
2
|
+
import { L as LabeledScenarioStore, c as LabeledScenarioWrite, d as LabeledScenarioSampleArgs, e as LabeledScenarioRecord, f as LabelTrust, C as CodeSurface } from '../types-VCIXx_yo.js';
|
|
3
|
+
export { g as CampaignAggregates, h as CampaignArtifactWriter, i as CampaignCellResult, j as CampaignCostMeter, k as CampaignResult, l as CampaignTraceWriter, b as DispatchContext, D as DispatchFn, G as Gate, m as GateContext, n as GateDecision, o as GateResult, p as GenerationCandidate, q as GenerationRecord, I as ImprovementDriver, r as JudgeAggregate, a as JudgeConfig, s as JudgeDimension, J as JudgeScore, t as LabeledScenarioSource, M as MutableSurface, u as Mutator, O as OptimizerConfig, P as ProposeContext, v as ProposedCandidate, R as RedactionStatus, S as Scenario, w as ScenarioAggregate, x as SessionScript, T as TraceSpan, y as isProposedCandidate, z as labelTrustRank } from '../types-VCIXx_yo.js';
|
|
4
4
|
import '../llm-client-BXVRUZyX.js';
|
|
5
5
|
import '../errors-mje_cKOs.js';
|
|
6
6
|
import '../raw-provider-sink-C46HDghv.js';
|
|
@@ -9,6 +9,10 @@ import '../dataset-BlwAtYYf.js';
|
|
|
9
9
|
import '../store-CKUAgsJz.js';
|
|
10
10
|
import '../schema-m0gsnbt3.js';
|
|
11
11
|
import '../run-record-etiCMsUq.js';
|
|
12
|
+
import '../index-BIkvdkSU.js';
|
|
13
|
+
import '../summary-report-DLxh4yWk.js';
|
|
14
|
+
import '../failure-cluster-CL7IVgkJ.js';
|
|
15
|
+
import '../judge-calibration-DilmB3Ml.js';
|
|
12
16
|
|
|
13
17
|
/**
|
|
14
18
|
* @experimental
|
package/dist/campaign/index.js
CHANGED
|
@@ -1,47 +1,44 @@
|
|
|
1
1
|
import {
|
|
2
|
+
buildLoopProvenanceRecord,
|
|
2
3
|
composeGate,
|
|
3
4
|
countSentenceEdits,
|
|
4
5
|
defaultProductionGate,
|
|
6
|
+
defaultRenderDiff,
|
|
7
|
+
emitLoopProvenance,
|
|
5
8
|
evolutionaryDriver,
|
|
6
9
|
extractH2Sections,
|
|
7
10
|
gepaDriver,
|
|
8
11
|
heldOutGate,
|
|
12
|
+
isProposedCandidate,
|
|
13
|
+
labelTrustRank,
|
|
14
|
+
loopProvenanceSpans,
|
|
9
15
|
openAutoPr,
|
|
16
|
+
provenanceRecordPath,
|
|
17
|
+
provenanceSpansPath,
|
|
10
18
|
runEval,
|
|
11
19
|
runImprovementLoop,
|
|
12
20
|
runOptimization,
|
|
21
|
+
surfaceContentHash,
|
|
13
22
|
surfaceHash
|
|
14
|
-
} from "../chunk-
|
|
23
|
+
} from "../chunk-LBSXXH56.js";
|
|
15
24
|
import {
|
|
16
25
|
fsCampaignStorage,
|
|
17
26
|
inMemoryCampaignStorage,
|
|
18
27
|
runCampaign
|
|
19
|
-
} from "../chunk-
|
|
20
|
-
import "../chunk-
|
|
28
|
+
} from "../chunk-NOPYCRNG.js";
|
|
29
|
+
import "../chunk-GBHRUAOF.js";
|
|
21
30
|
import "../chunk-YV7J7X5N.js";
|
|
22
31
|
import "../chunk-S3SDD56V.js";
|
|
23
32
|
import "../chunk-GGE4NNQT.js";
|
|
24
33
|
import "../chunk-VXNVVBZO.js";
|
|
25
34
|
import "../chunk-PC4UYEBM.js";
|
|
26
35
|
import "../chunk-QYJT52YW.js";
|
|
27
|
-
import "../chunk-
|
|
36
|
+
import "../chunk-PZ5AY32C.js";
|
|
28
37
|
|
|
29
38
|
// src/campaign/labeled-store/fs-adapter.ts
|
|
30
39
|
import { createHash } from "crypto";
|
|
31
40
|
import { existsSync, mkdirSync, readFileSync, writeFileSync } from "fs";
|
|
32
41
|
import { join } from "path";
|
|
33
|
-
|
|
34
|
-
// src/campaign/types.ts
|
|
35
|
-
var LABEL_TRUST_RANK = {
|
|
36
|
-
unverified: 0,
|
|
37
|
-
"verified-signal": 1,
|
|
38
|
-
"human-rated": 2
|
|
39
|
-
};
|
|
40
|
-
function labelTrustRank(trust) {
|
|
41
|
-
return LABEL_TRUST_RANK[trust ?? "unverified"];
|
|
42
|
-
}
|
|
43
|
-
|
|
44
|
-
// src/campaign/labeled-store/fs-adapter.ts
|
|
45
42
|
var LabeledScenarioStoreError = class extends Error {
|
|
46
43
|
constructor(code, message) {
|
|
47
44
|
super(message);
|
|
@@ -312,9 +309,12 @@ export {
|
|
|
312
309
|
FsLabeledScenarioStore,
|
|
313
310
|
LabeledScenarioStoreError,
|
|
314
311
|
WorktreeAdapterError,
|
|
312
|
+
buildLoopProvenanceRecord,
|
|
315
313
|
composeGate,
|
|
316
314
|
countSentenceEdits,
|
|
317
315
|
defaultProductionGate,
|
|
316
|
+
defaultRenderDiff,
|
|
317
|
+
emitLoopProvenance,
|
|
318
318
|
evolutionaryDriver,
|
|
319
319
|
extractH2Sections,
|
|
320
320
|
fsCampaignStorage,
|
|
@@ -322,13 +322,18 @@ export {
|
|
|
322
322
|
gitWorktreeAdapter,
|
|
323
323
|
heldOutGate,
|
|
324
324
|
inMemoryCampaignStorage,
|
|
325
|
+
isProposedCandidate,
|
|
325
326
|
labelTrustRank,
|
|
327
|
+
loopProvenanceSpans,
|
|
326
328
|
openAutoPr,
|
|
329
|
+
provenanceRecordPath,
|
|
330
|
+
provenanceSpansPath,
|
|
327
331
|
resolveWorktreePath,
|
|
328
332
|
runCampaign,
|
|
329
333
|
runEval,
|
|
330
334
|
runImprovementLoop,
|
|
331
335
|
runOptimization,
|
|
336
|
+
surfaceContentHash,
|
|
332
337
|
surfaceHash
|
|
333
338
|
};
|
|
334
339
|
//# sourceMappingURL=index.js.map
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"sources":["../../src/campaign/labeled-store/fs-adapter.ts","../../src/campaign/types.ts","../../src/campaign/worktree/index.ts"],"sourcesContent":["/**\n * @experimental\n *\n * Filesystem `LabeledScenarioStore` adapter. The default capture sink for\n * traces + eval artifacts. Production deployments typically swap for a\n * Turso/SQLite adapter (same interface).\n *\n * Records land as one JSONL file per source under `<root>/<source>.jsonl`.\n * Each line is a `LabeledScenarioRecord`. Append-only — no in-place edits.\n *\n * Safety properties enforced at write-time:\n *\n * - **Provenance required**: writes without `source`, `sourceVersionHash`,\n * `capturedAt`, `redactionStatus` are rejected. Closes the alignment\n * reviewer's data-poisoning gap.\n * - **Per-source rate limits**: optional `rateLimitBucket` + `maxWritesPerMinute`\n * stops a single tenant/source from flooding the store.\n *\n * Safety properties enforced at sample-time:\n *\n * - **Required split + capturedBefore**: substrate refuses to sample without\n * an explicit `split` ('train' | 'test') AND a temporal cutoff. Eliminates\n * accidental train/test contamination.\n * - **Default training-source filter**: when the store is sampled with\n * `split: 'train'`, production-trace records are EXCLUDED unless the\n * caller passes `filter.source: 'production-trace'` explicitly. Closes\n * the contamination-by-default gap flagged by the senior eval engineer.\n */\n\nimport { createHash } from 'node:crypto'\nimport { existsSync, mkdirSync, readFileSync, writeFileSync } from 'node:fs'\nimport { join } from 'node:path'\nimport type {\n LabeledScenarioRecord,\n LabeledScenarioSampleArgs,\n LabeledScenarioSource,\n LabeledScenarioStore,\n LabeledScenarioWrite,\n LabelTrust,\n} from '../types'\nimport { labelTrustRank } from '../types'\n\nexport interface FsLabeledScenarioStoreOptions {\n /** Root directory for JSONL files. Created if missing. */\n root: string\n /** Per-source rate limit. When set, writes exceeding the cap are rejected\n * with a typed error. Default: no limit. */\n maxWritesPerMinutePerBucket?: number\n /** Test seam — override `Date.now()` for deterministic tests. */\n now?: () => number\n}\n\nexport class LabeledScenarioStoreError extends Error {\n constructor(\n public readonly code: string,\n message: string,\n ) {\n super(message)\n this.name = 'LabeledScenarioStoreError'\n }\n}\n\ninterface RateLimitState {\n bucket: string\n windowStartMs: number\n count: number\n}\n\nexport class FsLabeledScenarioStore implements LabeledScenarioStore {\n private readonly now: () => number\n private readonly rateLimits = new Map<string, RateLimitState>()\n\n constructor(private readonly options: FsLabeledScenarioStoreOptions) {\n if (!existsSync(options.root)) mkdirSync(options.root, { recursive: true })\n this.now = options.now ?? Date.now\n }\n\n async observe(write: LabeledScenarioWrite): Promise<void> {\n this.assertProvenance(write)\n this.assertRateLimit(write)\n const record = this.toRecord(write)\n const path = this.pathForSource(write.source)\n const line = `${JSON.stringify(record)}\\n`\n // Append atomically. For high-throughput a writev-friendly buffered\n // implementation lands in the Turso adapter; FS adapter is for tests +\n // local dev + small workloads.\n appendLine(path, line)\n }\n\n async sample(args: LabeledScenarioSampleArgs): Promise<LabeledScenarioRecord[]> {\n if (!args.split) {\n throw new LabeledScenarioStoreError(\n 'split_required',\n 'sample() requires an explicit `split` (train | test) — substrate refuses ambiguous reads',\n )\n }\n if (!args.capturedBefore) {\n throw new LabeledScenarioStoreError(\n 'capturedBefore_required',\n 'sample() requires an explicit `capturedBefore` timestamp for temporal-split discipline',\n )\n }\n\n const all: LabeledScenarioRecord[] = []\n for (const source of ALL_SOURCES) {\n // Default training-source filter: when sampling train, EXCLUDE\n // production-trace records unless the caller asks for them.\n if (args.split === 'train' && source === 'production-trace') {\n const explicit = sourceFilterContains(args.filter?.source, 'production-trace')\n if (!explicit) continue\n }\n const path = this.pathForSource(source)\n if (!existsSync(path)) continue\n const lines = readFileSync(path, 'utf8').split('\\n').filter(Boolean)\n for (const line of lines) {\n let record: LabeledScenarioRecord\n try {\n record = JSON.parse(line) as LabeledScenarioRecord\n } catch {\n continue\n }\n if (!matchesFilter(record, args, source)) continue\n all.push(record)\n }\n }\n\n // Deterministic order: by capturedAt ascending, then recordHash.\n all.sort((a, b) => {\n if (a.capturedAt !== b.capturedAt) return a.capturedAt.localeCompare(b.capturedAt)\n return a.recordHash.localeCompare(b.recordHash)\n })\n\n return all.slice(0, args.count)\n }\n\n async size(): Promise<{\n train: number\n test: number\n bySource: Record<string, number>\n byTrust: Record<LabelTrust, number>\n }> {\n const bySource: Record<string, number> = {}\n const byTrust: Record<LabelTrust, number> = {\n unverified: 0,\n 'verified-signal': 0,\n 'human-rated': 0,\n }\n let total = 0\n for (const source of ALL_SOURCES) {\n const path = this.pathForSource(source)\n if (!existsSync(path)) {\n bySource[source] = 0\n continue\n }\n const lines = readFileSync(path, 'utf8').split('\\n').filter(Boolean)\n bySource[source] = lines.length\n total += lines.length\n for (const line of lines) {\n let trust: LabelTrust = 'unverified'\n try {\n trust = (JSON.parse(line) as LabeledScenarioRecord).labelTrust ?? 'unverified'\n } catch {\n // A malformed line counts as unverified — never silently gold.\n }\n byTrust[trust] += 1\n }\n }\n // FS adapter doesn't track split assignments per-record (split is\n // computed at sample-time based on `capturedBefore`). For size(), we\n // report `train`+`test` as the same total — split is a sampling concept.\n return { train: total, test: total, bySource, byTrust }\n }\n\n private assertProvenance(write: LabeledScenarioWrite): void {\n if (!write.source) {\n throw new LabeledScenarioStoreError(\n 'missing_source',\n 'LabeledScenarioWrite requires `source`',\n )\n }\n if (!write.sourceVersionHash || write.sourceVersionHash.length === 0) {\n throw new LabeledScenarioStoreError(\n 'missing_source_version',\n 'LabeledScenarioWrite requires `sourceVersionHash` (git sha or substrate version)',\n )\n }\n if (!write.capturedAt) {\n throw new LabeledScenarioStoreError(\n 'missing_captured_at',\n 'LabeledScenarioWrite requires `capturedAt` ISO timestamp',\n )\n }\n if (!write.redactionStatus) {\n throw new LabeledScenarioStoreError(\n 'missing_redaction_status',\n 'LabeledScenarioWrite requires explicit `redactionStatus` — raw / redacted-pii / redacted-secrets / fully-redacted',\n )\n }\n if (!ALL_SOURCES.includes(write.source)) {\n throw new LabeledScenarioStoreError(\n 'unknown_source',\n `LabeledScenarioWrite.source must be one of: ${ALL_SOURCES.join(', ')}`,\n )\n }\n }\n\n private assertRateLimit(write: LabeledScenarioWrite): void {\n const cap = this.options.maxWritesPerMinutePerBucket\n if (!cap || !write.rateLimitBucket) return\n const now = this.now()\n const windowMs = 60_000\n let state = this.rateLimits.get(write.rateLimitBucket)\n if (!state || now - state.windowStartMs >= windowMs) {\n state = { bucket: write.rateLimitBucket, windowStartMs: now, count: 0 }\n this.rateLimits.set(write.rateLimitBucket, state)\n }\n if (state.count >= cap) {\n throw new LabeledScenarioStoreError(\n 'rate_limit_exceeded',\n `LabeledScenarioStore: bucket ${write.rateLimitBucket} exceeded ${cap} writes/min`,\n )\n }\n state.count += 1\n }\n\n private toRecord(write: LabeledScenarioWrite): LabeledScenarioRecord {\n const recordHash = sha256(\n JSON.stringify({\n id: write.scenario.id,\n src: write.source,\n at: write.capturedAt,\n ver: write.sourceVersionHash,\n }),\n )\n // FS adapter assigns split at sample-time, but we cache a hint here\n // based on capturedAt vs the world's \"now\" — sampler overrides this.\n return {\n ...write,\n recordHash,\n split: 'train',\n }\n }\n\n private pathForSource(source: string): string {\n return join(this.options.root, `${source}.jsonl`)\n }\n}\n\nconst ALL_SOURCES: LabeledScenarioWrite['source'][] = [\n 'production-trace',\n 'eval-run',\n 'manual',\n 'red-team',\n 'synthetic',\n]\n\nfunction sourceFilterContains(\n filter: LabeledScenarioSource | LabeledScenarioSource[] | undefined,\n needle: LabeledScenarioSource,\n): boolean {\n if (!filter) return false\n if (Array.isArray(filter)) return filter.includes(needle)\n return filter === needle\n}\n\nfunction matchesFilter(\n record: LabeledScenarioRecord,\n args: LabeledScenarioSampleArgs,\n source: string,\n): boolean {\n // Temporal cutoff — train must be capturedAt < capturedBefore.\n if (args.split === 'train' && record.capturedAt >= args.capturedBefore) return false\n if (args.split === 'test' && record.capturedAt < args.capturedBefore) return false\n\n const f = args.filter\n if (!f) return true\n if (f.kind && record.scenario.kind !== f.kind) return false\n if (f.source) {\n const sources = Array.isArray(f.source) ? f.source : [f.source]\n if (!sources.includes(source as never)) return false\n }\n if (f.minComposite !== undefined || f.maxComposite !== undefined) {\n const composites = Object.values(record.judgeScores).map((s) => s.composite)\n const max = composites.length === 0 ? 0 : Math.max(...composites)\n if (f.minComposite !== undefined && max < f.minComposite) return false\n if (f.maxComposite !== undefined && max > f.maxComposite) return false\n }\n if (f.minTrust !== undefined && labelTrustRank(record.labelTrust) < labelTrustRank(f.minTrust)) {\n return false\n }\n return true\n}\n\nfunction sha256(input: string): string {\n return createHash('sha256').update(input).digest('hex').slice(0, 16)\n}\n\nfunction appendLine(path: string, line: string): void {\n if (existsSync(path)) {\n const existing = readFileSync(path, 'utf8')\n writeFileSync(path, existing + line)\n } else {\n writeFileSync(path, line)\n }\n}\n","/**\n * @experimental\n *\n * Pass A substrate types — `runCampaign` is the one primitive every\n * eval flow composes from. Three contracts in this file:\n *\n * - `Scenario` input set\n * - `DispatchFn` how to run one scenario → artifact\n * - `CampaignResult` defined output schema (the contract downstream tools depend on)\n *\n * Three more lifted from earlier substrate work (re-exported):\n *\n * - `JudgeConfig` pluggable dimensional scorer (0.38)\n * - `Mutator` optimization-loop surface mutator\n * - `Gate` promotion gate (`HeldOutGate` and friends adapt to this)\n *\n * No new architecture vs 0.38 — Pass A formalizes the shapes so consumers\n * can build dashboards / CI gates / regression diffs against a stable schema.\n */\n\n/** @experimental Stable identifier + kind tag for any scenario. Consumers\n * extend with their per-domain payload (persona, task, requirement, ...). */\nexport interface Scenario {\n id: string\n kind: string\n tags?: string[]\n}\n\n/** @experimental Context handed to every dispatch invocation. Scoped — every\n * trace/span carries the cellId, every artifact write lands under the cell's\n * artifact root, the cost meter accumulates per cell. */\nexport interface DispatchContext {\n cellId: string\n rep: number\n generation?: number\n seed: number\n signal: AbortSignal\n trace: CampaignTraceWriter\n artifacts: CampaignArtifactWriter\n cost: CampaignCostMeter\n /** Populated when this run is part of a multi-cycle improvement loop. */\n cycleId?: string\n /** Populated when the substrate resumed from a prior cache hit. */\n resumedFrom?: string\n /**\n * Opaque placement key supplied by `RunCampaignOptions.cellPlacement`.\n * The substrate forwards it through unchanged; placement-aware Dispatch\n * implementations (e.g. `httpDispatch` from `/adapters/http`) read it to\n * route the cell to the right worker / region / sandbox. `undefined`\n * when no placement strategy is configured.\n */\n placement?: string\n}\n\n/** @experimental One function: scenario + ctx → artifact. Dispatcher chooses\n * whether to call `runMultishot`, `runLoop`, raw `streamPrompt`, anything. */\nexport type DispatchFn<TScenario extends Scenario, TArtifact> = (\n scenario: TScenario,\n ctx: DispatchContext,\n) => Promise<TArtifact>\n\n// ── Sessions ──────────────────────────────────────────────────────────\n\n/** @experimental One session within a multi-session journey. Dispatch is\n * invoked once per session in order; state from prior session's artifact\n * is exposed via `ctx.priorSessionArtifact`. */\nexport interface SessionScript<TScenario, TArtifact> {\n id: string\n intent: string\n maxTurns?: number\n /** When true, knowledge accumulated this session persists to next. */\n affectsKnowledge?: boolean\n /** Optional per-session persona evolution — called after the session\n * resolves. Returns the persona shape used by the NEXT session. */\n evolveAfterSession?: (artifact: TArtifact, sessionIndex: number, scenario: TScenario) => TScenario\n}\n\n// ── Judges (re-export 0.38 shape) ─────────────────────────────────────\n\nexport interface JudgeDimension {\n /** JSON field name + score key. */\n key: string\n /** Description shown in the judge's user prompt. */\n description: string\n}\n\n/** @experimental Pluggable dimensional scorer. `score` is the contract:\n * given an artifact + scenario, return a `JudgeScore`. This is deliberately a\n * function, not a fixed LLM-prompt shape — real consumers judge with\n * ensembles, deterministic checks, or a single LLM call, and the substrate\n * must not constrain that. The `llmJudge()` helper builds a `score` that does\n * one LLM call for the common case. `appliesTo` lets a judge run only on\n * scenarios that match (e.g. a legal-citation judge only on legal scenarios). */\nexport interface JudgeConfig<TArtifact, TScenario extends Scenario = Scenario> {\n name: string\n dimensions: JudgeDimension[]\n /** Score one artifact. Throw on failure — a thrown judge is recorded as a\n * failed cell, never silently folded into a zero. */\n score(input: {\n artifact: TArtifact\n scenario: TScenario\n signal: AbortSignal\n }): JudgeScore | Promise<JudgeScore>\n appliesTo?: (scenario: TScenario) => boolean\n}\n\nexport interface JudgeScore {\n dimensions: Record<string, number>\n composite: number\n notes: string\n}\n\n// ── Optimization (population + generations + mutator) ─────────────────\n\n/** @experimental A tier-4 code surface — a candidate change to the agent's\n * IMPLEMENTATION, not its prompt. Produced by autoresearch (reads codebase +\n * trace findings → opens a worktree). Measured by checking out `worktreeRef`\n * and running the worker against the changed code. See the improvement-tier\n * table in `docs/design/loop-taxonomy.md`. */\nexport interface CodeSurface {\n kind: 'code'\n /** Worktree path or git ref holding the candidate code change. The\n * consumer's `dispatchWithSurface` checks this out before running. */\n worktreeRef: string\n /** Base ref the change is measured against. Default: the repo's main. */\n baseRef?: string\n /** Human summary of what changed — rendered into the auto-PR body. */\n summary?: string\n}\n\n/** @experimental The mutable surface a driver proposes. Tiers (see\n * `docs/design/loop-taxonomy.md`):\n * - `string` — tiers 1-2: system-prompt addendum / serialized tool\n * config. Cheap, reversible, text-diffable.\n * - `CodeSurface` — tier 4: an implementation change behind a worktree ref.\n * Tier 3 (knowledge) is owned by agent-knowledge and rides its own adapter,\n * not this type. */\nexport type MutableSurface = string | CodeSurface\n\n/** @experimental Stateless surface mutation — given findings + current\n * surface, return N candidate surfaces. Pure transform, no generation\n * awareness. Reflective-mutation, `runMultiShotOptimization`, `AxGEPA`\n * conform. Wrapped by `evolutionaryDriver` to become an `ImprovementDriver`. */\nexport interface Mutator<TFindings = unknown> {\n kind: string\n mutate(args: {\n findings: TFindings[]\n currentSurface: MutableSurface\n populationSize: number\n signal: AbortSignal\n }): Promise<MutableSurface[]>\n}\n\n/** @experimental Everything a driver's `propose()` may read to plan the next\n * batch of candidates. The first six fields are always present; the rest are\n * optional context the loop supplies when available, so cheap drivers\n * (`evolutionaryDriver`) can ignore them while a code-tier agentic generator\n * consumes the research report + dataset to drive a coding harness.\n * See `docs/design/self-improvement-engine.md`. */\nexport interface ProposeContext<TFindings = unknown> {\n currentSurface: MutableSurface\n history: GenerationRecord[]\n findings: TFindings[]\n /** BREADTH: how many candidate surfaces to return this generation. */\n populationSize: number\n generation: number\n signal: AbortSignal\n /** The Phase-2 research report (analyst findings + diff), produced AFTER the\n * trace analysts run. Opaque to the substrate — the driver that consumes it\n * types it. See the phase diagram in self-improvement-engine.md. */\n report?: unknown\n /** Handle to all captured data — the driver samples traces / artifacts /\n * rewards here to ground its proposals. */\n dataset?: LabeledScenarioStore\n /** DEPTH: max iterations the agentic generator may take per candidate.\n * 1 = single-shot; >1 = it may iterate on its own change before handing it\n * back to be measured. */\n maxImprovementShots?: number\n}\n\n/** @experimental A surface-improvement strategy — the DRIVER of the\n * improvement loop. Given the current best surface, the history of what's\n * been tried + scored, and any external findings, propose the next batch of\n * candidate surfaces to measure. Optionally decide to stop early.\n *\n * The evolutionary mutator (`evolutionaryDriver`, here) and agent-runtime's\n * `improvementDriver` (with reflective / agentic generators) both conform —\n * drivers of the SAME loop, not separate loops. The loop body\n * (`runOptimization`) and the gated promotion shell (`runImprovementLoop`)\n * are driver-agnostic. */\nexport interface ImprovementDriver<TFindings = unknown> {\n kind: string\n /** Plan: propose N candidate surfaces for the next generation. */\n propose(ctx: ProposeContext<TFindings>): Promise<MutableSurface[]>\n /** Decide: stop early when the driver judges the search converged or\n * exhausted. Default (omitted) runs all `maxGenerations`. */\n decide?(args: { history: GenerationRecord[] }): { stop: boolean; reason?: string }\n}\n\nexport interface OptimizerConfig {\n driver: ImprovementDriver\n populationSize: number\n maxGenerations: number\n surfaceExtractor: (profile: unknown) => MutableSurface\n}\n\n// ── Gates ─────────────────────────────────────────────────────────────\n\n/** @experimental Five-valued verdict taxonomy (MOSS-paper alignment). */\nexport type GateDecision = 'ship' | 'hold' | 'need_more_work' | 'model_ceiling' | 'arch_ceiling'\n\nexport interface GateContext<TArtifact, TScenario extends Scenario> {\n candidateArtifacts: Map<string, TArtifact>\n baselineArtifacts?: Map<string, TArtifact>\n /** Candidate (winner) judge scores, keyed by cellId. */\n judgeScores: Map<string, Record<string, JudgeScore>>\n /** Baseline judge scores, keyed by cellId. SEPARATE from `judgeScores` —\n * baseline + candidate share cellIds (same scenarios), so a single map\n * cannot represent both. A gate computing a holdout delta MUST read\n * candidate from `judgeScores` and baseline from here. */\n baselineJudgeScores?: Map<string, Record<string, JudgeScore>>\n scenarios: TScenario[]\n cost: { candidate: number; baseline: number }\n signal: AbortSignal\n}\n\nexport interface GateResult {\n decision: GateDecision\n reasons: string[]\n contributingGates: Array<{ name: string; passed: boolean; detail: unknown }>\n delta?: number\n}\n\n/** @experimental Composable promotion gate. */\nexport interface Gate<TArtifact = unknown, TScenario extends Scenario = Scenario> {\n name: string\n decide(ctx: GateContext<TArtifact, TScenario>): Promise<GateResult>\n}\n\n// ── Tracing / artifacts / cost ────────────────────────────────────────\n\n/** @experimental Scoped trace writer handed to each dispatch — every span\n * auto-tagged with the cellId so traces filter cleanly. */\nexport interface CampaignTraceWriter {\n span(name: string, attributes?: Record<string, unknown>): TraceSpan\n flush(): Promise<void>\n}\n\nexport interface TraceSpan {\n end(attributes?: Record<string, unknown>): void\n setAttribute(key: string, value: unknown): void\n}\n\n/** @experimental Scoped artifact writer — `write(path, content)` lands under\n * `<runDir>/<cellId>/<path>`. */\nexport interface CampaignArtifactWriter {\n write(path: string, content: string | Uint8Array): Promise<string>\n writeJson(path: string, value: unknown): Promise<string>\n}\n\n/** @experimental Cell-scoped cost meter. Substrate auto-tracks LLM costs\n * via the cost-ledger backend hooks; consumers can record additional\n * spend (sandbox time, tool costs) via `observe`. */\nexport interface CampaignCostMeter {\n observe(amountUsd: number, source: string): void\n current(): number\n}\n\n// ── LabeledScenarioStore ──────────────────────────────────────────────\n\n/** @experimental Source tag — required on every store write. Used by the\n * default training-source filter (production-trace samples NOT used as\n * training scenarios unless explicitly opted in). */\nexport type LabeledScenarioSource =\n | 'production-trace'\n | 'eval-run'\n | 'manual'\n | 'red-team'\n | 'synthetic'\n\nexport type RedactionStatus = 'raw' | 'redacted-pii' | 'redacted-secrets' | 'fully-redacted'\n\n/** How much a label can be trusted to evaluate against — the gold-admission\n * gate. Strictly ordered: a record qualifies for a `minTrust` filter when its\n * trust rank is >= the requested rank.\n *\n * - `unverified` — label is a heuristic (e.g. raw outcome success/fail).\n * Fine as corpus; MUST NOT enter a gold set that lift\n * numbers are computed against.\n * - `verified-signal` — an external signal confirmed the outcome (PR merged,\n * tests green, user did not retry, downstream check).\n * - `human-rated` — a human explicitly rated or corrected the artifact.\n *\n * Absent on a write ⇒ treated as `unverified` (fail-closed: a writer must\n * explicitly assert trust to make a record gold-eligible — it never happens\n * by accident). */\nexport type LabelTrust = 'unverified' | 'verified-signal' | 'human-rated'\n\nconst LABEL_TRUST_RANK: Record<LabelTrust, number> = {\n unverified: 0,\n 'verified-signal': 1,\n 'human-rated': 2,\n}\n\n/** Ordinal rank for a label-trust tier; absent ⇒ `unverified` (rank 0). */\nexport function labelTrustRank(trust: LabelTrust | undefined): number {\n return LABEL_TRUST_RANK[trust ?? 'unverified']\n}\n\n/** @experimental Required-provenance write. The store rejects writes that\n * lack provenance — a default-on flywheel without provenance is the\n * data-poisoning vector flagged in the alignment review. */\nexport interface LabeledScenarioWrite<TScenario extends Scenario = Scenario, TArtifact = unknown> {\n scenario: TScenario\n artifact: TArtifact\n judgeScores: Record<string, JudgeScore>\n source: LabeledScenarioSource\n sourceVersionHash: string\n capturedAt: string\n redactionStatus: RedactionStatus\n /** Gold-admission trust tier. Absent ⇒ `unverified` (fail-closed): the\n * record is corpus, never gold. A writer must explicitly assert\n * `verified-signal` or `human-rated` to make it eligible for a gold\n * sample. See {@link LabelTrust}. */\n labelTrust?: LabelTrust\n /** Optional per-source rate-limit bucket key (e.g., the tenant id). */\n rateLimitBucket?: string\n}\n\nexport interface LabeledScenarioRecord<TScenario extends Scenario = Scenario, TArtifact = unknown>\n extends LabeledScenarioWrite<TScenario, TArtifact> {\n /** Stable hash of (scenario.id, source, capturedAt, sourceVersionHash). */\n recordHash: string\n /** Substrate-assigned split — train if captured before the campaign's\n * `temporalCutoff`, test if after. Explicit override allowed via filter. */\n split: 'train' | 'test'\n}\n\nexport interface LabeledScenarioSampleArgs {\n count: number\n /** REQUIRED — substrate refuses to sample without an explicit split. */\n split: 'train' | 'test'\n /** REQUIRED — only records captured before this timestamp are returned.\n * Enforces temporal split discipline (test scenarios captured AFTER train\n * cannot enter the training pool). */\n capturedBefore: string\n filter?: {\n kind?: string\n source?: LabeledScenarioSource | LabeledScenarioSource[]\n minComposite?: number\n maxComposite?: number\n /** Gold gate: only records whose trust rank is >= this tier are\n * returned. `sample({ split: 'test', minTrust: 'verified-signal' })` is\n * the canonical \"give me the gold set\" call. Absent ⇒ no trust gate\n * (corpus-level read). */\n minTrust?: LabelTrust\n }\n}\n\nexport interface LabeledScenarioStore {\n observe(write: LabeledScenarioWrite): Promise<void>\n sample(args: LabeledScenarioSampleArgs): Promise<LabeledScenarioRecord[]>\n size(): Promise<{\n train: number\n test: number\n bySource: Record<string, number>\n /** Count by trust tier — tells the flywheel how much gold it has\n * accumulated vs. raw corpus. */\n byTrust: Record<LabelTrust, number>\n }>\n}\n\n// ── The CampaignResult schema (the downstream-tools contract) ─────────\n\nexport interface CampaignCellResult<TArtifact> {\n cellId: string\n scenarioId: string\n rep: number\n generation?: number\n artifact: TArtifact\n judgeScores: Record<string, JudgeScore>\n costUsd: number\n durationMs: number\n seed: number\n cached: boolean\n error?: string\n}\n\nexport interface JudgeAggregate {\n mean: number\n stdev: number\n ci95: [number, number]\n n: number\n}\n\nexport interface ScenarioAggregate {\n meanComposite: number\n ci95: [number, number]\n n: number\n}\n\nexport interface GenerationRecord {\n generationIndex: number\n candidates: GenerationCandidate[]\n promoted: string[]\n}\n\n/** One scored candidate surface in a generation. `dimensions` + `scenarios`\n * let a reflective `ImprovementDriver` ground its next proposal on WHICH\n * dimensions the candidate is weakest on and WHICH scenarios it best/worst\n * handled — the evidence a blind `Mutator` cannot see. */\nexport interface GenerationCandidate {\n surfaceHash: string\n composite: number\n ci95: [number, number]\n /** Mean score per judge dimension across all cells (scenarios × reps ×\n * judges that reported the dimension). */\n dimensions: Record<string, number>\n /** Per-scenario composite (mean over reps + judges). */\n scenarios: Array<{ scenarioId: string; composite: number }>\n}\n\nexport interface CampaignAggregates {\n byJudge: Record<string, JudgeAggregate>\n byScenario: Record<string, ScenarioAggregate>\n totalCostUsd: number\n cellsExecuted: number\n cellsSkipped: number\n cellsCached: number\n cellsFailed: number\n}\n\nexport interface CampaignResult<TArtifact = unknown, TScenario extends Scenario = Scenario> {\n /** sha256(scenarios, judges, dispatch source ref, optimizer config, seed). Stable identity for reruns. */\n manifestHash: string\n seed: number\n startedAt: string\n endedAt: string\n durationMs: number\n cells: Array<CampaignCellResult<TArtifact>>\n aggregates: CampaignAggregates\n optimization?: {\n generations: GenerationRecord[]\n winnerSurfaceHash?: string\n }\n gate?: GateResult\n prUrl?: string\n runDir: string\n artifactsByPath: Record<string, string>\n /** Substrate strips the input scenarios to id+kind for the result manifest;\n * consumers needing full payload look it up via the original input. The\n * type parameter `TScenario` is propagated for downstream consumers that\n * want narrowed types when extending `CampaignResult`. */\n scenarios: Array<Pick<TScenario, 'id' | 'kind'>>\n}\n","/**\n * @experimental\n *\n * VCS-pluggable worktree adapter. One improvement = one worktree, PR-like\n * (multiple commits allowed). A code-tier driver's `propose()` creates a\n * worktree, an agent commits the change into it, and `finalize()` returns a\n * `CodeSurface{ worktreeRef }` the measurement checks out to run the worker\n * against the changed code. On promotion the worktree becomes the PR branch.\n *\n * The interface is VCS-agnostic so a future `jj` ([jj-vcs](https://github.com/jj-vcs/jj))\n * adapter can slot in without touching driver code. Only the git adapter\n * ships today. See `docs/design/self-improvement-engine.md`.\n */\n\nimport { execFileSync } from 'node:child_process'\nimport { existsSync } from 'node:fs'\nimport { basename, isAbsolute, join } from 'node:path'\nimport type { CodeSurface } from '../types'\n\nexport interface Worktree {\n /** Absolute path to the checked-out worktree directory. */\n path: string\n /** The branch the worktree is on (becomes the PR branch on promotion). */\n branch: string\n /** The ref the worktree was forked from. */\n baseRef: string\n}\n\nexport interface WorktreeAdapter {\n /** Create an isolated worktree on a fresh branch off `baseRef`. */\n create(opts: { baseRef: string; label: string }): Promise<Worktree>\n /** Commit any pending changes in the worktree, then return a CodeSurface\n * pointing at it. The agent has already written its change into\n * `worktree.path` by the time this is called. */\n finalize(worktree: Worktree, summary: string): Promise<CodeSurface>\n /** Remove the worktree (and its branch) — called for losing candidates. */\n discard(worktree: Worktree): Promise<void>\n}\n\nexport class WorktreeAdapterError extends Error {\n constructor(\n message: string,\n readonly cause?: unknown,\n ) {\n super(message)\n this.name = 'WorktreeAdapterError'\n }\n}\n\nexport interface GitWorktreeAdapterOptions {\n /** Repo root the worktrees fork from. */\n repoRoot: string\n /** Directory worktrees are created under. Default: `<repoRoot>/.worktrees`. */\n worktreeDir?: string\n /** Branch-name prefix. Default: `improve`. */\n branchPrefix?: string\n /** Test seam — defaults to a real `git` runner. */\n git?: (args: string[], cwd: string) => string\n}\n\nfunction defaultGit(args: string[], cwd: string): string {\n try {\n return execFileSync('git', args, { cwd, encoding: 'utf8' }).trim()\n } catch (err) {\n const stderr =\n err && typeof err === 'object' && 'stderr' in err\n ? String((err as { stderr: unknown }).stderr)\n : ''\n throw new WorktreeAdapterError(`git ${args.join(' ')} failed: ${stderr || String(err)}`, err)\n }\n}\n\n/** Slugify a label into a branch-safe segment. */\nfunction slug(label: string): string {\n return (\n label\n .toLowerCase()\n .replace(/[^a-z0-9]+/g, '-')\n .replace(/^-+|-+$/g, '')\n .slice(0, 48) || 'candidate'\n )\n}\n\nexport function gitWorktreeAdapter(opts: GitWorktreeAdapterOptions): WorktreeAdapter {\n const git = opts.git ?? defaultGit\n const worktreeDir = opts.worktreeDir ?? join(opts.repoRoot, '.worktrees')\n const branchPrefix = opts.branchPrefix ?? 'improve'\n\n return {\n async create({ baseRef, label }) {\n const id = `${slug(label)}-${Date.now().toString(36)}-${Math.random().toString(36).slice(2, 6)}`\n const branch = `${branchPrefix}/${id}`\n const path = join(worktreeDir, id)\n git(['worktree', 'add', '-b', branch, path, baseRef], opts.repoRoot)\n return { path, branch, baseRef }\n },\n\n async finalize(worktree, summary) {\n // Stage + commit any pending changes the agent left in the worktree.\n // A no-op commit is refused by git, so only commit when the tree is dirty.\n const status = git(['status', '--porcelain'], worktree.path)\n if (status.length > 0) {\n git(['add', '-A'], worktree.path)\n git(['commit', '-m', summary], worktree.path)\n }\n return {\n kind: 'code',\n worktreeRef: worktree.path,\n baseRef: worktree.baseRef,\n summary,\n }\n },\n\n async discard(worktree) {\n // Remove the worktree, then delete its branch. Force-remove because the\n // worktree may hold uncommitted experiment state we're discarding.\n git(['worktree', 'remove', '--force', worktree.path], opts.repoRoot)\n git(['branch', '-D', worktree.branch], opts.repoRoot)\n },\n }\n}\n\n/** Resolve a `CodeSurface`'s worktreeRef to a directory the measurement can\n * run the worker in. A path ref is returned as-is; anything else is treated\n * as a ref under the adapter's worktree dir. */\nexport function resolveWorktreePath(surface: CodeSurface, worktreeDir?: string): string {\n if (isAbsolute(surface.worktreeRef) && existsSync(surface.worktreeRef)) return surface.worktreeRef\n if (worktreeDir) return join(worktreeDir, basename(surface.worktreeRef))\n return surface.worktreeRef\n}\n"],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;;;;AA6BA,SAAS,kBAAkB;AAC3B,SAAS,YAAY,WAAW,cAAc,qBAAqB;AACnE,SAAS,YAAY;;;AC2QrB,IAAM,mBAA+C;AAAA,EACnD,YAAY;AAAA,EACZ,mBAAmB;AAAA,EACnB,eAAe;AACjB;AAGO,SAAS,eAAe,OAAuC;AACpE,SAAO,iBAAiB,SAAS,YAAY;AAC/C;;;AD/PO,IAAM,4BAAN,cAAwC,MAAM;AAAA,EACnD,YACkB,MAChB,SACA;AACA,UAAM,OAAO;AAHG;AAIhB,SAAK,OAAO;AAAA,EACd;AAAA,EALkB;AAMpB;AAQO,IAAM,yBAAN,MAA6D;AAAA,EAIlE,YAA6B,SAAwC;AAAxC;AAC3B,QAAI,CAAC,WAAW,QAAQ,IAAI,EAAG,WAAU,QAAQ,MAAM,EAAE,WAAW,KAAK,CAAC;AAC1E,SAAK,MAAM,QAAQ,OAAO,KAAK;AAAA,EACjC;AAAA,EAH6B;AAAA,EAHZ;AAAA,EACA,aAAa,oBAAI,IAA4B;AAAA,EAO9D,MAAM,QAAQ,OAA4C;AACxD,SAAK,iBAAiB,KAAK;AAC3B,SAAK,gBAAgB,KAAK;AAC1B,UAAM,SAAS,KAAK,SAAS,KAAK;AAClC,UAAM,OAAO,KAAK,cAAc,MAAM,MAAM;AAC5C,UAAM,OAAO,GAAG,KAAK,UAAU,MAAM,CAAC;AAAA;AAItC,eAAW,MAAM,IAAI;AAAA,EACvB;AAAA,EAEA,MAAM,OAAO,MAAmE;AAC9E,QAAI,CAAC,KAAK,OAAO;AACf,YAAM,IAAI;AAAA,QACR;AAAA,QACA;AAAA,MACF;AAAA,IACF;AACA,QAAI,CAAC,KAAK,gBAAgB;AACxB,YAAM,IAAI;AAAA,QACR;AAAA,QACA;AAAA,MACF;AAAA,IACF;AAEA,UAAM,MAA+B,CAAC;AACtC,eAAW,UAAU,aAAa;AAGhC,UAAI,KAAK,UAAU,WAAW,WAAW,oBAAoB;AAC3D,cAAM,WAAW,qBAAqB,KAAK,QAAQ,QAAQ,kBAAkB;AAC7E,YAAI,CAAC,SAAU;AAAA,MACjB;AACA,YAAM,OAAO,KAAK,cAAc,MAAM;AACtC,UAAI,CAAC,WAAW,IAAI,EAAG;AACvB,YAAM,QAAQ,aAAa,MAAM,MAAM,EAAE,MAAM,IAAI,EAAE,OAAO,OAAO;AACnE,iBAAW,QAAQ,OAAO;AACxB,YAAI;AACJ,YAAI;AACF,mBAAS,KAAK,MAAM,IAAI;AAAA,QAC1B,QAAQ;AACN;AAAA,QACF;AACA,YAAI,CAAC,cAAc,QAAQ,MAAM,MAAM,EAAG;AAC1C,YAAI,KAAK,MAAM;AAAA,MACjB;AAAA,IACF;AAGA,QAAI,KAAK,CAAC,GAAG,MAAM;AACjB,UAAI,EAAE,eAAe,EAAE,WAAY,QAAO,EAAE,WAAW,cAAc,EAAE,UAAU;AACjF,aAAO,EAAE,WAAW,cAAc,EAAE,UAAU;AAAA,IAChD,CAAC;AAED,WAAO,IAAI,MAAM,GAAG,KAAK,KAAK;AAAA,EAChC;AAAA,EAEA,MAAM,OAKH;AACD,UAAM,WAAmC,CAAC;AAC1C,UAAM,UAAsC;AAAA,MAC1C,YAAY;AAAA,MACZ,mBAAmB;AAAA,MACnB,eAAe;AAAA,IACjB;AACA,QAAI,QAAQ;AACZ,eAAW,UAAU,aAAa;AAChC,YAAM,OAAO,KAAK,cAAc,MAAM;AACtC,UAAI,CAAC,WAAW,IAAI,GAAG;AACrB,iBAAS,MAAM,IAAI;AACnB;AAAA,MACF;AACA,YAAM,QAAQ,aAAa,MAAM,MAAM,EAAE,MAAM,IAAI,EAAE,OAAO,OAAO;AACnE,eAAS,MAAM,IAAI,MAAM;AACzB,eAAS,MAAM;AACf,iBAAW,QAAQ,OAAO;AACxB,YAAI,QAAoB;AACxB,YAAI;AACF,kBAAS,KAAK,MAAM,IAAI,EAA4B,cAAc;AAAA,QACpE,QAAQ;AAAA,QAER;AACA,gBAAQ,KAAK,KAAK;AAAA,MACpB;AAAA,IACF;AAIA,WAAO,EAAE,OAAO,OAAO,MAAM,OAAO,UAAU,QAAQ;AAAA,EACxD;AAAA,EAEQ,iBAAiB,OAAmC;AAC1D,QAAI,CAAC,MAAM,QAAQ;AACjB,YAAM,IAAI;AAAA,QACR;AAAA,QACA;AAAA,MACF;AAAA,IACF;AACA,QAAI,CAAC,MAAM,qBAAqB,MAAM,kBAAkB,WAAW,GAAG;AACpE,YAAM,IAAI;AAAA,QACR;AAAA,QACA;AAAA,MACF;AAAA,IACF;AACA,QAAI,CAAC,MAAM,YAAY;AACrB,YAAM,IAAI;AAAA,QACR;AAAA,QACA;AAAA,MACF;AAAA,IACF;AACA,QAAI,CAAC,MAAM,iBAAiB;AAC1B,YAAM,IAAI;AAAA,QACR;AAAA,QACA;AAAA,MACF;AAAA,IACF;AACA,QAAI,CAAC,YAAY,SAAS,MAAM,MAAM,GAAG;AACvC,YAAM,IAAI;AAAA,QACR;AAAA,QACA,+CAA+C,YAAY,KAAK,IAAI,CAAC;AAAA,MACvE;AAAA,IACF;AAAA,EACF;AAAA,EAEQ,gBAAgB,OAAmC;AACzD,UAAM,MAAM,KAAK,QAAQ;AACzB,QAAI,CAAC,OAAO,CAAC,MAAM,gBAAiB;AACpC,UAAM,MAAM,KAAK,IAAI;AACrB,UAAM,WAAW;AACjB,QAAI,QAAQ,KAAK,WAAW,IAAI,MAAM,eAAe;AACrD,QAAI,CAAC,SAAS,MAAM,MAAM,iBAAiB,UAAU;AACnD,cAAQ,EAAE,QAAQ,MAAM,iBAAiB,eAAe,KAAK,OAAO,EAAE;AACtE,WAAK,WAAW,IAAI,MAAM,iBAAiB,KAAK;AAAA,IAClD;AACA,QAAI,MAAM,SAAS,KAAK;AACtB,YAAM,IAAI;AAAA,QACR;AAAA,QACA,gCAAgC,MAAM,eAAe,aAAa,GAAG;AAAA,MACvE;AAAA,IACF;AACA,UAAM,SAAS;AAAA,EACjB;AAAA,EAEQ,SAAS,OAAoD;AACnE,UAAM,aAAa;AAAA,MACjB,KAAK,UAAU;AAAA,QACb,IAAI,MAAM,SAAS;AAAA,QACnB,KAAK,MAAM;AAAA,QACX,IAAI,MAAM;AAAA,QACV,KAAK,MAAM;AAAA,MACb,CAAC;AAAA,IACH;AAGA,WAAO;AAAA,MACL,GAAG;AAAA,MACH;AAAA,MACA,OAAO;AAAA,IACT;AAAA,EACF;AAAA,EAEQ,cAAc,QAAwB;AAC5C,WAAO,KAAK,KAAK,QAAQ,MAAM,GAAG,MAAM,QAAQ;AAAA,EAClD;AACF;AAEA,IAAM,cAAgD;AAAA,EACpD;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AACF;AAEA,SAAS,qBACP,QACA,QACS;AACT,MAAI,CAAC,OAAQ,QAAO;AACpB,MAAI,MAAM,QAAQ,MAAM,EAAG,QAAO,OAAO,SAAS,MAAM;AACxD,SAAO,WAAW;AACpB;AAEA,SAAS,cACP,QACA,MACA,QACS;AAET,MAAI,KAAK,UAAU,WAAW,OAAO,cAAc,KAAK,eAAgB,QAAO;AAC/E,MAAI,KAAK,UAAU,UAAU,OAAO,aAAa,KAAK,eAAgB,QAAO;AAE7E,QAAM,IAAI,KAAK;AACf,MAAI,CAAC,EAAG,QAAO;AACf,MAAI,EAAE,QAAQ,OAAO,SAAS,SAAS,EAAE,KAAM,QAAO;AACtD,MAAI,EAAE,QAAQ;AACZ,UAAM,UAAU,MAAM,QAAQ,EAAE,MAAM,IAAI,EAAE,SAAS,CAAC,EAAE,MAAM;AAC9D,QAAI,CAAC,QAAQ,SAAS,MAAe,EAAG,QAAO;AAAA,EACjD;AACA,MAAI,EAAE,iBAAiB,UAAa,EAAE,iBAAiB,QAAW;AAChE,UAAM,aAAa,OAAO,OAAO,OAAO,WAAW,EAAE,IAAI,CAAC,MAAM,EAAE,SAAS;AAC3E,UAAM,MAAM,WAAW,WAAW,IAAI,IAAI,KAAK,IAAI,GAAG,UAAU;AAChE,QAAI,EAAE,iBAAiB,UAAa,MAAM,EAAE,aAAc,QAAO;AACjE,QAAI,EAAE,iBAAiB,UAAa,MAAM,EAAE,aAAc,QAAO;AAAA,EACnE;AACA,MAAI,EAAE,aAAa,UAAa,eAAe,OAAO,UAAU,IAAI,eAAe,EAAE,QAAQ,GAAG;AAC9F,WAAO;AAAA,EACT;AACA,SAAO;AACT;AAEA,SAAS,OAAO,OAAuB;AACrC,SAAO,WAAW,QAAQ,EAAE,OAAO,KAAK,EAAE,OAAO,KAAK,EAAE,MAAM,GAAG,EAAE;AACrE;AAEA,SAAS,WAAW,MAAc,MAAoB;AACpD,MAAI,WAAW,IAAI,GAAG;AACpB,UAAM,WAAW,aAAa,MAAM,MAAM;AAC1C,kBAAc,MAAM,WAAW,IAAI;AAAA,EACrC,OAAO;AACL,kBAAc,MAAM,IAAI;AAAA,EAC1B;AACF;;;AElSA,SAAS,oBAAoB;AAC7B,SAAS,cAAAA,mBAAkB;AAC3B,SAAS,UAAU,YAAY,QAAAC,aAAY;AAuBpC,IAAM,uBAAN,cAAmC,MAAM;AAAA,EAC9C,YACE,SACS,OACT;AACA,UAAM,OAAO;AAFJ;AAGT,SAAK,OAAO;AAAA,EACd;AAAA,EAJW;AAKb;AAaA,SAAS,WAAW,MAAgB,KAAqB;AACvD,MAAI;AACF,WAAO,aAAa,OAAO,MAAM,EAAE,KAAK,UAAU,OAAO,CAAC,EAAE,KAAK;AAAA,EACnE,SAAS,KAAK;AACZ,UAAM,SACJ,OAAO,OAAO,QAAQ,YAAY,YAAY,MAC1C,OAAQ,IAA4B,MAAM,IAC1C;AACN,UAAM,IAAI,qBAAqB,OAAO,KAAK,KAAK,GAAG,CAAC,YAAY,UAAU,OAAO,GAAG,CAAC,IAAI,GAAG;AAAA,EAC9F;AACF;AAGA,SAAS,KAAK,OAAuB;AACnC,SACE,MACG,YAAY,EACZ,QAAQ,eAAe,GAAG,EAC1B,QAAQ,YAAY,EAAE,EACtB,MAAM,GAAG,EAAE,KAAK;AAEvB;AAEO,SAAS,mBAAmB,MAAkD;AACnF,QAAM,MAAM,KAAK,OAAO;AACxB,QAAM,cAAc,KAAK,eAAeA,MAAK,KAAK,UAAU,YAAY;AACxE,QAAM,eAAe,KAAK,gBAAgB;AAE1C,SAAO;AAAA,IACL,MAAM,OAAO,EAAE,SAAS,MAAM,GAAG;AAC/B,YAAM,KAAK,GAAG,KAAK,KAAK,CAAC,IAAI,KAAK,IAAI,EAAE,SAAS,EAAE,CAAC,IAAI,KAAK,OAAO,EAAE,SAAS,EAAE,EAAE,MAAM,GAAG,CAAC,CAAC;AAC9F,YAAM,SAAS,GAAG,YAAY,IAAI,EAAE;AACpC,YAAM,OAAOA,MAAK,aAAa,EAAE;AACjC,UAAI,CAAC,YAAY,OAAO,MAAM,QAAQ,MAAM,OAAO,GAAG,KAAK,QAAQ;AACnE,aAAO,EAAE,MAAM,QAAQ,QAAQ;AAAA,IACjC;AAAA,IAEA,MAAM,SAAS,UAAU,SAAS;AAGhC,YAAM,SAAS,IAAI,CAAC,UAAU,aAAa,GAAG,SAAS,IAAI;AAC3D,UAAI,OAAO,SAAS,GAAG;AACrB,YAAI,CAAC,OAAO,IAAI,GAAG,SAAS,IAAI;AAChC,YAAI,CAAC,UAAU,MAAM,OAAO,GAAG,SAAS,IAAI;AAAA,MAC9C;AACA,aAAO;AAAA,QACL,MAAM;AAAA,QACN,aAAa,SAAS;AAAA,QACtB,SAAS,SAAS;AAAA,QAClB;AAAA,MACF;AAAA,IACF;AAAA,IAEA,MAAM,QAAQ,UAAU;AAGtB,UAAI,CAAC,YAAY,UAAU,WAAW,SAAS,IAAI,GAAG,KAAK,QAAQ;AACnE,UAAI,CAAC,UAAU,MAAM,SAAS,MAAM,GAAG,KAAK,QAAQ;AAAA,IACtD;AAAA,EACF;AACF;AAKO,SAAS,oBAAoB,SAAsB,aAA8B;AACtF,MAAI,WAAW,QAAQ,WAAW,KAAKD,YAAW,QAAQ,WAAW,EAAG,QAAO,QAAQ;AACvF,MAAI,YAAa,QAAOC,MAAK,aAAa,SAAS,QAAQ,WAAW,CAAC;AACvE,SAAO,QAAQ;AACjB;","names":["existsSync","join"]}
|
|
1
|
+
{"version":3,"sources":["../../src/campaign/labeled-store/fs-adapter.ts","../../src/campaign/worktree/index.ts"],"sourcesContent":["/**\n * @experimental\n *\n * Filesystem `LabeledScenarioStore` adapter. The default capture sink for\n * traces + eval artifacts. Production deployments typically swap for a\n * Turso/SQLite adapter (same interface).\n *\n * Records land as one JSONL file per source under `<root>/<source>.jsonl`.\n * Each line is a `LabeledScenarioRecord`. Append-only — no in-place edits.\n *\n * Safety properties enforced at write-time:\n *\n * - **Provenance required**: writes without `source`, `sourceVersionHash`,\n * `capturedAt`, `redactionStatus` are rejected. Closes the alignment\n * reviewer's data-poisoning gap.\n * - **Per-source rate limits**: optional `rateLimitBucket` + `maxWritesPerMinute`\n * stops a single tenant/source from flooding the store.\n *\n * Safety properties enforced at sample-time:\n *\n * - **Required split + capturedBefore**: substrate refuses to sample without\n * an explicit `split` ('train' | 'test') AND a temporal cutoff. Eliminates\n * accidental train/test contamination.\n * - **Default training-source filter**: when the store is sampled with\n * `split: 'train'`, production-trace records are EXCLUDED unless the\n * caller passes `filter.source: 'production-trace'` explicitly. Closes\n * the contamination-by-default gap flagged by the senior eval engineer.\n */\n\nimport { createHash } from 'node:crypto'\nimport { existsSync, mkdirSync, readFileSync, writeFileSync } from 'node:fs'\nimport { join } from 'node:path'\nimport type {\n LabeledScenarioRecord,\n LabeledScenarioSampleArgs,\n LabeledScenarioSource,\n LabeledScenarioStore,\n LabeledScenarioWrite,\n LabelTrust,\n} from '../types'\nimport { labelTrustRank } from '../types'\n\nexport interface FsLabeledScenarioStoreOptions {\n /** Root directory for JSONL files. Created if missing. */\n root: string\n /** Per-source rate limit. When set, writes exceeding the cap are rejected\n * with a typed error. Default: no limit. */\n maxWritesPerMinutePerBucket?: number\n /** Test seam — override `Date.now()` for deterministic tests. */\n now?: () => number\n}\n\nexport class LabeledScenarioStoreError extends Error {\n constructor(\n public readonly code: string,\n message: string,\n ) {\n super(message)\n this.name = 'LabeledScenarioStoreError'\n }\n}\n\ninterface RateLimitState {\n bucket: string\n windowStartMs: number\n count: number\n}\n\nexport class FsLabeledScenarioStore implements LabeledScenarioStore {\n private readonly now: () => number\n private readonly rateLimits = new Map<string, RateLimitState>()\n\n constructor(private readonly options: FsLabeledScenarioStoreOptions) {\n if (!existsSync(options.root)) mkdirSync(options.root, { recursive: true })\n this.now = options.now ?? Date.now\n }\n\n async observe(write: LabeledScenarioWrite): Promise<void> {\n this.assertProvenance(write)\n this.assertRateLimit(write)\n const record = this.toRecord(write)\n const path = this.pathForSource(write.source)\n const line = `${JSON.stringify(record)}\\n`\n // Append atomically. For high-throughput a writev-friendly buffered\n // implementation lands in the Turso adapter; FS adapter is for tests +\n // local dev + small workloads.\n appendLine(path, line)\n }\n\n async sample(args: LabeledScenarioSampleArgs): Promise<LabeledScenarioRecord[]> {\n if (!args.split) {\n throw new LabeledScenarioStoreError(\n 'split_required',\n 'sample() requires an explicit `split` (train | test) — substrate refuses ambiguous reads',\n )\n }\n if (!args.capturedBefore) {\n throw new LabeledScenarioStoreError(\n 'capturedBefore_required',\n 'sample() requires an explicit `capturedBefore` timestamp for temporal-split discipline',\n )\n }\n\n const all: LabeledScenarioRecord[] = []\n for (const source of ALL_SOURCES) {\n // Default training-source filter: when sampling train, EXCLUDE\n // production-trace records unless the caller asks for them.\n if (args.split === 'train' && source === 'production-trace') {\n const explicit = sourceFilterContains(args.filter?.source, 'production-trace')\n if (!explicit) continue\n }\n const path = this.pathForSource(source)\n if (!existsSync(path)) continue\n const lines = readFileSync(path, 'utf8').split('\\n').filter(Boolean)\n for (const line of lines) {\n let record: LabeledScenarioRecord\n try {\n record = JSON.parse(line) as LabeledScenarioRecord\n } catch {\n continue\n }\n if (!matchesFilter(record, args, source)) continue\n all.push(record)\n }\n }\n\n // Deterministic order: by capturedAt ascending, then recordHash.\n all.sort((a, b) => {\n if (a.capturedAt !== b.capturedAt) return a.capturedAt.localeCompare(b.capturedAt)\n return a.recordHash.localeCompare(b.recordHash)\n })\n\n return all.slice(0, args.count)\n }\n\n async size(): Promise<{\n train: number\n test: number\n bySource: Record<string, number>\n byTrust: Record<LabelTrust, number>\n }> {\n const bySource: Record<string, number> = {}\n const byTrust: Record<LabelTrust, number> = {\n unverified: 0,\n 'verified-signal': 0,\n 'human-rated': 0,\n }\n let total = 0\n for (const source of ALL_SOURCES) {\n const path = this.pathForSource(source)\n if (!existsSync(path)) {\n bySource[source] = 0\n continue\n }\n const lines = readFileSync(path, 'utf8').split('\\n').filter(Boolean)\n bySource[source] = lines.length\n total += lines.length\n for (const line of lines) {\n let trust: LabelTrust = 'unverified'\n try {\n trust = (JSON.parse(line) as LabeledScenarioRecord).labelTrust ?? 'unverified'\n } catch {\n // A malformed line counts as unverified — never silently gold.\n }\n byTrust[trust] += 1\n }\n }\n // FS adapter doesn't track split assignments per-record (split is\n // computed at sample-time based on `capturedBefore`). For size(), we\n // report `train`+`test` as the same total — split is a sampling concept.\n return { train: total, test: total, bySource, byTrust }\n }\n\n private assertProvenance(write: LabeledScenarioWrite): void {\n if (!write.source) {\n throw new LabeledScenarioStoreError(\n 'missing_source',\n 'LabeledScenarioWrite requires `source`',\n )\n }\n if (!write.sourceVersionHash || write.sourceVersionHash.length === 0) {\n throw new LabeledScenarioStoreError(\n 'missing_source_version',\n 'LabeledScenarioWrite requires `sourceVersionHash` (git sha or substrate version)',\n )\n }\n if (!write.capturedAt) {\n throw new LabeledScenarioStoreError(\n 'missing_captured_at',\n 'LabeledScenarioWrite requires `capturedAt` ISO timestamp',\n )\n }\n if (!write.redactionStatus) {\n throw new LabeledScenarioStoreError(\n 'missing_redaction_status',\n 'LabeledScenarioWrite requires explicit `redactionStatus` — raw / redacted-pii / redacted-secrets / fully-redacted',\n )\n }\n if (!ALL_SOURCES.includes(write.source)) {\n throw new LabeledScenarioStoreError(\n 'unknown_source',\n `LabeledScenarioWrite.source must be one of: ${ALL_SOURCES.join(', ')}`,\n )\n }\n }\n\n private assertRateLimit(write: LabeledScenarioWrite): void {\n const cap = this.options.maxWritesPerMinutePerBucket\n if (!cap || !write.rateLimitBucket) return\n const now = this.now()\n const windowMs = 60_000\n let state = this.rateLimits.get(write.rateLimitBucket)\n if (!state || now - state.windowStartMs >= windowMs) {\n state = { bucket: write.rateLimitBucket, windowStartMs: now, count: 0 }\n this.rateLimits.set(write.rateLimitBucket, state)\n }\n if (state.count >= cap) {\n throw new LabeledScenarioStoreError(\n 'rate_limit_exceeded',\n `LabeledScenarioStore: bucket ${write.rateLimitBucket} exceeded ${cap} writes/min`,\n )\n }\n state.count += 1\n }\n\n private toRecord(write: LabeledScenarioWrite): LabeledScenarioRecord {\n const recordHash = sha256(\n JSON.stringify({\n id: write.scenario.id,\n src: write.source,\n at: write.capturedAt,\n ver: write.sourceVersionHash,\n }),\n )\n // FS adapter assigns split at sample-time, but we cache a hint here\n // based on capturedAt vs the world's \"now\" — sampler overrides this.\n return {\n ...write,\n recordHash,\n split: 'train',\n }\n }\n\n private pathForSource(source: string): string {\n return join(this.options.root, `${source}.jsonl`)\n }\n}\n\nconst ALL_SOURCES: LabeledScenarioWrite['source'][] = [\n 'production-trace',\n 'eval-run',\n 'manual',\n 'red-team',\n 'synthetic',\n]\n\nfunction sourceFilterContains(\n filter: LabeledScenarioSource | LabeledScenarioSource[] | undefined,\n needle: LabeledScenarioSource,\n): boolean {\n if (!filter) return false\n if (Array.isArray(filter)) return filter.includes(needle)\n return filter === needle\n}\n\nfunction matchesFilter(\n record: LabeledScenarioRecord,\n args: LabeledScenarioSampleArgs,\n source: string,\n): boolean {\n // Temporal cutoff — train must be capturedAt < capturedBefore.\n if (args.split === 'train' && record.capturedAt >= args.capturedBefore) return false\n if (args.split === 'test' && record.capturedAt < args.capturedBefore) return false\n\n const f = args.filter\n if (!f) return true\n if (f.kind && record.scenario.kind !== f.kind) return false\n if (f.source) {\n const sources = Array.isArray(f.source) ? f.source : [f.source]\n if (!sources.includes(source as never)) return false\n }\n if (f.minComposite !== undefined || f.maxComposite !== undefined) {\n const composites = Object.values(record.judgeScores).map((s) => s.composite)\n const max = composites.length === 0 ? 0 : Math.max(...composites)\n if (f.minComposite !== undefined && max < f.minComposite) return false\n if (f.maxComposite !== undefined && max > f.maxComposite) return false\n }\n if (f.minTrust !== undefined && labelTrustRank(record.labelTrust) < labelTrustRank(f.minTrust)) {\n return false\n }\n return true\n}\n\nfunction sha256(input: string): string {\n return createHash('sha256').update(input).digest('hex').slice(0, 16)\n}\n\nfunction appendLine(path: string, line: string): void {\n if (existsSync(path)) {\n const existing = readFileSync(path, 'utf8')\n writeFileSync(path, existing + line)\n } else {\n writeFileSync(path, line)\n }\n}\n","/**\n * @experimental\n *\n * VCS-pluggable worktree adapter. One improvement = one worktree, PR-like\n * (multiple commits allowed). A code-tier driver's `propose()` creates a\n * worktree, an agent commits the change into it, and `finalize()` returns a\n * `CodeSurface{ worktreeRef }` the measurement checks out to run the worker\n * against the changed code. On promotion the worktree becomes the PR branch.\n *\n * The interface is VCS-agnostic so a future `jj` ([jj-vcs](https://github.com/jj-vcs/jj))\n * adapter can slot in without touching driver code. Only the git adapter\n * ships today. See `docs/design/self-improvement-engine.md`.\n */\n\nimport { execFileSync } from 'node:child_process'\nimport { existsSync } from 'node:fs'\nimport { basename, isAbsolute, join } from 'node:path'\nimport type { CodeSurface } from '../types'\n\nexport interface Worktree {\n /** Absolute path to the checked-out worktree directory. */\n path: string\n /** The branch the worktree is on (becomes the PR branch on promotion). */\n branch: string\n /** The ref the worktree was forked from. */\n baseRef: string\n}\n\nexport interface WorktreeAdapter {\n /** Create an isolated worktree on a fresh branch off `baseRef`. */\n create(opts: { baseRef: string; label: string }): Promise<Worktree>\n /** Commit any pending changes in the worktree, then return a CodeSurface\n * pointing at it. The agent has already written its change into\n * `worktree.path` by the time this is called. */\n finalize(worktree: Worktree, summary: string): Promise<CodeSurface>\n /** Remove the worktree (and its branch) — called for losing candidates. */\n discard(worktree: Worktree): Promise<void>\n}\n\nexport class WorktreeAdapterError extends Error {\n constructor(\n message: string,\n readonly cause?: unknown,\n ) {\n super(message)\n this.name = 'WorktreeAdapterError'\n }\n}\n\nexport interface GitWorktreeAdapterOptions {\n /** Repo root the worktrees fork from. */\n repoRoot: string\n /** Directory worktrees are created under. Default: `<repoRoot>/.worktrees`. */\n worktreeDir?: string\n /** Branch-name prefix. Default: `improve`. */\n branchPrefix?: string\n /** Test seam — defaults to a real `git` runner. */\n git?: (args: string[], cwd: string) => string\n}\n\nfunction defaultGit(args: string[], cwd: string): string {\n try {\n return execFileSync('git', args, { cwd, encoding: 'utf8' }).trim()\n } catch (err) {\n const stderr =\n err && typeof err === 'object' && 'stderr' in err\n ? String((err as { stderr: unknown }).stderr)\n : ''\n throw new WorktreeAdapterError(`git ${args.join(' ')} failed: ${stderr || String(err)}`, err)\n }\n}\n\n/** Slugify a label into a branch-safe segment. */\nfunction slug(label: string): string {\n return (\n label\n .toLowerCase()\n .replace(/[^a-z0-9]+/g, '-')\n .replace(/^-+|-+$/g, '')\n .slice(0, 48) || 'candidate'\n )\n}\n\nexport function gitWorktreeAdapter(opts: GitWorktreeAdapterOptions): WorktreeAdapter {\n const git = opts.git ?? defaultGit\n const worktreeDir = opts.worktreeDir ?? join(opts.repoRoot, '.worktrees')\n const branchPrefix = opts.branchPrefix ?? 'improve'\n\n return {\n async create({ baseRef, label }) {\n const id = `${slug(label)}-${Date.now().toString(36)}-${Math.random().toString(36).slice(2, 6)}`\n const branch = `${branchPrefix}/${id}`\n const path = join(worktreeDir, id)\n git(['worktree', 'add', '-b', branch, path, baseRef], opts.repoRoot)\n return { path, branch, baseRef }\n },\n\n async finalize(worktree, summary) {\n // Stage + commit any pending changes the agent left in the worktree.\n // A no-op commit is refused by git, so only commit when the tree is dirty.\n const status = git(['status', '--porcelain'], worktree.path)\n if (status.length > 0) {\n git(['add', '-A'], worktree.path)\n git(['commit', '-m', summary], worktree.path)\n }\n return {\n kind: 'code',\n worktreeRef: worktree.path,\n baseRef: worktree.baseRef,\n summary,\n }\n },\n\n async discard(worktree) {\n // Remove the worktree, then delete its branch. Force-remove because the\n // worktree may hold uncommitted experiment state we're discarding.\n git(['worktree', 'remove', '--force', worktree.path], opts.repoRoot)\n git(['branch', '-D', worktree.branch], opts.repoRoot)\n },\n }\n}\n\n/** Resolve a `CodeSurface`'s worktreeRef to a directory the measurement can\n * run the worker in. A path ref is returned as-is; anything else is treated\n * as a ref under the adapter's worktree dir. */\nexport function resolveWorktreePath(surface: CodeSurface, worktreeDir?: string): string {\n if (isAbsolute(surface.worktreeRef) && existsSync(surface.worktreeRef)) return surface.worktreeRef\n if (worktreeDir) return join(worktreeDir, basename(surface.worktreeRef))\n return surface.worktreeRef\n}\n"],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;AA6BA,SAAS,kBAAkB;AAC3B,SAAS,YAAY,WAAW,cAAc,qBAAqB;AACnE,SAAS,YAAY;AAqBd,IAAM,4BAAN,cAAwC,MAAM;AAAA,EACnD,YACkB,MAChB,SACA;AACA,UAAM,OAAO;AAHG;AAIhB,SAAK,OAAO;AAAA,EACd;AAAA,EALkB;AAMpB;AAQO,IAAM,yBAAN,MAA6D;AAAA,EAIlE,YAA6B,SAAwC;AAAxC;AAC3B,QAAI,CAAC,WAAW,QAAQ,IAAI,EAAG,WAAU,QAAQ,MAAM,EAAE,WAAW,KAAK,CAAC;AAC1E,SAAK,MAAM,QAAQ,OAAO,KAAK;AAAA,EACjC;AAAA,EAH6B;AAAA,EAHZ;AAAA,EACA,aAAa,oBAAI,IAA4B;AAAA,EAO9D,MAAM,QAAQ,OAA4C;AACxD,SAAK,iBAAiB,KAAK;AAC3B,SAAK,gBAAgB,KAAK;AAC1B,UAAM,SAAS,KAAK,SAAS,KAAK;AAClC,UAAM,OAAO,KAAK,cAAc,MAAM,MAAM;AAC5C,UAAM,OAAO,GAAG,KAAK,UAAU,MAAM,CAAC;AAAA;AAItC,eAAW,MAAM,IAAI;AAAA,EACvB;AAAA,EAEA,MAAM,OAAO,MAAmE;AAC9E,QAAI,CAAC,KAAK,OAAO;AACf,YAAM,IAAI;AAAA,QACR;AAAA,QACA;AAAA,MACF;AAAA,IACF;AACA,QAAI,CAAC,KAAK,gBAAgB;AACxB,YAAM,IAAI;AAAA,QACR;AAAA,QACA;AAAA,MACF;AAAA,IACF;AAEA,UAAM,MAA+B,CAAC;AACtC,eAAW,UAAU,aAAa;AAGhC,UAAI,KAAK,UAAU,WAAW,WAAW,oBAAoB;AAC3D,cAAM,WAAW,qBAAqB,KAAK,QAAQ,QAAQ,kBAAkB;AAC7E,YAAI,CAAC,SAAU;AAAA,MACjB;AACA,YAAM,OAAO,KAAK,cAAc,MAAM;AACtC,UAAI,CAAC,WAAW,IAAI,EAAG;AACvB,YAAM,QAAQ,aAAa,MAAM,MAAM,EAAE,MAAM,IAAI,EAAE,OAAO,OAAO;AACnE,iBAAW,QAAQ,OAAO;AACxB,YAAI;AACJ,YAAI;AACF,mBAAS,KAAK,MAAM,IAAI;AAAA,QAC1B,QAAQ;AACN;AAAA,QACF;AACA,YAAI,CAAC,cAAc,QAAQ,MAAM,MAAM,EAAG;AAC1C,YAAI,KAAK,MAAM;AAAA,MACjB;AAAA,IACF;AAGA,QAAI,KAAK,CAAC,GAAG,MAAM;AACjB,UAAI,EAAE,eAAe,EAAE,WAAY,QAAO,EAAE,WAAW,cAAc,EAAE,UAAU;AACjF,aAAO,EAAE,WAAW,cAAc,EAAE,UAAU;AAAA,IAChD,CAAC;AAED,WAAO,IAAI,MAAM,GAAG,KAAK,KAAK;AAAA,EAChC;AAAA,EAEA,MAAM,OAKH;AACD,UAAM,WAAmC,CAAC;AAC1C,UAAM,UAAsC;AAAA,MAC1C,YAAY;AAAA,MACZ,mBAAmB;AAAA,MACnB,eAAe;AAAA,IACjB;AACA,QAAI,QAAQ;AACZ,eAAW,UAAU,aAAa;AAChC,YAAM,OAAO,KAAK,cAAc,MAAM;AACtC,UAAI,CAAC,WAAW,IAAI,GAAG;AACrB,iBAAS,MAAM,IAAI;AACnB;AAAA,MACF;AACA,YAAM,QAAQ,aAAa,MAAM,MAAM,EAAE,MAAM,IAAI,EAAE,OAAO,OAAO;AACnE,eAAS,MAAM,IAAI,MAAM;AACzB,eAAS,MAAM;AACf,iBAAW,QAAQ,OAAO;AACxB,YAAI,QAAoB;AACxB,YAAI;AACF,kBAAS,KAAK,MAAM,IAAI,EAA4B,cAAc;AAAA,QACpE,QAAQ;AAAA,QAER;AACA,gBAAQ,KAAK,KAAK;AAAA,MACpB;AAAA,IACF;AAIA,WAAO,EAAE,OAAO,OAAO,MAAM,OAAO,UAAU,QAAQ;AAAA,EACxD;AAAA,EAEQ,iBAAiB,OAAmC;AAC1D,QAAI,CAAC,MAAM,QAAQ;AACjB,YAAM,IAAI;AAAA,QACR;AAAA,QACA;AAAA,MACF;AAAA,IACF;AACA,QAAI,CAAC,MAAM,qBAAqB,MAAM,kBAAkB,WAAW,GAAG;AACpE,YAAM,IAAI;AAAA,QACR;AAAA,QACA;AAAA,MACF;AAAA,IACF;AACA,QAAI,CAAC,MAAM,YAAY;AACrB,YAAM,IAAI;AAAA,QACR;AAAA,QACA;AAAA,MACF;AAAA,IACF;AACA,QAAI,CAAC,MAAM,iBAAiB;AAC1B,YAAM,IAAI;AAAA,QACR;AAAA,QACA;AAAA,MACF;AAAA,IACF;AACA,QAAI,CAAC,YAAY,SAAS,MAAM,MAAM,GAAG;AACvC,YAAM,IAAI;AAAA,QACR;AAAA,QACA,+CAA+C,YAAY,KAAK,IAAI,CAAC;AAAA,MACvE;AAAA,IACF;AAAA,EACF;AAAA,EAEQ,gBAAgB,OAAmC;AACzD,UAAM,MAAM,KAAK,QAAQ;AACzB,QAAI,CAAC,OAAO,CAAC,MAAM,gBAAiB;AACpC,UAAM,MAAM,KAAK,IAAI;AACrB,UAAM,WAAW;AACjB,QAAI,QAAQ,KAAK,WAAW,IAAI,MAAM,eAAe;AACrD,QAAI,CAAC,SAAS,MAAM,MAAM,iBAAiB,UAAU;AACnD,cAAQ,EAAE,QAAQ,MAAM,iBAAiB,eAAe,KAAK,OAAO,EAAE;AACtE,WAAK,WAAW,IAAI,MAAM,iBAAiB,KAAK;AAAA,IAClD;AACA,QAAI,MAAM,SAAS,KAAK;AACtB,YAAM,IAAI;AAAA,QACR;AAAA,QACA,gCAAgC,MAAM,eAAe,aAAa,GAAG;AAAA,MACvE;AAAA,IACF;AACA,UAAM,SAAS;AAAA,EACjB;AAAA,EAEQ,SAAS,OAAoD;AACnE,UAAM,aAAa;AAAA,MACjB,KAAK,UAAU;AAAA,QACb,IAAI,MAAM,SAAS;AAAA,QACnB,KAAK,MAAM;AAAA,QACX,IAAI,MAAM;AAAA,QACV,KAAK,MAAM;AAAA,MACb,CAAC;AAAA,IACH;AAGA,WAAO;AAAA,MACL,GAAG;AAAA,MACH;AAAA,MACA,OAAO;AAAA,IACT;AAAA,EACF;AAAA,EAEQ,cAAc,QAAwB;AAC5C,WAAO,KAAK,KAAK,QAAQ,MAAM,GAAG,MAAM,QAAQ;AAAA,EAClD;AACF;AAEA,IAAM,cAAgD;AAAA,EACpD;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AACF;AAEA,SAAS,qBACP,QACA,QACS;AACT,MAAI,CAAC,OAAQ,QAAO;AACpB,MAAI,MAAM,QAAQ,MAAM,EAAG,QAAO,OAAO,SAAS,MAAM;AACxD,SAAO,WAAW;AACpB;AAEA,SAAS,cACP,QACA,MACA,QACS;AAET,MAAI,KAAK,UAAU,WAAW,OAAO,cAAc,KAAK,eAAgB,QAAO;AAC/E,MAAI,KAAK,UAAU,UAAU,OAAO,aAAa,KAAK,eAAgB,QAAO;AAE7E,QAAM,IAAI,KAAK;AACf,MAAI,CAAC,EAAG,QAAO;AACf,MAAI,EAAE,QAAQ,OAAO,SAAS,SAAS,EAAE,KAAM,QAAO;AACtD,MAAI,EAAE,QAAQ;AACZ,UAAM,UAAU,MAAM,QAAQ,EAAE,MAAM,IAAI,EAAE,SAAS,CAAC,EAAE,MAAM;AAC9D,QAAI,CAAC,QAAQ,SAAS,MAAe,EAAG,QAAO;AAAA,EACjD;AACA,MAAI,EAAE,iBAAiB,UAAa,EAAE,iBAAiB,QAAW;AAChE,UAAM,aAAa,OAAO,OAAO,OAAO,WAAW,EAAE,IAAI,CAAC,MAAM,EAAE,SAAS;AAC3E,UAAM,MAAM,WAAW,WAAW,IAAI,IAAI,KAAK,IAAI,GAAG,UAAU;AAChE,QAAI,EAAE,iBAAiB,UAAa,MAAM,EAAE,aAAc,QAAO;AACjE,QAAI,EAAE,iBAAiB,UAAa,MAAM,EAAE,aAAc,QAAO;AAAA,EACnE;AACA,MAAI,EAAE,aAAa,UAAa,eAAe,OAAO,UAAU,IAAI,eAAe,EAAE,QAAQ,GAAG;AAC9F,WAAO;AAAA,EACT;AACA,SAAO;AACT;AAEA,SAAS,OAAO,OAAuB;AACrC,SAAO,WAAW,QAAQ,EAAE,OAAO,KAAK,EAAE,OAAO,KAAK,EAAE,MAAM,GAAG,EAAE;AACrE;AAEA,SAAS,WAAW,MAAc,MAAoB;AACpD,MAAI,WAAW,IAAI,GAAG;AACpB,UAAM,WAAW,aAAa,MAAM,MAAM;AAC1C,kBAAc,MAAM,WAAW,IAAI;AAAA,EACrC,OAAO;AACL,kBAAc,MAAM,IAAI;AAAA,EAC1B;AACF;;;AClSA,SAAS,oBAAoB;AAC7B,SAAS,cAAAA,mBAAkB;AAC3B,SAAS,UAAU,YAAY,QAAAC,aAAY;AAuBpC,IAAM,uBAAN,cAAmC,MAAM;AAAA,EAC9C,YACE,SACS,OACT;AACA,UAAM,OAAO;AAFJ;AAGT,SAAK,OAAO;AAAA,EACd;AAAA,EAJW;AAKb;AAaA,SAAS,WAAW,MAAgB,KAAqB;AACvD,MAAI;AACF,WAAO,aAAa,OAAO,MAAM,EAAE,KAAK,UAAU,OAAO,CAAC,EAAE,KAAK;AAAA,EACnE,SAAS,KAAK;AACZ,UAAM,SACJ,OAAO,OAAO,QAAQ,YAAY,YAAY,MAC1C,OAAQ,IAA4B,MAAM,IAC1C;AACN,UAAM,IAAI,qBAAqB,OAAO,KAAK,KAAK,GAAG,CAAC,YAAY,UAAU,OAAO,GAAG,CAAC,IAAI,GAAG;AAAA,EAC9F;AACF;AAGA,SAAS,KAAK,OAAuB;AACnC,SACE,MACG,YAAY,EACZ,QAAQ,eAAe,GAAG,EAC1B,QAAQ,YAAY,EAAE,EACtB,MAAM,GAAG,EAAE,KAAK;AAEvB;AAEO,SAAS,mBAAmB,MAAkD;AACnF,QAAM,MAAM,KAAK,OAAO;AACxB,QAAM,cAAc,KAAK,eAAeA,MAAK,KAAK,UAAU,YAAY;AACxE,QAAM,eAAe,KAAK,gBAAgB;AAE1C,SAAO;AAAA,IACL,MAAM,OAAO,EAAE,SAAS,MAAM,GAAG;AAC/B,YAAM,KAAK,GAAG,KAAK,KAAK,CAAC,IAAI,KAAK,IAAI,EAAE,SAAS,EAAE,CAAC,IAAI,KAAK,OAAO,EAAE,SAAS,EAAE,EAAE,MAAM,GAAG,CAAC,CAAC;AAC9F,YAAM,SAAS,GAAG,YAAY,IAAI,EAAE;AACpC,YAAM,OAAOA,MAAK,aAAa,EAAE;AACjC,UAAI,CAAC,YAAY,OAAO,MAAM,QAAQ,MAAM,OAAO,GAAG,KAAK,QAAQ;AACnE,aAAO,EAAE,MAAM,QAAQ,QAAQ;AAAA,IACjC;AAAA,IAEA,MAAM,SAAS,UAAU,SAAS;AAGhC,YAAM,SAAS,IAAI,CAAC,UAAU,aAAa,GAAG,SAAS,IAAI;AAC3D,UAAI,OAAO,SAAS,GAAG;AACrB,YAAI,CAAC,OAAO,IAAI,GAAG,SAAS,IAAI;AAChC,YAAI,CAAC,UAAU,MAAM,OAAO,GAAG,SAAS,IAAI;AAAA,MAC9C;AACA,aAAO;AAAA,QACL,MAAM;AAAA,QACN,aAAa,SAAS;AAAA,QACtB,SAAS,SAAS;AAAA,QAClB;AAAA,MACF;AAAA,IACF;AAAA,IAEA,MAAM,QAAQ,UAAU;AAGtB,UAAI,CAAC,YAAY,UAAU,WAAW,SAAS,IAAI,GAAG,KAAK,QAAQ;AACnE,UAAI,CAAC,UAAU,MAAM,SAAS,MAAM,GAAG,KAAK,QAAQ;AAAA,IACtD;AAAA,EACF;AACF;AAKO,SAAS,oBAAoB,SAAsB,aAA8B;AACtF,MAAI,WAAW,QAAQ,WAAW,KAAKD,YAAW,QAAQ,WAAW,EAAG,QAAO,QAAQ;AACvF,MAAI,YAAa,QAAOC,MAAK,aAAa,SAAS,QAAQ,WAAW,CAAC;AACvE,SAAO,QAAQ;AACjB;","names":["existsSync","join"]}
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import {
|
|
2
2
|
__export
|
|
3
|
-
} from "./chunk-
|
|
3
|
+
} from "./chunk-PZ5AY32C.js";
|
|
4
4
|
|
|
5
5
|
// src/benchmarks/index.ts
|
|
6
6
|
var benchmarks_exports = {};
|
|
@@ -220,4 +220,4 @@ export {
|
|
|
220
220
|
routing_exports,
|
|
221
221
|
benchmarks_exports
|
|
222
222
|
};
|
|
223
|
-
//# sourceMappingURL=chunk-
|
|
223
|
+
//# sourceMappingURL=chunk-6QDKWHLS.js.map
|
|
@@ -2,9 +2,111 @@ import {
|
|
|
2
2
|
DEFAULT_REDACTION_RULES
|
|
3
3
|
} from "./chunk-GGE4NNQT.js";
|
|
4
4
|
import {
|
|
5
|
+
AgentEvalError,
|
|
5
6
|
ValidationError
|
|
6
7
|
} from "./chunk-QYJT52YW.js";
|
|
7
8
|
|
|
9
|
+
// src/integrity/backend-integrity.ts
|
|
10
|
+
var BackendIntegrityError = class extends AgentEvalError {
|
|
11
|
+
constructor(message, report) {
|
|
12
|
+
super("backend_integrity", message);
|
|
13
|
+
this.report = report;
|
|
14
|
+
}
|
|
15
|
+
report;
|
|
16
|
+
};
|
|
17
|
+
function isStubRecord(rec) {
|
|
18
|
+
return rec.tokenUsage.input === 0 && rec.tokenUsage.output === 0;
|
|
19
|
+
}
|
|
20
|
+
function isUncostedRecord(rec) {
|
|
21
|
+
return rec.tokenUsage.output > 0 && rec.costUsd === 0;
|
|
22
|
+
}
|
|
23
|
+
function summarizeBackendIntegrity(records) {
|
|
24
|
+
const totalRecords = records.length;
|
|
25
|
+
let stubRecords = 0;
|
|
26
|
+
let realRecords = 0;
|
|
27
|
+
let uncostedRecords = 0;
|
|
28
|
+
let totalInputTokens = 0;
|
|
29
|
+
let totalOutputTokens = 0;
|
|
30
|
+
let totalCostUsd = 0;
|
|
31
|
+
for (const rec of records) {
|
|
32
|
+
totalInputTokens += rec.tokenUsage.input;
|
|
33
|
+
totalOutputTokens += rec.tokenUsage.output;
|
|
34
|
+
totalCostUsd += rec.costUsd;
|
|
35
|
+
if (isStubRecord(rec)) stubRecords++;
|
|
36
|
+
else realRecords++;
|
|
37
|
+
if (isUncostedRecord(rec)) uncostedRecords++;
|
|
38
|
+
}
|
|
39
|
+
const verdict = totalRecords === 0 ? "stub" : stubRecords === totalRecords ? "stub" : stubRecords === 0 ? "real" : "mixed";
|
|
40
|
+
const diagnosis = buildDiagnosis({
|
|
41
|
+
totalRecords,
|
|
42
|
+
stubRecords,
|
|
43
|
+
realRecords,
|
|
44
|
+
uncostedRecords,
|
|
45
|
+
totalInputTokens,
|
|
46
|
+
totalOutputTokens,
|
|
47
|
+
totalCostUsd,
|
|
48
|
+
verdict
|
|
49
|
+
});
|
|
50
|
+
return {
|
|
51
|
+
totalRecords,
|
|
52
|
+
stubRecords,
|
|
53
|
+
realRecords,
|
|
54
|
+
uncostedRecords,
|
|
55
|
+
totalInputTokens,
|
|
56
|
+
totalOutputTokens,
|
|
57
|
+
totalCostUsd,
|
|
58
|
+
verdict,
|
|
59
|
+
diagnosis
|
|
60
|
+
};
|
|
61
|
+
}
|
|
62
|
+
function buildDiagnosis(r) {
|
|
63
|
+
if (r.totalRecords === 0) {
|
|
64
|
+
return "no records \u2014 eval produced zero runs; backend likely failed before first turn";
|
|
65
|
+
}
|
|
66
|
+
if (r.verdict === "stub") {
|
|
67
|
+
return [
|
|
68
|
+
`all ${r.totalRecords} records have zero token usage \u2014 the LLM backend was never called.`,
|
|
69
|
+
"common causes: --backend sandbox without a sandbox bridge running; stub model returning hard-coded strings;",
|
|
70
|
+
"auth misconfigured so requests were silently dropped before the LLM. Re-run with --backend tcloud and TANGLE_API_KEY set,",
|
|
71
|
+
"or boot the cli-bridge / sandbox before invoking the eval."
|
|
72
|
+
].join(" ");
|
|
73
|
+
}
|
|
74
|
+
if (r.verdict === "mixed") {
|
|
75
|
+
const pct = (r.stubRecords / r.totalRecords * 100).toFixed(0);
|
|
76
|
+
return [
|
|
77
|
+
`${r.stubRecords}/${r.totalRecords} records (${pct}%) have zero token usage \u2014 the backend partially failed.`,
|
|
78
|
+
"common causes: rate-limit cascade (429s after the first N personas);",
|
|
79
|
+
"transient auth expiry mid-run; provider outage. Treat the affected records as missing data, not agent failures."
|
|
80
|
+
].join(" ");
|
|
81
|
+
}
|
|
82
|
+
if (r.uncostedRecords > 0) {
|
|
83
|
+
const pct = (r.uncostedRecords / r.totalRecords * 100).toFixed(0);
|
|
84
|
+
return [
|
|
85
|
+
`${r.totalRecords} records with real LLM activity (in=${r.totalInputTokens}, out=${r.totalOutputTokens} tokens).`,
|
|
86
|
+
`${r.uncostedRecords} (${pct}%) have output tokens but costUsd=0 \u2014 cost ledger is mis-wired (no input-token`,
|
|
87
|
+
"propagation from the runtime stream into RunRecord)."
|
|
88
|
+
].join(" ");
|
|
89
|
+
}
|
|
90
|
+
return `${r.totalRecords} records with real LLM activity (in=${r.totalInputTokens}, out=${r.totalOutputTokens} tokens, $${r.totalCostUsd.toFixed(4)}).`;
|
|
91
|
+
}
|
|
92
|
+
function assertRealBackend(records, opts = {}) {
|
|
93
|
+
const report = summarizeBackendIntegrity(records);
|
|
94
|
+
const allowMixed = opts.allowMixed ?? true;
|
|
95
|
+
if (report.verdict === "stub") {
|
|
96
|
+
throw new BackendIntegrityError(
|
|
97
|
+
`backend-integrity: ran against a stub or unconfigured backend \u2014 ${report.diagnosis}`,
|
|
98
|
+
report
|
|
99
|
+
);
|
|
100
|
+
}
|
|
101
|
+
if (!allowMixed && report.verdict === "mixed") {
|
|
102
|
+
throw new BackendIntegrityError(
|
|
103
|
+
`backend-integrity: partial backend failure rejected \u2014 ${report.diagnosis}`,
|
|
104
|
+
report
|
|
105
|
+
);
|
|
106
|
+
}
|
|
107
|
+
return report;
|
|
108
|
+
}
|
|
109
|
+
|
|
8
110
|
// src/dataset.ts
|
|
9
111
|
var HoldoutLockedError = class extends ValidationError {
|
|
10
112
|
constructor(datasetName) {
|
|
@@ -746,6 +848,9 @@ function parseReflectionResponse(raw, maxProposals) {
|
|
|
746
848
|
}
|
|
747
849
|
|
|
748
850
|
export {
|
|
851
|
+
BackendIntegrityError,
|
|
852
|
+
summarizeBackendIntegrity,
|
|
853
|
+
assertRealBackend,
|
|
749
854
|
HoldoutLockedError,
|
|
750
855
|
Dataset,
|
|
751
856
|
hashScenarios,
|
|
@@ -759,4 +864,4 @@ export {
|
|
|
759
864
|
buildReflectionPrompt,
|
|
760
865
|
parseReflectionResponse
|
|
761
866
|
};
|
|
762
|
-
//# sourceMappingURL=chunk-
|
|
867
|
+
//# sourceMappingURL=chunk-GBHRUAOF.js.map
|