@tangle-network/agent-eval 0.77.0 → 0.80.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +50 -19
- package/dist/adapters/http.d.ts +2 -2
- package/dist/adapters/langchain.d.ts +2 -2
- package/dist/adapters/otel.d.ts +4 -4
- package/dist/{agent-profile-DYRboYWu.d.ts → agent-profile-aSEaJ9Pl.d.ts} +1 -1
- package/dist/analyst/index.d.ts +42 -8
- package/dist/analyst/index.js +32 -2
- package/dist/analyst/index.js.map +1 -1
- package/dist/authenticity/index.d.ts +54 -1
- package/dist/authenticity/index.js +88 -1
- package/dist/authenticity/index.js.map +1 -1
- package/dist/belief-state/index.d.ts +188 -0
- package/dist/belief-state/index.js +486 -0
- package/dist/belief-state/index.js.map +1 -0
- package/dist/benchmarks/index.d.ts +2 -2
- package/dist/calibration-Cpr3WaX3.d.ts +101 -0
- package/dist/campaign/index.d.ts +11 -11
- package/dist/campaign/index.js +4 -4
- package/dist/chunk-4DIJWVUT.js +131 -0
- package/dist/chunk-4DIJWVUT.js.map +1 -0
- package/dist/{chunk-7W4SM7FD.js → chunk-5LVWPNS5.js} +91 -91
- package/dist/chunk-5LVWPNS5.js.map +1 -0
- package/dist/{chunk-WYIHD6EB.js → chunk-CF67I6QY.js} +1 -1
- package/dist/chunk-CF67I6QY.js.map +1 -0
- package/dist/{chunk-XPILG2CA.js → chunk-GXHLRXDI.js} +2 -2
- package/dist/{chunk-F3SRAAZO.js → chunk-KWRRMR3J.js} +15 -1
- package/dist/chunk-KWRRMR3J.js.map +1 -0
- package/dist/chunk-NPCTHQIO.js +91 -0
- package/dist/chunk-NPCTHQIO.js.map +1 -0
- package/dist/{chunk-JYE3WOTE.js → chunk-RPLZ4OIB.js} +10 -1
- package/dist/chunk-RPLZ4OIB.js.map +1 -0
- package/dist/{chunk-6EKXFFGQ.js → chunk-RTWFUK6A.js} +2 -2
- package/dist/{chunk-XGNCBAVZ.js → chunk-XQL22JDG.js} +2 -2
- package/dist/{chunk-GJJNJVIR.js → chunk-XXNIODOM.js} +2 -2
- package/dist/contract/index.d.ts +128 -15
- package/dist/contract/index.js +118 -2
- package/dist/contract/index.js.map +1 -1
- package/dist/{control-BgA6BYTm.d.ts → control-CehLtoET.d.ts} +1 -1
- package/dist/control.d.ts +2 -2
- package/dist/control.js +2 -2
- package/dist/governance/index.d.ts +1 -1
- package/dist/hosted/index.d.ts +4 -4
- package/dist/{index-DsnOpCO6.d.ts → index-B1RKber3.d.ts} +1 -1
- package/dist/index.d.ts +127 -26
- package/dist/index.js +32 -7
- package/dist/index.js.map +1 -1
- package/dist/{insight-report-Df3lxYXM.d.ts → insight-report-dlpEzQDi.d.ts} +1 -1
- package/dist/{kind-factory-DW9XWPvM.d.ts → kind-factory-DqV2t1Xk.d.ts} +1 -1
- package/dist/meta-eval/index.d.ts +6 -99
- package/dist/meta-eval/index.js +7 -76
- package/dist/meta-eval/index.js.map +1 -1
- package/dist/off-policy-DiwuKKg7.d.ts +132 -0
- package/dist/openapi.json +1 -1
- package/dist/{outcome-store-D6KWmYvj.d.ts → outcome-store-rnXLEqSn.d.ts} +1 -1
- package/dist/{provenance-B-TFszPW.d.ts → provenance-jG-Gngg8.d.ts} +3 -3
- package/dist/{registry-DuVYiTvw.d.ts → registry-BK0Zee01.d.ts} +1 -1
- package/dist/{release-report-CN8hJlhk.d.ts → release-report-CXXZlR8g.d.ts} +2 -2
- package/dist/reporting.d.ts +5 -5
- package/dist/{researcher-C_KJyIGg.d.ts → researcher-rInLj9De.d.ts} +2 -2
- package/dist/rl.d.ts +10 -140
- package/dist/rl.js +8 -122
- package/dist/rl.js.map +1 -1
- package/dist/{rubric-predictive-validity-D_4BSXGV.d.ts → rubric-predictive-validity-CLPuwiUw.d.ts} +2 -2
- package/dist/{run-improvement-loop-BqYH2vCR.d.ts → run-improvement-loop-BAl_aVOZ.d.ts} +2 -4
- package/dist/{run-record-BgTFzO2r.d.ts → run-record-sItO5ftF.d.ts} +11 -0
- package/dist/{semantic-concept-judge-CV9Wlx4t.d.ts → semantic-concept-judge-qXEUV2w7.d.ts} +3 -3
- package/dist/{summary-report-ByiOUrHj.d.ts → summary-report-BTaXq1TS.d.ts} +1 -1
- package/dist/traces.d.ts +1 -1
- package/dist/traces.js +2 -2
- package/dist/{types-Bba0vl1V.d.ts → types-4mm2msnR.d.ts} +12 -4
- package/dist/{types-CRD68aH7.d.ts → types-DRvV0zRo.d.ts} +10 -1
- package/dist/workflow/index.d.ts +4 -4
- package/dist/workflow/index.js +1 -1
- package/docs/auto-research-loop-end-to-end.md +1 -1
- package/docs/feature-guide.md +4 -4
- package/docs/multi-shot-optimization.md +61 -115
- package/docs/product-eval-adoption.md +1 -1
- package/docs/research/belief-state-agent-eval-roadmap.md +558 -0
- package/docs/research/research-roadmap.md +1 -0
- package/docs/three-package-architecture.md +1 -1
- package/docs/trace-analysis.md +19 -0
- package/package.json +7 -2
- package/dist/chunk-7W4SM7FD.js.map +0 -1
- package/dist/chunk-F3SRAAZO.js.map +0 -1
- package/dist/chunk-JYE3WOTE.js.map +0 -1
- package/dist/chunk-WYIHD6EB.js.map +0 -1
- /package/dist/{chunk-XPILG2CA.js.map → chunk-GXHLRXDI.js.map} +0 -0
- /package/dist/{chunk-6EKXFFGQ.js.map → chunk-RTWFUK6A.js.map} +0 -0
- /package/dist/{chunk-XGNCBAVZ.js.map → chunk-XQL22JDG.js.map} +0 -0
- /package/dist/{chunk-GJJNJVIR.js.map → chunk-XXNIODOM.js.map} +0 -0
package/dist/contract/index.d.ts
CHANGED
|
@@ -1,15 +1,15 @@
|
|
|
1
|
-
import { S as Scenario, M as MutableSurface, b as DispatchContext, a as JudgeConfig, I as ImprovementDriver, G as Gate } from '../types-
|
|
2
|
-
export { C as CampaignAggregates,
|
|
3
|
-
import { L as LoopProvenanceRecord } from '../provenance-
|
|
4
|
-
export { A as AxisEvidence, a as AxisVerdict, B as BuildEvidenceVectorOptions, D as DefaultProductionGateOptions, E as EvidenceVector, b as EvolutionaryDriverOptions, H as HeldOutGateOptions, O as ObjectiveSource, P as ParetoSignificanceGateOptions, c as PromotionObjective, d as PromotionPolicy, R as RunEvalOptions, e as buildEvidenceVector, f as composeGate, g as defaultProductionGate, h as evolutionaryDriver, i as heldOutGate, p as paretoPolicy, j as paretoSignificanceGate, r as runEval } from '../provenance-
|
|
5
|
-
import { C as CampaignStorage, R as RunImprovementLoopResult } from '../run-improvement-loop-
|
|
6
|
-
export { G as GepaDriverOptions, a as RunCampaignOptions, b as RunImprovementLoopOptions, f as fsCampaignStorage, g as gepaDriver, i as inMemoryCampaignStorage, r as runCampaign, c as runImprovementLoop } from '../run-improvement-loop-
|
|
7
|
-
export { D as DeploymentOutcome, F as FileSystemOutcomeStore,
|
|
8
|
-
import { HostedTenant, TraceSpanEvent } from '../hosted/index.js';
|
|
9
|
-
import { R as RunRecord, b as RunSplitTag } from '../run-record-
|
|
10
|
-
import { I as InsightReport } from '../insight-report-
|
|
11
|
-
export { F as FailureClusterInsight, a as InterRaterInsight, J as JudgeInsight, L as LiftInsight, O as OutcomeCorrelationInsight, R as Recommendation, b as ReleaseSummary, S as ScalarDistribution } from '../insight-report-
|
|
12
|
-
import { a as AnalystRegistry } from '../registry-
|
|
1
|
+
import { S as Scenario, M as MutableSurface, b as DispatchContext, a as JudgeConfig, I as ImprovementDriver, G as Gate, c as GateDecision } from '../types-4mm2msnR.js';
|
|
2
|
+
export { C as CampaignAggregates, d as CampaignArtifactWriter, e as CampaignCellResult, f as CampaignCostMeter, g as CampaignResult, h as CampaignTraceWriter, i as CodeSurface, D as Dispatch, j as GateContext, k as GateResult, l as GenerationCandidate, m as GenerationRecord, n as JudgeDimension, J as JudgeScore, o as Mutator, O as OptimizerConfig, p as SessionScript } from '../types-4mm2msnR.js';
|
|
3
|
+
import { L as LoopProvenanceRecord } from '../provenance-jG-Gngg8.js';
|
|
4
|
+
export { A as AxisEvidence, a as AxisVerdict, B as BuildEvidenceVectorOptions, D as DefaultProductionGateOptions, E as EvidenceVector, b as EvolutionaryDriverOptions, H as HeldOutGateOptions, O as ObjectiveSource, P as ParetoSignificanceGateOptions, c as PromotionObjective, d as PromotionPolicy, R as RunEvalOptions, e as buildEvidenceVector, f as composeGate, g as defaultProductionGate, h as evolutionaryDriver, i as heldOutGate, p as paretoPolicy, j as paretoSignificanceGate, r as runEval } from '../provenance-jG-Gngg8.js';
|
|
5
|
+
import { C as CampaignStorage, R as RunImprovementLoopResult } from '../run-improvement-loop-BAl_aVOZ.js';
|
|
6
|
+
export { G as GepaDriverOptions, a as RunCampaignOptions, b as RunImprovementLoopOptions, f as fsCampaignStorage, g as gepaDriver, i as inMemoryCampaignStorage, r as runCampaign, c as runImprovementLoop } from '../run-improvement-loop-BAl_aVOZ.js';
|
|
7
|
+
export { D as DeploymentOutcome, F as FileSystemOutcomeStore, a as FileSystemOutcomeStoreOptions, I as InMemoryOutcomeStore, b as OutcomeStore } from '../outcome-store-rnXLEqSn.js';
|
|
8
|
+
import { HostedTenant, EvalRunCellScore, EvalRunGenerationSnapshot, EvalRunEvent, TraceSpanEvent } from '../hosted/index.js';
|
|
9
|
+
import { R as RunRecord, b as RunSplitTag } from '../run-record-sItO5ftF.js';
|
|
10
|
+
import { I as InsightReport } from '../insight-report-dlpEzQDi.js';
|
|
11
|
+
export { F as FailureClusterInsight, a as InterRaterInsight, J as JudgeInsight, L as LiftInsight, O as OutcomeCorrelationInsight, R as Recommendation, b as ReleaseSummary, S as ScalarDistribution } from '../insight-report-dlpEzQDi.js';
|
|
12
|
+
import { a as AnalystRegistry } from '../registry-BK0Zee01.js';
|
|
13
13
|
import { a as DatasetScenario } from '../dataset-B2kL-fSM.js';
|
|
14
14
|
import '../red-team-DW9Ca_tj.js';
|
|
15
15
|
import '../store-CKUAgsJz.js';
|
|
@@ -22,9 +22,9 @@ import '@tangle-network/tcloud';
|
|
|
22
22
|
import '../llm-client-DbjLfz-K.js';
|
|
23
23
|
import '../errors-Dwqw-T_m.js';
|
|
24
24
|
import '../raw-provider-sink-C46HDghv.js';
|
|
25
|
-
import '../summary-report-
|
|
25
|
+
import '../summary-report-BTaXq1TS.js';
|
|
26
26
|
import '../failure-cluster-CL7IVgkJ.js';
|
|
27
|
-
import '../types-
|
|
27
|
+
import '../types-DRvV0zRo.js';
|
|
28
28
|
import '../store-GmBE2pZZ.js';
|
|
29
29
|
|
|
30
30
|
/**
|
|
@@ -351,6 +351,119 @@ interface AnalyzeRunsOptions {
|
|
|
351
351
|
}
|
|
352
352
|
declare function analyzeRuns(opts: AnalyzeRunsOptions): Promise<InsightReport>;
|
|
353
353
|
|
|
354
|
+
/**
|
|
355
|
+
* # `@tangle-network/agent-eval/contract` — eval-run diff primitive.
|
|
356
|
+
*
|
|
357
|
+
* The substrate side of the v-N-versus-v-N+1 dashboard view. Given two
|
|
358
|
+
* `EvalRunEvent`s (or two `EvalRunGenerationSnapshot`s from one run), this
|
|
359
|
+
* returns a normalised diff: per-cell composite + per-judge/per-dimension
|
|
360
|
+
* deltas, surface-hash change, aggregate cost + duration shifts.
|
|
361
|
+
*
|
|
362
|
+
* Consumed by:
|
|
363
|
+
* - The hosted-tier dashboard (intelligence-web) — renders v3 vs v4
|
|
364
|
+
* comparisons of cells × judges × dimensions.
|
|
365
|
+
* - CI reporting — emits a "shipped: composite +0.07, cost +$1.20" line
|
|
366
|
+
* in PR review for autonomous-improvement runs.
|
|
367
|
+
* - Any downstream consumer that needs "what actually changed" without
|
|
368
|
+
* reimplementing the matching + arithmetic.
|
|
369
|
+
*
|
|
370
|
+
* Cells are matched on the natural composite key `(scenarioId, rep)`.
|
|
371
|
+
* Unmatched cells surface as `removed` / `added` so callers can tell
|
|
372
|
+
* "this cell got worse" from "this cell wasn't run."
|
|
373
|
+
*/
|
|
374
|
+
|
|
375
|
+
/**
 * Score change for a single dimension reported by a single judge.
 *
 * A side is `null` when it carries no finite numeric value for the
 * dimension (missing, non-numeric, NaN or infinite).
 */
interface EvalDimensionDelta {
    /** Value on the "before" side, or null when absent / not finite. */
    before: number | null;
    /** Value on the "after" side, or null when absent / not finite. */
    after: number | null;
    /** `after - before`; null as soon as either side is null. */
    delta: number | null;
}
|
|
383
|
+
/**
 * Delta for one cell that exists in both snapshots. Cells are paired on the
 * composite key `(scenarioId, rep)`.
 */
interface EvalCellScoreDelta {
    /** Scenario half of the matching key (taken from the "before" cell). */
    scenarioId: string;
    /** Repetition half of the matching key (taken from the "before" cell). */
    rep: number;
    /** The "before" cell's `compositeMean`. */
    compositeBefore: number;
    /** The "after" cell's `compositeMean`. */
    compositeAfter: number;
    /** `compositeAfter - compositeBefore`. */
    compositeDelta: number;
    /**
     * Nested deltas: outer key is the judge name from
     * `EvalRunCellScore.dimensions`, inner key is the dimension name.
     * Judges / dimensions present on only one side are still listed, with
     * the missing side reported as null.
     */
    dimensions: Record<string, Record<string, EvalDimensionDelta>>;
}
|
|
394
|
+
/**
 * Comparison of two generation snapshots — the unit a dashboard renders for
 * one "v3 vs v4" view.
 *
 * Every `*Delta` field is `after - before`. The aggregate fields mirror the
 * values stored on each snapshot; they are not re-derived from `cells`.
 */
interface EvalGenerationDiff {
    /** `index` of the "before" snapshot. */
    beforeIndex: number;
    /** `index` of the "after" snapshot. */
    afterIndex: number;
    /** `surfaceHash` of the "before" snapshot. */
    beforeSurfaceHash: string;
    /** `surfaceHash` of the "after" snapshot. */
    afterSurfaceHash: string;
    /** True when the two surface hashes differ. */
    surfaceChanged: boolean;
    /** Cells found in both snapshots, paired on `(scenarioId, rep)`. */
    matched: EvalCellScoreDelta[];
    /** Cells that exist in `before` but have no counterpart in `after`. */
    removed: EvalRunCellScore[];
    /** Cells that exist in `after` but have no counterpart in `before`. */
    added: EvalRunCellScore[];
    /** Snapshot-level `compositeMean` of the "before" side. */
    compositeBefore: number;
    /** Snapshot-level `compositeMean` of the "after" side. */
    compositeAfter: number;
    compositeDelta: number;
    /** Snapshot-level `costUsd` of each side and their difference. */
    costUsdBefore: number;
    costUsdAfter: number;
    costUsdDelta: number;
    /** Snapshot-level `durationMs` of each side and their difference. */
    durationMsBefore: number;
    durationMsAfter: number;
    durationMsDelta: number;
}
|
|
419
|
+
/**
 * Comparison of two complete eval-runs: run-level metadata and totals, plus
 * baseline-vs-baseline and winner-vs-winner generation diffs when both runs
 * expose the corresponding snapshot.
 *
 * Every `*Delta` field is `after - before`.
 */
interface EvalRunDiff {
    beforeRunId: string;
    afterRunId: string;
    beforeTimestamp: string;
    afterTimestamp: string;
    /** Gate decision of each run; null when the run did not record one. */
    beforeGateDecision: GateDecision | null;
    afterGateDecision: GateDecision | null;
    /** Holdout lift of each run; null when the run did not record one. */
    beforeHoldoutLift: number | null;
    afterHoldoutLift: number | null;
    /** Null as soon as either run has no holdout lift. */
    holdoutLiftDelta: number | null;
    beforeTotalCostUsd: number;
    afterTotalCostUsd: number;
    totalCostUsdDelta: number;
    beforeTotalDurationMs: number;
    afterTotalDurationMs: number;
    totalDurationMsDelta: number;
    /** Baseline-vs-baseline diff. Null when either run lacks a baseline. */
    baselineDiff: EvalGenerationDiff | null;
    /**
     * Diff between each run's highest-`index` generation. Null when either
     * run has no recorded generations (e.g. baseline-only, or errored
     * before any generation completed).
     */
    winnersDiff: EvalGenerationDiff | null;
}
|
|
445
|
+
/**
|
|
446
|
+
* Diff two generation snapshots. Cells are matched on `(scenarioId, rep)`;
|
|
447
|
+
* unmatched cells surface in `added` / `removed`. Aggregate fields are
|
|
448
|
+
* copied from the snapshot's stored fields, not re-derived from cells —
|
|
449
|
+
* this keeps the diff consistent with whatever aggregation the substrate
|
|
450
|
+
* actually reported.
|
|
451
|
+
*/
|
|
452
|
+
declare function diffGenerations(before: EvalRunGenerationSnapshot, after: EvalRunGenerationSnapshot): EvalGenerationDiff;
|
|
453
|
+
/**
|
|
454
|
+
* Diff two full eval-runs. Produces baseline-vs-baseline and
|
|
455
|
+
* winner-vs-winner generation diffs when both sides expose them, plus
|
|
456
|
+
* run-level cost / lift / gate-decision deltas.
|
|
457
|
+
*/
|
|
458
|
+
declare function diffRuns(before: EvalRunEvent, after: EvalRunEvent): EvalRunDiff;
|
|
459
|
+
/**
|
|
460
|
+
* Within-run baseline → winning-generation diff. The natural "what did the
|
|
461
|
+
* improvement loop produce" view for a single run. Returns null when the
|
|
462
|
+
* run never reached a generation past baseline (errored early, or the gate
|
|
463
|
+
* shipped the baseline as-is), or when the run has no baseline snapshot.
|
|
464
|
+
*/
|
|
465
|
+
declare function diffRunBaselineToWinner(run: EvalRunEvent): EvalGenerationDiff | null;
|
|
466
|
+
|
|
354
467
|
/**
|
|
355
468
|
* `fromAgentTrace` — provenance correlation from Cursor's Agent Trace spec
|
|
356
469
|
* (https://github.com/cursor/agent-trace, RFC v0.1.0).
|
|
@@ -567,4 +680,4 @@ interface FromOtelSpansOptions {
|
|
|
567
680
|
}
|
|
568
681
|
declare function fromOtelSpans(opts: FromOtelSpansOptions): RunRecord[];
|
|
569
682
|
|
|
570
|
-
export { type AgentTraceContributor, type AgentTraceContributorType, type AgentTraceConversation, type AgentTraceFile, type AgentTraceIndex, type AgentTraceRange, type AgentTraceRecord, type AnalyzeRunsOptions, type AuthoringProvenance, CampaignStorage, DispatchContext, type FeedbackTableMeta, type FeedbackTableRow, type FromFeedbackTableOptions, type FromFeedbackTableResult, type FromOtelSpansOptions, Gate, ImprovementDriver, InsightReport, JudgeConfig, MutableSurface, type PartitionByAuthoringModelResult, RunImprovementLoopResult, Scenario, type SelfImproveBudget, type SelfImproveLlm, type SelfImproveOptions, type SelfImproveProgressEvent, type SelfImproveResult, analyzeRuns, fromFeedbackTable, fromOtelSpans, parseAgentTrace, partitionRunsByAuthoringModel, selfImprove };
|
|
683
|
+
export { type AgentTraceContributor, type AgentTraceContributorType, type AgentTraceConversation, type AgentTraceFile, type AgentTraceIndex, type AgentTraceRange, type AgentTraceRecord, type AnalyzeRunsOptions, type AuthoringProvenance, CampaignStorage, DispatchContext, type EvalCellScoreDelta, type EvalDimensionDelta, type EvalGenerationDiff, type EvalRunDiff, type FeedbackTableMeta, type FeedbackTableRow, type FromFeedbackTableOptions, type FromFeedbackTableResult, type FromOtelSpansOptions, Gate, GateDecision, ImprovementDriver, InsightReport, JudgeConfig, MutableSurface, type PartitionByAuthoringModelResult, RunImprovementLoopResult, Scenario, type SelfImproveBudget, type SelfImproveLlm, type SelfImproveOptions, type SelfImproveProgressEvent, type SelfImproveResult, analyzeRuns, diffGenerations, diffRunBaselineToWinner, diffRuns, fromFeedbackTable, fromOtelSpans, parseAgentTrace, partitionRunsByAuthoringModel, selfImprove };
|
package/dist/contract/index.js
CHANGED
|
@@ -9,7 +9,7 @@ import {
|
|
|
9
9
|
paretoPolicy,
|
|
10
10
|
paretoSignificanceGate,
|
|
11
11
|
runEval
|
|
12
|
-
} from "../chunk-
|
|
12
|
+
} from "../chunk-GXHLRXDI.js";
|
|
13
13
|
import {
|
|
14
14
|
checkCanaries
|
|
15
15
|
} from "../chunk-SHTXZ4O2.js";
|
|
@@ -19,7 +19,7 @@ import {
|
|
|
19
19
|
heldOutGate,
|
|
20
20
|
runImprovementLoop,
|
|
21
21
|
surfaceContentHash
|
|
22
|
-
} from "../chunk-
|
|
22
|
+
} from "../chunk-RPLZ4OIB.js";
|
|
23
23
|
import {
|
|
24
24
|
fsCampaignStorage,
|
|
25
25
|
inMemoryCampaignStorage,
|
|
@@ -1087,6 +1087,119 @@ function cellsToRunRecords(cells, candidateId, runId, surface) {
|
|
|
1087
1087
|
});
|
|
1088
1088
|
}
|
|
1089
1089
|
|
|
1090
|
+
// src/contract/diff.ts
|
|
1091
|
+
/**
 * Build the lookup key used to pair cells across two snapshots.
 *
 * The key is the JSON encoding of the `[scenarioId, rep]` pair, so the two
 * fields can never bleed into each other the way a plain string join could.
 *
 * @param {{scenarioId: string, rep: number}} cell - Cell to derive the key from.
 * @returns {string} JSON array string such as `["checkout",0]`.
 */
function keyForCell(cell) {
  const { scenarioId, rep } = cell;
  return JSON.stringify([scenarioId, rep]);
}
|
|
1094
|
+
/**
 * Compute per-judge, per-dimension score deltas between two cells.
 *
 * The result lists the union of judges seen on either side and, per judge,
 * the union of dimensions seen on either side. A value that is missing,
 * non-numeric, NaN or infinite is reported as `null`; `delta` is
 * `after - before` and is `null` whenever either side is `null`.
 *
 * @param {Record<string, Record<string, unknown>> | null | undefined} before
 *   Judge → dimension → score map of the "before" cell. A nullish map is
 *   treated as "no judges reported".
 * @param {Record<string, Record<string, unknown>> | null | undefined} after
 *   Same map for the "after" cell.
 * @returns {Record<string, Record<string, {before: number|null, after: number|null, delta: number|null}>>}
 *   Nested delta map keyed by judge name, then dimension name.
 */
function diffDimensions(before, after) {
  // A cell without a dimensions map must diff as "everything absent" instead
  // of throwing from Object.keys(undefined) and aborting the whole diff.
  const beforeByJudge = before ?? {};
  const afterByJudge = after ?? {};

  // Only finite numbers count as scores; anything else is "no value".
  const toFiniteOrNull = (value) =>
    typeof value === "number" && Number.isFinite(value) ? value : null;

  const out = {};
  const judges = new Set([...Object.keys(beforeByJudge), ...Object.keys(afterByJudge)]);
  for (const judge of judges) {
    const beforeDims = beforeByJudge[judge] ?? {};
    const afterDims = afterByJudge[judge] ?? {};
    const dims = new Set([...Object.keys(beforeDims), ...Object.keys(afterDims)]);
    const judgeOut = {};
    for (const dim of dims) {
      const b = toFiniteOrNull(beforeDims[dim]);
      const a = toFiniteOrNull(afterDims[dim]);
      judgeOut[dim] = {
        before: b,
        after: a,
        delta: b !== null && a !== null ? a - b : null,
      };
    }
    out[judge] = judgeOut;
  }
  return out;
}
|
|
1117
|
+
/**
 * Diff two generation snapshots.
 *
 * Cells are paired through `keyForCell`; a cell present on only one side is
 * reported under `removed` (before only) or `added` (after only). The
 * aggregate composite / cost / duration figures are read straight from the
 * snapshots rather than being re-derived from the cells.
 *
 * @param {object} before - Earlier snapshot (`index`, `surfaceHash`, `cells`,
 *   `compositeMean`, `costUsd`, `durationMs`).
 * @param {object} after - Later snapshot with the same fields.
 * @returns {object} Diff with matched / removed / added cells and the
 *   before, after and delta (`after - before`) aggregates.
 */
function diffGenerations(before, after) {
  const indexCells = (cells) => new Map(cells.map((cell) => [keyForCell(cell), cell]));
  const beforeByKey = indexCells(before.cells);
  const afterByKey = indexCells(after.cells);

  const matched = [];
  const removed = [];
  beforeByKey.forEach((prevCell, key) => {
    const nextCell = afterByKey.get(key);
    if (!nextCell) {
      removed.push(prevCell);
      return;
    }
    const { scenarioId, rep } = prevCell;
    matched.push({
      scenarioId,
      rep,
      compositeBefore: prevCell.compositeMean,
      compositeAfter: nextCell.compositeMean,
      compositeDelta: nextCell.compositeMean - prevCell.compositeMean,
      dimensions: diffDimensions(prevCell.dimensions, nextCell.dimensions),
    });
  });

  // Whatever the "after" side holds that "before" never had is new.
  const added = [...afterByKey]
    .filter(([key]) => !beforeByKey.has(key))
    .map(([, cell]) => cell);

  return {
    beforeIndex: before.index,
    afterIndex: after.index,
    beforeSurfaceHash: before.surfaceHash,
    afterSurfaceHash: after.surfaceHash,
    surfaceChanged: before.surfaceHash !== after.surfaceHash,
    matched,
    removed,
    added,
    compositeBefore: before.compositeMean,
    compositeAfter: after.compositeMean,
    compositeDelta: after.compositeMean - before.compositeMean,
    costUsdBefore: before.costUsd,
    costUsdAfter: after.costUsd,
    costUsdDelta: after.costUsd - before.costUsd,
    durationMsBefore: before.durationMs,
    durationMsAfter: after.durationMs,
    durationMsDelta: after.durationMs - before.durationMs,
  };
}
|
|
1161
|
+
/**
 * Pick the generation with the highest `index` from a run.
 *
 * When several generations share the top index, the one listed first wins
 * because only a strictly greater index replaces the current pick.
 *
 * @param {{generations: Array<{index: number}>}} run - Run to inspect.
 * @returns {object | null} The highest-index generation, or null when the
 *   run recorded no generations.
 */
function winnerOf(run) {
  const { generations } = run;
  if (generations.length === 0) return null;
  return generations.reduce(
    (best, candidate) => (candidate.index > best.index ? candidate : best),
    generations[0],
  );
}
|
|
1169
|
+
/**
 * Diff two full eval-runs.
 *
 * Produces run-level metadata and totals (every delta is `after - before`)
 * together with two optional generation diffs: baseline vs baseline, and
 * highest-index generation vs highest-index generation.
 *
 * @param {object} before - Earlier run event.
 * @param {object} after - Later run event.
 * @returns {object} Run diff; `baselineDiff` is null when either run has no
 *   baseline, `winnersDiff` is null when either run has no generations, and
 *   the gate-decision / holdout-lift fields are null when not recorded.
 */
function diffRuns(before, after) {
  const beforeWinner = winnerOf(before);
  const afterWinner = winnerOf(after);

  let baselineDiff = null;
  if (before.baseline && after.baseline) {
    baselineDiff = diffGenerations(before.baseline, after.baseline);
  }

  let winnersDiff = null;
  if (beforeWinner && afterWinner) {
    winnersDiff = diffGenerations(beforeWinner, afterWinner);
  }

  const beforeLift = before.holdoutLift ?? null;
  const afterLift = after.holdoutLift ?? null;
  // A lift delta only makes sense when both runs measured a holdout lift.
  const liftMissing = beforeLift === null || afterLift === null;
  const holdoutLiftDelta = liftMissing ? null : afterLift - beforeLift;

  return {
    beforeRunId: before.runId,
    afterRunId: after.runId,
    beforeTimestamp: before.timestamp,
    afterTimestamp: after.timestamp,
    beforeGateDecision: before.gateDecision ?? null,
    afterGateDecision: after.gateDecision ?? null,
    beforeHoldoutLift: beforeLift,
    afterHoldoutLift: afterLift,
    holdoutLiftDelta,
    beforeTotalCostUsd: before.totalCostUsd,
    afterTotalCostUsd: after.totalCostUsd,
    totalCostUsdDelta: after.totalCostUsd - before.totalCostUsd,
    beforeTotalDurationMs: before.totalDurationMs,
    afterTotalDurationMs: after.totalDurationMs,
    totalDurationMsDelta: after.totalDurationMs - before.totalDurationMs,
    baselineDiff,
    winnersDiff,
  };
}
|
|
1196
|
+
/**
 * Within-run diff from the baseline snapshot to the highest-index generation.
 *
 * @param {object} run - Run event carrying `baseline` and `generations`.
 * @returns {object | null} The generation diff, or null when the run has no
 *   baseline, has no generations, or its highest-index generation carries
 *   the same `index` as the baseline.
 */
function diffRunBaselineToWinner(run) {
  const { baseline } = run;
  if (!baseline) return null;

  const winner = winnerOf(run);
  if (winner && winner.index !== baseline.index) {
    return diffGenerations(baseline, winner);
  }
  return null;
}
|
|
1202
|
+
|
|
1090
1203
|
// src/contract/intake/agent-trace.ts
|
|
1091
1204
|
function rangeLines(r) {
|
|
1092
1205
|
return Math.max(0, r.end_line - r.start_line + 1);
|
|
@@ -1328,6 +1441,9 @@ export {
|
|
|
1328
1441
|
buildEvidenceVector,
|
|
1329
1442
|
composeGate,
|
|
1330
1443
|
defaultProductionGate,
|
|
1444
|
+
diffGenerations,
|
|
1445
|
+
diffRunBaselineToWinner,
|
|
1446
|
+
diffRuns,
|
|
1331
1447
|
evolutionaryDriver,
|
|
1332
1448
|
fromFeedbackTable,
|
|
1333
1449
|
fromOtelSpans,
|