@tangle-network/agent-eval 0.59.1 → 0.60.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57)
  1. package/dist/adapters/http.d.ts +1 -1
  2. package/dist/adapters/http.js +1 -1
  3. package/dist/adapters/langchain.d.ts +1 -1
  4. package/dist/adapters/langchain.js +1 -1
  5. package/dist/adapters/otel.d.ts +2 -2
  6. package/dist/adapters/otel.js +1 -1
  7. package/dist/benchmarks/index.js +2 -2
  8. package/dist/builder-eval/index.js +1 -1
  9. package/dist/campaign/index.d.ts +7 -3
  10. package/dist/campaign/index.js +21 -16
  11. package/dist/campaign/index.js.map +1 -1
  12. package/dist/{chunk-MHQPVHXU.js → chunk-6QDKWHLS.js} +2 -2
  13. package/dist/{chunk-N4SBKEPJ.js → chunk-GBHRUAOF.js} +106 -1
  14. package/dist/chunk-GBHRUAOF.js.map +1 -0
  15. package/dist/{chunk-JB4UWIM6.js → chunk-LBSXXH56.js} +265 -14
  16. package/dist/chunk-LBSXXH56.js.map +1 -0
  17. package/dist/{chunk-74Y2EMNH.js → chunk-NOPYCRNG.js} +6 -5
  18. package/dist/{chunk-74Y2EMNH.js.map → chunk-NOPYCRNG.js.map} +1 -1
  19. package/dist/chunk-PZ5AY32C.js +10 -0
  20. package/dist/chunk-SHTXZ4O2.js +113 -0
  21. package/dist/chunk-SHTXZ4O2.js.map +1 -0
  22. package/dist/cli.js +1 -1
  23. package/dist/contract/index.d.ts +42 -10
  24. package/dist/contract/index.js +55 -15
  25. package/dist/contract/index.js.map +1 -1
  26. package/dist/control.js +1 -1
  27. package/dist/governance/index.js +1 -1
  28. package/dist/hosted/index.d.ts +2 -2
  29. package/dist/hosted/index.js +1 -1
  30. package/dist/{index-D2nT6_KT.d.ts → index-BIkvdkSU.d.ts} +1 -1
  31. package/dist/index.js +8 -8
  32. package/dist/knowledge/index.js +1 -1
  33. package/dist/matrix/index.js +1 -1
  34. package/dist/meta-eval/index.js +1 -1
  35. package/dist/multishot/index.js +1 -1
  36. package/dist/openapi.json +1 -1
  37. package/dist/pipelines/index.js +1 -1
  38. package/dist/prm/index.js +1 -1
  39. package/dist/{run-improvement-loop-BhfdjrMY.d.ts → provenance-BM8vmMBa.d.ts} +205 -3
  40. package/dist/reporting.js +1 -1
  41. package/dist/rl.d.ts +1 -1
  42. package/dist/rl.js +1 -1
  43. package/dist/{run-campaign-ZURVWMMI.js → run-campaign-5XENUKRF.js} +3 -3
  44. package/dist/telemetry/file.js +1 -1
  45. package/dist/telemetry/index.js +1 -1
  46. package/dist/traces.js +1 -1
  47. package/dist/{types-BgrxOJSf.d.ts → types-VCIXx_yo.d.ts} +32 -4
  48. package/dist/wire/index.js +1 -1
  49. package/package.json +25 -12
  50. package/dist/chunk-JB4UWIM6.js.map +0 -1
  51. package/dist/chunk-N4SBKEPJ.js.map +0 -1
  52. package/dist/chunk-NSBPE2FW.js +0 -17
  53. package/dist/chunk-ZWEQJIM6.js +0 -220
  54. package/dist/chunk-ZWEQJIM6.js.map +0 -1
  55. package/dist/{chunk-MHQPVHXU.js.map → chunk-6QDKWHLS.js.map} +0 -0
  56. package/dist/{chunk-NSBPE2FW.js.map → chunk-PZ5AY32C.js.map} +0 -0
  57. package/dist/{run-campaign-ZURVWMMI.js.map → run-campaign-5XENUKRF.js.map} +0 -0
package/dist/control.js CHANGED
@@ -17,7 +17,7 @@ import "./chunk-NCK5QLGT.js";
17
17
  import "./chunk-TVVP3ZZQ.js";
18
18
  import "./chunk-VSMTAMNK.js";
19
19
  import "./chunk-QYJT52YW.js";
20
- import "./chunk-NSBPE2FW.js";
20
+ import "./chunk-PZ5AY32C.js";
21
21
  export {
22
22
  allCriticalPassed,
23
23
  controlRunToRunRecord,
@@ -6,7 +6,7 @@ import {
6
6
  soc2Report,
7
7
  summarize
8
8
  } from "../chunk-KKHDIONI.js";
9
- import "../chunk-NSBPE2FW.js";
9
+ import "../chunk-PZ5AY32C.js";
10
10
  export {
11
11
  classifyEuAiRisk,
12
12
  euAiActReport,
@@ -1,5 +1,5 @@
1
- export { E as EvalRunCellScore, d as EvalRunEvent, e as EvalRunGenerationSnapshot, f as EvalRunStatus, g as HOSTED_WIRE_VERSION, H as HostedClient, h as HostedIngestHeaders, a as HostedTenant, i as HostedWireVersion, j as IngestEvalRunsRequest, k as IngestResponse, l as IngestTracesRequest, T as TraceSpanEvent, m as createHostedClient } from '../index-D2nT6_KT.js';
2
- import '../types-BgrxOJSf.js';
1
+ export { E as EvalRunCellScore, d as EvalRunEvent, e as EvalRunGenerationSnapshot, f as EvalRunStatus, g as HOSTED_WIRE_VERSION, H as HostedClient, h as HostedIngestHeaders, a as HostedTenant, i as HostedWireVersion, j as IngestEvalRunsRequest, k as IngestResponse, l as IngestTracesRequest, T as TraceSpanEvent, m as createHostedClient } from '../index-BIkvdkSU.js';
2
+ import '../types-VCIXx_yo.js';
3
3
  import '../summary-report-DLxh4yWk.js';
4
4
  import '../run-record-etiCMsUq.js';
5
5
  import '../errors-mje_cKOs.js';
@@ -2,7 +2,7 @@ import {
2
2
  HOSTED_WIRE_VERSION,
3
3
  createHostedClient
4
4
  } from "../chunk-FQK2CCIM.js";
5
- import "../chunk-NSBPE2FW.js";
5
+ import "../chunk-PZ5AY32C.js";
6
6
  export {
7
7
  HOSTED_WIRE_VERSION,
8
8
  createHostedClient
@@ -1,4 +1,4 @@
1
- import { M as MutableSurface, n as GateDecision } from './types-BgrxOJSf.js';
1
+ import { M as MutableSurface, n as GateDecision } from './types-VCIXx_yo.js';
2
2
  import { G as GainDistributionBin, P as ParetoFigureSpec } from './summary-report-DLxh4yWk.js';
3
3
  import { a as ContinuousAgreement } from './judge-calibration-DilmB3Ml.js';
4
4
 
package/dist/index.js CHANGED
@@ -1,18 +1,17 @@
1
1
  import {
2
- BackendIntegrityError,
3
2
  HoldoutAuditor,
4
- assertRealBackend,
5
3
  canaryLeakView,
6
4
  checkBehavioralCanary,
7
5
  checkCanaries,
8
- runBehavioralCanaries,
9
- summarizeBackendIntegrity
10
- } from "./chunk-ZWEQJIM6.js";
6
+ runBehavioralCanaries
7
+ } from "./chunk-SHTXZ4O2.js";
11
8
  import {
9
+ BackendIntegrityError,
12
10
  DEFAULT_MUTATION_PRIMITIVES,
13
11
  DEFAULT_RED_TEAM_CORPUS,
14
12
  Dataset,
15
13
  HoldoutLockedError,
14
+ assertRealBackend,
16
15
  buildReflectionPrompt,
17
16
  hashScenarios,
18
17
  parseReflectionResponse,
@@ -20,13 +19,14 @@ import {
20
19
  redTeamReport,
21
20
  runCanaries,
22
21
  scoreRedTeamOutput,
22
+ summarizeBackendIntegrity,
23
23
  toolNamesForRun
24
- } from "./chunk-N4SBKEPJ.js";
24
+ } from "./chunk-GBHRUAOF.js";
25
25
  import {
26
26
  BENCHMARK_SPLIT_SEED,
27
27
  benchmarks_exports,
28
28
  deterministicSplit
29
- } from "./chunk-MHQPVHXU.js";
29
+ } from "./chunk-6QDKWHLS.js";
30
30
  import {
31
31
  DEFAULT_RULES,
32
32
  classifyFailure,
@@ -260,7 +260,7 @@ import {
260
260
  ValidationError,
261
261
  VerificationError
262
262
  } from "./chunk-QYJT52YW.js";
263
- import "./chunk-NSBPE2FW.js";
263
+ import "./chunk-PZ5AY32C.js";
264
264
 
265
265
  // src/run-score.ts
266
266
  var DEFAULT_RUN_SCORE_WEIGHTS = {
@@ -7,7 +7,7 @@ import {
7
7
  } from "../chunk-3CKU6VGU.js";
8
8
  import "../chunk-NCRFYPS3.js";
9
9
  import "../chunk-TVVP3ZZQ.js";
10
- import "../chunk-NSBPE2FW.js";
10
+ import "../chunk-PZ5AY32C.js";
11
11
  export {
12
12
  acquisitionPlansForKnowledgeGaps,
13
13
  blockingKnowledgeEval,
@@ -3,7 +3,7 @@ import {
3
3
  runAgentMatrix,
4
4
  summariseRows
5
5
  } from "../chunk-QWV226SL.js";
6
- import "../chunk-NSBPE2FW.js";
6
+ import "../chunk-PZ5AY32C.js";
7
7
  export {
8
8
  buildByAxis,
9
9
  runAgentMatrix,
@@ -10,7 +10,7 @@ import {
10
10
  llmSpans
11
11
  } from "../chunk-47X6LRCE.js";
12
12
  import "../chunk-5BKGXME7.js";
13
- import "../chunk-NSBPE2FW.js";
13
+ import "../chunk-PZ5AY32C.js";
14
14
 
15
15
  // src/meta-eval/calibration.ts
16
16
  async function calibrationCurve(traceStore, outcomeStore, evalMetric, outcomeMetric, options = {}) {
@@ -1,7 +1,7 @@
1
1
  import {
2
2
  runAgentMatrix
3
3
  } from "../chunk-QWV226SL.js";
4
- import "../chunk-NSBPE2FW.js";
4
+ import "../chunk-PZ5AY32C.js";
5
5
 
6
6
  // src/multishot/router.ts
7
7
  async function routerCompletion(req) {
package/dist/openapi.json CHANGED
@@ -2,7 +2,7 @@
2
2
  "openapi": "3.1.0",
3
3
  "info": {
4
4
  "title": "@tangle-network/agent-eval — wire protocol",
5
- "version": "0.59.1",
5
+ "version": "0.60.0",
6
6
  "description": "HTTP and stdio RPC interface to agent-eval. The TypeScript runtime is the source of truth; this spec is the contract that cross-language clients (Python, Rust, Go) generate from.\n\nWire-protocol version: 1.0.0. Bumps on breaking changes to request/response schemas.",
7
7
  "contact": {
8
8
  "name": "Tangle Network",
@@ -19,7 +19,7 @@ import {
19
19
  } from "../chunk-47X6LRCE.js";
20
20
  import "../chunk-5BKGXME7.js";
21
21
  import "../chunk-QYJT52YW.js";
22
- import "../chunk-NSBPE2FW.js";
22
+ import "../chunk-PZ5AY32C.js";
23
23
 
24
24
  // src/pipelines/budget-breach.ts
25
25
  async function budgetBreachView(store, options = {}) {
package/dist/prm/index.js CHANGED
@@ -9,7 +9,7 @@ import "../chunk-5BKGXME7.js";
9
9
  import {
10
10
  TraceEmitter
11
11
  } from "../chunk-TVVP3ZZQ.js";
12
- import "../chunk-NSBPE2FW.js";
12
+ import "../chunk-PZ5AY32C.js";
13
13
 
14
14
  // src/prm/builtin-rubrics.ts
15
15
  function outputLengthRubric(args = {}) {
@@ -1,7 +1,8 @@
1
- import { S as Scenario, k as CampaignResult, o as GateResult, u as Mutator, I as ImprovementDriver, G as Gate, D as DispatchFn, a as JudgeConfig, L as LabeledScenarioStore, l as CampaignTraceWriter, M as MutableSurface, q as GenerationRecord } from './types-BgrxOJSf.js';
1
+ import { S as Scenario, k as CampaignResult, o as GateResult, u as Mutator, I as ImprovementDriver, G as Gate, D as DispatchFn, a as JudgeConfig, L as LabeledScenarioStore, l as CampaignTraceWriter, M as MutableSurface, q as GenerationRecord, n as GateDecision } from './types-VCIXx_yo.js';
2
2
  import { L as LlmClientOptions } from './llm-client-BXVRUZyX.js';
3
3
  import { R as RedTeamCase } from './red-team-CrC5MZYd.js';
4
4
  import { R as RunRecord } from './run-record-etiCMsUq.js';
5
+ import { H as HostedClient, T as TraceSpanEvent } from './index-BIkvdkSU.js';
5
6
 
6
7
  /**
7
8
  * @experimental
@@ -242,7 +243,11 @@ interface CampaignStorage {
242
243
  }
243
244
  /** Node-filesystem storage — the default. Lazily requires `node:fs` so the
244
245
  * module imports cleanly in non-Node runtimes (where the caller passes
245
- * `inMemoryCampaignStorage` instead and never constructs this). */
246
+ * `inMemoryCampaignStorage` instead and never constructs this).
247
+ *
248
+ * `createRequire(import.meta.url)` is the ESM-native lazy require — a bare
249
+ * `require` is a ReferenceError under `"type": "module"`, which is exactly
250
+ * the shape this package publishes. */
246
251
  declare function fsCampaignStorage(): CampaignStorage;
247
252
  /** In-memory storage for filesystem-less runtimes. Artifacts + trace spans
248
253
  * live in a `Map` for the duration of the run; the `CampaignResult` is
@@ -385,6 +390,14 @@ interface RunOptimizationResult<TArtifact, TScenario extends Scenario> {
385
390
  }>;
386
391
  winnerSurface: MutableSurface;
387
392
  winnerSurfaceHash: string;
393
+ /** Driver label for the promoted surface. Present when the winning
394
+ * candidate came from a `ProposedCandidate` (a reflective driver);
395
+ * absent when the winner is the baseline or a bare-surface mutator. */
396
+ winnerLabel?: string;
397
+ /** Driver rationale for the promoted surface — the "because Z" that
398
+ * motivated the winning change. Survives to `SelfImproveResult` and the
399
+ * emitted provenance record. Absent when the winner is the baseline. */
400
+ winnerRationale?: string;
388
401
  baselineCampaign: CampaignResult<TArtifact, TScenario>;
389
402
  }
390
403
  declare function runOptimization<TScenario extends Scenario, TArtifact>(opts: RunOptimizationOptions<TScenario, TArtifact>): Promise<RunOptimizationResult<TArtifact, TScenario>>;
@@ -443,8 +456,197 @@ interface RunImprovementLoopResult<TArtifact, TScenario extends Scenario> extend
443
456
  baselineOnHoldout: CampaignResult<TArtifact, TScenario>;
444
457
  winnerOnHoldout: CampaignResult<TArtifact, TScenario>;
445
458
  gateResult: Awaited<ReturnType<Gate<TArtifact, TScenario>['decide']>>;
459
+ /** Unified baseline→winner surface diff. Computed UNCONDITIONALLY (not only
460
+ * when `autoOnPromote === 'pr'`) so the diff that the gate decided on is
461
+ * always present on the result + in the emitted provenance record. Empty
462
+ * string when winner == baseline (no change to diff). */
463
+ promotedDiff: string;
446
464
  prResult?: ReturnType<typeof openAutoPr>;
447
465
  }
448
466
  declare function runImprovementLoop<TScenario extends Scenario, TArtifact>(opts: RunImprovementLoopOptions<TScenario, TArtifact>): Promise<RunImprovementLoopResult<TArtifact, TScenario>>;
467
+ declare function defaultRenderDiff(winnerSurface: MutableSurface, baselineSurface: MutableSurface): string;
468
+
469
+ /**
470
+ * @experimental
471
+ *
472
+ * Loop provenance — the durable, queryable record of WHAT a self-improvement
473
+ * loop did and WHY, plus the OTel spans that let an OTLP collector pivot from
474
+ * an eval-run to the underlying candidate→cell→gate→promote chain.
475
+ *
476
+ * Two artifacts, one source of truth:
477
+ *
478
+ * 1. `LoopProvenanceRecord` — a structured JSON record capturing every
479
+ * candidate (surfaceHash + label + rationale), its measured composite,
480
+ * the gate decision + reasons + delta, the held-out lift, the explicit
481
+ * baseline→candidate diff, and BACKEND PROVENANCE (the
482
+ * `assertRealBackend` verdict + worker call count + model). This is the
483
+ * ingestable audit artifact: the +lift recomputes from it, the "because
484
+ * Z" rationale survives in it, and a stub backend is detectable from it.
485
+ *
486
+ * 2. `loopProvenanceSpans()` — the same chain emitted as OTLP-ingestable
487
+ * `TraceSpanEvent`s, pivoted on the substrate's standard
488
+ * `tangle.runId` / `tangle.scenarioId` / `tangle.cellId` /
489
+ * `tangle.generation` attributes (the same pivots `/adapters/otel`
490
+ * reads). The hosted `/v1/ingest/traces` endpoint receives the FULL loop,
491
+ * not just the `cost.*` spans `runCampaign` already emits per cell.
492
+ *
493
+ * The record is built from the substrate's own loop result + the per-call
494
+ * `RunRecord`s the worker emitted — no new measurement, no recomputation that
495
+ * could drift from what the gate actually saw.
496
+ */
497
+
498
+ /** Stable sha256 (full hex) of a surface's effective text. Code surfaces hash
499
+ * their worktree+base identity since the content lives in git. Distinct from
500
+ * `surfaceHash` (16-char content fingerprint used as a loop identity key);
501
+ * this is the byte-identical-verifiable content hash the provenance record +
502
+ * `RunRecord.promptHash` carry. */
503
+ declare function surfaceContentHash(surface: MutableSurface): string;
504
+ interface LoopProvenanceCandidate {
505
+ /** Generation index this candidate was proposed in. */
506
+ generation: number;
507
+ /** 16-char loop-identity fingerprint (matches `GenerationCandidate.surfaceHash`). */
508
+ surfaceHash: string;
509
+ /** Full sha256 content hash — byte-identical-verifiable. */
510
+ contentHash: string;
511
+ /** Driver label, when the driver returned a `ProposedCandidate`. */
512
+ label?: string;
513
+ /** Driver rationale — the "because Z". When the driver returned a bare
514
+ * surface (blind mutator) this is absent. */
515
+ rationale?: string;
516
+ /** Mean composite this candidate scored on the search split. */
517
+ composite: number;
518
+ /** Whether this candidate was promoted out of its generation. */
519
+ promoted: boolean;
520
+ }
521
+ interface LoopProvenanceBackend {
522
+ /** `assertRealBackend`-grade verdict over the worker call records. */
523
+ verdict: 'real' | 'mixed' | 'stub';
524
+ /** Number of worker LLM calls captured (the audit's "worker call count"). */
525
+ workerCallCount: number;
526
+ /** Distinct model ids observed across worker calls. */
527
+ models: string[];
528
+ totalInputTokens: number;
529
+ totalOutputTokens: number;
530
+ totalCostUsd: number;
531
+ }
532
+ /**
533
+ * The durable provenance record. Aligns to the hosted `EvalRunEvent` path but
534
+ * ADDS the rationale + the explicit baseline→candidate diff (both omitted from
535
+ * the bare hosted event) + backend provenance.
536
+ */
537
+ interface LoopProvenanceRecord {
538
+ schema: 'tangle.loop-provenance.v1';
539
+ runId: string;
540
+ runDir: string;
541
+ timestamp: string;
542
+ /** Baseline + winner surface content hashes — distinguishable, byte-verifiable. */
543
+ baselineContentHash: string;
544
+ winnerContentHash: string;
545
+ /** Driver label/rationale for the promoted change. Absent ⇒ winner == baseline. */
546
+ winnerLabel?: string;
547
+ winnerRationale?: string;
548
+ /** The explicit baseline→winner unified diff the gate decided on. */
549
+ diff: string;
550
+ /** Every candidate across every generation, each carrying its rationale. */
551
+ candidates: LoopProvenanceCandidate[];
552
+ /** The gate verdict — decision + reasons + contributing gates + delta. */
553
+ gate: {
554
+ decision: GateDecision;
555
+ reasons: string[];
556
+ delta?: number;
557
+ contributingGates: Array<{
558
+ name: string;
559
+ passed: boolean;
560
+ }>;
561
+ };
562
+ /** baseline-on-holdout composite mean. */
563
+ baselineHoldoutComposite: number;
564
+ /** winner-on-holdout composite mean. */
565
+ winnerHoldoutComposite: number;
566
+ /** winnerHoldout - baselineHoldout — RECOMPUTABLE from this record. */
567
+ heldOutLift: number;
568
+ /** Backend provenance: stub-vs-real verdict + worker call count + models. */
569
+ backend: LoopProvenanceBackend;
570
+ totalCostUsd: number;
571
+ totalDurationMs: number;
572
+ }
573
+ interface BuildLoopProvenanceArgs<TArtifact, TScenario extends Scenario> {
574
+ runId: string;
575
+ runDir: string;
576
+ timestamp: string;
577
+ baselineSurface: MutableSurface;
578
+ winnerSurface: MutableSurface;
579
+ winnerLabel?: string;
580
+ winnerRationale?: string;
581
+ diff: string;
582
+ /** Per-generation candidate records straight off the loop result. */
583
+ generations: Array<{
584
+ generationIndex: number;
585
+ candidates: Array<{
586
+ surfaceHash: string;
587
+ composite: number;
588
+ label?: string;
589
+ rationale?: string;
590
+ }>;
591
+ promoted: string[];
592
+ /** Surfaces measured this generation, keyed positionally to candidates so
593
+ * the content hash can be computed from the real surface text. */
594
+ surfaces: Array<{
595
+ surfaceHash: string;
596
+ surface: MutableSurface;
597
+ }>;
598
+ }>;
599
+ gate: GateResult;
600
+ baselineOnHoldout: CampaignResult<TArtifact, TScenario>;
601
+ winnerOnHoldout: CampaignResult<TArtifact, TScenario>;
602
+ /** Worker call records — the source for backend provenance. */
603
+ workerRecords: ReadonlyArray<RunRecord>;
604
+ totalCostUsd: number;
605
+ totalDurationMs: number;
606
+ }
607
+ /** Build the durable provenance record from a completed loop result. */
608
+ declare function buildLoopProvenanceRecord<TArtifact, TScenario extends Scenario>(args: BuildLoopProvenanceArgs<TArtifact, TScenario>): LoopProvenanceRecord;
609
+ /**
610
+ * Build the loop's OTLP-ingestable spans from a provenance record. One root
611
+ * span per loop (`tangle.runId`), one span per generation, one span per
612
+ * candidate (carrying its surfaceHash + label), and one span for the gate
613
+ * decision (carrying reasons + delta + lift). Candidate + gate spans pivot on
614
+ * the same `tangle.runId` / `tangle.generation` attributes `/adapters/otel`
615
+ * reads, so the hosted collector reconstructs the full tree.
616
+ *
617
+ * Times are synthesized monotonically off a single base so the span tree is
618
+ * orderable; the substrate does not retain per-candidate wall-clock starts.
619
+ */
620
+ declare function loopProvenanceSpans(record: LoopProvenanceRecord, opts?: {
621
+ baseTimeMs?: number;
622
+ }): TraceSpanEvent[];
623
+ /** Canonical durable paths under the run dir. */
624
+ declare function provenanceRecordPath(runDir: string): string;
625
+ declare function provenanceSpansPath(runDir: string): string;
626
+ interface EmitLoopProvenanceResult {
627
+ record: LoopProvenanceRecord;
628
+ spans: TraceSpanEvent[];
629
+ /** Absolute paths the record + spans were written to, when storage persists. */
630
+ recordPath: string;
631
+ spansPath: string;
632
+ }
633
+ interface EmitLoopProvenanceArgs<TArtifact, TScenario extends Scenario> extends BuildLoopProvenanceArgs<TArtifact, TScenario> {
634
+ /** Storage the record + spans are written through. */
635
+ storage: CampaignStorage;
636
+ /** When set, the spans are also shipped to the hosted `/v1/ingest/traces`
637
+ * endpoint so the collector receives the full loop, not just `cost.*`. */
638
+ hostedClient?: HostedClient;
639
+ }
640
+ /**
641
+ * Build the provenance record + OTel spans and persist them durably under the
642
+ * run dir (and ship spans to a hosted collector when one is wired). Returns
643
+ * both artifacts so the caller can assert on / re-derive from them.
644
+ *
645
+ * Fail-loud: the durable write throws on storage failure (a swallowed write is
646
+ * exactly the "emitted but lost" failure this closes). The hosted span ship is
647
+ * the one best-effort leg — its failure is logged, not thrown, so an offline
648
+ * collector never fails the loop (the durable artifact is the source of truth).
649
+ */
650
+ declare function emitLoopProvenance<TArtifact, TScenario extends Scenario>(args: EmitLoopProvenanceArgs<TArtifact, TScenario>): Promise<EmitLoopProvenanceResult>;
449
651
 
450
- export { type CampaignStorage as C, type DefaultProductionGateOptions as D, type EvolutionaryDriverOptions as E, type GepaDriverConstraints as G, type HeldOutGateOptions as H, type OpenAutoPrOptions as O, type RunCampaignOptions as R, type GepaDriverOptions as a, type OpenAutoPrResult as b, type RunEvalOptions as c, type RunImprovementLoopOptions as d, type RunImprovementLoopResult as e, type RunOptimizationOptions as f, type RunOptimizationResult as g, composeGate as h, countSentenceEdits as i, defaultProductionGate as j, evolutionaryDriver as k, extractH2Sections as l, fsCampaignStorage as m, gepaDriver as n, heldOutGate as o, inMemoryCampaignStorage as p, openAutoPr as q, runCampaign as r, runEval as s, runImprovementLoop as t, runOptimization as u, surfaceHash as v };
652
+ export { provenanceSpansPath as A, type BuildLoopProvenanceArgs as B, type CampaignStorage as C, type DefaultProductionGateOptions as D, type EmitLoopProvenanceArgs as E, runCampaign as F, type GepaDriverConstraints as G, type HeldOutGateOptions as H, runEval as I, runImprovementLoop as J, runOptimization as K, type LoopProvenanceBackend as L, surfaceContentHash as M, surfaceHash as N, type OpenAutoPrOptions as O, type RunCampaignOptions as R, type EmitLoopProvenanceResult as a, type EvolutionaryDriverOptions as b, type GepaDriverOptions as c, type LoopProvenanceCandidate as d, type LoopProvenanceRecord as e, type OpenAutoPrResult as f, type RunEvalOptions as g, type RunImprovementLoopOptions as h, type RunImprovementLoopResult as i, type RunOptimizationOptions as j, type RunOptimizationResult as k, buildLoopProvenanceRecord as l, composeGate as m, countSentenceEdits as n, defaultProductionGate as o, defaultRenderDiff as p, emitLoopProvenance as q, evolutionaryDriver as r, extractH2Sections as s, fsCampaignStorage as t, gepaDriver as u, heldOutGate as v, inMemoryCampaignStorage as w, loopProvenanceSpans as x, openAutoPr as y, provenanceRecordPath as z };
package/dist/reporting.js CHANGED
@@ -26,7 +26,7 @@ import {
26
26
  } from "./chunk-S3SDD56V.js";
27
27
  import "./chunk-VSMTAMNK.js";
28
28
  import "./chunk-QYJT52YW.js";
29
- import "./chunk-NSBPE2FW.js";
29
+ import "./chunk-PZ5AY32C.js";
30
30
  export {
31
31
  RESEARCH_REPORT_HARD_PAIR_FLOOR,
32
32
  assertReleaseConfidence,
package/dist/rl.d.ts CHANGED
@@ -1,5 +1,5 @@
1
1
  import { R as RunRecord, a as RunSplitTag } from './run-record-etiCMsUq.js';
2
- import { k as CampaignResult } from './types-BgrxOJSf.js';
2
+ import { k as CampaignResult } from './types-VCIXx_yo.js';
3
3
  import { V as VerificationReport, R as Researcher, F as FailureMode, S as SteeringChange, E as ExperimentPlan, a as ExperimentResult, b as EvalCampaignResult, c as EvalCampaignOptions } from './researcher-JP8EvnLv.js';
4
4
  export { r as runEvalCampaign } from './researcher-JP8EvnLv.js';
5
5
  import { S as Span } from './schema-m0gsnbt3.js';
package/dist/rl.js CHANGED
@@ -31,7 +31,7 @@ import "./chunk-PC4UYEBM.js";
31
31
  import {
32
32
  ValidationError
33
33
  } from "./chunk-QYJT52YW.js";
34
- import "./chunk-NSBPE2FW.js";
34
+ import "./chunk-PZ5AY32C.js";
35
35
 
36
36
  // src/rl/compute-curves.ts
37
37
  async function runComputeCurve(opts) {
@@ -1,10 +1,10 @@
1
1
  import {
2
2
  runCampaign
3
- } from "./chunk-74Y2EMNH.js";
3
+ } from "./chunk-NOPYCRNG.js";
4
4
  import "./chunk-S3SDD56V.js";
5
5
  import "./chunk-QYJT52YW.js";
6
- import "./chunk-NSBPE2FW.js";
6
+ import "./chunk-PZ5AY32C.js";
7
7
  export {
8
8
  runCampaign
9
9
  };
10
- //# sourceMappingURL=run-campaign-ZURVWMMI.js.map
10
+ //# sourceMappingURL=run-campaign-5XENUKRF.js.map
@@ -1,4 +1,4 @@
1
- import "../chunk-NSBPE2FW.js";
1
+ import "../chunk-PZ5AY32C.js";
2
2
 
3
3
  // src/telemetry/sink-file.ts
4
4
  import * as fs from "fs";
@@ -1,4 +1,4 @@
1
- import "../chunk-NSBPE2FW.js";
1
+ import "../chunk-PZ5AY32C.js";
2
2
 
3
3
  // src/telemetry/schema.ts
4
4
  var TELEMETRY_SCHEMA_VERSION = 1;
package/dist/traces.js CHANGED
@@ -78,7 +78,7 @@ import {
78
78
  providerFromBaseUrl
79
79
  } from "./chunk-PC4UYEBM.js";
80
80
  import "./chunk-QYJT52YW.js";
81
- import "./chunk-NSBPE2FW.js";
81
+ import "./chunk-PZ5AY32C.js";
82
82
  export {
83
83
  DEFAULT_REDACTION_RULES,
84
84
  DEFAULT_TRACE_ANALYST_BUDGETS,
@@ -118,6 +118,24 @@ interface CodeSurface {
118
118
  * Tier 3 (knowledge) is owned by agent-knowledge and rides its own adapter,
119
119
  * not this type. */
120
120
  type MutableSurface = string | CodeSurface;
121
+ /** @experimental A driver proposal carrying the surface AND the WHY behind
122
+ * it. Reflective drivers (`gepaDriver`) parse a `{label, rationale, payload}`
123
+ * from the model; without this wrapper the loop keeps only `payload` and the
124
+ * rationale that motivated the change is lost — the candidate becomes
125
+ * unattributable. `propose()` may return either bare `MutableSurface`s (cheap
126
+ * blind mutators) or these (reflective drivers); the loop normalizes both. */
127
+ interface ProposedCandidate {
128
+ surface: MutableSurface;
129
+ /** Short human label for the change (≤ 40 chars typical). */
130
+ label: string;
131
+ /** Why this change was proposed — which failure it targets, which
132
+ * primitive it used. Survives to `GenerationCandidate.rationale` and the
133
+ * emitted provenance record. */
134
+ rationale: string;
135
+ }
136
+ /** @experimental Type guard: a proposal carrying its rationale vs a bare
137
+ * surface. The loop branches on this to populate `GenerationCandidate`. */
138
+ declare function isProposedCandidate(value: MutableSurface | ProposedCandidate): value is ProposedCandidate;
121
139
  /** @experimental Stateless surface mutation — given findings + current
122
140
  * surface, return N candidate surfaces. Pure transform, no generation
123
141
  * awareness. Reflective-mutation, `runMultiShotOptimization`, `AxGEPA`
@@ -129,7 +147,7 @@ interface Mutator<TFindings = unknown> {
129
147
  currentSurface: MutableSurface;
130
148
  populationSize: number;
131
149
  signal: AbortSignal;
132
- }): Promise<MutableSurface[]>;
150
+ }): Promise<Array<MutableSurface | ProposedCandidate>>;
133
151
  }
134
152
  /** @experimental Everything a driver's `propose()` may read to plan the next
135
153
  * batch of candidates. The first six fields are always present; the rest are
@@ -169,8 +187,11 @@ interface ProposeContext<TFindings = unknown> {
169
187
  * are driver-agnostic. */
170
188
  interface ImprovementDriver<TFindings = unknown> {
171
189
  kind: string;
172
- /** Plan: propose N candidate surfaces for the next generation. */
173
- propose(ctx: ProposeContext<TFindings>): Promise<MutableSurface[]>;
190
+ /** Plan: propose N candidate surfaces for the next generation. A driver
191
+ * may return bare `MutableSurface`s or `ProposedCandidate`s that carry the
192
+ * `{label, rationale}` motivating the change — the loop threads the
193
+ * rationale into `GenerationCandidate` and the emitted provenance. */
194
+ propose(ctx: ProposeContext<TFindings>): Promise<Array<MutableSurface | ProposedCandidate>>;
174
195
  /** Decide: stop early when the driver judges the search converged or
175
196
  * exhausted. Default (omitted) runs all `maxGenerations`. */
176
197
  decide?(args: {
@@ -368,6 +389,13 @@ interface GenerationCandidate {
368
389
  scenarioId: string;
369
390
  composite: number;
370
391
  }>;
392
+ /** Driver-supplied short label for the change. Present when the driver
393
+ * returned a `ProposedCandidate`; absent for bare-surface mutators. */
394
+ label?: string;
395
+ /** Driver-supplied rationale — WHY this candidate was proposed. The
396
+ * "because rationale Z" the audit requires to survive to the result.
397
+ * Present when the driver returned a `ProposedCandidate`. */
398
+ rationale?: string;
371
399
  }
372
400
  interface CampaignAggregates {
373
401
  byJudge: Record<string, JudgeAggregate>;
@@ -402,4 +430,4 @@ interface CampaignResult<TArtifact = unknown, TScenario extends Scenario = Scena
402
430
  scenarios: Array<Pick<TScenario, 'id' | 'kind'>>;
403
431
  }
404
432
 
405
- export { type CodeSurface as C, type DispatchFn as D, type Gate as G, type ImprovementDriver as I, type JudgeScore as J, type LabeledScenarioStore as L, type MutableSurface as M, type OptimizerConfig as O, type ProposeContext as P, type RedactionStatus as R, type Scenario as S, type TraceSpan as T, type JudgeConfig as a, type DispatchContext as b, type LabeledScenarioWrite as c, type LabeledScenarioSampleArgs as d, type LabeledScenarioRecord as e, type LabelTrust as f, type CampaignAggregates as g, type CampaignArtifactWriter as h, type CampaignCellResult as i, type CampaignCostMeter as j, type CampaignResult as k, type CampaignTraceWriter as l, type GateContext as m, type GateDecision as n, type GateResult as o, type GenerationCandidate as p, type GenerationRecord as q, type JudgeAggregate as r, type JudgeDimension as s, type LabeledScenarioSource as t, type Mutator as u, type ScenarioAggregate as v, type SessionScript as w, labelTrustRank as x };
433
+ export { type CodeSurface as C, type DispatchFn as D, type Gate as G, type ImprovementDriver as I, type JudgeScore as J, type LabeledScenarioStore as L, type MutableSurface as M, type OptimizerConfig as O, type ProposeContext as P, type RedactionStatus as R, type Scenario as S, type TraceSpan as T, type JudgeConfig as a, type DispatchContext as b, type LabeledScenarioWrite as c, type LabeledScenarioSampleArgs as d, type LabeledScenarioRecord as e, type LabelTrust as f, type CampaignAggregates as g, type CampaignArtifactWriter as h, type CampaignCellResult as i, type CampaignCostMeter as j, type CampaignResult as k, type CampaignTraceWriter as l, type GateContext as m, type GateDecision as n, type GateResult as o, type GenerationCandidate as p, type GenerationRecord as q, type JudgeAggregate as r, type JudgeDimension as s, type LabeledScenarioSource as t, type Mutator as u, type ProposedCandidate as v, type ScenarioAggregate as w, type SessionScript as x, isProposedCandidate as y, labelTrustRank as z };
@@ -38,7 +38,7 @@ import {
38
38
  import "../chunk-VXNVVBZO.js";
39
39
  import "../chunk-PC4UYEBM.js";
40
40
  import "../chunk-QYJT52YW.js";
41
- import "../chunk-NSBPE2FW.js";
41
+ import "../chunk-PZ5AY32C.js";
42
42
  export {
43
43
  BUILTIN_RUBRICS,
44
44
  ErrorResponseSchema,
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@tangle-network/agent-eval",
3
- "version": "0.59.1",
3
+ "version": "0.60.0",
4
4
  "description": "Substrate for self-improving agents: traces, verifiable rewards, preferences, GEPA / reflective mutation, auto-research, replay, sequential anytime-valid stats, and release gates.",
5
5
  "homepage": "https://github.com/tangle-network/agent-eval#readme",
6
6
  "repository": {
@@ -144,6 +144,18 @@
144
144
  "publishConfig": {
145
145
  "access": "public"
146
146
  },
147
+ "scripts": {
148
+ "build": "tsup && pnpm openapi",
149
+ "dev": "tsup --watch",
150
+ "prepare": "husky",
151
+ "prepublishOnly": "pnpm build",
152
+ "test": "vitest run",
153
+ "test:watch": "vitest",
154
+ "typecheck": "tsc --noEmit",
155
+ "lint": "biome check src",
156
+ "format": "biome format --write src",
157
+ "openapi": "node dist/cli.js openapi --out dist/openapi.json"
158
+ },
147
159
  "dependencies": {
148
160
  "@asteasolutions/zod-to-openapi": "^8.5.0",
149
161
  "@ax-llm/ax": "^19.0.25",
@@ -171,6 +183,16 @@
171
183
  "typescript": "^5.7.0",
172
184
  "vitest": "^3.0.0"
173
185
  },
186
+ "pnpm": {
187
+ "minimumReleaseAge": 4320,
188
+ "minimumReleaseAgeExclude": [
189
+ "@tangle-network/sandbox"
190
+ ],
191
+ "overrides": {
192
+ "postcss@<8.5.10": "^8.5.10",
193
+ "ws@>=8.0.0 <8.20.1": "^8.20.1"
194
+ }
195
+ },
174
196
  "engines": {
175
197
  "node": ">=20"
176
198
  },
@@ -183,14 +205,5 @@
183
205
  ]
184
206
  },
185
207
  "license": "MIT",
186
- "scripts": {
187
- "build": "tsup && pnpm openapi",
188
- "dev": "tsup --watch",
189
- "test": "vitest run",
190
- "test:watch": "vitest",
191
- "typecheck": "tsc --noEmit",
192
- "lint": "biome check src",
193
- "format": "biome format --write src",
194
- "openapi": "node dist/cli.js openapi --out dist/openapi.json"
195
- }
196
- }
208
+ "packageManager": "pnpm@10.22.0"
209
+ }