@tangle-network/agent-eval 0.59.1 → 0.60.0

This diff compares the contents of two publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
Files changed (57)
  1. package/dist/adapters/http.d.ts +1 -1
  2. package/dist/adapters/http.js +1 -1
  3. package/dist/adapters/langchain.d.ts +1 -1
  4. package/dist/adapters/langchain.js +1 -1
  5. package/dist/adapters/otel.d.ts +2 -2
  6. package/dist/adapters/otel.js +1 -1
  7. package/dist/benchmarks/index.js +2 -2
  8. package/dist/builder-eval/index.js +1 -1
  9. package/dist/campaign/index.d.ts +7 -3
  10. package/dist/campaign/index.js +21 -16
  11. package/dist/campaign/index.js.map +1 -1
  12. package/dist/{chunk-MHQPVHXU.js → chunk-6QDKWHLS.js} +2 -2
  13. package/dist/{chunk-N4SBKEPJ.js → chunk-GBHRUAOF.js} +106 -1
  14. package/dist/chunk-GBHRUAOF.js.map +1 -0
  15. package/dist/{chunk-JB4UWIM6.js → chunk-LBSXXH56.js} +265 -14
  16. package/dist/chunk-LBSXXH56.js.map +1 -0
  17. package/dist/{chunk-74Y2EMNH.js → chunk-NOPYCRNG.js} +6 -5
  18. package/dist/{chunk-74Y2EMNH.js.map → chunk-NOPYCRNG.js.map} +1 -1
  19. package/dist/chunk-PZ5AY32C.js +10 -0
  20. package/dist/chunk-SHTXZ4O2.js +113 -0
  21. package/dist/chunk-SHTXZ4O2.js.map +1 -0
  22. package/dist/cli.js +1 -1
  23. package/dist/contract/index.d.ts +42 -10
  24. package/dist/contract/index.js +55 -15
  25. package/dist/contract/index.js.map +1 -1
  26. package/dist/control.js +1 -1
  27. package/dist/governance/index.js +1 -1
  28. package/dist/hosted/index.d.ts +2 -2
  29. package/dist/hosted/index.js +1 -1
  30. package/dist/{index-D2nT6_KT.d.ts → index-BIkvdkSU.d.ts} +1 -1
  31. package/dist/index.js +8 -8
  32. package/dist/knowledge/index.js +1 -1
  33. package/dist/matrix/index.js +1 -1
  34. package/dist/meta-eval/index.js +1 -1
  35. package/dist/multishot/index.js +1 -1
  36. package/dist/openapi.json +1 -1
  37. package/dist/pipelines/index.js +1 -1
  38. package/dist/prm/index.js +1 -1
  39. package/dist/{run-improvement-loop-BhfdjrMY.d.ts → provenance-BM8vmMBa.d.ts} +205 -3
  40. package/dist/reporting.js +1 -1
  41. package/dist/rl.d.ts +1 -1
  42. package/dist/rl.js +1 -1
  43. package/dist/{run-campaign-ZURVWMMI.js → run-campaign-5XENUKRF.js} +3 -3
  44. package/dist/telemetry/file.js +1 -1
  45. package/dist/telemetry/index.js +1 -1
  46. package/dist/traces.js +1 -1
  47. package/dist/{types-BgrxOJSf.d.ts → types-VCIXx_yo.d.ts} +32 -4
  48. package/dist/wire/index.js +1 -1
  49. package/package.json +25 -12
  50. package/dist/chunk-JB4UWIM6.js.map +0 -1
  51. package/dist/chunk-N4SBKEPJ.js.map +0 -1
  52. package/dist/chunk-NSBPE2FW.js +0 -17
  53. package/dist/chunk-ZWEQJIM6.js +0 -220
  54. package/dist/chunk-ZWEQJIM6.js.map +0 -1
  55. package/dist/{chunk-MHQPVHXU.js.map → chunk-6QDKWHLS.js.map} +0 -0
  56. package/dist/{chunk-NSBPE2FW.js.map → chunk-PZ5AY32C.js.map} +0 -0
  57. package/dist/{run-campaign-ZURVWMMI.js.map → run-campaign-5XENUKRF.js.map} +0 -0
@@ -1,13 +1,13 @@
1
- import { S as Scenario, M as MutableSurface, b as DispatchContext, a as JudgeConfig, I as ImprovementDriver, G as Gate } from '../types-BgrxOJSf.js';
2
- export { g as CampaignAggregates, h as CampaignArtifactWriter, i as CampaignCellResult, j as CampaignCostMeter, k as CampaignResult, l as CampaignTraceWriter, C as CodeSurface, D as Dispatch, m as GateContext, n as GateDecision, o as GateResult, p as GenerationCandidate, q as GenerationRecord, s as JudgeDimension, J as JudgeScore, u as Mutator, O as OptimizerConfig, w as SessionScript } from '../types-BgrxOJSf.js';
3
- import { C as CampaignStorage, e as RunImprovementLoopResult } from '../run-improvement-loop-BhfdjrMY.js';
4
- export { D as DefaultProductionGateOptions, E as EvolutionaryDriverOptions, a as GepaDriverOptions, H as HeldOutGateOptions, R as RunCampaignOptions, c as RunEvalOptions, d as RunImprovementLoopOptions, h as composeGate, j as defaultProductionGate, k as evolutionaryDriver, m as fsCampaignStorage, n as gepaDriver, o as heldOutGate, p as inMemoryCampaignStorage, r as runCampaign, s as runEval, t as runImprovementLoop } from '../run-improvement-loop-BhfdjrMY.js';
1
+ import { S as Scenario, M as MutableSurface, b as DispatchContext, a as JudgeConfig, I as ImprovementDriver, G as Gate } from '../types-VCIXx_yo.js';
2
+ export { g as CampaignAggregates, h as CampaignArtifactWriter, i as CampaignCellResult, j as CampaignCostMeter, k as CampaignResult, l as CampaignTraceWriter, C as CodeSurface, D as Dispatch, m as GateContext, n as GateDecision, o as GateResult, p as GenerationCandidate, q as GenerationRecord, s as JudgeDimension, J as JudgeScore, u as Mutator, O as OptimizerConfig, x as SessionScript } from '../types-VCIXx_yo.js';
3
+ import { C as CampaignStorage, e as LoopProvenanceRecord, i as RunImprovementLoopResult } from '../provenance-BM8vmMBa.js';
4
+ export { D as DefaultProductionGateOptions, b as EvolutionaryDriverOptions, c as GepaDriverOptions, H as HeldOutGateOptions, R as RunCampaignOptions, g as RunEvalOptions, h as RunImprovementLoopOptions, m as composeGate, o as defaultProductionGate, r as evolutionaryDriver, t as fsCampaignStorage, u as gepaDriver, v as heldOutGate, w as inMemoryCampaignStorage, F as runCampaign, I as runEval, J as runImprovementLoop } from '../provenance-BM8vmMBa.js';
5
5
  export { D as DeploymentOutcome, F as FileSystemOutcomeStore, b as FileSystemOutcomeStoreOptions, I as InMemoryOutcomeStore, O as OutcomeStore } from '../outcome-store-D6KWmYvj.js';
6
- import { a as HostedTenant, I as InsightReport, T as TraceSpanEvent } from '../index-D2nT6_KT.js';
7
- export { F as FailureClusterInsight, b as InterRaterInsight, J as JudgeInsight, L as LiftInsight, O as OutcomeCorrelationInsight, R as Recommendation, c as ReleaseSummary, S as ScalarDistribution } from '../index-D2nT6_KT.js';
6
+ import { a as HostedTenant, I as InsightReport, T as TraceSpanEvent } from '../index-BIkvdkSU.js';
7
+ export { F as FailureClusterInsight, b as InterRaterInsight, J as JudgeInsight, L as LiftInsight, O as OutcomeCorrelationInsight, R as Recommendation, c as ReleaseSummary, S as ScalarDistribution } from '../index-BIkvdkSU.js';
8
+ import { R as RunRecord, a as RunSplitTag } from '../run-record-etiCMsUq.js';
8
9
  import { A as AnalystRegistry } from '../registry-DK9kqXvb.js';
9
10
  import { a as DatasetScenario } from '../dataset-BlwAtYYf.js';
10
- import { R as RunRecord, a as RunSplitTag } from '../run-record-etiCMsUq.js';
11
11
  import '../llm-client-BXVRUZyX.js';
12
12
  import '../errors-mje_cKOs.js';
13
13
  import '../raw-provider-sink-C46HDghv.js';
@@ -131,12 +131,30 @@ interface SelfImproveOptions<TScenario extends Scenario, TArtifact> {
131
131
  /** LLM config consumed by the default `gepaDriver`. Ignored if you pass
132
132
  * your own `driver`. */
133
133
  llm?: SelfImproveLlm;
134
- /** Storage backend. Default `inMemoryCampaignStorage()` — nothing
135
- * persists past the call. Pass `fsCampaignStorage()` to write to disk. */
134
+ /** Storage backend. Default is DURABLE: when a real (non-`mem://`) `runDir`
135
+ * is available, the substrate defaults to `fsCampaignStorage()` so the
136
+ * provenance record + OTel spans survive the call. Pass
137
+ * `inMemoryCampaignStorage()` explicitly to opt OUT (tests, edge runtimes).
138
+ * Default when `runDir` is `mem://...` (or unset): in-memory. */
136
139
  storage?: CampaignStorage;
137
140
  /** Run directory (logical for in-memory storage, real path for fs).
138
- * Default `mem://selfImprove-<timestamp>`. */
141
+ * Default `mem://selfImprove-<timestamp>` (in-memory, non-durable). Pass a
142
+ * real path to persist the provenance record + spans. */
139
143
  runDir?: string;
144
+ /**
145
+ * Worker call records for backend provenance. The agent is opaque to the
146
+ * substrate (it returns an artifact, not token usage), so to capture an
147
+ * `assertRealBackend`-grade verdict + worker call count + model in the
148
+ * provenance record, the agent reports its per-call `RunRecord`s here.
149
+ * Called once after the loop; return the records the agent accumulated.
150
+ * When unset, backend provenance is derived from campaign cells (cost only;
151
+ * verdict will read `stub` without token usage — the honest signal that no
152
+ * token channel was wired).
153
+ */
154
+ collectWorkerRecords?: () => RunRecord[];
155
+ /** Fires once the durable provenance record + OTel spans are emitted.
156
+ * Receives the structured record for inline assertions / custom routing. */
157
+ onProvenance?: (record: LoopProvenanceRecord) => void;
140
158
  /** Distributed-driver seam — same as `RunCampaignOptions.cellPlacement`.
141
159
  * Returns an opaque placement key the substrate forwards to your agent
142
160
  * as `ctx.placement`. Combined with `httpDispatch` from
@@ -184,10 +202,24 @@ interface SelfImproveResult<TScenario extends Scenario, TArtifact> {
184
202
  compositeMean: number;
185
203
  perScenario: Record<string, number>;
186
204
  surface: MutableSurface;
205
+ /** Driver label for the promoted change. Absent ⇒ winner == baseline or
206
+ * a bare-surface mutator. */
207
+ label?: string;
208
+ /** Driver rationale — the "because Z" that motivated the promoted change.
209
+ * Threaded from the driver's `ProposedCandidate` through the loop.
210
+ * Absent ⇒ winner == baseline. */
211
+ rationale?: string;
187
212
  };
188
213
  /** `winner.compositeMean - baselineOnHoldout.compositeMean`. Positive
189
214
  * means the gate observed improvement. */
190
215
  lift: number;
216
+ /** The explicit baseline→winner unified diff. Always present (empty string
217
+ * when winner == baseline). */
218
+ diff: string;
219
+ /** Durable, queryable provenance record: candidate→cell→gate→promote chain +
220
+ * rationale + diff + backend provenance. The artifact the hosted ingest
221
+ * path stores; the +lift RECOMPUTES from `record.heldOutLift`. */
222
+ provenance: LoopProvenanceRecord;
191
223
  /** `defaultProductionGate.decide()` result. */
192
224
  gateDecision: 'ship' | 'hold' | 'need_more_work' | 'model_ceiling' | 'arch_ceiling';
193
225
  /** Number of generations actually explored (may be less than the
@@ -1,25 +1,28 @@
1
1
  import {
2
2
  composeGate,
3
3
  defaultProductionGate,
4
+ emitLoopProvenance,
4
5
  evolutionaryDriver,
5
6
  gepaDriver,
6
7
  heldOutGate,
7
8
  runEval,
8
- runImprovementLoop
9
- } from "../chunk-JB4UWIM6.js";
9
+ runImprovementLoop,
10
+ surfaceContentHash
11
+ } from "../chunk-LBSXXH56.js";
10
12
  import {
11
13
  fsCampaignStorage,
12
14
  inMemoryCampaignStorage,
13
15
  runCampaign
14
- } from "../chunk-74Y2EMNH.js";
16
+ } from "../chunk-NOPYCRNG.js";
15
17
  import {
16
18
  createHostedClient
17
19
  } from "../chunk-FQK2CCIM.js";
18
20
  import {
19
- checkCanaries,
21
+ checkCanaries
22
+ } from "../chunk-SHTXZ4O2.js";
23
+ import {
20
24
  summarizeBackendIntegrity
21
- } from "../chunk-ZWEQJIM6.js";
22
- import "../chunk-N4SBKEPJ.js";
25
+ } from "../chunk-GBHRUAOF.js";
23
26
  import "../chunk-YV7J7X5N.js";
24
27
  import {
25
28
  FileSystemOutcomeStore,
@@ -42,7 +45,10 @@ import "../chunk-VSMTAMNK.js";
42
45
  import "../chunk-VXNVVBZO.js";
43
46
  import "../chunk-PC4UYEBM.js";
44
47
  import "../chunk-QYJT52YW.js";
45
- import "../chunk-NSBPE2FW.js";
48
+ import "../chunk-PZ5AY32C.js";
49
+
50
+ // src/contract/self-improve.ts
51
+ import { createHash } from "crypto";
46
52
 
47
53
  // src/contract/analyze-runs.ts
48
54
  async function analyzeRuns(opts) {
@@ -848,8 +854,9 @@ async function selfImprove(opts) {
848
854
  holdoutScenarios: holdout,
849
855
  deltaThreshold: 0.05
850
856
  });
851
- const storage = opts.storage ?? inMemoryCampaignStorage();
852
857
  const runDir = opts.runDir ?? `mem://selfImprove-${startedAt}`;
858
+ const isMemRunDir = runDir.startsWith("mem://");
859
+ const storage = opts.storage ?? (isMemRunDir ? inMemoryCampaignStorage() : fsCampaignStorage());
853
860
  if (opts.onProgress) {
854
861
  opts.onProgress({ kind: "baseline.started", scenarios: opts.scenarios.length });
855
862
  }
@@ -892,22 +899,53 @@ async function selfImprove(opts) {
892
899
  );
893
900
  const insight = await analyzeRuns({
894
901
  runs: [
895
- ...cellsToRunRecords(result.baselineCampaign.cells, "baseline", runDir),
896
- ...cellsToRunRecords(result.winnerOnHoldout.cells, "winner", runDir)
902
+ ...cellsToRunRecords(result.baselineCampaign.cells, "baseline", runDir, opts.baselineSurface),
903
+ ...cellsToRunRecords(result.winnerOnHoldout.cells, "winner", runDir, result.winnerSurface)
897
904
  ],
898
905
  baselineCandidateId: "baseline",
899
906
  candidateCandidateId: "winner"
900
907
  });
908
+ const durationMs = Date.now() - startedAt;
909
+ const workerRecords = opts.collectWorkerRecords?.() ?? cellsToRunRecords(result.winnerOnHoldout.cells, "winner", runDir, result.winnerSurface);
910
+ const { record: provenance } = await emitLoopProvenance({
911
+ runId: `${runDir}#${startedAt}`,
912
+ runDir,
913
+ timestamp: new Date(startedAt).toISOString(),
914
+ baselineSurface: opts.baselineSurface,
915
+ winnerSurface: result.winnerSurface,
916
+ winnerLabel: result.winnerLabel,
917
+ winnerRationale: result.winnerRationale,
918
+ diff: result.promotedDiff,
919
+ generations: result.generations.map((g) => ({
920
+ generationIndex: g.record.generationIndex,
921
+ candidates: g.record.candidates,
922
+ promoted: g.record.promoted,
923
+ surfaces: g.surfaces.map((s) => ({ surfaceHash: s.surfaceHash, surface: s.surface }))
924
+ })),
925
+ gate: result.gateResult,
926
+ baselineOnHoldout: result.baselineOnHoldout,
927
+ winnerOnHoldout: result.winnerOnHoldout,
928
+ workerRecords,
929
+ totalCostUsd: totalCost,
930
+ totalDurationMs: durationMs,
931
+ storage,
932
+ hostedClient: opts.hostedTenant ? createHostedClient(opts.hostedTenant) : void 0
933
+ });
934
+ if (opts.onProvenance) opts.onProvenance(provenance);
901
935
  const summary = {
902
936
  baseline,
903
937
  winner: {
904
938
  ...winnerStats,
905
- surface: result.winnerSurface
939
+ surface: result.winnerSurface,
940
+ ...result.winnerLabel ? { label: result.winnerLabel } : {},
941
+ ...result.winnerRationale ? { rationale: result.winnerRationale } : {}
906
942
  },
907
943
  lift: winnerStats.compositeMean - baseline.compositeMean,
944
+ diff: result.promotedDiff,
945
+ provenance,
908
946
  gateDecision: result.gateResult.decision,
909
947
  generationsExplored: result.generations.length,
910
- durationMs: Date.now() - startedAt,
948
+ durationMs,
911
949
  totalCostUsd: totalCost,
912
950
  insight,
913
951
  raw: result
@@ -989,7 +1027,9 @@ function hashString(s) {
989
1027
  }
990
1028
  return h.toString(16).padStart(8, "0");
991
1029
  }
992
- function cellsToRunRecords(cells, candidateId, runId) {
1030
+ function cellsToRunRecords(cells, candidateId, runId, surface) {
1031
+ const promptHash = surfaceContentHash(surface);
1032
+ const configHash = `sha256:${createHash("sha256").update(candidateId).digest("hex")}`;
993
1033
  return cells.map((cell) => {
994
1034
  const perJudge = {};
995
1035
  const perDimMeanAccum = {};
@@ -1027,8 +1067,8 @@ function cellsToRunRecords(cells, candidateId, runId) {
1027
1067
  // Synthesize a stable seed for that pairing.
1028
1068
  seed: cell.rep * 1e6 + hashString(cell.scenarioId).slice(0, 6).split("").reduce((a, c) => a * 31 + c.charCodeAt(0) >>> 0, 0),
1029
1069
  model: "campaign-cell",
1030
- promptHash: "sha256:cell",
1031
- configHash: "sha256:cell",
1070
+ promptHash,
1071
+ configHash,
1032
1072
  commitSha: "cell",
1033
1073
  wallMs: cell.durationMs,
1034
1074
  costUsd: cell.costUsd,