@tangle-network/agent-eval 0.59.1 → 0.60.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/adapters/http.d.ts +1 -1
- package/dist/adapters/http.js +1 -1
- package/dist/adapters/langchain.d.ts +1 -1
- package/dist/adapters/langchain.js +1 -1
- package/dist/adapters/otel.d.ts +2 -2
- package/dist/adapters/otel.js +1 -1
- package/dist/benchmarks/index.js +2 -2
- package/dist/builder-eval/index.js +1 -1
- package/dist/campaign/index.d.ts +7 -3
- package/dist/campaign/index.js +21 -16
- package/dist/campaign/index.js.map +1 -1
- package/dist/{chunk-MHQPVHXU.js → chunk-6QDKWHLS.js} +2 -2
- package/dist/{chunk-N4SBKEPJ.js → chunk-GBHRUAOF.js} +106 -1
- package/dist/chunk-GBHRUAOF.js.map +1 -0
- package/dist/{chunk-JB4UWIM6.js → chunk-LBSXXH56.js} +265 -14
- package/dist/chunk-LBSXXH56.js.map +1 -0
- package/dist/{chunk-74Y2EMNH.js → chunk-NOPYCRNG.js} +6 -5
- package/dist/{chunk-74Y2EMNH.js.map → chunk-NOPYCRNG.js.map} +1 -1
- package/dist/chunk-PZ5AY32C.js +10 -0
- package/dist/chunk-SHTXZ4O2.js +113 -0
- package/dist/chunk-SHTXZ4O2.js.map +1 -0
- package/dist/cli.js +1 -1
- package/dist/contract/index.d.ts +42 -10
- package/dist/contract/index.js +55 -15
- package/dist/contract/index.js.map +1 -1
- package/dist/control.js +1 -1
- package/dist/governance/index.js +1 -1
- package/dist/hosted/index.d.ts +2 -2
- package/dist/hosted/index.js +1 -1
- package/dist/{index-D2nT6_KT.d.ts → index-BIkvdkSU.d.ts} +1 -1
- package/dist/index.js +8 -8
- package/dist/knowledge/index.js +1 -1
- package/dist/matrix/index.js +1 -1
- package/dist/meta-eval/index.js +1 -1
- package/dist/multishot/index.js +1 -1
- package/dist/openapi.json +1 -1
- package/dist/pipelines/index.js +1 -1
- package/dist/prm/index.js +1 -1
- package/dist/{run-improvement-loop-BhfdjrMY.d.ts → provenance-BM8vmMBa.d.ts} +205 -3
- package/dist/reporting.js +1 -1
- package/dist/rl.d.ts +1 -1
- package/dist/rl.js +1 -1
- package/dist/{run-campaign-ZURVWMMI.js → run-campaign-5XENUKRF.js} +3 -3
- package/dist/telemetry/file.js +1 -1
- package/dist/telemetry/index.js +1 -1
- package/dist/traces.js +1 -1
- package/dist/{types-BgrxOJSf.d.ts → types-VCIXx_yo.d.ts} +32 -4
- package/dist/wire/index.js +1 -1
- package/package.json +25 -12
- package/dist/chunk-JB4UWIM6.js.map +0 -1
- package/dist/chunk-N4SBKEPJ.js.map +0 -1
- package/dist/chunk-NSBPE2FW.js +0 -17
- package/dist/chunk-ZWEQJIM6.js +0 -220
- package/dist/chunk-ZWEQJIM6.js.map +0 -1
- /package/dist/{chunk-MHQPVHXU.js.map → chunk-6QDKWHLS.js.map} +0 -0
- /package/dist/{chunk-NSBPE2FW.js.map → chunk-PZ5AY32C.js.map} +0 -0
- /package/dist/{run-campaign-ZURVWMMI.js.map → run-campaign-5XENUKRF.js.map} +0 -0
package/dist/contract/index.d.ts
CHANGED
|
@@ -1,13 +1,13 @@
|
|
|
1
|
-
import { S as Scenario, M as MutableSurface, b as DispatchContext, a as JudgeConfig, I as ImprovementDriver, G as Gate } from '../types-BgrxOJSf.js';
|
|
2
|
-
export { g as CampaignAggregates, h as CampaignArtifactWriter, i as CampaignCellResult, j as CampaignCostMeter, k as CampaignResult, l as CampaignTraceWriter, C as CodeSurface, D as Dispatch, m as GateContext, n as GateDecision, o as GateResult, p as GenerationCandidate, q as GenerationRecord, s as JudgeDimension, J as JudgeScore, u as Mutator, O as OptimizerConfig,
|
|
3
|
-
import { C as CampaignStorage, e as RunImprovementLoopResult } from '../run-improvement-loop-BhfdjrMY.js';
|
|
4
|
-
export { D as DefaultProductionGateOptions,
|
|
1
|
+
import { S as Scenario, M as MutableSurface, b as DispatchContext, a as JudgeConfig, I as ImprovementDriver, G as Gate } from '../types-VCIXx_yo.js';
|
|
2
|
+
export { g as CampaignAggregates, h as CampaignArtifactWriter, i as CampaignCellResult, j as CampaignCostMeter, k as CampaignResult, l as CampaignTraceWriter, C as CodeSurface, D as Dispatch, m as GateContext, n as GateDecision, o as GateResult, p as GenerationCandidate, q as GenerationRecord, s as JudgeDimension, J as JudgeScore, u as Mutator, O as OptimizerConfig, x as SessionScript } from '../types-VCIXx_yo.js';
|
|
3
|
+
import { C as CampaignStorage, e as LoopProvenanceRecord, i as RunImprovementLoopResult } from '../provenance-BM8vmMBa.js';
|
|
4
|
+
export { D as DefaultProductionGateOptions, b as EvolutionaryDriverOptions, c as GepaDriverOptions, H as HeldOutGateOptions, R as RunCampaignOptions, g as RunEvalOptions, h as RunImprovementLoopOptions, m as composeGate, o as defaultProductionGate, r as evolutionaryDriver, t as fsCampaignStorage, u as gepaDriver, v as heldOutGate, w as inMemoryCampaignStorage, F as runCampaign, I as runEval, J as runImprovementLoop } from '../provenance-BM8vmMBa.js';
|
|
5
5
|
export { D as DeploymentOutcome, F as FileSystemOutcomeStore, b as FileSystemOutcomeStoreOptions, I as InMemoryOutcomeStore, O as OutcomeStore } from '../outcome-store-D6KWmYvj.js';
|
|
6
|
-
import { a as HostedTenant, I as InsightReport, T as TraceSpanEvent } from '../index-D2nT6_KT.js';
|
|
7
|
-
export { F as FailureClusterInsight, b as InterRaterInsight, J as JudgeInsight, L as LiftInsight, O as OutcomeCorrelationInsight, R as Recommendation, c as ReleaseSummary, S as ScalarDistribution } from '../index-D2nT6_KT.js';
|
|
6
|
+
import { a as HostedTenant, I as InsightReport, T as TraceSpanEvent } from '../index-BIkvdkSU.js';
|
|
7
|
+
export { F as FailureClusterInsight, b as InterRaterInsight, J as JudgeInsight, L as LiftInsight, O as OutcomeCorrelationInsight, R as Recommendation, c as ReleaseSummary, S as ScalarDistribution } from '../index-BIkvdkSU.js';
|
|
8
|
+
import { R as RunRecord, a as RunSplitTag } from '../run-record-etiCMsUq.js';
|
|
8
9
|
import { A as AnalystRegistry } from '../registry-DK9kqXvb.js';
|
|
9
10
|
import { a as DatasetScenario } from '../dataset-BlwAtYYf.js';
|
|
10
|
-
import { R as RunRecord, a as RunSplitTag } from '../run-record-etiCMsUq.js';
|
|
11
11
|
import '../llm-client-BXVRUZyX.js';
|
|
12
12
|
import '../errors-mje_cKOs.js';
|
|
13
13
|
import '../raw-provider-sink-C46HDghv.js';
|
|
@@ -131,12 +131,30 @@ interface SelfImproveOptions<TScenario extends Scenario, TArtifact> {
|
|
|
131
131
|
/** LLM config consumed by the default `gepaDriver`. Ignored if you pass
|
|
132
132
|
* your own `driver`. */
|
|
133
133
|
llm?: SelfImproveLlm;
|
|
134
|
-
/** Storage backend. Default
|
|
135
|
-
*
|
|
134
|
+
/** Storage backend. Default is DURABLE: when a real (non-`mem://`) `runDir`
|
|
135
|
+
* is available, the substrate defaults to `fsCampaignStorage()` so the
|
|
136
|
+
* provenance record + OTel spans survive the call. Pass
|
|
137
|
+
* `inMemoryCampaignStorage()` explicitly to opt OUT (tests, edge runtimes).
|
|
138
|
+
* Default when `runDir` is `mem://...` (or unset): in-memory. */
|
|
136
139
|
storage?: CampaignStorage;
|
|
137
140
|
/** Run directory (logical for in-memory storage, real path for fs).
|
|
138
|
-
* Default `mem://selfImprove-<timestamp
|
|
141
|
+
* Default `mem://selfImprove-<timestamp>` (in-memory, non-durable). Pass a
|
|
142
|
+
* real path to persist the provenance record + spans. */
|
|
139
143
|
runDir?: string;
|
|
144
|
+
/**
|
|
145
|
+
* Worker call records for backend provenance. The agent is opaque to the
|
|
146
|
+
* substrate (it returns an artifact, not token usage), so to capture an
|
|
147
|
+
* `assertRealBackend`-grade verdict + worker call count + model in the
|
|
148
|
+
* provenance record, the agent reports its per-call `RunRecord`s here.
|
|
149
|
+
* Called once after the loop; return the records the agent accumulated.
|
|
150
|
+
* When unset, backend provenance is derived from campaign cells (cost only;
|
|
151
|
+
* verdict will read `stub` without token usage — the honest signal that no
|
|
152
|
+
* token channel was wired).
|
|
153
|
+
*/
|
|
154
|
+
collectWorkerRecords?: () => RunRecord[];
|
|
155
|
+
/** Fires once the durable provenance record + OTel spans are emitted.
|
|
156
|
+
* Receives the structured record for inline assertions / custom routing. */
|
|
157
|
+
onProvenance?: (record: LoopProvenanceRecord) => void;
|
|
140
158
|
/** Distributed-driver seam — same as `RunCampaignOptions.cellPlacement`.
|
|
141
159
|
* Returns an opaque placement key the substrate forwards to your agent
|
|
142
160
|
* as `ctx.placement`. Combined with `httpDispatch` from
|
|
@@ -184,10 +202,24 @@ interface SelfImproveResult<TScenario extends Scenario, TArtifact> {
|
|
|
184
202
|
compositeMean: number;
|
|
185
203
|
perScenario: Record<string, number>;
|
|
186
204
|
surface: MutableSurface;
|
|
205
|
+
/** Driver label for the promoted change. Absent ⇒ winner == baseline or
|
|
206
|
+
* a bare-surface mutator. */
|
|
207
|
+
label?: string;
|
|
208
|
+
/** Driver rationale — the "because Z" that motivated the promoted change.
|
|
209
|
+
* Threaded from the driver's `ProposedCandidate` through the loop.
|
|
210
|
+
* Absent ⇒ winner == baseline. */
|
|
211
|
+
rationale?: string;
|
|
187
212
|
};
|
|
188
213
|
/** `winner.compositeMean - baselineOnHoldout.compositeMean`. Positive
|
|
189
214
|
* means the gate observed improvement. */
|
|
190
215
|
lift: number;
|
|
216
|
+
/** The explicit baseline→winner unified diff. Always present (empty string
|
|
217
|
+
* when winner == baseline). */
|
|
218
|
+
diff: string;
|
|
219
|
+
/** Durable, queryable provenance record: candidate→cell→gate→promote chain +
|
|
220
|
+
* rationale + diff + backend provenance. The artifact the hosted ingest
|
|
221
|
+
* path stores; the +lift RECOMPUTES from `record.heldOutLift`. */
|
|
222
|
+
provenance: LoopProvenanceRecord;
|
|
191
223
|
/** `defaultProductionGate.decide()` result. */
|
|
192
224
|
gateDecision: 'ship' | 'hold' | 'need_more_work' | 'model_ceiling' | 'arch_ceiling';
|
|
193
225
|
/** Number of generations actually explored (may be less than the
|
package/dist/contract/index.js
CHANGED
|
@@ -1,25 +1,28 @@
|
|
|
1
1
|
import {
|
|
2
2
|
composeGate,
|
|
3
3
|
defaultProductionGate,
|
|
4
|
+
emitLoopProvenance,
|
|
4
5
|
evolutionaryDriver,
|
|
5
6
|
gepaDriver,
|
|
6
7
|
heldOutGate,
|
|
7
8
|
runEval,
|
|
8
|
-
runImprovementLoop
|
|
9
|
-
|
|
9
|
+
runImprovementLoop,
|
|
10
|
+
surfaceContentHash
|
|
11
|
+
} from "../chunk-LBSXXH56.js";
|
|
10
12
|
import {
|
|
11
13
|
fsCampaignStorage,
|
|
12
14
|
inMemoryCampaignStorage,
|
|
13
15
|
runCampaign
|
|
14
|
-
} from "../chunk-74Y2EMNH.js";
|
|
16
|
+
} from "../chunk-NOPYCRNG.js";
|
|
15
17
|
import {
|
|
16
18
|
createHostedClient
|
|
17
19
|
} from "../chunk-FQK2CCIM.js";
|
|
18
20
|
import {
|
|
19
|
-
checkCanaries
|
|
21
|
+
checkCanaries
|
|
22
|
+
} from "../chunk-SHTXZ4O2.js";
|
|
23
|
+
import {
|
|
20
24
|
summarizeBackendIntegrity
|
|
21
|
-
} from "../chunk-
|
|
22
|
-
import "../chunk-N4SBKEPJ.js";
|
|
25
|
+
} from "../chunk-GBHRUAOF.js";
|
|
23
26
|
import "../chunk-YV7J7X5N.js";
|
|
24
27
|
import {
|
|
25
28
|
FileSystemOutcomeStore,
|
|
@@ -42,7 +45,10 @@ import "../chunk-VSMTAMNK.js";
|
|
|
42
45
|
import "../chunk-VXNVVBZO.js";
|
|
43
46
|
import "../chunk-PC4UYEBM.js";
|
|
44
47
|
import "../chunk-QYJT52YW.js";
|
|
45
|
-
import "../chunk-NSBPE2FW.js";
|
|
48
|
+
import "../chunk-PZ5AY32C.js";
|
|
49
|
+
|
|
50
|
+
// src/contract/self-improve.ts
|
|
51
|
+
import { createHash } from "crypto";
|
|
46
52
|
|
|
47
53
|
// src/contract/analyze-runs.ts
|
|
48
54
|
async function analyzeRuns(opts) {
|
|
@@ -848,8 +854,9 @@ async function selfImprove(opts) {
|
|
|
848
854
|
holdoutScenarios: holdout,
|
|
849
855
|
deltaThreshold: 0.05
|
|
850
856
|
});
|
|
851
|
-
const storage = opts.storage ?? inMemoryCampaignStorage();
|
|
852
857
|
const runDir = opts.runDir ?? `mem://selfImprove-${startedAt}`;
|
|
858
|
+
const isMemRunDir = runDir.startsWith("mem://");
|
|
859
|
+
const storage = opts.storage ?? (isMemRunDir ? inMemoryCampaignStorage() : fsCampaignStorage());
|
|
853
860
|
if (opts.onProgress) {
|
|
854
861
|
opts.onProgress({ kind: "baseline.started", scenarios: opts.scenarios.length });
|
|
855
862
|
}
|
|
@@ -892,22 +899,53 @@ async function selfImprove(opts) {
|
|
|
892
899
|
);
|
|
893
900
|
const insight = await analyzeRuns({
|
|
894
901
|
runs: [
|
|
895
|
-
...cellsToRunRecords(result.baselineCampaign.cells, "baseline", runDir),
|
|
896
|
-
...cellsToRunRecords(result.winnerOnHoldout.cells, "winner", runDir)
|
|
902
|
+
...cellsToRunRecords(result.baselineCampaign.cells, "baseline", runDir, opts.baselineSurface),
|
|
903
|
+
...cellsToRunRecords(result.winnerOnHoldout.cells, "winner", runDir, result.winnerSurface)
|
|
897
904
|
],
|
|
898
905
|
baselineCandidateId: "baseline",
|
|
899
906
|
candidateCandidateId: "winner"
|
|
900
907
|
});
|
|
908
|
+
const durationMs = Date.now() - startedAt;
|
|
909
|
+
const workerRecords = opts.collectWorkerRecords?.() ?? cellsToRunRecords(result.winnerOnHoldout.cells, "winner", runDir, result.winnerSurface);
|
|
910
|
+
const { record: provenance } = await emitLoopProvenance({
|
|
911
|
+
runId: `${runDir}#${startedAt}`,
|
|
912
|
+
runDir,
|
|
913
|
+
timestamp: new Date(startedAt).toISOString(),
|
|
914
|
+
baselineSurface: opts.baselineSurface,
|
|
915
|
+
winnerSurface: result.winnerSurface,
|
|
916
|
+
winnerLabel: result.winnerLabel,
|
|
917
|
+
winnerRationale: result.winnerRationale,
|
|
918
|
+
diff: result.promotedDiff,
|
|
919
|
+
generations: result.generations.map((g) => ({
|
|
920
|
+
generationIndex: g.record.generationIndex,
|
|
921
|
+
candidates: g.record.candidates,
|
|
922
|
+
promoted: g.record.promoted,
|
|
923
|
+
surfaces: g.surfaces.map((s) => ({ surfaceHash: s.surfaceHash, surface: s.surface }))
|
|
924
|
+
})),
|
|
925
|
+
gate: result.gateResult,
|
|
926
|
+
baselineOnHoldout: result.baselineOnHoldout,
|
|
927
|
+
winnerOnHoldout: result.winnerOnHoldout,
|
|
928
|
+
workerRecords,
|
|
929
|
+
totalCostUsd: totalCost,
|
|
930
|
+
totalDurationMs: durationMs,
|
|
931
|
+
storage,
|
|
932
|
+
hostedClient: opts.hostedTenant ? createHostedClient(opts.hostedTenant) : void 0
|
|
933
|
+
});
|
|
934
|
+
if (opts.onProvenance) opts.onProvenance(provenance);
|
|
901
935
|
const summary = {
|
|
902
936
|
baseline,
|
|
903
937
|
winner: {
|
|
904
938
|
...winnerStats,
|
|
905
|
-
surface: result.winnerSurface
|
|
939
|
+
surface: result.winnerSurface,
|
|
940
|
+
...result.winnerLabel ? { label: result.winnerLabel } : {},
|
|
941
|
+
...result.winnerRationale ? { rationale: result.winnerRationale } : {}
|
|
906
942
|
},
|
|
907
943
|
lift: winnerStats.compositeMean - baseline.compositeMean,
|
|
944
|
+
diff: result.promotedDiff,
|
|
945
|
+
provenance,
|
|
908
946
|
gateDecision: result.gateResult.decision,
|
|
909
947
|
generationsExplored: result.generations.length,
|
|
910
|
-
durationMs
|
|
948
|
+
durationMs,
|
|
911
949
|
totalCostUsd: totalCost,
|
|
912
950
|
insight,
|
|
913
951
|
raw: result
|
|
@@ -989,7 +1027,9 @@ function hashString(s) {
|
|
|
989
1027
|
}
|
|
990
1028
|
return h.toString(16).padStart(8, "0");
|
|
991
1029
|
}
|
|
992
|
-
function cellsToRunRecords(cells, candidateId, runId) {
|
|
1030
|
+
function cellsToRunRecords(cells, candidateId, runId, surface) {
|
|
1031
|
+
const promptHash = surfaceContentHash(surface);
|
|
1032
|
+
const configHash = `sha256:${createHash("sha256").update(candidateId).digest("hex")}`;
|
|
993
1033
|
return cells.map((cell) => {
|
|
994
1034
|
const perJudge = {};
|
|
995
1035
|
const perDimMeanAccum = {};
|
|
@@ -1027,8 +1067,8 @@ function cellsToRunRecords(cells, candidateId, runId) {
|
|
|
1027
1067
|
// Synthesize a stable seed for that pairing.
|
|
1028
1068
|
seed: cell.rep * 1e6 + hashString(cell.scenarioId).slice(0, 6).split("").reduce((a, c) => a * 31 + c.charCodeAt(0) >>> 0, 0),
|
|
1029
1069
|
model: "campaign-cell",
|
|
1030
|
-
promptHash
|
|
1031
|
-
configHash
|
|
1070
|
+
promptHash,
|
|
1071
|
+
configHash,
|
|
1032
1072
|
commitSha: "cell",
|
|
1033
1073
|
wallMs: cell.durationMs,
|
|
1034
1074
|
costUsd: cell.costUsd,
|