@tangle-network/agent-eval 0.23.1 → 0.24.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +80 -0
- package/README.md +141 -79
- package/dist/baseline-4R5deP0N.d.ts +108 -0
- package/dist/benchmarks/index.d.ts +3 -2
- package/dist/benchmarks/index.js +1 -1
- package/dist/builder-eval/index.d.ts +249 -0
- package/dist/builder-eval/index.js +391 -0
- package/dist/builder-eval/index.js.map +1 -0
- package/dist/{chunk-IOXMGMHQ.js → chunk-2A5XJB43.js} +142 -318
- package/dist/chunk-2A5XJB43.js.map +1 -0
- package/dist/chunk-47X6LRCE.js +76 -0
- package/dist/chunk-47X6LRCE.js.map +1 -0
- package/dist/{chunk-6M774GY6.js → chunk-4F5DQN55.js} +1 -1
- package/dist/chunk-4F5DQN55.js.map +1 -0
- package/dist/{chunk-KAO3Q65R.js → chunk-4S4BM3QQ.js} +15 -13
- package/dist/chunk-4S4BM3QQ.js.map +1 -0
- package/dist/chunk-5BKGXME7.js +65 -0
- package/dist/chunk-5BKGXME7.js.map +1 -0
- package/dist/{chunk-42I2QC2L.js → chunk-6QDKWHLS.js} +18 -14
- package/dist/chunk-6QDKWHLS.js.map +1 -0
- package/dist/chunk-I4MBDTY5.js +272 -0
- package/dist/chunk-I4MBDTY5.js.map +1 -0
- package/dist/chunk-K2TPS5LB.js +569 -0
- package/dist/chunk-K2TPS5LB.js.map +1 -0
- package/dist/chunk-KKHDIONI.js +414 -0
- package/dist/chunk-KKHDIONI.js.map +1 -0
- package/dist/chunk-KMPRBJK4.js +74 -0
- package/dist/chunk-KMPRBJK4.js.map +1 -0
- package/dist/{chunk-QUKKGHTZ.js → chunk-KTGTIOFD.js} +6 -3
- package/dist/chunk-KTGTIOFD.js.map +1 -0
- package/dist/chunk-LSH4MMOZ.js +838 -0
- package/dist/chunk-LSH4MMOZ.js.map +1 -0
- package/dist/chunk-NG236HPC.js +57 -0
- package/dist/chunk-NG236HPC.js.map +1 -0
- package/dist/{chunk-QBW3YBTR.js → chunk-NLMNWKVM.js} +14 -6
- package/dist/chunk-NLMNWKVM.js.map +1 -0
- package/dist/chunk-NU65VQ7M.js +99 -0
- package/dist/chunk-NU65VQ7M.js.map +1 -0
- package/dist/chunk-OHEPNJQN.js +554 -0
- package/dist/chunk-OHEPNJQN.js.map +1 -0
- package/dist/chunk-OWLAAMME.js +250 -0
- package/dist/chunk-OWLAAMME.js.map +1 -0
- package/dist/{chunk-SQQLHODJ.js → chunk-PC4UYEBM.js} +7 -4
- package/dist/chunk-PC4UYEBM.js.map +1 -0
- package/dist/{chunk-7EAUOUQS.js → chunk-RAF443UI.js} +213 -115
- package/dist/chunk-RAF443UI.js.map +1 -0
- package/dist/chunk-RZTMDUO7.js +49 -0
- package/dist/chunk-RZTMDUO7.js.map +1 -0
- package/dist/{chunk-EXGR4XEM.js → chunk-SESZDQPX.js} +23 -19
- package/dist/chunk-SESZDQPX.js.map +1 -0
- package/dist/{chunk-6KQG5HAH.js → chunk-SY6WAAAD.js} +84 -71
- package/dist/chunk-SY6WAAAD.js.map +1 -0
- package/dist/{chunk-5IIQKMD5.js → chunk-TVVP3ZZQ.js} +14 -4
- package/dist/chunk-TVVP3ZZQ.js.map +1 -0
- package/dist/{chunk-VQQSPGSM.js → chunk-VRJVTXRV.js} +169 -111
- package/dist/chunk-VRJVTXRV.js.map +1 -0
- package/dist/chunk-WWYCWKUM.js +196 -0
- package/dist/chunk-WWYCWKUM.js.map +1 -0
- package/dist/{chunk-AXHNWLIX.js → chunk-YRZ4M5GS.js} +2 -90
- package/dist/chunk-YRZ4M5GS.js.map +1 -0
- package/dist/chunk-ZN274SWR.js +613 -0
- package/dist/chunk-ZN274SWR.js.map +1 -0
- package/dist/cli.js +10 -6
- package/dist/cli.js.map +1 -1
- package/dist/{control-DvkH87qJ.d.ts → control-CBShYYA6.d.ts} +32 -33
- package/dist/control-runtime-BuJHoLg0.d.ts +180 -0
- package/dist/control.d.ts +8 -6
- package/dist/control.js +10 -7
- package/dist/{dataset-B9qvlm_o.d.ts → dataset-CiK_3LDr.d.ts} +5 -2
- package/dist/{emitter-B2XqDKFU.d.ts → emitter-DP_cSSiw.d.ts} +1 -1
- package/dist/errors-BZ9sTdz7.d.ts +70 -0
- package/dist/failure-cluster-C2EGSDiT.d.ts +76 -0
- package/dist/feedback-trajectory-DfFdrraJ.d.ts +169 -0
- package/dist/governance/index.d.ts +5 -0
- package/dist/governance/index.js +18 -0
- package/dist/governance/index.js.map +1 -0
- package/dist/{index-DDTlbHEK.d.ts → index--fVrWDiR.d.ts} +1 -1
- package/dist/index-Oj9fAPPN.d.ts +270 -0
- package/dist/index.d.ts +1866 -3151
- package/dist/index.js +5457 -7809
- package/dist/index.js.map +1 -1
- package/dist/{integrity-Cr5YodSY.d.ts → integrity-DK2EBVZC.d.ts} +4 -3
- package/dist/knowledge/index.d.ts +102 -0
- package/dist/knowledge/index.js +18 -0
- package/dist/knowledge/index.js.map +1 -0
- package/dist/meta-eval/index.d.ts +99 -0
- package/dist/meta-eval/index.js +324 -0
- package/dist/meta-eval/index.js.map +1 -0
- package/dist/multi-layer-verifier-LkP3LVKj.d.ts +141 -0
- package/dist/openapi.json +1 -1
- package/dist/optimization.d.ts +11 -8
- package/dist/optimization.js +11 -9
- package/dist/outcome-store-D6KWmYvj.d.ts +63 -0
- package/dist/pipelines/index.d.ts +172 -0
- package/dist/pipelines/index.js +409 -0
- package/dist/pipelines/index.js.map +1 -0
- package/dist/prm/index.d.ts +99 -0
- package/dist/prm/index.js +222 -0
- package/dist/prm/index.js.map +1 -0
- package/dist/query-DODUYdPg.d.ts +30 -0
- package/dist/release-report-TDPn1cxq.d.ts +292 -0
- package/dist/replay-BL96gCEP.d.ts +226 -0
- package/dist/reporting.d.ts +10 -295
- package/dist/reporting.js +10 -6
- package/dist/{eval-campaign-Ds5QljIh.d.ts → researcher-CUOiGcGv.d.ts} +148 -146
- package/dist/rl.d.ts +1762 -8
- package/dist/rl.js +2035 -58
- package/dist/rl.js.map +1 -1
- package/dist/rubric-D5tjHNJQ.d.ts +72 -0
- package/dist/rubric-predictive-validity-C0uDYwG6.d.ts +105 -0
- package/dist/{run-record-DNiOMBrZ.d.ts → run-record-CqzahIbx.d.ts} +4 -1
- package/dist/sequential-Dgz1n51-.d.ts +139 -0
- package/dist/{store-u47QaJ9G.d.ts → store-Db2Bv8Cf.d.ts} +1 -1
- package/dist/{summary-report-Ce1r4EYo.d.ts → summary-report-BXGs_9V0.d.ts} +3 -76
- package/dist/telemetry/file.js +4 -1
- package/dist/telemetry/file.js.map +1 -1
- package/dist/telemetry/index.js +57 -57
- package/dist/telemetry/index.js.map +1 -1
- package/dist/test-graded-scenario-B2kWEdh9.d.ts +146 -0
- package/dist/traces.d.ts +142 -387
- package/dist/traces.js +1302 -40
- package/dist/traces.js.map +1 -1
- package/dist/trajectory-CnoBo-JY.d.ts +32 -0
- package/dist/wire/index.d.ts +22 -22
- package/dist/wire/index.js +4 -3
- package/package.json +44 -18
- package/dist/chunk-42I2QC2L.js.map +0 -1
- package/dist/chunk-5IIQKMD5.js.map +0 -1
- package/dist/chunk-6KQG5HAH.js.map +0 -1
- package/dist/chunk-6M774GY6.js.map +0 -1
- package/dist/chunk-7EAUOUQS.js.map +0 -1
- package/dist/chunk-AXHNWLIX.js.map +0 -1
- package/dist/chunk-EXGR4XEM.js.map +0 -1
- package/dist/chunk-IOXMGMHQ.js.map +0 -1
- package/dist/chunk-KAO3Q65R.js.map +0 -1
- package/dist/chunk-LZKIOBG2.js +0 -2026
- package/dist/chunk-LZKIOBG2.js.map +0 -1
- package/dist/chunk-QBW3YBTR.js.map +0 -1
- package/dist/chunk-QUKKGHTZ.js.map +0 -1
- package/dist/chunk-SQQLHODJ.js.map +0 -1
- package/dist/chunk-V5QSWN7L.js +0 -1310
- package/dist/chunk-V5QSWN7L.js.map +0 -1
- package/dist/chunk-VQQSPGSM.js.map +0 -1
- package/dist/chunk-XPHOZPOM.js +0 -1947
- package/dist/chunk-XPHOZPOM.js.map +0 -1
- package/dist/feedback-trajectory-c43WGtTX.d.ts +0 -346
- package/dist/index-ekBXweiQ.d.ts +0 -1894
- package/dist/sequential-DgU2mFsE.d.ts +0 -304
|
@@ -1,8 +1,9 @@
|
|
|
1
|
-
import {
|
|
2
|
-
import { R as
|
|
3
|
-
import {
|
|
4
|
-
import {
|
|
5
|
-
import { T as
|
|
1
|
+
import { A as AgentEvalError, C as CaptureIntegrityError } from './errors-BZ9sTdz7.js';
|
|
2
|
+
import { R as RawProviderSink, P as ProviderRedactor, a as RunIntegrityExpectations, b as RunIntegrityReport } from './integrity-DK2EBVZC.js';
|
|
3
|
+
import { a as RunSplitTag, b as RunTokenUsage, c as RunJudgeMetadata, R as RunRecord } from './run-record-CqzahIbx.js';
|
|
4
|
+
import { W as ResearchReportOptions, N as ResearchReport, a3 as GateDecision } from './summary-report-BXGs_9V0.js';
|
|
5
|
+
import { T as TraceEmitter, R as RunCompleteHook } from './emitter-DP_cSSiw.js';
|
|
6
|
+
import { T as TraceStore } from './store-Db2Bv8Cf.js';
|
|
6
7
|
|
|
7
8
|
/**
|
|
8
9
|
* LLM client with graceful degrade.
|
|
@@ -81,7 +82,7 @@ interface LlmCallResult {
|
|
|
81
82
|
/** Raw response body. */
|
|
82
83
|
raw: Record<string, unknown>;
|
|
83
84
|
}
|
|
84
|
-
declare class LlmCallError extends
|
|
85
|
+
declare class LlmCallError extends AgentEvalError {
|
|
85
86
|
readonly status: number;
|
|
86
87
|
readonly body: string;
|
|
87
88
|
readonly model: string;
|
|
@@ -147,10 +148,11 @@ declare function callLlmJson<T = unknown>(req: LlmCallRequest, opts?: LlmClientO
|
|
|
147
148
|
value: T;
|
|
148
149
|
result: LlmCallResult;
|
|
149
150
|
}>;
|
|
150
|
-
|
|
151
|
-
|
|
151
|
+
type LlmRouteAssertionReason = 'no_explicit_base_url' | 'base_url_blocked' | 'base_url_not_allowed' | 'no_auth' | 'wrong_provider';
|
|
152
|
+
declare class LlmRouteAssertionError extends CaptureIntegrityError {
|
|
153
|
+
readonly reason: LlmRouteAssertionReason;
|
|
152
154
|
readonly baseUrl: string;
|
|
153
|
-
constructor(message: string,
|
|
155
|
+
constructor(message: string, reason: LlmRouteAssertionReason, baseUrl: string);
|
|
154
156
|
}
|
|
155
157
|
interface LlmRouteRequirements {
|
|
156
158
|
/**
|
|
@@ -218,143 +220,6 @@ declare class LlmClient {
|
|
|
218
220
|
}>;
|
|
219
221
|
}
|
|
220
222
|
|
|
221
|
-
/**
|
|
222
|
-
* Researcher interface — stable hook for an external autonomous-research
|
|
223
|
-
* agent to drive the meta-loop.
|
|
224
|
-
*
|
|
225
|
-
* Implementations live downstream (typically in a private repo that
|
|
226
|
-
* runs the actual LLM). This package ships only the contract + a
|
|
227
|
-
* `NoopResearcher` so consumers can wire the surface without being
|
|
228
|
-
* forced to implement every method up front.
|
|
229
|
-
*
|
|
230
|
-
* The four methods mirror the four stages of the paper "Two Loops,
|
|
231
|
-
* Three Roles":
|
|
232
|
-
*
|
|
233
|
-
* inspectFailures — given the observed runs, what failure modes
|
|
234
|
-
* are present? (data → diagnosis)
|
|
235
|
-
* proposeChange — given diagnosed failure modes, what
|
|
236
|
-
* structural changes should we try?
|
|
237
|
-
* (diagnosis → plan delta)
|
|
238
|
-
* applyChange — fold the proposed deltas into a concrete
|
|
239
|
-
* experiment plan against an existing baseline.
|
|
240
|
-
* (plan delta → executable plan)
|
|
241
|
-
* evaluateChange — run the plan, return runs + the gate verdict.
|
|
242
|
-
* (executable plan → verdict)
|
|
243
|
-
*
|
|
244
|
-
* Composition is the discipline: a Researcher implementation MUST
|
|
245
|
-
* keep these four steps separate and inspectable. Conflating
|
|
246
|
-
* "diagnose + propose + run" into a single LLM call defeats the
|
|
247
|
-
* point of the framework — you can't audit which step lied.
|
|
248
|
-
*
|
|
249
|
-
* THIS INTERFACE IS STABLE. Breaking changes require a new module
|
|
250
|
-
* (e.g. `Researcher2`) so existing implementations keep working.
|
|
251
|
-
*/
|
|
252
|
-
|
|
253
|
-
/** A diagnosed failure mode with the run-IDs that exhibit it. */
|
|
254
|
-
interface FailureMode {
|
|
255
|
-
/** Short machine-readable code. Must be stable across runs of the
|
|
256
|
-
* same researcher to enable longitudinal tracking. */
|
|
257
|
-
code: string;
|
|
258
|
-
/** Human-readable description for the paper / dashboard. */
|
|
259
|
-
description: string;
|
|
260
|
-
evidence: {
|
|
261
|
-
/** Run IDs (from `RunRecord.runId`) where this failure mode was
|
|
262
|
-
* observed. */
|
|
263
|
-
runIds: string[];
|
|
264
|
-
/** Number of run samples that informed the diagnosis. */
|
|
265
|
-
samples: number;
|
|
266
|
-
};
|
|
267
|
-
}
|
|
268
|
-
/** A single steering change the researcher wants to try. */
|
|
269
|
-
interface SteeringChange {
|
|
270
|
-
kind: 'reviewer_prompt' | 'skill_add' | 'skill_remove' | 'threshold' | 'budget';
|
|
271
|
-
/** Implementation-specific payload. Researcher implementations
|
|
272
|
-
* define the schema — keep this `unknown` here to avoid coupling
|
|
273
|
-
* the public interface to any one researcher's internal model. */
|
|
274
|
-
payload: unknown;
|
|
275
|
-
/** Why the researcher proposed this change. Goes into the audit
|
|
276
|
-
* trail next to the failure-mode evidence. */
|
|
277
|
-
rationale: string;
|
|
278
|
-
/** Optional self-reported expected delta on the headline metric. */
|
|
279
|
-
expectedDelta?: number;
|
|
280
|
-
}
|
|
281
|
-
/** A single experiment plan, mapped onto the search/holdout splits. */
|
|
282
|
-
interface ExperimentPlan {
|
|
283
|
-
baselineCandidateId: string;
|
|
284
|
-
proposedCandidateId: string;
|
|
285
|
-
changes: SteeringChange[];
|
|
286
|
-
/** USD ceiling for the entire experiment. The runner must stop
|
|
287
|
-
* before exceeding this and report a partial result. */
|
|
288
|
-
evaluationBudgetUsd: number;
|
|
289
|
-
/** Item IDs (your dataset keys) for the search vs holdout splits. */
|
|
290
|
-
splits: {
|
|
291
|
-
search: string[];
|
|
292
|
-
holdout: string[];
|
|
293
|
-
};
|
|
294
|
-
}
|
|
295
|
-
/** Result of running a plan: every run, plus the gate verdict. */
|
|
296
|
-
interface ExperimentResult {
|
|
297
|
-
plan: ExperimentPlan;
|
|
298
|
-
runs: RunRecord[];
|
|
299
|
-
gateDecision: GateDecision;
|
|
300
|
-
}
|
|
301
|
-
/**
|
|
302
|
-
* The researcher loop. Stable, four-step, inspectable.
|
|
303
|
-
*
|
|
304
|
-
* ┌──────────┐ inspectFailures ┌──────────┐ proposeChange ┌──────────┐
|
|
305
|
-
* │ runs │ ─────────────────▶│ failures │ ──────────────▶│ changes │
|
|
306
|
-
* └──────────┘ └──────────┘ └────┬─────┘
|
|
307
|
-
* │
|
|
308
|
-
* ▼
|
|
309
|
-
* ┌────────────────┐ applyChange ┌────────┐
|
|
310
|
-
* │ ExperimentPlan │ ◀────────────│ base │
|
|
311
|
-
* └────────┬───────┘ └────────┘
|
|
312
|
-
* │
|
|
313
|
-
* evaluateChange ▼
|
|
314
|
-
* ┌────────────────┐
|
|
315
|
-
* │ ExperimentResult│
|
|
316
|
-
* └────────────────┘
|
|
317
|
-
*/
|
|
318
|
-
interface Researcher {
|
|
319
|
-
inspectFailures(runs: RunRecord[]): Promise<FailureMode[]>;
|
|
320
|
-
proposeChange(failures: FailureMode[]): Promise<SteeringChange[]>;
|
|
321
|
-
applyChange(changes: SteeringChange[], baseline: ExperimentPlan): Promise<ExperimentPlan>;
|
|
322
|
-
evaluateChange(plan: ExperimentPlan): Promise<ExperimentResult>;
|
|
323
|
-
}
|
|
324
|
-
interface CallbackResearcherOptions {
|
|
325
|
-
inspectFailures: Researcher['inspectFailures'];
|
|
326
|
-
proposeChange: Researcher['proposeChange'];
|
|
327
|
-
applyChange: Researcher['applyChange'];
|
|
328
|
-
evaluateChange: Researcher['evaluateChange'];
|
|
329
|
-
}
|
|
330
|
-
/**
|
|
331
|
-
* Minimal concrete researcher for tests, scripts, and small integrations.
|
|
332
|
-
* Larger autonomous researchers can still implement `Researcher` directly.
|
|
333
|
-
*/
|
|
334
|
-
declare class CallbackResearcher implements Researcher {
|
|
335
|
-
private readonly callbacks;
|
|
336
|
-
constructor(callbacks: CallbackResearcherOptions);
|
|
337
|
-
inspectFailures(runs: RunRecord[]): Promise<FailureMode[]>;
|
|
338
|
-
proposeChange(failures: FailureMode[]): Promise<SteeringChange[]>;
|
|
339
|
-
applyChange(changes: SteeringChange[], baseline: ExperimentPlan): Promise<ExperimentPlan>;
|
|
340
|
-
evaluateChange(plan: ExperimentPlan): Promise<ExperimentResult>;
|
|
341
|
-
}
|
|
342
|
-
/**
|
|
343
|
-
* No-op researcher — fails loud on every method. Use as a placeholder
|
|
344
|
-
* in code paths that wire the interface but don't have an implementation
|
|
345
|
-
* yet. Importantly, this does NOT silently succeed: a no-op researcher
|
|
346
|
-
* that returned empty arrays would muffle the loop's signal that
|
|
347
|
-
* nobody implemented the brain.
|
|
348
|
-
*/
|
|
349
|
-
declare class NoopResearcher implements Researcher {
|
|
350
|
-
private readonly hint;
|
|
351
|
-
constructor(hint?: string);
|
|
352
|
-
inspectFailures(_runs: RunRecord[]): Promise<FailureMode[]>;
|
|
353
|
-
proposeChange(_failures: FailureMode[]): Promise<SteeringChange[]>;
|
|
354
|
-
applyChange(_changes: SteeringChange[], _baseline: ExperimentPlan): Promise<ExperimentPlan>;
|
|
355
|
-
evaluateChange(_plan: ExperimentPlan): Promise<ExperimentResult>;
|
|
356
|
-
}
|
|
357
|
-
|
|
358
223
|
/**
|
|
359
224
|
* EvalCampaign — opinionated matrix runner that wires the four
|
|
360
225
|
* capture-integrity directives by construction.
|
|
@@ -570,4 +435,141 @@ interface EvalCampaignResult {
|
|
|
570
435
|
}
|
|
571
436
|
declare function runEvalCampaign<V>(opts: EvalCampaignOptions<V>): Promise<EvalCampaignResult>;
|
|
572
437
|
|
|
438
|
+
/**
|
|
439
|
+
* Researcher interface — stable hook for an external autonomous-research
|
|
440
|
+
* agent to drive the meta-loop.
|
|
441
|
+
*
|
|
442
|
+
* Implementations live downstream (typically in a private repo that
|
|
443
|
+
* runs the actual LLM). This package ships only the contract + a
|
|
444
|
+
* `NoopResearcher` so consumers can wire the surface without being
|
|
445
|
+
* forced to implement every method up front.
|
|
446
|
+
*
|
|
447
|
+
* The four methods mirror the four stages of the paper "Two Loops,
|
|
448
|
+
* Three Roles":
|
|
449
|
+
*
|
|
450
|
+
* inspectFailures — given the observed runs, what failure modes
|
|
451
|
+
* are present? (data → diagnosis)
|
|
452
|
+
* proposeChange — given diagnosed failure modes, what
|
|
453
|
+
* structural changes should we try?
|
|
454
|
+
* (diagnosis → plan delta)
|
|
455
|
+
* applyChange — fold the proposed deltas into a concrete
|
|
456
|
+
* experiment plan against an existing baseline.
|
|
457
|
+
* (plan delta → executable plan)
|
|
458
|
+
* evaluateChange — run the plan, return runs + the gate verdict.
|
|
459
|
+
* (executable plan → verdict)
|
|
460
|
+
*
|
|
461
|
+
* Composition is the discipline: a Researcher implementation MUST
|
|
462
|
+
* keep these four steps separate and inspectable. Conflating
|
|
463
|
+
* "diagnose + propose + run" into a single LLM call defeats the
|
|
464
|
+
* point of the framework — you can't audit which step lied.
|
|
465
|
+
*
|
|
466
|
+
* THIS INTERFACE IS STABLE. Breaking changes require a new module
|
|
467
|
+
* (e.g. `Researcher2`) so existing implementations keep working.
|
|
468
|
+
*/
|
|
469
|
+
|
|
470
|
+
/** A diagnosed failure mode with the run-IDs that exhibit it. */
|
|
471
|
+
interface FailureMode {
|
|
472
|
+
/** Short machine-readable code. Must be stable across runs of the
|
|
473
|
+
* same researcher to enable longitudinal tracking. */
|
|
474
|
+
code: string;
|
|
475
|
+
/** Human-readable description for the paper / dashboard. */
|
|
476
|
+
description: string;
|
|
477
|
+
evidence: {
|
|
478
|
+
/** Run IDs (from `RunRecord.runId`) where this failure mode was
|
|
479
|
+
* observed. */
|
|
480
|
+
runIds: string[];
|
|
481
|
+
/** Number of run samples that informed the diagnosis. */
|
|
482
|
+
samples: number;
|
|
483
|
+
};
|
|
484
|
+
}
|
|
485
|
+
/** A single steering change the researcher wants to try. */
|
|
486
|
+
interface SteeringChange {
|
|
487
|
+
kind: 'reviewer_prompt' | 'skill_add' | 'skill_remove' | 'threshold' | 'budget';
|
|
488
|
+
/** Implementation-specific payload. Researcher implementations
|
|
489
|
+
* define the schema — keep this `unknown` here to avoid coupling
|
|
490
|
+
* the public interface to any one researcher's internal model. */
|
|
491
|
+
payload: unknown;
|
|
492
|
+
/** Why the researcher proposed this change. Goes into the audit
|
|
493
|
+
* trail next to the failure-mode evidence. */
|
|
494
|
+
rationale: string;
|
|
495
|
+
/** Optional self-reported expected delta on the headline metric. */
|
|
496
|
+
expectedDelta?: number;
|
|
497
|
+
}
|
|
498
|
+
/** A single experiment plan, mapped onto the search/holdout splits. */
|
|
499
|
+
interface ExperimentPlan {
|
|
500
|
+
baselineCandidateId: string;
|
|
501
|
+
proposedCandidateId: string;
|
|
502
|
+
changes: SteeringChange[];
|
|
503
|
+
/** USD ceiling for the entire experiment. The runner must stop
|
|
504
|
+
* before exceeding this and report a partial result. */
|
|
505
|
+
evaluationBudgetUsd: number;
|
|
506
|
+
/** Item IDs (your dataset keys) for the search vs holdout splits. */
|
|
507
|
+
splits: {
|
|
508
|
+
search: string[];
|
|
509
|
+
holdout: string[];
|
|
510
|
+
};
|
|
511
|
+
}
|
|
512
|
+
/** Result of running a plan: every run, plus the gate verdict. */
|
|
513
|
+
interface ExperimentResult {
|
|
514
|
+
plan: ExperimentPlan;
|
|
515
|
+
runs: RunRecord[];
|
|
516
|
+
gateDecision: GateDecision;
|
|
517
|
+
}
|
|
518
|
+
/**
|
|
519
|
+
* The researcher loop. Stable, four-step, inspectable.
|
|
520
|
+
*
|
|
521
|
+
* ┌──────────┐ inspectFailures ┌──────────┐ proposeChange ┌──────────┐
|
|
522
|
+
* │ runs │ ─────────────────▶│ failures │ ──────────────▶│ changes │
|
|
523
|
+
* └──────────┘ └──────────┘ └────┬─────┘
|
|
524
|
+
* │
|
|
525
|
+
* ▼
|
|
526
|
+
* ┌────────────────┐ applyChange ┌────────┐
|
|
527
|
+
* │ ExperimentPlan │ ◀────────────│ base │
|
|
528
|
+
* └────────┬───────┘ └────────┘
|
|
529
|
+
* │
|
|
530
|
+
* evaluateChange ▼
|
|
531
|
+
* ┌────────────────┐
|
|
532
|
+
* │ ExperimentResult│
|
|
533
|
+
* └────────────────┘
|
|
534
|
+
*/
|
|
535
|
+
interface Researcher {
|
|
536
|
+
inspectFailures(runs: RunRecord[]): Promise<FailureMode[]>;
|
|
537
|
+
proposeChange(failures: FailureMode[]): Promise<SteeringChange[]>;
|
|
538
|
+
applyChange(changes: SteeringChange[], baseline: ExperimentPlan): Promise<ExperimentPlan>;
|
|
539
|
+
evaluateChange(plan: ExperimentPlan): Promise<ExperimentResult>;
|
|
540
|
+
}
|
|
541
|
+
interface CallbackResearcherOptions {
|
|
542
|
+
inspectFailures: Researcher['inspectFailures'];
|
|
543
|
+
proposeChange: Researcher['proposeChange'];
|
|
544
|
+
applyChange: Researcher['applyChange'];
|
|
545
|
+
evaluateChange: Researcher['evaluateChange'];
|
|
546
|
+
}
|
|
547
|
+
/**
|
|
548
|
+
* Minimal concrete researcher for tests, scripts, and small integrations.
|
|
549
|
+
* Larger autonomous researchers can still implement `Researcher` directly.
|
|
550
|
+
*/
|
|
551
|
+
declare class CallbackResearcher implements Researcher {
|
|
552
|
+
private readonly callbacks;
|
|
553
|
+
constructor(callbacks: CallbackResearcherOptions);
|
|
554
|
+
inspectFailures(runs: RunRecord[]): Promise<FailureMode[]>;
|
|
555
|
+
proposeChange(failures: FailureMode[]): Promise<SteeringChange[]>;
|
|
556
|
+
applyChange(changes: SteeringChange[], baseline: ExperimentPlan): Promise<ExperimentPlan>;
|
|
557
|
+
evaluateChange(plan: ExperimentPlan): Promise<ExperimentResult>;
|
|
558
|
+
}
|
|
559
|
+
/**
|
|
560
|
+
* No-op researcher — fails loud on every method. Use as a placeholder
|
|
561
|
+
* in code paths that wire the interface but don't have an implementation
|
|
562
|
+
* yet. Importantly, this does NOT silently succeed: a no-op researcher
|
|
563
|
+
* that returned empty arrays would muffle the loop's signal that
|
|
564
|
+
* nobody implemented the brain.
|
|
565
|
+
*/
|
|
566
|
+
declare class NoopResearcher implements Researcher {
|
|
567
|
+
private readonly hint;
|
|
568
|
+
constructor(hint?: string);
|
|
569
|
+
inspectFailures(_runs: RunRecord[]): Promise<FailureMode[]>;
|
|
570
|
+
proposeChange(_failures: FailureMode[]): Promise<SteeringChange[]>;
|
|
571
|
+
applyChange(_changes: SteeringChange[], _baseline: ExperimentPlan): Promise<ExperimentPlan>;
|
|
572
|
+
evaluateChange(_plan: ExperimentPlan): Promise<ExperimentResult>;
|
|
573
|
+
}
|
|
574
|
+
|
|
573
575
|
export { CallbackResearcher as C, type EvalCampaignOptions as E, type FailedRun as F, type LlmClientOptions as L, NoopResearcher as N, type Researcher as R, type SteeringChange as S, type CallbackResearcherOptions as a, type CampaignFactoryParams as b, type CampaignIntegrityPolicy as c, type CampaignRunContext as d, type CampaignRunOutcome as e, type CampaignRunner as f, type CampaignScenario as g, type CampaignVariant as h, type EvalCampaignResult as i, type ExperimentPlan as j, type ExperimentResult as k, type FailureMode as l, LlmCallError as m, type LlmCallRequest as n, type LlmCallResult as o, LlmClient as p, type LlmMessage as q, runEvalCampaign as r, LlmRouteAssertionError as s, type LlmRouteRequirements as t, type LlmUsage as u, assertLlmRoute as v, callLlm as w, callLlmJson as x, probeLlm as y, stripFencedJson as z };
|