@tangle-network/agent-eval 0.21.0 → 0.23.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68) hide show
  1. package/CHANGELOG.md +236 -1
  2. package/README.md +17 -3
  3. package/dist/benchmarks/index.d.ts +2 -2
  4. package/dist/{chunk-WOK2RTWG.js → chunk-4W4NCYM2.js} +134 -109
  5. package/dist/chunk-4W4NCYM2.js.map +1 -0
  6. package/dist/{chunk-WOPGKVN4.js → chunk-6KQG5HAH.js} +2 -2
  7. package/dist/chunk-6M774GY6.js +53 -0
  8. package/dist/chunk-6M774GY6.js.map +1 -0
  9. package/dist/chunk-7EAUOUQS.js +495 -0
  10. package/dist/chunk-7EAUOUQS.js.map +1 -0
  11. package/dist/chunk-AXHNWLIX.js +246 -0
  12. package/dist/chunk-AXHNWLIX.js.map +1 -0
  13. package/dist/chunk-EXGR4XEM.js +283 -0
  14. package/dist/chunk-EXGR4XEM.js.map +1 -0
  15. package/dist/{chunk-3IX6QTB7.js → chunk-IOXMGMHQ.js} +418 -541
  16. package/dist/chunk-IOXMGMHQ.js.map +1 -0
  17. package/dist/{chunk-3GN6U53I.js → chunk-KAO3Q65R.js} +2 -2
  18. package/dist/chunk-LZKIOBG2.js +2026 -0
  19. package/dist/chunk-LZKIOBG2.js.map +1 -0
  20. package/dist/{chunk-YUFXO3TU.js → chunk-QBW3YBTR.js} +1 -1
  21. package/dist/chunk-QBW3YBTR.js.map +1 -0
  22. package/dist/chunk-QUKKGHTZ.js +121 -0
  23. package/dist/chunk-QUKKGHTZ.js.map +1 -0
  24. package/dist/{chunk-SNUHRBDL.js → chunk-SQQLHODJ.js} +10 -1
  25. package/dist/{chunk-SNUHRBDL.js.map → chunk-SQQLHODJ.js.map} +1 -1
  26. package/dist/{chunk-ARZ6BEV6.js → chunk-V5QSWN7L.js} +2 -2
  27. package/dist/{chunk-HRZELXCR.js → chunk-VQQSPGSM.js} +3 -3
  28. package/dist/cli.js +3 -3
  29. package/dist/{control-cxwMOAsy.d.ts → control-DvkH87qJ.d.ts} +2 -2
  30. package/dist/control.d.ts +3 -3
  31. package/dist/control.js +2 -2
  32. package/dist/eval-campaign-Ds5QljIh.d.ts +573 -0
  33. package/dist/{feedback-trajectory-CB0A32o3.d.ts → feedback-trajectory-c43WGtTX.d.ts} +1 -1
  34. package/dist/{index-c5saLbKD.d.ts → index-DDTlbHEK.d.ts} +1 -1
  35. package/dist/index-ekBXweiQ.d.ts +1894 -0
  36. package/dist/index.d.ts +20 -430
  37. package/dist/index.js +154 -34
  38. package/dist/index.js.map +1 -1
  39. package/dist/integrity-Cr5YodSY.d.ts +210 -0
  40. package/dist/openapi.json +1 -1
  41. package/dist/optimization.d.ts +7 -145
  42. package/dist/optimization.js +12 -3
  43. package/dist/reporting.d.ts +294 -4
  44. package/dist/reporting.js +18 -9
  45. package/dist/rl.d.ts +8 -0
  46. package/dist/rl.js +113 -0
  47. package/dist/rl.js.map +1 -0
  48. package/dist/{run-record-CX_jcAyr.d.ts → run-record-DNiOMBrZ.d.ts} +10 -1
  49. package/dist/sequential-DgU2mFsE.d.ts +304 -0
  50. package/dist/{multi-shot-optimization-Bvtz294B.d.ts → summary-report-Ce1r4EYo.d.ts} +382 -2
  51. package/dist/traces.d.ts +101 -181
  52. package/dist/traces.js +19 -8
  53. package/dist/wire/index.js +3 -3
  54. package/docs/auto-research-loop-end-to-end.md +186 -0
  55. package/docs/research-report-methodology.md +19 -4
  56. package/docs/three-package-architecture.md +180 -0
  57. package/docs/wire-protocol.md +1 -1
  58. package/package.json +7 -2
  59. package/dist/chunk-3IX6QTB7.js.map +0 -1
  60. package/dist/chunk-KRR4VMH7.js +0 -423
  61. package/dist/chunk-KRR4VMH7.js.map +0 -1
  62. package/dist/chunk-WOK2RTWG.js.map +0 -1
  63. package/dist/chunk-YUFXO3TU.js.map +0 -1
  64. package/dist/reporting-Da2ihlcM.d.ts +0 -672
  65. /package/dist/{chunk-WOPGKVN4.js.map → chunk-6KQG5HAH.js.map} +0 -0
  66. /package/dist/{chunk-3GN6U53I.js.map → chunk-KAO3Q65R.js.map} +0 -0
  67. /package/dist/{chunk-ARZ6BEV6.js.map → chunk-V5QSWN7L.js.map} +0 -0
  68. /package/dist/{chunk-HRZELXCR.js.map → chunk-VQQSPGSM.js.map} +0 -0
@@ -0,0 +1,210 @@
1
+ import { T as TraceStore } from './store-u47QaJ9G.js';
2
+
3
+ /**
4
+ * RawProviderSink — first-class persistence for the actual HTTP-level
5
+ * request/response bodies of every LLM provider call.
6
+ *
7
+ * Why this is a separate sink from the structured `LlmSpan`:
8
+ *
9
+ * - `LlmSpan` records the *intent* — model name, messages, output text,
10
+ * usage. It's what dashboards read; it's NOT enough for forensics.
11
+ * - When a downstream consumer reports "the verifier used the wrong route"
12
+ * or "tokens look right but reasoning was missing," the only way to
13
+ * answer is the raw HTTP body. Span fields can lie (a proxy can echo
14
+ * a different `model` value than what actually answered); the raw
15
+ * response is ground truth.
16
+ *
17
+ * Default behaviour: opt-in. Pass `rawSink` to `LlmClientOptions` (or the
18
+ * matrix runner / BuilderSession sets it up automatically) and every
19
+ * request, response, and error is recorded — including retries, with the
20
+ * attempt index attached so a flaky call's full event chain is recoverable.
21
+ *
22
+ * Redaction is enforced at sink time. The default redactor strips
23
+ * `Authorization`, `X-Api-Key`, `X-Auth-Token`, `Cookie` headers and any
24
+ * payload field whose key matches `apiKey | api_key | bearer | password |
25
+ * secret | token` (case-insensitive). Override via the sink constructor or
26
+ * the per-call `redactor`. The `redactedFields` array on the persisted
27
+ * event lets a reviewer see what was stripped without exposing the values.
28
+ */
29
+ type RawProviderDirection = 'request' | 'response' | 'error';
30
+ interface RawProviderEvent {
31
+ /** Stable id. Generated by the sink if omitted. */
32
+ eventId: string;
33
+ /** Trace context populated by `LlmClient` when the call is wrapped in a span. */
34
+ runId?: string;
35
+ spanId?: string;
36
+ /**
37
+ * Logical provider name. Free-form so callers can use whatever id matches
38
+ * their topology (`'openai'`, `'anthropic'`, `'tangle-router'`, …). When
39
+ * omitted, derived from `baseUrl` in `LlmClientOptions`.
40
+ */
41
+ provider: string;
42
+ model: string;
43
+ /** Endpoint path, e.g. `'/v1/chat/completions'`. */
44
+ endpoint: string;
45
+ /** Base URL used for the call (already-normalised — no trailing slash). */
46
+ baseUrl: string;
47
+ /** 0-indexed retry attempt. The first attempt is 0; a retried call gets 1, 2, … */
48
+ attemptIndex: number;
49
+ direction: RawProviderDirection;
50
+ /** Unix ms. */
51
+ timestamp: number;
52
+ /** Wall-clock duration of the call leg. Set on `response` and `error` events; omitted on `request`. */
53
+ durationMs?: number;
54
+ statusCode?: number;
55
+ requestHeaders?: Record<string, string>;
56
+ requestBody?: unknown;
57
+ responseHeaders?: Record<string, string>;
58
+ responseBody?: unknown;
59
+ /** Set on `direction: 'error'` events. */
60
+ errorMessage?: string;
61
+ /** Field paths the redactor stripped from this event ('header:Authorization', 'body.apiKey', …). */
62
+ redactedFields: string[];
63
+ }
64
+ interface RawProviderSinkFilter {
65
+ runId?: string;
66
+ spanId?: string;
67
+ direction?: RawProviderDirection;
68
+ attemptIndex?: number;
69
+ }
70
+ interface RawProviderSink {
71
+ record(event: RawProviderEvent): Promise<void>;
72
+ /** Optional listing — implementations that durably persist (file, db) should support this. */
73
+ list?(filter?: RawProviderSinkFilter): Promise<RawProviderEvent[]>;
74
+ /** Optional teardown for backed implementations. */
75
+ close?(): Promise<void>;
76
+ }
77
+ type ProviderRedactor = (event: RawProviderEvent) => RawProviderEvent;
78
+ /**
79
+ * Default redactor — strips well-known auth headers and any body field whose
80
+ * key matches the credential pattern. Records every redacted path on
81
+ * `event.redactedFields` so a downstream reviewer can see what was removed.
82
+ */
83
+ declare function defaultProviderRedactor(event: RawProviderEvent): RawProviderEvent;
84
+ interface InMemoryRawProviderSinkOptions {
85
+ redactor?: ProviderRedactor;
86
+ }
87
+ declare class InMemoryRawProviderSink implements RawProviderSink {
88
+ private events;
89
+ private redactor;
90
+ constructor(opts?: InMemoryRawProviderSinkOptions);
91
+ record(event: RawProviderEvent): Promise<void>;
92
+ list(filter?: RawProviderSinkFilter): Promise<RawProviderEvent[]>;
93
+ size(): number;
94
+ }
95
+ declare class NoopRawProviderSink implements RawProviderSink {
96
+ record(): Promise<void>;
97
+ /**
98
+ * Returns an empty array. Implemented so `assertRunCaptured` does not
99
+ * trip the `no_raw_sink` issue when a caller explicitly opts out of
100
+ * capture by passing this sink — opt-out is a deliberate choice, not a
101
+ * misconfiguration.
102
+ */
103
+ list(): Promise<RawProviderEvent[]>;
104
+ }
105
+ interface FileSystemRawProviderSinkOptions {
106
+ /** Directory the NDJSON file is written into. Created if missing. */
107
+ dir: string;
108
+ /** File name; default `'raw-provider-events.ndjson'`. */
109
+ fileName?: string;
110
+ /** Bytes after which the writer rolls over to a new file (default 32 MiB). */
111
+ rollAtBytes?: number;
112
+ redactor?: ProviderRedactor;
113
+ }
114
+ declare class FileSystemRawProviderSink implements RawProviderSink {
115
+ private dir;
116
+ private fileName;
117
+ private rollAtBytes;
118
+ private redactor;
119
+ private bytesWritten;
120
+ private rollIndex;
121
+ private initPromise;
122
+ constructor(opts: FileSystemRawProviderSinkOptions);
123
+ private ensureInit;
124
+ private currentPath;
125
+ record(event: RawProviderEvent): Promise<void>;
126
+ list(filter?: RawProviderSinkFilter): Promise<RawProviderEvent[]>;
127
+ }
128
+ /**
129
+ * Best-effort provider id from a base URL. Falls back to the URL host when
130
+ * none of the well-known patterns match.
131
+ */
132
+ declare function providerFromBaseUrl(baseUrl: string): string;
133
+
134
+ /**
135
+ * Run-completion integrity check — at end of run, verify the expected event
136
+ * types were actually captured. The point is the launch-review failure mode:
137
+ * a run *appears* successful but the raw provider events were never written,
138
+ * so a downstream reviewer can't reconstruct what happened.
139
+ *
140
+ * Pattern:
141
+ *
142
+ * const report = await assertRunCaptured(store, runId, {
143
+ * llmSpansMin: 1,
144
+ * judgeSpansMin: 1,
145
+ * rawSink: providerSink, // must have ≥ 1 event for this run
146
+ * requireRawCoverageOfLlmSpans: true, // every llm span has matching raw events
147
+ * })
148
+ * if (!report.ok) throwIfRunIncomplete(report) // or mark run failed and continue
149
+ *
150
+ * The function is read-only on the store and returns a structured report;
151
+ * the caller chooses the failure mode (throw, mark run failed, log warning).
152
+ * `throwIfRunIncomplete` is the convenient strict mode.
153
+ */
154
+
155
+ interface RunIntegrityExpectations {
156
+ /** Minimum LLM span count. Default 0 (no requirement). */
157
+ llmSpansMin?: number;
158
+ /** Minimum judge span count. Default 0. */
159
+ judgeSpansMin?: number;
160
+ /** Minimum tool span count. Default 0. */
161
+ toolSpansMin?: number;
162
+ /**
163
+ * Raw provider sink to consult for capture verification. When present,
164
+ * the check requires at least one raw event for the run.
165
+ */
166
+ rawSink?: RawProviderSink;
167
+ /** Minimum raw provider event count. Default 0; ignored when `rawSink` absent. */
168
+ rawProviderEventsMin?: number;
169
+ /**
170
+ * Every LLM span must have at least one matching raw `request` event
171
+ * (matched by spanId). Catches the common bug where the structured span
172
+ * was emitted but the raw HTTP capture was wired to a different sink.
173
+ */
174
+ requireRawCoverageOfLlmSpans?: boolean;
175
+ /** Run outcome must be set (not null/undefined). Default false. */
176
+ requireOutcome?: boolean;
177
+ }
178
+ type RunIntegrityIssueCode = 'no_run' | 'missing_llm_spans' | 'missing_judge_spans' | 'missing_tool_spans' | 'missing_raw_events' | 'no_raw_sink' | 'orphan_llm_span' | 'missing_outcome';
179
+ interface RunIntegrityIssue {
180
+ code: RunIntegrityIssueCode;
181
+ message: string;
182
+ detail?: Record<string, unknown>;
183
+ }
184
+ interface RunIntegrityReport {
185
+ ok: boolean;
186
+ runId: string;
187
+ llmSpanCount: number;
188
+ judgeSpanCount: number;
189
+ toolSpanCount: number;
190
+ rawProviderEventCount: number;
191
+ /**
192
+ * Coverage of LLM spans by raw provider events keyed on spanId.
193
+ * `total` is the number of LLM spans; `covered` is the count with at
194
+ * least one matching `request` raw event.
195
+ */
196
+ rawSpanCoverage: {
197
+ covered: number;
198
+ total: number;
199
+ };
200
+ issues: RunIntegrityIssue[];
201
+ }
202
+ declare class RunIntegrityError extends Error {
203
+ readonly report: RunIntegrityReport;
204
+ constructor(report: RunIntegrityReport);
205
+ }
206
+ declare function assertRunCaptured(store: TraceStore, runId: string, expectations?: RunIntegrityExpectations): Promise<RunIntegrityReport>;
207
+ /** Strict mode: throws `RunIntegrityError` when the report isn't ok. */
208
+ declare function throwIfRunIncomplete(report: RunIntegrityReport): void;
209
+
210
+ export { FileSystemRawProviderSink as F, InMemoryRawProviderSink as I, NoopRawProviderSink as N, type ProviderRedactor as P, type RawProviderSink as R, type RunIntegrityExpectations as a, type RunIntegrityReport as b, type FileSystemRawProviderSinkOptions as c, type InMemoryRawProviderSinkOptions as d, type RawProviderDirection as e, type RawProviderEvent as f, type RawProviderSinkFilter as g, RunIntegrityError as h, type RunIntegrityIssue as i, type RunIntegrityIssueCode as j, assertRunCaptured as k, defaultProviderRedactor as l, providerFromBaseUrl as p, throwIfRunIncomplete as t };
package/dist/openapi.json CHANGED
@@ -2,7 +2,7 @@
2
2
  "openapi": "3.1.0",
3
3
  "info": {
4
4
  "title": "@tangle-network/agent-eval — wire protocol",
5
- "version": "0.21.0",
5
+ "version": "0.23.0",
6
6
  "description": "HTTP and stdio RPC interface to agent-eval. The TypeScript runtime is the source of truth; this spec is the contract that cross-language clients (Python, Rust, Go) generate from.\n\nWire-protocol version: 1.0.0. Bumps on breaking changes to request/response schemas.",
7
7
  "contact": {
8
8
  "name": "Tangle Network",
@@ -1,146 +1,8 @@
1
- import { G as GateDecision } from './multi-shot-optimization-Bvtz294B.js';
2
- export { A as ActionableSideInfo, b as AsiSeverity, D as DEFAULT_MUTATION_PRIMITIVES, E as EvolvableVariant, e as GenerationReport, I as InMemoryTrialCache, h as MultiShotGateConfig, i as MultiShotGateResult, j as MultiShotMutateAdapter, k as MultiShotOptimizationConfig, l as MultiShotOptimizationResult, m as MultiShotRun, n as MultiShotRunInput, o as MultiShotRunner, p as MultiShotScore, q as MultiShotScorer, r as MultiShotSplit, s as MultiShotTrace, t as MultiShotTrialResult, u as MultiShotVariant, M as MutateAdapter, v as PromptEvolutionConfig, w as PromptEvolutionEvent, x as PromptEvolutionResult, R as ReflectionContext, y as ReflectionProposal, S as ScenarioAggregate, z as ScoreAdapter, T as TrialCache, a as TrialResult, B as TrialTrace, V as VariantAggregate, C as buildReflectionPrompt, J as defaultMultiShotObjectives, Q as parseReflectionResponse, U as runMultiShotOptimization, W as runPromptEvolution, Y as trialTraceFromMultiShotTrial } from './multi-shot-optimization-Bvtz294B.js';
3
- import { a as RunRecord } from './run-record-CX_jcAyr.js';
4
- export { n as FeedbackArtifactType, o as FeedbackAttempt, F as FeedbackLabel, p as FeedbackLabelKind, q as FeedbackLabelSource, r as FeedbackOptimizerRow, s as FeedbackOutcome, t as FeedbackReplayAdapter, u as FeedbackReplayResult, v as FeedbackSeverity, w as FeedbackSplitPolicy, x as FeedbackTask, b as FeedbackTrajectory, y as FeedbackTrajectoryFilter, a as FeedbackTrajectoryStore, z as FileSystemFeedbackTrajectoryStore, I as InMemoryFeedbackTrajectoryStore, P as PreferenceMemoryEntry, A as ProposedSideEffect, D as assignFeedbackSplit, E as controlRunToFeedbackTrajectory, G as createFeedbackTrajectory, H as feedbackTrajectoriesToDatasetScenarios, J as feedbackTrajectoriesToOptimizerRows, K as feedbackTrajectoryToDatasetScenario, L as feedbackTrajectoryToOptimizerRow, N as parseFeedbackTrajectoriesJsonl, O as renderPreferenceMemoryMarkdown, Q as replayFeedbackTrajectories, R as replayFeedbackTrajectory, U as serializeFeedbackTrajectoriesJsonl, Y as summarizePreferenceMemory, Z as withAssignedFeedbackSplit } from './feedback-trajectory-CB0A32o3.js';
5
- import './dataset-B9qvlm_o.js';
6
- import './emitter-B2XqDKFU.js';
1
+ export { C as CallbackResearcher, a as CallbackResearcherOptions, b as CampaignFactoryParams, c as CampaignIntegrityPolicy, d as CampaignRunContext, e as CampaignRunOutcome, f as CampaignRunner, g as CampaignScenario, h as CampaignVariant, E as EvalCampaignOptions, i as EvalCampaignResult, j as ExperimentPlan, k as ExperimentResult, F as FailedRun, l as FailureMode, N as NoopResearcher, R as Researcher, S as SteeringChange, r as runEvalCampaign } from './eval-campaign-Ds5QljIh.js';
2
+ export { A as ActionableSideInfo, a as AsiSeverity, D as DEFAULT_MUTATION_PRIMITIVES, E as EvolvableVariant, G as GenerationReport, I as InMemoryTrialCache, M as MultiShotGateConfig, b as MultiShotGateResult, c as MultiShotMutateAdapter, d as MultiShotOptimizationConfig, e as MultiShotOptimizationResult, f as MultiShotRun, g as MultiShotRunInput, h as MultiShotRunner, i as MultiShotScore, j as MultiShotScorer, k as MultiShotSplit, l as MultiShotTrace, m as MultiShotTrialResult, n as MultiShotVariant, o as MutateAdapter, P as PromptEvolutionConfig, p as PromptEvolutionEvent, q as PromptEvolutionResult, R as ReflectionContext, r as ReflectionProposal, S as ScenarioAggregate, s as ScoreAdapter, T as TrialCache, t as TrialResult, u as TrialTrace, V as VariantAggregate, v as buildReflectionPrompt, w as defaultMultiShotObjectives, x as parseReflectionResponse, y as runMultiShotOptimization, z as runPromptEvolution, B as trialTraceFromMultiShotTrial } from './summary-report-Ce1r4EYo.js';
3
+ export { F as FeedbackArtifactType, a as FeedbackAttempt, b as FeedbackLabel, c as FeedbackLabelKind, d as FeedbackLabelSource, e as FeedbackOptimizerRow, f as FeedbackOutcome, g as FeedbackReplayAdapter, h as FeedbackReplayResult, i as FeedbackSeverity, j as FeedbackSplitPolicy, k as FeedbackTask, l as FeedbackTrajectory, m as FeedbackTrajectoryFilter, n as FeedbackTrajectoryStore, o as FileSystemFeedbackTrajectoryStore, I as InMemoryFeedbackTrajectoryStore, P as PreferenceMemoryEntry, p as ProposedSideEffect, q as assignFeedbackSplit, r as controlRunToFeedbackTrajectory, s as createFeedbackTrajectory, t as feedbackTrajectoriesToDatasetScenarios, u as feedbackTrajectoriesToOptimizerRows, v as feedbackTrajectoryToDatasetScenario, w as feedbackTrajectoryToOptimizerRow, x as parseFeedbackTrajectoriesJsonl, y as renderPreferenceMemoryMarkdown, z as replayFeedbackTrajectories, A as replayFeedbackTrajectory, B as serializeFeedbackTrajectoriesJsonl, C as summarizePreferenceMemory, D as withAssignedFeedbackSplit } from './feedback-trajectory-c43WGtTX.js';
4
+ import './run-record-DNiOMBrZ.js';
5
+ import './integrity-Cr5YodSY.js';
7
6
  import './store-u47QaJ9G.js';
8
-
9
- /**
10
- * Researcher interface — stable hook for an external autonomous-research
11
- * agent to drive the meta-loop.
12
- *
13
- * Implementations live downstream (typically in a private repo that
14
- * runs the actual LLM). This package ships only the contract + a
15
- * `NoopResearcher` so consumers can wire the surface without being
16
- * forced to implement every method up front.
17
- *
18
- * The four methods mirror the four stages of the paper "Two Loops,
19
- * Three Roles":
20
- *
21
- * inspectFailures — given the observed runs, what failure modes
22
- * are present? (data → diagnosis)
23
- * proposeChange — given diagnosed failure modes, what
24
- * structural changes should we try?
25
- * (diagnosis → plan delta)
26
- * applyChange — fold the proposed deltas into a concrete
27
- * experiment plan against an existing baseline.
28
- * (plan delta → executable plan)
29
- * evaluateChange — run the plan, return runs + the gate verdict.
30
- * (executable plan → verdict)
31
- *
32
- * Composition is the discipline: a Researcher implementation MUST
33
- * keep these four steps separate and inspectable. Conflating
34
- * "diagnose + propose + run" into a single LLM call defeats the
35
- * point of the framework — you can't audit which step lied.
36
- *
37
- * THIS INTERFACE IS STABLE. Breaking changes require a new module
38
- * (e.g. `Researcher2`) so existing implementations keep working.
39
- */
40
-
41
- /** A diagnosed failure mode with the run-IDs that exhibit it. */
42
- interface FailureMode {
43
- /** Short machine-readable code. Must be stable across runs of the
44
- * same researcher to enable longitudinal tracking. */
45
- code: string;
46
- /** Human-readable description for the paper / dashboard. */
47
- description: string;
48
- evidence: {
49
- /** Run IDs (from `RunRecord.runId`) where this failure mode was
50
- * observed. */
51
- runIds: string[];
52
- /** Number of run samples that informed the diagnosis. */
53
- samples: number;
54
- };
55
- }
56
- /** A single steering change the researcher wants to try. */
57
- interface SteeringChange {
58
- kind: 'reviewer_prompt' | 'skill_add' | 'skill_remove' | 'threshold' | 'budget';
59
- /** Implementation-specific payload. Researcher implementations
60
- * define the schema — keep this `unknown` here to avoid coupling
61
- * the public interface to any one researcher's internal model. */
62
- payload: unknown;
63
- /** Why the researcher proposed this change. Goes into the audit
64
- * trail next to the failure-mode evidence. */
65
- rationale: string;
66
- /** Optional self-reported expected delta on the headline metric. */
67
- expectedDelta?: number;
68
- }
69
- /** A single experiment plan, mapped onto the search/holdout splits. */
70
- interface ExperimentPlan {
71
- baselineCandidateId: string;
72
- proposedCandidateId: string;
73
- changes: SteeringChange[];
74
- /** USD ceiling for the entire experiment. The runner must stop
75
- * before exceeding this and report a partial result. */
76
- evaluationBudgetUsd: number;
77
- /** Item IDs (your dataset keys) for the search vs holdout splits. */
78
- splits: {
79
- search: string[];
80
- holdout: string[];
81
- };
82
- }
83
- /** Result of running a plan: every run, plus the gate verdict. */
84
- interface ExperimentResult {
85
- plan: ExperimentPlan;
86
- runs: RunRecord[];
87
- gateDecision: GateDecision;
88
- }
89
- /**
90
- * The researcher loop. Stable, four-step, inspectable.
91
- *
92
- * ┌──────────┐ inspectFailures ┌──────────┐ proposeChange ┌──────────┐
93
- * │ runs │ ─────────────────▶│ failures │ ──────────────▶│ changes │
94
- * └──────────┘ └──────────┘ └────┬─────┘
95
- * │
96
- * ▼
97
- * ┌────────────────┐ applyChange ┌────────┐
98
- * │ ExperimentPlan │ ◀────────────│ base │
99
- * └────────┬───────┘ └────────┘
100
- * │
101
- * evaluateChange ▼
102
- * ┌────────────────┐
103
- * │ ExperimentResult│
104
- * └────────────────┘
105
- */
106
- interface Researcher {
107
- inspectFailures(runs: RunRecord[]): Promise<FailureMode[]>;
108
- proposeChange(failures: FailureMode[]): Promise<SteeringChange[]>;
109
- applyChange(changes: SteeringChange[], baseline: ExperimentPlan): Promise<ExperimentPlan>;
110
- evaluateChange(plan: ExperimentPlan): Promise<ExperimentResult>;
111
- }
112
- interface CallbackResearcherOptions {
113
- inspectFailures: Researcher['inspectFailures'];
114
- proposeChange: Researcher['proposeChange'];
115
- applyChange: Researcher['applyChange'];
116
- evaluateChange: Researcher['evaluateChange'];
117
- }
118
- /**
119
- * Minimal concrete researcher for tests, scripts, and small integrations.
120
- * Larger autonomous researchers can still implement `Researcher` directly.
121
- */
122
- declare class CallbackResearcher implements Researcher {
123
- private readonly callbacks;
124
- constructor(callbacks: CallbackResearcherOptions);
125
- inspectFailures(runs: RunRecord[]): Promise<FailureMode[]>;
126
- proposeChange(failures: FailureMode[]): Promise<SteeringChange[]>;
127
- applyChange(changes: SteeringChange[], baseline: ExperimentPlan): Promise<ExperimentPlan>;
128
- evaluateChange(plan: ExperimentPlan): Promise<ExperimentResult>;
129
- }
130
- /**
131
- * No-op researcher — fails loud on every method. Use as a placeholder
132
- * in code paths that wire the interface but don't have an implementation
133
- * yet. Importantly, this does NOT silently succeed: a no-op researcher
134
- * that returned empty arrays would muffle the loop's signal that
135
- * nobody implemented the brain.
136
- */
137
- declare class NoopResearcher implements Researcher {
138
- private readonly hint;
139
- constructor(hint?: string);
140
- inspectFailures(_runs: RunRecord[]): Promise<FailureMode[]>;
141
- proposeChange(_failures: FailureMode[]): Promise<SteeringChange[]>;
142
- applyChange(_changes: SteeringChange[], _baseline: ExperimentPlan): Promise<ExperimentPlan>;
143
- evaluateChange(_plan: ExperimentPlan): Promise<ExperimentResult>;
144
- }
145
-
146
- export { CallbackResearcher, type CallbackResearcherOptions, type ExperimentPlan, type ExperimentResult, type FailureMode, NoopResearcher, type Researcher, type SteeringChange };
7
+ import './emitter-B2XqDKFU.js';
8
+ import './dataset-B9qvlm_o.js';
@@ -25,9 +25,17 @@ import {
25
25
  summarizePreferenceMemory,
26
26
  trialTraceFromMultiShotTrial,
27
27
  withAssignedFeedbackSplit
28
- } from "./chunk-HRZELXCR.js";
29
- import "./chunk-YUFXO3TU.js";
30
- import "./chunk-KRR4VMH7.js";
28
+ } from "./chunk-VQQSPGSM.js";
29
+ import "./chunk-QBW3YBTR.js";
30
+ import {
31
+ runEvalCampaign
32
+ } from "./chunk-EXGR4XEM.js";
33
+ import "./chunk-KAO3Q65R.js";
34
+ import "./chunk-IOXMGMHQ.js";
35
+ import "./chunk-QUKKGHTZ.js";
36
+ import "./chunk-SQQLHODJ.js";
37
+ import "./chunk-5IIQKMD5.js";
38
+ import "./chunk-6M774GY6.js";
31
39
  import "./chunk-PZ5AY32C.js";
32
40
  export {
33
41
  CallbackResearcher,
@@ -50,6 +58,7 @@ export {
50
58
  renderPreferenceMemoryMarkdown,
51
59
  replayFeedbackTrajectories,
52
60
  replayFeedbackTrajectory,
61
+ runEvalCampaign,
53
62
  runMultiShotOptimization,
54
63
  runPromptEvolution,
55
64
  serializeFeedbackTrajectoriesJsonl,