@tangle-network/agent-eval 0.72.0 → 0.72.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69) hide show
  1. package/CHANGELOG.md +39 -0
  2. package/dist/adapters/http.d.ts +1 -1
  3. package/dist/adapters/langchain.d.ts +1 -1
  4. package/dist/adapters/otel.d.ts +3 -2
  5. package/dist/agent-profile-DYRboYWu.d.ts +364 -0
  6. package/dist/analyst/index.d.ts +221 -0
  7. package/dist/analyst/index.js +371 -0
  8. package/dist/analyst/index.js.map +1 -0
  9. package/dist/analyst-t7zZS3TV.d.ts +88 -0
  10. package/dist/campaign/index.d.ts +518 -9
  11. package/dist/campaign/index.js +672 -22
  12. package/dist/campaign/index.js.map +1 -1
  13. package/dist/chunk-7W4SM7FD.js +1075 -0
  14. package/dist/chunk-7W4SM7FD.js.map +1 -0
  15. package/dist/{chunk-AIWHLG7J.js → chunk-GJJNJVIR.js} +11 -11
  16. package/dist/chunk-JHA3ZGSO.js +1496 -0
  17. package/dist/chunk-JHA3ZGSO.js.map +1 -0
  18. package/dist/{chunk-4QJN7RDX.js → chunk-JYE3WOTE.js} +55 -7
  19. package/dist/{chunk-4QJN7RDX.js.map → chunk-JYE3WOTE.js.map} +1 -1
  20. package/dist/chunk-LB2UOI5F.js +412 -0
  21. package/dist/chunk-LB2UOI5F.js.map +1 -0
  22. package/dist/{chunk-ODGETRTM.js → chunk-VUINJM5M.js} +234 -1415
  23. package/dist/chunk-VUINJM5M.js.map +1 -0
  24. package/dist/chunk-WYIHD6EB.js +1044 -0
  25. package/dist/chunk-WYIHD6EB.js.map +1 -0
  26. package/dist/{chunk-UD6EF73X.js → chunk-XPILG2CA.js} +119 -2
  27. package/dist/chunk-XPILG2CA.js.map +1 -0
  28. package/dist/contract/index.d.ts +17 -13
  29. package/dist/contract/index.js +13 -7
  30. package/dist/contract/index.js.map +1 -1
  31. package/dist/{control-DxvZeV5X.d.ts → control-BgA6BYTm.d.ts} +1 -1
  32. package/dist/control.d.ts +2 -2
  33. package/dist/{feedback-trajectory-8hKC5EOb.d.ts → feedback-trajectory-B3rErRsh.d.ts} +1 -1
  34. package/dist/harness-optimizer-EnEnQPsr.d.ts +106 -0
  35. package/dist/hosted/index.d.ts +223 -2
  36. package/dist/index.d.ts +49 -1323
  37. package/dist/index.js +353 -2496
  38. package/dist/index.js.map +1 -1
  39. package/dist/{index-BGBrVS24.d.ts → insight-report-Df3lxYXM.d.ts} +1 -221
  40. package/dist/kind-factory-DW9XWPvM.d.ts +172 -0
  41. package/dist/multi-layer-verifier-DlWCXuxL.d.ts +141 -0
  42. package/dist/openapi.json +1 -1
  43. package/dist/pareto-E-pembql.d.ts +81 -0
  44. package/dist/{provenance-C69gLUXH.d.ts → provenance-B-TFszPW.d.ts} +131 -4
  45. package/dist/redact-B40YG2M_.d.ts +45 -0
  46. package/dist/registry-DuVYiTvw.d.ts +128 -0
  47. package/dist/{researcher-WJvIpX3L.d.ts → researcher-C_KJyIGg.d.ts} +1 -141
  48. package/dist/rl.d.ts +4 -3
  49. package/dist/rl.js +4 -4
  50. package/dist/run-critic-BAIjX99r.d.ts +56 -0
  51. package/dist/{run-improvement-loop-Bzamo6GB.d.ts → run-improvement-loop-BqYH2vCR.d.ts} +25 -1
  52. package/dist/semantic-concept-judge-CV9Wlx4t.d.ts +650 -0
  53. package/dist/{store-jzKpMl16.d.ts → store-GmBE2pZZ.d.ts} +1 -1
  54. package/dist/traces.d.ts +371 -308
  55. package/dist/traces.js +43 -18
  56. package/dist/{types-CnmZ2bkP.d.ts → types-Bba0vl1V.d.ts} +1 -1
  57. package/dist/{registry-BGKyX6bw.d.ts → types-CRD68aH7.d.ts} +3 -128
  58. package/dist/wire/index.d.ts +1 -1
  59. package/dist/workflow/index.d.ts +494 -0
  60. package/dist/workflow/index.js +2177 -0
  61. package/dist/workflow/index.js.map +1 -0
  62. package/docs/design/self-improvement-roadmap.md +106 -0
  63. package/package.json +36 -12
  64. package/dist/agent-profile-DzcPHR1Z.d.ts +0 -114
  65. package/dist/chunk-ODGETRTM.js.map +0 -1
  66. package/dist/chunk-SL55X4VN.js +0 -186
  67. package/dist/chunk-SL55X4VN.js.map +0 -1
  68. package/dist/chunk-UD6EF73X.js.map +0 -1
  69. /package/dist/{chunk-AIWHLG7J.js.map → chunk-GJJNJVIR.js.map} +0 -0
@@ -1,8 +1,10 @@
1
- import { o as Mutator, I as ImprovementDriver, S as Scenario, G as Gate, f as CampaignResult, M as MutableSurface, k as GateResult, j as GateDecision } from './types-CnmZ2bkP.js';
1
+ import { o as Mutator, I as ImprovementDriver, S as Scenario, G as Gate, k as GateResult, i as GateContext, f as CampaignResult, M as MutableSurface, j as GateDecision } from './types-Bba0vl1V.js';
2
2
  import { R as RedTeamCase } from './red-team-DW9Ca_tj.js';
3
3
  import { R as RunRecord } from './run-record-BgTFzO2r.js';
4
- import { a as RunCampaignOptions, C as CampaignStorage } from './run-improvement-loop-Bzamo6GB.js';
5
- import { H as HostedClient, T as TraceSpanEvent } from './index-BGBrVS24.js';
4
+ import { D as Direction } from './pareto-E-pembql.js';
5
+ import { a as PairedBootstrapResult } from './statistics-B7yCbi9i.js';
6
+ import { a as RunCampaignOptions, C as CampaignStorage } from './run-improvement-loop-BqYH2vCR.js';
7
+ import { HostedClient, TraceSpanEvent } from './hosted/index.js';
6
8
 
7
9
  /**
8
10
  * @experimental
@@ -113,6 +115,131 @@ interface HeldOutGateOptions<TScenario extends Scenario = Scenario> {
113
115
  }
114
116
  declare function heldOutGate<TArtifact, TScenario extends Scenario>(options: HeldOutGateOptions<TScenario>): Gate<TArtifact, TScenario>;
115
117
 
118
+ /**
119
+ * @experimental
120
+ *
121
+ * Promotion policy over the evidence VECTOR — the substrate's answer to "never
122
+ * collapse the multi-objective promotion decision into one scalar." A
123
+ * `defaultProductionGate` is one opinionated composition; this module factors
124
+ * the decision into two reusable pieces so MANY policies can compete over the
125
+ * SAME evidence (the quant-desk pattern: one evidence bus, plural strategies):
126
+ *
127
+ * buildEvidenceVector(ctx, objectives, opts) -> EvidenceVector // the bus
128
+ * PromotionPolicy = (ev: EvidenceVector) => GateResult // a strategy
129
+ * paretoPolicy(ev) // the default strategy
130
+ * paretoSignificanceGate(options): Gate // bus + policy as a Gate
131
+ *
132
+ * The Pareto policy is SYMMETRIC multi-objective: every objective is BOTH a
133
+ * potential gain source AND a safety floor (unlike `defaultProductionGate`,
134
+ * where only `composite` can win and `criticalDimensions` are pure floors). A
135
+ * candidate ships iff it weakly DOMINATES the baseline at the confidence level —
136
+ * no objective credibly worse (CI floor breach) AND at least one objective
137
+ * credibly better (CI gain). Insufficient evidence on ANY axis -> need_more_work
138
+ * (NOT folded into hold: "gather more reps" and "reject" are different actions).
139
+ *
140
+ * Cost/latency are NOT CI axes here — `GateContext` carries only an aggregate
141
+ * per-side cost, no per-cell observation vector to bootstrap. Treat them as hard
142
+ * constraints (compose with a budget gate via `composeGate`), not faked CIs.
143
+ */
144
+
145
+ /** Where an objective's per-cell scalar comes from. `composite` reads the
146
+ * judge's composite; `dimension` reads a named per-dimension score. */
147
+ type ObjectiveSource = {
148
+ kind: 'composite';
149
+ } | {
150
+ kind: 'dimension';
151
+ dimension: string;
152
+ };
153
+ interface PromotionObjective {
154
+ /** Stable label used in reports + `contributingGates`. */
155
+ name: string;
156
+ source: ObjectiveSource;
157
+ /** 'maximize' (quality dims) or 'minimize' (error/risk/length dims). Orients
158
+ * the paired delta so a positive bootstrap always means "candidate better". */
159
+ direction: Direction;
160
+ /** The good-direction paired-delta CI lower bound must EXCEED this to count
161
+ * as a significant gain on this axis. Interpreted in the judge's native
162
+ * scale. Default 0 (⇒ "confidently better"). */
163
+ gainThreshold?: number;
164
+ /** A floor breach (regression) is declared when the good-direction CI lower
165
+ * bound is below −floorTolerance. When omitted it auto-scales off observed
166
+ * magnitudes (0.05 on [0,1], 5 on 0-100), matching `dimensionRegressions`. */
167
+ floorTolerance?: number;
168
+ }
169
+ /** Per-axis verdict from the good-direction paired bootstrap. */
170
+ type AxisVerdict = 'improved' | 'regressed' | 'flat' | 'few_runs';
171
+ interface AxisEvidence {
172
+ name: string;
173
+ source: ObjectiveSource;
174
+ direction: Direction;
175
+ /** Paired bootstrap on the GOOD-DIRECTION delta (oriented by `direction`):
176
+ * a positive value means the candidate is better on this axis. */
177
+ bootstrap: PairedBootstrapResult;
178
+ /** Paired observations contributing to this axis. */
179
+ n: number;
180
+ gainThreshold: number;
181
+ floorTolerance: number;
182
+ verdict: AxisVerdict;
183
+ }
184
+ interface EvidenceVector {
185
+ /** One entry per objective — NOTHING averaged across axes. */
186
+ axes: AxisEvidence[];
187
+ /** Smallest paired n across axes that produced observations — the binding
188
+ * evidence-sufficiency constraint. 0 when no axis produced observations. */
189
+ minN: number;
190
+ /** Aggregate per-side cost from the gate context (a constraint input, not a
191
+ * CI axis — see the module header). */
192
+ cost: {
193
+ candidate: number;
194
+ baseline: number;
195
+ };
196
+ }
197
+ /** A promotion strategy: a pure function from the evidence vector to a verdict.
198
+ * Many policies can run over the same `EvidenceVector` and disagree — that's
199
+ * the point (competing strategies, shared evidence). */
200
+ type PromotionPolicy = (ev: EvidenceVector) => GateResult;
201
+ interface BuildEvidenceVectorOptions {
202
+ /** Minimum paired observations before an axis can claim significance; below
203
+ * it the axis is `few_runs`. Default 3. */
204
+ minProductiveRuns?: number;
205
+ /** Confidence level for every axis bootstrap. Default 0.95. */
206
+ confidence?: number;
207
+ /** Bootstrap resamples. Default 2000. */
208
+ resamples?: number;
209
+ /** Fixed bootstrap seed for a deterministic, reproducible verdict. Default 1337. */
210
+ seed?: number;
211
+ }
212
+ /**
213
+ * The Evidence Bus. For each objective, pair candidate vs baseline by full
214
+ * cellId and bootstrap a CI on the good-direction paired delta. Reuses the
215
+ * exact `pairHoldout` + `pairedBootstrap` machinery the held-out gate uses, so
216
+ * a single source of truth governs pairing granularity + scale handling.
217
+ */
218
+ declare function buildEvidenceVector<TArtifact, TScenario extends Scenario>(ctx: GateContext<TArtifact, TScenario>, objectives: PromotionObjective[], opts?: BuildEvidenceVectorOptions): EvidenceVector;
219
+ /**
220
+ * The default strategy: symmetric multi-objective Pareto significance. Ship iff
221
+ * the candidate weakly dominates the baseline at the confidence level — no axis
222
+ * credibly worse AND ≥1 axis credibly better. Floor breach on any axis → hold
223
+ * (anti-Goodhart, dominates everything). Insufficient evidence on any axis →
224
+ * need_more_work. Statistically equivalent → hold (never ship noise).
225
+ */
226
+ declare const paretoPolicy: PromotionPolicy;
227
+ interface ParetoSignificanceGateOptions extends BuildEvidenceVectorOptions {
228
+ /** The objective vector. Every axis is both a gain source and a safety floor. */
229
+ objectives: PromotionObjective[];
230
+ /** Strategy applied to the evidence vector. Default `paretoPolicy`. Override
231
+ * to run a stricter/looser strategy over the SAME bus (competing policies). */
232
+ policy?: PromotionPolicy;
233
+ /** Override the gate name in reports. */
234
+ name?: string;
235
+ }
236
+ /**
237
+ * Wrap the bus + a policy as a `Gate`. Plugs into the existing
238
+ * `runImprovementLoop({ gate })` slot and composes via `composeGate`; default
239
+ * loop behavior is unchanged because consumers opt in by passing this gate.
240
+ */
241
+ declare function paretoSignificanceGate<TArtifact = unknown, TScenario extends Scenario = Scenario>(options: ParetoSignificanceGateOptions): Gate<TArtifact, TScenario>;
242
+
116
243
  /**
117
244
  * @experimental
118
245
  *
@@ -311,4 +438,4 @@ interface EmitLoopProvenanceArgs<TArtifact, TScenario extends Scenario> extends
311
438
  */
312
439
  declare function emitLoopProvenance<TArtifact, TScenario extends Scenario>(args: EmitLoopProvenanceArgs<TArtifact, TScenario>): Promise<EmitLoopProvenanceResult>;
313
440
 
314
- export { type BuildLoopProvenanceArgs as B, type DefaultProductionGateOptions as D, type EvolutionaryDriverOptions as E, type HeldOutGateOptions as H, type LoopProvenanceRecord as L, type RunEvalOptions as R, type EmitLoopProvenanceArgs as a, type EmitLoopProvenanceResult as b, composeGate as c, defaultProductionGate as d, evolutionaryDriver as e, type LoopProvenanceBackend as f, type LoopProvenanceCandidate as g, heldOutGate as h, buildLoopProvenanceRecord as i, emitLoopProvenance as j, provenanceSpansPath as k, loopProvenanceSpans as l, provenanceRecordPath as p, runEval as r, surfaceContentHash as s };
441
+ export { type AxisEvidence as A, type BuildEvidenceVectorOptions as B, type DefaultProductionGateOptions as D, type EvidenceVector as E, type HeldOutGateOptions as H, type LoopProvenanceRecord as L, type ObjectiveSource as O, type ParetoSignificanceGateOptions as P, type RunEvalOptions as R, type AxisVerdict as a, type EvolutionaryDriverOptions as b, type PromotionObjective as c, type PromotionPolicy as d, buildEvidenceVector as e, composeGate as f, defaultProductionGate as g, evolutionaryDriver as h, heldOutGate as i, paretoSignificanceGate as j, type BuildLoopProvenanceArgs as k, type EmitLoopProvenanceArgs as l, type EmitLoopProvenanceResult as m, type LoopProvenanceBackend as n, type LoopProvenanceCandidate as o, paretoPolicy as p, buildLoopProvenanceRecord as q, runEval as r, emitLoopProvenance as s, loopProvenanceSpans as t, provenanceRecordPath as u, provenanceSpansPath as v, surfaceContentHash as w };
@@ -0,0 +1,45 @@
1
+ /**
2
+ * Redaction — remove PII / secrets from trace payloads before persist.
3
+ *
4
+ * Pre-persistence rules mean raw traces in storage are already scrubbed.
5
+ * Unredacted variants (for debugging / post-mortems) live in a separate
6
+ * storage layer with stricter access controls; this module only covers
7
+ * the default scrub-then-persist path.
8
+ *
9
+ * Rules compose: pass an array of `RedactionRule`, each is applied in
10
+ * order. Strings that match get replaced with a tagged sentinel so the
11
+ * eval framework can count how many redactions happened per run
12
+ * (surfaced via `redaction_applied` events).
13
+ */
14
+ interface RedactionRule {
15
+ id: string;
16
+ pattern: RegExp;
17
+ /** Replacement — e.g. '[PII:email]'. Defaults to `[redacted:{id}]`. */
18
+ replacement?: string;
19
+ }
20
+ interface RedactionReport {
21
+ redactionCount: number;
22
+ byRule: Record<string, number>;
23
+ }
24
+ /** OWASP / common-sense defaults — extend per-domain. */
25
+ declare const DEFAULT_REDACTION_RULES: RedactionRule[];
26
+ declare const REDACTION_VERSION = "1.0.0";
27
+ /**
28
+ * Redact a single string. Returns the new string and a per-rule count of
29
+ * how many substitutions fired.
30
+ */
31
+ declare function redactString(input: string, rules?: RedactionRule[]): {
32
+ output: string;
33
+ report: RedactionReport;
34
+ };
35
+ /**
36
+ * Walk a JSON-ish value applying `redactString` to every string leaf.
37
+ * Arrays and plain objects are recursed; other types pass through
38
+ * untouched. Circular references throw — traces should be tree-shaped.
39
+ */
40
+ declare function redactValue(value: unknown, rules?: RedactionRule[], report?: RedactionReport): {
41
+ value: unknown;
42
+ report: RedactionReport;
43
+ };
44
+
45
+ export { DEFAULT_REDACTION_RULES as D, type RedactionRule as R, type RedactionReport as a, REDACTION_VERSION as b, redactValue as c, redactString as r };
@@ -0,0 +1,128 @@
1
+ import { A as Analyst, a as AnalystContext, b as AnalystRunSummary, c as AnalystFinding, d as AnalystRunResult, C as ChatClient, e as AnalystRunInputs, f as AnalystRunEvent } from './types-CRD68aH7.js';
2
+
3
+ /**
4
+ * AnalystRegistry — orchestrate N analysts against one run.
5
+ *
6
+ * Owns three responsibilities and only three:
7
+ * 1. Registration — ids must be unique; bad registrations fail loudly
8
+ * at register-time, not run-time.
9
+ * 2. Routing — each analyst declares its `inputKind`; the registry
10
+ * picks the matching field from AnalystRunInputs and skips the
11
+ * analyst with a logged reason if it's missing.
12
+ * 3. Isolation — one analyst's exception MUST NOT stop other analysts.
13
+ * Failed analysts produce zero findings + a 'failed' summary row.
14
+ *
15
+ * Cross-cutting concerns (telemetry, error → finding conversion, cost
16
+ * ingestion, storage rotation) live in `AnalystHooks`. Budget shaping
17
+ * (equal split vs weighted vs custom) lives in `BudgetPolicy`. Both
18
+ * have sensible defaults; consumers override only what they need.
19
+ */
20
+
21
+ interface AnalystHooks {
22
+ /** Before analyze() — last chance to mutate ctx (e.g. inject tags, override budget). */
23
+ onBeforeAnalyze?(args: {
24
+ analyst: Analyst;
25
+ ctx: AnalystContext;
26
+ runId: string;
27
+ }): void | Promise<void>;
28
+ /** After every analyst (ok | failed | skipped). Use for telemetry, ingestion, rotation. */
29
+ onAfterAnalyze?(args: {
30
+ analyst: Analyst;
31
+ summary: AnalystRunSummary;
32
+ findings: AnalystFinding[];
33
+ runId: string;
34
+ }): void | Promise<void>;
35
+ /**
36
+ * On analyst exception. Hook MAY return findings to convert the
37
+ * error into structured findings; the summary still reports 'failed'.
38
+ * Return void to keep the default empty-findings behavior.
39
+ */
40
+ onError?(args: {
41
+ analyst: Analyst;
42
+ error: Error;
43
+ runId: string;
44
+ }): AnalystFinding[] | undefined | Promise<AnalystFinding[] | undefined>;
45
+ /** Once after registry.run() completes. Use for final aggregation, persistence. */
46
+ onComplete?(args: {
47
+ result: AnalystRunResult;
48
+ }): void | Promise<void>;
49
+ }
50
+ interface BudgetPolicy {
51
+ /** Overall USD cap across the registry.run(). */
52
+ totalUsd?: number;
53
+ /** Per-analyst weight for the default allocator. Missing ids get weight 1. */
54
+ weights?: Record<string, number>;
55
+ /**
56
+ * Custom allocator — receives the analyst, remaining/total budget, and
57
+ * the count of analysts that will run. Returns the per-analyst budget
58
+ * (or undefined to leave it uncapped). Overrides weights when set.
59
+ */
60
+ allocate?: (args: {
61
+ analyst: Analyst;
62
+ totalUsd: number | undefined;
63
+ remainingUsd: number | undefined;
64
+ runningCount: number;
65
+ }) => number | undefined;
66
+ }
67
+ interface AnalystRegistryOptions {
68
+ /** Shared chat client passed to every LLM analyst via AnalystContext. */
69
+ chat?: ChatClient;
70
+ /** Logger callback. Defaults to a no-op. */
71
+ log?: (msg: string, fields?: Record<string, unknown>) => void;
72
+ /** Hooks invoked around analyze() — observability + customization seam. */
73
+ hooks?: AnalystHooks;
74
+ /** Default budget when run() doesn't override. */
75
+ defaultBudget?: BudgetPolicy;
76
+ }
77
+ interface RegistryRunOpts {
78
+ /** Restrict to a subset of registered analysts by id. */
79
+ only?: string[];
80
+ /** Skip these analysts even if registered. Useful for cheap iteration. */
81
+ skip?: string[];
82
+ /** Budget policy — totalUsd + optional weights/allocator. Falls back to options.defaultBudget. */
83
+ budget?: BudgetPolicy;
84
+ /** Wall-clock cap. Analysts SHOULD honor `ctx.deadlineMs`. */
85
+ timeoutMs?: number;
86
+ /** Abort signal — forwarded into every analyst's context. */
87
+ signal?: AbortSignal;
88
+ /** Tags echoed into AnalystContext.tags — useful for tracking environment/version in findings. */
89
+ tags?: Record<string, string>;
90
+ /**
91
+ * Prior-run findings made available as retrieval context to every
92
+ * analyst via `ctx.priorFindings`. The registry forwards the slice
93
+ * whose `analyst_id` matches each registered analyst so a kind sees
94
+ * only its own history. Pass `{ '*': findings }` to broadcast to
95
+ * every analyst (useful for cross-kind chaining where the improvement
96
+ * analyst consumes upstream failure findings).
97
+ */
98
+ priorFindings?: ReadonlyArray<AnalystFinding> | Record<string, ReadonlyArray<AnalystFinding>>;
99
+ }
100
+ declare class AnalystRegistry {
101
+ private readonly analysts;
102
+ private readonly options;
103
+ constructor(options?: AnalystRegistryOptions);
104
+ register(analyst: Analyst): void;
105
+ list(): ReadonlyArray<{
106
+ id: string;
107
+ description: string;
108
+ version: string;
109
+ cost: Analyst['cost'];
110
+ }>;
111
+ run(runId: string, inputs: AnalystRunInputs, runOpts?: RegistryRunOpts): Promise<AnalystRunResult>;
112
+ /**
113
+ * Streaming counterpart to `run()`. Emits `AnalystRunEvent` values
114
+ * in real time — `run-started`, then per-analyst `skipped` /
115
+ * `started` / `completed`, then a terminal `run-completed` whose
116
+ * payload is the full `AnalystRunResult`. UIs use this to render
117
+ * progress; persistence consumers use `run()` and read the result.
118
+ *
119
+ * Hooks (`onBeforeAnalyze` / `onAfterAnalyze` / `onError` /
120
+ * `onComplete`) fire as before — streaming is additive, not a hook
121
+ * replacement.
122
+ */
123
+ runStream(runId: string, inputs: AnalystRunInputs, runOpts?: RegistryRunOpts): AsyncGenerator<AnalystRunEvent, void, void>;
124
+ private selectAnalysts;
125
+ private routeInput;
126
+ }
127
+
128
+ export { type AnalystHooks as A, type BudgetPolicy as B, type RegistryRunOpts as R, AnalystRegistry as a, type AnalystRegistryOptions as b };
@@ -7,146 +7,6 @@ import { R as RawProviderSink } from './raw-provider-sink-C46HDghv.js';
7
7
  import { F as FailureClass } from './schema-m0gsnbt3.js';
8
8
  import { T as TraceStore } from './store-CKUAgsJz.js';
9
9
 
10
- /**
11
- * Multi-layer verifier — ordered pipeline of verification layers.
12
- *
13
- * Different contract from {@link JudgeRunner} (which runs parallel
14
- * specs against a sandbox). MultiLayerVerifier is a DAG of layers
15
- * (install → typecheck → build → lint → serve → semantic → …) with
16
- * dependency-based skip, per-layer findings, soft-fail semantics, and
17
- * an aggregated `blendedScore` across all passed layers.
18
- *
19
- * Use when you want:
20
- * - ordered stages where a failing upstream stage skips downstream ones
21
- * - each stage produces rich `findings` (severity + message + evidence)
22
- * - a single composite score across stages with per-stage weights
23
- * - soft-fail stages whose failure doesn't abort the pipeline
24
- *
25
- * Use {@link JudgeRunner} when you want:
26
- * - N independent judges running in parallel against the same artifact
27
- * - no inter-judge dependencies
28
- * - boolean `passed` per judge + overall
29
- *
30
- * Both primitives compose — JudgeRunner can be invoked as a single
31
- * layer inside a MultiLayerVerifier if that suits the caller.
32
- */
33
- type LayerStatus = 'pass' | 'fail' | 'skipped' | 'error' | 'timeout';
34
- type Severity = 'critical' | 'major' | 'minor' | 'info';
35
- interface Finding {
36
- severity: Severity;
37
- message: string;
38
- evidence?: string;
39
- /** Optional layer name the finding belongs to (set by the verifier if omitted). */
40
- layer?: string;
41
- /**
42
- * Free-form structured payload — used by `multiToolchainLayer` to attach
43
- * `{ adapter: 'pnpm' }`, by judges to attach evidence pointers, etc.
44
- * Renderers MAY interrogate; agent-eval primitives never assume shape.
45
- */
46
- detail?: Record<string, unknown>;
47
- }
48
- interface LayerResult {
49
- layer: string;
50
- status: LayerStatus;
51
- /** 0..1 score, optional — layers that don't produce a numeric score omit. */
52
- score?: number;
53
- durationMs: number;
54
- findings: Finding[];
55
- /** Short human-readable summary (one line). */
56
- reason?: string;
57
- /**
58
- * Numeric layer-level diagnostics: error counts, warning counts,
59
- * cyclomatic complexity, total adapter wall-time, etc. Keyed by
60
- * diagnostic name; null = "diagnostic not applicable / not measured."
61
- * Renderers that know the keys can display them; ones that don't,
62
- * ignore. Free-form on purpose — consumers type the value shape in
63
- * their own namespace.
64
- */
65
- diagnostics?: Record<string, number | null>;
66
- /** Any rich per-layer detail — rendered as-is by consumers that know the layer. */
67
- detail?: Record<string, unknown>;
68
- }
69
- interface VerifyContext<Env = unknown> {
70
- /** Per-run opaque context the caller provides. Layers destructure what they need. */
71
- env: Env;
72
- /** Previously-computed results from layers that already ran. */
73
- prior: Record<string, LayerResult>;
74
- /** Signal — if aborted, layers MUST bail within reasonable wall. */
75
- signal: AbortSignal;
76
- }
77
- interface Layer<Env = unknown> {
78
- name: string;
79
- /** Stages that must have `status: 'pass'` before this layer runs. */
80
- dependsOn?: string[];
81
- /**
82
- * Weight in the composite `blendedScore`. Default 1.0. Layers with weight 0
83
- * contribute findings but not score.
84
- */
85
- weight?: number;
86
- /**
87
- * If true, a `fail` status contributes to `blendedScore` (as 0) instead of
88
- * being dropped — use for layers whose failure is a real signal. Default:
89
- * fail drops from numerator + denominator, matching VB's existing semantics.
90
- */
91
- failContributesToScore?: boolean;
92
- /** Optional per-layer wall-cap in ms. Honored by the verifier (AbortSignal). */
93
- capMs?: number;
94
- run: (ctx: VerifyContext<Env>) => Promise<LayerResult> | LayerResult;
95
- }
96
- interface VerifyOptions<Env = unknown> {
97
- env: Env;
98
- /**
99
- * Overall wall cap. Default: sum of layer capMs, or Infinity if any layer
100
- * omits a cap. The verifier short-circuits remaining layers on overall cap.
101
- */
102
- overallCapMs?: number;
103
- /** Called with each layer result as it completes. */
104
- onLayer?: (result: LayerResult) => void;
105
- }
106
- interface VerificationReport {
107
- layers: LayerResult[];
108
- passCount: number;
109
- failCount: number;
110
- skippedCount: number;
111
- errorCount: number;
112
- /** True iff at least one scored layer ran AND every scored layer passed. */
113
- allPass: boolean;
114
- /**
115
- * Weighted mean of `score` across contributing layers. 0 when no layers
116
- * contributed. See {@link Layer.failContributesToScore} for fail semantics.
117
- */
118
- blendedScore: number;
119
- durationMs: number;
120
- startedAt: string;
121
- finishedAt: string;
122
- }
123
- /**
124
- * Grade a semantic-concept-style judge result into a single layer status.
125
- *
126
- * Pass when overall score >= threshold AND no critical-severity concept gap.
127
- * Fail otherwise. Use inside a `Layer.run` when wrapping a concept judge.
128
- *
129
- * Generalized from VerticalBench H3 fix: `failingConcepts.length === 0` was
130
- * too strict — a single concept at 6/10 failed the entire layer despite
131
- * overall score being >= 0.7. Now we trust the judge's own `severity` field:
132
- * `critical` findings veto; `major`/`minor` reduce the score but don't veto.
133
- */
134
- declare function gradeSemanticStatus(input: {
135
- score: number;
136
- findings: Array<{
137
- severity: Severity;
138
- present?: boolean;
139
- score?: number;
140
- }>;
141
- available: boolean;
142
- threshold?: number;
143
- }): LayerStatus;
144
- declare class MultiLayerVerifier<Env = unknown> {
145
- private readonly layers;
146
- constructor(layers: Layer<Env>[]);
147
- run(opts: VerifyOptions<Env>): Promise<VerificationReport>;
148
- }
149
-
150
10
  /**
151
11
  * EvalCampaign — opinionated matrix runner that wires the four
152
12
  * capture-integrity directives by construction.
@@ -524,4 +384,4 @@ declare class NoopResearcher implements Researcher {
524
384
  evaluateChange(_plan: ExperimentPlan): Promise<ExperimentResult>;
525
385
  }
526
386
 
527
- export { CallbackResearcher as C, type ExperimentPlan as E, type FailureMode as F, type Layer as L, MultiLayerVerifier as M, NoopResearcher as N, type Researcher as R, type SteeringChange as S, type VerificationReport as V, type ExperimentResult as a, type EvalCampaignResult as b, type EvalCampaignOptions as c, type Severity as d, type VerifyOptions as e, type LayerResult as f, type VerifyContext as g, type CallbackResearcherOptions as h, type CampaignFactoryParams as i, type CampaignIntegrityPolicy as j, type CampaignRunContext as k, type CampaignRunOutcome as l, type CampaignRunner as m, type CampaignScenario as n, type CampaignVariant as o, type FailedRun as p, type Finding as q, runEvalCampaign as r, type LayerStatus as s, gradeSemanticStatus as t };
387
+ export { CallbackResearcher as C, type ExperimentPlan as E, type FailureMode as F, NoopResearcher as N, type Researcher as R, type SteeringChange as S, type ExperimentResult as a, type EvalCampaignResult as b, type EvalCampaignOptions as c, type CallbackResearcherOptions as d, type CampaignFactoryParams as e, type CampaignIntegrityPolicy as f, type CampaignRunContext as g, type CampaignRunOutcome as h, type CampaignRunner as i, type CampaignScenario as j, type CampaignVariant as k, type FailedRun as l, runEvalCampaign as r };
package/dist/rl.d.ts CHANGED
@@ -1,12 +1,13 @@
1
1
  import { R as RunRecord, b as RunSplitTag } from './run-record-BgTFzO2r.js';
2
- import { f as CampaignResult } from './types-CnmZ2bkP.js';
3
- import { V as VerificationReport, R as Researcher, F as FailureMode, S as SteeringChange, E as ExperimentPlan, a as ExperimentResult, b as EvalCampaignResult, c as EvalCampaignOptions } from './researcher-WJvIpX3L.js';
4
- export { r as runEvalCampaign } from './researcher-WJvIpX3L.js';
2
+ import { f as CampaignResult } from './types-Bba0vl1V.js';
3
+ import { a as VerificationReport } from './multi-layer-verifier-DlWCXuxL.js';
5
4
  import { S as Span } from './schema-m0gsnbt3.js';
6
5
  import { T as TraceStore } from './store-CKUAgsJz.js';
7
6
  import { O as OutcomeStore } from './outcome-store-D6KWmYvj.js';
8
7
  export { D as DeploymentOutcome, F as FileSystemOutcomeStore, b as FileSystemOutcomeStoreOptions, I as InMemoryOutcomeStore } from './outcome-store-D6KWmYvj.js';
9
8
  import { b as RubricPredictiveValidityReport } from './rubric-predictive-validity-D_4BSXGV.js';
9
+ import { R as Researcher, F as FailureMode, S as SteeringChange, E as ExperimentPlan, a as ExperimentResult, b as EvalCampaignResult, c as EvalCampaignOptions } from './researcher-C_KJyIGg.js';
10
+ export { r as runEvalCampaign } from './researcher-C_KJyIGg.js';
10
11
  import { I as InterimReleaseConfidence } from './sequential-5iSVfzl2.js';
11
12
  import './errors-Dwqw-T_m.js';
12
13
  import './llm-client-DbjLfz-K.js';
package/dist/rl.js CHANGED
@@ -10,8 +10,8 @@ import {
10
10
  } from "./chunk-3RF76KTD.js";
11
11
  import {
12
12
  runEvalCampaign
13
- } from "./chunk-AIWHLG7J.js";
14
- import "./chunk-F3SRAAZO.js";
13
+ } from "./chunk-GJJNJVIR.js";
14
+ import "./chunk-IHDHUN2X.js";
15
15
  import {
16
16
  rubricPredictiveValidity
17
17
  } from "./chunk-YRZ4M5GS.js";
@@ -24,10 +24,10 @@ import {
24
24
  wilcoxonSignedRank
25
25
  } from "./chunk-ITBRCT73.js";
26
26
  import "./chunk-SBCB6VZY.js";
27
+ import "./chunk-PC4UYEBM.js";
28
+ import "./chunk-F3SRAAZO.js";
27
29
  import "./chunk-TVVP3ZZQ.js";
28
30
  import "./chunk-VSMTAMNK.js";
29
- import "./chunk-IHDHUN2X.js";
30
- import "./chunk-PC4UYEBM.js";
31
31
  import {
32
32
  ValidationError
33
33
  } from "./chunk-3BFEG2F6.js";
@@ -0,0 +1,56 @@
1
+ import { R as Run, S as Span, a as TraceEvent, A as Artifact, B as BudgetLedgerEntry } from './schema-m0gsnbt3.js';
2
+ import { T as TraceStore } from './store-CKUAgsJz.js';
3
+
4
+ interface RunScore {
5
+ success: number;
6
+ goalProgress: number;
7
+ repoGroundedness: number;
8
+ driftPenalty: number;
9
+ toolUseQuality: number;
10
+ patchQuality: number;
11
+ testReality: number;
12
+ finalGate: number;
13
+ reviewerBlockers: number;
14
+ costUsd: number;
15
+ wallSeconds: number;
16
+ notes?: string[];
17
+ }
18
+ interface RunScoreWeights {
19
+ success: number;
20
+ goalProgress: number;
21
+ repoGroundedness: number;
22
+ driftPenalty: number;
23
+ toolUseQuality: number;
24
+ patchQuality: number;
25
+ testReality: number;
26
+ finalGate: number;
27
+ reviewerBlockers: number;
28
+ costUsd: number;
29
+ wallSeconds: number;
30
+ }
31
+ declare const DEFAULT_RUN_SCORE_WEIGHTS: RunScoreWeights;
32
+ declare function aggregateRunScore(score: RunScore, weights?: Partial<RunScoreWeights>): number;
33
+ declare function clamp01(value: number): number;
34
+
35
+ interface RunTrace {
36
+ run: Run;
37
+ spans: Span[];
38
+ events: TraceEvent[];
39
+ artifacts: Artifact[];
40
+ budget: BudgetLedgerEntry[];
41
+ }
42
+ interface RunCriticOptions {
43
+ weights?: Partial<RunScoreWeights>;
44
+ driftPatterns?: RegExp[];
45
+ }
46
+ declare class RunCritic {
47
+ private readonly weights?;
48
+ private readonly driftPatterns;
49
+ constructor(options?: RunCriticOptions);
50
+ score(store: TraceStore, runId: string): Promise<RunScore>;
51
+ scoreTrace(trace: RunTrace): RunScore;
52
+ rank(score: RunScore): number;
53
+ private isDrift;
54
+ }
55
+
56
+ export { DEFAULT_RUN_SCORE_WEIGHTS as D, type RunScore as R, type RunTrace as a, type RunScoreWeights as b, RunCritic as c, type RunCriticOptions as d, aggregateRunScore as e, clamp01 as f };
@@ -1,5 +1,5 @@
1
1
  import { L as LlmClientOptions } from './llm-client-DbjLfz-K.js';
2
- import { I as ImprovementDriver, S as Scenario, f as CampaignResult, k as GateResult, D as DispatchFn, a as JudgeConfig, L as LabeledScenarioStore, g as CampaignTraceWriter, m as GenerationRecord, M as MutableSurface, P as ParetoParent, G as Gate } from './types-CnmZ2bkP.js';
2
+ import { I as ImprovementDriver, S as Scenario, f as CampaignResult, k as GateResult, D as DispatchFn, a as JudgeConfig, L as LabeledScenarioStore, g as CampaignTraceWriter, m as GenerationRecord, M as MutableSurface, P as ParetoParent, G as Gate } from './types-Bba0vl1V.js';
3
3
 
4
4
  /**
5
5
  * @experimental
@@ -308,6 +308,30 @@ interface RunOptimizationOptions<TScenario extends Scenario, TArtifact> extends
308
308
  /** Phase-2 research report forwarded to `propose()` (analyst findings +
309
309
  * diff). Opaque here; the driver types it. */
310
310
  report?: unknown;
311
+ /** Structured findings forwarded to `propose()` as `ctx.findings`. A
312
+ * findings producer (trace-analyst registry, HALO) emits these from the
313
+ * generation's traces; findings-grounded drivers (`improvementDriver`,
314
+ * `memoryCurationDriver`, `traceAnalystDriver`) consume them. Opaque here;
315
+ * the driver types its `TFindings`. Empty when no producer is wired. */
316
+ findings?: unknown[];
317
+ /** Per-generation findings producer — the EYES→HANDS loop closure. After each
318
+ * generation's candidates are scored, this is called with that generation's
319
+ * results; whatever it returns REPLACES `ctx.findings` for the NEXT
320
+ * generation's `propose()`, so the diagnosis is refreshed each round instead
321
+ * of being a static one-shot. Generic by design: the substrate does not
322
+ * import an analyst — the consumer plugs its trace-analyst registry / HALO
323
+ * here (reading the per-candidate `runDir` traces). When absent, findings
324
+ * stay the static `opts.findings`. */
325
+ analyzeGeneration?: (input: {
326
+ generation: number;
327
+ runDir: string;
328
+ candidates: Array<{
329
+ surfaceHash: string;
330
+ campaign: CampaignResult<TArtifact, TScenario>;
331
+ composite: number;
332
+ }>;
333
+ history: GenerationRecord[];
334
+ }) => Promise<unknown[]>;
311
335
  }
312
336
  interface RunOptimizationResult<TArtifact, TScenario extends Scenario> {
313
337
  generations: Array<{