@tangle-network/agent-eval 0.71.0 → 0.72.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +63 -0
- package/dist/adapters/http.d.ts +1 -1
- package/dist/adapters/langchain.d.ts +1 -1
- package/dist/adapters/otel.d.ts +3 -2
- package/dist/agent-profile-DYRboYWu.d.ts +364 -0
- package/dist/analyst/index.d.ts +221 -0
- package/dist/analyst/index.js +371 -0
- package/dist/analyst/index.js.map +1 -0
- package/dist/analyst-t7zZS3TV.d.ts +88 -0
- package/dist/campaign/index.d.ts +485 -9
- package/dist/campaign/index.js +618 -30
- package/dist/campaign/index.js.map +1 -1
- package/dist/chunk-7W4SM7FD.js +1075 -0
- package/dist/chunk-7W4SM7FD.js.map +1 -0
- package/dist/{chunk-AIWHLG7J.js → chunk-GJJNJVIR.js} +11 -11
- package/dist/chunk-JHA3ZGSO.js +1496 -0
- package/dist/chunk-JHA3ZGSO.js.map +1 -0
- package/dist/{chunk-VMAYE3LM.js → chunk-JYE3WOTE.js} +57 -9
- package/dist/{chunk-VMAYE3LM.js.map → chunk-JYE3WOTE.js.map} +1 -1
- package/dist/chunk-LB2UOI5F.js +412 -0
- package/dist/chunk-LB2UOI5F.js.map +1 -0
- package/dist/{chunk-ODGETRTM.js → chunk-VUINJM5M.js} +234 -1415
- package/dist/chunk-VUINJM5M.js.map +1 -0
- package/dist/chunk-WYIHD6EB.js +1044 -0
- package/dist/chunk-WYIHD6EB.js.map +1 -0
- package/dist/{chunk-6QZUCFKM.js → chunk-XPILG2CA.js} +120 -3
- package/dist/chunk-XPILG2CA.js.map +1 -0
- package/dist/{chunk-6XQIEUQ2.js → chunk-ZPSKPT3V.js} +5 -3
- package/dist/{chunk-6XQIEUQ2.js.map → chunk-ZPSKPT3V.js.map} +1 -1
- package/dist/contract/index.d.ts +17 -13
- package/dist/contract/index.js +14 -8
- package/dist/contract/index.js.map +1 -1
- package/dist/{control-DxvZeV5X.d.ts → control-BgA6BYTm.d.ts} +1 -1
- package/dist/control.d.ts +2 -2
- package/dist/{feedback-trajectory-8hKC5EOb.d.ts → feedback-trajectory-B3rErRsh.d.ts} +1 -1
- package/dist/harness-optimizer-EnEnQPsr.d.ts +106 -0
- package/dist/hosted/index.d.ts +223 -2
- package/dist/index.d.ts +49 -1323
- package/dist/index.js +339 -2627
- package/dist/index.js.map +1 -1
- package/dist/{index-BGBrVS24.d.ts → insight-report-Df3lxYXM.d.ts} +1 -221
- package/dist/kind-factory-DW9XWPvM.d.ts +172 -0
- package/dist/multi-layer-verifier-DlWCXuxL.d.ts +141 -0
- package/dist/openapi.json +1 -1
- package/dist/pareto-E-pembql.d.ts +81 -0
- package/dist/{provenance-C69gLUXH.d.ts → provenance-B-TFszPW.d.ts} +131 -4
- package/dist/redact-B40YG2M_.d.ts +45 -0
- package/dist/registry-DuVYiTvw.d.ts +128 -0
- package/dist/{researcher-WJvIpX3L.d.ts → researcher-C_KJyIGg.d.ts} +1 -141
- package/dist/rl.d.ts +4 -3
- package/dist/rl.js +4 -4
- package/dist/{run-campaign-BVY3RGAZ.js → run-campaign-OVEZF24D.js} +2 -2
- package/dist/run-critic-BAIjX99r.d.ts +56 -0
- package/dist/{run-improvement-loop-Bzamo6GB.d.ts → run-improvement-loop-BqYH2vCR.d.ts} +25 -1
- package/dist/semantic-concept-judge-CV9Wlx4t.d.ts +650 -0
- package/dist/{store-jzKpMl16.d.ts → store-GmBE2pZZ.d.ts} +1 -1
- package/dist/traces.d.ts +371 -308
- package/dist/traces.js +43 -18
- package/dist/{types-CnmZ2bkP.d.ts → types-Bba0vl1V.d.ts} +1 -1
- package/dist/{registry-BGKyX6bw.d.ts → types-CRD68aH7.d.ts} +3 -128
- package/dist/wire/index.d.ts +1 -1
- package/dist/workflow/index.d.ts +494 -0
- package/dist/workflow/index.js +2177 -0
- package/dist/workflow/index.js.map +1 -0
- package/docs/design/self-improvement-roadmap.md +106 -0
- package/package.json +36 -12
- package/dist/agent-profile-DzcPHR1Z.d.ts +0 -114
- package/dist/chunk-6QZUCFKM.js.map +0 -1
- package/dist/chunk-ODGETRTM.js.map +0 -1
- package/dist/chunk-PQV2TKC3.js +0 -27
- package/dist/chunk-PQV2TKC3.js.map +0 -1
- /package/dist/{chunk-AIWHLG7J.js.map → chunk-GJJNJVIR.js.map} +0 -0
- /package/dist/{run-campaign-BVY3RGAZ.js.map → run-campaign-OVEZF24D.js.map} +0 -0
|
@@ -1,8 +1,10 @@
|
|
|
1
|
-
import { o as Mutator, I as ImprovementDriver, S as Scenario, G as Gate,
|
|
1
|
+
import { o as Mutator, I as ImprovementDriver, S as Scenario, G as Gate, k as GateResult, i as GateContext, f as CampaignResult, M as MutableSurface, j as GateDecision } from './types-Bba0vl1V.js';
|
|
2
2
|
import { R as RedTeamCase } from './red-team-DW9Ca_tj.js';
|
|
3
3
|
import { R as RunRecord } from './run-record-BgTFzO2r.js';
|
|
4
|
-
import {
|
|
5
|
-
import {
|
|
4
|
+
import { D as Direction } from './pareto-E-pembql.js';
|
|
5
|
+
import { a as PairedBootstrapResult } from './statistics-B7yCbi9i.js';
|
|
6
|
+
import { a as RunCampaignOptions, C as CampaignStorage } from './run-improvement-loop-BqYH2vCR.js';
|
|
7
|
+
import { HostedClient, TraceSpanEvent } from './hosted/index.js';
|
|
6
8
|
|
|
7
9
|
/**
|
|
8
10
|
* @experimental
|
|
@@ -113,6 +115,131 @@ interface HeldOutGateOptions<TScenario extends Scenario = Scenario> {
|
|
|
113
115
|
}
|
|
114
116
|
declare function heldOutGate<TArtifact, TScenario extends Scenario>(options: HeldOutGateOptions<TScenario>): Gate<TArtifact, TScenario>;
|
|
115
117
|
|
|
118
|
+
/**
|
|
119
|
+
* @experimental
|
|
120
|
+
*
|
|
121
|
+
* Promotion policy over the evidence VECTOR — the substrate's answer to "never
|
|
122
|
+
* collapse the multi-objective promotion decision into one scalar." A
|
|
123
|
+
* `defaultProductionGate` is one opinionated composition; this module factors
|
|
124
|
+
* the decision into two reusable pieces so MANY policies can compete over the
|
|
125
|
+
* SAME evidence (the quant-desk pattern: one evidence bus, plural strategies):
|
|
126
|
+
*
|
|
127
|
+
* buildEvidenceVector(ctx, objectives, opts) -> EvidenceVector // the bus
|
|
128
|
+
* PromotionPolicy = (ev: EvidenceVector) => GateResult // a strategy
|
|
129
|
+
* paretoPolicy(ev) // the default strategy
|
|
130
|
+
* paretoSignificanceGate(options): Gate // bus + policy as a Gate
|
|
131
|
+
*
|
|
132
|
+
* The Pareto policy is SYMMETRIC multi-objective: every objective is BOTH a
|
|
133
|
+
* potential gain source AND a safety floor (unlike `defaultProductionGate`,
|
|
134
|
+
* where only `composite` can win and `criticalDimensions` are pure floors). A
|
|
135
|
+
* candidate ships iff it weakly DOMINATES the baseline at the confidence level —
|
|
136
|
+
* no objective credibly worse (CI floor breach) AND at least one objective
|
|
137
|
+
* credibly better (CI gain). Insufficient evidence on ANY axis -> need_more_work
|
|
138
|
+
* (NOT folded into hold: "gather more reps" and "reject" are different actions).
|
|
139
|
+
*
|
|
140
|
+
* Cost/latency are NOT CI axes here — `GateContext` carries only an aggregate
|
|
141
|
+
* per-side cost, no per-cell observation vector to bootstrap. Treat them as hard
|
|
142
|
+
* constraints (compose with a budget gate via `composeGate`), not faked CIs.
|
|
143
|
+
*/
|
|
144
|
+
|
|
145
|
+
/** Where an objective's per-cell scalar comes from. `composite` reads the
|
|
146
|
+
* judge's composite; `dimension` reads a named per-dimension score. */
|
|
147
|
+
type ObjectiveSource = {
|
|
148
|
+
kind: 'composite';
|
|
149
|
+
} | {
|
|
150
|
+
kind: 'dimension';
|
|
151
|
+
dimension: string;
|
|
152
|
+
};
|
|
153
|
+
interface PromotionObjective {
|
|
154
|
+
/** Stable label used in reports + `contributingGates`. */
|
|
155
|
+
name: string;
|
|
156
|
+
source: ObjectiveSource;
|
|
157
|
+
/** 'maximize' (quality dims) or 'minimize' (error/risk/length dims). Orients
|
|
158
|
+
* the paired delta so a positive bootstrap always means "candidate better". */
|
|
159
|
+
direction: Direction;
|
|
160
|
+
/** The good-direction paired-delta CI lower bound must EXCEED this to count
|
|
161
|
+
* as a significant gain on this axis. Interpreted in the judge's native
|
|
162
|
+
* scale. Default 0 (⇒ "confidently better"). */
|
|
163
|
+
gainThreshold?: number;
|
|
164
|
+
/** A floor breach (regression) is declared when the good-direction CI lower
|
|
165
|
+
* bound is below −floorTolerance. When omitted it auto-scales off observed
|
|
166
|
+
* magnitudes (0.05 on [0,1], 5 on 0-100), matching `dimensionRegressions`. */
|
|
167
|
+
floorTolerance?: number;
|
|
168
|
+
}
|
|
169
|
+
/** Per-axis verdict from the good-direction paired bootstrap. */
|
|
170
|
+
type AxisVerdict = 'improved' | 'regressed' | 'flat' | 'few_runs';
|
|
171
|
+
interface AxisEvidence {
|
|
172
|
+
name: string;
|
|
173
|
+
source: ObjectiveSource;
|
|
174
|
+
direction: Direction;
|
|
175
|
+
/** Paired bootstrap on the GOOD-DIRECTION delta (oriented by `direction`):
|
|
176
|
+
* a positive value means the candidate is better on this axis. */
|
|
177
|
+
bootstrap: PairedBootstrapResult;
|
|
178
|
+
/** Paired observations contributing to this axis. */
|
|
179
|
+
n: number;
|
|
180
|
+
gainThreshold: number;
|
|
181
|
+
floorTolerance: number;
|
|
182
|
+
verdict: AxisVerdict;
|
|
183
|
+
}
|
|
184
|
+
interface EvidenceVector {
|
|
185
|
+
/** One entry per objective — NOTHING averaged across axes. */
|
|
186
|
+
axes: AxisEvidence[];
|
|
187
|
+
/** Smallest paired n across axes that produced observations — the binding
|
|
188
|
+
* evidence-sufficiency constraint. 0 when no axis produced observations. */
|
|
189
|
+
minN: number;
|
|
190
|
+
/** Aggregate per-side cost from the gate context (a constraint input, not a
|
|
191
|
+
* CI axis — see the module header). */
|
|
192
|
+
cost: {
|
|
193
|
+
candidate: number;
|
|
194
|
+
baseline: number;
|
|
195
|
+
};
|
|
196
|
+
}
|
|
197
|
+
/** A promotion strategy: a pure function from the evidence vector to a verdict.
|
|
198
|
+
* Many policies can run over the same `EvidenceVector` and disagree — that's
|
|
199
|
+
* the point (competing strategies, shared evidence). */
|
|
200
|
+
type PromotionPolicy = (ev: EvidenceVector) => GateResult;
|
|
201
|
+
interface BuildEvidenceVectorOptions {
|
|
202
|
+
/** Minimum paired observations before an axis can claim significance; below
|
|
203
|
+
* it the axis is `few_runs`. Default 3. */
|
|
204
|
+
minProductiveRuns?: number;
|
|
205
|
+
/** Confidence level for every axis bootstrap. Default 0.95. */
|
|
206
|
+
confidence?: number;
|
|
207
|
+
/** Bootstrap resamples. Default 2000. */
|
|
208
|
+
resamples?: number;
|
|
209
|
+
/** Fixed bootstrap seed for a deterministic, reproducible verdict. Default 1337. */
|
|
210
|
+
seed?: number;
|
|
211
|
+
}
|
|
212
|
+
/**
|
|
213
|
+
* The Evidence Bus. For each objective, pair candidate vs baseline by full
|
|
214
|
+
* cellId and bootstrap a CI on the good-direction paired delta. Reuses the
|
|
215
|
+
* exact `pairHoldout` + `pairedBootstrap` machinery the held-out gate uses, so
|
|
216
|
+
* a single source of truth governs pairing granularity + scale handling.
|
|
217
|
+
*/
|
|
218
|
+
declare function buildEvidenceVector<TArtifact, TScenario extends Scenario>(ctx: GateContext<TArtifact, TScenario>, objectives: PromotionObjective[], opts?: BuildEvidenceVectorOptions): EvidenceVector;
|
|
219
|
+
/**
|
|
220
|
+
* The default strategy: symmetric multi-objective Pareto significance. Ship iff
|
|
221
|
+
* the candidate weakly dominates the baseline at the confidence level — no axis
|
|
222
|
+
* credibly worse AND ≥1 axis credibly better. Floor breach on any axis → hold
|
|
223
|
+
* (anti-Goodhart, dominates everything). Insufficient evidence on any axis →
|
|
224
|
+
* need_more_work. Statistically equivalent → hold (never ship noise).
|
|
225
|
+
*/
|
|
226
|
+
declare const paretoPolicy: PromotionPolicy;
|
|
227
|
+
interface ParetoSignificanceGateOptions extends BuildEvidenceVectorOptions {
|
|
228
|
+
/** The objective vector. Every axis is both a gain source and a safety floor. */
|
|
229
|
+
objectives: PromotionObjective[];
|
|
230
|
+
/** Strategy applied to the evidence vector. Default `paretoPolicy`. Override
|
|
231
|
+
* to run a stricter/looser strategy over the SAME bus (competing policies). */
|
|
232
|
+
policy?: PromotionPolicy;
|
|
233
|
+
/** Override the gate name in reports. */
|
|
234
|
+
name?: string;
|
|
235
|
+
}
|
|
236
|
+
/**
|
|
237
|
+
* Wrap the bus + a policy as a `Gate`. Plugs into the existing
|
|
238
|
+
* `runImprovementLoop({ gate })` slot and composes via `composeGate`; default
|
|
239
|
+
* loop behavior is unchanged because consumers opt in by passing this gate.
|
|
240
|
+
*/
|
|
241
|
+
declare function paretoSignificanceGate<TArtifact = unknown, TScenario extends Scenario = Scenario>(options: ParetoSignificanceGateOptions): Gate<TArtifact, TScenario>;
|
|
242
|
+
|
|
116
243
|
/**
|
|
117
244
|
* @experimental
|
|
118
245
|
*
|
|
@@ -311,4 +438,4 @@ interface EmitLoopProvenanceArgs<TArtifact, TScenario extends Scenario> extends
|
|
|
311
438
|
*/
|
|
312
439
|
declare function emitLoopProvenance<TArtifact, TScenario extends Scenario>(args: EmitLoopProvenanceArgs<TArtifact, TScenario>): Promise<EmitLoopProvenanceResult>;
|
|
313
440
|
|
|
314
|
-
export { type
|
|
441
|
+
export { type AxisEvidence as A, type BuildEvidenceVectorOptions as B, type DefaultProductionGateOptions as D, type EvidenceVector as E, type HeldOutGateOptions as H, type LoopProvenanceRecord as L, type ObjectiveSource as O, type ParetoSignificanceGateOptions as P, type RunEvalOptions as R, type AxisVerdict as a, type EvolutionaryDriverOptions as b, type PromotionObjective as c, type PromotionPolicy as d, buildEvidenceVector as e, composeGate as f, defaultProductionGate as g, evolutionaryDriver as h, heldOutGate as i, paretoSignificanceGate as j, type BuildLoopProvenanceArgs as k, type EmitLoopProvenanceArgs as l, type EmitLoopProvenanceResult as m, type LoopProvenanceBackend as n, type LoopProvenanceCandidate as o, paretoPolicy as p, buildLoopProvenanceRecord as q, runEval as r, emitLoopProvenance as s, loopProvenanceSpans as t, provenanceRecordPath as u, provenanceSpansPath as v, surfaceContentHash as w };
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Redaction — remove PII / secrets from trace payloads before persist.
|
|
3
|
+
*
|
|
4
|
+
* Pre-persistence rules mean raw traces in storage are already scrubbed.
|
|
5
|
+
* Unredacted variants (for debugging / post-mortems) live in a separate
|
|
6
|
+
* storage layer with stricter access controls; this module only covers
|
|
7
|
+
* the default scrub-then-persist path.
|
|
8
|
+
*
|
|
9
|
+
* Rules compose: pass an array of `RedactionRule`, each is applied in
|
|
10
|
+
* order. Strings that match get replaced with a tagged sentinel so the
|
|
11
|
+
* eval framework can count how many redactions happened per run
|
|
12
|
+
* (surfaced via `redaction_applied` events).
|
|
13
|
+
*/
|
|
14
|
+
interface RedactionRule {
|
|
15
|
+
id: string;
|
|
16
|
+
pattern: RegExp;
|
|
17
|
+
/** Replacement — e.g. '[PII:email]'. Defaults to `[redacted:{id}]`. */
|
|
18
|
+
replacement?: string;
|
|
19
|
+
}
|
|
20
|
+
interface RedactionReport {
|
|
21
|
+
redactionCount: number;
|
|
22
|
+
byRule: Record<string, number>;
|
|
23
|
+
}
|
|
24
|
+
/** OWASP / common-sense defaults — extend per-domain. */
|
|
25
|
+
declare const DEFAULT_REDACTION_RULES: RedactionRule[];
|
|
26
|
+
declare const REDACTION_VERSION = "1.0.0";
|
|
27
|
+
/**
|
|
28
|
+
* Redact a single string. Returns the new string and a per-rule count of
|
|
29
|
+
* how many substitutions fired.
|
|
30
|
+
*/
|
|
31
|
+
declare function redactString(input: string, rules?: RedactionRule[]): {
|
|
32
|
+
output: string;
|
|
33
|
+
report: RedactionReport;
|
|
34
|
+
};
|
|
35
|
+
/**
|
|
36
|
+
* Walk a JSON-ish value applying `redactString` to every string leaf.
|
|
37
|
+
* Arrays and plain objects are recursed; other types pass through
|
|
38
|
+
* untouched. Circular references throw — traces should be tree-shaped.
|
|
39
|
+
*/
|
|
40
|
+
declare function redactValue(value: unknown, rules?: RedactionRule[], report?: RedactionReport): {
|
|
41
|
+
value: unknown;
|
|
42
|
+
report: RedactionReport;
|
|
43
|
+
};
|
|
44
|
+
|
|
45
|
+
export { DEFAULT_REDACTION_RULES as D, type RedactionRule as R, type RedactionReport as a, REDACTION_VERSION as b, redactValue as c, redactString as r };
|
|
@@ -0,0 +1,128 @@
|
|
|
1
|
+
import { A as Analyst, a as AnalystContext, b as AnalystRunSummary, c as AnalystFinding, d as AnalystRunResult, C as ChatClient, e as AnalystRunInputs, f as AnalystRunEvent } from './types-CRD68aH7.js';
|
|
2
|
+
|
|
3
|
+
/**
|
|
4
|
+
* AnalystRegistry — orchestrate N analysts against one run.
|
|
5
|
+
*
|
|
6
|
+
* Owns three responsibilities and only three:
|
|
7
|
+
* 1. Registration — ids must be unique; bad registrations fail loudly
|
|
8
|
+
* at register-time, not run-time.
|
|
9
|
+
* 2. Routing — each analyst declares its `inputKind`; the registry
|
|
10
|
+
* picks the matching field from AnalystRunInputs and skips the
|
|
11
|
+
* analyst with a logged reason if it's missing.
|
|
12
|
+
* 3. Isolation — one analyst's exception MUST NOT stop other analysts.
|
|
13
|
+
* Failed analysts produce zero findings + a 'failed' summary row.
|
|
14
|
+
*
|
|
15
|
+
* Cross-cutting concerns (telemetry, error → finding conversion, cost
|
|
16
|
+
* ingestion, storage rotation) live in `AnalystHooks`. Budget shaping
|
|
17
|
+
* (equal split vs weighted vs custom) lives in `BudgetPolicy`. Both
|
|
18
|
+
* have sensible defaults; consumers override only what they need.
|
|
19
|
+
*/
|
|
20
|
+
|
|
21
|
+
interface AnalystHooks {
|
|
22
|
+
/** Before analyze() — last chance to mutate ctx (e.g. inject tags, override budget). */
|
|
23
|
+
onBeforeAnalyze?(args: {
|
|
24
|
+
analyst: Analyst;
|
|
25
|
+
ctx: AnalystContext;
|
|
26
|
+
runId: string;
|
|
27
|
+
}): void | Promise<void>;
|
|
28
|
+
/** After every analyst (ok | failed | skipped). Use for telemetry, ingestion, rotation. */
|
|
29
|
+
onAfterAnalyze?(args: {
|
|
30
|
+
analyst: Analyst;
|
|
31
|
+
summary: AnalystRunSummary;
|
|
32
|
+
findings: AnalystFinding[];
|
|
33
|
+
runId: string;
|
|
34
|
+
}): void | Promise<void>;
|
|
35
|
+
/**
|
|
36
|
+
* On analyst exception. Hook MAY return findings to convert the
|
|
37
|
+
* error into structured findings; the summary still reports 'failed'.
|
|
38
|
+
* Return void to keep the default empty-findings behavior.
|
|
39
|
+
*/
|
|
40
|
+
onError?(args: {
|
|
41
|
+
analyst: Analyst;
|
|
42
|
+
error: Error;
|
|
43
|
+
runId: string;
|
|
44
|
+
}): AnalystFinding[] | undefined | Promise<AnalystFinding[] | undefined>;
|
|
45
|
+
/** Once after registry.run() completes. Use for final aggregation, persistence. */
|
|
46
|
+
onComplete?(args: {
|
|
47
|
+
result: AnalystRunResult;
|
|
48
|
+
}): void | Promise<void>;
|
|
49
|
+
}
|
|
50
|
+
interface BudgetPolicy {
|
|
51
|
+
/** Overall USD cap across the registry.run(). */
|
|
52
|
+
totalUsd?: number;
|
|
53
|
+
/** Per-analyst weight for the default allocator. Missing ids get weight 1. */
|
|
54
|
+
weights?: Record<string, number>;
|
|
55
|
+
/**
|
|
56
|
+
* Custom allocator — receives the analyst, remaining/total budget, and
|
|
57
|
+
* the count of analysts that will run. Returns the per-analyst budget
|
|
58
|
+
* (or undefined to leave it uncapped). Overrides weights when set.
|
|
59
|
+
*/
|
|
60
|
+
allocate?: (args: {
|
|
61
|
+
analyst: Analyst;
|
|
62
|
+
totalUsd: number | undefined;
|
|
63
|
+
remainingUsd: number | undefined;
|
|
64
|
+
runningCount: number;
|
|
65
|
+
}) => number | undefined;
|
|
66
|
+
}
|
|
67
|
+
interface AnalystRegistryOptions {
|
|
68
|
+
/** Shared chat client passed to every LLM analyst via AnalystContext. */
|
|
69
|
+
chat?: ChatClient;
|
|
70
|
+
/** Logger callback. Defaults to a no-op. */
|
|
71
|
+
log?: (msg: string, fields?: Record<string, unknown>) => void;
|
|
72
|
+
/** Hooks invoked around analyze() — observability + customization seam. */
|
|
73
|
+
hooks?: AnalystHooks;
|
|
74
|
+
/** Default budget when run() doesn't override. */
|
|
75
|
+
defaultBudget?: BudgetPolicy;
|
|
76
|
+
}
|
|
77
|
+
interface RegistryRunOpts {
|
|
78
|
+
/** Restrict to a subset of registered analysts by id. */
|
|
79
|
+
only?: string[];
|
|
80
|
+
/** Skip these analysts even if registered. Useful for cheap iteration. */
|
|
81
|
+
skip?: string[];
|
|
82
|
+
/** Budget policy — totalUsd + optional weights/allocator. Falls back to options.defaultBudget. */
|
|
83
|
+
budget?: BudgetPolicy;
|
|
84
|
+
/** Wall-clock cap. Analysts SHOULD honor `ctx.deadlineMs`. */
|
|
85
|
+
timeoutMs?: number;
|
|
86
|
+
/** Abort signal — forwarded into every analyst's context. */
|
|
87
|
+
signal?: AbortSignal;
|
|
88
|
+
/** Tags echoed into AnalystContext.tags — useful for tracking environment/version in findings. */
|
|
89
|
+
tags?: Record<string, string>;
|
|
90
|
+
/**
|
|
91
|
+
* Prior-run findings made available as retrieval context to every
|
|
92
|
+
* analyst via `ctx.priorFindings`. The registry forwards the slice
|
|
93
|
+
* whose `analyst_id` matches each registered analyst so a kind sees
|
|
94
|
+
* only its own history. Pass `{ '*': findings }` to broadcast to
|
|
95
|
+
* every analyst (useful for cross-kind chaining where the improvement
|
|
96
|
+
* analyst consumes upstream failure findings).
|
|
97
|
+
*/
|
|
98
|
+
priorFindings?: ReadonlyArray<AnalystFinding> | Record<string, ReadonlyArray<AnalystFinding>>;
|
|
99
|
+
}
|
|
100
|
+
declare class AnalystRegistry {
|
|
101
|
+
private readonly analysts;
|
|
102
|
+
private readonly options;
|
|
103
|
+
constructor(options?: AnalystRegistryOptions);
|
|
104
|
+
register(analyst: Analyst): void;
|
|
105
|
+
list(): ReadonlyArray<{
|
|
106
|
+
id: string;
|
|
107
|
+
description: string;
|
|
108
|
+
version: string;
|
|
109
|
+
cost: Analyst['cost'];
|
|
110
|
+
}>;
|
|
111
|
+
run(runId: string, inputs: AnalystRunInputs, runOpts?: RegistryRunOpts): Promise<AnalystRunResult>;
|
|
112
|
+
/**
|
|
113
|
+
* Streaming counterpart to `run()`. Emits `AnalystRunEvent` values
|
|
114
|
+
* in real time — `run-started`, then per-analyst `skipped` /
|
|
115
|
+
* `started` / `completed`, then a terminal `run-completed` whose
|
|
116
|
+
* payload is the full `AnalystRunResult`. UIs use this to render
|
|
117
|
+
* progress; persistence consumers use `run()` and read the result.
|
|
118
|
+
*
|
|
119
|
+
* Hooks (`onBeforeAnalyze` / `onAfterAnalyze` / `onError` /
|
|
120
|
+
* `onComplete`) fire as before — streaming is additive, not a hook
|
|
121
|
+
* replacement.
|
|
122
|
+
*/
|
|
123
|
+
runStream(runId: string, inputs: AnalystRunInputs, runOpts?: RegistryRunOpts): AsyncGenerator<AnalystRunEvent, void, void>;
|
|
124
|
+
private selectAnalysts;
|
|
125
|
+
private routeInput;
|
|
126
|
+
}
|
|
127
|
+
|
|
128
|
+
export { type AnalystHooks as A, type BudgetPolicy as B, type RegistryRunOpts as R, AnalystRegistry as a, type AnalystRegistryOptions as b };
|
|
@@ -7,146 +7,6 @@ import { R as RawProviderSink } from './raw-provider-sink-C46HDghv.js';
|
|
|
7
7
|
import { F as FailureClass } from './schema-m0gsnbt3.js';
|
|
8
8
|
import { T as TraceStore } from './store-CKUAgsJz.js';
|
|
9
9
|
|
|
10
|
-
/**
|
|
11
|
-
* Multi-layer verifier — ordered pipeline of verification layers.
|
|
12
|
-
*
|
|
13
|
-
* Different contract from {@link JudgeRunner} (which runs parallel
|
|
14
|
-
* specs against a sandbox). MultiLayerVerifier is a DAG of layers
|
|
15
|
-
* (install → typecheck → build → lint → serve → semantic → …) with
|
|
16
|
-
* dependency-based skip, per-layer findings, soft-fail semantics, and
|
|
17
|
-
* an aggregated `blendedScore` across all passed layers.
|
|
18
|
-
*
|
|
19
|
-
* Use when you want:
|
|
20
|
-
* - ordered stages where a failing upstream stage skips downstream ones
|
|
21
|
-
* - each stage produces rich `findings` (severity + message + evidence)
|
|
22
|
-
* - a single composite score across stages with per-stage weights
|
|
23
|
-
* - soft-fail stages whose failure doesn't abort the pipeline
|
|
24
|
-
*
|
|
25
|
-
* Use {@link JudgeRunner} when you want:
|
|
26
|
-
* - N independent judges running in parallel against the same artifact
|
|
27
|
-
* - no inter-judge dependencies
|
|
28
|
-
* - boolean `passed` per judge + overall
|
|
29
|
-
*
|
|
30
|
-
* Both primitives compose — JudgeRunner can be invoked as a single
|
|
31
|
-
* layer inside a MultiLayerVerifier if that suits the caller.
|
|
32
|
-
*/
|
|
33
|
-
type LayerStatus = 'pass' | 'fail' | 'skipped' | 'error' | 'timeout';
|
|
34
|
-
type Severity = 'critical' | 'major' | 'minor' | 'info';
|
|
35
|
-
interface Finding {
|
|
36
|
-
severity: Severity;
|
|
37
|
-
message: string;
|
|
38
|
-
evidence?: string;
|
|
39
|
-
/** Optional layer name the finding belongs to (set by the verifier if omitted). */
|
|
40
|
-
layer?: string;
|
|
41
|
-
/**
|
|
42
|
-
* Free-form structured payload — used by `multiToolchainLayer` to attach
|
|
43
|
-
* `{ adapter: 'pnpm' }`, by judges to attach evidence pointers, etc.
|
|
44
|
-
* Renderers MAY interrogate; agent-eval primitives never assume shape.
|
|
45
|
-
*/
|
|
46
|
-
detail?: Record<string, unknown>;
|
|
47
|
-
}
|
|
48
|
-
interface LayerResult {
|
|
49
|
-
layer: string;
|
|
50
|
-
status: LayerStatus;
|
|
51
|
-
/** 0..1 score, optional — layers that don't produce a numeric score omit. */
|
|
52
|
-
score?: number;
|
|
53
|
-
durationMs: number;
|
|
54
|
-
findings: Finding[];
|
|
55
|
-
/** Short human-readable summary (one line). */
|
|
56
|
-
reason?: string;
|
|
57
|
-
/**
|
|
58
|
-
* Numeric layer-level diagnostics: error counts, warning counts,
|
|
59
|
-
* cyclomatic complexity, total adapter wall-time, etc. Keyed by
|
|
60
|
-
* diagnostic name; null = "diagnostic not applicable / not measured."
|
|
61
|
-
* Renderers that know the keys can display them; ones that don't,
|
|
62
|
-
* ignore. Free-form on purpose — consumers type the value shape in
|
|
63
|
-
* their own namespace.
|
|
64
|
-
*/
|
|
65
|
-
diagnostics?: Record<string, number | null>;
|
|
66
|
-
/** Any rich per-layer detail — rendered as-is by consumers that know the layer. */
|
|
67
|
-
detail?: Record<string, unknown>;
|
|
68
|
-
}
|
|
69
|
-
interface VerifyContext<Env = unknown> {
|
|
70
|
-
/** Per-run opaque context the caller provides. Layers destructure what they need. */
|
|
71
|
-
env: Env;
|
|
72
|
-
/** Previously-computed results from layers that already ran. */
|
|
73
|
-
prior: Record<string, LayerResult>;
|
|
74
|
-
/** Signal — if aborted, layers MUST bail within reasonable wall. */
|
|
75
|
-
signal: AbortSignal;
|
|
76
|
-
}
|
|
77
|
-
interface Layer<Env = unknown> {
|
|
78
|
-
name: string;
|
|
79
|
-
/** Stages that must have `status: 'pass'` before this layer runs. */
|
|
80
|
-
dependsOn?: string[];
|
|
81
|
-
/**
|
|
82
|
-
* Weight in the composite `blendedScore`. Default 1.0. Layers with weight 0
|
|
83
|
-
* contribute findings but not score.
|
|
84
|
-
*/
|
|
85
|
-
weight?: number;
|
|
86
|
-
/**
|
|
87
|
-
* If true, a `fail` status contributes to `blendedScore` (as 0) instead of
|
|
88
|
-
* being dropped — use for layers whose failure is a real signal. Default:
|
|
89
|
-
* fail drops from numerator + denominator, matching VB's existing semantics.
|
|
90
|
-
*/
|
|
91
|
-
failContributesToScore?: boolean;
|
|
92
|
-
/** Optional per-layer wall-cap in ms. Honored by the verifier (AbortSignal). */
|
|
93
|
-
capMs?: number;
|
|
94
|
-
run: (ctx: VerifyContext<Env>) => Promise<LayerResult> | LayerResult;
|
|
95
|
-
}
|
|
96
|
-
interface VerifyOptions<Env = unknown> {
|
|
97
|
-
env: Env;
|
|
98
|
-
/**
|
|
99
|
-
* Overall wall cap. Default: sum of layer capMs, or Infinity if any layer
|
|
100
|
-
* omits a cap. The verifier short-circuits remaining layers on overall cap.
|
|
101
|
-
*/
|
|
102
|
-
overallCapMs?: number;
|
|
103
|
-
/** Called with each layer result as it completes. */
|
|
104
|
-
onLayer?: (result: LayerResult) => void;
|
|
105
|
-
}
|
|
106
|
-
interface VerificationReport {
|
|
107
|
-
layers: LayerResult[];
|
|
108
|
-
passCount: number;
|
|
109
|
-
failCount: number;
|
|
110
|
-
skippedCount: number;
|
|
111
|
-
errorCount: number;
|
|
112
|
-
/** True iff at least one scored layer ran AND every scored layer passed. */
|
|
113
|
-
allPass: boolean;
|
|
114
|
-
/**
|
|
115
|
-
* Weighted mean of `score` across contributing layers. 0 when no layers
|
|
116
|
-
* contributed. See {@link Layer.failContributesToScore} for fail semantics.
|
|
117
|
-
*/
|
|
118
|
-
blendedScore: number;
|
|
119
|
-
durationMs: number;
|
|
120
|
-
startedAt: string;
|
|
121
|
-
finishedAt: string;
|
|
122
|
-
}
|
|
123
|
-
/**
|
|
124
|
-
* Grade a semantic-concept-style judge result into a single layer status.
|
|
125
|
-
*
|
|
126
|
-
* Pass when overall score >= threshold AND no critical-severity concept gap.
|
|
127
|
-
* Fail otherwise. Use inside a `Layer.run` when wrapping a concept judge.
|
|
128
|
-
*
|
|
129
|
-
* Generalized from VerticalBench H3 fix: `failingConcepts.length === 0` was
|
|
130
|
-
* too strict — a single concept at 6/10 failed the entire layer despite
|
|
131
|
-
* overall score being >= 0.7. Now we trust the judge's own `severity` field:
|
|
132
|
-
* `critical` findings veto; `major`/`minor` reduce the score but don't veto.
|
|
133
|
-
*/
|
|
134
|
-
declare function gradeSemanticStatus(input: {
|
|
135
|
-
score: number;
|
|
136
|
-
findings: Array<{
|
|
137
|
-
severity: Severity;
|
|
138
|
-
present?: boolean;
|
|
139
|
-
score?: number;
|
|
140
|
-
}>;
|
|
141
|
-
available: boolean;
|
|
142
|
-
threshold?: number;
|
|
143
|
-
}): LayerStatus;
|
|
144
|
-
declare class MultiLayerVerifier<Env = unknown> {
|
|
145
|
-
private readonly layers;
|
|
146
|
-
constructor(layers: Layer<Env>[]);
|
|
147
|
-
run(opts: VerifyOptions<Env>): Promise<VerificationReport>;
|
|
148
|
-
}
|
|
149
|
-
|
|
150
10
|
/**
|
|
151
11
|
* EvalCampaign — opinionated matrix runner that wires the four
|
|
152
12
|
* capture-integrity directives by construction.
|
|
@@ -524,4 +384,4 @@ declare class NoopResearcher implements Researcher {
|
|
|
524
384
|
evaluateChange(_plan: ExperimentPlan): Promise<ExperimentResult>;
|
|
525
385
|
}
|
|
526
386
|
|
|
527
|
-
export { CallbackResearcher as C, type ExperimentPlan as E, type FailureMode as F,
|
|
387
|
+
export { CallbackResearcher as C, type ExperimentPlan as E, type FailureMode as F, NoopResearcher as N, type Researcher as R, type SteeringChange as S, type ExperimentResult as a, type EvalCampaignResult as b, type EvalCampaignOptions as c, type CallbackResearcherOptions as d, type CampaignFactoryParams as e, type CampaignIntegrityPolicy as f, type CampaignRunContext as g, type CampaignRunOutcome as h, type CampaignRunner as i, type CampaignScenario as j, type CampaignVariant as k, type FailedRun as l, runEvalCampaign as r };
|
package/dist/rl.d.ts
CHANGED
|
@@ -1,12 +1,13 @@
|
|
|
1
1
|
import { R as RunRecord, b as RunSplitTag } from './run-record-BgTFzO2r.js';
|
|
2
|
-
import { f as CampaignResult } from './types-
|
|
3
|
-
import {
|
|
4
|
-
export { r as runEvalCampaign } from './researcher-WJvIpX3L.js';
|
|
2
|
+
import { f as CampaignResult } from './types-Bba0vl1V.js';
|
|
3
|
+
import { a as VerificationReport } from './multi-layer-verifier-DlWCXuxL.js';
|
|
5
4
|
import { S as Span } from './schema-m0gsnbt3.js';
|
|
6
5
|
import { T as TraceStore } from './store-CKUAgsJz.js';
|
|
7
6
|
import { O as OutcomeStore } from './outcome-store-D6KWmYvj.js';
|
|
8
7
|
export { D as DeploymentOutcome, F as FileSystemOutcomeStore, b as FileSystemOutcomeStoreOptions, I as InMemoryOutcomeStore } from './outcome-store-D6KWmYvj.js';
|
|
9
8
|
import { b as RubricPredictiveValidityReport } from './rubric-predictive-validity-D_4BSXGV.js';
|
|
9
|
+
import { R as Researcher, F as FailureMode, S as SteeringChange, E as ExperimentPlan, a as ExperimentResult, b as EvalCampaignResult, c as EvalCampaignOptions } from './researcher-C_KJyIGg.js';
|
|
10
|
+
export { r as runEvalCampaign } from './researcher-C_KJyIGg.js';
|
|
10
11
|
import { I as InterimReleaseConfidence } from './sequential-5iSVfzl2.js';
|
|
11
12
|
import './errors-Dwqw-T_m.js';
|
|
12
13
|
import './llm-client-DbjLfz-K.js';
|
package/dist/rl.js
CHANGED
|
@@ -10,8 +10,8 @@ import {
|
|
|
10
10
|
} from "./chunk-3RF76KTD.js";
|
|
11
11
|
import {
|
|
12
12
|
runEvalCampaign
|
|
13
|
-
} from "./chunk-
|
|
14
|
-
import "./chunk-
|
|
13
|
+
} from "./chunk-GJJNJVIR.js";
|
|
14
|
+
import "./chunk-IHDHUN2X.js";
|
|
15
15
|
import {
|
|
16
16
|
rubricPredictiveValidity
|
|
17
17
|
} from "./chunk-YRZ4M5GS.js";
|
|
@@ -24,10 +24,10 @@ import {
|
|
|
24
24
|
wilcoxonSignedRank
|
|
25
25
|
} from "./chunk-ITBRCT73.js";
|
|
26
26
|
import "./chunk-SBCB6VZY.js";
|
|
27
|
+
import "./chunk-PC4UYEBM.js";
|
|
28
|
+
import "./chunk-F3SRAAZO.js";
|
|
27
29
|
import "./chunk-TVVP3ZZQ.js";
|
|
28
30
|
import "./chunk-VSMTAMNK.js";
|
|
29
|
-
import "./chunk-IHDHUN2X.js";
|
|
30
|
-
import "./chunk-PC4UYEBM.js";
|
|
31
31
|
import {
|
|
32
32
|
ValidationError
|
|
33
33
|
} from "./chunk-3BFEG2F6.js";
|
|
@@ -1,10 +1,10 @@
|
|
|
1
1
|
import {
|
|
2
2
|
runCampaign
|
|
3
|
-
} from "./chunk-
|
|
3
|
+
} from "./chunk-ZPSKPT3V.js";
|
|
4
4
|
import "./chunk-ITBRCT73.js";
|
|
5
5
|
import "./chunk-3BFEG2F6.js";
|
|
6
6
|
import "./chunk-PZ5AY32C.js";
|
|
7
7
|
export {
|
|
8
8
|
runCampaign
|
|
9
9
|
};
|
|
10
|
-
//# sourceMappingURL=run-campaign-
|
|
10
|
+
//# sourceMappingURL=run-campaign-OVEZF24D.js.map
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
import { R as Run, S as Span, a as TraceEvent, A as Artifact, B as BudgetLedgerEntry } from './schema-m0gsnbt3.js';
|
|
2
|
+
import { T as TraceStore } from './store-CKUAgsJz.js';
|
|
3
|
+
|
|
4
|
+
interface RunScore {
|
|
5
|
+
success: number;
|
|
6
|
+
goalProgress: number;
|
|
7
|
+
repoGroundedness: number;
|
|
8
|
+
driftPenalty: number;
|
|
9
|
+
toolUseQuality: number;
|
|
10
|
+
patchQuality: number;
|
|
11
|
+
testReality: number;
|
|
12
|
+
finalGate: number;
|
|
13
|
+
reviewerBlockers: number;
|
|
14
|
+
costUsd: number;
|
|
15
|
+
wallSeconds: number;
|
|
16
|
+
notes?: string[];
|
|
17
|
+
}
|
|
18
|
+
interface RunScoreWeights {
|
|
19
|
+
success: number;
|
|
20
|
+
goalProgress: number;
|
|
21
|
+
repoGroundedness: number;
|
|
22
|
+
driftPenalty: number;
|
|
23
|
+
toolUseQuality: number;
|
|
24
|
+
patchQuality: number;
|
|
25
|
+
testReality: number;
|
|
26
|
+
finalGate: number;
|
|
27
|
+
reviewerBlockers: number;
|
|
28
|
+
costUsd: number;
|
|
29
|
+
wallSeconds: number;
|
|
30
|
+
}
|
|
31
|
+
declare const DEFAULT_RUN_SCORE_WEIGHTS: RunScoreWeights;
|
|
32
|
+
declare function aggregateRunScore(score: RunScore, weights?: Partial<RunScoreWeights>): number;
|
|
33
|
+
declare function clamp01(value: number): number;
|
|
34
|
+
|
|
35
|
+
interface RunTrace {
|
|
36
|
+
run: Run;
|
|
37
|
+
spans: Span[];
|
|
38
|
+
events: TraceEvent[];
|
|
39
|
+
artifacts: Artifact[];
|
|
40
|
+
budget: BudgetLedgerEntry[];
|
|
41
|
+
}
|
|
42
|
+
interface RunCriticOptions {
|
|
43
|
+
weights?: Partial<RunScoreWeights>;
|
|
44
|
+
driftPatterns?: RegExp[];
|
|
45
|
+
}
|
|
46
|
+
declare class RunCritic {
|
|
47
|
+
private readonly weights?;
|
|
48
|
+
private readonly driftPatterns;
|
|
49
|
+
constructor(options?: RunCriticOptions);
|
|
50
|
+
score(store: TraceStore, runId: string): Promise<RunScore>;
|
|
51
|
+
scoreTrace(trace: RunTrace): RunScore;
|
|
52
|
+
rank(score: RunScore): number;
|
|
53
|
+
private isDrift;
|
|
54
|
+
}
|
|
55
|
+
|
|
56
|
+
export { DEFAULT_RUN_SCORE_WEIGHTS as D, type RunScore as R, type RunTrace as a, type RunScoreWeights as b, RunCritic as c, type RunCriticOptions as d, aggregateRunScore as e, clamp01 as f };
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
import { L as LlmClientOptions } from './llm-client-DbjLfz-K.js';
|
|
2
|
-
import { I as ImprovementDriver, S as Scenario, f as CampaignResult, k as GateResult, D as DispatchFn, a as JudgeConfig, L as LabeledScenarioStore, g as CampaignTraceWriter, m as GenerationRecord, M as MutableSurface, P as ParetoParent, G as Gate } from './types-
|
|
2
|
+
import { I as ImprovementDriver, S as Scenario, f as CampaignResult, k as GateResult, D as DispatchFn, a as JudgeConfig, L as LabeledScenarioStore, g as CampaignTraceWriter, m as GenerationRecord, M as MutableSurface, P as ParetoParent, G as Gate } from './types-Bba0vl1V.js';
|
|
3
3
|
|
|
4
4
|
/**
|
|
5
5
|
* @experimental
|
|
@@ -308,6 +308,30 @@ interface RunOptimizationOptions<TScenario extends Scenario, TArtifact> extends
|
|
|
308
308
|
/** Phase-2 research report forwarded to `propose()` (analyst findings +
|
|
309
309
|
* diff). Opaque here; the driver types it. */
|
|
310
310
|
report?: unknown;
|
|
311
|
+
/** Structured findings forwarded to `propose()` as `ctx.findings`. A
|
|
312
|
+
* findings producer (trace-analyst registry, HALO) emits these from the
|
|
313
|
+
* generation's traces; findings-grounded drivers (`improvementDriver`,
|
|
314
|
+
* `memoryCurationDriver`, `traceAnalystDriver`) consume them. Opaque here;
|
|
315
|
+
* the driver types its `TFindings`. Empty when no producer is wired. */
|
|
316
|
+
findings?: unknown[];
|
|
317
|
+
/** Per-generation findings producer — the EYES→HANDS loop closure. After each
|
|
318
|
+
* generation's candidates are scored, this is called with that generation's
|
|
319
|
+
* results; whatever it returns REPLACES `ctx.findings` for the NEXT
|
|
320
|
+
* generation's `propose()`, so the diagnosis is refreshed each round instead
|
|
321
|
+
* of being a static one-shot. Generic by design: the substrate does not
|
|
322
|
+
* import an analyst — the consumer plugs its trace-analyst registry / HALO
|
|
323
|
+
* here (reading the per-candidate `runDir` traces). When absent, findings
|
|
324
|
+
* stay the static `opts.findings`. */
|
|
325
|
+
analyzeGeneration?: (input: {
|
|
326
|
+
generation: number;
|
|
327
|
+
runDir: string;
|
|
328
|
+
candidates: Array<{
|
|
329
|
+
surfaceHash: string;
|
|
330
|
+
campaign: CampaignResult<TArtifact, TScenario>;
|
|
331
|
+
composite: number;
|
|
332
|
+
}>;
|
|
333
|
+
history: GenerationRecord[];
|
|
334
|
+
}) => Promise<unknown[]>;
|
|
311
335
|
}
|
|
312
336
|
interface RunOptimizationResult<TArtifact, TScenario extends Scenario> {
|
|
313
337
|
generations: Array<{
|