@tangle-network/agent-runtime 0.48.0 → 0.49.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +79 -15
- package/dist/agent.js +1 -1
- package/dist/chunk-GHX7XOJ2.js +433 -0
- package/dist/chunk-GHX7XOJ2.js.map +1 -0
- package/dist/{chunk-TJS7S3HJ.js → chunk-IQS4HI3F.js} +14 -5
- package/dist/chunk-IQS4HI3F.js.map +1 -0
- package/dist/{chunk-IW2LMLK6.js → chunk-PXUTIMGJ.js} +767 -129
- package/dist/chunk-PXUTIMGJ.js.map +1 -0
- package/dist/{chunk-656G2XCL.js → chunk-U2VEWKKK.js} +3 -3
- package/dist/{chunk-JNPK46YH.js → chunk-VIEDXELL.js} +408 -6
- package/dist/chunk-VIEDXELL.js.map +1 -0
- package/dist/{chunk-VR4JIC5H.js → chunk-XTEZ3YJ4.js} +2 -2
- package/dist/index.d.ts +29 -4
- package/dist/index.js +109 -21
- package/dist/index.js.map +1 -1
- package/dist/kb-gate-CsXpNRk7.d.ts +1145 -0
- package/dist/{loop-runner-bin-DEm4roYF.d.ts → loop-runner-bin-Cgn0A-NW.d.ts} +1 -1
- package/dist/loop-runner-bin.d.ts +2 -2
- package/dist/loop-runner-bin.js +3 -3
- package/dist/loops.d.ts +2 -2
- package/dist/loops.js +11 -1
- package/dist/mcp/bin.js +187 -24
- package/dist/mcp/bin.js.map +1 -1
- package/dist/mcp/index.d.ts +27 -124
- package/dist/mcp/index.js +28 -6
- package/dist/mcp/index.js.map +1 -1
- package/dist/platform.js +2 -2
- package/dist/platform.js.map +1 -1
- package/dist/runtime.d.ts +285 -8
- package/dist/runtime.js +11 -1
- package/dist/workflow.js +1 -1
- package/package.json +6 -5
- package/dist/chunk-IW2LMLK6.js.map +0 -1
- package/dist/chunk-JNPK46YH.js.map +0 -1
- package/dist/chunk-LX66I3SC.js +0 -218
- package/dist/chunk-LX66I3SC.js.map +0 -1
- package/dist/chunk-TJS7S3HJ.js.map +0 -1
- package/dist/kb-gate-51BlLlVM.d.ts +0 -529
- /package/dist/{chunk-656G2XCL.js.map → chunk-U2VEWKKK.js.map} +0 -0
- /package/dist/{chunk-VR4JIC5H.js.map → chunk-XTEZ3YJ4.js.map} +0 -0
package/dist/runtime.d.ts
CHANGED
|
@@ -2,6 +2,7 @@ import { AgentProfile, BackendType, CreateSandboxOptions, SandboxInstance, Sandb
|
|
|
2
2
|
export { AgentProfile, CreateSandboxOptions, SandboxEvent, SandboxInstance } from '@tangle-network/sandbox';
|
|
3
3
|
import { R as ResultBlobStore, a as SpawnJournal, N as NodeId, b as SpawnEvent, T as TreeView, c as Settled, d as AgentSpec, E as ExecutorRegistry, B as Budget, A as Agent, e as RootHandle, f as SupervisedResult, g as Spend, S as Scope, h as ExecutorFactory, U as UsageEvent, i as Supervisor } from './types-BpDfCPUp.js';
|
|
4
4
|
export { j as Executor, k as ExecutorContext, l as ExecutorResult, H as Handle, m as NodeSnapshot, n as NodeStatus, o as Restart, p as RootSignal, q as Runtime, r as SpawnOpts, s as SupervisorOpts, W as WidenGate } from './types-BpDfCPUp.js';
|
|
5
|
+
import { R as RuntimeHooks } from './runtime-hooks-C7JwKb9E.js';
|
|
5
6
|
import { ChatClient, AnalystFinding, DefaultVerdict, AgentProfile as AgentProfile$1 } from '@tangle-network/agent-eval';
|
|
6
7
|
export { DefaultVerdict } from '@tangle-network/agent-eval';
|
|
7
8
|
export { A as AnalyzeInput, a as CompletionAnalyst, b as CompletionEvidence, c as CompletionPolicy, d as CompletionVerdict, C as CreateDriverOptions, D as DriverDecision, P as PlannerContext, e as TopologyMove, T as TopologyPlanner, f as completionAuthorizes, g as createDriver, h as deterministicCompletion, r as renderAnalyses, s as sentinelCompletion, i as stopSentinel } from './driver-DYU2sgHr.js';
|
|
@@ -10,7 +11,6 @@ export { D as Driver, C as LoopDecisionPayload, F as LoopEndedPayload, G as Loop
|
|
|
10
11
|
import { Scenario, ProfileDispatchFn } from '@tangle-network/agent-eval/campaign';
|
|
11
12
|
import { R as RunLoopOptions } from './run-loop-DvD4aGiE.js';
|
|
12
13
|
export { c as createSandboxForSpec, d as defaultSelectWinner, r as runLoop } from './run-loop-DvD4aGiE.js';
|
|
13
|
-
import { R as RuntimeHooks } from './runtime-hooks-C7JwKb9E.js';
|
|
14
14
|
|
|
15
15
|
/**
|
|
16
16
|
* @experimental
|
|
@@ -113,6 +113,140 @@ declare function replaySpawnTree(journal: SpawnJournal, blobs: ResultBlobStore,
|
|
|
113
113
|
*/
|
|
114
114
|
declare function materializeTreeView(events: SpawnEvent[]): TreeView;
|
|
115
115
|
|
|
116
|
+
/**
|
|
117
|
+
* createWaterfallCollector — 100% trajectory observability from the lifecycle stream:
|
|
118
|
+
* every spawn/settle (shots, analysts, nested agents) becomes one timed, billed span.
|
|
119
|
+
* The sum of spans IS the run's cost story — what each step cost in dollars, tokens,
|
|
120
|
+
* and wall-clock, rendered as a text waterfall or exported as structured rows for any
|
|
121
|
+
* chart. Attach the collector's `hooks` to `runAgentic`/`runBenchmark`; spans accumulate
|
|
122
|
+
* across every task the hooks observe.
|
|
123
|
+
*/
|
|
124
|
+
|
|
125
|
+
interface WaterfallSpan {
|
|
126
|
+
id: string;
|
|
127
|
+
/** The spawn label (`shot:0`, `analyst:1`, a nested agent's label) — the row name. */
|
|
128
|
+
label: string;
|
|
129
|
+
runId: string;
|
|
130
|
+
parentId?: string;
|
|
131
|
+
startMs: number;
|
|
132
|
+
endMs?: number;
|
|
133
|
+
status: 'running' | 'done' | 'down';
|
|
134
|
+
usd: number;
|
|
135
|
+
tokens: {
|
|
136
|
+
input: number;
|
|
137
|
+
output: number;
|
|
138
|
+
};
|
|
139
|
+
score?: number;
|
|
140
|
+
}
|
|
141
|
+
interface WaterfallReport {
|
|
142
|
+
spans: WaterfallSpan[];
|
|
143
|
+
/** Wall-clock of the observed window (first spawn → last settle). */
|
|
144
|
+
totalMs: number;
|
|
145
|
+
totalUsd: number;
|
|
146
|
+
totalTokens: {
|
|
147
|
+
input: number;
|
|
148
|
+
output: number;
|
|
149
|
+
};
|
|
150
|
+
/** Rollup by label prefix (the part before ':') — shots vs analysts vs anything else. */
|
|
151
|
+
byKind: Record<string, {
|
|
152
|
+
count: number;
|
|
153
|
+
ms: number;
|
|
154
|
+
usd: number;
|
|
155
|
+
tokens: {
|
|
156
|
+
input: number;
|
|
157
|
+
output: number;
|
|
158
|
+
};
|
|
159
|
+
}>;
|
|
160
|
+
}
|
|
161
|
+
interface WaterfallCollector {
|
|
162
|
+
/** Attach these to RunAgenticOptions.hooks / BenchmarkConfig.hooks. */
|
|
163
|
+
hooks: RuntimeHooks;
|
|
164
|
+
report(): WaterfallReport;
|
|
165
|
+
/** The text waterfall — one row per span, bars scaled to the observed window. */
|
|
166
|
+
render(opts?: {
|
|
167
|
+
width?: number;
|
|
168
|
+
maxRows?: number;
|
|
169
|
+
}): string;
|
|
170
|
+
reset(): void;
|
|
171
|
+
}
|
|
172
|
+
declare function createWaterfallCollector(): WaterfallCollector;
|
|
173
|
+
|
|
174
|
+
/**
|
|
175
|
+
* anytimeReport — time-to-satisfactory-output metrics, derived entirely from the
|
|
176
|
+
* waterfall's spans (no new instrumentation): per task, the best-so-far score after each
|
|
177
|
+
* shot with its elapsed wall-clock and cumulative spend; per strategy, the standard
|
|
178
|
+
* anytime-optimization metrics:
|
|
179
|
+
*
|
|
180
|
+
* TTT time-to-target — elapsed ms until best-so-far ≥ the target (per task; median
|
|
181
|
+
* over tasks that reached it)
|
|
182
|
+
* STT shots-to-target — attempts until best-so-far ≥ target
|
|
183
|
+
* ERT expected running time (the COCO benchmarking convention): TOTAL time spent
|
|
184
|
+
* across all tasks — including failures' full budgets — divided by the number of
|
|
185
|
+
* tasks that reached the target. The honest "how long per success, all-in".
|
|
186
|
+
* AUC the anytime curve's area (mean best-so-far score across the budget, per shot
|
|
187
|
+
* index) — higher = climbs earlier.
|
|
188
|
+
*
|
|
189
|
+
* The "satisfactory" bar follows the COCO/BBOB convention: a SET of satisficing targets
|
|
190
|
+
* (e.g. [0.5, 0.8, 1.0] on the normalized check score), each measured independently —
|
|
191
|
+
* runtime-to-target per (task, target) pair — optionally overridden per task
|
|
192
|
+
* (`targetFor`) when satisfaction is task-specific. Spans come from
|
|
193
|
+
* `createWaterfallCollector().report()`; tasks are grouped by the supervisor runId
|
|
194
|
+
* (`agentic:<strategy>:<taskId>`); shot spans are `shot:N` labels.
|
|
195
|
+
*/
|
|
196
|
+
|
|
197
|
+
interface AnytimeTaskCurve {
|
|
198
|
+
taskId: string;
|
|
199
|
+
strategy: string;
|
|
200
|
+
/** Best-so-far after each settled shot: elapsed ms from the task's first spawn,
|
|
201
|
+
* cumulative usd, and the running max score. */
|
|
202
|
+
points: Array<{
|
|
203
|
+
elapsedMs: number;
|
|
204
|
+
cumUsd: number;
|
|
205
|
+
best: number;
|
|
206
|
+
}>;
|
|
207
|
+
/** Per satisficing target (keyed by the target value as a string): the first point
|
|
208
|
+
* where best ≥ target, or null when never reached within budget. */
|
|
209
|
+
hits: Record<string, {
|
|
210
|
+
ms: number;
|
|
211
|
+
shots: number;
|
|
212
|
+
usd: number;
|
|
213
|
+
} | null>;
|
|
214
|
+
}
|
|
215
|
+
interface AnytimeStrategySummary {
|
|
216
|
+
strategy: string;
|
|
217
|
+
/** The satisficing target this row summarizes. */
|
|
218
|
+
target: number;
|
|
219
|
+
tasks: number;
|
|
220
|
+
reachedTarget: number;
|
|
221
|
+
/** Median time-to-target over the tasks that reached it (null when none did). */
|
|
222
|
+
medianTttMs: number | null;
|
|
223
|
+
medianShotsToTarget: number | null;
|
|
224
|
+
/** COCO ERT: Σ all task wall-time (incl. failures) / #successes. Null when 0 succeed. */
|
|
225
|
+
ertMs: number | null;
|
|
226
|
+
/** Same construction over dollars: Σ all spend / #successes. */
|
|
227
|
+
erUsd: number | null;
|
|
228
|
+
/** Mean best-so-far score by shot index (the anytime curve, averaged over tasks). */
|
|
229
|
+
curveByShot: number[];
|
|
230
|
+
/** Area under the per-shot anytime curve, normalized to [0,1]. */
|
|
231
|
+
auc: number;
|
|
232
|
+
}
|
|
233
|
+
interface AnytimeReport {
|
|
234
|
+
targets: number[];
|
|
235
|
+
perTask: AnytimeTaskCurve[];
|
|
236
|
+
/** One summary per (strategy, target) pair — the COCO-style multi-target view. */
|
|
237
|
+
perStrategy: AnytimeStrategySummary[];
|
|
238
|
+
}
|
|
239
|
+
/** Derive anytime metrics from waterfall spans. `targets` are the satisficing score
|
|
240
|
+
* bars (default [1] = fully resolved; COCO-style multi-target: [0.5, 0.8, 1]);
|
|
241
|
+
* `targetFor` overrides the bar per task (task-specific satisfaction) — when set, the
|
|
242
|
+
* per-task bar replaces every entry of `targets` for that task. */
|
|
243
|
+
declare function anytimeReport(spans: WaterfallSpan[], opts?: {
|
|
244
|
+
targets?: number[];
|
|
245
|
+
targetFor?: (taskId: string) => number;
|
|
246
|
+
}): AnytimeReport;
|
|
247
|
+
/** One row per (strategy, satisficing target): the shareable time-to-satisfactory table. */
|
|
248
|
+
declare function renderAnytimeTable(report: AnytimeReport): string;
|
|
249
|
+
|
|
116
250
|
/**
|
|
117
251
|
* auditIntent — the route-rigor analyst: is this trajectory even going the RIGHT WAY?
|
|
118
252
|
*
|
|
@@ -1144,6 +1278,9 @@ interface AgenticOptions {
|
|
|
1144
1278
|
routerKey: string;
|
|
1145
1279
|
model: string;
|
|
1146
1280
|
temperature?: number;
|
|
1281
|
+
/** Completion cap per worker turn — REQUIRED for thinking models (they burn unbounded
|
|
1282
|
+
* budgets on reasoning and return empty content without it). Omitted ⇒ provider default. */
|
|
1283
|
+
maxTokens?: number;
|
|
1147
1284
|
/** Turns the agent may take within ONE shot before the driver intervenes. */
|
|
1148
1285
|
innerTurns?: number;
|
|
1149
1286
|
/** The depth STEERER's analyst instruction (observe()'s system prompt). The knob a
|
|
@@ -1225,6 +1362,9 @@ interface ShotSpec {
|
|
|
1225
1362
|
messages?: Msg[];
|
|
1226
1363
|
steer?: string;
|
|
1227
1364
|
persona?: ShotPersona;
|
|
1365
|
+
/** Restrict THIS shot to a subset of the domain's tools (by name) — focus a shot on
|
|
1366
|
+
* the relevant capabilities. Restriction-only; unknown names throw. Omitted ⇒ all. */
|
|
1367
|
+
tools?: string[];
|
|
1228
1368
|
}
|
|
1229
1369
|
interface StrategyResult {
|
|
1230
1370
|
score: number;
|
|
@@ -1253,6 +1393,18 @@ interface StrategyCtx {
|
|
|
1253
1393
|
shot(spec?: ShotSpec): Promise<ShotResult | null>;
|
|
1254
1394
|
/** The firewalled critic reads the trajectory → a steer string, or null on COMPLETE/down. */
|
|
1255
1395
|
critique(messages: Msg[]): Promise<string | null>;
|
|
1396
|
+
/** The RAW analyst channel: the firewalled critic answers `instruction` over the
|
|
1397
|
+
* trajectory verbatim — no findings extraction, so verdict-shaped formats
|
|
1398
|
+
* (CONTINUE/STOP decisions, calibrated predictions) survive. Same firewall:
|
|
1399
|
+
* trajectory in, never scores. Null when the analyst went down. */
|
|
1400
|
+
consult(messages: Msg[], instruction: string): Promise<string | null>;
|
|
1401
|
+
/** The tools THIS artifact's task actually offers (names + descriptions only — never
|
|
1402
|
+
* the implementations). Tool sets vary per task on heterogeneous domains; a strategy
|
|
1403
|
+
* that restricts shots MUST select from this list, never from hardcoded names. */
|
|
1404
|
+
listTools(handle: ArtifactHandle): Promise<Array<{
|
|
1405
|
+
name: string;
|
|
1406
|
+
description?: string;
|
|
1407
|
+
}>>;
|
|
1256
1408
|
}
|
|
1257
1409
|
/** Author a Strategy from the composable steps — the open, compact way. */
|
|
1258
1410
|
declare function defineStrategy(name: string, run: (ctx: StrategyCtx) => Promise<StrategyResult>): Strategy;
|
|
@@ -1346,6 +1498,10 @@ interface BenchmarkTaskRow {
|
|
|
1346
1498
|
taskId: string;
|
|
1347
1499
|
/** Per-strategy cells; absent when the task errored before completing all strategies. */
|
|
1348
1500
|
cells?: Record<string, BenchmarkCell>;
|
|
1501
|
+
/** Per-strategy failures on this task: the strategy competed, threw, and scored an
|
|
1502
|
+
* honest zero — it loses, it does not poison the row. The message is kept so a later
|
|
1503
|
+
* generation's author can see WHY a candidate died. */
|
|
1504
|
+
errors?: Record<string, string>;
|
|
1349
1505
|
/** Why the task was excluded (infra/setup failure) — never silently dropped. */
|
|
1350
1506
|
error?: string;
|
|
1351
1507
|
}
|
|
@@ -1757,6 +1913,13 @@ interface PromotionGateOptions {
|
|
|
1757
1913
|
incumbent: string;
|
|
1758
1914
|
/** The challenger's strategy name. */
|
|
1759
1915
|
candidate: string;
|
|
1916
|
+
/** 'superiority' (default): the candidate must score significantly BETTER.
|
|
1917
|
+
* 'non-inferiority': the candidate must prove its score is not worse than the
|
|
1918
|
+
* incumbent by more than `scoreTolerance` AND its cost savings are significant —
|
|
1919
|
+
* the gate for "same quality, cheaper" claims. */
|
|
1920
|
+
mode?: 'superiority' | 'non-inferiority';
|
|
1921
|
+
/** non-inferiority: the score CI lower bound must clear −scoreTolerance. Default 0.05. */
|
|
1922
|
+
scoreTolerance?: number;
|
|
1760
1923
|
/** The CI lower bound on the paired lift must EXCEED this (score scale). Default 0. */
|
|
1761
1924
|
deltaThreshold?: number;
|
|
1762
1925
|
/** Minimum paired tasks before significance can be claimed. Default 6 — below that
|
|
@@ -1770,7 +1933,8 @@ interface PromotionGateOptions {
|
|
|
1770
1933
|
}
|
|
1771
1934
|
interface PromotionVerdict {
|
|
1772
1935
|
promoted: boolean;
|
|
1773
|
-
reason: 'identical-champion' | 'few-tasks' | 'no-margin' | 'significant';
|
|
1936
|
+
reason: 'identical-champion' | 'few-tasks' | 'no-margin' | 'significant' | 'non-inferior-and-cheaper' | 'non-inferiority-unproven' | 'not-cheaper';
|
|
1937
|
+
mode: 'superiority' | 'non-inferiority';
|
|
1774
1938
|
/** Paired tasks that carried both strategies' cells. */
|
|
1775
1939
|
n: number;
|
|
1776
1940
|
/** Paired (candidate − incumbent) lift across the holdout tasks. */
|
|
@@ -1780,6 +1944,23 @@ interface PromotionVerdict {
|
|
|
1780
1944
|
low: number;
|
|
1781
1945
|
high: number;
|
|
1782
1946
|
};
|
|
1947
|
+
/** non-inferiority mode: paired (incumbent − candidate) cost SAVINGS per task (usd) —
|
|
1948
|
+
* positive means the candidate is cheaper; significant iff the CI low clears zero. */
|
|
1949
|
+
costSavings?: {
|
|
1950
|
+
mean: number;
|
|
1951
|
+
median: number;
|
|
1952
|
+
low: number;
|
|
1953
|
+
high: number;
|
|
1954
|
+
};
|
|
1955
|
+
/** Paired (candidate − incumbent) wall-clock per task (ms) — negative = the candidate
|
|
1956
|
+
* is FASTER. Informational in every mode (never gates); the latency answer to "what
|
|
1957
|
+
* does this win actually cost the user?". */
|
|
1958
|
+
latency?: {
|
|
1959
|
+
mean: number;
|
|
1960
|
+
median: number;
|
|
1961
|
+
low: number;
|
|
1962
|
+
high: number;
|
|
1963
|
+
};
|
|
1783
1964
|
}
|
|
1784
1965
|
declare function promotionGate(opts: PromotionGateOptions): PromotionVerdict;
|
|
1785
1966
|
|
|
@@ -2227,7 +2408,7 @@ declare function openSandboxRun<Out>(client: SandboxClient, options: OpenSandbox
|
|
|
2227
2408
|
*/
|
|
2228
2409
|
|
|
2229
2410
|
/** The compressed consumable a skill carries: everything an author needs to emit a loop. */
|
|
2230
|
-
declare const strategyAuthorContract = "\nYou author an OPTIMIZATION STRATEGY for an agentic loop system. A strategy decides how to\nspend a compute budget to beat a task's deployable check. You compose exactly two steps:\n\n shot(spec?: { handle?, messages?, steer?, persona? }): Promise<ShotResult | null>\n Runs ONE worker attempt (a bounded tool loop) over an artifact.\n - omit handle => the shot opens its OWN fresh artifact and closes it after (a sample).\n - pass handle => the shot CONTINUES that artifact (state accumulates across shots).\n - messages => the carried conversation (pass the previous ShotResult.messages to continue).\n - steer => a corrective instruction injected before the shot.\n - persona => { systemPrompt?, model? } \u2014 give THIS shot its own role and/or model\n (multi-agent strategies: a researcher shot then an engineer shot, a panel of k\n personas over one budget). On a fresh shot the systemPrompt replaces the task's; on\n a carried conversation it arrives as a hand-off message. Same conserved budget.\n ShotResult = { messages, score (0..1 on the task's check), passes, total, completions, toolErrors }\n Returns null if the attempt failed infra-wise.\n\n critique(messages): Promise<string | null>\n A firewalled trace-analyst reads the attempt's trajectory and returns ONE corrective\n instruction (or null when it judges the work complete). Costs ~1 completion.\n\n surface.open(task) / surface.close(handle)\n Open a persistent artifact you manage yourself (remember to close in a finally).\n\nRules:\n- Stay within ~budget total shots; every shot/critique spends from a conserved pool.\n- For a FRESH attempt OMIT `messages` entirely (never pass `[]` \u2014 an empty array is a\n fresh conversation too, but be explicit). 
To CONTINUE, pass the previous\n ShotResult.messages unchanged.\n- Return { score, resolved, completions, progression, shots } \u2014 score = the BEST checkpoint\n you reached (keep-best, never final-state), progression = score after each shot.\n- The module must be EXACTLY this shape (no other imports, no commentary outside code):\n\nimport { defineStrategy } from '@tangle-network/agent-runtime/loops'\nexport default defineStrategy('your-strategy-name', async ({ surface, task, budget, shot, critique }) => {\n // your composition\n})\n";
|
|
2411
|
+
declare const strategyAuthorContract = "\nYou author an OPTIMIZATION STRATEGY for an agentic loop system. A strategy decides how to\nspend a compute budget to beat a task's deployable check. You compose exactly two steps:\n\n shot(spec?: { handle?, messages?, steer?, persona?, tools? }): Promise<ShotResult | null>\n Runs ONE worker attempt (a bounded tool loop) over an artifact.\n - omit handle => the shot opens its OWN fresh artifact and closes it after (a sample).\n - pass handle => the shot CONTINUES that artifact (state accumulates across shots).\n - messages => the carried conversation (pass the previous ShotResult.messages to continue).\n - steer => a corrective instruction injected before the shot.\n - persona => { systemPrompt?, model? } \u2014 give THIS shot its own role and/or model\n (multi-agent strategies: a researcher shot then an engineer shot, a panel of k\n personas over one budget). On a fresh shot the systemPrompt replaces the task's; on\n a carried conversation it arrives as a hand-off message. Same conserved budget.\n - tools => string[] \u2014 restrict THIS shot to a subset of the task's tools by\n name (focus an explore shot on read-only tools, an execute shot on write tools).\n Restriction-only; unknown names make the shot fail. ALWAYS select from\n await listTools(handle) \u2014 never hardcode. Omitted => the shot sees every tool.\n ShotResult = { messages, score (0..1 on the task's check), passes, total, completions, toolErrors }\n Returns null if the attempt failed infra-wise.\n\n critique(messages): Promise<string | null>\n A firewalled trace-analyst reads the attempt's trajectory and returns ONE corrective\n instruction (or null when it judges the work complete). 
Costs ~1 completion.\n\n consult(messages, instruction): Promise<string | null>\n The RAW analyst channel: the same firewalled critic answers YOUR instruction over the\n trajectory verbatim (no reformatting) \u2014 use it when you need a specific reply format\n (a decision, a prediction). Costs ~1 completion.\n\n surface.open(task) / surface.close(handle)\n Open a persistent artifact you manage yourself (remember to close in a finally).\n close is idempotent \u2014 closing an already-closed handle is a safe no-op.\n\n listTools(handle): Promise<Array<{ name, description? }>>\n The tools THIS task actually offers. TOOL SETS VARY PER TASK \u2014 if you restrict a\n shot with `tools`, you MUST pick names from await listTools(handle); hardcoding\n names from an example kills your shots on every task whose tools differ.\n\nRules:\n- ALWAYS await every shot/critique/surface call \u2014 a floating promise that rejects\n crashes the whole benchmark run.\n- Stay within ~budget total shots; every shot/critique spends from a conserved pool.\n- For a FRESH attempt OMIT `messages` entirely (never pass `[]` \u2014 an empty array is a\n fresh conversation too, but be explicit). To CONTINUE, pass the previous\n ShotResult.messages unchanged.\n- Return { score, resolved, completions, progression, shots } \u2014 score = the BEST checkpoint\n you reached (keep-best, never final-state), progression = score after each shot.\n- The module must be EXACTLY this shape (no other imports, no commentary outside code):\n\nimport { defineStrategy } from '@tangle-network/agent-runtime/loops'\nexport default defineStrategy('your-strategy-name', async ({ surface, task, budget, shot, critique, listTools }) => {\n // your composition (listTools comes from the destructured context \u2014 it is NOT a global)\n})\n";
|
|
2231
2412
|
interface AuthorStrategyOptions {
|
|
2232
2413
|
/** The model-call seam (agent-eval `createChatClient`). */
|
|
2233
2414
|
chat: ChatClient;
|
|
@@ -2328,6 +2509,14 @@ interface StrategyEvolutionConfig {
|
|
|
2328
2509
|
populationSize?: number;
|
|
2329
2510
|
/** The gen0 field. Default [sample, refine, sampleThenRefine]. */
|
|
2330
2511
|
baselines?: Strategy[];
|
|
2512
|
+
/** What "better" means for PROMOTION. 'score' (default): the candidate must beat the
|
|
2513
|
+
* incumbent's score (superiority gate). 'cost': the candidate must prove score
|
|
2514
|
+
* NON-INFERIORITY (not worse by more than `scoreTolerance`) plus significant cost
|
|
2515
|
+
* savings — the "same quality, cheaper" objective. The author is told the objective
|
|
2516
|
+
* and sees per-task spend either way. */
|
|
2517
|
+
objective?: 'score' | 'cost';
|
|
2518
|
+
/** Cost objective: the score CI lower bound must clear −scoreTolerance. Default 0.05. */
|
|
2519
|
+
scoreTolerance?: number;
|
|
2331
2520
|
/** Search-side champion selection. Default 'costAware'. */
|
|
2332
2521
|
champion?: ChampionPolicy;
|
|
2333
2522
|
/** Score band treated as a tie under 'costAware'. Default 0.01. */
|
|
@@ -2336,6 +2525,48 @@ interface StrategyEvolutionConfig {
|
|
|
2336
2525
|
outDir: string;
|
|
2337
2526
|
/** Promotion-gate evidence floor (paired holdout tasks). */
|
|
2338
2527
|
minPairedTasks?: number;
|
|
2528
|
+
/** BAND-AWARE scoring — concentrate the measurement where lift is possible.
|
|
2529
|
+
* Holdout: draw `holdoutPoolN` candidate tasks and run `baselines[0]` once at the run
|
|
2530
|
+
* budget as an INDEPENDENT reference screen; keep tasks scoring ≤ `maxRefScore`
|
|
2531
|
+
* (headroom exists) and take the first `holdoutN`. Band membership is decided before
|
|
2532
|
+
* either finalist touches a task and both finalists then face the SAME tasks — the
|
|
2533
|
+
* estimand becomes "paired lift on headroom tasks", pre-registered by this config.
|
|
2534
|
+
* Train: champion selection ignores zero-spread tasks (every field strategy scored
|
|
2535
|
+
* identically — zero selection information, pure noise dilution). */
|
|
2536
|
+
band?: {
|
|
2537
|
+
holdoutPoolN: number;
|
|
2538
|
+
/** Keep holdout tasks where the reference scores ≤ this. Default 0.99 — drop only
|
|
2539
|
+
* tasks the reference already solves fully (no headroom, a candidate can only tie). */
|
|
2540
|
+
maxRefScore?: number;
|
|
2541
|
+
};
|
|
2542
|
+
/** What the author learns from a tournament. 'exact' (default) = scores + progressions
|
|
2543
|
+
* per task; 'binary' = pass/fail only — the leakage-bounded channel (one bit per cell
|
|
2544
|
+
* per generation reaches the author from the evaluation data). */
|
|
2545
|
+
lossesDetail?: 'exact' | 'binary';
|
|
2546
|
+
/** Reproducer certification (arXiv:2606.11045): when the final champion is AUTHORED,
|
|
2547
|
+
* compress it to a short natural-language summary, have a fresh author re-implement
|
|
2548
|
+
* from the summary alone (no losses, no code), and score the reproduction on the same
|
|
2549
|
+
* holdout. A reproduction gap is an overfitting signal (their detector: 100%
|
|
2550
|
+
* sensitivity / 91% specificity in the ML-agent setting) — recorded on the report,
|
|
2551
|
+
* never gate-blocking in v1. */
|
|
2552
|
+
reproducerCheck?: {
|
|
2553
|
+
/** Word budget for the strategy summary. Default 64. */
|
|
2554
|
+
summaryMaxWords?: number;
|
|
2555
|
+
/** Reproduction counts as faithful when reproducedScore ≥ championScore − tolerance.
|
|
2556
|
+
* Default 0.05. */
|
|
2557
|
+
tolerance?: number;
|
|
2558
|
+
};
|
|
2559
|
+
/** Endurance: write the run state after every completed phase; with `resume`, a
|
|
2560
|
+
* restart skips completed phases (authored modules re-imported from their files).
|
|
2561
|
+
* Worst case after a mid-run death is re-paying ONE phase, never the run. */
|
|
2562
|
+
checkpoint?: {
|
|
2563
|
+
path: string;
|
|
2564
|
+
resume?: boolean;
|
|
2565
|
+
};
|
|
2566
|
+
/** Called before each benchmark phase (gen0, gen1…, band-screen, holdout, reproduce).
|
|
2567
|
+
* The seam for environment recycling — no artifacts span phases, so a runner may
|
|
2568
|
+
* recreate a wedge-prone environment container here. */
|
|
2569
|
+
onPhase?: (phase: string) => Promise<void>;
|
|
2339
2570
|
onTask?: (phase: string, row: BenchmarkTaskRow, done: number, total: number) => void;
|
|
2340
2571
|
hooks?: RuntimeHooks;
|
|
2341
2572
|
}
|
|
@@ -2371,6 +2602,32 @@ interface EvolutionArchiveNode {
|
|
|
2371
2602
|
score: number;
|
|
2372
2603
|
usd: number;
|
|
2373
2604
|
}
|
|
2605
|
+
interface ReproductionCheck {
|
|
2606
|
+
/** The compressed strategy description the reproducer implemented from. */
|
|
2607
|
+
summary: string;
|
|
2608
|
+
reproducedName: string;
|
|
2609
|
+
file?: string;
|
|
2610
|
+
championHoldoutScore: number;
|
|
2611
|
+
reproducedHoldoutScore: number;
|
|
2612
|
+
/** champion − reproduced (positive = the reproduction fell short). */
|
|
2613
|
+
gap: number;
|
|
2614
|
+
/** reproducedScore ≥ championScore − tolerance. A failed reproduction is an
|
|
2615
|
+
* overfitting signal: the champion's win did not fit through the summary. */
|
|
2616
|
+
reproducible: boolean;
|
|
2617
|
+
/** Infra failure during reproduction (distinct from a semantic reproduction failure). */
|
|
2618
|
+
error?: string;
|
|
2619
|
+
}
|
|
2620
|
+
interface EvolutionBandInfo {
|
|
2621
|
+
/** Tasks screened by the reference on the holdout pool. */
|
|
2622
|
+
screened: number;
|
|
2623
|
+
/** Tasks kept (reference score ≤ maxRefScore) before truncating to holdoutN. */
|
|
2624
|
+
inBand: number;
|
|
2625
|
+
/** Reference scores per screened task (the screening record). */
|
|
2626
|
+
refScores: Array<{
|
|
2627
|
+
taskId: string;
|
|
2628
|
+
score: number;
|
|
2629
|
+
}>;
|
|
2630
|
+
}
|
|
2374
2631
|
interface EvolutionReport {
|
|
2375
2632
|
gen0: BenchmarkReport;
|
|
2376
2633
|
gen0Champion: ChampionPick;
|
|
@@ -2379,6 +2636,11 @@ interface EvolutionReport {
|
|
|
2379
2636
|
finalChampion: ChampionPick;
|
|
2380
2637
|
holdout: BenchmarkReport;
|
|
2381
2638
|
verdict: PromotionVerdict;
|
|
2639
|
+
/** Present when band screening ran — the verdict's estimand is then "paired lift on
|
|
2640
|
+
* headroom tasks" (band membership fixed by the reference screen, pre-registered). */
|
|
2641
|
+
band?: EvolutionBandInfo;
|
|
2642
|
+
/** Present when reproducerCheck ran (final champion was authored). */
|
|
2643
|
+
reproduction?: ReproductionCheck;
|
|
2382
2644
|
/** SEARCH TELEMETRY, not evidence: each entry is that generation's own train-slice
|
|
2383
2645
|
* re-measurement, so cross-generation deltas mix true drift with run-to-run variance
|
|
2384
2646
|
* (entries are unpaired across generations). The only evidence-grade comparison in
|
|
@@ -2390,9 +2652,22 @@ interface EvolutionReport {
|
|
|
2390
2652
|
usd: number;
|
|
2391
2653
|
}>;
|
|
2392
2654
|
}
|
|
2393
|
-
/**
|
|
2394
|
-
*
|
|
2395
|
-
*
|
|
2655
|
+
/** Strategy means recomputed over the DISCRIMINATING tasks only — tasks where the field
|
|
2656
|
+
* strategies did not all score identically. Zero-spread tasks (everyone 1.0, everyone
|
|
2657
|
+
* 0.0, everyone tied) carry no selection information; averaging over them dilutes real
|
|
2658
|
+
* differences toward zero. Search-side denoising only — the gate never uses this. */
|
|
2659
|
+
declare function discriminatingMeans(report: BenchmarkReport, fieldOrder: string[]): Record<string, {
|
|
2660
|
+
score: number;
|
|
2661
|
+
usd: number;
|
|
2662
|
+
}> | null;
|
|
2663
|
+
/** The champion pick over a means table. 'score' takes the best mean score (ties →
|
|
2664
|
+
* field order). 'costAware' treats scores within `epsilon` of the best as tied and
|
|
2665
|
+
* takes the cheapest — the (score, $) Pareto rule collapsed to one pick. */
|
|
2666
|
+
declare function pickChampion(means: Record<string, {
|
|
2667
|
+
score: number;
|
|
2668
|
+
usd: number;
|
|
2669
|
+
}>, fieldOrder: string[], policy: ChampionPolicy, epsilon: number): ChampionPick;
|
|
2670
|
+
/** Search-side champion selection over a tournament report. */
|
|
2396
2671
|
declare function selectChampion(report: BenchmarkReport, fieldOrder: string[], policy: ChampionPolicy, epsilon: number): ChampionPick;
|
|
2397
2672
|
declare function runStrategyEvolution(cfg: StrategyEvolutionConfig): Promise<EvolutionReport>;
|
|
2398
2673
|
|
|
@@ -2579,7 +2854,9 @@ interface RouterToolsSeam {
|
|
|
2579
2854
|
model?: string;
|
|
2580
2855
|
tools: ReadonlyArray<ToolSpec>;
|
|
2581
2856
|
executeToolCall: (name: string, args: Record<string, unknown>, task: unknown) => Promise<string>;
|
|
2582
|
-
/** Max inference turns (
|
|
2857
|
+
/** Max inference turns. Default 200 (runaway backstop — set far above any
|
|
2858
|
+
* legitimate workflow). For tighter per-workflow limits use a cost budget
|
|
2859
|
+
* or wall-clock deadline at the call site. */
|
|
2583
2860
|
maxTurns?: number;
|
|
2584
2861
|
}
|
|
2585
2862
|
/**
|
|
@@ -2805,4 +3082,4 @@ declare function gitWorkspace(opts: GitWorkspaceOptions): Workspace;
|
|
|
2805
3082
|
* requires `jj` on the `Shell`'s host. */
|
|
2806
3083
|
declare function jjWorkspace(opts: GitWorkspaceOptions): Workspace;
|
|
2807
3084
|
|
|
2808
|
-
export { Agent, AgentRunSpec, AgentSpec, type AgenticOptions, type AgenticRunResult, type AgenticSurface, type AgenticTask, type AgenticTool, type ArtifactHandle, type AssertTraceDerivedFindings, type AuditIntentInput, type AuditIntentOptions, type AuthorStrategyOptions, type AuthoredStrategy, type BenchmarkCell, type BenchmarkConfig, type BenchmarkLift, type BenchmarkReport, type BenchmarkStrategySummary, type BenchmarkTaskRow, type BridgeSeam, Budget, type BudgetPool, type BudgetReadout, type ChampionPick, type ChampionPolicy, type CheckpointCapableBox, type CliSeam, type CombinatorShape, type Corpus, type CorpusFilter, type CorpusRecord, type CreateScopeAnalystOptions, type CriuCapableClient, type DefinePersona, type DefinePersonaInput, type Deliverable, type Environment, type EqualKArm, type EqualKOnCost, type EqualKOnCostOptions, type EqualKVerdict, type EvolutionArchiveNode, type EvolutionAuthor, type EvolutionCandidate, type EvolutionGeneration, type EvolutionReport, ExecCtx, type ExecutorConfig, ExecutorFactory, ExecutorRegistry, type Fanout, type FanoutOptions, type FanoutSynthesis, FileCorpus, FileResultBlobStore, FileSpawnJournal, type FlatWidenGate, type ForkCapableBox, type GitWorkspaceOptions, type HarvestCorpusOptions, type HarvestFailure, type HarvestReport, InMemoryCorpus, InMemoryResultBlobStore, InMemorySpawnJournal, type IntentAudit, Iteration, type LoopDispatchOptions, type LoopOptionsForDispatch, LoopResult, type LoopShape, LoopTokenUsage, type LoopUntil, type LoopUntilSpec, type LoopUntilState, type McpEndpoint, type McpEnvironmentOptions, NodeId, type Observation, type ObserveInput, type ObserveOptions, type OpenSandboxRunOptions, type Outcome, type Panel, type PanelJudge, type PanelSpec, type PanelVerdict, type Persona, type PersonaContext, type PersonaExecutors, type Pipeline, type PipelineStage, type PromotionGateOptions, type PromotionVerdict, type RenderCorpusToInstructions, type RenderCorpusToInstructionsOptions, type 
ReservationTicket, ResultBlobStore, RootHandle, type RouterSeam, type RouterToolsSeam, type RunAgenticOptions, RunLoopOptions, type RunPersonified, type RunPersonifiedOptions, type SandboxCapabilities, SandboxClient, type SandboxLineage, type SandboxLineageHandle, type SandboxRun, type SandboxSeam, Scope, type ScopeAnalyst, type ScopeAnalyzeInput, type ScopeWidenGate, type SessionCapableBox, Settled, type ShapeBudget, type ShapeContext, type ShapeRegistry, type Shell, type ShotPersona, type ShotSpec, SpawnEvent, SpawnJournal, Spend, type SteerContext, type Strategy, type StrategyCtx, type StrategyEvolutionConfig, type StrategyResult, SupervisedResult, Supervisor, type SurfaceScore, type ToolSpec, type TrajectoryNode, type TrajectoryReport, type TrajectoryReportFn, type TrajectoryReportOptions, TreeView, type TurnResult, UsageEvent, type UsageSink, type VerifierEnvironmentOptions, type Verify, type VerifySpec, type Widen, type WidenDecision, type WidenLineage, type WidenSpec, type Workspace, type WorkspaceCommit, acquireSandbox, adaptiveRefine, assertStrategyContract, assertTraceDerivedFindings, auditIntent, authorStrategy, breadthDriver, buildSteerContext, builtinShapes, contentAddress, createBudgetPool, createExecutor, createExecutorRegistry, createMcpEnvironment, createRootHandle, createSandboxLineage, createScope, createScopeAnalyst, createShapeRegistry, createSupervisor, createVerifierEnvironment, defaultAnalystInstruction, defaultAuditorInstruction, definePersona, defineStrategy, depthDriver, equalKOnCost, extractLlmCallEvent, fanout, flatWidenGate, gitWorkspace, harvestCorpus, inlineSandboxClient, jjWorkspace, localShell, loopDispatch, loopUntil, mapSandboxEvent, materializeTreeView, observe, openSandboxRun, panel, pipeline, printBenchmarkReport, probeSandboxCapabilities, promotionGate, refine, registerShape, renderCorpusToInstructions, renderReport, replaySpawnTree, reportLoopUsage, runAgentic, runBenchmark, runPersonified, runStrategyEvolution, sample, 
sampleThenRefine, selectChampion, settledToIteration, spendFromUsageEvents, strategyAuthorContract, trajectoryReport, verify, widen };
|
|
3085
|
+
export { Agent, AgentRunSpec, AgentSpec, type AgenticOptions, type AgenticRunResult, type AgenticSurface, type AgenticTask, type AgenticTool, type AnytimeReport, type AnytimeStrategySummary, type AnytimeTaskCurve, type ArtifactHandle, type AssertTraceDerivedFindings, type AuditIntentInput, type AuditIntentOptions, type AuthorStrategyOptions, type AuthoredStrategy, type BenchmarkCell, type BenchmarkConfig, type BenchmarkLift, type BenchmarkReport, type BenchmarkStrategySummary, type BenchmarkTaskRow, type BridgeSeam, Budget, type BudgetPool, type BudgetReadout, type ChampionPick, type ChampionPolicy, type CheckpointCapableBox, type CliSeam, type CombinatorShape, type Corpus, type CorpusFilter, type CorpusRecord, type CreateScopeAnalystOptions, type CriuCapableClient, type DefinePersona, type DefinePersonaInput, type Deliverable, type Environment, type EqualKArm, type EqualKOnCost, type EqualKOnCostOptions, type EqualKVerdict, type EvolutionArchiveNode, type EvolutionAuthor, type EvolutionBandInfo, type EvolutionCandidate, type EvolutionGeneration, type EvolutionReport, ExecCtx, type ExecutorConfig, ExecutorFactory, ExecutorRegistry, type Fanout, type FanoutOptions, type FanoutSynthesis, FileCorpus, FileResultBlobStore, FileSpawnJournal, type FlatWidenGate, type ForkCapableBox, type GitWorkspaceOptions, type HarvestCorpusOptions, type HarvestFailure, type HarvestReport, InMemoryCorpus, InMemoryResultBlobStore, InMemorySpawnJournal, type IntentAudit, Iteration, type LoopDispatchOptions, type LoopOptionsForDispatch, LoopResult, type LoopShape, LoopTokenUsage, type LoopUntil, type LoopUntilSpec, type LoopUntilState, type McpEndpoint, type McpEnvironmentOptions, NodeId, type Observation, type ObserveInput, type ObserveOptions, type OpenSandboxRunOptions, type Outcome, type Panel, type PanelJudge, type PanelSpec, type PanelVerdict, type Persona, type PersonaContext, type PersonaExecutors, type Pipeline, type PipelineStage, type PromotionGateOptions, type PromotionVerdict, 
type RenderCorpusToInstructions, type RenderCorpusToInstructionsOptions, type ReservationTicket, ResultBlobStore, RootHandle, type RouterSeam, type RouterToolsSeam, type RunAgenticOptions, RunLoopOptions, type RunPersonified, type RunPersonifiedOptions, type SandboxCapabilities, SandboxClient, type SandboxLineage, type SandboxLineageHandle, type SandboxRun, type SandboxSeam, Scope, type ScopeAnalyst, type ScopeAnalyzeInput, type ScopeWidenGate, type SessionCapableBox, Settled, type ShapeBudget, type ShapeContext, type ShapeRegistry, type Shell, type ShotPersona, type ShotSpec, SpawnEvent, SpawnJournal, Spend, type SteerContext, type Strategy, type StrategyCtx, type StrategyEvolutionConfig, type StrategyResult, SupervisedResult, Supervisor, type SurfaceScore, type ToolSpec, type TrajectoryNode, type TrajectoryReport, type TrajectoryReportFn, type TrajectoryReportOptions, TreeView, type TurnResult, UsageEvent, type UsageSink, type VerifierEnvironmentOptions, type Verify, type VerifySpec, type WaterfallCollector, type WaterfallReport, type WaterfallSpan, type Widen, type WidenDecision, type WidenLineage, type WidenSpec, type Workspace, type WorkspaceCommit, acquireSandbox, adaptiveRefine, anytimeReport, assertStrategyContract, assertTraceDerivedFindings, auditIntent, authorStrategy, breadthDriver, buildSteerContext, builtinShapes, contentAddress, createBudgetPool, createExecutor, createExecutorRegistry, createMcpEnvironment, createRootHandle, createSandboxLineage, createScope, createScopeAnalyst, createShapeRegistry, createSupervisor, createVerifierEnvironment, createWaterfallCollector, defaultAnalystInstruction, defaultAuditorInstruction, definePersona, defineStrategy, depthDriver, discriminatingMeans, equalKOnCost, extractLlmCallEvent, fanout, flatWidenGate, gitWorkspace, harvestCorpus, inlineSandboxClient, jjWorkspace, localShell, loopDispatch, loopUntil, mapSandboxEvent, materializeTreeView, observe, openSandboxRun, panel, pickChampion, pipeline, 
printBenchmarkReport, probeSandboxCapabilities, promotionGate, refine, registerShape, renderAnytimeTable, renderCorpusToInstructions, renderReport, replaySpawnTree, reportLoopUsage, runAgentic, runBenchmark, runPersonified, runStrategyEvolution, sample, sampleThenRefine, selectChampion, settledToIteration, spendFromUsageEvents, strategyAuthorContract, trajectoryReport, verify, widen };
|
package/dist/runtime.js
CHANGED
|
@@ -7,6 +7,7 @@ import {
|
|
|
7
7
|
InMemorySpawnJournal,
|
|
8
8
|
acquireSandbox,
|
|
9
9
|
adaptiveRefine,
|
|
10
|
+
anytimeReport,
|
|
10
11
|
assertStrategyContract,
|
|
11
12
|
assertTraceDerivedFindings,
|
|
12
13
|
auditIntent,
|
|
@@ -29,6 +30,7 @@ import {
|
|
|
29
30
|
createShapeRegistry,
|
|
30
31
|
createSupervisor,
|
|
31
32
|
createVerifierEnvironment,
|
|
33
|
+
createWaterfallCollector,
|
|
32
34
|
defaultAnalystInstruction,
|
|
33
35
|
defaultAuditorInstruction,
|
|
34
36
|
defaultSelectWinner,
|
|
@@ -36,6 +38,7 @@ import {
|
|
|
36
38
|
defineStrategy,
|
|
37
39
|
depthDriver,
|
|
38
40
|
deterministicCompletion,
|
|
41
|
+
discriminatingMeans,
|
|
39
42
|
equalKOnCost,
|
|
40
43
|
fanout,
|
|
41
44
|
flatWidenGate,
|
|
@@ -50,6 +53,7 @@ import {
|
|
|
50
53
|
observe,
|
|
51
54
|
openSandboxRun,
|
|
52
55
|
panel,
|
|
56
|
+
pickChampion,
|
|
53
57
|
pipeline,
|
|
54
58
|
printBenchmarkReport,
|
|
55
59
|
probeSandboxCapabilities,
|
|
@@ -57,6 +61,7 @@ import {
|
|
|
57
61
|
refine,
|
|
58
62
|
registerShape,
|
|
59
63
|
renderAnalyses,
|
|
64
|
+
renderAnytimeTable,
|
|
60
65
|
renderCorpusToInstructions,
|
|
61
66
|
renderReport,
|
|
62
67
|
replaySpawnTree,
|
|
@@ -77,7 +82,7 @@ import {
|
|
|
77
82
|
trajectoryReport,
|
|
78
83
|
verify,
|
|
79
84
|
widen
|
|
80
|
-
} from "./chunk-IW2LMLK6.js";
|
|
85
|
+
} from "./chunk-PXUTIMGJ.js";
|
|
81
86
|
import {
|
|
82
87
|
extractLlmCallEvent,
|
|
83
88
|
mapSandboxEvent
|
|
@@ -92,6 +97,7 @@ export {
|
|
|
92
97
|
InMemorySpawnJournal,
|
|
93
98
|
acquireSandbox,
|
|
94
99
|
adaptiveRefine,
|
|
100
|
+
anytimeReport,
|
|
95
101
|
assertStrategyContract,
|
|
96
102
|
assertTraceDerivedFindings,
|
|
97
103
|
auditIntent,
|
|
@@ -114,6 +120,7 @@ export {
|
|
|
114
120
|
createShapeRegistry,
|
|
115
121
|
createSupervisor,
|
|
116
122
|
createVerifierEnvironment,
|
|
123
|
+
createWaterfallCollector,
|
|
117
124
|
defaultAnalystInstruction,
|
|
118
125
|
defaultAuditorInstruction,
|
|
119
126
|
defaultSelectWinner,
|
|
@@ -121,6 +128,7 @@ export {
|
|
|
121
128
|
defineStrategy,
|
|
122
129
|
depthDriver,
|
|
123
130
|
deterministicCompletion,
|
|
131
|
+
discriminatingMeans,
|
|
124
132
|
equalKOnCost,
|
|
125
133
|
extractLlmCallEvent,
|
|
126
134
|
fanout,
|
|
@@ -137,6 +145,7 @@ export {
|
|
|
137
145
|
observe,
|
|
138
146
|
openSandboxRun,
|
|
139
147
|
panel,
|
|
148
|
+
pickChampion,
|
|
140
149
|
pipeline,
|
|
141
150
|
printBenchmarkReport,
|
|
142
151
|
probeSandboxCapabilities,
|
|
@@ -144,6 +153,7 @@ export {
|
|
|
144
153
|
refine,
|
|
145
154
|
registerShape,
|
|
146
155
|
renderAnalyses,
|
|
156
|
+
renderAnytimeTable,
|
|
147
157
|
renderCorpusToInstructions,
|
|
148
158
|
renderReport,
|
|
149
159
|
replaySpawnTree,
|
package/dist/workflow.js
CHANGED
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@tangle-network/agent-runtime",
|
|
3
|
-
"version": "0.48.0",
|
|
3
|
+
"version": "0.49.0",
|
|
4
4
|
"description": "Shared task-lifecycle skeleton for agents: a recursive loop kernel for chat turns, one-shot tasks, and multi-attempt loops, with trace capture and eval-gated self-improvement. Domain behavior lives in adapters; scoring and ship-gates in @tangle-network/agent-eval.",
|
|
5
5
|
"homepage": "https://github.com/tangle-network/agent-runtime#readme",
|
|
6
6
|
"repository": {
|
|
@@ -95,14 +95,14 @@
|
|
|
95
95
|
"test:watch": "vitest",
|
|
96
96
|
"lint": "biome check src tests examples",
|
|
97
97
|
"lint:fix": "biome check --write src tests examples",
|
|
98
|
-
"typecheck": "tsc --noEmit",
|
|
98
|
+
"typecheck": "tsc --noEmit && pnpm run typecheck:examples",
|
|
99
99
|
"typecheck:examples": "tsc --noEmit -p tsconfig.examples.json",
|
|
100
100
|
"verify:package": "node scripts/verify-package-exports.mjs"
|
|
101
101
|
},
|
|
102
102
|
"devDependencies": {
|
|
103
103
|
"@biomejs/biome": "^2.4.0",
|
|
104
104
|
"@tangle-network/agent-eval": "^0.89.0",
|
|
105
|
-
"@tangle-network/sandbox": "^0.
|
|
105
|
+
"@tangle-network/sandbox": "^0.6.0",
|
|
106
106
|
"@types/node": "^25.6.0",
|
|
107
107
|
"playwright": "^1.40.0",
|
|
108
108
|
"tsup": "^8.0.0",
|
|
@@ -112,7 +112,8 @@
|
|
|
112
112
|
"pnpm": {
|
|
113
113
|
"minimumReleaseAge": 4320,
|
|
114
114
|
"minimumReleaseAgeExclude": [
|
|
115
|
-
"@tangle-network/agent-eval"
|
|
115
|
+
"@tangle-network/agent-eval",
|
|
116
|
+
"@tangle-network/sandbox"
|
|
116
117
|
],
|
|
117
118
|
"onlyBuiltDependencies": [
|
|
118
119
|
"esbuild"
|
|
@@ -126,7 +127,7 @@
|
|
|
126
127
|
"peerDependencies": {
|
|
127
128
|
"@tangle-network/agent-eval": ">=0.83.0 <1.0.0",
|
|
128
129
|
"@tangle-network/agent-knowledge": ">=1.3.0 <2.0.0",
|
|
129
|
-
"@tangle-network/sandbox": ">=0.1.2 <0.
|
|
130
|
+
"@tangle-network/sandbox": ">=0.1.2 <0.7.0",
|
|
130
131
|
"playwright": "^1.40.0"
|
|
131
132
|
},
|
|
132
133
|
"peerDependenciesMeta": {
|