@tangle-network/agent-runtime 0.44.0 → 0.46.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +95 -203
- package/dist/agent.d.ts +3 -2
- package/dist/agent.js +5 -7
- package/dist/agent.js.map +1 -1
- package/dist/analyst-loop.d.ts +28 -2
- package/dist/analyst-loop.js +4 -1
- package/dist/audit.d.ts +93 -0
- package/dist/audit.js +312 -0
- package/dist/audit.js.map +1 -0
- package/dist/chunk-4B6U4CVQ.js +15 -0
- package/dist/chunk-4B6U4CVQ.js.map +1 -0
- package/dist/chunk-65FQLI4V.js +4089 -0
- package/dist/chunk-65FQLI4V.js.map +1 -0
- package/dist/{chunk-GFKVVRQ7.js → chunk-GN75RGM6.js} +13 -12
- package/dist/chunk-GN75RGM6.js.map +1 -0
- package/dist/chunk-GSUO5QS6.js +146 -0
- package/dist/chunk-GSUO5QS6.js.map +1 -0
- package/dist/chunk-HNUXAZIJ.js +580 -0
- package/dist/chunk-HNUXAZIJ.js.map +1 -0
- package/dist/{chunk-SKUZZCHE.js → chunk-I42NHLKX.js} +5 -5
- package/dist/chunk-I42NHLKX.js.map +1 -0
- package/dist/{chunk-HVYOHJHK.js → chunk-JNPK46YH.js} +2 -2
- package/dist/chunk-JNPK46YH.js.map +1 -0
- package/dist/{chunk-3HMHSN22.js → chunk-KADIJAD4.js} +38 -24
- package/dist/chunk-KADIJAD4.js.map +1 -0
- package/dist/{chunk-KDMRUD2P.js → chunk-KPN7OQ64.js} +296 -8
- package/dist/chunk-KPN7OQ64.js.map +1 -0
- package/dist/{chunk-NRZOXCJK.js → chunk-VR4JIC5H.js} +2 -2
- package/dist/chunk-WIR4HOOJ.js +27 -0
- package/dist/chunk-WIR4HOOJ.js.map +1 -0
- package/dist/coder-DCWFQpmJ.d.ts +114 -0
- package/dist/driver-C-mtBo7h.d.ts +221 -0
- package/dist/improvement.d.ts +0 -1
- package/dist/improvement.js +0 -5
- package/dist/improvement.js.map +1 -1
- package/dist/index.d.ts +122 -9
- package/dist/index.js +398 -10
- package/dist/index.js.map +1 -1
- package/dist/{kb-gate-D0ZIhFOU.d.ts → kb-gate-2Gwpz_27.d.ts} +86 -9
- package/dist/{loop-runner-bin-BLMa8He3.d.ts → loop-runner-bin-D-K6bRp3.d.ts} +17 -13
- package/dist/loop-runner-bin.d.ts +8 -6
- package/dist/loop-runner-bin.js +6 -8
- package/dist/loops.d.ts +7 -393
- package/dist/loops.js +96 -27
- package/dist/mcp/bin.js +7 -7
- package/dist/mcp/bin.js.map +1 -1
- package/dist/mcp/index.d.ts +286 -13
- package/dist/mcp/index.js +341 -9
- package/dist/mcp/index.js.map +1 -1
- package/dist/{otel-export-wFDmmurL.d.ts → otel-export-nurzFwuJ.d.ts} +1 -1
- package/dist/profiles.d.ts +385 -86
- package/dist/profiles.js +549 -4
- package/dist/profiles.js.map +1 -1
- package/dist/{run-loop-C4L1Sted.d.ts → run-loop-CU2Y00Si.d.ts} +36 -13
- package/dist/runtime-hooks-C7JwKb9E.d.ts +70 -0
- package/dist/runtime.d.ts +1964 -0
- package/dist/runtime.js +114 -0
- package/dist/runtime.js.map +1 -0
- package/dist/substrate-CUgk7F7s.d.ts +77 -0
- package/dist/topology.d.ts +73 -0
- package/dist/topology.js +111 -0
- package/dist/topology.js.map +1 -0
- package/dist/types-BfoeiQRZ.d.ts +438 -0
- package/dist/{types-DbJzz2uf.d.ts → types-DnYoHvvZ.d.ts} +110 -4
- package/dist/workflow.d.ts +4 -3
- package/dist/workflow.js +4 -5
- package/dist/workflow.js.map +1 -1
- package/package.json +37 -28
- package/skills/agent-runtime-adoption/SKILL.md +32 -29
- package/skills/generate-eval/SKILL.md +60 -0
- package/dist/chunk-3HMHSN22.js.map +0 -1
- package/dist/chunk-GFKVVRQ7.js.map +0 -1
- package/dist/chunk-HVYOHJHK.js.map +0 -1
- package/dist/chunk-KDMRUD2P.js.map +0 -1
- package/dist/chunk-PY6NMZYX.js +0 -52
- package/dist/chunk-PY6NMZYX.js.map +0 -1
- package/dist/chunk-S7JXV32P.js +0 -947
- package/dist/chunk-S7JXV32P.js.map +0 -1
- package/dist/chunk-SKUZZCHE.js.map +0 -1
- package/dist/chunk-SQSCRJ7U.js +0 -65
- package/dist/chunk-SQSCRJ7U.js.map +0 -1
- package/dist/chunk-VOX6Z3II.js +0 -90
- package/dist/chunk-VOX6Z3II.js.map +0 -1
- package/dist/chunk-XBUG326M.js +0 -261
- package/dist/chunk-XBUG326M.js.map +0 -1
- package/dist/dynamic-wUgp6UKs.d.ts +0 -108
- package/dist/optimize-prompt-D-urF2wW.d.ts +0 -129
- /package/dist/{chunk-NRZOXCJK.js.map → chunk-VR4JIC5H.js.map} +0 -0
|
@@ -0,0 +1,114 @@
|
|
|
1
|
+
import { AgentProfile } from '@tangle-network/sandbox';
|
|
2
|
+
import { O as OutputAdapter, V as Validator, A as AgentRunSpec, D as Driver } from './types-DnYoHvvZ.js';
|
|
3
|
+
|
|
4
|
+
/**
|
|
5
|
+
* @experimental
|
|
6
|
+
*
|
|
7
|
+
* `coderProfile` — opinionated preset for code-modification tasks.
|
|
8
|
+
*
|
|
9
|
+
* The agent is told to:
|
|
10
|
+
* - work on a fresh branch inside the sandbox workspace
|
|
11
|
+
* - keep the patch minimal (under `maxDiffLines`)
|
|
12
|
+
* - avoid `forbiddenPaths`
|
|
13
|
+
* - run `testCmd` and `typecheckCmd`
|
|
14
|
+
* - emit a final JSON result the output adapter parses
|
|
15
|
+
*
|
|
16
|
+
* The profile is stateless and agent-agnostic — `harness` selects the
|
|
17
|
+
* sandbox-SDK backend (`claude-code`, `codex`, `opencode/*`). For
|
|
18
|
+
* heterogeneous fanout, use `multiHarnessCoderFanout`.
|
|
19
|
+
*/
|
|
20
|
+
|
|
21
|
+
/** @experimental */
|
|
22
|
+
interface CoderTask {
|
|
23
|
+
/** What the agent must accomplish. Free-form prose. */
|
|
24
|
+
goal: string;
|
|
25
|
+
/** Absolute path inside the sandbox where the repo lives. */
|
|
26
|
+
repoRoot: string;
|
|
27
|
+
/** Default `main`. The branch the agent diffs against. */
|
|
28
|
+
baseBranch?: string;
|
|
29
|
+
/** Default `pnpm test --run`. */
|
|
30
|
+
testCmd?: string;
|
|
31
|
+
/** Default `pnpm typecheck`. */
|
|
32
|
+
typecheckCmd?: string;
|
|
33
|
+
/** Files the agent may inspect for context. Surfaced verbatim in the prompt. */
|
|
34
|
+
contextFiles?: string[];
|
|
35
|
+
/**
|
|
36
|
+
* Paths the agent must not touch. Validator hard-fails on any match.
|
|
37
|
+
* Use glob-free literal path prefixes for unambiguous enforcement.
|
|
38
|
+
*/
|
|
39
|
+
forbiddenPaths?: string[];
|
|
40
|
+
/** Default 400. Hard cap; validator hard-fails when exceeded. */
|
|
41
|
+
maxDiffLines?: number;
|
|
42
|
+
}
|
|
43
|
+
/** @experimental */
|
|
44
|
+
interface CoderOutput {
|
|
45
|
+
/** Branch the agent wrote the patch on. */
|
|
46
|
+
branch: string;
|
|
47
|
+
/** Unified diff (`git diff <base>..HEAD`). */
|
|
48
|
+
patch: string;
|
|
49
|
+
testResult: {
|
|
50
|
+
passed: boolean;
|
|
51
|
+
output: string;
|
|
52
|
+
};
|
|
53
|
+
typecheckResult: {
|
|
54
|
+
passed: boolean;
|
|
55
|
+
output: string;
|
|
56
|
+
};
|
|
57
|
+
diffStats: {
|
|
58
|
+
filesChanged: number;
|
|
59
|
+
insertions: number;
|
|
60
|
+
deletions: number;
|
|
61
|
+
};
|
|
62
|
+
/** Optional reviewer commentary surfaced by the agent. */
|
|
63
|
+
reviewerNotes?: string;
|
|
64
|
+
}
|
|
65
|
+
/** @experimental */
|
|
66
|
+
interface CoderProfileOptions {
|
|
67
|
+
/** Sandbox-SDK backend.type. Default `'claude-code'`. */
|
|
68
|
+
harness?: string;
|
|
69
|
+
/** Default model id passed in `AgentProfile.model.default`. */
|
|
70
|
+
model?: string;
|
|
71
|
+
/** Custom system prompt replacement. Default = built-in coder preset. */
|
|
72
|
+
systemPrompt?: string;
|
|
73
|
+
/** Stable name for `AgentRunSpec.name`. Default = `coder-${harness}`. */
|
|
74
|
+
name?: string;
|
|
75
|
+
}
|
|
76
|
+
/**
|
|
77
|
+
* Build a coder preset.
|
|
78
|
+
*
|
|
79
|
+
* `validator` enforces test + typecheck + a 400-line default diff cap. For
|
|
80
|
+
* per-task `forbiddenPaths` / `maxDiffLines` enforcement, pass `task` here
|
|
81
|
+
* — the returned validator closes over its constraints. Without a task
|
|
82
|
+
* the validator falls back to the default cap and skips path enforcement.
|
|
83
|
+
*
|
|
84
|
+
* @experimental
|
|
85
|
+
*/
|
|
86
|
+
declare function coderProfile(options?: CoderProfileOptions & {
|
|
87
|
+
task?: CoderTask;
|
|
88
|
+
}): {
|
|
89
|
+
profile: AgentProfile;
|
|
90
|
+
taskToPrompt: (task: CoderTask) => string;
|
|
91
|
+
output: OutputAdapter<CoderOutput>;
|
|
92
|
+
validator: Validator<CoderOutput>;
|
|
93
|
+
agentRunSpec: AgentRunSpec<CoderTask>;
|
|
94
|
+
};
|
|
95
|
+
/** @experimental */
|
|
96
|
+
interface MultiHarnessCoderFanoutOptions {
|
|
97
|
+
/**
|
|
98
|
+
* Sandbox-SDK backend.type identifiers, one per parallel agent. Default:
|
|
99
|
+
* `['claude-code', 'codex', 'opencode/zai-coding-plan/glm-5.1']`.
|
|
100
|
+
*/
|
|
101
|
+
harnesses?: string[];
|
|
102
|
+
/** Optional per-harness model override. Indexed parallel to `harnesses`. */
|
|
103
|
+
models?: (string | undefined)[];
|
|
104
|
+
}
|
|
105
|
+
/** @experimental */
|
|
106
|
+
declare function multiHarnessCoderFanout(options?: MultiHarnessCoderFanoutOptions): {
|
|
107
|
+
agentRuns: AgentRunSpec<CoderTask>[];
|
|
108
|
+
output: OutputAdapter<CoderOutput>;
|
|
109
|
+
validator: Validator<CoderOutput>;
|
|
110
|
+
driver: Driver<CoderTask, CoderOutput, 'pick-winner' | 'fail'>;
|
|
111
|
+
};
|
|
112
|
+
declare function createCoderValidator(task: CoderTask): Validator<CoderOutput>;
|
|
113
|
+
|
|
114
|
+
export { type CoderOutput as C, type MultiHarnessCoderFanoutOptions as M, type CoderTask as a, type CoderProfileOptions as b, coderProfile as c, createCoderValidator as d, multiHarnessCoderFanout as m };
|
|
@@ -0,0 +1,221 @@
|
|
|
1
|
+
import { AnalystFinding } from '@tangle-network/agent-eval';
|
|
2
|
+
import { I as Iteration, D as Driver } from './types-DnYoHvvZ.js';
|
|
3
|
+
|
|
4
|
+
/**
|
|
5
|
+
* @experimental
|
|
6
|
+
*
|
|
7
|
+
* Completion / satisfiability — the OTHER output of the pluggable analyst (the steer output
|
|
8
|
+
* is `AnalystFinding[]` via the `analyze` hook; this is the "is it done?" output via the
|
|
9
|
+
* `complete` hook). A `CompletionAnalyst` reads a node's trace and returns a `CompletionVerdict`
|
|
10
|
+
* the PARENT (driver) acts on: end the node, or keep going. It fits ANY node and composes to
|
|
11
|
+
* any depth — a 1-deep loop has one; an N-deep tree has one per node.
|
|
12
|
+
*
|
|
13
|
+
* The verdict's authority scales with its DETERMINISM (the thing that varies by task):
|
|
14
|
+
* - `deterministic` — build/test/lint pass, a proof checks, every claim's citation resolves:
|
|
15
|
+
* ground truth, the driver TRUSTS it and ends. Not an opinion.
|
|
16
|
+
* - `probabilistic` — a quality/soundness judgment (marketing, "the experiment is sound"):
|
|
17
|
+
* ADVISORY. It passes to the driver with its reasons; the driver validates (here: a
|
|
18
|
+
* confidence threshold; a richer driver may re-examine the reasons) before ending.
|
|
19
|
+
*
|
|
20
|
+
* Two stop-signal mechanisms, by node mode, both → one `CompletionVerdict`:
|
|
21
|
+
* - sandbox-agent (text stream): a unique per-node STOP SENTINEL the agent emits when done
|
|
22
|
+
* (`stopSentinel` / `sentinelCompletion`) — ralph-loop style; the seed makes it
|
|
23
|
+
* unguessable + attributable, so it can't be spuriously emitted or confused with content.
|
|
24
|
+
* - deterministic check (compile/test/citation/proof): `deterministicCompletion(check)` —
|
|
25
|
+
* a verifier over the output, never the judge verdict (selector ≠ judge holds).
|
|
26
|
+
*/
|
|
27
|
+
|
|
28
|
+
/** Trace-derived evidence for a completion claim — an artifact (output) or a verifier metric,
|
|
29
|
+
* never the judge's own verdict. Mirrors the steer-firewall's provenance discipline. */
|
|
30
|
+
interface CompletionEvidence {
|
|
31
|
+
kind: 'artifact' | 'metric';
|
|
32
|
+
uri: string;
|
|
33
|
+
}
|
|
34
|
+
/** The "is it done?" verdict an analyst returns to the parent. */
|
|
35
|
+
interface CompletionVerdict {
|
|
36
|
+
done: boolean;
|
|
37
|
+
/** How verifiable the claim is — sets whether the driver trusts it or validates it. */
|
|
38
|
+
determinism: 'deterministic' | 'probabilistic';
|
|
39
|
+
/** Why the analyst believes it is (or isn't) done — what the driver validates. */
|
|
40
|
+
reasons?: string;
|
|
41
|
+
/** 0..1, for probabilistic verdicts; the driver's validation threshold reads this. */
|
|
42
|
+
confidence?: number;
|
|
43
|
+
evidence?: ReadonlyArray<CompletionEvidence>;
|
|
44
|
+
}
|
|
45
|
+
/** Reads a node's trace → a completion verdict. Same input shape as the `analyze` hook, so
|
|
46
|
+
* ONE analyst node can back both channels (findings for steer, a verdict for stop). */
|
|
47
|
+
interface CompletionAnalyst<Task, Output> {
|
|
48
|
+
assess(input: {
|
|
49
|
+
task: Task;
|
|
50
|
+
history: ReadonlyArray<Iteration<Task, Output>>;
|
|
51
|
+
}): CompletionVerdict | Promise<CompletionVerdict>;
|
|
52
|
+
}
|
|
53
|
+
/** When a verdict authorizes the driver to END. Deterministic → trust (ground truth);
|
|
54
|
+
* probabilistic → validate by confidence threshold (the driver's check). */
|
|
55
|
+
interface CompletionPolicy {
|
|
56
|
+
/** Minimum confidence a PROBABILISTIC verdict must clear to end. Default 0.8. */
|
|
57
|
+
minConfidence?: number;
|
|
58
|
+
}
|
|
59
|
+
declare function completionAuthorizes(v: CompletionVerdict, policy?: CompletionPolicy): boolean;
|
|
60
|
+
/**
|
|
61
|
+
* A unique, attributable stop sentinel for a node (ralph-loop style). Deterministic from the
|
|
62
|
+
* seed (no Math.random — reproducible + attributable to the node); the agent is instructed to
|
|
63
|
+
* emit it VERBATIM when it judges itself done. Unguessable enough that content never trips it.
|
|
64
|
+
*/
|
|
65
|
+
declare function stopSentinel(seed: string): string;
|
|
66
|
+
/**
|
|
67
|
+
* Completion for a sandbox-agent node: done iff the latest output carries the node's stop
|
|
68
|
+
* sentinel. PROBABILISTIC (the agent's own self-judgment) — the driver validates it.
|
|
69
|
+
*/
|
|
70
|
+
declare function sentinelCompletion<Task>(sentinel: string, opts?: {
|
|
71
|
+
confidence?: number;
|
|
72
|
+
}): CompletionAnalyst<Task, string>;
|
|
73
|
+
/**
|
|
74
|
+
* Completion for a DETERMINISTIC check (build/test/lint/citation/proof): done iff the check
|
|
75
|
+
* passes. Ground truth — the driver ends directly, no validation. The check reads the output
|
|
76
|
+
* (a verifier), never the judge verdict — selector ≠ judge stays intact.
|
|
77
|
+
*/
|
|
78
|
+
declare function deterministicCompletion<Task, Output>(check: (output: Output, history: ReadonlyArray<Iteration<Task, Output>>) => {
|
|
79
|
+
passed: boolean;
|
|
80
|
+
reasons?: string;
|
|
81
|
+
}): CompletionAnalyst<Task, Output>;
|
|
82
|
+
|
|
83
|
+
/**
|
|
84
|
+
* @experimental
|
|
85
|
+
*
|
|
86
|
+
* Dynamic driver — the agent authors the loop topology at runtime.
|
|
87
|
+
*
|
|
88
|
+
* Where a fixed-shape driver encodes one topology as a pure function of
|
|
89
|
+
* history, this driver delegates the per-round shape to an injected
|
|
90
|
+
* `TopologyPlanner`. Each round the planner inspects the task + iteration
|
|
91
|
+
* history and emits one `TopologyMove`:
|
|
92
|
+
* - `refine` → one task next round (optionally rewritten from the prior attempt)
|
|
93
|
+
* - `fanout` → N tasks next round (the kernel round-robins `agentRuns`, so a
|
|
94
|
+
* 2-harness fanout dispatches branch 0 to harness A and branch 1 to harness B)
|
|
95
|
+
* - `stop` → terminate; the kernel selects the winner across all iterations
|
|
96
|
+
*
|
|
97
|
+
* The planner is the brain; this driver is the structure. It maps moves onto
|
|
98
|
+
* the kernel's `plan`/`decide` contract, enforces the iteration + fanout caps,
|
|
99
|
+
* and fails loud on a malformed move. The planner is injected — so a test can
|
|
100
|
+
* drive a deterministic policy through the real kernel, and production can wire
|
|
101
|
+
* it to an LLM-backed, agent-authored planner.
|
|
102
|
+
*
|
|
103
|
+
* Topology is orthogonal to harness: the planner never names a backend. Which
|
|
104
|
+
* harness runs a branch is decided by the `AgentRunSpec` the kernel round-robins
|
|
105
|
+
* to, so one dynamic driver works across claude-code, codex, opencode, pi —
|
|
106
|
+
* including fanning a single round across several at once.
|
|
107
|
+
*/
|
|
108
|
+
|
|
109
|
+
/** Terminal once `decide` returns `'done'` (a kernel terminal decision). */
|
|
110
|
+
type DriverDecision = 'continue' | 'done';
|
|
111
|
+
/**
|
|
112
|
+
* One topology decision for the next round. `fanout` carries explicit tasks
|
|
113
|
+
* rather than a count so the planner can issue heterogeneous branches (a
|
|
114
|
+
* different sub-task per harness); pass N copies of one task for a homogeneous
|
|
115
|
+
* fanout that relies on `agentRuns` diversity instead.
|
|
116
|
+
*
|
|
117
|
+
* @experimental
|
|
118
|
+
*/
|
|
119
|
+
type TopologyMove<Task> = {
|
|
120
|
+
kind: 'refine';
|
|
121
|
+
task: Task;
|
|
122
|
+
rationale?: string;
|
|
123
|
+
parentIndex?: number;
|
|
124
|
+
} | {
|
|
125
|
+
kind: 'fanout';
|
|
126
|
+
tasks: Task[];
|
|
127
|
+
rationale?: string;
|
|
128
|
+
parentIndex?: number;
|
|
129
|
+
} | {
|
|
130
|
+
kind: 'stop';
|
|
131
|
+
rationale?: string;
|
|
132
|
+
} | {
|
|
133
|
+
kind: 'select';
|
|
134
|
+
index: number;
|
|
135
|
+
rationale?: string;
|
|
136
|
+
};
|
|
137
|
+
/** @experimental */
|
|
138
|
+
interface PlannerContext<Task, Output> {
|
|
139
|
+
/** The root task the loop was invoked with — stable across rounds. */
|
|
140
|
+
task: Task;
|
|
141
|
+
/** Every iteration so far, in dispatch order, with outputs + verdicts. */
|
|
142
|
+
history: ReadonlyArray<Iteration<Task, Output>>;
|
|
143
|
+
/** `history.length` — iterations already spent. */
|
|
144
|
+
iterationsSpent: number;
|
|
145
|
+
/** Iterations left before the driver's `maxIterations` cap forces a stop. */
|
|
146
|
+
iterationsRemaining: number;
|
|
147
|
+
/**
|
|
148
|
+
* Trace-analyst findings about the attempts so far — populated only when an
|
|
149
|
+
* `analyze` hook is wired into the driver (see CreateDriverOptions).
|
|
150
|
+
* This is the channel that lets the planner steer from the DIAGNOSIS
|
|
151
|
+
* (`f(trace, findings)`), not the verdict score alone. Undefined = no analyst
|
|
152
|
+
* wired (the planner runs exactly as before). @experimental
|
|
153
|
+
*/
|
|
154
|
+
analyses?: ReadonlyArray<AnalystFinding>;
|
|
155
|
+
}
|
|
156
|
+
/**
|
|
157
|
+
* Chooses the next topology move from the task + history. Sync or async; an
|
|
158
|
+
* async planner is where an LLM call goes (an agent-authored topology planner).
|
|
159
|
+
*
|
|
160
|
+
* @experimental
|
|
161
|
+
*/
|
|
162
|
+
type TopologyPlanner<Task, Output> = (ctx: PlannerContext<Task, Output>) => TopologyMove<Task> | Promise<TopologyMove<Task>>;
|
|
163
|
+
/**
|
|
164
|
+
* Input to the optional `analyze` hook: the root task + the trace so far. The
|
|
165
|
+
* hook turns this into `AnalystFinding[]` — the caller's seam to `runAnalystLoop`.
|
|
166
|
+
* @experimental
|
|
167
|
+
*/
|
|
168
|
+
interface AnalyzeInput<Task, Output> {
|
|
169
|
+
task: Task;
|
|
170
|
+
history: ReadonlyArray<Iteration<Task, Output>>;
|
|
171
|
+
}
|
|
172
|
+
/** @experimental */
|
|
173
|
+
interface CreateDriverOptions<Task, Output> {
|
|
174
|
+
/** The agent-authored topology policy. Invoked once per round in `plan`. */
|
|
175
|
+
planner: TopologyPlanner<Task, Output>;
|
|
176
|
+
/**
|
|
177
|
+
* Optional trace-analyst hook. When set, the driver calls it each round AFTER
|
|
178
|
+
* the first (a trace must exist) and BEFORE the planner, then passes the
|
|
179
|
+
* findings to the planner via `PlannerContext.analyses` — so the planner
|
|
180
|
+
* decides from the diagnosis, not the verdict score alone. This is the seam to
|
|
181
|
+
* `runAnalystLoop`; it lives on the driver so `run-loop` stays analyst-free
|
|
182
|
+
* (the layering rule). Fail-loud: a throwing or non-array hook aborts the round
|
|
183
|
+
* (no silent empty findings).
|
|
184
|
+
*/
|
|
185
|
+
analyze?: (input: AnalyzeInput<Task, Output>) => ReadonlyArray<AnalystFinding> | Promise<ReadonlyArray<AnalystFinding>>;
|
|
186
|
+
/**
|
|
187
|
+
* Optional completion analyst — the DEPLOYABLE, non-oracle stop. Each round (after a
|
|
188
|
+
* trace exists) the driver asks "is it done?"; if the verdict AUTHORIZES ending
|
|
189
|
+
* (deterministic = trust ground truth; probabilistic = clears `completionPolicy`'s
|
|
190
|
+
* confidence), the driver stops BEFORE consulting the planner. This is the satisfiability
|
|
191
|
+
* primitive — usable at 1 deep, composing to any depth (one per node). Fail-loud: a
|
|
192
|
+
* throwing or non-verdict assess aborts the round. Distinct from `analyze` (the steer
|
|
193
|
+
* channel) though one analyst node may back both.
|
|
194
|
+
*/
|
|
195
|
+
complete?: CompletionAnalyst<Task, Output>;
|
|
196
|
+
/** Validation policy for a probabilistic completion verdict (the driver's check). */
|
|
197
|
+
completionPolicy?: CompletionPolicy;
|
|
198
|
+
/**
|
|
199
|
+
* Hard safety cap on total iterations. When reached, the driver stops before
|
|
200
|
+
* consulting the planner. Default 8. Set the kernel's `runLoop`
|
|
201
|
+
* `maxIterations >= ` this so the driver's cap governs and the loop closes on
|
|
202
|
+
* a clean `'done'` rather than a truncated `'continue'`.
|
|
203
|
+
*/
|
|
204
|
+
maxIterations?: number;
|
|
205
|
+
/** Max branches a single `fanout` move may dispatch. Default 4. */
|
|
206
|
+
maxFanout?: number;
|
|
207
|
+
/** Stable identifier surfaced in trace events. Default `'dynamic'`. */
|
|
208
|
+
name?: string;
|
|
209
|
+
}
|
|
210
|
+
/** @experimental */
|
|
211
|
+
declare function createDriver<Task, Output>(options: CreateDriverOptions<Task, Output>): Driver<Task, Output, DriverDecision>;
|
|
212
|
+
/**
|
|
213
|
+
* Compact, planner-facing rendering of trace-analyst findings — the diagnosis the
|
|
214
|
+
* planner steers from. Empty input renders to '' (callers omit the section). Shows
|
|
215
|
+
* severity·area·claim·recommended_action·confidence; raw evidence_refs/metadata are
|
|
216
|
+
* for renderers that know the analyst, not the topology decision.
|
|
217
|
+
* @experimental
|
|
218
|
+
*/
|
|
219
|
+
declare function renderAnalyses(findings: ReadonlyArray<AnalystFinding>): string;
|
|
220
|
+
|
|
221
|
+
export { type AnalyzeInput as A, type CreateDriverOptions as C, type DriverDecision as D, type PlannerContext as P, type TopologyPlanner as T, type CompletionAnalyst as a, type CompletionEvidence as b, type CompletionPolicy as c, type CompletionVerdict as d, type TopologyMove as e, completionAuthorizes as f, createDriver as g, deterministicCompletion as h, stopSentinel as i, renderAnalyses as r, sentinelCompletion as s };
|
package/dist/improvement.d.ts
CHANGED
|
@@ -1,7 +1,6 @@
|
|
|
1
1
|
import { AnalystFinding } from '@tangle-network/agent-eval';
|
|
2
2
|
import { L as LocalHarness, r as runLocalHarness } from './local-harness-KrdFTY5R.js';
|
|
3
3
|
import { LabeledScenarioStore, WorktreeAdapter, ImprovementDriver } from '@tangle-network/agent-eval/campaign';
|
|
4
|
-
export { O as OptimizePromptOptions, b as OptimizePromptReflection, a as OptimizePromptResult, o as optimizePrompt } from './optimize-prompt-D-urF2wW.js';
|
|
5
4
|
import { S as SurfaceImprovementEdit } from './improvement-adapter-BC4HhuAR.js';
|
|
6
5
|
import { I as ImprovementAdapter } from './types-p8dWBIXL.js';
|
|
7
6
|
import 'node:child_process';
|
package/dist/improvement.js
CHANGED
|
@@ -1,10 +1,6 @@
|
|
|
1
1
|
import {
|
|
2
2
|
runLocalHarness
|
|
3
3
|
} from "./chunk-GLR25NG7.js";
|
|
4
|
-
import {
|
|
5
|
-
optimizePrompt
|
|
6
|
-
} from "./chunk-VOX6Z3II.js";
|
|
7
|
-
import "./chunk-SQSCRJ7U.js";
|
|
8
4
|
import "./chunk-DGUM43GV.js";
|
|
9
5
|
|
|
10
6
|
// src/improvement/agentic-generator.ts
|
|
@@ -160,7 +156,6 @@ function applyPatch(patch, cwd) {
|
|
|
160
156
|
export {
|
|
161
157
|
agenticGenerator,
|
|
162
158
|
improvementDriver,
|
|
163
|
-
optimizePrompt,
|
|
164
159
|
reflectiveGenerator
|
|
165
160
|
};
|
|
166
161
|
//# sourceMappingURL=improvement.js.map
|
package/dist/improvement.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"sources":["../src/improvement/agentic-generator.ts","../src/improvement/improvement-driver.ts","../src/improvement/reflective-generator.ts"],"sourcesContent":["/**\n * @experimental\n *\n * `agenticGenerator` — the full-agentic `CandidateGenerator`: the\n * `shots=N, sandbox=on` setting of the one `improvementDriver`. It runs a real\n * coding harness (claude / codex / opencode) inside the candidate worktree the\n * driver already created, letting the agent read the codebase + the research\n * report and make the change in place. The driver then commits the worktree\n * into a `CodeSurface`.\n *\n * Mechanism: identical to the proven Phase-2.8 in-process executor — spawn the\n * harness as a subprocess with `cwd` = the worktree, on the same filesystem,\n * so edits land in place (no sandbox-mount round-trip). `runLocalHarness` is\n * the verified primitive. The OUTER sandbox is the improvement loop's own\n * execution context; the generator does not nest a second sandbox per\n * candidate (which would reintroduce a host↔sandbox worktree-transport\n * problem that does not need solving here).\n *\n * `maxShots` is the DEPTH dial: the harness runs once; if it produced no change\n * (the worktree stays clean), the generator refines the prompt and retries, up\n * to `maxShots` times. A harness that already changed files returns on shot 1.\n */\n\nimport { spawnSync } from 'node:child_process'\nimport type { AnalystFinding } from '@tangle-network/agent-eval'\nimport { type LocalHarness, runLocalHarness } from '../mcp/local-harness'\nimport type { CandidateGenerator } from './improvement-driver'\n\nexport interface AgenticGeneratorOptions {\n /** Local coding harness to run in the worktree. Default `claude`. */\n harness?: LocalHarness\n /** Per-shot wall-clock timeout (ms). Default = `runLocalHarness` default (5m). */\n timeoutMs?: number\n /** Build the harness task prompt from the report + findings. Override for\n * domain phrasing; the default turns findings into a concrete coder task. */\n buildPrompt?: (args: { report: unknown; findings: AnalystFinding[] }) => string\n /** Test seam — inject the harness runner (defaults to `runLocalHarness`). */\n runHarness?: typeof runLocalHarness\n /** Test seam — inject the worktree-dirty check (defaults to `git status`). */\n isDirty?: (worktreePath: string) => boolean\n}\n\nexport function agenticGenerator(opts: AgenticGeneratorOptions = {}): CandidateGenerator {\n const harness = opts.harness ?? 'claude'\n const buildPrompt = opts.buildPrompt ?? defaultBuildPrompt\n const run = opts.runHarness ?? runLocalHarness\n const dirty = opts.isDirty ?? worktreeDirty\n\n return {\n kind: `agentic:${harness}`,\n async generate({ worktreePath, report, findings, maxShots, signal }) {\n let prompt = buildPrompt({ report, findings })\n const shots = Math.max(1, maxShots)\n\n for (let shot = 0; shot < shots; shot++) {\n if (signal.aborted) break\n await run({\n harness,\n cwd: worktreePath,\n taskPrompt: prompt,\n timeoutMs: opts.timeoutMs,\n signal,\n })\n // The worktree IS the signal: if the harness touched files, we have a\n // candidate. We don't trust the harness's stdout — we trust the diff.\n if (dirty(worktreePath)) {\n return { applied: true, summary: summarize(findings) }\n }\n // No change this shot — give the next attempt explicit feedback.\n prompt = refine(prompt)\n }\n return { applied: false, summary: '' }\n },\n }\n}\n\n/** Turn the analyst's findings (+ optional report) into a concrete coder task. */\nfunction defaultBuildPrompt(args: { report: unknown; findings: AnalystFinding[] }): string {\n const lines: string[] = [\n 'You are improving this codebase based on an evaluation analysis.',\n 'Make the smallest set of edits that addresses the findings below, then stop.',\n 'Do not change unrelated code. Do not commit — leave changes in the working tree.',\n '',\n 'Findings:',\n ]\n for (const f of args.findings) {\n const where = f.subject ? ` [${f.subject}]` : ''\n lines.push(`- (${f.severity})${where} ${f.claim}`)\n if (f.recommended_action) lines.push(` → ${f.recommended_action}`)\n }\n return lines.join('\\n')\n}\n\nfunction refine(prompt: string): string {\n return `${prompt}\\n\\nNOTE: your previous attempt left the working tree unchanged. Make the concrete file edits now.`\n}\n\n/** A one-line summary for the commit message, derived from the findings. */\nfunction summarize(findings: AnalystFinding[]): string {\n if (findings.length === 0) return 'agentic improvement'\n if (findings.length === 1) return `agentic: ${truncate(findings[0]!.claim, 64)}`\n return `agentic: ${findings.length} findings addressed`\n}\n\nfunction truncate(s: string, n: number): string {\n return s.length <= n ? s : `${s.slice(0, n - 1)}…`\n}\n\n/** Non-empty `git status --porcelain` ⇒ the harness changed the worktree.\n * Fails loud: the worktree is a fresh checkout, so a git error here means\n * something is genuinely broken (git missing, corrupt index, killed mid-run).\n * Folding that into `false` would silently discard a candidate and mask the\n * real failure — forbidden by the no-silent-fallbacks doctrine. */\nfunction worktreeDirty(worktreePath: string): boolean {\n const result = spawnSync('git', ['status', '--porcelain'], {\n cwd: worktreePath,\n encoding: 'utf-8',\n })\n if (result.error) {\n throw new Error(\n `agenticGenerator: git status failed to spawn in ${worktreePath}: ${result.error.message}`,\n )\n }\n if (result.status !== 0) {\n throw new Error(\n `agenticGenerator: git status exited ${result.status} in ${worktreePath}: ${result.stderr.trim()}`,\n )\n }\n return result.stdout.trim().length > 0\n}\n","/**\n * @experimental\n *\n * `improvementDriver` — the ONE reflective/agentic improvement driver for\n * agent-eval's improvement loop. It implements `ImprovementDriver` and owns\n * the candidate lifecycle (worktree create → generate → finalize/discard,\n * × populationSize); it delegates the only thing that genuinely varies — HOW\n * a candidate change is produced — to a pluggable `CandidateGenerator`.\n *\n * There is no separate \"analyst driver\" vs \"autoresearch driver\": those are\n * the SAME driver at two settings of a dial.\n * - cheap reflective path → `reflectiveGenerator` (shots=1, no sandbox;\n * applies pre-drafted patches)\n * - full agentic path → `agenticGenerator` (shots=N, sandbox runLoop;\n * an agent reads code + report and edits)\n * Both emit changes into a worktree the driver finalizes into a\n * `CodeSurface{ worktreeRef }` the loop measures on the holdout. See\n * agent-eval's `docs/design/self-improvement-engine.md`.\n */\n\nimport type { AnalystFinding } from '@tangle-network/agent-eval'\nimport type {\n CodeSurface,\n ImprovementDriver,\n LabeledScenarioStore,\n ProposeContext,\n WorktreeAdapter,\n} from '@tangle-network/agent-eval/campaign'\n\n/** The byte-producing seam — the ONE thing that differs between the cheap\n * reflective path and the full agentic path. A generator makes (uncommitted)\n * changes inside `worktreePath`; the driver commits them via the worktree\n * adapter's `finalize`. */\nexport interface CandidateGenerator {\n kind: string\n generate(args: {\n /** The candidate worktree — a fresh checkout of baseRef. Write changes here. */\n worktreePath: string\n /** Phase-2 research report (analyst findings + diff), opaque. */\n report: unknown\n /** Findings resolved from the report or the loop context. */\n findings: AnalystFinding[]\n /** Handle to all captured data, to ground the change. */\n dataset?: LabeledScenarioStore\n /** DEPTH: max iterations the generator may take (agentic uses this; the\n * reflective generator ignores it). */\n maxShots: number\n signal: AbortSignal\n }): Promise<{ applied: boolean; summary: string }>\n}\n\nexport interface ImprovementDriverOptions {\n worktree: WorktreeAdapter\n generator: CandidateGenerator\n /** Base ref candidate worktrees fork from. Default `main`. */\n baseRef?: string\n}\n\nexport function improvementDriver(\n opts: ImprovementDriverOptions,\n): ImprovementDriver<AnalystFinding> {\n const baseRef = opts.baseRef ?? 'main'\n\n return {\n kind: `improvement:${opts.generator.kind}`,\n async propose(ctx) {\n const findings = resolveFindings(ctx)\n // No signal to act on — propose nothing rather than spin up worktrees.\n if (findings.length === 0 && ctx.report === undefined) return []\n\n const surfaces: CodeSurface[] = []\n for (let i = 0; i < ctx.populationSize; i++) {\n if (ctx.signal.aborted) break\n const wt = await opts.worktree.create({\n baseRef,\n label: `${opts.generator.kind}-gen${ctx.generation}-cand${i}`,\n })\n // Once a worktree exists it MUST be accounted for: finalized into a\n // surface, or discarded. A throw from generate()/finalize() must not\n // leak the worktree + branch — discard best-effort, then rethrow loud.\n try {\n const { applied, summary } = await opts.generator.generate({\n worktreePath: wt.path,\n report: ctx.report,\n findings,\n dataset: ctx.dataset,\n maxShots: ctx.maxImprovementShots ?? 1,\n signal: ctx.signal,\n })\n if (!applied) {\n await opts.worktree.discard(wt)\n continue\n }\n surfaces.push(await opts.worktree.finalize(wt, summary))\n } catch (err) {\n // Best-effort cleanup; never mask the original failure.\n await opts.worktree.discard(wt).catch(() => {})\n throw err\n }\n }\n return surfaces\n },\n }\n}\n\n/** Phase-2 report carries `findings` when present; else fall back to the\n * loop's `ctx.findings`. The report is opaque to the substrate, so probe it\n * structurally. */\nfunction resolveFindings(ctx: ProposeContext<AnalystFinding>): AnalystFinding[] {\n const report = ctx.report\n if (report && typeof report === 'object' && 'findings' in report) {\n const f = (report as { findings: unknown }).findings\n if (Array.isArray(f) && f.length > 0) return f as AnalystFinding[]\n }\n return ctx.findings\n}\n","/**\n * @experimental\n *\n * `reflectiveGenerator` — the cheap, no-sandbox `CandidateGenerator`. It drafts\n * surface edits via the existing improvement adapter (`proposeFromFindings`,\n * one LLM patch per finding) and applies them as ONE coherent improvement into\n * the candidate worktree. `maxShots` is ignored — reflection is single-shot by\n * construction (the patches are already drafted).\n *\n * This is the `shots=1, sandbox=off` setting of the one improvement driver.\n * The `agenticGenerator` (sandbox runLoop) is the `shots=N, sandbox=on`\n * setting — both plug into the same `improvementDriver`.\n */\n\nimport { spawnSync } from 'node:child_process'\nimport type { SurfaceImprovementEdit } from '../agent/improvement-adapter'\nimport type { ImprovementAdapter } from '../analyst-loop/types'\nimport type { CandidateGenerator } from './improvement-driver'\n\nexport interface ReflectiveGeneratorOptions {\n improvementAdapter: ImprovementAdapter<SurfaceImprovementEdit>\n}\n\nexport function reflectiveGenerator(opts: ReflectiveGeneratorOptions): CandidateGenerator {\n return {\n kind: 'reflective',\n async generate({ worktreePath, findings }) {\n const batch = await opts.improvementAdapter.proposeFromFindings(findings)\n if (batch.edits.length === 0) return { applied: false, summary: '' }\n\n let applied = 0\n for (const edit of batch.edits) {\n if (applyPatch(edit.patch, worktreePath)) applied++\n }\n if (applied === 0) return { applied: false, summary: '' }\n\n const summary =\n batch.edits.length === 1\n ? batch.edits[0]!.summary\n : `analyst: ${applied} surface edit${applied === 1 ? '' : 's'}`\n return { applied: true, summary }\n },\n }\n}\n\n/** Mirror the improvement adapter's proven apply invocation, run inside the\n * candidate worktree (a fresh checkout of baseRef, so `-p0` paths match). */\nfunction applyPatch(patch: string, cwd: string): boolean {\n const result = spawnSync('git', ['apply', '--whitespace=fix', '-p0', '-'], {\n cwd,\n input: patch,\n encoding: 'utf-8',\n })\n return result.status === 0\n}\n"],"mappings":";;;;;;;;;;AAuBA,SAAS,iBAAiB;AAmBnB,SAAS,iBAAiB,OAAgC,CAAC,GAAuB;AACvF,QAAM,UAAU,KAAK,WAAW;AAChC,QAAM,cAAc,KAAK,eAAe;AACxC,QAAM,MAAM,KAAK,cAAc;AAC/B,QAAM,QAAQ,KAAK,WAAW;AAE9B,SAAO;AAAA,IACL,MAAM,WAAW,OAAO;AAAA,IACxB,MAAM,SAAS,EAAE,cAAc,QAAQ,UAAU,UAAU,OAAO,GAAG;AACnE,UAAI,SAAS,YAAY,EAAE,QAAQ,SAAS,CAAC;AAC7C,YAAM,QAAQ,KAAK,IAAI,GAAG,QAAQ;AAElC,eAAS,OAAO,GAAG,OAAO,OAAO,QAAQ;AACvC,YAAI,OAAO,QAAS;AACpB,cAAM,IAAI;AAAA,UACR;AAAA,UACA,KAAK;AAAA,UACL,YAAY;AAAA,UACZ,WAAW,KAAK;AAAA,UAChB;AAAA,QACF,CAAC;AAGD,YAAI,MAAM,YAAY,GAAG;AACvB,iBAAO,EAAE,SAAS,MAAM,SAAS,UAAU,QAAQ,EAAE;AAAA,QACvD;AAEA,iBAAS,OAAO,MAAM;AAAA,MACxB;AACA,aAAO,EAAE,SAAS,OAAO,SAAS,GAAG;AAAA,IACvC;AAAA,EACF;AACF;AAGA,SAAS,mBAAmB,MAA+D;AACzF,QAAM,QAAkB;AAAA,IACtB;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,EACF;AACA,aAAW,KAAK,KAAK,UAAU;AAC7B,UAAM,QAAQ,EAAE,UAAU,KAAK,EAAE,OAAO,MAAM;AAC9C,UAAM,KAAK,MAAM,EAAE,QAAQ,IAAI,KAAK,IAAI,EAAE,KAAK,EAAE;AACjD,QAAI,EAAE,mBAAoB,OAAM,KAAK,cAAS,EAAE,kBAAkB,EAAE;AAAA,EACtE;AACA,SAAO,MAAM,KAAK,IAAI;AACxB;AAEA,SAAS,OAAO,QAAwB;AACtC,SAAO,GAAG,MAAM;AAAA;AAAA;AAClB;AAGA,SAAS,UAAU,UAAoC;AACrD,MAAI,SAAS,WAAW,EAAG,QAAO;AAClC,MAAI,SAAS,WAAW,EAAG,QAAO,YAAY,SAAS,SAAS,CAAC,EAAG,OAAO,EAAE,CAAC;AAC9E,SAAO,YAAY,SAAS,MAAM;AACpC;AAEA,SAAS,SAAS,GAAW,GAAmB;AAC9C,SAAO,EAAE,UAAU,IAAI,IAAI,GAAG,EAAE,MAAM,GAAG,IAAI,CAAC,CAAC;AACjD;AAOA,SAAS,cAAc,cAA+B;AACpD,QAAM,SAAS,UAAU,OAAO,CAAC,UAAU,aAAa,GAAG;AAAA,IACzD,KAAK;AAAA,IACL,UAAU;AAAA,EACZ,CAAC;AACD,MAAI,OAAO,OAAO;AAChB,UAAM,IAAI;AAAA,MACR,mDAAmD,YAAY,KAAK,OAAO,MAAM,OAAO;AAAA,IAC1F;AAAA,EACF;AACA,MAAI,OAAO,WAAW,GAAG;AACvB,UAAM,IAAI;AAAA,MACR,uCAAuC,OAAO,MAAM,OAAO,YAAY,KAAK,OAAO,OAAO,KAAK,CAAC;AAAA,IAClG;AAAA,EACF;AACA,SAAO,OAAO,OAAO,KAAK,EAAE,SAAS;AACvC;;;ACvEO,SAAS,kBACd,MACmC;AACnC,QAAM,UAAU,KAAK,WAAW;AAEhC,SAAO;AAAA,IACL,MAAM,eAAe,KAAK,UAAU,IAAI;AAAA,IACxC,MAAM,QAAQ,KAAK;AACjB,YAAM,WAAW,gBAAgB,GAAG;AAEpC,UAAI,SAAS,WAAW,KAAK,IAAI,WAAW,OAAW,QAAO,CAAC;AAE/D,YAAM,WAA0B,CAAC;AACjC,eAAS,IAAI,GAAG,IAAI,IAAI,gBAAgB,KAAK;AAC3C,YAAI,IAAI,OAAO,QAAS;AACxB,cAAM,KAAK,MAAM,KAAK,SAAS,OAAO;AAAA,UACpC;AAAA,UACA,OAAO,GAAG,KAAK,UAAU,IAAI,OAAO,IAAI,UAAU,QAAQ,CAAC;AAAA,QAC7D,CAAC;AAID,YAAI;AACF,gBAAM,EAAE,SAAS,QAAQ,IAAI,MAAM,KAAK,UAAU,SAAS;AAAA,YACzD,cAAc,GAAG;AAAA,YACjB,QAAQ,IAAI;AAAA,YACZ;AAAA,YACA,SAAS,IAAI;AAAA,YACb,UAAU,IAAI,uBAAuB;AAAA,YACrC,QAAQ,IAAI;AAAA,UACd,CAAC;AACD,cAAI,CAAC,SAAS;AACZ,kBAAM,KAAK,SAAS,QAAQ,EAAE;AAC9B;AAAA,UACF;AACA,mBAAS,KAAK,MAAM,KAAK,SAAS,SAAS,IAAI,OAAO,CAAC;AAAA,QACzD,SAAS,KAAK;AAEZ,gBAAM,KAAK,SAAS,QAAQ,EAAE,EAAE,MAAM,MAAM;AAAA,UAAC,CAAC;AAC9C,gBAAM;AAAA,QACR;AAAA,MACF;AACA,aAAO;AAAA,IACT;AAAA,EACF;AACF;AAKA,SAAS,gBAAgB,KAAuD;AAC9E,QAAM,SAAS,IAAI;AACnB,MAAI,UAAU,OAAO,WAAW,YAAY,cAAc,QAAQ;AAChE,UAAM,IAAK,OAAiC;AAC5C,QAAI,MAAM,QAAQ,CAAC,KAAK,EAAE,SAAS,EAAG,QAAO;AAAA,EAC/C;AACA,SAAO,IAAI;AACb;;;ACrGA,SAAS,aAAAA,kBAAiB;AASnB,SAAS,oBAAoB,MAAsD;AACxF,SAAO;AAAA,IACL,MAAM;AAAA,IACN,MAAM,SAAS,EAAE,cAAc,SAAS,GAAG;AACzC,YAAM,QAAQ,MAAM,KAAK,mBAAmB,oBAAoB,QAAQ;AACxE,UAAI,MAAM,MAAM,WAAW,EAAG,QAAO,EAAE,SAAS,OAAO,SAAS,GAAG;AAEnE,UAAI,UAAU;AACd,iBAAW,QAAQ,MAAM,OAAO;AAC9B,YAAI,WAAW,KAAK,OAAO,YAAY,EAAG;AAAA,MAC5C;AACA,UAAI,YAAY,EAAG,QAAO,EAAE,SAAS,OAAO,SAAS,GAAG;AAExD,YAAM,UACJ,MAAM,MAAM,WAAW,IACnB,MAAM,MAAM,CAAC,EAAG,UAChB,YAAY,OAAO,gBAAgB,YAAY,IAAI,KAAK,GAAG;AACjE,aAAO,EAAE,SAAS,MAAM,QAAQ;AAAA,IAClC;AAAA,EACF;AACF;AAIA,SAAS,WAAW,OAAe,KAAsB;AACvD,QAAM,SAASA,WAAU,OAAO,CAAC,SAAS,oBAAoB,OAAO,GAAG,GAAG;AAAA,IACzE;AAAA,IACA,OAAO;AAAA,IACP,UAAU;AAAA,EACZ,CAAC;AACD,SAAO,OAAO,WAAW;AAC3B;","names":["spawnSync"]}
|
|
1
|
+
{"version":3,"sources":["../src/improvement/agentic-generator.ts","../src/improvement/improvement-driver.ts","../src/improvement/reflective-generator.ts"],"sourcesContent":["/**\n * @experimental\n *\n * `agenticGenerator` — the full-agentic `CandidateGenerator`: the\n * `shots=N, sandbox=on` setting of the one `improvementDriver`. It runs a real\n * coding harness (claude / codex / opencode) inside the candidate worktree the\n * driver already created, letting the agent read the codebase + the research\n * report and make the change in place. The driver then commits the worktree\n * into a `CodeSurface`.\n *\n * Mechanism: identical to the proven Phase-2.8 in-process executor — spawn the\n * harness as a subprocess with `cwd` = the worktree, on the same filesystem,\n * so edits land in place (no sandbox-mount round-trip). `runLocalHarness` is\n * the verified primitive. The OUTER sandbox is the improvement loop's own\n * execution context; the generator does not nest a second sandbox per\n * candidate (which would reintroduce a host↔sandbox worktree-transport\n * problem that does not need solving here).\n *\n * `maxShots` is the DEPTH dial: the harness runs once; if it produced no change\n * (the worktree stays clean), the generator refines the prompt and retries, up\n * to `maxShots` times. A harness that already changed files returns on shot 1.\n */\n\nimport { spawnSync } from 'node:child_process'\nimport type { AnalystFinding } from '@tangle-network/agent-eval'\nimport { type LocalHarness, runLocalHarness } from '../mcp/local-harness'\nimport type { CandidateGenerator } from './improvement-driver'\n\nexport interface AgenticGeneratorOptions {\n /** Local coding harness to run in the worktree. Default `claude`. */\n harness?: LocalHarness\n /** Per-shot wall-clock timeout (ms). Default = `runLocalHarness` default (5m). */\n timeoutMs?: number\n /** Build the harness task prompt from the report + findings. Override for\n * domain phrasing; the default turns findings into a concrete coder task. */\n buildPrompt?: (args: { report: unknown; findings: AnalystFinding[] }) => string\n /** Test seam — inject the harness runner (defaults to `runLocalHarness`). */\n runHarness?: typeof runLocalHarness\n /** Test seam — inject the worktree-dirty check (defaults to `git status`). */\n isDirty?: (worktreePath: string) => boolean\n}\n\nexport function agenticGenerator(opts: AgenticGeneratorOptions = {}): CandidateGenerator {\n const harness = opts.harness ?? 'claude'\n const buildPrompt = opts.buildPrompt ?? defaultBuildPrompt\n const run = opts.runHarness ?? runLocalHarness\n const dirty = opts.isDirty ?? worktreeDirty\n\n return {\n kind: `agentic:${harness}`,\n async generate({ worktreePath, report, findings, maxShots, signal }) {\n let prompt = buildPrompt({ report, findings })\n const shots = Math.max(1, maxShots)\n\n for (let shot = 0; shot < shots; shot++) {\n if (signal.aborted) break\n await run({\n harness,\n cwd: worktreePath,\n taskPrompt: prompt,\n timeoutMs: opts.timeoutMs,\n signal,\n })\n // The worktree IS the signal: if the harness touched files, we have a\n // candidate. We don't trust the harness's stdout — we trust the diff.\n if (dirty(worktreePath)) {\n return { applied: true, summary: summarize(findings) }\n }\n // No change this shot — give the next attempt explicit feedback.\n prompt = refine(prompt)\n }\n return { applied: false, summary: '' }\n },\n }\n}\n\n/** Turn the analyst's findings (+ optional report) into a concrete coder task. */\nfunction defaultBuildPrompt(args: { report: unknown; findings: AnalystFinding[] }): string {\n const lines: string[] = [\n 'You are improving this codebase based on an evaluation analysis.',\n 'Make the smallest set of edits that addresses the findings below, then stop.',\n 'Do not change unrelated code. Do not commit — leave changes in the working tree.',\n '',\n 'Findings:',\n ]\n for (const f of args.findings) {\n const where = f.subject ? ` [${f.subject}]` : ''\n lines.push(`- (${f.severity})${where} ${f.claim}`)\n if (f.recommended_action) lines.push(` → ${f.recommended_action}`)\n }\n return lines.join('\\n')\n}\n\nfunction refine(prompt: string): string {\n return `${prompt}\\n\\nNOTE: your previous attempt left the working tree unchanged. Make the concrete file edits now.`\n}\n\n/** A one-line summary for the commit message, derived from the findings. */\nfunction summarize(findings: AnalystFinding[]): string {\n if (findings.length === 0) return 'agentic improvement'\n if (findings.length === 1) return `agentic: ${truncate(findings[0]!.claim, 64)}`\n return `agentic: ${findings.length} findings addressed`\n}\n\nfunction truncate(s: string, n: number): string {\n return s.length <= n ? s : `${s.slice(0, n - 1)}…`\n}\n\n/** Non-empty `git status --porcelain` ⇒ the harness changed the worktree.\n * Fails loud: the worktree is a fresh checkout, so a git error here means\n * something is genuinely broken (git missing, corrupt index, killed mid-run).\n * Folding that into `false` would silently discard a candidate and mask the\n * real failure — forbidden by the no-silent-fallbacks doctrine. */\nfunction worktreeDirty(worktreePath: string): boolean {\n const result = spawnSync('git', ['status', '--porcelain'], {\n cwd: worktreePath,\n encoding: 'utf-8',\n })\n if (result.error) {\n throw new Error(\n `agenticGenerator: git status failed to spawn in ${worktreePath}: ${result.error.message}`,\n )\n }\n if (result.status !== 0) {\n throw new Error(\n `agenticGenerator: git status exited ${result.status} in ${worktreePath}: ${result.stderr.trim()}`,\n )\n }\n return result.stdout.trim().length > 0\n}\n","/**\n * @experimental\n *\n * `improvementDriver` — the ONE reflective/agentic improvement driver for\n * agent-eval's improvement loop. It implements `ImprovementDriver` and owns\n * the candidate lifecycle (worktree create → generate → finalize/discard,\n * × populationSize); it delegates the only thing that genuinely varies — HOW\n * a candidate change is produced — to a pluggable `CandidateGenerator`.\n *\n * There is no separate \"analyst driver\" vs \"autoresearch driver\": those are\n * the SAME driver at two settings of a dial.\n * - cheap reflective path → `reflectiveGenerator` (shots=1, no sandbox;\n * applies pre-drafted patches)\n * - full agentic path → `agenticGenerator` (shots=N, sandbox runLoop;\n * an agent reads code + report and edits)\n * Both emit changes into a worktree the driver finalizes into a\n * `CodeSurface{ worktreeRef }` the loop measures on the holdout. See\n * agent-eval's `docs/design/self-improvement-engine.md`.\n */\n\nimport type { AnalystFinding } from '@tangle-network/agent-eval'\nimport type {\n CodeSurface,\n ImprovementDriver,\n LabeledScenarioStore,\n ProposeContext,\n WorktreeAdapter,\n} from '@tangle-network/agent-eval/campaign'\n\n/** The byte-producing seam — the ONE thing that differs between the cheap\n * reflective path and the full agentic path. A generator makes (uncommitted)\n * changes inside `worktreePath`; the driver commits them via the worktree\n * adapter's `finalize`. */\nexport interface CandidateGenerator {\n kind: string\n generate(args: {\n /** The candidate worktree — a fresh checkout of baseRef. Write changes here. */\n worktreePath: string\n /** Phase-2 research report (analyst findings + diff), opaque. */\n report: unknown\n /** Findings resolved from the report or the loop context. */\n findings: AnalystFinding[]\n /** Handle to all captured data, to ground the change. */\n dataset?: LabeledScenarioStore\n /** DEPTH: max iterations the generator may take (agentic uses this; the\n * reflective generator ignores it). */\n maxShots: number\n signal: AbortSignal\n }): Promise<{ applied: boolean; summary: string }>\n}\n\nexport interface ImprovementDriverOptions {\n worktree: WorktreeAdapter\n generator: CandidateGenerator\n /** Base ref candidate worktrees fork from. Default `main`. */\n baseRef?: string\n}\n\nexport function improvementDriver(\n opts: ImprovementDriverOptions,\n): ImprovementDriver<AnalystFinding> {\n const baseRef = opts.baseRef ?? 'main'\n\n return {\n kind: `improvement:${opts.generator.kind}`,\n async propose(ctx) {\n const findings = resolveFindings(ctx)\n // No signal to act on — propose nothing rather than spin up worktrees.\n if (findings.length === 0 && ctx.report === undefined) return []\n\n const surfaces: CodeSurface[] = []\n for (let i = 0; i < ctx.populationSize; i++) {\n if (ctx.signal.aborted) break\n const wt = await opts.worktree.create({\n baseRef,\n label: `${opts.generator.kind}-gen${ctx.generation}-cand${i}`,\n })\n // Once a worktree exists it MUST be accounted for: finalized into a\n // surface, or discarded. A throw from generate()/finalize() must not\n // leak the worktree + branch — discard best-effort, then rethrow loud.\n try {\n const { applied, summary } = await opts.generator.generate({\n worktreePath: wt.path,\n report: ctx.report,\n findings,\n dataset: ctx.dataset,\n maxShots: ctx.maxImprovementShots ?? 1,\n signal: ctx.signal,\n })\n if (!applied) {\n await opts.worktree.discard(wt)\n continue\n }\n surfaces.push(await opts.worktree.finalize(wt, summary))\n } catch (err) {\n // Best-effort cleanup; never mask the original failure.\n await opts.worktree.discard(wt).catch(() => {})\n throw err\n }\n }\n return surfaces\n },\n }\n}\n\n/** Phase-2 report carries `findings` when present; else fall back to the\n * loop's `ctx.findings`. The report is opaque to the substrate, so probe it\n * structurally. */\nfunction resolveFindings(ctx: ProposeContext<AnalystFinding>): AnalystFinding[] {\n const report = ctx.report\n if (report && typeof report === 'object' && 'findings' in report) {\n const f = (report as { findings: unknown }).findings\n if (Array.isArray(f) && f.length > 0) return f as AnalystFinding[]\n }\n return ctx.findings\n}\n","/**\n * @experimental\n *\n * `reflectiveGenerator` — the cheap, no-sandbox `CandidateGenerator`. It drafts\n * surface edits via the existing improvement adapter (`proposeFromFindings`,\n * one LLM patch per finding) and applies them as ONE coherent improvement into\n * the candidate worktree. `maxShots` is ignored — reflection is single-shot by\n * construction (the patches are already drafted).\n *\n * This is the `shots=1, sandbox=off` setting of the one improvement driver.\n * The `agenticGenerator` (sandbox runLoop) is the `shots=N, sandbox=on`\n * setting — both plug into the same `improvementDriver`.\n */\n\nimport { spawnSync } from 'node:child_process'\nimport type { SurfaceImprovementEdit } from '../agent/improvement-adapter'\nimport type { ImprovementAdapter } from '../analyst-loop/types'\nimport type { CandidateGenerator } from './improvement-driver'\n\nexport interface ReflectiveGeneratorOptions {\n improvementAdapter: ImprovementAdapter<SurfaceImprovementEdit>\n}\n\nexport function reflectiveGenerator(opts: ReflectiveGeneratorOptions): CandidateGenerator {\n return {\n kind: 'reflective',\n async generate({ worktreePath, findings }) {\n const batch = await opts.improvementAdapter.proposeFromFindings(findings)\n if (batch.edits.length === 0) return { applied: false, summary: '' }\n\n let applied = 0\n for (const edit of batch.edits) {\n if (applyPatch(edit.patch, worktreePath)) applied++\n }\n if (applied === 0) return { applied: false, summary: '' }\n\n const summary =\n batch.edits.length === 1\n ? batch.edits[0]!.summary\n : `analyst: ${applied} surface edit${applied === 1 ? '' : 's'}`\n return { applied: true, summary }\n },\n }\n}\n\n/** Mirror the improvement adapter's proven apply invocation, run inside the\n * candidate worktree (a fresh checkout of baseRef, so `-p0` paths match). */\nfunction applyPatch(patch: string, cwd: string): boolean {\n const result = spawnSync('git', ['apply', '--whitespace=fix', '-p0', '-'], {\n cwd,\n input: patch,\n encoding: 'utf-8',\n })\n return result.status === 0\n}\n"],"mappings":";;;;;;AAuBA,SAAS,iBAAiB;AAmBnB,SAAS,iBAAiB,OAAgC,CAAC,GAAuB;AACvF,QAAM,UAAU,KAAK,WAAW;AAChC,QAAM,cAAc,KAAK,eAAe;AACxC,QAAM,MAAM,KAAK,cAAc;AAC/B,QAAM,QAAQ,KAAK,WAAW;AAE9B,SAAO;AAAA,IACL,MAAM,WAAW,OAAO;AAAA,IACxB,MAAM,SAAS,EAAE,cAAc,QAAQ,UAAU,UAAU,OAAO,GAAG;AACnE,UAAI,SAAS,YAAY,EAAE,QAAQ,SAAS,CAAC;AAC7C,YAAM,QAAQ,KAAK,IAAI,GAAG,QAAQ;AAElC,eAAS,OAAO,GAAG,OAAO,OAAO,QAAQ;AACvC,YAAI,OAAO,QAAS;AACpB,cAAM,IAAI;AAAA,UACR;AAAA,UACA,KAAK;AAAA,UACL,YAAY;AAAA,UACZ,WAAW,KAAK;AAAA,UAChB;AAAA,QACF,CAAC;AAGD,YAAI,MAAM,YAAY,GAAG;AACvB,iBAAO,EAAE,SAAS,MAAM,SAAS,UAAU,QAAQ,EAAE;AAAA,QACvD;AAEA,iBAAS,OAAO,MAAM;AAAA,MACxB;AACA,aAAO,EAAE,SAAS,OAAO,SAAS,GAAG;AAAA,IACvC;AAAA,EACF;AACF;AAGA,SAAS,mBAAmB,MAA+D;AACzF,QAAM,QAAkB;AAAA,IACtB;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,EACF;AACA,aAAW,KAAK,KAAK,UAAU;AAC7B,UAAM,QAAQ,EAAE,UAAU,KAAK,EAAE,OAAO,MAAM;AAC9C,UAAM,KAAK,MAAM,EAAE,QAAQ,IAAI,KAAK,IAAI,EAAE,KAAK,EAAE;AACjD,QAAI,EAAE,mBAAoB,OAAM,KAAK,cAAS,EAAE,kBAAkB,EAAE;AAAA,EACtE;AACA,SAAO,MAAM,KAAK,IAAI;AACxB;AAEA,SAAS,OAAO,QAAwB;AACtC,SAAO,GAAG,MAAM;AAAA;AAAA;AAClB;AAGA,SAAS,UAAU,UAAoC;AACrD,MAAI,SAAS,WAAW,EAAG,QAAO;AAClC,MAAI,SAAS,WAAW,EAAG,QAAO,YAAY,SAAS,SAAS,CAAC,EAAG,OAAO,EAAE,CAAC;AAC9E,SAAO,YAAY,SAAS,MAAM;AACpC;AAEA,SAAS,SAAS,GAAW,GAAmB;AAC9C,SAAO,EAAE,UAAU,IAAI,IAAI,GAAG,EAAE,MAAM,GAAG,IAAI,CAAC,CAAC;AACjD;AAOA,SAAS,cAAc,cAA+B;AACpD,QAAM,SAAS,UAAU,OAAO,CAAC,UAAU,aAAa,GAAG;AAAA,IACzD,KAAK;AAAA,IACL,UAAU;AAAA,EACZ,CAAC;AACD,MAAI,OAAO,OAAO;AAChB,UAAM,IAAI;AAAA,MACR,mDAAmD,YAAY,KAAK,OAAO,MAAM,OAAO;AAAA,IAC1F;AAAA,EACF;AACA,MAAI,OAAO,WAAW,GAAG;AACvB,UAAM,IAAI;AAAA,MACR,uCAAuC,OAAO,MAAM,OAAO,YAAY,KAAK,OAAO,OAAO,KAAK,CAAC;AAAA,IAClG;AAAA,EACF;AACA,SAAO,OAAO,OAAO,KAAK,EAAE,SAAS;AACvC;;;ACvEO,SAAS,kBACd,MACmC;AACnC,QAAM,UAAU,KAAK,WAAW;AAEhC,SAAO;AAAA,IACL,MAAM,eAAe,KAAK,UAAU,IAAI;AAAA,IACxC,MAAM,QAAQ,KAAK;AACjB,YAAM,WAAW,gBAAgB,GAAG;AAEpC,UAAI,SAAS,WAAW,KAAK,IAAI,WAAW,OAAW,QAAO,CAAC;AAE/D,YAAM,WAA0B,CAAC;AACjC,eAAS,IAAI,GAAG,IAAI,IAAI,gBAAgB,KAAK;AAC3C,YAAI,IAAI,OAAO,QAAS;AACxB,cAAM,KAAK,MAAM,KAAK,SAAS,OAAO;AAAA,UACpC;AAAA,UACA,OAAO,GAAG,KAAK,UAAU,IAAI,OAAO,IAAI,UAAU,QAAQ,CAAC;AAAA,QAC7D,CAAC;AAID,YAAI;AACF,gBAAM,EAAE,SAAS,QAAQ,IAAI,MAAM,KAAK,UAAU,SAAS;AAAA,YACzD,cAAc,GAAG;AAAA,YACjB,QAAQ,IAAI;AAAA,YACZ;AAAA,YACA,SAAS,IAAI;AAAA,YACb,UAAU,IAAI,uBAAuB;AAAA,YACrC,QAAQ,IAAI;AAAA,UACd,CAAC;AACD,cAAI,CAAC,SAAS;AACZ,kBAAM,KAAK,SAAS,QAAQ,EAAE;AAC9B;AAAA,UACF;AACA,mBAAS,KAAK,MAAM,KAAK,SAAS,SAAS,IAAI,OAAO,CAAC;AAAA,QACzD,SAAS,KAAK;AAEZ,gBAAM,KAAK,SAAS,QAAQ,EAAE,EAAE,MAAM,MAAM;AAAA,UAAC,CAAC;AAC9C,gBAAM;AAAA,QACR;AAAA,MACF;AACA,aAAO;AAAA,IACT;AAAA,EACF;AACF;AAKA,SAAS,gBAAgB,KAAuD;AAC9E,QAAM,SAAS,IAAI;AACnB,MAAI,UAAU,OAAO,WAAW,YAAY,cAAc,QAAQ;AAChE,UAAM,IAAK,OAAiC;AAC5C,QAAI,MAAM,QAAQ,CAAC,KAAK,EAAE,SAAS,EAAG,QAAO;AAAA,EAC/C;AACA,SAAO,IAAI;AACb;;;ACrGA,SAAS,aAAAA,kBAAiB;AASnB,SAAS,oBAAoB,MAAsD;AACxF,SAAO;AAAA,IACL,MAAM;AAAA,IACN,MAAM,SAAS,EAAE,cAAc,SAAS,GAAG;AACzC,YAAM,QAAQ,MAAM,KAAK,mBAAmB,oBAAoB,QAAQ;AACxE,UAAI,MAAM,MAAM,WAAW,EAAG,QAAO,EAAE,SAAS,OAAO,SAAS,GAAG;AAEnE,UAAI,UAAU;AACd,iBAAW,QAAQ,MAAM,OAAO;AAC9B,YAAI,WAAW,KAAK,OAAO,YAAY,EAAG;AAAA,MAC5C;AACA,UAAI,YAAY,EAAG,QAAO,EAAE,SAAS,OAAO,SAAS,GAAG;AAExD,YAAM,UACJ,MAAM,MAAM,WAAW,IACnB,MAAM,MAAM,CAAC,EAAG,UAChB,YAAY,OAAO,gBAAgB,YAAY,IAAI,KAAK,GAAG;AACjE,aAAO,EAAE,SAAS,MAAM,QAAQ;AAAA,IAClC;AAAA,EACF;AACF;AAIA,SAAS,WAAW,OAAe,KAAsB;AACvD,QAAM,SAASA,WAAU,OAAO,CAAC,SAAS,oBAAoB,OAAO,GAAG,GAAG;AAAA,IACzE;AAAA,IACA,OAAO;AAAA,IACP,UAAU;AAAA,EACZ,CAAC;AACD,SAAO,OAAO,WAAW;AAC3B;","names":["spawnSync"]}
|
package/dist/index.d.ts
CHANGED
|
@@ -1,16 +1,19 @@
|
|
|
1
1
|
import { AgentEvalError, KnowledgeReadinessReport, RunRecord, ControlEvalResult, KnowledgeRequirement } from '@tangle-network/agent-eval';
|
|
2
2
|
export { AgentEvalError, AgentEvalErrorCode, ConfigError, ControlBudget, ControlDecision, ControlEvalResult, ControlRunResult, ControlStep, DataAcquisitionPlan, JudgeError, KnowledgeReadinessReport, KnowledgeRequirement, NotFoundError, RunRecord, ValidationError } from '@tangle-network/agent-eval';
|
|
3
|
-
import {
|
|
4
|
-
export {
|
|
5
|
-
export { C as CoderLoopRunnerOptions, D as DELEGATED_LOOP_MODES, a as DelegatedLoopMode, b as DelegatedLoopRegistry, c as DelegatedLoopResult, d as DelegatedLoopRunner, e as DynamicLoopRunnerOptions, L as LoopRunnerCliArgs, f as LoopRunnerCliResult, R as ResearchLoopResult, g as ResearchLoopRunnerOptions, h as RunDelegatedLoopOptions, V as VetoedFact, i as auditLoopRunner, j as coderLoopRunner, k as dynamicLoopRunner, l as isDelegatedLoopMode, p as parseLoopRunnerArgv, r as researchLoopRunner, m as reviewLoopRunner, n as runDelegatedLoop, o as runLoopRunnerCli, s as selfImproveLoopRunner } from './loop-runner-bin-
|
|
6
|
-
export { E as EvalRunEvent, b as EvalRunGeneration, c as EvalRunsExportConfig, d as EvalRunsExportResult, I as INTELLIGENCE_WIRE_VERSION, e as OtelAttribute, f as OtelExportConfig, O as OtelExporter, g as OtelSpan, h as buildLoopOtelSpans, i as createOtelExporter, j as exportEvalRuns, l as loopEventToOtelSpan, m as mcpToolsForRuntimeMcp, a as mcpToolsForRuntimeMcpSubset } from './otel-export-
|
|
3
|
+
import { g as AgentBackendInput, h as AgentExecutionBackend, d as OpenAIChatTool, i as OpenAIChatToolChoice, j as AgentBackendContext, R as RuntimeStreamEvent, K as KnowledgeReadinessDecision, k as RunAgentTaskOptions, l as AgentTaskRunResult, m as RunAgentTaskStreamOptions, n as AgentRuntimeEvent, o as AgentTaskStatus, p as RuntimeSessionStore, q as RuntimeSession } from './types-DnYoHvvZ.js';
|
|
4
|
+
export { r as AgentAdapter, s as AgentKnowledgeProvider, t as AgentRuntimeEventSink, u as AgentTaskContext, v as AgentTaskSpec, B as BackendErrorDetail, w as RuntimeRunHandle, x as RuntimeRunPersistenceAdapter, y as RuntimeRunRow, z as startRuntimeRun } from './types-DnYoHvvZ.js';
|
|
5
|
+
export { C as CoderLoopRunnerOptions, D as DELEGATED_LOOP_MODES, a as DelegatedLoopMode, b as DelegatedLoopRegistry, c as DelegatedLoopResult, d as DelegatedLoopRunner, e as DynamicLoopRunnerOptions, L as LoopRunnerCliArgs, f as LoopRunnerCliResult, R as ResearchLoopResult, g as ResearchLoopRunnerOptions, h as RunDelegatedLoopOptions, V as VetoedFact, i as auditLoopRunner, j as coderLoopRunner, k as dynamicLoopRunner, l as isDelegatedLoopMode, p as parseLoopRunnerArgv, r as researchLoopRunner, m as reviewLoopRunner, n as runDelegatedLoop, o as runLoopRunnerCli, s as selfImproveLoopRunner } from './loop-runner-bin-D-K6bRp3.js';
|
|
6
|
+
export { E as EvalRunEvent, b as EvalRunGeneration, c as EvalRunsExportConfig, d as EvalRunsExportResult, I as INTELLIGENCE_WIRE_VERSION, e as OtelAttribute, f as OtelExportConfig, O as OtelExporter, g as OtelSpan, h as buildLoopOtelSpans, i as createOtelExporter, j as exportEvalRuns, l as loopEventToOtelSpan, m as mcpToolsForRuntimeMcp, a as mcpToolsForRuntimeMcpSubset } from './otel-export-nurzFwuJ.js';
|
|
7
|
+
import { R as RuntimeHooks } from './runtime-hooks-C7JwKb9E.js';
|
|
8
|
+
export { b as RuntimeDecisionEvidenceRef, c as RuntimeDecisionKind, d as RuntimeDecisionPoint, e as RuntimeHookContext, f as RuntimeHookErrorContext, a as RuntimeHookEvent, g as RuntimeHookPhase, h as RuntimeHookTarget, i as composeRuntimeHooks, j as defineRuntimeHooks, n as notifyRuntimeDecisionPoint, k as notifyRuntimeHookEvent } from './runtime-hooks-C7JwKb9E.js';
|
|
7
9
|
import '@tangle-network/sandbox';
|
|
8
10
|
import '@tangle-network/agent-eval/campaign';
|
|
11
|
+
import '@tangle-network/agent-eval/contract';
|
|
9
12
|
import './types-p8dWBIXL.js';
|
|
10
|
-
import './
|
|
11
|
-
import './
|
|
12
|
-
import './
|
|
13
|
-
import './
|
|
13
|
+
import './kb-gate-2Gwpz_27.js';
|
|
14
|
+
import './coder-DCWFQpmJ.js';
|
|
15
|
+
import './substrate-CUgk7F7s.js';
|
|
16
|
+
import './driver-C-mtBo7h.js';
|
|
14
17
|
|
|
15
18
|
/**
|
|
16
19
|
* @stable
|
|
@@ -1295,4 +1298,114 @@ declare function readinessServerSentEvent(report: KnowledgeReadinessReport, opti
|
|
|
1295
1298
|
/** @stable */
|
|
1296
1299
|
declare function runtimeStreamServerSentEvent(event: RuntimeStreamEvent, options?: RuntimeTelemetryOptions & ServerSentEventOptions): string;
|
|
1297
1300
|
|
|
1298
|
-
|
|
1301
|
+
/**
|
|
1302
|
+
* Bounded turn-level tool-dispatch loop.
|
|
1303
|
+
*
|
|
1304
|
+
* `runAgentTaskStream` runs ONE model turn; `runLoop` orchestrates DELEGATED
|
|
1305
|
+
* multi-agent topologies (refine / fanout-vote). Neither is the everyday
|
|
1306
|
+
* interactive shape: a chat turn where the model may emit tool calls, each is
|
|
1307
|
+
* executed, the results are folded back, and the turn re-runs until the model
|
|
1308
|
+
* stops (or a turn cap). Every agent app hand-rolls that loop — this is it,
|
|
1309
|
+
* as a reusable primitive.
|
|
1310
|
+
*
|
|
1311
|
+
* Substrate-neutral by design: the caller supplies `streamTurn` (wrapping
|
|
1312
|
+
* whatever backend / `runAgentTaskStream` it uses) and `executeToolCall`
|
|
1313
|
+
* (routing to its executors). This module owns the LOOP; the caller owns the
|
|
1314
|
+
* model and the executors. `Raw` (streaming variant) is the caller's own
|
|
1315
|
+
* event type. The only imported contract is the runtime hook type: hooks are
|
|
1316
|
+
* execution-scoped observers, not part of the agent profile.
|
|
1317
|
+
*/
|
|
1318
|
+
|
|
1319
|
+
interface ToolLoopCall {
|
|
1320
|
+
toolCallId?: string;
|
|
1321
|
+
toolName: string;
|
|
1322
|
+
args: Record<string, unknown>;
|
|
1323
|
+
}
|
|
1324
|
+
/** Outcome of one tool dispatch — structurally compatible with a hub/integration
|
|
1325
|
+
* tool-outcome union, so callers can fold either through the loop. */
|
|
1326
|
+
type ToolCallOutcome = {
|
|
1327
|
+
ok: true;
|
|
1328
|
+
result: unknown;
|
|
1329
|
+
} | {
|
|
1330
|
+
ok: false;
|
|
1331
|
+
code: string;
|
|
1332
|
+
message: string;
|
|
1333
|
+
status?: number;
|
|
1334
|
+
};
|
|
1335
|
+
type ToolLoopMessage = {
|
|
1336
|
+
role: string;
|
|
1337
|
+
content: string;
|
|
1338
|
+
};
|
|
1339
|
+
type ToolLoopEvent = {
|
|
1340
|
+
type: 'text';
|
|
1341
|
+
text: string;
|
|
1342
|
+
} | {
|
|
1343
|
+
type: 'tool_call';
|
|
1344
|
+
call: ToolLoopCall;
|
|
1345
|
+
} | {
|
|
1346
|
+
type: 'other';
|
|
1347
|
+
event: unknown;
|
|
1348
|
+
};
|
|
1349
|
+
interface ToolLoopResult {
|
|
1350
|
+
finalText: string;
|
|
1351
|
+
toolResults: Array<{
|
|
1352
|
+
call: ToolLoopCall;
|
|
1353
|
+
label: string;
|
|
1354
|
+
outcome: ToolCallOutcome;
|
|
1355
|
+
}>;
|
|
1356
|
+
turns: number;
|
|
1357
|
+
cappedOut: boolean;
|
|
1358
|
+
}
|
|
1359
|
+
interface RunToolLoopOptions {
|
|
1360
|
+
systemPrompt: string;
|
|
1361
|
+
userMessage: string;
|
|
1362
|
+
priorMessages?: ToolLoopMessage[];
|
|
1363
|
+
streamTurn: (messages: ToolLoopMessage[]) => AsyncIterable<ToolLoopEvent>;
|
|
1364
|
+
executeToolCall: (call: ToolLoopCall) => Promise<ToolCallOutcome>;
|
|
1365
|
+
isExecutableTool: (toolName: string) => boolean;
|
|
1366
|
+
maxToolTurns?: number;
|
|
1367
|
+
renderResult?: (label: string, outcome: ToolCallOutcome) => string;
|
|
1368
|
+
labelFor?: (call: ToolLoopCall) => string;
|
|
1369
|
+
runId?: string;
|
|
1370
|
+
scenarioId?: string;
|
|
1371
|
+
hooks?: RuntimeHooks;
|
|
1372
|
+
}
|
|
1373
|
+
/** Run the bounded tool loop and return the final text + every executed tool
|
|
1374
|
+
* outcome. Awaitable — callers needing to stream events to a UI use
|
|
1375
|
+
* {@link streamToolLoop}. */
|
|
1376
|
+
declare function runToolLoop(opts: RunToolLoopOptions): Promise<ToolLoopResult>;
|
|
1377
|
+
type StreamToolLoopYield<Raw> = {
|
|
1378
|
+
kind: 'event';
|
|
1379
|
+
event: Raw;
|
|
1380
|
+
} | {
|
|
1381
|
+
kind: 'tool_result';
|
|
1382
|
+
toolName: string;
|
|
1383
|
+
toolCallId?: string;
|
|
1384
|
+
label: string;
|
|
1385
|
+
outcome: ToolCallOutcome;
|
|
1386
|
+
} | {
|
|
1387
|
+
kind: 'capped';
|
|
1388
|
+
pending: number;
|
|
1389
|
+
};
|
|
1390
|
+
interface StreamToolLoopOptions<Raw> {
|
|
1391
|
+
systemPrompt: string;
|
|
1392
|
+
userMessage: string;
|
|
1393
|
+
priorMessages?: ToolLoopMessage[];
|
|
1394
|
+
streamTurn: (messages: ToolLoopMessage[]) => AsyncIterable<Raw>;
|
|
1395
|
+
extractText: (event: Raw) => string;
|
|
1396
|
+
extractToolCall: (event: Raw) => ToolLoopCall | null;
|
|
1397
|
+
isExecutableTool: (toolName: string) => boolean;
|
|
1398
|
+
executeToolCall: (call: ToolLoopCall) => Promise<ToolCallOutcome>;
|
|
1399
|
+
maxToolTurns?: number;
|
|
1400
|
+
renderResult?: (label: string, outcome: ToolCallOutcome) => string;
|
|
1401
|
+
labelFor?: (call: ToolLoopCall) => string;
|
|
1402
|
+
runId?: string;
|
|
1403
|
+
scenarioId?: string;
|
|
1404
|
+
hooks?: RuntimeHooks;
|
|
1405
|
+
}
|
|
1406
|
+
/** Streaming bounded tool loop: yields each raw turn event (the caller maps +
|
|
1407
|
+
* telemetries + re-emits it) and each executed `tool_result`; emits one
|
|
1408
|
+
* `capped` if it stops at the turn limit with calls still pending. */
|
|
1409
|
+
declare function streamToolLoop<Raw>(opts: StreamToolLoopOptions<Raw>): AsyncGenerator<StreamToolLoopYield<Raw>, void, unknown>;
|
|
1410
|
+
|
|
1411
|
+
export { AgentBackendContext, AgentBackendInput, AgentExecutionBackend, AgentRuntimeEvent, AgentTaskRunResult, AgentTaskStatus, type AuthSource, type BackendCallPolicy, BackendTransportError, type ChatStreamEvent, type ChatTurnHooks, type ChatTurnIdentity, type ChatTurnProducer, type ChatTurnResult, type CircuitBreakerConfig, CircuitBreakerState, CircuitOpenError, type Conversation, type ConversationDriveState, type ConversationJournal, type ConversationJournalEntry, type ConversationParticipant, type ConversationPolicy, type ConversationResult, type ConversationStreamEvent, type ConversationTurn, type D1DatabaseLike, type D1StmtLike, DEFAULT_MAX_DEPTH, DEFAULT_ROUTER_BASE_URL, DeadlineExceededError, FORWARD_HEADERS, FileConversationJournal, type ForwardHeaderName, type HaltContext, type HaltPredicate, type HaltReason, type HaltSignal, InMemoryConversationJournal, InMemoryRuntimeSessionStore, type ModelInfo, OpenAIChatTool, OpenAIChatToolChoice, PlannerError, type PropagatedHeaders, type ResolvedChatModel, type RetryBackoff, type RetryableErrorPredicate, type RouterEnv, type RunChatTurnInput, type RunConversationOptions, type RunToolLoopOptions, type RuntimeEventCollector, RuntimeHooks, RuntimeRunStateError, RuntimeSessionStore, RuntimeStreamEvent, type RuntimeStreamEventCollector, type RuntimeTelemetryOptions, type SanitizedKnowledgeReadinessReport, type SqlAdapter, SqlConversationJournal, type StreamToolLoopOptions, type StreamToolLoopYield, type ToolCallOutcome, type ToolLoopCall, type ToolLoopEvent, type ToolLoopMessage, type ToolLoopResult, type TurnOrder, applyRunRecordDefaults, buildForwardHeaders, cleanModelId, computeBackoff, createConversationBackend, createIterableBackend, createOpenAICompatibleBackend, createRuntimeEventCollector, createRuntimeStreamEventCollector, createSandboxPromptBackend, d1ToSqlAdapter, decideKnowledgeReadiness, defaultIsRetryable, defineConversation, deriveExecutionId, getModels, handleChatTurn, isDepthExceeded, makePerAttemptSignal, readDepth, readinessServerSentEvent, resolveChatModel, resolveRouterBaseUrl, runAgentTask, runAgentTaskStream, runConversation, runConversationStream, runToolLoop, runtimeStreamServerSentEvent, sanitizeAgentRuntimeEvent, sanitizeKnowledgeReadinessReport, sanitizeRuntimeStreamEvent, sleep, slugifySpeaker, streamToolLoop, turnId, validateChatModelId };
|