@tangle-network/agent-runtime 0.37.0 → 0.38.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/loops.d.ts CHANGED
@@ -1,115 +1,14 @@
1
1
  import { AgentProfile, SandboxEvent } from '@tangle-network/sandbox';
2
2
  export { AgentProfile, CreateSandboxOptions, SandboxEvent, SandboxInstance } from '@tangle-network/sandbox';
3
- import { I as Iteration, D as Driver, L as LoopSandboxClient, A as AgentRunSpec, O as OutputAdapter, V as Validator, E as ExecCtx, a as LoopWinner, b as LoopResult } from './types-CmTjKLyB.js';
4
- export { c as LoopDecisionPayload, d as LoopEndedPayload, e as LoopIterationDispatchPayload, f as LoopIterationEndedPayload, g as LoopIterationStartedPayload, h as LoopPlanDescription, i as LoopPlanPayload, j as LoopSandboxPlacement, k as LoopStartedPayload, l as LoopTokenUsage, m as LoopTraceEmitter, n as LoopTraceEvent, o as ValidationCtx } from './types-CmTjKLyB.js';
3
+ import { P as PlannerContext, T as TopologyPlanner } from './dynamic-DeOPeeAw.js';
4
+ export { C as CreateDynamicDriverOptions, D as DynamicDecision, a as TopologyMove, c as createDynamicDriver, s as summarizeHistory } from './dynamic-DeOPeeAw.js';
5
+ import { D as Driver, I as Iteration, L as LoopSandboxClient, A as AgentRunSpec, O as OutputAdapter, V as Validator, E as ExecCtx, a as LoopWinner, b as LoopResult } from './types-CmkQl8qE.js';
6
+ export { c as LoopDecisionPayload, d as LoopEndedPayload, e as LoopIterationDispatchPayload, f as LoopIterationEndedPayload, g as LoopIterationStartedPayload, h as LoopPlanDescription, i as LoopPlanPayload, j as LoopSandboxPlacement, k as LoopStartedPayload, l as LoopTokenUsage, m as LoopTraceEmitter, n as LoopTraceEvent, o as ValidationCtx } from './types-CmkQl8qE.js';
5
7
  import { DefaultVerdict, AgentProfile as AgentProfile$1 } from '@tangle-network/agent-eval';
6
8
  export { DefaultVerdict } from '@tangle-network/agent-eval';
7
9
  import { Scenario, DispatchFn, ProfileDispatchFn } from '@tangle-network/agent-eval/campaign';
8
10
  import './types-CsCCryln.js';
9
11
 
10
- /**
11
- * @experimental
12
- *
13
- * Dynamic driver — the agent authors the loop topology at runtime.
14
- *
15
- * Where `refine` and `fanout-vote` encode a fixed shape as a pure function of
16
- * history, this driver delegates the per-round shape to an injected
17
- * `TopologyPlanner`. Each round the planner inspects the task + iteration
18
- * history and emits one `TopologyMove`:
19
- * - `refine` → one task next round (optionally rewritten from the prior attempt)
20
- * - `fanout` → N tasks next round (the kernel round-robins `agentRuns`, so a
21
- * 2-harness fanout dispatches branch 0 to harness A and branch 1 to harness B)
22
- * - `stop` → terminate; the kernel selects the winner across all iterations
23
- *
24
- * The planner is the brain; this driver is the structure. It maps moves onto
25
- * the kernel's `plan`/`decide` contract, enforces the iteration + fanout caps,
26
- * and fails loud on a malformed move. The planner is injected exactly like
27
- * `refine`'s `refineTask` and `fanout-vote`'s `selector` — so a test can drive
28
- * a deterministic policy through the real kernel, and production can wire it to
29
- * an LLM via `createSandboxPlanner`.
30
- *
31
- * Topology is orthogonal to harness: the planner never names a backend. Which
32
- * harness runs a branch is decided by the `AgentRunSpec` the kernel round-robins
33
- * to, so one dynamic driver works across claude-code, codex, opencode, pi —
34
- * including fanning a single round across several at once.
35
- */
36
-
37
- /** Terminal once `decide` returns `'done'` (a kernel terminal decision). */
38
- type DynamicDecision = 'continue' | 'done';
39
- /**
40
- * One topology decision for the next round. `fanout` carries explicit tasks
41
- * rather than a count so the planner can issue heterogeneous branches (a
42
- * different sub-task per harness); pass N copies of one task for a homogeneous
43
- * fanout that relies on `agentRuns` diversity instead.
44
- *
45
- * @experimental
46
- */
47
- type TopologyMove<Task> = {
48
- kind: 'refine';
49
- task: Task;
50
- rationale?: string;
51
- } | {
52
- kind: 'fanout';
53
- tasks: Task[];
54
- rationale?: string;
55
- } | {
56
- kind: 'stop';
57
- rationale?: string;
58
- };
59
- /** @experimental */
60
- interface PlannerContext<Task, Output> {
61
- /** The root task the loop was invoked with — stable across rounds. */
62
- task: Task;
63
- /** Every iteration so far, in dispatch order, with outputs + verdicts. */
64
- history: ReadonlyArray<Iteration<Task, Output>>;
65
- /** `history.length` — iterations already spent. */
66
- iterationsSpent: number;
67
- /** Iterations left before the driver's `maxIterations` cap forces a stop. */
68
- iterationsRemaining: number;
69
- }
70
- /**
71
- * Chooses the next topology move from the task + history. Sync or async; an
72
- * async planner is where an LLM call goes (see `createSandboxPlanner`).
73
- *
74
- * @experimental
75
- */
76
- type TopologyPlanner<Task, Output> = (ctx: PlannerContext<Task, Output>) => TopologyMove<Task> | Promise<TopologyMove<Task>>;
77
- /** @experimental */
78
- interface CreateDynamicDriverOptions<Task, Output> {
79
- /** The agent-authored topology policy. Invoked once per round in `plan`. */
80
- planner: TopologyPlanner<Task, Output>;
81
- /**
82
- * Hard safety cap on total iterations. When reached, the driver stops before
83
- * consulting the planner. Default 8. Set the kernel's `runLoop`
84
- * `maxIterations >= ` this so the driver's cap governs and the loop closes on
85
- * a clean `'done'` rather than a truncated `'continue'`.
86
- */
87
- maxIterations?: number;
88
- /** Max branches a single `fanout` move may dispatch. Default 4. */
89
- maxFanout?: number;
90
- /** Stable identifier surfaced in trace events. Default `'dynamic'`. */
91
- name?: string;
92
- }
93
- /** @experimental */
94
- declare function createDynamicDriver<Task, Output>(options: CreateDynamicDriverOptions<Task, Output>): Driver<Task, Output, DynamicDecision>;
95
- /**
96
- * Compact, planner-friendly view of iteration history — what an LLM planner
97
- * needs to choose the next move without the raw event streams. Output is
98
- * truncated so a long run's prompt stays bounded.
99
- *
100
- * @experimental
101
- */
102
- declare function summarizeHistory<Task, Output>(history: ReadonlyArray<Iteration<Task, Output>>, opts?: {
103
- maxOutputChars?: number;
104
- }): Array<{
105
- index: number;
106
- agentRunName: string;
107
- valid?: boolean;
108
- score?: number;
109
- error?: string;
110
- output?: string;
111
- }>;
112
-
113
12
  /**
114
13
  * @experimental
115
14
  *
@@ -413,4 +312,4 @@ interface UsageSink {
413
312
  */
414
313
  declare function reportLoopUsage<Task, Output, Decision>(cost: UsageSink, result: Pick<LoopResult<Task, Output, Decision>, 'costUsd' | 'tokenUsage'>, source?: string): void;
415
314
 
416
- export { AgentRunSpec, type CreateDynamicDriverOptions, type CreateFanoutVoteDriverOptions, type CreateRefineDriverOptions, type CreateSandboxPlannerOptions, Driver, type DynamicDecision, ExecCtx, type FanoutVoteDecision, type FanoutVoteScored, Iteration, type LoopDispatchOptions, type LoopOptionsForDispatch, LoopResult, LoopSandboxClient, LoopWinner, OutputAdapter, type PlannerContext, type RefineDecision, type RunLoopOptions, type TopologyMove, type TopologyMoveEnvelope, type TopologyPlanner, type UsageSink, Validator, createDynamicDriver, createFanoutVoteDriver, createRefineDriver, createSandboxPlanner, loopCampaignDispatch, loopDispatch, refineWinnerIndex, reportLoopUsage, runLoop, scoreFanoutVoteIterations, summarizeHistory };
315
+ export { AgentRunSpec, type CreateFanoutVoteDriverOptions, type CreateRefineDriverOptions, type CreateSandboxPlannerOptions, Driver, ExecCtx, type FanoutVoteDecision, type FanoutVoteScored, Iteration, type LoopDispatchOptions, type LoopOptionsForDispatch, LoopResult, LoopSandboxClient, LoopWinner, OutputAdapter, PlannerContext, type RefineDecision, type RunLoopOptions, type TopologyMoveEnvelope, TopologyPlanner, type UsageSink, Validator, createFanoutVoteDriver, createRefineDriver, createSandboxPlanner, loopCampaignDispatch, loopDispatch, refineWinnerIndex, reportLoopUsage, runLoop, scoreFanoutVoteIterations };
@@ -1,6 +1,6 @@
1
- import { L as LoopSandboxClient, j as LoopSandboxPlacement, m as LoopTraceEmitter } from '../types-CmTjKLyB.js';
2
- import { F as FleetHandle, D as DelegationExecutor, a as DelegateFeedbackArgs, b as DelegationFeedbackSnapshot, c as DelegationProfile, d as DelegateCodeArgs, e as DelegateResearchArgs, f as DelegationStatus, g as DelegationProgress, h as DelegationResultPayload, i as DelegationError, j as DelegationStatusResult, k as DelegationHistoryArgs, l as DelegationHistoryEntry, C as CoderDelegate, R as ResearcherDelegate, m as DelegateCodeResult, n as DelegateFeedbackResult, o as ResearchSource, p as DelegateResearchResult, q as DelegationHistoryResult, r as DelegationStatusArgs, O as OtelExporter } from '../otel-export-DgFMwsVy.js';
3
- export { s as CoderReview, t as CoderReviewer, u as CoderWinnerSelection, v as CreateDefaultCoderDelegateOptions, w as DelegateCodeConfig, x as DelegateResearchConfig, y as DelegateRunCtx, z as FeedbackRating, A as FeedbackRefersTo, B as FleetWorkspaceExecutorOptions, E as ResearchOutputShape, S as SiblingSandboxExecutorOptions, G as createDefaultCoderDelegate, H as createFleetWorkspaceExecutor, I as createSiblingSandboxExecutor, J as mcpToolsForRuntimeMcp, K as mcpToolsForRuntimeMcpSubset } from '../otel-export-DgFMwsVy.js';
1
+ import { L as LoopSandboxClient, j as LoopSandboxPlacement, m as LoopTraceEmitter } from '../types-CmkQl8qE.js';
2
+ import { F as FleetHandle, D as DelegationExecutor, a as DelegateFeedbackArgs, b as DelegationFeedbackSnapshot, c as DelegationProfile, d as DelegateCodeArgs, e as DelegateResearchArgs, f as DelegationStatus, g as DelegationProgress, h as DelegationResultPayload, i as DelegationError, j as DelegationStatusResult, k as DelegationHistoryArgs, l as DelegationHistoryEntry, C as CoderDelegate, R as ResearcherDelegate, m as DelegateCodeResult, n as DelegateFeedbackResult, o as ResearchSource, p as DelegateResearchResult, q as DelegationHistoryResult, r as DelegationStatusArgs, O as OtelExporter } from '../otel-export-CNmeg_7B.js';
3
+ export { s as CoderReview, t as CoderReviewer, u as CoderWinnerSelection, v as CreateDefaultCoderDelegateOptions, w as CreateKbGateOptions, x as DelegateCodeConfig, y as DelegateResearchConfig, z as DelegateRunCtx, A as FactCandidate, B as FactJudge, E as FactJudgeVerdict, G as FeedbackRating, H as FeedbackRefersTo, I as FleetWorkspaceExecutorOptions, K as KbGateResult, J as ResearchOutputShape, S as SiblingSandboxExecutorOptions, L as createDefaultCoderDelegate, M as createFleetWorkspaceExecutor, N as createKbGate, P as createSiblingSandboxExecutor, Q as mcpToolsForRuntimeMcp, T as mcpToolsForRuntimeMcpSubset } from '../otel-export-CNmeg_7B.js';
4
4
  import { L as LocalHarness, r as runLocalHarness } from '../local-harness-KrdFTY5R.js';
5
5
  export { a as LocalHarnessResult, R as RunLocalHarnessOptions } from '../local-harness-KrdFTY5R.js';
6
6
  import '@tangle-network/agent-eval';
@@ -285,81 +285,6 @@ interface InProcessExecutorDescribePlacement extends LoopSandboxPlacement {
285
285
  */
286
286
  declare function createInProcessExecutor(options: InProcessExecutorOptions): DelegationExecutor;
287
287
 
288
- /**
289
- * @experimental
290
- *
291
- * `createKbGate` — the valid-only knowledge-base growth gate, distilled from
292
- * physim's KB-research subsystem. A research-in-a-loop delegate (or any KB
293
- * writer) runs candidate facts through this before persisting, so the KB grows
294
- * with ONLY grounded facts — hallucinated, unsourced, or laundered claims are
295
- * vetoed at the gate.
296
- *
297
- * Fail-closed by construction: every judge must `accept`; the FIRST veto wins
298
- * and the fact is rejected. The non-negotiable floor (always on, can't be
299
- * disabled) is the **passage-present guard** — a fact's `verbatimPassage` MUST
300
- * literally appear in its `sourceText`. That single check kills the dominant
301
- * failure mode (a confident claim decoupled from any real source).
302
- *
303
- * Pure + dependency-free: it operates on fact candidates, not on a store, so it
304
- * composes with `@tangle-network/agent-knowledge` or any persistence layer
305
- * without importing it. The remediation policy (correct-on-veto vs
306
- * escalate-as-unverified) is the caller's — this returns the verdict; it never
307
- * drops a fact silently.
308
- */
309
- /** @experimental A fact proposed for the KB, with its grounding. */
310
- interface FactCandidate {
311
- /** The atomic claim text. */
312
- claim: string;
313
- /** Optional extracted value (number or string) the claim asserts. */
314
- value?: string | number;
315
- /** Verbatim span lifted from the source that backs the claim. */
316
- verbatimPassage: string;
317
- /** The raw source text the passage must be grounded in. */
318
- sourceText: string;
319
- /** Where the fact claims to come from — checked for circular/self citations. */
320
- citation?: string;
321
- }
322
- /** @experimental */
323
- interface FactJudgeVerdict {
324
- accept: boolean;
325
- reason?: string;
326
- }
327
- /** @experimental A pluggable fact validator. Throw is NOT allowed — return a
328
- * verdict; a thrown judge is a programmer error, not a veto. */
329
- interface FactJudge {
330
- name: string;
331
- judge(candidate: FactCandidate): FactJudgeVerdict | Promise<FactJudgeVerdict>;
332
- }
333
- /** @experimental */
334
- interface KbGateResult {
335
- accepted: boolean;
336
- /** Name of the judge that vetoed; undefined when accepted. */
337
- vetoedBy?: string;
338
- reason?: string;
339
- }
340
- /** @experimental */
341
- interface CreateKbGateOptions {
342
- /** Extra judges appended after the built-in floor (e.g. an LLM judge). */
343
- judges?: FactJudge[];
344
- /** Minimum verbatim-passage length. Default 12 — kills empty/stub passages. */
345
- minPassageChars?: number;
346
- /**
347
- * Citation tokens that denote a SELF-generated artifact (e.g. `'spec'`,
348
- * `'cad_params'`, `'requirements'`). A citation naming one is circular
349
- * (laundering) — the fact cites a derived artifact, not a real source.
350
- * Default `[]` (no circular check unless the consumer declares its kinds).
351
- */
352
- selfArtifactKinds?: string[];
353
- }
354
- /**
355
- * @experimental
356
- *
357
- * Build a fail-closed KB gate. The returned function runs the built-in floor
358
- * (passage-non-empty → passage-present → value-in-passage → no-circular-citation)
359
- * then any consumer judges, returning on the first veto.
360
- */
361
- declare function createKbGate(options?: CreateKbGateOptions): (candidate: FactCandidate) => Promise<KbGateResult>;
362
-
363
288
  /**
364
289
  * @experimental
365
290
  *
@@ -947,4 +872,4 @@ declare function createPropagatingTraceEmitter(ctx: TraceContext): {
947
872
  */
948
873
  declare function traceContextToEnv(ctx: TraceContext): Record<string, string>;
949
874
 
950
- export { CoderDelegate, type CreateKbGateOptions, type CreateWorktreeOptions, DELEGATE_CODE_DESCRIPTION, DELEGATE_CODE_INPUT_SCHEMA, DELEGATE_CODE_TOOL_NAME, DELEGATE_FEEDBACK_DESCRIPTION, DELEGATE_FEEDBACK_INPUT_SCHEMA, DELEGATE_FEEDBACK_TOOL_NAME, DELEGATE_RESEARCH_DESCRIPTION, DELEGATE_RESEARCH_INPUT_SCHEMA, DELEGATE_RESEARCH_TOOL_NAME, DELEGATION_HISTORY_DESCRIPTION, DELEGATION_HISTORY_INPUT_SCHEMA, DELEGATION_HISTORY_TOOL_NAME, DELEGATION_STATUS_DESCRIPTION, DELEGATION_STATUS_INPUT_SCHEMA, DELEGATION_STATUS_TOOL_NAME, DelegateCodeArgs, DelegateCodeResult, DelegateFeedbackArgs, DelegateFeedbackResult, DelegateResearchArgs, DelegateResearchResult, DelegationError, DelegationExecutor, DelegationFeedbackSnapshot, DelegationHistoryArgs, DelegationHistoryEntry, DelegationHistoryResult, DelegationProfile, DelegationProgress, type DelegationRecord, DelegationResultPayload, DelegationStatus, DelegationStatusArgs, DelegationStatusResult, DelegationTaskQueue, type DelegationTaskQueueOptions, type DetectExecutorArgs, type DiffOptions, type DiffResult, type FactCandidate, type FactJudge, type FactJudgeVerdict, type FeedbackEvent, type FeedbackStore, FleetHandle, type GitRunner, InMemoryFeedbackStore, type InProcessExecutorDescribePlacement, type InProcessExecutorOptions, type JsonRpcMessage, type JsonRpcResponse, type KbGateResult, LocalHarness, type McpServer, type McpServerOptions, type McpToolDescriptor, type McpTransport, type RemoveWorktreeOptions, ResearchSource, ResearcherDelegate, type SubmitInput, type SubmitOutput, type TraceContext, type WorktreeHandle, captureWorktreeDiff, createDelegateCodeHandler, createDelegateFeedbackHandler, createDelegateResearchHandler, createDelegationHistoryHandler, createDelegationStatusHandler, createInProcessExecutor, createInProcessTransport, createKbGate, createMcpServer, createPropagatingTraceEmitter, createWorktree, detectExecutor, eventToSnapshot, hashIdempotencyInput, readTraceContextFromEnv, removeWorktree, runLocalHarness, 
traceContextToEnv, validateDelegateCodeArgs, validateDelegateFeedbackArgs, validateDelegateResearchArgs, validateDelegationHistoryArgs, validateDelegationStatusArgs };
875
+ export { CoderDelegate, type CreateWorktreeOptions, DELEGATE_CODE_DESCRIPTION, DELEGATE_CODE_INPUT_SCHEMA, DELEGATE_CODE_TOOL_NAME, DELEGATE_FEEDBACK_DESCRIPTION, DELEGATE_FEEDBACK_INPUT_SCHEMA, DELEGATE_FEEDBACK_TOOL_NAME, DELEGATE_RESEARCH_DESCRIPTION, DELEGATE_RESEARCH_INPUT_SCHEMA, DELEGATE_RESEARCH_TOOL_NAME, DELEGATION_HISTORY_DESCRIPTION, DELEGATION_HISTORY_INPUT_SCHEMA, DELEGATION_HISTORY_TOOL_NAME, DELEGATION_STATUS_DESCRIPTION, DELEGATION_STATUS_INPUT_SCHEMA, DELEGATION_STATUS_TOOL_NAME, DelegateCodeArgs, DelegateCodeResult, DelegateFeedbackArgs, DelegateFeedbackResult, DelegateResearchArgs, DelegateResearchResult, DelegationError, DelegationExecutor, DelegationFeedbackSnapshot, DelegationHistoryArgs, DelegationHistoryEntry, DelegationHistoryResult, DelegationProfile, DelegationProgress, type DelegationRecord, DelegationResultPayload, DelegationStatus, DelegationStatusArgs, DelegationStatusResult, DelegationTaskQueue, type DelegationTaskQueueOptions, type DetectExecutorArgs, type DiffOptions, type DiffResult, type FeedbackEvent, type FeedbackStore, FleetHandle, type GitRunner, InMemoryFeedbackStore, type InProcessExecutorDescribePlacement, type InProcessExecutorOptions, type JsonRpcMessage, type JsonRpcResponse, LocalHarness, type McpServer, type McpServerOptions, type McpToolDescriptor, type McpTransport, type RemoveWorktreeOptions, ResearchSource, ResearcherDelegate, type SubmitInput, type SubmitOutput, type TraceContext, type WorktreeHandle, captureWorktreeDiff, createDelegateCodeHandler, createDelegateFeedbackHandler, createDelegateResearchHandler, createDelegationHistoryHandler, createDelegationStatusHandler, createInProcessExecutor, createInProcessTransport, createMcpServer, createPropagatingTraceEmitter, createWorktree, detectExecutor, eventToSnapshot, hashIdempotencyInput, readTraceContextFromEnv, removeWorktree, runLocalHarness, traceContextToEnv, validateDelegateCodeArgs, validateDelegateFeedbackArgs, validateDelegateResearchArgs, 
validateDelegationHistoryArgs, validateDelegationStatusArgs };
package/dist/mcp/index.js CHANGED
@@ -9,10 +9,11 @@ import {
9
9
  } from "../chunk-M65QJD35.js";
10
10
  import {
11
11
  buildLoopOtelSpans,
12
+ createKbGate,
12
13
  createOtelExporter,
13
14
  mcpToolsForRuntimeMcp,
14
15
  mcpToolsForRuntimeMcpSubset
15
- } from "../chunk-T3GJBKHA.js";
16
+ } from "../chunk-Z523NPJK.js";
16
17
  import {
17
18
  DELEGATE_CODE_DESCRIPTION,
18
19
  DELEGATE_CODE_INPUT_SCHEMA,
@@ -56,62 +57,6 @@ import "../chunk-PY6NMZYX.js";
56
57
  import "../chunk-SQSCRJ7U.js";
57
58
  import "../chunk-DGUM43GV.js";
58
59
 
59
- // src/mcp/kb-gate.ts
60
- var norm = (s) => s.toLowerCase().replace(/\s+/g, " ").trim();
61
- function valueAppears(value, passageNorm) {
62
- if (passageNorm.includes(norm(String(value)))) return true;
63
- if (typeof value !== "number" || !Number.isFinite(value)) return false;
64
- const forms = [value.toLocaleString("en-US")];
65
- if (Math.abs(value) >= 1e9) forms.push(`${trimZero(value / 1e9)} billion`);
66
- if (Math.abs(value) >= 1e6) forms.push(`${trimZero(value / 1e6)} million`);
67
- return forms.some((f) => passageNorm.includes(norm(f)));
68
- }
69
- function trimZero(n) {
70
- return Number.isInteger(n) ? String(n) : String(Number(n.toFixed(2)));
71
- }
72
- function builtinJudges(minPassageChars, selfArtifactKinds) {
73
- const kinds = selfArtifactKinds.map((k) => k.toLowerCase());
74
- return [
75
- {
76
- name: "passage-non-empty",
77
- judge: (c) => c.verbatimPassage.trim().length >= minPassageChars ? { accept: true } : { accept: false, reason: `passage shorter than ${minPassageChars} chars` }
78
- },
79
- {
80
- // THE anti-hallucination floor — the passage must literally be in the source.
81
- name: "passage-present",
82
- judge: (c) => norm(c.sourceText).includes(norm(c.verbatimPassage)) ? { accept: true } : { accept: false, reason: "verbatim passage not found in source (unbacked fact)" }
83
- },
84
- {
85
- name: "value-in-passage",
86
- judge: (c) => c.value === void 0 || valueAppears(c.value, norm(c.verbatimPassage)) ? { accept: true } : { accept: false, reason: `value ${JSON.stringify(c.value)} not present in passage` }
87
- },
88
- {
89
- name: "no-circular-citation",
90
- judge: (c) => {
91
- if (!c.citation || kinds.length === 0) return { accept: true };
92
- const cite = c.citation.toLowerCase();
93
- const hit = kinds.find((k) => cite.includes(k));
94
- return hit ? { accept: false, reason: `circular citation to self-generated artifact "${hit}"` } : { accept: true };
95
- }
96
- }
97
- ];
98
- }
99
- function createKbGate(options = {}) {
100
- const judges = [
101
- ...builtinJudges(options.minPassageChars ?? 12, options.selfArtifactKinds ?? []),
102
- ...options.judges ?? []
103
- ];
104
- return async (candidate) => {
105
- for (const j of judges) {
106
- const verdict = await j.judge(candidate);
107
- if (!verdict.accept) {
108
- return { accepted: false, vetoedBy: j.name, reason: verdict.reason };
109
- }
110
- }
111
- return { accepted: true };
112
- };
113
- }
114
-
115
60
  // src/mcp/trace-propagation.ts
116
61
  function readTraceContextFromEnv() {
117
62
  const traceId = process.env.TRACE_ID || generateTraceId();
@@ -1 +1 @@
1
- {"version":3,"sources":["../../src/mcp/kb-gate.ts","../../src/mcp/trace-propagation.ts"],"sourcesContent":["/**\n * @experimental\n *\n * `createKbGate` — the valid-only knowledge-base growth gate, distilled from\n * physim's KB-research subsystem. A research-in-a-loop delegate (or any KB\n * writer) runs candidate facts through this before persisting, so the KB grows\n * with ONLY grounded facts — hallucinated, unsourced, or laundered claims are\n * vetoed at the gate.\n *\n * Fail-closed by construction: every judge must `accept`; the FIRST veto wins\n * and the fact is rejected. The non-negotiable floor (always on, can't be\n * disabled) is the **passage-present guard** — a fact's `verbatimPassage` MUST\n * literally appear in its `sourceText`. That single check kills the dominant\n * failure mode (a confident claim decoupled from any real source).\n *\n * Pure + dependency-free: it operates on fact candidates, not on a store, so it\n * composes with `@tangle-network/agent-knowledge` or any persistence layer\n * without importing it. The remediation policy (correct-on-veto vs\n * escalate-as-unverified) is the caller's — this returns the verdict; it never\n * drops a fact silently.\n */\n\n/** @experimental A fact proposed for the KB, with its grounding. */\nexport interface FactCandidate {\n /** The atomic claim text. */\n claim: string\n /** Optional extracted value (number or string) the claim asserts. */\n value?: string | number\n /** Verbatim span lifted from the source that backs the claim. */\n verbatimPassage: string\n /** The raw source text the passage must be grounded in. */\n sourceText: string\n /** Where the fact claims to come from — checked for circular/self citations. */\n citation?: string\n}\n\n/** @experimental */\nexport interface FactJudgeVerdict {\n accept: boolean\n reason?: string\n}\n\n/** @experimental A pluggable fact validator. Throw is NOT allowed — return a\n * verdict; a thrown judge is a programmer error, not a veto. 
*/\nexport interface FactJudge {\n name: string\n judge(candidate: FactCandidate): FactJudgeVerdict | Promise<FactJudgeVerdict>\n}\n\n/** @experimental */\nexport interface KbGateResult {\n accepted: boolean\n /** Name of the judge that vetoed; undefined when accepted. */\n vetoedBy?: string\n reason?: string\n}\n\n/** @experimental */\nexport interface CreateKbGateOptions {\n /** Extra judges appended after the built-in floor (e.g. an LLM judge). */\n judges?: FactJudge[]\n /** Minimum verbatim-passage length. Default 12 — kills empty/stub passages. */\n minPassageChars?: number\n /**\n * Citation tokens that denote a SELF-generated artifact (e.g. `'spec'`,\n * `'cad_params'`, `'requirements'`). A citation naming one is circular\n * (laundering) — the fact cites a derived artifact, not a real source.\n * Default `[]` (no circular check unless the consumer declares its kinds).\n */\n selfArtifactKinds?: string[]\n}\n\nconst norm = (s: string): string => s.toLowerCase().replace(/\\s+/g, ' ').trim()\n\n/** Does `value` appear in the (normalized) passage — literally, comma-grouped,\n * or in billion/million shorthand (the forms a source actually writes). */\nfunction valueAppears(value: string | number, passageNorm: string): boolean {\n if (passageNorm.includes(norm(String(value)))) return true\n if (typeof value !== 'number' || !Number.isFinite(value)) return false\n const forms = [value.toLocaleString('en-US')]\n if (Math.abs(value) >= 1e9) forms.push(`${trimZero(value / 1e9)} billion`)\n if (Math.abs(value) >= 1e6) forms.push(`${trimZero(value / 1e6)} million`)\n return forms.some((f) => passageNorm.includes(norm(f)))\n}\n\nfunction trimZero(n: number): string {\n return Number.isInteger(n) ? String(n) : String(Number(n.toFixed(2)))\n}\n\n/** The always-on floor judges. Order matters: cheapest / most-fundamental first. 
*/\nfunction builtinJudges(minPassageChars: number, selfArtifactKinds: string[]): FactJudge[] {\n const kinds = selfArtifactKinds.map((k) => k.toLowerCase())\n return [\n {\n name: 'passage-non-empty',\n judge: (c) =>\n c.verbatimPassage.trim().length >= minPassageChars\n ? { accept: true }\n : { accept: false, reason: `passage shorter than ${minPassageChars} chars` },\n },\n {\n // THE anti-hallucination floor — the passage must literally be in the source.\n name: 'passage-present',\n judge: (c) =>\n norm(c.sourceText).includes(norm(c.verbatimPassage))\n ? { accept: true }\n : { accept: false, reason: 'verbatim passage not found in source (unbacked fact)' },\n },\n {\n name: 'value-in-passage',\n judge: (c) =>\n c.value === undefined || valueAppears(c.value, norm(c.verbatimPassage))\n ? { accept: true }\n : { accept: false, reason: `value ${JSON.stringify(c.value)} not present in passage` },\n },\n {\n name: 'no-circular-citation',\n judge: (c) => {\n if (!c.citation || kinds.length === 0) return { accept: true }\n const cite = c.citation.toLowerCase()\n const hit = kinds.find((k) => cite.includes(k))\n return hit\n ? { accept: false, reason: `circular citation to self-generated artifact \"${hit}\"` }\n : { accept: true }\n },\n },\n ]\n}\n\n/**\n * @experimental\n *\n * Build a fail-closed KB gate. The returned function runs the built-in floor\n * (passage-non-empty → passage-present → value-in-passage → no-circular-citation)\n * then any consumer judges, returning on the first veto.\n */\nexport function createKbGate(\n options: CreateKbGateOptions = {},\n): (candidate: FactCandidate) => Promise<KbGateResult> {\n const judges = [\n ...builtinJudges(options.minPassageChars ?? 12, options.selfArtifactKinds ?? []),\n ...(options.judges ?? 
[]),\n ]\n return async (candidate) => {\n for (const j of judges) {\n const verdict = await j.judge(candidate)\n if (!verdict.accept) {\n return { accepted: false, vetoedBy: j.name, reason: verdict.reason }\n }\n }\n return { accepted: true }\n }\n}\n","/**\n * @experimental\n *\n * Trace context propagation for MCP subprocess.\n *\n * When the MCP server is launched as a child process by a sandbox harness,\n * the parent passes trace context via environment variables:\n *\n * TRACE_ID=<current-run-trace-id>\n * PARENT_SPAN_ID=<span-that-dispatched-the-delegation>\n *\n * The MCP server reads these at startup and uses them as the root of its\n * internal trace tree. All spans emitted by `runLoop` invocations inside\n * the MCP are children of the parent's delegation span.\n *\n * When these env vars are absent, the MCP generates a fresh trace root —\n * the server operates standalone without trace joining.\n */\n\nimport type { LoopTraceEmitter, LoopTraceEvent } from '../loops/types'\nimport type { OtelExporter } from '../otel-export'\nimport { buildLoopOtelSpans, createOtelExporter } from '../otel-export'\n\nexport interface TraceContext {\n /** Trace id inherited from the parent process, or a fresh one. */\n traceId: string\n /** Parent span id from the delegation that launched this MCP server. */\n parentSpanId?: string\n}\n\n/**\n * Read trace context from the process environment.\n * Returns a context with inherited ids or a freshly generated root.\n */\nexport function readTraceContextFromEnv(): TraceContext {\n const traceId = process.env.TRACE_ID || generateTraceId()\n const parentSpanId = process.env.PARENT_SPAN_ID || undefined\n return { traceId, parentSpanId }\n}\n\n/**\n * Create a LoopTraceEmitter that:\n * 1. Parents all spans under the inherited PARENT_SPAN_ID.\n * 2. 
Exports spans to OTEL when OTEL_EXPORTER_OTLP_ENDPOINT is set.\n *\n * Returns both the emitter and the optional exporter handle for shutdown.\n */\nexport function createPropagatingTraceEmitter(ctx: TraceContext): {\n emitter: LoopTraceEmitter\n exporter: OtelExporter | undefined\n context: TraceContext\n} {\n const exporter = createOtelExporter()\n\n // Buffer events per loop run, then emit the full nested span tree on\n // `loop.ended` so the topology hierarchy (loop → round → branch) reaches the\n // OTLP collector — not a flat list of zero-duration point spans. A run that\n // never reaches `loop.ended` (hard abort) drops its buffer; acceptable for\n // the short-lived MCP subprocess.\n const buffers = new Map<string, LoopTraceEvent[]>()\n\n const emitter: LoopTraceEmitter = {\n emit(event: LoopTraceEvent) {\n if (!exporter) return\n const buf = buffers.get(event.runId)\n if (buf) buf.push(event)\n else buffers.set(event.runId, [event])\n if (event.kind === 'loop.ended') {\n const events = buffers.get(event.runId) ?? 
[event]\n buffers.delete(event.runId)\n for (const span of buildLoopOtelSpans(events, ctx.traceId, ctx.parentSpanId)) {\n exporter.exportSpan(span)\n }\n }\n },\n }\n\n return { emitter, exporter, context: ctx }\n}\n\n/**\n * Build env vars to pass to a child MCP subprocess so it inherits the\n * current trace context.\n */\nexport function traceContextToEnv(ctx: TraceContext): Record<string, string> {\n const env: Record<string, string> = { TRACE_ID: ctx.traceId }\n if (ctx.parentSpanId) env.PARENT_SPAN_ID = ctx.parentSpanId\n return env\n}\n\nfunction generateTraceId(): string {\n const bytes = new Uint8Array(16)\n if (typeof globalThis.crypto?.getRandomValues === 'function') {\n globalThis.crypto.getRandomValues(bytes)\n } else {\n for (let i = 0; i < 16; i++) bytes[i] = Math.floor(Math.random() * 256)\n }\n return Array.from(bytes)\n .map((b) => b.toString(16).padStart(2, '0'))\n .join('')\n}\n"],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;AAwEA,IAAM,OAAO,CAAC,MAAsB,EAAE,YAAY,EAAE,QAAQ,QAAQ,GAAG,EAAE,KAAK;AAI9E,SAAS,aAAa,OAAwB,aAA8B;AAC1E,MAAI,YAAY,SAAS,KAAK,OAAO,KAAK,CAAC,CAAC,EAAG,QAAO;AACtD,MAAI,OAAO,UAAU,YAAY,CAAC,OAAO,SAAS,KAAK,EAAG,QAAO;AACjE,QAAM,QAAQ,CAAC,MAAM,eAAe,OAAO,CAAC;AAC5C,MAAI,KAAK,IAAI,KAAK,KAAK,IAAK,OAAM,KAAK,GAAG,SAAS,QAAQ,GAAG,CAAC,UAAU;AACzE,MAAI,KAAK,IAAI,KAAK,KAAK,IAAK,OAAM,KAAK,GAAG,SAAS,QAAQ,GAAG,CAAC,UAAU;AACzE,SAAO,MAAM,KAAK,CAAC,MAAM,YAAY,SAAS,KAAK,CAAC,CAAC,CAAC;AACxD;AAEA,SAAS,SAAS,GAAmB;AACnC,SAAO,OAAO,UAAU,CAAC,IAAI,OAAO,CAAC,IAAI,OAAO,OAAO,EAAE,QAAQ,CAAC,CAAC,CAAC;AACtE;AAGA,SAAS,cAAc,iBAAyB,mBAA0C;AACxF,QAAM,QAAQ,kBAAkB,IAAI,CAAC,MAAM,EAAE,YAAY,CAAC;AAC1D,SAAO;AAAA,IACL;AAAA,MACE,MAAM;AAAA,MACN,OAAO,CAAC,MACN,EAAE,gBAAgB,KAAK,EAAE,UAAU,kBAC/B,EAAE,QAAQ,KAAK,IACf,EAAE,QAAQ,OAAO,QAAQ,wBAAwB,eAAe,SAAS;AAAA,IACjF;AAAA,IACA;AAAA;AAAA,MAEE,MAAM;AAAA,MACN,OAAO,CAAC,MACN,KAAK,EAAE,UAAU,EAAE,SAAS,KAAK,EAAE,eAAe,CAAC,IAC/C,EAAE,QAAQ,KAAK,IACf,EAAE,QAAQ,OAAO,QAAQ,uDAAuD;AAAA,IACxF;AAAA,IACA;AAAA,MACE,MAAM;AAAA,M
ACN,OAAO,CAAC,MACN,EAAE,UAAU,UAAa,aAAa,EAAE,OAAO,KAAK,EAAE,eAAe,CAAC,IAClE,EAAE,QAAQ,KAAK,IACf,EAAE,QAAQ,OAAO,QAAQ,SAAS,KAAK,UAAU,EAAE,KAAK,CAAC,0BAA0B;AAAA,IAC3F;AAAA,IACA;AAAA,MACE,MAAM;AAAA,MACN,OAAO,CAAC,MAAM;AACZ,YAAI,CAAC,EAAE,YAAY,MAAM,WAAW,EAAG,QAAO,EAAE,QAAQ,KAAK;AAC7D,cAAM,OAAO,EAAE,SAAS,YAAY;AACpC,cAAM,MAAM,MAAM,KAAK,CAAC,MAAM,KAAK,SAAS,CAAC,CAAC;AAC9C,eAAO,MACH,EAAE,QAAQ,OAAO,QAAQ,iDAAiD,GAAG,IAAI,IACjF,EAAE,QAAQ,KAAK;AAAA,MACrB;AAAA,IACF;AAAA,EACF;AACF;AASO,SAAS,aACd,UAA+B,CAAC,GACqB;AACrD,QAAM,SAAS;AAAA,IACb,GAAG,cAAc,QAAQ,mBAAmB,IAAI,QAAQ,qBAAqB,CAAC,CAAC;AAAA,IAC/E,GAAI,QAAQ,UAAU,CAAC;AAAA,EACzB;AACA,SAAO,OAAO,cAAc;AAC1B,eAAW,KAAK,QAAQ;AACtB,YAAM,UAAU,MAAM,EAAE,MAAM,SAAS;AACvC,UAAI,CAAC,QAAQ,QAAQ;AACnB,eAAO,EAAE,UAAU,OAAO,UAAU,EAAE,MAAM,QAAQ,QAAQ,OAAO;AAAA,MACrE;AAAA,IACF;AACA,WAAO,EAAE,UAAU,KAAK;AAAA,EAC1B;AACF;;;ACtHO,SAAS,0BAAwC;AACtD,QAAM,UAAU,QAAQ,IAAI,YAAY,gBAAgB;AACxD,QAAM,eAAe,QAAQ,IAAI,kBAAkB;AACnD,SAAO,EAAE,SAAS,aAAa;AACjC;AASO,SAAS,8BAA8B,KAI5C;AACA,QAAM,WAAW,mBAAmB;AAOpC,QAAM,UAAU,oBAAI,IAA8B;AAElD,QAAM,UAA4B;AAAA,IAChC,KAAK,OAAuB;AAC1B,UAAI,CAAC,SAAU;AACf,YAAM,MAAM,QAAQ,IAAI,MAAM,KAAK;AACnC,UAAI,IAAK,KAAI,KAAK,KAAK;AAAA,UAClB,SAAQ,IAAI,MAAM,OAAO,CAAC,KAAK,CAAC;AACrC,UAAI,MAAM,SAAS,cAAc;AAC/B,cAAM,SAAS,QAAQ,IAAI,MAAM,KAAK,KAAK,CAAC,KAAK;AACjD,gBAAQ,OAAO,MAAM,KAAK;AAC1B,mBAAW,QAAQ,mBAAmB,QAAQ,IAAI,SAAS,IAAI,YAAY,GAAG;AAC5E,mBAAS,WAAW,IAAI;AAAA,QAC1B;AAAA,MACF;AAAA,IACF;AAAA,EACF;AAEA,SAAO,EAAE,SAAS,UAAU,SAAS,IAAI;AAC3C;AAMO,SAAS,kBAAkB,KAA2C;AAC3E,QAAM,MAA8B,EAAE,UAAU,IAAI,QAAQ;AAC5D,MAAI,IAAI,aAAc,KAAI,iBAAiB,IAAI;AAC/C,SAAO;AACT;AAEA,SAAS,kBAA0B;AACjC,QAAM,QAAQ,IAAI,WAAW,EAAE;AAC/B,MAAI,OAAO,WAAW,QAAQ,oBAAoB,YAAY;AAC5D,eAAW,OAAO,gBAAgB,KAAK;AAAA,EACzC,OAAO;AACL,aAAS,IAAI,GAAG,IAAI,IAAI,IAAK,OAAM,CAAC,IAAI,KAAK,MAAM,KAAK,OAAO,IAAI,GAAG;AAAA,EACxE;AACA,SAAO,MAAM,KAAK,KAAK,EACpB,IAAI,CAAC,MAAM,EAAE,SAAS,EAAE,EAAE,SAAS,GAAG,GAAG,CAAC,EAC1C,KAAK,EAAE;AACZ;","names":[]}
1
+ {"version":3,"sources":["../../src/mcp/trace-propagation.ts"],"sourcesContent":["/**\n * @experimental\n *\n * Trace context propagation for MCP subprocess.\n *\n * When the MCP server is launched as a child process by a sandbox harness,\n * the parent passes trace context via environment variables:\n *\n * TRACE_ID=<current-run-trace-id>\n * PARENT_SPAN_ID=<span-that-dispatched-the-delegation>\n *\n * The MCP server reads these at startup and uses them as the root of its\n * internal trace tree. All spans emitted by `runLoop` invocations inside\n * the MCP are children of the parent's delegation span.\n *\n * When these env vars are absent, the MCP generates a fresh trace root —\n * the server operates standalone without trace joining.\n */\n\nimport type { LoopTraceEmitter, LoopTraceEvent } from '../loops/types'\nimport type { OtelExporter } from '../otel-export'\nimport { buildLoopOtelSpans, createOtelExporter } from '../otel-export'\n\nexport interface TraceContext {\n /** Trace id inherited from the parent process, or a fresh one. */\n traceId: string\n /** Parent span id from the delegation that launched this MCP server. */\n parentSpanId?: string\n}\n\n/**\n * Read trace context from the process environment.\n * Returns a context with inherited ids or a freshly generated root.\n */\nexport function readTraceContextFromEnv(): TraceContext {\n const traceId = process.env.TRACE_ID || generateTraceId()\n const parentSpanId = process.env.PARENT_SPAN_ID || undefined\n return { traceId, parentSpanId }\n}\n\n/**\n * Create a LoopTraceEmitter that:\n * 1. Parents all spans under the inherited PARENT_SPAN_ID.\n * 2. 
Exports spans to OTEL when OTEL_EXPORTER_OTLP_ENDPOINT is set.\n *\n * Returns both the emitter and the optional exporter handle for shutdown.\n */\nexport function createPropagatingTraceEmitter(ctx: TraceContext): {\n emitter: LoopTraceEmitter\n exporter: OtelExporter | undefined\n context: TraceContext\n} {\n const exporter = createOtelExporter()\n\n // Buffer events per loop run, then emit the full nested span tree on\n // `loop.ended` so the topology hierarchy (loop → round → branch) reaches the\n // OTLP collector — not a flat list of zero-duration point spans. A run that\n // never reaches `loop.ended` (hard abort) drops its buffer; acceptable for\n // the short-lived MCP subprocess.\n const buffers = new Map<string, LoopTraceEvent[]>()\n\n const emitter: LoopTraceEmitter = {\n emit(event: LoopTraceEvent) {\n if (!exporter) return\n const buf = buffers.get(event.runId)\n if (buf) buf.push(event)\n else buffers.set(event.runId, [event])\n if (event.kind === 'loop.ended') {\n const events = buffers.get(event.runId) ?? 
[event]\n buffers.delete(event.runId)\n for (const span of buildLoopOtelSpans(events, ctx.traceId, ctx.parentSpanId)) {\n exporter.exportSpan(span)\n }\n }\n },\n }\n\n return { emitter, exporter, context: ctx }\n}\n\n/**\n * Build env vars to pass to a child MCP subprocess so it inherits the\n * current trace context.\n */\nexport function traceContextToEnv(ctx: TraceContext): Record<string, string> {\n const env: Record<string, string> = { TRACE_ID: ctx.traceId }\n if (ctx.parentSpanId) env.PARENT_SPAN_ID = ctx.parentSpanId\n return env\n}\n\nfunction generateTraceId(): string {\n const bytes = new Uint8Array(16)\n if (typeof globalThis.crypto?.getRandomValues === 'function') {\n globalThis.crypto.getRandomValues(bytes)\n } else {\n for (let i = 0; i < 16; i++) bytes[i] = Math.floor(Math.random() * 256)\n }\n return Array.from(bytes)\n .map((b) => b.toString(16).padStart(2, '0'))\n .join('')\n}\n"],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;AAkCO,SAAS,0BAAwC;AACtD,QAAM,UAAU,QAAQ,IAAI,YAAY,gBAAgB;AACxD,QAAM,eAAe,QAAQ,IAAI,kBAAkB;AACnD,SAAO,EAAE,SAAS,aAAa;AACjC;AASO,SAAS,8BAA8B,KAI5C;AACA,QAAM,WAAW,mBAAmB;AAOpC,QAAM,UAAU,oBAAI,IAA8B;AAElD,QAAM,UAA4B;AAAA,IAChC,KAAK,OAAuB;AAC1B,UAAI,CAAC,SAAU;AACf,YAAM,MAAM,QAAQ,IAAI,MAAM,KAAK;AACnC,UAAI,IAAK,KAAI,KAAK,KAAK;AAAA,UAClB,SAAQ,IAAI,MAAM,OAAO,CAAC,KAAK,CAAC;AACrC,UAAI,MAAM,SAAS,cAAc;AAC/B,cAAM,SAAS,QAAQ,IAAI,MAAM,KAAK,KAAK,CAAC,KAAK;AACjD,gBAAQ,OAAO,MAAM,KAAK;AAC1B,mBAAW,QAAQ,mBAAmB,QAAQ,IAAI,SAAS,IAAI,YAAY,GAAG;AAC5E,mBAAS,WAAW,IAAI;AAAA,QAC1B;AAAA,MACF;AAAA,IACF;AAAA,EACF;AAEA,SAAO,EAAE,SAAS,UAAU,SAAS,IAAI;AAC3C;AAMO,SAAS,kBAAkB,KAA2C;AAC3E,QAAM,MAA8B,EAAE,UAAU,IAAI,QAAQ;AAC5D,MAAI,IAAI,aAAc,KAAI,iBAAiB,IAAI;AAC/C,SAAO;AACT;AAEA,SAAS,kBAA0B;AACjC,QAAM,QAAQ,IAAI,WAAW,EAAE;AAC/B,MAAI,OAAO,WAAW,QAAQ,oBAAoB,YAAY;AAC5D,eAAW,OAAO,gBAAgB,KAAK;AAAA,EACzC,OAAO;AACL,aAAS,IAAI,GAAG,IAAI,IAAI,IAAK,OAAM,CAAC,IAAI,KAAK,MAAM,KAAK,OAAO,IAAI,GAAG;AAAA,EACxE;AACA,SAAO,MAAM,KAAK,KAAK,EACpB,IAAI,CAAC,MAAM,EAAE,
SAAS,EAAE,EAAE,SAAS,GAAG,GAAG,CAAC,EAC1C,KAAK,EAAE;AACZ;","names":[]}
@@ -0,0 +1,129 @@
1
+ import { LlmClientOptions } from '@tangle-network/agent-eval';
2
+ import { Scenario, DispatchContext, JudgeConfig, ImprovementDriver, Gate, CampaignStorage, GateResult, RunImprovementLoopResult } from '@tangle-network/agent-eval/campaign';
3
+
4
+ /**
5
+ * @experimental
6
+ *
7
+ * `optimizePrompt` — identity-gated optimization for any TEXT prompt surface
8
+ * (system prompt, planner prompt, judge rubric, skill doc).
9
+ *
10
+ * The text-surface sibling to this module's `improvementDriver` (the
11
+ * CODE-surface / worktree path). Both feed agent-eval's `runImprovementLoop`;
12
+ * this one defaults the driver to agent-eval's `gepaDriver` (reflective text
13
+ * mutator) and the gate to `heldOutGate`.
14
+ *
15
+ * IDENTITY-GATED BY CONSTRUCTION — the whole point. The loop runs evals,
16
+ * collects per-scenario signal, proposes candidates, and the gate compares
17
+ * candidate-vs-baseline ON THE HELDOUT. `result.prompt` is the baseline
18
+ * (identity) UNLESS the gate decided `'ship'`. So wiring a surface up is safe:
19
+ * a surface with no beneficial mutation simply keeps its baseline. You never
20
+ * regress by registering a prompt — you only ever improve when the held-out
21
+ * data earns it.
22
+ *
23
+ * Generic over the runtime: `runWithPrompt` is the only domain seam — given a
24
+ * candidate prompt + scenario, run it however the surface runs (sandbox
25
+ * `streamPrompt`, a `runLoop`, a direct model call) and return the artifact the
26
+ * judges score. The optimizer never assumes how a prompt is executed.
27
+ */
28
+
29
+ /** Reflection config for the default `gepaDriver`. Omit when passing a custom
30
+ * `driver`. */
31
+ interface OptimizePromptReflection {
32
+ /** Router transport for the reflection model. */
33
+ llm: LlmClientOptions;
34
+ /** Model that performs the reflective rewrite. */
35
+ model: string;
36
+ /** What is being optimized — orients the reflection prompt. Default
37
+ * `'system prompt'`. */
38
+ target?: string;
39
+ /** Surface-specific mutation levers offered to the reflector. */
40
+ mutationPrimitives?: string[];
41
+ /** H2 (`## Foo`) headings that MUST survive every candidate. gepaDriver's
42
+ * only structural guard — load-bearing sections of the prompt should be
43
+ * `##` headings so a rewrite cannot drop them. */
44
+ preserveSections?: string[];
45
+ /** Max sentence-level edits per candidate vs the parent (a textual learning
46
+ * rate). Keeps a rewrite from wiping prior rules in one generation. */
47
+ maxSentenceEdits?: number;
48
+ }
49
+ /** @experimental */
50
+ interface OptimizePromptOptions<TScenario extends Scenario, TArtifact> {
51
+ /** The prompt being optimized — the identity baseline the gate protects. */
52
+ baselinePrompt: string;
53
+ /** Domain seam: run a candidate prompt against a scenario → artifact the
54
+ * judges score. The optimizer is agnostic to HOW the prompt runs. */
55
+ runWithPrompt: (prompt: string, scenario: TScenario, ctx: DispatchContext) => Promise<TArtifact>;
56
+ /** Training pool — scored each generation to rank candidates. */
57
+ scenarios: TScenario[];
58
+ /** Held out of training — scored ONLY for the gate's baseline-vs-winner
59
+ * delta. Disjoint from `scenarios`; this is what makes promotion measure
60
+ * generalization, not memorization. */
61
+ holdoutScenarios: TScenario[];
62
+ /** Scorers — deterministic checks or LLM judges. */
63
+ judges: JudgeConfig<TArtifact, TScenario>[];
64
+ /** Where artifacts + traces land (opaque key under in-memory storage). */
65
+ runDir: string;
66
+ /** Default driver = `gepaDriver` built from this. Required UNLESS `driver`
67
+ * is supplied. */
68
+ reflection?: OptimizePromptReflection;
69
+ /** Override the improvement strategy (custom driver / deterministic tests). */
70
+ driver?: ImprovementDriver;
71
+ /** Override the promotion gate. Default `heldOutGate` over `holdoutScenarios`
72
+ * — zero extra LLM calls. Wrap `defaultProductionGate` for red-team/reward-hacking
73
+ * hardening on production wiring. */
74
+ gate?: Gate<TArtifact, TScenario>;
75
+ /** Minimum held-out composite lift to ship, forwarded to the default
76
+ * `heldOutGate`. When omitted the gate uses its own default. */
77
+ deltaThreshold?: number;
78
+ /** Candidates proposed per generation. Default 4. */
79
+ populationSize?: number;
80
+ /** Generations to run. Default 3. */
81
+ maxGenerations?: number;
82
+ /** Candidates carried to the next generation. Default 2. */
83
+ promoteTopK?: number;
84
+ /** Storage backend. Pass `inMemoryCampaignStorage()` for filesystem-less /
85
+ * test runs. Default: Node filesystem. */
86
+ storage?: CampaignStorage;
87
+ /** Reproducibility seed. Default 42. */
88
+ seed?: number;
89
+ /** Per-scenario replicates for CI bands. Default 1. */
90
+ reps?: number;
91
+ /** Max concurrent cells. Default 2. */
92
+ maxConcurrency?: number;
93
+ /** Test seam — override the wall clock. */
94
+ now?: () => Date;
95
+ /** On a shipped gate: `'pr'` opens a PR, `'none'` just reports. Default
96
+ * `'none'`. */
97
+ autoOnPromote?: 'pr' | 'none';
98
+ ghOwner?: string;
99
+ ghRepo?: string;
100
+ }
101
+ /** @experimental */
102
+ interface OptimizePromptResult<TArtifact, TScenario extends Scenario> {
103
+ /** The prompt to USE. Identity (the baseline) unless the gate shipped a
104
+ * winner — so a caller can always assign `result.prompt` unconditionally. */
105
+ prompt: string;
106
+ /** True only when the gate promoted a candidate over baseline on holdout. */
107
+ improved: boolean;
108
+ /** The gate's verdict (`'ship' | 'hold' | 'need_more_work' | ...`). */
109
+ decision: GateResult['decision'];
110
+ /** Human-readable reasons the gate gave. */
111
+ reasons: string[];
112
+ /** Mean held-out composite of the baseline. */
113
+ baselineComposite: number;
114
+ /** Mean held-out composite of the winner candidate. */
115
+ winnerComposite: number;
116
+ /** Held-out lift (winner − baseline); the gate's `delta` when it reported one. */
117
+ delta: number;
118
+ /** Why the winner was proposed — present when a shipped winner carried a
119
+ * driver rationale. */
120
+ rationale?: string;
121
+ /** Unified baseline→winner diff (empty when the winner is the baseline). */
122
+ diff: string;
123
+ /** The full loop result for callers that need generations / campaigns. */
124
+ raw: RunImprovementLoopResult<TArtifact, TScenario>;
125
+ }
126
+ /** @experimental */
127
+ declare function optimizePrompt<TScenario extends Scenario, TArtifact>(opts: OptimizePromptOptions<TScenario, TArtifact>): Promise<OptimizePromptResult<TArtifact, TScenario>>;
128
+
129
+ export { type OptimizePromptOptions as O, type OptimizePromptReflection as a, type OptimizePromptResult as b, optimizePrompt as o };
@@ -1,5 +1,5 @@
1
1
  import { CoderOutput, CoderTask } from './profiles.js';
2
- import { L as LoopSandboxClient } from './types-CmTjKLyB.js';
2
+ import { L as LoopSandboxClient } from './types-CmkQl8qE.js';
3
3
  import { SandboxInstance } from '@tangle-network/sandbox';
4
4
  import { O as OpenAIChatTool } from './types-CsCCryln.js';
5
5
 
@@ -361,6 +361,81 @@ interface CreateDefaultCoderDelegateOptions {
361
361
  */
362
362
  declare function createDefaultCoderDelegate(options: CreateDefaultCoderDelegateOptions): CoderDelegate;
363
363
 
364
+ /**
365
+ * @experimental
366
+ *
367
+ * `createKbGate` — the valid-only knowledge-base growth gate, distilled from
368
+ * physim's KB-research subsystem. A research-in-a-loop delegate (or any KB
369
+ * writer) runs candidate facts through this before persisting, so the KB grows
370
+ * with ONLY grounded facts — hallucinated, unsourced, or laundered claims are
371
+ * vetoed at the gate.
372
+ *
373
+ * Fail-closed by construction: every judge must `accept`; the FIRST veto wins
374
+ * and the fact is rejected. The non-negotiable floor (always on, can't be
375
+ * disabled) is the **passage-present guard** — a fact's `verbatimPassage` MUST
376
+ * literally appear in its `sourceText`. That single check kills the dominant
377
+ * failure mode (a confident claim decoupled from any real source).
378
+ *
379
+ * Pure + dependency-free: it operates on fact candidates, not on a store, so it
380
+ * composes with `@tangle-network/agent-knowledge` or any persistence layer
381
+ * without importing it. The remediation policy (correct-on-veto vs
382
+ * escalate-as-unverified) is the caller's — this returns the verdict; it never
383
+ * drops a fact silently.
384
+ */
385
+ /** @experimental A fact proposed for the KB, with its grounding. */
386
+ interface FactCandidate {
387
+ /** The atomic claim text. */
388
+ claim: string;
389
+ /** Optional extracted value (number or string) the claim asserts. */
390
+ value?: string | number;
391
+ /** Verbatim span lifted from the source that backs the claim. */
392
+ verbatimPassage: string;
393
+ /** The raw source text the passage must be grounded in. */
394
+ sourceText: string;
395
+ /** Where the fact claims to come from — checked for circular/self citations. */
396
+ citation?: string;
397
+ }
398
+ /** @experimental */
399
+ interface FactJudgeVerdict {
400
+ accept: boolean;
401
+ reason?: string;
402
+ }
403
+ /** @experimental A pluggable fact validator. Throwing is NOT allowed — return a
404
+ * verdict; a judge that throws is a programmer error, not a veto. */
405
+ interface FactJudge {
406
+ name: string;
407
+ judge(candidate: FactCandidate): FactJudgeVerdict | Promise<FactJudgeVerdict>;
408
+ }
409
+ /** @experimental */
410
+ interface KbGateResult {
411
+ accepted: boolean;
412
+ /** Name of the judge that vetoed; undefined when accepted. */
413
+ vetoedBy?: string;
414
+ reason?: string;
415
+ }
416
+ /** @experimental */
417
+ interface CreateKbGateOptions {
418
+ /** Extra judges appended after the built-in floor (e.g. an LLM judge). */
419
+ judges?: FactJudge[];
420
+ /** Minimum verbatim-passage length. Default 12 — kills empty/stub passages. */
421
+ minPassageChars?: number;
422
+ /**
423
+ * Citation tokens that denote a SELF-generated artifact (e.g. `'spec'`,
424
+ * `'cad_params'`, `'requirements'`). A citation naming one is circular
425
+ * (laundering) — the fact cites a derived artifact, not a real source.
426
+ * Default `[]` (no circular check unless the consumer declares its kinds).
427
+ */
428
+ selfArtifactKinds?: string[];
429
+ }
430
+ /**
431
+ * @experimental
432
+ *
433
+ * Build a fail-closed KB gate. The returned function runs the built-in floor
434
+ * (passage-non-empty → passage-present → value-in-passage → no-circular-citation)
435
+ * then any consumer judges, returning on the first veto.
436
+ */
437
+ declare function createKbGate(options?: CreateKbGateOptions): (candidate: FactCandidate) => Promise<KbGateResult>;
438
+
364
439
  /**
365
440
  * @experimental
366
441
  *
@@ -549,4 +624,4 @@ interface EvalRunsExportResult {
549
624
  */
550
625
  declare function exportEvalRuns(events: EvalRunEvent[], config?: EvalRunsExportConfig): Promise<EvalRunsExportResult>;
551
626
 
552
- export { type FeedbackRefersTo as A, type FleetWorkspaceExecutorOptions as B, type CoderDelegate as C, type DelegationExecutor as D, type ResearchOutputShape as E, type FleetHandle as F, createDefaultCoderDelegate as G, createFleetWorkspaceExecutor as H, createSiblingSandboxExecutor as I, mcpToolsForRuntimeMcp as J, mcpToolsForRuntimeMcpSubset as K, type EvalRunEvent as L, type EvalRunGeneration as M, type EvalRunsExportConfig as N, type OtelExporter as O, type EvalRunsExportResult as P, INTELLIGENCE_WIRE_VERSION as Q, type ResearcherDelegate as R, type SiblingSandboxExecutorOptions as S, type OtelAttribute as T, type OtelExportConfig as U, type OtelSpan as V, buildLoopOtelSpans as W, createOtelExporter as X, exportEvalRuns as Y, loopEventToOtelSpan as Z, type DelegateFeedbackArgs as a, type DelegationFeedbackSnapshot as b, type DelegationProfile as c, type DelegateCodeArgs as d, type DelegateResearchArgs as e, type DelegationStatus as f, type DelegationProgress as g, type DelegationResultPayload as h, type DelegationError as i, type DelegationStatusResult as j, type DelegationHistoryArgs as k, type DelegationHistoryEntry as l, type DelegateCodeResult as m, type DelegateFeedbackResult as n, type ResearchSource as o, type DelegateResearchResult as p, type DelegationHistoryResult as q, type DelegationStatusArgs as r, type CoderReview as s, type CoderReviewer as t, type CoderWinnerSelection as u, type CreateDefaultCoderDelegateOptions as v, type DelegateCodeConfig as w, type DelegateResearchConfig as x, type DelegateRunCtx as y, type FeedbackRating as z };
627
+ export { type OtelSpan as $, type FactCandidate as A, type FactJudge as B, type CoderDelegate as C, type DelegationExecutor as D, type FactJudgeVerdict as E, type FleetHandle as F, type FeedbackRating as G, type FeedbackRefersTo as H, type FleetWorkspaceExecutorOptions as I, type ResearchOutputShape as J, type KbGateResult as K, createDefaultCoderDelegate as L, createFleetWorkspaceExecutor as M, createKbGate as N, type OtelExporter as O, createSiblingSandboxExecutor as P, mcpToolsForRuntimeMcp as Q, type ResearcherDelegate as R, type SiblingSandboxExecutorOptions as S, mcpToolsForRuntimeMcpSubset as T, type EvalRunEvent as U, type EvalRunGeneration as V, type EvalRunsExportConfig as W, type EvalRunsExportResult as X, INTELLIGENCE_WIRE_VERSION as Y, type OtelAttribute as Z, type OtelExportConfig as _, type DelegateFeedbackArgs as a, buildLoopOtelSpans as a0, createOtelExporter as a1, exportEvalRuns as a2, loopEventToOtelSpan as a3, type DelegationFeedbackSnapshot as b, type DelegationProfile as c, type DelegateCodeArgs as d, type DelegateResearchArgs as e, type DelegationStatus as f, type DelegationProgress as g, type DelegationResultPayload as h, type DelegationError as i, type DelegationStatusResult as j, type DelegationHistoryArgs as k, type DelegationHistoryEntry as l, type DelegateCodeResult as m, type DelegateFeedbackResult as n, type ResearchSource as o, type DelegateResearchResult as p, type DelegationHistoryResult as q, type DelegationStatusArgs as r, type CoderReview as s, type CoderReviewer as t, type CoderWinnerSelection as u, type CreateDefaultCoderDelegateOptions as v, type CreateKbGateOptions as w, type DelegateCodeConfig as x, type DelegateResearchConfig as y, type DelegateRunCtx as z };
@@ -1,5 +1,5 @@
1
1
  import { AgentProfile } from '@tangle-network/sandbox';
2
- import { O as OutputAdapter, V as Validator, A as AgentRunSpec, D as Driver } from './types-CmTjKLyB.js';
2
+ import { O as OutputAdapter, V as Validator, A as AgentRunSpec, D as Driver } from './types-CmkQl8qE.js';
3
3
  import '@tangle-network/agent-eval';
4
4
  import './types-CsCCryln.js';
5
5
 
@@ -1,5 +1,5 @@
1
1
  import { DefaultVerdict } from '@tangle-network/agent-eval';
2
- import { CreateSandboxOptions, SandboxInstance, AgentProfile, SandboxEvent } from '@tangle-network/sandbox';
2
+ import { CreateSandboxOptions, SandboxInstance, SandboxEvent, AgentProfile } from '@tangle-network/sandbox';
3
3
  import { A as AgentTaskSpec, R as RuntimeStreamEvent } from './types-CsCCryln.js';
4
4
 
5
5
  /**
@@ -1,4 +1,4 @@
1
- import { AnalystRunEvent, AnalystRunInputs, AnalystFinding, AnalystRunResult, FindingsDiff } from '@tangle-network/agent-eval';
1
+ import { AnalystRunInputs, AnalystFinding, AnalystRunResult, AnalystRunEvent, FindingsDiff } from '@tangle-network/agent-eval';
2
2
 
3
3
  /**
4
4
  * Public types for the closed-loop analyst orchestrator.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@tangle-network/agent-runtime",
3
- "version": "0.37.0",
3
+ "version": "0.38.0",
4
4
  "description": "Reusable runtime lifecycle for domain-specific agents.",
5
5
  "homepage": "https://github.com/tangle-network/agent-runtime#readme",
6
6
  "repository": {