@tangle-network/agent-runtime 0.36.0 → 0.38.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/agent.d.ts +3 -3
- package/dist/analyst-loop.d.ts +2 -2
- package/dist/analyst-loop.js +3 -257
- package/dist/analyst-loop.js.map +1 -1
- package/dist/{chunk-NYGEI3NV.js → chunk-M65QJD35.js} +5 -211
- package/dist/chunk-M65QJD35.js.map +1 -0
- package/dist/{chunk-HSX6PFZR.js → chunk-V6GURW4W.js} +209 -1
- package/dist/chunk-V6GURW4W.js.map +1 -0
- package/dist/chunk-VOX6Z3II.js +90 -0
- package/dist/chunk-VOX6Z3II.js.map +1 -0
- package/dist/chunk-XBUG326M.js +261 -0
- package/dist/chunk-XBUG326M.js.map +1 -0
- package/dist/{chunk-7ZECSZ3C.js → chunk-Z523NPJK.js} +59 -2
- package/dist/chunk-Z523NPJK.js.map +1 -0
- package/dist/dynamic-DeOPeeAw.d.ts +106 -0
- package/dist/{improvement-adapter-CaZxFxTd.d.ts → improvement-adapter-BC4HhuAR.d.ts} +1 -1
- package/dist/improvement.d.ts +6 -130
- package/dist/improvement.js +4 -85
- package/dist/improvement.js.map +1 -1
- package/dist/index.d.ts +148 -3
- package/dist/index.js +109 -2
- package/dist/index.js.map +1 -1
- package/dist/loops.d.ts +5 -107
- package/dist/mcp/bin.js +4 -3
- package/dist/mcp/bin.js.map +1 -1
- package/dist/mcp/index.d.ts +6 -440
- package/dist/mcp/index.js +7 -62
- package/dist/mcp/index.js.map +1 -1
- package/dist/optimize-prompt-cmH9wZdH.d.ts +129 -0
- package/dist/otel-export-CNmeg_7B.d.ts +627 -0
- package/dist/profiles.d.ts +1 -2
- package/dist/{types-DrXVR2Fu.d.ts → types-CmkQl8qE.d.ts} +137 -3
- package/dist/{types-D_MXrmJP.d.ts → types-p8dWBIXL.d.ts} +1 -1
- package/package.json +1 -1
- package/dist/chunk-7ZECSZ3C.js.map +0 -1
- package/dist/chunk-HSX6PFZR.js.map +0 -1
- package/dist/chunk-NYGEI3NV.js.map +0 -1
- package/dist/otel-export-xgf4J6bo.d.ts +0 -191
- package/dist/runtime-run-B8VIiOhI.d.ts +0 -137
|
@@ -0,0 +1,129 @@
|
|
|
1
|
+
import { LlmClientOptions } from '@tangle-network/agent-eval';
|
|
2
|
+
import { Scenario, DispatchContext, JudgeConfig, ImprovementDriver, Gate, CampaignStorage, GateResult, RunImprovementLoopResult } from '@tangle-network/agent-eval/campaign';
|
|
3
|
+
|
|
4
|
+
/**
 * @experimental
 *
 * `optimizePrompt` — identity-gated optimization for any TEXT prompt surface
 * (system prompt, planner prompt, judge rubric, skill doc).
 *
 * The text-surface sibling to this module's `improvementDriver` (the
 * CODE-surface / worktree path). Both feed agent-eval's `runImprovementLoop`;
 * this one defaults the driver to agent-eval's `gepaDriver` (reflective text
 * mutator) and the gate to `heldOutGate`.
 *
 * IDENTITY-GATED BY CONSTRUCTION — the whole point. The loop runs evals,
 * collects per-scenario signal, proposes candidates, and the gate compares
 * candidate-vs-baseline ON THE HELDOUT. `result.prompt` is the baseline
 * (identity) UNLESS the gate decided `'ship'`. So wiring a surface up is safe:
 * a surface with no beneficial mutation simply keeps its baseline. You never
 * regress by registering a prompt — you only ever improve when the held-out
 * data earns it.
 *
 * Generic over the runtime: `runWithPrompt` is the only domain seam — given a
 * candidate prompt + scenario, run it however the surface runs (sandbox
 * `streamPrompt`, a `runLoop`, a direct model call) and return the artifact the
 * judges score. The optimizer never assumes how a prompt is executed.
 */
|
|
28
|
+
|
|
29
|
+
/**
 * Settings for the reflective rewrite performed by the default `gepaDriver`.
 * Leave this out when a custom `driver` is supplied instead.
 */
interface OptimizePromptReflection {
  /** Router transport used to reach the reflection model. */
  llm: LlmClientOptions;

  /** Name of the model that carries out the reflective rewrite. */
  model: string;

  /**
   * Describes what is being optimized so the reflection prompt is oriented
   * toward it. Defaults to `'system prompt'`.
   */
  target?: Array<string>[number];

  /** Mutation levers specific to this surface, offered to the reflector. */
  mutationPrimitives?: Array<string>;

  /**
   * H2 headings (`## Foo`) that MUST still be present in every candidate.
   * This is gepaDriver's only structural guard, so any load-bearing section
   * of the prompt should be a `##` heading; a rewrite then cannot drop it.
   */
  preserveSections?: Array<string>;

  /**
   * Upper bound on sentence-level edits a candidate may make relative to its
   * parent — a textual learning rate. Prevents a single generation's rewrite
   * from wiping out previously learned rules.
   */
  maxSentenceEdits?: number;
}
|
|
49
|
+
/**
 * Inputs accepted by `optimizePrompt`.
 *
 * `TScenario` is the scenario shape fed to `runWithPrompt` and the judges;
 * `TArtifact` is whatever `runWithPrompt` produces for the judges to score.
 *
 * @experimental
 */
interface OptimizePromptOptions<TScenario extends Scenario, TArtifact> {
  /** Prompt under optimization; also the identity baseline the gate protects. */
  baselinePrompt: string;

  /**
   * The domain seam. Executes a candidate prompt against one scenario and
   * resolves to the artifact the judges will score. The optimizer does not
   * care HOW the prompt is run.
   */
  runWithPrompt: (prompt: string, scenario: TScenario, ctx: DispatchContext) => Promise<TArtifact>;

  /** Training pool, scored every generation in order to rank candidates. */
  scenarios: Array<TScenario>;

  /**
   * Scenarios withheld from training and scored ONLY to compute the gate's
   * baseline-vs-winner delta. Keep them disjoint from `scenarios` — that
   * separation is what makes promotion reflect generalization rather than
   * memorization.
   */
  holdoutScenarios: Array<TScenario>;

  /** Scorers: either deterministic checks or LLM judges. */
  judges: Array<JudgeConfig<TArtifact, TScenario>>;

  /**
   * Destination for artifacts and traces. With in-memory storage this is
   * just an opaque key.
   */
  runDir: string;

  /**
   * Configuration from which the default `gepaDriver` is built. Required
   * UNLESS `driver` is provided.
   */
  reflection?: OptimizePromptReflection;

  /**
   * Replaces the improvement strategy, e.g. with a custom driver or a
   * deterministic one for tests.
   */
  driver?: ImprovementDriver;

  /**
   * Replaces the promotion gate. The default is `heldOutGate` over
   * `holdoutScenarios`, which costs no extra LLM calls. For production
   * wiring, wrap `defaultProductionGate` to add red-team / reward-hacking
   * hardening.
   */
  gate?: Gate<TArtifact, TScenario>;

  /**
   * Smallest held-out composite lift that is allowed to ship; passed through
   * to the default `heldOutGate`. If omitted, the gate applies its own
   * default.
   */
  deltaThreshold?: number;

  /** Number of candidates proposed in each generation. Default 4. */
  populationSize?: number;

  /** Number of generations to run. Default 3. */
  maxGenerations?: number;

  /** Number of candidates carried forward to the next generation. Default 2. */
  promoteTopK?: number;

  /**
   * Storage backend. Defaults to the Node filesystem; pass
   * `inMemoryCampaignStorage()` for filesystem-less or test runs.
   */
  storage?: CampaignStorage;

  /** Seed for reproducibility. Default 42. */
  seed?: number;

  /** Replicates per scenario, used for CI bands. Default 1. */
  reps?: number;

  /** Maximum number of cells run concurrently. Default 2. */
  maxConcurrency?: number;

  /** Test seam that overrides the wall clock. */
  now?: () => Date;

  /**
   * Action taken when the gate ships: `'pr'` opens a PR, `'none'` only
   * reports. Default `'none'`.
   */
  autoOnPromote?: 'pr' | 'none';

  /**
   * NOTE(review): undocumented upstream — presumably the GitHub owner used
   * when `autoOnPromote` is `'pr'`; confirm against the implementation.
   */
  ghOwner?: string;

  /**
   * NOTE(review): undocumented upstream — presumably the GitHub repository
   * used when `autoOnPromote` is `'pr'`; confirm against the implementation.
   */
  ghRepo?: string;
}
|
|
101
|
+
/**
 * Outcome reported by `optimizePrompt`.
 *
 * Note the type-parameter order here is `<TArtifact, TScenario>`, the reverse
 * of `OptimizePromptOptions<TScenario, TArtifact>`.
 *
 * @experimental
 */
interface OptimizePromptResult<TArtifact, TScenario extends Scenario> {
  /**
   * The prompt callers should USE. It is the baseline (identity) unless the
   * gate shipped a winner, so assigning `result.prompt` unconditionally is
   * always safe.
   */
  prompt: string;

  /** `true` only if the gate promoted a candidate over the baseline on holdout. */
  improved: boolean;

  /** Verdict returned by the gate (`'ship' | 'hold' | 'need_more_work' | ...`). */
  decision: GateResult['decision'];

  /** The gate's human-readable reasons for its verdict. */
  reasons: Array<string>;

  /** Baseline's mean composite score on the held-out set. */
  baselineComposite: number;

  /** Winner candidate's mean composite score on the held-out set. */
  winnerComposite: number;

  /**
   * Lift on the held-out set (winner − baseline); when the gate reported a
   * `delta`, this is that value.
   */
  delta: number;

  /**
   * Explanation of why the winner was proposed. Present when a shipped
   * winner carried a rationale from the driver.
   */
  rationale?: string;

  /**
   * Unified diff from baseline to winner. Empty when the winner is the
   * baseline itself.
   */
  diff: string;

  /**
   * Complete loop result, for callers that need access to generations or
   * campaigns.
   */
  raw: RunImprovementLoopResult<TArtifact, TScenario>;
}
|
|
126
|
+
/**
 * Identity-gated optimization of a text prompt surface.
 *
 * @param opts Baseline prompt, scenarios, judges and loop settings; see
 *   `OptimizePromptOptions`.
 * @returns A promise of the `OptimizePromptResult`, whose `prompt` is the
 *   baseline unless the gate shipped a winner.
 *
 * @experimental
 */
declare function optimizePrompt<TScenario extends Scenario, TArtifact>(
  opts: OptimizePromptOptions<TScenario, TArtifact>,
): Promise<OptimizePromptResult<TArtifact, TScenario>>;
|
|
128
|
+
|
|
129
|
+
export { type OptimizePromptOptions as O, type OptimizePromptReflection as a, type OptimizePromptResult as b, optimizePrompt as o };
|