clementine-agent 1.18.167 → 1.18.169
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/agent/run-agent.d.ts +7 -0
- package/dist/agent/run-agent.js +96 -18
- package/dist/agent/tool-output-guard.d.ts +165 -0
- package/dist/agent/tool-output-guard.js +388 -0
- package/dist/cli/dashboard.js +269 -79
- package/dist/config/clementine-json.d.ts +7 -0
- package/dist/config/clementine-json.js +23 -0
- package/dist/config.d.ts +7 -0
- package/dist/config.js +31 -0
- package/package.json +1 -1
|
@@ -31,6 +31,7 @@ export declare function invalidateMcpStatusEntry(name: string): {
|
|
|
31
31
|
cleared: boolean;
|
|
32
32
|
updatedAt: string;
|
|
33
33
|
};
|
|
34
|
+
import { type ToolOutputGuardConfig } from './tool-output-guard.js';
|
|
34
35
|
import type { AgentProfile } from '../types.js';
|
|
35
36
|
import type { AgentManager } from './agent-manager.js';
|
|
36
37
|
import type { MemoryStore } from '../memory/store.js';
|
|
@@ -110,6 +111,12 @@ export interface RunAgentOptions {
|
|
|
110
111
|
* team-task) keep the prompt small. When unset, falls back to
|
|
111
112
|
* profile.systemPromptBody (legacy single-source behavior). */
|
|
112
113
|
systemPromptAppend?: string;
|
|
114
|
+
/** Per-run override for the tool-output-guard config (1.18.169).
|
|
115
|
+
* Defaults come from src/config.ts TOOL_OUTPUT_GUARD (env +
|
|
116
|
+
* clementine.json). Pass null to disable the guard for this run
|
|
117
|
+
* (rarely needed — almost always a sign that perTool overrides
|
|
118
|
+
* would be safer). */
|
|
119
|
+
toolOutputGuard?: Partial<ToolOutputGuardConfig> | null;
|
|
113
120
|
}
|
|
114
121
|
export interface RunAgentResult {
|
|
115
122
|
/** Final text response from the agent. */
|
package/dist/agent/run-agent.js
CHANGED
|
@@ -84,7 +84,8 @@ export function invalidateMcpStatusEntry(name) {
|
|
|
84
84
|
};
|
|
85
85
|
return { cleared: _lastMcpStatusSnapshot.servers.length < before, updatedAt: _lastMcpStatusSnapshot.updatedAt };
|
|
86
86
|
}
|
|
87
|
-
import { BASE_DIR, PKG_DIR, CLAUDE_CODE_OAUTH_TOKEN, ANTHROPIC_API_KEY as CONFIG_ANTHROPIC_API_KEY, normalizeClaudeSdkOptionsForOneMillionContext, } from '../config.js';
|
|
87
|
+
import { BASE_DIR, PKG_DIR, CLAUDE_CODE_OAUTH_TOKEN, ANTHROPIC_API_KEY as CONFIG_ANTHROPIC_API_KEY, normalizeClaudeSdkOptionsForOneMillionContext, TOOL_OUTPUT_GUARD, } from '../config.js';
|
|
88
|
+
import { buildGuardHooks } from './tool-output-guard.js';
|
|
88
89
|
import { buildAgentMap } from './agent-definitions.js';
|
|
89
90
|
import { buildExecutionToolPolicy, } from './execution-policy.js';
|
|
90
91
|
const MCP_SERVER_SCRIPT = path.join(PKG_DIR, 'dist', 'tools', 'mcp-server.js');
|
|
@@ -277,6 +278,69 @@ export async function runAgent(prompt, opts) {
|
|
|
277
278
|
opts.abortSignal.addEventListener('abort', () => sdkAbortController.abort(), { once: true });
|
|
278
279
|
}
|
|
279
280
|
}
|
|
281
|
+
// PRD §6 / 1.18.85: stable run id created before sdkOptions so the
|
|
282
|
+
// tool-output guard (1.18.169) can namespace its on-disk archive by
|
|
283
|
+
// runId. EventLog writers below also reference this id.
|
|
284
|
+
const runId = randomUUID();
|
|
285
|
+
const eventLog = new EventLog();
|
|
286
|
+
let eventSeq = 0;
|
|
287
|
+
const writeEvent = (e) => {
|
|
288
|
+
try {
|
|
289
|
+
eventLog.append({
|
|
290
|
+
...e,
|
|
291
|
+
runId,
|
|
292
|
+
seq: eventSeq++,
|
|
293
|
+
});
|
|
294
|
+
}
|
|
295
|
+
catch { /* never block */ }
|
|
296
|
+
};
|
|
297
|
+
// ── Tool-output guard hooks (1.18.169) ─────────────────────────────
|
|
298
|
+
// Bounds the per-tool-call output that reaches the model so SDK
|
|
299
|
+
// auto-compaction never thrashes on a runaway MCP result. The hook
|
|
300
|
+
// ALSO mirrors compression events into the run's EventLog so the Run
|
|
301
|
+
// detail page can show "[guard] outlook_inbox: 412KB → 28KB" badges.
|
|
302
|
+
// Per-run config merges over the static TOOL_OUTPUT_GUARD defaults;
|
|
303
|
+
// pass opts.toolOutputGuard = null to opt out entirely.
|
|
304
|
+
//
|
|
305
|
+
// `mostRecentUsageTokens` is updated from each assistant message's
|
|
306
|
+
// usage block (input + cache_read + cache_creation tokens). The
|
|
307
|
+
// window estimate is a conservative 180K — even 1M-context runs
|
|
308
|
+
// benefit from staying near 200K because compaction kicks in
|
|
309
|
+
// earlier and large tool outputs amplify thrash regardless of window.
|
|
310
|
+
let mostRecentUsageTokens = 0;
|
|
311
|
+
const usageWindowEstimate = 180_000; // tokens, conservative
|
|
312
|
+
const guardConfig = opts.toolOutputGuard === null
|
|
313
|
+
? null
|
|
314
|
+
: {
|
|
315
|
+
softLimitBytes: opts.toolOutputGuard?.softLimitBytes ?? TOOL_OUTPUT_GUARD.softLimitBytes,
|
|
316
|
+
hardLimitBytes: opts.toolOutputGuard?.hardLimitBytes ?? TOOL_OUTPUT_GUARD.hardLimitBytes,
|
|
317
|
+
adaptive: opts.toolOutputGuard?.adaptive ?? TOOL_OUTPUT_GUARD.adaptive,
|
|
318
|
+
perTool: { ...TOOL_OUTPUT_GUARD.perTool, ...(opts.toolOutputGuard?.perTool ?? {}) },
|
|
319
|
+
};
|
|
320
|
+
const guard = guardConfig
|
|
321
|
+
? buildGuardHooks({
|
|
322
|
+
runId,
|
|
323
|
+
config: guardConfig,
|
|
324
|
+
usageRatio: () => mostRecentUsageTokens / usageWindowEstimate,
|
|
325
|
+
onCompress: (info) => {
|
|
326
|
+
writeEvent({
|
|
327
|
+
kind: 'tool_result',
|
|
328
|
+
ts: new Date().toISOString(),
|
|
329
|
+
sessionId,
|
|
330
|
+
toolUseId: info.toolUseId,
|
|
331
|
+
toolResult: {
|
|
332
|
+
_clementine_guard: true,
|
|
333
|
+
tool: info.toolName,
|
|
334
|
+
originalBytes: info.originalBytes,
|
|
335
|
+
capBytes: info.capBytes,
|
|
336
|
+
bytesShed: info.bytesShed,
|
|
337
|
+
ceilingHit: info.ceilingHit,
|
|
338
|
+
...(info.archivePath ? { archivePath: info.archivePath } : {}),
|
|
339
|
+
},
|
|
340
|
+
});
|
|
341
|
+
},
|
|
342
|
+
})
|
|
343
|
+
: { hooks: {}, stats: { inspected: 0, compressed: 0, ceilingHits: 0, bytesShed: 0, compactions: 0 } };
|
|
280
344
|
// Apply 1M-context env normalization (existing infra)
|
|
281
345
|
const sdkOptionsRaw = {
|
|
282
346
|
systemPrompt: profileAppend
|
|
@@ -316,6 +380,10 @@ export async function runAgent(prompt, opts) {
|
|
|
316
380
|
...(opts.additionalDirectories && opts.additionalDirectories.length > 0
|
|
317
381
|
? { additionalDirectories: opts.additionalDirectories }
|
|
318
382
|
: {}),
|
|
383
|
+
// 1.18.169 — install the tool-output guard hooks. SDK types accept
|
|
384
|
+
// `hooks` keyed by HookEvent; the empty object is a no-op when the
|
|
385
|
+
// guard is disabled.
|
|
386
|
+
...(Object.keys(guard.hooks).length > 0 ? { hooks: guard.hooks } : {}),
|
|
319
387
|
};
|
|
320
388
|
const sdkOptions = normalizeClaudeSdkOptionsForOneMillionContext(sdkOptionsRaw);
|
|
321
389
|
logger.info({
|
|
@@ -332,23 +400,9 @@ export async function runAgent(prompt, opts) {
|
|
|
332
400
|
permissionMode: toolPolicy.permissionMode,
|
|
333
401
|
mcpServerCount: Object.keys(mcpServers).length,
|
|
334
402
|
}, 'runAgent: starting query');
|
|
335
|
-
// PRD §6 / 1.18.85: path A in-process tap.
|
|
336
|
-
//
|
|
337
|
-
//
|
|
338
|
-
// throws back to the caller — telemetry is best-effort.
|
|
339
|
-
const runId = randomUUID();
|
|
340
|
-
const eventLog = new EventLog();
|
|
341
|
-
let eventSeq = 0;
|
|
342
|
-
const writeEvent = (e) => {
|
|
343
|
-
try {
|
|
344
|
-
eventLog.append({
|
|
345
|
-
...e,
|
|
346
|
-
runId,
|
|
347
|
-
seq: eventSeq++,
|
|
348
|
-
});
|
|
349
|
-
}
|
|
350
|
-
catch { /* never block */ }
|
|
351
|
-
};
|
|
403
|
+
// PRD §6 / 1.18.85: path A in-process tap. runId / eventLog / writeEvent
|
|
404
|
+
// are declared earlier (above sdkOptionsRaw) because the tool-output
|
|
405
|
+
// guard's onCompress callback needs them at hook-registration time.
|
|
352
406
|
let finalText = '';
|
|
353
407
|
let sessionId = '';
|
|
354
408
|
let totalCostUsd = 0;
|
|
@@ -389,6 +443,21 @@ export async function runAgent(prompt, opts) {
|
|
|
389
443
|
}
|
|
390
444
|
if (message.type === 'assistant') {
|
|
391
445
|
const am = message;
|
|
446
|
+
// 1.18.169 — capture this turn's usage so the tool-output guard can
|
|
447
|
+
// adaptively tighten its cap as cumulative context climbs. We sum
|
|
448
|
+
// input + cache_read + cache_creation because all three count
|
|
449
|
+
// against the model's window for the NEXT turn. Output_tokens isn't
|
|
450
|
+
// included — it's not retained in context after the model response
|
|
451
|
+
// is processed.
|
|
452
|
+
const tokenUsage = am.message?.usage;
|
|
453
|
+
if (tokenUsage) {
|
|
454
|
+
const recent = (tokenUsage.input_tokens ?? 0)
|
|
455
|
+
+ (tokenUsage.cache_read_input_tokens ?? 0)
|
|
456
|
+
+ (tokenUsage.cache_creation_input_tokens ?? 0);
|
|
457
|
+
if (Number.isFinite(recent) && recent > 0) {
|
|
458
|
+
mostRecentUsageTokens = recent;
|
|
459
|
+
}
|
|
460
|
+
}
|
|
392
461
|
// SDK content blocks include text, tool_use, and (when extended-thinking
|
|
393
462
|
// is enabled) thinking. We tap each kind into the Event store.
|
|
394
463
|
const blocks = (am.message?.content ?? []);
|
|
@@ -562,6 +631,15 @@ export async function runAgent(prompt, opts) {
|
|
|
562
631
|
totalCostUsd: Number(totalCostUsd.toFixed(4)),
|
|
563
632
|
durationMs: Date.now() - startedAt,
|
|
564
633
|
finalTextChars: finalText.length,
|
|
634
|
+
// 1.18.169 — tool-output guard summary, surfaced for observability.
|
|
635
|
+
// Non-zero `compressed` means the guard kept the SDK from thrashing.
|
|
636
|
+
guard: guard.stats.inspected > 0 ? {
|
|
637
|
+
inspected: guard.stats.inspected,
|
|
638
|
+
compressed: guard.stats.compressed,
|
|
639
|
+
bytesShed: guard.stats.bytesShed,
|
|
640
|
+
compactions: guard.stats.compactions,
|
|
641
|
+
ceilingHits: guard.stats.ceilingHits,
|
|
642
|
+
} : undefined,
|
|
565
643
|
}, 'runAgent: query complete');
|
|
566
644
|
// PRD §6 Phase 4e: subagent transcript backfill (Path C). The SDK persists
|
|
567
645
|
// every subagent's full message stream to ~/.claude/projects/<encoded-cwd>/
|
|
@@ -0,0 +1,165 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* tool-output-guard — PostToolUse hook that bounds per-call tool output size
|
|
3
|
+
* so the SDK's auto-compactor can never thrash on a runaway MCP result.
|
|
4
|
+
*
|
|
5
|
+
* Why this exists
|
|
6
|
+
* ───────────────
|
|
7
|
+
* Anthropic's Claude Agent SDK auto-compacts when context approaches the
|
|
8
|
+
* model's window. When a *single* tool result is larger than the room left
|
|
9
|
+
* after compaction, the next turn refills the window immediately. After 3
|
|
10
|
+
* consecutive compactions that don't help, the SDK throws:
|
|
11
|
+
*
|
|
12
|
+
* "Autocompact is thrashing: the context refilled to the limit within 3
|
|
13
|
+
* turns of the previous compact, 3 times in a row."
|
|
14
|
+
*
|
|
15
|
+
* This used to happen on outlook-email-triage and imessage-triage when their
|
|
16
|
+
* MCP tools (`mcp__claude_ai_Microsoft_365__outlook_inbox`,
|
|
17
|
+
* `imessage_read`) returned tens-to-hundreds of KB per call. Our own
|
|
18
|
+
* Clementine MCP tools cap output at 30KB (`capOutput` in src/tools/shared.ts)
|
|
19
|
+
* but third-party MCPs (Composio, claude.ai, iMessage) ignore that.
|
|
20
|
+
*
|
|
21
|
+
* The fix is the canonical Anthropic primitive: a `PostToolUse` hook that
|
|
22
|
+
* returns `hookSpecificOutput.updatedToolOutput` to replace the result
|
|
23
|
+
* before it reaches the model. From sdk.d.ts:1979 — "Replaces the tool
|
|
24
|
+
* output before it is sent to the model."
|
|
25
|
+
*
|
|
26
|
+
* Design properties
|
|
27
|
+
* ─────────────────
|
|
28
|
+
* 1. Operates at the SDK boundary, once. Every runAgent caller (chat, cron,
|
|
29
|
+
* runSkill, heartbeat, team-task, hired agents) is protected for free.
|
|
30
|
+
* 2. Transparent: full payload is archived to disk so the agent can
|
|
31
|
+
* `Read <path>` if it really needs the rest. Nothing is silently lost.
|
|
32
|
+
* 3. Structure-aware: arrays of objects keep the first N + last 2 items
|
|
33
|
+
* plus a summary of the rest; emails/messages drop verbose body fields
|
|
34
|
+
* when they alone exceed the cap.
|
|
35
|
+
* 4. Per-tool overrides via clementine.json `toolOutputGuard.perTool`.
|
|
36
|
+
* 5. Adaptive: when the latest turn's input-side token usage climbs >50% of the
|
|
37
|
+
* model's window, the soft cap shrinks (×0.6, then ×0.35 at ≥75%). Stops thrashing in the
|
|
38
|
+
* pathological "many medium-sized calls" case the static cap misses.
|
|
39
|
+
* 6. Hard ceiling (default 200KB) always enforced — three back-to-back
|
|
40
|
+
* 400KB outputs would thrash even at 1M context.
|
|
41
|
+
*
|
|
42
|
+
* Failure mode: this module never throws. A bad input, an archive write
|
|
43
|
+
* failure, anything — falls back to returning the original output. The
|
|
44
|
+
* caller (runAgent → SDK) is unblocked. The guard logs the issue and
|
|
45
|
+
* continues. Telemetry must never break execution.
|
|
46
|
+
*/
|
|
47
|
+
import type { HookCallbackMatcher, HookEvent } from '@anthropic-ai/claude-agent-sdk';
|
|
48
|
+
export interface ToolOutputGuardConfig {
|
|
49
|
+
/** Bytes — typical soft cap before compression kicks in. */
|
|
50
|
+
softLimitBytes: number;
|
|
51
|
+
/** Bytes — feasibility ceiling. Never exceeded regardless of context. */
|
|
52
|
+
hardLimitBytes: number;
|
|
53
|
+
/** When true, the soft cap shrinks as cumulative context fills up. */
|
|
54
|
+
adaptive: boolean;
|
|
55
|
+
/** Tool-name → bytes overrides. Keys match the SDK `tool_name` field
|
|
56
|
+
* (MCP tools: `mcp__<server>__<tool>`; native tools: `Read`, `Bash`, …). */
|
|
57
|
+
perTool: Record<string, number>;
|
|
58
|
+
}
|
|
59
|
+
/** Default config from src/config.ts; callers can override per-run via
|
|
60
|
+
* RunAgentOptions if a particular run truly needs more space. */
|
|
61
|
+
export declare function defaultGuardConfig(): ToolOutputGuardConfig;
|
|
62
|
+
export interface GuardRunStats {
|
|
63
|
+
/** Tool calls inspected by the guard. */
|
|
64
|
+
inspected: number;
|
|
65
|
+
/** Tool calls that exceeded the soft cap and were compressed. */
|
|
66
|
+
compressed: number;
|
|
67
|
+
/** Tool calls that exceeded the hard ceiling. */
|
|
68
|
+
ceilingHits: number;
|
|
69
|
+
/** Bytes of payload deferred to the archive (i.e. not sent to the model). */
|
|
70
|
+
bytesShed: number;
|
|
71
|
+
/** Number of SDK auto-compactions observed for this run. */
|
|
72
|
+
compactions: number;
|
|
73
|
+
}
|
|
74
|
+
/**
|
|
75
|
+
* Approximate the byte size of a tool_response as it will appear in the
|
|
76
|
+
* model's context. The SDK's tool_response may be a primitive, an object,
|
|
77
|
+
* a content-block array, or already-truncated string. We JSON-stringify
|
|
78
|
+
* because that's how the SDK ships it onwards, then byte-length the result.
|
|
79
|
+
*/
|
|
80
|
+
export declare function estimateBytes(value: unknown): number;
|
|
81
|
+
interface CompressionContext {
|
|
82
|
+
toolName: string;
|
|
83
|
+
archivePath: string | null;
|
|
84
|
+
cap: number;
|
|
85
|
+
}
|
|
86
|
+
/**
|
|
87
|
+
* Tighten the soft cap when cumulative context usage is already high.
|
|
88
|
+
*
|
|
89
|
+
* Inputs: `recentUsageRatio` is the most-recent (input + cache_read + cache_creation) /
|
|
90
|
+
* window-size estimate from the SDK's usage block — see runAgent for how
|
|
91
|
+
* it's plumbed. We don't know the exact window (200K vs 1M depends on
|
|
92
|
+
* model + plan + 1m flag) so we pass a conservative 180K when nothing
|
|
93
|
+
* else is known.
|
|
94
|
+
*
|
|
95
|
+
* usage <50% of window: full softLimit
|
|
96
|
+
* usage 50–75%: softLimit × 0.6
|
|
97
|
+
* usage ≥75%: softLimit × 0.35
|
|
98
|
+
*
|
|
99
|
+
* The hard ceiling is never reduced — it's already a feasibility cap.
|
|
100
|
+
*/
|
|
101
|
+
export declare function adaptiveSoftCap(baseSoftLimit: number, recentUsageRatio: number, adaptive: boolean): number;
|
|
102
|
+
/** Resolve the effective cap for a given tool call. */
|
|
103
|
+
export declare function resolveCap(toolName: string, config: ToolOutputGuardConfig, usageRatio: number): {
|
|
104
|
+
softCap: number;
|
|
105
|
+
hardCap: number;
|
|
106
|
+
};
|
|
107
|
+
export interface CompressOutcome {
|
|
108
|
+
/** What goes back to the model. Same shape contract as the SDK
|
|
109
|
+
* tool_response — string OR an object — preserving caller intent. */
|
|
110
|
+
output: unknown;
|
|
111
|
+
/** Bytes of payload that didn't reach the model. */
|
|
112
|
+
bytesShed: number;
|
|
113
|
+
/** Did we trip the hard ceiling? */
|
|
114
|
+
ceilingHit: boolean;
|
|
115
|
+
/** Did the input fit under the cap (no compression done)? */
|
|
116
|
+
passthrough: boolean;
|
|
117
|
+
}
|
|
118
|
+
/**
|
|
119
|
+
* Compress a tool result if it exceeds the cap. Pure function — does not
|
|
120
|
+
* write to disk. The caller handles archive + telemetry.
|
|
121
|
+
*/
|
|
122
|
+
export declare function compressToolOutput(_toolName: string, rawOutput: unknown, ctx: CompressionContext): CompressOutcome;
|
|
123
|
+
export interface GuardHookOptions {
|
|
124
|
+
/** Stable run identifier — used to namespace the on-disk archive. */
|
|
125
|
+
runId: string;
|
|
126
|
+
/** Static config (env + clementine.json). */
|
|
127
|
+
config?: ToolOutputGuardConfig;
|
|
128
|
+
/** Optional callback fired on every compression. Used by runAgent to
|
|
129
|
+
* record an Event row for the Run detail page. */
|
|
130
|
+
onCompress?: (info: {
|
|
131
|
+
toolName: string;
|
|
132
|
+
toolUseId: string;
|
|
133
|
+
originalBytes: number;
|
|
134
|
+
capBytes: number;
|
|
135
|
+
bytesShed: number;
|
|
136
|
+
ceilingHit: boolean;
|
|
137
|
+
archivePath: string | null;
|
|
138
|
+
}) => void;
|
|
139
|
+
/** Optional source of the current cumulative context-usage ratio
|
|
140
|
+
* (input + cache_read + cache_creation) / window. Normally in [0,1], but may exceed 1 when the window is underestimated. The
|
|
141
|
+
* guard calls this once per tool result to adapt the cap. When
|
|
142
|
+
* absent, ratio is assumed 0 (full soft cap is always used). */
|
|
143
|
+
usageRatio?: () => number;
|
|
144
|
+
/** Optional archive root override for tests. Defaults to Clementine home. */
|
|
145
|
+
archiveBaseDir?: string;
|
|
146
|
+
}
|
|
147
|
+
export interface GuardHookHandles {
|
|
148
|
+
/** Hook map suitable for SDK `query({ options: { hooks } })`. */
|
|
149
|
+
hooks: Partial<Record<HookEvent, HookCallbackMatcher[]>>;
|
|
150
|
+
/** Aggregated telemetry — read after the run completes. */
|
|
151
|
+
stats: GuardRunStats;
|
|
152
|
+
}
|
|
153
|
+
/**
|
|
154
|
+
* Build the hook handles that runAgent will hand to the SDK.
|
|
155
|
+
*
|
|
156
|
+
* Returns one PostToolUse hook (does the work), one PreCompact hook
|
|
157
|
+
* (logs the compaction request), and one PostCompact hook (logs the
|
|
158
|
+
* summary length so we can correlate with the next turn's behavior).
|
|
159
|
+
*
|
|
160
|
+
* If `TOOL_OUTPUT_GUARD.enabled` is false, returns empty handles —
|
|
161
|
+
* a noop merge with whatever the caller already had.
|
|
162
|
+
*/
|
|
163
|
+
export declare function buildGuardHooks(opts: GuardHookOptions): GuardHookHandles;
|
|
164
|
+
export {};
|
|
165
|
+
//# sourceMappingURL=tool-output-guard.d.ts.map
|