npm - @agi-cli/server - Versions diffs - 0.1.57 → 0.1.59 - Mend

@agi-cli/server 0.1.57 → 0.1.59

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (10) hide show

package/package.json +3 -3
package/src/index.ts +30 -21
package/src/runtime/agent-registry.ts +3 -2
package/src/runtime/cache-optimizer.ts +115 -0
package/src/runtime/context-optimizer.ts +192 -0
package/src/runtime/db-operations.ts +154 -15
package/src/runtime/runner.ts +29 -4
package/src/runtime/session-manager.ts +2 -0
package/src/runtime/stream-handlers.ts +61 -12
package/src/tools/adapter.ts +261 -173

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
 	"name": "@agi-cli/server",
-	"version": "0.1.57",
+	"version": "0.1.59",
 	"description": "HTTP API server for AGI CLI",
 	"type": "module",
 	"main": "./src/index.ts",
@@ -29,8 +29,8 @@
 		"typecheck": "tsc --noEmit"
 	},
 	"dependencies": {
-		"@agi-cli/sdk": "0.1.57",
-		"@agi-cli/database": "0.1.57",
+		"@agi-cli/sdk": "0.1.59",
+		"@agi-cli/database": "0.1.59",
 		"drizzle-orm": "^0.44.5",
 		"hono": "^4.9.9"
 	},

package/src/index.ts CHANGED Viewed

@@ -14,7 +14,7 @@ import type { AgentConfigEntry } from './runtime/agent-registry.ts';
 function initApp() {
 	const app = new Hono();
-	// Enable CORS for all localhost ports (for web UI on random ports)
+	// Enable CORS for localhost and local network access
 	app.use(
 		'*',
 		cors({
@@ -22,15 +22,16 @@ function initApp() {
 				// Allow all localhost and 127.0.0.1 on any port
 				if (
 					origin.startsWith('http://localhost:') ||
-					origin.startsWith('http://127.0.0.1:')
+					origin.startsWith('http://127.0.0.1:') ||
+					origin.startsWith('https://localhost:') ||
+					origin.startsWith('https://127.0.0.1:')
 				) {
 					return origin;
 				}
-				// Allow common dev ports
-				if (
-					origin === 'http://localhost:5173' ||
-					origin === 'http://localhost:5174'
-				) {
+				// Allow local network IPs (192.168.x.x, 10.x.x.x, 172.16-31.x.x)
+				const localNetworkPattern =
+					/^https?:\/\/(192\.168\.\d{1,3}\.\d{1,3}|10\.\d{1,3}\.\d{1,3}\.\d{1,3}|172\.(1[6-9]|2\d|3[01])\.\d{1,3}\.\d{1,3}):\d+$/;
+				if (localNetworkPattern.test(origin)) {
 					return origin;
 				}
 				// Default to allowing the origin (can be restricted in production)
@@ -76,7 +77,7 @@ export type StandaloneAppConfig = {
 export function createStandaloneApp(_config?: StandaloneAppConfig) {
 	const honoApp = new Hono();
-	// Enable CORS for all localhost ports (for web UI on random ports)
+	// Enable CORS for localhost and local network access
 	honoApp.use(
 		'*',
 		cors({
@@ -84,15 +85,16 @@ export function createStandaloneApp(_config?: StandaloneAppConfig) {
 				// Allow all localhost and 127.0.0.1 on any port
 				if (
 					origin.startsWith('http://localhost:') ||
-					origin.startsWith('http://127.0.0.1:')
+					origin.startsWith('http://127.0.0.1:') ||
+					origin.startsWith('https://localhost:') ||
+					origin.startsWith('https://127.0.0.1:')
 				) {
 					return origin;
 				}
-				// Allow common dev ports
-				if (
-					origin === 'http://localhost:5173' ||
-					origin === 'http://localhost:5174'
-				) {
+				// Allow local network IPs (192.168.x.x, 10.x.x.x, 172.16-31.x.x)
+				const localNetworkPattern =
+					/^https?:\/\/(192\.168\.\d{1,3}\.\d{1,3}|10\.\d{1,3}\.\d{1,3}\.\d{1,3}|172\.(1[6-9]|2\d|3[01])\.\d{1,3}\.\d{1,3}):\d+$/;
+				if (localNetworkPattern.test(origin)) {
 					return origin;
 				}
 				// Default to allowing the origin
@@ -148,6 +150,8 @@ export type EmbeddedAppConfig = {
 		model?: string;
 		agent?: string;
 	};
+	/** Additional CORS origins for proxies/Tailscale (e.g., ['https://myapp.ts.net', 'https://example.com']) */
+	corsOrigins?: string[];
 };
 export function createEmbeddedApp(config: EmbeddedAppConfig = {}) {
@@ -160,7 +164,7 @@ export function createEmbeddedApp(config: EmbeddedAppConfig = {}) {
 		await next();
 	});
-	// Enable CORS for all localhost ports (for web UI on random ports)
+	// Enable CORS for localhost and local network access
 	honoApp.use(
 		'*',
 		cors({
@@ -168,15 +172,20 @@ export function createEmbeddedApp(config: EmbeddedAppConfig = {}) {
 				// Allow all localhost and 127.0.0.1 on any port
 				if (
 					origin.startsWith('http://localhost:') ||
-					origin.startsWith('http://127.0.0.1:')
+					origin.startsWith('http://127.0.0.1:') ||
+					origin.startsWith('https://localhost:') ||
+					origin.startsWith('https://127.0.0.1:')
 				) {
 					return origin;
 				}
-				// Allow common dev ports
-				if (
-					origin === 'http://localhost:5173' ||
-					origin === 'http://localhost:5174'
-				) {
+				// Allow local network IPs (192.168.x.x, 10.x.x.x, 172.16-31.x.x)
+				const localNetworkPattern =
+					/^https?:\/\/(192\.168\.\d{1,3}\.\d{1,3}|10\.\d{1,3}\.\d{1,3}\.\d{1,3}|172\.(1[6-9]|2\d|3[01])\.\d{1,3}\.\d{1,3}):\d+$/;
+				if (localNetworkPattern.test(origin)) {
+					return origin;
+				}
+				// Allow custom CORS origins (for Tailscale, proxies, etc.)
+				if (config.corsOrigins?.includes(origin)) {
 					return origin;
 				}
 				// Default to allowing the origin

package/src/runtime/agent-registry.ts CHANGED Viewed

@@ -119,10 +119,10 @@ const defaultToolExtras: Record<string, string[]> = {
 		'tree',
 		'bash',
 		'update_plan',
-		'grep',
+		'glob',
+		'ripgrep',
 		'git_status',
 		'git_diff',
-		'ripgrep',
 		'apply_patch',
 		'websearch',
 	],
@@ -134,6 +134,7 @@ const defaultToolExtras: Record<string, string[]> = {
 		'tree',
 		'bash',
 		'ripgrep',
+		'glob',
 		'websearch',
 		'update_plan',
 	],

package/src/runtime/cache-optimizer.ts ADDED Viewed

@@ -0,0 +1,115 @@
+import type { ModelMessage } from 'ai';
+/**
+ * Adds Anthropic prompt-caching markers to a request's system prompt and history.
+ *
+ * For any provider other than `'anthropic'` the inputs are returned unchanged.
+ * For Anthropic:
+ * - a system prompt longer than 1024 characters is wrapped in a single text
+ *   block tagged `cache_control: { type: 'ephemeral' }`;
+ * - when there are at least 3 messages and at least 2 of them are user
+ *   messages, the last content part of the second-to-last user message (the
+ *   newest user turn before the current one) is tagged through
+ *   `providerOptions.anthropic.cacheControl`. A message whose content is a
+ *   plain string is left untagged.
+ *
+ * The caller's array, messages and content parts are never mutated: the
+ * tagged message and its tagged part are copies.
+ *
+ * @param provider - Provider id; only `'anthropic'` triggers any change.
+ * @param system - System prompt text, if any.
+ * @param messages - Conversation history in chronological order.
+ * @returns The (possibly wrapped) system prompt and the (possibly copied) messages.
+ */
+export function addCacheControl(
+	provider: string,
+	system: string | undefined,
+	messages: ModelMessage[],
+): {
+	system?:
+		| string
+		| Array<{
+				type: 'text';
+				text: string;
+				cache_control?: { type: 'ephemeral' };
+		  }>;
+	messages: ModelMessage[];
+} {
+	type CacheableSystemBlock = {
+		type: 'text';
+		text: string;
+		cache_control?: { type: 'ephemeral' };
+	};
+	// Only Anthropic is handled here; everything else passes through as-is.
+	if (provider !== 'anthropic') {
+		return { system, messages };
+	}
+	// Heuristic threshold measured in characters, not tokens.
+	// NOTE(review): Anthropic documents its minimum cacheable prompt length in
+	// tokens — confirm that 1024 characters is the intended cut-off.
+	let cachedSystem: string | CacheableSystemBlock[] | undefined = system;
+	if (system && system.length > 1024) {
+		cachedSystem = [
+			{
+				type: 'text',
+				text: system,
+				cache_control: { type: 'ephemeral' },
+			},
+		];
+	}
+	// Cache block budget assumed by this code (max 4 blocks in total):
+	// - System message: 1 block
+	// - Tools: up to 2 blocks
+	// - One user message in the history: 1 block
+	if (messages.length < 3) {
+		return { system: cachedSystem, messages };
+	}
+	const cachedMessages = [...messages];
+	const userIndices = cachedMessages
+		.map((m, i) => (m.role === 'user' ? i : -1))
+		.filter((i) => i >= 0);
+	if (userIndices.length >= 2) {
+		// Second-to-last user message: everything up to it is stable history,
+		// whereas the last user message is the turn being answered right now.
+		const targetIndex = userIndices[userIndices.length - 2];
+		const targetMsg = cachedMessages[targetIndex];
+		if (targetMsg && Array.isArray(targetMsg.content)) {
+			const parts = [...targetMsg.content];
+			const lastPart = parts[parts.length - 1];
+			if (lastPart && typeof lastPart === 'object' && 'type' in lastPart) {
+				// Tag a copy of the part so the caller's objects stay untouched.
+				parts[parts.length - 1] = {
+					...lastPart,
+					providerOptions: {
+						anthropic: { cacheControl: { type: 'ephemeral' } },
+					},
+				} as (typeof parts)[number];
+				cachedMessages[targetIndex] = {
+					...targetMsg,
+					content: parts,
+				} as ModelMessage;
+			}
+		}
+	}
+	return { system: cachedSystem, messages: cachedMessages };
+}
+/**
+ * Limits the history to its most recent `maxMessages` entries.
+ *
+ * Only the `messages` array is considered and nothing is pinned: a system
+ * message stored inside the array is dropped like any other old entry.
+ *
+ * @param messages - Conversation history in chronological order.
+ * @param maxMessages - Maximum number of trailing messages to keep (default 20).
+ *   Zero or a negative value yields an empty history.
+ * @returns `messages` itself when it already fits, otherwise a new array
+ *   holding the last `maxMessages` entries.
+ */
+export function truncateHistory(
+	messages: ModelMessage[],
+	maxMessages = 20,
+): ModelMessage[] {
+	if (messages.length <= maxMessages) {
+		return messages;
+	}
+	// `slice(-0)` is `slice(0)` and would return the whole history, and a
+	// negative limit would turn into "drop the first N" — handle both here.
+	if (maxMessages <= 0) {
+		return [];
+	}
+	// Keep the most recent messages
+	return messages.slice(-maxMessages);
+}
+/**
+ * Rough token estimate for a piece of text, assuming about four characters
+ * per token and rounding up.
+ *
+ * @param text - Text to measure.
+ * @returns The estimated token count; 0 for an empty string.
+ */
+export function estimateTokens(text: string): number {
+	const charsPerToken = 4;
+	const charCount = text.length;
+	return Math.ceil(charCount / charsPerToken);
+}
+/**
+ * Renders a tool result as text and cuts it off when it is too long.
+ *
+ * Strings are used as-is; any other value is serialized with
+ * `JSON.stringify`. Values JSON cannot represent (`undefined`, functions,
+ * symbols, circular structures, BigInt) fall back to `String(result)` instead
+ * of throwing.
+ *
+ * @param result - Raw tool result.
+ * @param maxLength - Maximum number of characters kept before the truncation
+ *   notice (default 5000). Negative values are treated as 0.
+ * @returns The full text, or its first `maxLength` characters followed by a
+ *   `[... truncated N characters]` notice.
+ */
+export function summarizeToolResult(result: unknown, maxLength = 5000): string {
+	let str: string;
+	if (typeof result === 'string') {
+		str = result;
+	} else {
+		let json: string | undefined;
+		try {
+			// Returns undefined (not a string) for undefined, functions and symbols.
+			json = JSON.stringify(result);
+		} catch {
+			// Circular references and BigInt make JSON.stringify throw.
+			json = undefined;
+		}
+		str = json ?? String(result);
+	}
+	const limit = Math.max(0, maxLength);
+	if (str.length <= limit) {
+		return str;
+	}
+	// Truncate and add indicator
+	return (
+		str.slice(0, limit) +
+		`\n\n[... truncated ${str.length - limit} characters]`
+	);
+}

package/src/runtime/context-optimizer.ts ADDED Viewed

@@ -0,0 +1,192 @@
+import type { ModelMessage } from 'ai';
+/**
+ * Optimizes message context by deduplicating file reads and pruning old tool results.
+ */
+/**
+ * Position of one file-read tool part inside a message list, together with
+ * the path (or pattern) the tool was called with.
+ */
+interface FileRead {
+	/** Index of the containing message in the messages array. */
+	messageIndex: number;
+	/** Index of the part inside that message's content array. */
+	partIndex: number;
+	/** Value taken from the tool input (its path, file pattern or pattern). */
+	path: string;
+}
+/**
+ * Removes superseded file-read tool parts, keeping only the newest call for
+ * each (tool, target) pair.
+ *
+ * Strategy:
+ * - Walk assistant messages in order and look at parts whose `type` is
+ *   `tool-read`, `tool-grep` or `tool-glob`
+ * - The target is the first non-empty string among `input.path`,
+ *   `input.filePattern` and `input.pattern`
+ * - When the same tool is called again on the same target, the earlier part
+ *   is dropped; "earlier" also covers a part that sits before the newer one
+ *   inside the same message
+ * - Calls made by different tools are never treated as duplicates of each other
+ *
+ * @param messages - Conversation history in chronological order; not mutated.
+ * @returns `messages` itself when nothing is removed, otherwise a new array in
+ *   which assistant messages with array content are shallow copies without
+ *   the superseded parts.
+ */
+export function deduplicateFileReads(messages: ModelMessage[]): ModelMessage[] {
+	const fileReadTools = ['read', 'grep', 'glob'];
+	const toolPrefix = 'tool-';
+	// Newest read seen so far for each "<tool>:<target>" key.
+	const latestByKey = new Map<string, FileRead>();
+	// "<messageIndex>-<partIndex>" of every part that a newer read supersedes.
+	const readsToRemove = new Set<string>();
+	// First pass: visit parts in chronological order, so every later hit for a
+	// key supersedes the one recorded before it.
+	messages.forEach((msg, msgIdx) => {
+		if (msg.role !== 'assistant' || !Array.isArray(msg.content)) return;
+		msg.content.forEach((part, partIdx) => {
+			if (!part || typeof part !== 'object') return;
+			if (!('type' in part)) return;
+			const toolType: unknown = part.type;
+			if (typeof toolType !== 'string' || !toolType.startsWith(toolPrefix)) {
+				return;
+			}
+			const toolName = toolType.slice(toolPrefix.length);
+			if (!fileReadTools.includes(toolName)) return;
+			// Extract the target from the tool input
+			const input = (part as { input?: Record<string, unknown> }).input;
+			if (!input) return;
+			const path = input.path || input.filePattern || input.pattern;
+			if (typeof path !== 'string' || !path) return;
+			const key = `${toolName}:${path}`;
+			const previous = latestByKey.get(key);
+			if (previous) {
+				readsToRemove.add(`${previous.messageIndex}-${previous.partIndex}`);
+			}
+			latestByKey.set(key, { messageIndex: msgIdx, partIndex: partIdx, path });
+		});
+	});
+	if (readsToRemove.size === 0) {
+		return messages;
+	}
+	// Second pass: rebuild messages without the superseded reads
+	return messages.map((msg, msgIdx) => {
+		if (msg.role !== 'assistant' || !Array.isArray(msg.content)) return msg;
+		const filteredContent = msg.content.filter((_part, partIdx) => {
+			const key = `${msgIdx}-${partIdx}`;
+			return !readsToRemove.has(key);
+		});
+		return {
+			...msg,
+			content: filteredContent,
+		};
+	});
+}
+/**
+ * Caps how many tool outputs are kept in the history.
+ *
+ * Looks at assistant messages with array content and collects the parts whose
+ * `type` starts with `tool-` and whose `output` is defined. When more than
+ * `maxToolResults` such parts exist, the output of every one except the last
+ * `maxToolResults` is replaced by the placeholder `'[pruned to save context]'`.
+ * The tool parts themselves and all other parts stay in place.
+ *
+ * @param messages - Conversation history in chronological order; not mutated.
+ * @param maxToolResults - Number of most recent tool outputs to keep (default 30).
+ * @returns `messages` itself when nothing needs pruning, otherwise a new
+ *   array in which assistant messages with array content are shallow copies.
+ */
+export function pruneToolResults(
+	messages: ModelMessage[],
+	maxToolResults = 30,
+): ModelMessage[] {
+	// True for a content part that is a tool part carrying an output.
+	const carriesToolOutput = (part: unknown): boolean => {
+		if (!part || typeof part !== 'object') return false;
+		if (!('type' in part)) return false;
+		const kind = (part as any).type as string;
+		if (!kind.startsWith('tool-')) return false;
+		return (part as any).output !== undefined;
+	};
+	const slotKey = (messageIndex: number, partIndex: number): string =>
+		`${messageIndex}-${partIndex}`;
+	// Positions of every tool output, oldest first.
+	const located: Array<{ messageIndex: number; partIndex: number }> = [];
+	for (const [messageIndex, message] of messages.entries()) {
+		if (message.role !== 'assistant' || !Array.isArray(message.content)) {
+			continue;
+		}
+		for (const [partIndex, part] of message.content.entries()) {
+			if (carriesToolOutput(part)) {
+				located.push({ messageIndex, partIndex });
+			}
+		}
+	}
+	// Within budget: hand back the input untouched.
+	if (located.length <= maxToolResults) {
+		return messages;
+	}
+	// Remember the newest outputs, which keep their content.
+	const retained = new Set<string>();
+	let cursor = located.length - Math.min(maxToolResults, located.length);
+	while (cursor < located.length) {
+		const slot = located[cursor];
+		retained.add(slotKey(slot.messageIndex, slot.partIndex));
+		cursor++;
+	}
+	// Swap every older output for the placeholder; the tool part itself stays.
+	return messages.map((message, messageIndex) => {
+		if (message.role !== 'assistant' || !Array.isArray(message.content)) {
+			return message;
+		}
+		const content = message.content.map((part, partIndex) =>
+			carriesToolOutput(part) &&
+			!retained.has(slotKey(messageIndex, partIndex))
+				? { ...part, output: '[pruned to save context]' }
+				: part,
+		);
+		return { ...message, content };
+	});
+}
+/**
+ * Runs the context optimizations in a fixed order:
+ * 1. file-read deduplication, unless `options.deduplicateFiles` is `false`
+ * 2. tool-result pruning, only when `options.maxToolResults` is provided
+ *
+ * @param messages - Conversation history to optimize.
+ * @param options - Switches for the individual optimizations.
+ * @returns The optimized message list.
+ */
+export function optimizeContext(
+	messages: ModelMessage[],
+	options: {
+		deduplicateFiles?: boolean;
+		maxToolResults?: number;
+	} = {},
+): ModelMessage[] {
+	const { deduplicateFiles, maxToolResults } = options;
+	// Deduplication is on by default; only an explicit `false` turns it off.
+	const deduplicated =
+		deduplicateFiles === false ? messages : deduplicateFileReads(messages);
+	// Pruning is opt-in: without a limit the list is returned as it stands.
+	return maxToolResults === undefined
+		? deduplicated
+		: pruneToolResults(deduplicated, maxToolResults);
+}

package/src/runtime/db-operations.ts CHANGED Viewed

@@ -3,8 +3,96 @@ import { messages, messageParts, sessions } from '@agi-cli/database/schema';
 import { eq } from 'drizzle-orm';
 import type { RunOpts } from './session-queue.ts';
+/**
+ * Token counters reported for a step. Every field is optional; the
+ * incremental updaters in this file fall back to the values already stored
+ * on the message row for fields that are missing.
+ */
+type UsageData = {
+	inputTokens?: number;
+	outputTokens?: number;
+	totalTokens?: number;
+	cachedInputTokens?: number;
+	reasoningTokens?: number;
+};
+/**
+ * Adds one step's token usage to the running totals stored on the session row.
+ *
+ * `usage` is treated as cumulative for the current assistant message, so the
+ * amount added is the difference between `usage` and the counts currently
+ * stored on that message row, never less than zero. Because the delta is
+ * measured against the message row, this has to run before the message row is
+ * overwritten with the same `usage`.
+ *
+ * Does nothing when `usage` is falsy or the session row cannot be found.
+ */
+export async function updateSessionTokensIncremental(
+	usage: UsageData,
+	providerMetadata: Record<string, any> | undefined,
+	opts: RunOpts,
+	db: Awaited<ReturnType<typeof getDb>>,
+) {
+	if (!usage) return;
+	const sessionRows = await db
+		.select()
+		.from(sessions)
+		.where(eq(sessions.id, opts.sessionId));
+	const session = sessionRows[0];
+	if (!session) return;
+	const messageRows = await db
+		.select()
+		.from(messages)
+		.where(eq(messages.id, opts.assistantMessageId));
+	const message = messageRows[0];
+	// Counts already recorded on the message; a missing row counts as zero.
+	const recorded = {
+		prompt: Number(message?.promptTokens ?? 0),
+		completion: Number(message?.completionTokens ?? 0),
+		cached: Number(message?.cachedInputTokens ?? 0),
+		reasoning: Number(message?.reasoningTokens ?? 0),
+	};
+	// A counter the provider did not report keeps its recorded value.
+	const orRecorded = (reported: number | undefined, prior: number): number =>
+		reported != null ? Number(reported) : prior;
+	// Growth since the last step; clamped in case the provider reports less.
+	const growth = (cumulative: number, prior: number): number =>
+		Math.max(0, cumulative - prior);
+	const deltaInput = growth(
+		orRecorded(usage.inputTokens, recorded.prompt),
+		recorded.prompt,
+	);
+	const deltaOutput = growth(
+		orRecorded(usage.outputTokens, recorded.completion),
+		recorded.completion,
+	);
+	const deltaReasoning = growth(
+		orRecorded(usage.reasoningTokens, recorded.reasoning),
+		recorded.reasoning,
+	);
+	// Cached tokens come from `usage` first, then from OpenAI provider metadata.
+	const deltaCached = growth(
+		orRecorded(
+			usage.cachedInputTokens ?? providerMetadata?.openai?.cachedPromptTokens,
+			recorded.cached,
+		),
+		recorded.cached,
+	);
+	await db
+		.update(sessions)
+		.set({
+			totalInputTokens: Number(session.totalInputTokens ?? 0) + deltaInput,
+			totalOutputTokens: Number(session.totalOutputTokens ?? 0) + deltaOutput,
+			totalCachedTokens: Number(session.totalCachedTokens ?? 0) + deltaCached,
+			totalReasoningTokens:
+				Number(session.totalReasoningTokens ?? 0) + deltaReasoning,
+		})
+		.where(eq(sessions.id, opts.sessionId));
+}
 /**
  * Updates session token counts after a run completes.
+ * @deprecated Use updateSessionTokensIncremental for per-step tracking
  */
 export async function updateSessionTokens(
 	fin: { usage?: { inputTokens?: number; outputTokens?: number } },
@@ -36,7 +124,67 @@ export async function updateSessionTokens(
 }
 /**
- * Marks an assistant message as complete with token usage information.
+ * Updates message token counts incrementally after each step.
+ * Note: onStepFinish.usage is CUMULATIVE per message, so we REPLACE values, not add.
+ */
+export async function updateMessageTokensIncremental(
+	usage: UsageData,
+	providerMetadata: Record<string, any> | undefined,
+	opts: RunOpts,
+	db: Awaited<ReturnType<typeof getDb>>,
+) {
+	if (!usage) return;
+	const rows = await db
+		.select()
+		.from(messages)
+		.where(eq(messages.id, opts.assistantMessageId));
+	const row = rows[0];
+	// Nothing to update when the assistant message row does not exist.
+	if (!row) return;
+	// A counter the provider did not report keeps the value already stored.
+	const orStored = (reported: number | undefined, stored: unknown): number =>
+		reported != null ? Number(reported) : Number(stored ?? 0);
+	const promptTokens = orStored(usage.inputTokens, row.promptTokens);
+	const completionTokens = orStored(usage.outputTokens, row.completionTokens);
+	const reasoningTokens = orStored(usage.reasoningTokens, row.reasoningTokens);
+	// Cached tokens come from `usage` first, then from OpenAI provider metadata.
+	const cachedInputTokens = orStored(
+		usage.cachedInputTokens ?? providerMetadata?.openai?.cachedPromptTokens,
+		row.cachedInputTokens,
+	);
+	// Prefer the reported total; otherwise derive it from the parts.
+	const totalTokens =
+		usage.totalTokens != null
+			? Number(usage.totalTokens)
+			: promptTokens + completionTokens + reasoningTokens;
+	// Usage is cumulative per message, so stored values are replaced, not summed.
+	await db
+		.update(messages)
+		.set({
+			promptTokens,
+			completionTokens,
+			totalTokens,
+			cachedInputTokens,
+			reasoningTokens,
+		})
+		.where(eq(messages.id, opts.assistantMessageId));
+}
+/**
+ * Marks an assistant message as complete.
+ * Token usage is tracked incrementally via updateMessageTokensIncremental().
  */
 export async function completeAssistantMessage(
 	fin: {
@@ -49,22 +197,13 @@ export async function completeAssistantMessage(
 	opts: RunOpts,
 	db: Awaited<ReturnType<typeof getDb>>,
 ) {
-	const vals: Record<string, unknown> = {
-		status: 'complete',
-		completedAt: Date.now(),
-	};
-	if (fin.usage) {
-		vals.promptTokens = fin.usage.inputTokens;
-		vals.completionTokens = fin.usage.outputTokens;
-		vals.totalTokens =
-			fin.usage.totalTokens ??
-			(vals.promptTokens as number) + (vals.completionTokens as number);
-	}
+	// Only mark as complete - tokens are already tracked incrementally
 	await db
 		.update(messages)
-		.set(vals)
+		.set({
+			status: 'complete',
+			completedAt: Date.now(),
+		})
 		.where(eq(messages.id, opts.assistantMessageId));
 }

package/src/runtime/runner.ts CHANGED Viewed

@@ -28,6 +28,8 @@ import {
 } from './tool-context-setup.ts';
 import {
 	updateSessionTokens,
+	updateSessionTokensIncremental,
+	updateMessageTokensIncremental,
 	completeAssistantMessage,
 	cleanupEmptyTextParts,
 } from './db-operations.ts';
@@ -180,7 +182,7 @@ async function runAssistant(opts: RunOpts) {
 		opts,
 		db,
 	);
-	const toolset = adaptTools(gated, sharedCtx);
+	const toolset = adaptTools(gated, sharedCtx, opts.provider);
 	const modelTimer = time('runner:resolveModel');
 	const model = await resolveModel(opts.provider, opts.model, cfg);
@@ -229,6 +231,8 @@ async function runAssistant(opts: RunOpts) {
 		updateCurrentPartId,
 		updateAccumulated,
 		incrementStepIndex,
+		updateSessionTokensIncremental,
+		updateMessageTokensIncremental,
 	);
 	const onError = createErrorHandler(opts, db, getStepIndex, sharedCtx);
@@ -239,16 +243,37 @@ async function runAssistant(opts: RunOpts) {
 		opts,
 		db,
 		() => ensureFinishToolCalled(finishObserved, toolset, sharedCtx, stepIndex),
-		updateSessionTokens,
 		completeAssistantMessage,
 	);
+	// Apply optimizations: deduplication, pruning, cache control, and truncation
+	const { addCacheControl, truncateHistory } = await import(
+		'./cache-optimizer.ts'
+	);
+	const { optimizeContext } = await import('./context-optimizer.ts');
+	// 1. Optimize context (deduplicate file reads, prune old tool results)
+	const contextOptimized = optimizeContext(messagesWithSystemInstructions, {
+		deduplicateFiles: true,
+		maxToolResults: 30,
+	});
+	// 2. Truncate history
+	const truncatedMessages = truncateHistory(contextOptimized, 20);
+	// 3. Add cache control
+	const { system: cachedSystem, messages: optimizedMessages } = addCacheControl(
+		opts.provider as any,
+		system,
+		truncatedMessages,
+	);
 	try {
 		const result = streamText({
 			model,
 			tools: toolset,
-			...(String(system || '').trim() ? { system } : {}),
-			messages: messagesWithSystemInstructions,
+			...(cachedSystem ? { system: cachedSystem } : {}),
+			messages: optimizedMessages,
 			...(maxOutputTokens ? { maxOutputTokens } : {}),
 			abortSignal: opts.abortSignal,
 			stopWhen: hasToolCall('finish'),

package/src/runtime/session-manager.ts CHANGED Viewed

@@ -50,6 +50,8 @@ export async function createSession({
 		lastActiveAt: null,
 		totalInputTokens: null,
 		totalOutputTokens: null,
+		totalCachedTokens: null,
+		totalReasoningTokens: null,
 		totalToolTimeMs: null,
 		toolCountsJson: null,
 	};

package/src/runtime/stream-handlers.ts CHANGED Viewed

@@ -9,9 +9,16 @@ import type { RunOpts } from './session-queue.ts';
 import type { ToolAdapterContext } from '../tools/adapter.ts';
 type StepFinishEvent = {
-	usage?: { inputTokens?: number; outputTokens?: number };
+	usage?: {
+		inputTokens?: number;
+		outputTokens?: number;
+		totalTokens?: number;
+		cachedInputTokens?: number;
+		reasoningTokens?: number;
+	};
 	finishReason?: string;
 	response?: unknown;
+	experimental_providerMetadata?: Record<string, any>;
 };
 type FinishEvent = {
@@ -39,6 +46,18 @@ export function createStepFinishHandler(
 	updateCurrentPartId: (id: string) => void,
 	updateAccumulated: (text: string) => void,
 	incrementStepIndex: () => number,
+	updateSessionTokensIncrementalFn: (
+		usage: any,
+		providerMetadata: Record<string, any> | undefined,
+		opts: RunOpts,
+		db: Awaited<ReturnType<typeof getDb>>,
+	) => Promise<void>,
+	updateMessageTokensIncrementalFn: (
+		usage: any,
+		providerMetadata: Record<string, any> | undefined,
+		opts: RunOpts,
+		db: Awaited<ReturnType<typeof getDb>>,
+	) => Promise<void>,
 ) {
 	return async (step: StepFinishEvent) => {
 		const finishedAt = Date.now();
@@ -52,6 +71,27 @@ export function createStepFinishHandler(
 				.where(eq(messageParts.id, currentPartId));
 		} catch {}
+		// Update token counts incrementally after each step
+		if (step.usage) {
+			try {
+				await updateSessionTokensIncrementalFn(
+					step.usage,
+					step.experimental_providerMetadata,
+					opts,
+					db,
+				);
+			} catch {}
+			try {
+				await updateMessageTokensIncrementalFn(
+					step.usage,
+					step.experimental_providerMetadata,
+					opts,
+					db,
+				);
+			} catch {}
+		}
 		try {
 			publish({
 				type: 'finish-step',
@@ -234,11 +274,6 @@ export function createFinishHandler(
 	opts: RunOpts,
 	db: Awaited<ReturnType<typeof getDb>>,
 	ensureFinishToolCalled: () => Promise<void>,
-	updateSessionTokensFn: (
-		fin: FinishEvent,
-		opts: RunOpts,
-		db: Awaited<ReturnType<typeof getDb>>,
-	) => Promise<void>,
 	completeAssistantMessageFn: (
 		fin: FinishEvent,
 		opts: RunOpts,
@@ -250,23 +285,37 @@ export function createFinishHandler(
 			await ensureFinishToolCalled();
 		} catch {}
-		try {
-			await updateSessionTokensFn(fin, opts, db);
-		} catch {}
+		// Note: Token updates are handled incrementally in onStepFinish
+		// Do NOT add fin.usage here as it would cause double-counting
 		try {
 			await completeAssistantMessageFn(fin, opts, db);
 		} catch {}
-		const costUsd = fin.usage
-			? estimateModelCostUsd(opts.provider, opts.model, fin.usage)
+		// Use this message's token totals stored in the DB for accurate cost calculation
+		const sessRows = await db
+			.select()
+			.from(messages)
+			.where(eq(messages.id, opts.assistantMessageId));
+		const usage = sessRows[0]
+			? {
+					inputTokens: Number(sessRows[0].promptTokens ?? 0),
+					outputTokens: Number(sessRows[0].completionTokens ?? 0),
+					totalTokens: Number(sessRows[0].totalTokens ?? 0),
+				}
+			: fin.usage;
+		const costUsd = usage
+			? estimateModelCostUsd(opts.provider, opts.model, usage)
 			: undefined;
 		publish({
 			type: 'message.completed',
 			sessionId: opts.sessionId,
 			payload: {
 				id: opts.assistantMessageId,
-				usage: fin.usage,
+				usage,
 				costUsd,
 				finishReason: fin.finishReason,
 			},

package/src/tools/adapter.ts CHANGED Viewed

@@ -39,15 +39,40 @@ function getPendingQueue(
 	return queue;
 }
-export function adaptTools(tools: DiscoveredTool[], ctx: ToolAdapterContext) {
+export function adaptTools(
+	tools: DiscoveredTool[],
+	ctx: ToolAdapterContext,
+	provider?: string,
+) {
 	const out: Record<string, Tool> = {};
 	const pendingCalls = new Map<string, PendingCallMeta[]>();
 	let firstToolCallReported = false;
+	// Anthropic allows max 4 cache_control blocks
+	// Only these frequently used tools are eligible for caching: read, write, bash, edit
+	const cacheableTools = new Set(['read', 'write', 'bash', 'edit']);
+	let cachedToolCount = 0;
 	for (const { name, tool } of tools) {
 		const base = tool;
+		// Add cache control for Anthropic to cache tool definitions (max 2 tools)
+		const shouldCache =
+			provider === 'anthropic' &&
+			cacheableTools.has(name) &&
+			cachedToolCount < 2;
+		if (shouldCache) {
+			cachedToolCount++;
+		}
+		const providerOptions = shouldCache
+			? { anthropic: { cacheControl: { type: 'ephemeral' as const } } }
+			: undefined;
 		out[name] = {
 			...base,
+			...(providerOptions ? { providerOptions } : {}),
 			async onInputStart(options: unknown) {
 				const queue = getPendingQueue(pendingCalls, name);
 				queue.push({
@@ -185,194 +210,257 @@ export function adaptTools(tools: DiscoveredTool[], ctx: ToolAdapterContext) {
 				const callIdFromQueue = meta?.callId;
 				const startTsFromQueue = meta?.startTs;
 				const stepIndexForEvent = meta?.stepIndex ?? ctx.stepIndex;
-				// Handle session-relative paths and cwd tools
-				let res: ToolExecuteReturn | { cwd: string } | null | undefined;
-				const cwd = getCwd(ctx.sessionId);
-				if (name === 'pwd') {
-					res = { cwd };
-				} else if (name === 'cd') {
-					const next = joinRelative(
-						cwd,
-						String((input as Record<string, unknown>)?.path ?? '.'),
-					);
-					setCwd(ctx.sessionId, next);
-					res = { cwd: next };
-				} else if (
-					['read', 'write', 'ls', 'tree'].includes(name) &&
-					typeof (input as Record<string, unknown>)?.path === 'string'
-				) {
-					const rel = joinRelative(
-						cwd,
-						String((input as Record<string, unknown>).path),
-					);
-					const nextInput = {
-						...(input as Record<string, unknown>),
-						path: rel,
-					} as ToolExecuteInput;
-					// biome-ignore lint/suspicious/noExplicitAny: AI SDK types are complex
-					res = base.execute?.(nextInput, options as any);
-				} else if (name === 'bash') {
-					const needsCwd =
-						!input ||
-						typeof (input as Record<string, unknown>).cwd !== 'string';
-					const nextInput = needsCwd
-						? ({
-								...(input as Record<string, unknown>),
-								cwd,
-							} as ToolExecuteInput)
-						: input;
-					// biome-ignore lint/suspicious/noExplicitAny: AI SDK types are complex
-					res = base.execute?.(nextInput, options as any);
-				} else {
-					// biome-ignore lint/suspicious/noExplicitAny: AI SDK types are complex
-					res = base.execute?.(input, options as any);
-				}
-				let result: unknown = res;
-				// If tool returns an async iterable, stream deltas while accumulating
-				if (res && typeof res === 'object' && Symbol.asyncIterator in res) {
-					const chunks: unknown[] = [];
-					for await (const chunk of res as AsyncIterable<unknown>) {
-						chunks.push(chunk);
+				try {
+					// Handle session-relative paths and cwd tools
+					let res: ToolExecuteReturn | { cwd: string } | null | undefined;
+					const cwd = getCwd(ctx.sessionId);
+					if (name === 'pwd') {
+						res = { cwd };
+					} else if (name === 'cd') {
+						const next = joinRelative(
+							cwd,
+							String((input as Record<string, unknown>)?.path ?? '.'),
+						);
+						setCwd(ctx.sessionId, next);
+						res = { cwd: next };
+					} else if (
+						['read', 'write', 'ls', 'tree'].includes(name) &&
+						typeof (input as Record<string, unknown>)?.path === 'string'
+					) {
+						const rel = joinRelative(
+							cwd,
+							String((input as Record<string, unknown>).path),
+						);
+						const nextInput = {
+							...(input as Record<string, unknown>),
+							path: rel,
+						} as ToolExecuteInput;
+						// biome-ignore lint/suspicious/noExplicitAny: AI SDK types are complex
+						res = base.execute?.(nextInput, options as any);
+					} else if (name === 'bash') {
+						const needsCwd =
+							!input ||
+							typeof (input as Record<string, unknown>).cwd !== 'string';
+						const nextInput = needsCwd
+							? ({
+									...(input as Record<string, unknown>),
+									cwd,
+								} as ToolExecuteInput)
+							: input;
+						// biome-ignore lint/suspicious/noExplicitAny: AI SDK types are complex
+						res = base.execute?.(nextInput, options as any);
+					} else {
+						// biome-ignore lint/suspicious/noExplicitAny: AI SDK types are complex
+						res = base.execute?.(input, options as any);
+					}
+					let result: unknown = res;
+					// If tool returns an async iterable, stream deltas while accumulating
+					if (res && typeof res === 'object' && Symbol.asyncIterator in res) {
+						const chunks: unknown[] = [];
+						for await (const chunk of res as AsyncIterable<unknown>) {
+							chunks.push(chunk);
+							publish({
+								type: 'tool.delta',
+								sessionId: ctx.sessionId,
+								payload: {
+									name,
+									channel: 'output',
+									delta: chunk,
+									stepIndex: stepIndexForEvent,
+									callId: callIdFromQueue,
+								},
+							});
+						}
+						// Use the last chunk as the result if any chunk was produced, otherwise null
+						result = chunks.length > 0 ? chunks[chunks.length - 1] : null;
+					} else {
+						// Await promise or passthrough value
+						result = await Promise.resolve(res as ToolExecuteReturn);
+					}
+					const resultPartId = crypto.randomUUID();
+					const callId = callIdFromQueue;
+					const startTs = startTsFromQueue;
+					const contentObj: {
+						name: string;
+						result: unknown;
+						callId?: string;
+						artifact?: unknown;
+						args?: unknown;
+					} = {
+						name,
+						result,
+						callId,
+					};
+					if (meta?.args !== undefined) {
+						contentObj.args = meta.args;
+					}
+					if (result && typeof result === 'object' && 'artifact' in result) {
+						try {
+							const maybeArtifact = (result as { artifact?: unknown }).artifact;
+							if (maybeArtifact !== undefined)
+								contentObj.artifact = maybeArtifact;
+						} catch {}
+					}
+					const index = await ctx.nextIndex();
+					const endTs = Date.now();
+					const dur =
+						typeof startTs === 'number' ? Math.max(0, endTs - startTs) : null;
+					// Special-case: keep progress_update result lightweight; publish first, persist best-effort
+					if (name === 'progress_update') {
 						publish({
-							type: 'tool.delta',
+							type: 'tool.result',
 							sessionId: ctx.sessionId,
-							payload: {
-								name,
-								channel: 'output',
-								delta: chunk,
-								stepIndex: stepIndexForEvent,
-								callId: callIdFromQueue,
-							},
+							payload: { ...contentObj, stepIndex: stepIndexForEvent },
 						});
+						// Persist in the background (fire-and-forget) so the result is returned without waiting on the DB write
+						(async () => {
+							try {
+								await ctx.db.insert(messageParts).values({
+									id: resultPartId,
+									messageId: ctx.messageId,
+									index,
+									stepIndex: stepIndexForEvent,
+									type: 'tool_result',
+									content: JSON.stringify(contentObj),
+									agent: ctx.agent,
+									provider: ctx.provider,
+									model: ctx.model,
+									startedAt: startTs,
+									completedAt: endTs,
+									toolName: name,
+									toolCallId: callId,
+									toolDurationMs: dur ?? undefined,
+								});
+							} catch {}
+						})();
+						return result as ToolExecuteReturn;
 					}
-					// Prefer the last chunk as the result if present, otherwise the entire array
-					result = chunks.length > 0 ? chunks[chunks.length - 1] : null;
-				} else {
-					// Await promise or passthrough value
-					result = await Promise.resolve(res as ToolExecuteReturn);
-				}
-				const resultPartId = crypto.randomUUID();
-				const callId = callIdFromQueue;
-				const startTs = startTsFromQueue;
-				const contentObj: {
-					name: string;
-					result: unknown;
-					callId?: string;
-					artifact?: unknown;
-					args?: unknown;
-				} = {
-					name,
-					result,
-					callId,
-				};
-				if (meta?.args !== undefined) {
-					contentObj.args = meta.args;
-				}
-				if (result && typeof result === 'object' && 'artifact' in result) {
+					await ctx.db.insert(messageParts).values({
+						id: resultPartId,
+						messageId: ctx.messageId,
+						index,
+						stepIndex: stepIndexForEvent,
+						type: 'tool_result',
+						content: JSON.stringify(contentObj),
+						agent: ctx.agent,
+						provider: ctx.provider,
+						model: ctx.model,
+						startedAt: startTs,
+						completedAt: endTs,
+						toolName: name,
+						toolCallId: callId,
+						toolDurationMs: dur ?? undefined,
+					});
+					// Update session aggregates: total tool time and counts per tool
 					try {
-						const maybeArtifact = (result as { artifact?: unknown }).artifact;
-						if (maybeArtifact !== undefined)
-							contentObj.artifact = maybeArtifact;
+						const sessRows = await ctx.db
+							.select()
+							.from(sessions)
+							.where(eq(sessions.id, ctx.sessionId));
+						if (sessRows.length) {
+							const row = sessRows[0] as typeof sessions.$inferSelect;
+							const totalToolTimeMs =
+								Number(row.totalToolTimeMs || 0) + (dur ?? 0);
+							let counts: Record<string, number> = {};
+							try {
+								counts = row.toolCountsJson
+									? JSON.parse(row.toolCountsJson)
+									: {};
+							} catch {}
+							counts[name] = (counts[name] || 0) + 1;
+							await ctx.db
+								.update(sessions)
+								.set({
+									totalToolTimeMs,
+									toolCountsJson: JSON.stringify(counts),
+									lastActiveAt: endTs,
+								})
+								.where(eq(sessions.id, ctx.sessionId));
+						}
 					} catch {}
-				}
-				const index = await ctx.nextIndex();
-				const endTs = Date.now();
-				const dur =
-					typeof startTs === 'number' ? Math.max(0, endTs - startTs) : null;
-				// Special-case: keep progress_update result lightweight; publish first, persist best-effort
-				if (name === 'progress_update') {
 					publish({
 						type: 'tool.result',
 						sessionId: ctx.sessionId,
 						payload: { ...contentObj, stepIndex: stepIndexForEvent },
 					});
-					// Persist without blocking the event loop
-					(async () => {
+					if (name === 'update_plan') {
 						try {
-							await ctx.db.insert(messageParts).values({
-								id: resultPartId,
-								messageId: ctx.messageId,
-								index,
-								stepIndex: stepIndexForEvent,
-								type: 'tool_result',
-								content: JSON.stringify(contentObj),
-								agent: ctx.agent,
-								provider: ctx.provider,
-								model: ctx.model,
-								startedAt: startTs,
-								completedAt: endTs,
-								toolName: name,
-								toolCallId: callId,
-								toolDurationMs: dur ?? undefined,
-							});
+							const result = (contentObj as { result?: unknown }).result as
+								| { items?: unknown; note?: unknown }
+								| undefined;
+							if (result && Array.isArray(result.items)) {
+								publish({
+									type: 'plan.updated',
+									sessionId: ctx.sessionId,
+									payload: { items: result.items, note: result.note },
+								});
+							}
 						} catch {}
-					})();
-					return result as ToolExecuteReturn;
-				}
+					}
+					return result;
+				} catch (error) {
+					// Tool execution failed - save error to database as tool_result
+					const resultPartId = crypto.randomUUID();
+					const callId = callIdFromQueue;
+					const startTs = startTsFromQueue;
+					const endTs = Date.now();
+					const dur =
+						typeof startTs === 'number' ? Math.max(0, endTs - startTs) : null;
-				await ctx.db.insert(messageParts).values({
-					id: resultPartId,
-					messageId: ctx.messageId,
-					index,
-					stepIndex: stepIndexForEvent,
-					type: 'tool_result',
-					content: JSON.stringify(contentObj),
-					agent: ctx.agent,
-					provider: ctx.provider,
-					model: ctx.model,
-					startedAt: startTs,
-					completedAt: endTs,
-					toolName: name,
-					toolCallId: callId,
-					toolDurationMs: dur ?? undefined,
-				});
-				// Update session aggregates: total tool time and counts per tool
-				try {
-					const sessRows = await ctx.db
-						.select()
-						.from(sessions)
-						.where(eq(sessions.id, ctx.sessionId));
-					if (sessRows.length) {
-						const row = sessRows[0] as typeof sessions.$inferSelect;
-						const totalToolTimeMs =
-							Number(row.totalToolTimeMs || 0) + (dur ?? 0);
-						let counts: Record<string, number> = {};
-						try {
-							counts = row.toolCountsJson ? JSON.parse(row.toolCountsJson) : {};
-						} catch {}
-						counts[name] = (counts[name] || 0) + 1;
-						await ctx.db
-							.update(sessions)
-							.set({
-								totalToolTimeMs,
-								toolCountsJson: JSON.stringify(counts),
-								lastActiveAt: endTs,
-							})
-							.where(eq(sessions.id, ctx.sessionId));
+					const errorMessage =
+						error instanceof Error ? error.message : String(error);
+					const errorStack = error instanceof Error ? error.stack : undefined;
+					const errorResult = {
+						ok: false,
+						error: errorMessage,
+						stack: errorStack,
+					};
+					const contentObj = {
+						name,
+						result: errorResult,
+						callId,
+					};
+					if (meta?.args !== undefined) {
+						contentObj.args = meta.args;
 					}
-				} catch {}
-				publish({
-					type: 'tool.result',
-					sessionId: ctx.sessionId,
-					payload: { ...contentObj, stepIndex: stepIndexForEvent },
-				});
-				if (name === 'update_plan') {
-					try {
-						const result = (contentObj as { result?: unknown }).result as
-							| { items?: unknown; note?: unknown }
-							| undefined;
-						if (result && Array.isArray(result.items)) {
-							publish({
-								type: 'plan.updated',
-								sessionId: ctx.sessionId,
-								payload: { items: result.items, note: result.note },
-							});
-						}
-					} catch {}
+					const index = await ctx.nextIndex();
+					// Save error result to database
+					await ctx.db.insert(messageParts).values({
+						id: resultPartId,
+						messageId: ctx.messageId,
+						index,
+						stepIndex: stepIndexForEvent,
+						type: 'tool_result',
+						content: JSON.stringify(contentObj),
+						agent: ctx.agent,
+						provider: ctx.provider,
+						model: ctx.model,
+						startedAt: startTs,
+						completedAt: endTs,
+						toolName: name,
+						toolCallId: callId,
+						toolDurationMs: dur ?? undefined,
+					});
+					// Publish error result
+					publish({
+						type: 'tool.result',
+						sessionId: ctx.sessionId,
+						payload: { ...contentObj, stepIndex: stepIndexForEvent },
+					});
+					// Re-throw so AI SDK can handle it
+					throw error;
 				}
-				return result;
 			},
 		} as Tool;
 	}