@agi-cli/server 0.1.106 → 0.1.107

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@agi-cli/server",
3
- "version": "0.1.106",
3
+ "version": "0.1.107",
4
4
  "description": "HTTP API server for AGI CLI",
5
5
  "type": "module",
6
6
  "main": "./src/index.ts",
@@ -29,8 +29,8 @@
29
29
  "typecheck": "tsc --noEmit"
30
30
  },
31
31
  "dependencies": {
32
- "@agi-cli/sdk": "0.1.106",
33
- "@agi-cli/database": "0.1.106",
32
+ "@agi-cli/sdk": "0.1.107",
33
+ "@agi-cli/database": "0.1.107",
34
34
  "drizzle-orm": "^0.44.5",
35
35
  "hono": "^4.9.9",
36
36
  "zod": "^4.1.8"
@@ -76,15 +76,14 @@ export async function updateSessionTokensIncremental(
76
76
  : priorCachedMsg;
77
77
 
78
78
  // Compute deltas for this step; clamp to 0 in case provider reports smaller values
79
- // Cached tokens reduce the billable input, so we subtract them from the delta
80
79
  const deltaInput = Math.max(0, cumPrompt - priorPromptMsg);
81
80
  const deltaOutput = Math.max(0, cumCompletion - priorCompletionMsg);
82
81
  const deltaCached = Math.max(0, cumCached - priorCachedMsg);
83
82
  const deltaReasoning = Math.max(0, cumReasoning - priorReasoningMsg);
84
83
 
85
- // Session input should only count non-cached tokens
86
- // Total cached tokens are tracked separately for reference
87
- const nextInputSess = priorInputSess + deltaInput - deltaCached;
84
+ // Note: AI SDK's inputTokens already excludes cached tokens for Anthropic,
85
+ // so we don't need to subtract deltaCached here. Just accumulate directly.
86
+ const nextInputSess = priorInputSess + deltaInput;
88
87
  const nextOutputSess = priorOutputSess + deltaOutput;
89
88
  const nextCachedSess = priorCachedSess + deltaCached;
90
89
  const nextReasoningSess = priorReasoningSess + deltaReasoning;
@@ -7,7 +7,7 @@ import {
7
7
  setAuth,
8
8
  } from '@agi-cli/sdk';
9
9
  import { openai, createOpenAI } from '@ai-sdk/openai';
10
- import { anthropic, createAnthropic } from '@ai-sdk/anthropic';
10
+ import { createAnthropic } from '@ai-sdk/anthropic';
11
11
  import { google, createGoogleGenerativeAI } from '@ai-sdk/google';
12
12
  import { createOpenRouter } from '@openrouter/ai-sdk-provider';
13
13
  import { toClaudeCodeName } from './tool-mapping.ts';
@@ -170,7 +170,7 @@ async function getAnthropicInstance(cfg: AGIConfig) {
170
170
  url += url.includes('?') ? '&beta=true' : '?beta=true';
171
171
  }
172
172
 
173
- // Transform request body: tool names to PascalCase
173
+ // Transform request body: tool names to PascalCase + apply caching
174
174
  let body = init?.body;
175
175
  if (body && typeof body === 'string') {
176
176
  try {
@@ -186,36 +186,117 @@ async function getAnthropicInstance(cfg: AGIConfig) {
186
186
  );
187
187
  }
188
188
 
189
- // Transform tool names in messages (for tool_use blocks in history)
189
+ // Apply ephemeral caching (max 4 cache breakpoints total)
190
+ // Adapter adds 2 tool cache blocks, so we can add 2 more:
191
+ // - 1 system block (the first one with tools description)
192
+ // - 1 message block (the last user message)
193
+ const MAX_SYSTEM_CACHE = 1;
194
+ const MAX_MESSAGE_CACHE = 1;
195
+ let systemCacheUsed = 0;
196
+ let messageCacheUsed = 0;
197
+
198
+ // Cache first system message only (contains agent instructions)
199
+ if (parsed.system && Array.isArray(parsed.system)) {
200
+ parsed.system = parsed.system.map(
201
+ (
202
+ block: { type: string; cache_control?: unknown },
203
+ index: number,
204
+ ) => {
205
+ if (block.cache_control) return block;
206
+ if (
207
+ systemCacheUsed < MAX_SYSTEM_CACHE &&
208
+ index === 0 &&
209
+ block.type === 'text'
210
+ ) {
211
+ systemCacheUsed++;
212
+ return { ...block, cache_control: { type: 'ephemeral' } };
213
+ }
214
+ return block;
215
+ },
216
+ );
217
+ }
218
+
219
+ // Transform tool names in messages and apply caching to last message only
190
220
  if (parsed.messages && Array.isArray(parsed.messages)) {
221
+ const messageCount = parsed.messages.length;
222
+
191
223
  parsed.messages = parsed.messages.map(
192
- (msg: {
193
- role: string;
194
- content: unknown;
195
- [key: string]: unknown;
196
- }) => {
224
+ (
225
+ msg: {
226
+ role: string;
227
+ content: unknown;
228
+ [key: string]: unknown;
229
+ },
230
+ msgIndex: number,
231
+ ) => {
232
+ // Only cache the very last message
233
+ const isLast = msgIndex === messageCount - 1;
234
+
197
235
  if (Array.isArray(msg.content)) {
236
+ const content = msg.content.map(
237
+ (
238
+ block: {
239
+ type: string;
240
+ name?: string;
241
+ cache_control?: unknown;
242
+ },
243
+ blockIndex: number,
244
+ ) => {
245
+ let transformedBlock = block;
246
+
247
+ // Transform tool names
248
+ if (block.type === 'tool_use' && block.name) {
249
+ transformedBlock = {
250
+ ...block,
251
+ name: toClaudeCodeName(block.name),
252
+ };
253
+ }
254
+ if (block.type === 'tool_result' && block.name) {
255
+ transformedBlock = {
256
+ ...block,
257
+ name: toClaudeCodeName(block.name),
258
+ };
259
+ }
260
+
261
+ // Add cache_control to last block of last message
262
+ if (
263
+ isLast &&
264
+ !transformedBlock.cache_control &&
265
+ messageCacheUsed < MAX_MESSAGE_CACHE &&
266
+ blockIndex === (msg.content as unknown[]).length - 1
267
+ ) {
268
+ messageCacheUsed++;
269
+ return {
270
+ ...transformedBlock,
271
+ cache_control: { type: 'ephemeral' },
272
+ };
273
+ }
274
+
275
+ return transformedBlock;
276
+ },
277
+ );
278
+ return { ...msg, content };
279
+ }
280
+
281
+ // For string content, wrap in array with cache_control if last message
282
+ if (
283
+ isLast &&
284
+ messageCacheUsed < MAX_MESSAGE_CACHE &&
285
+ typeof msg.content === 'string'
286
+ ) {
287
+ messageCacheUsed++;
198
288
  return {
199
289
  ...msg,
200
- content: msg.content.map(
201
- (block: { type: string; name?: string }) => {
202
- if (block.type === 'tool_use' && block.name) {
203
- return {
204
- ...block,
205
- name: toClaudeCodeName(block.name),
206
- };
207
- }
208
- if (block.type === 'tool_result' && block.name) {
209
- return {
210
- ...block,
211
- name: toClaudeCodeName(block.name),
212
- };
213
- }
214
- return block;
290
+ content: [
291
+ {
292
+ type: 'text',
293
+ text: msg.content,
294
+ cache_control: { type: 'ephemeral' },
215
295
  },
216
- ),
296
+ ],
217
297
  };
218
298
  }
299
+
219
300
  return msg;
220
301
  },
221
302
  );
@@ -239,7 +320,116 @@ async function getAnthropicInstance(cfg: AGIConfig) {
239
320
  });
240
321
  }
241
322
 
242
- return anthropic;
323
+ // For API key auth, also apply caching via customFetch
324
+ // This optimizes token usage even without OAuth
325
+ const customFetch = async (
326
+ input: string | URL | Request,
327
+ init?: RequestInit,
328
+ ) => {
329
+ let body = init?.body;
330
+ if (body && typeof body === 'string') {
331
+ try {
332
+ const parsed = JSON.parse(body);
333
+
334
+ // Apply ephemeral caching (max 4 cache breakpoints total)
335
+ // Adapter adds 2 tool cache blocks, so we can add 2 more:
336
+ // - 1 system block + 1 message block = 2
337
+ const MAX_SYSTEM_CACHE = 1;
338
+ const MAX_MESSAGE_CACHE = 1;
339
+ let systemCacheUsed = 0;
340
+ let messageCacheUsed = 0;
341
+
342
+ // Cache first system message
343
+ if (parsed.system && Array.isArray(parsed.system)) {
344
+ parsed.system = parsed.system.map(
345
+ (
346
+ block: { type: string; cache_control?: unknown },
347
+ index: number,
348
+ ) => {
349
+ if (block.cache_control) return block;
350
+ if (
351
+ systemCacheUsed < MAX_SYSTEM_CACHE &&
352
+ index === 0 &&
353
+ block.type === 'text'
354
+ ) {
355
+ systemCacheUsed++;
356
+ return { ...block, cache_control: { type: 'ephemeral' } };
357
+ }
358
+ return block;
359
+ },
360
+ );
361
+ }
362
+
363
+ // Cache last message only
364
+ if (parsed.messages && Array.isArray(parsed.messages)) {
365
+ const messageCount = parsed.messages.length;
366
+ parsed.messages = parsed.messages.map(
367
+ (
368
+ msg: {
369
+ role: string;
370
+ content: unknown;
371
+ [key: string]: unknown;
372
+ },
373
+ msgIndex: number,
374
+ ) => {
375
+ const isLast = msgIndex === messageCount - 1;
376
+
377
+ if (Array.isArray(msg.content)) {
378
+ const blocks = msg.content as {
379
+ type: string;
380
+ cache_control?: unknown;
381
+ }[];
382
+ const content = blocks.map((block, blockIndex) => {
383
+ if (block.cache_control) return block;
384
+ if (
385
+ isLast &&
386
+ messageCacheUsed < MAX_MESSAGE_CACHE &&
387
+ blockIndex === blocks.length - 1
388
+ ) {
389
+ messageCacheUsed++;
390
+ return { ...block, cache_control: { type: 'ephemeral' } };
391
+ }
392
+ return block;
393
+ });
394
+ return { ...msg, content };
395
+ }
396
+
397
+ if (
398
+ isLast &&
399
+ messageCacheUsed < MAX_MESSAGE_CACHE &&
400
+ typeof msg.content === 'string'
401
+ ) {
402
+ messageCacheUsed++;
403
+ return {
404
+ ...msg,
405
+ content: [
406
+ {
407
+ type: 'text',
408
+ text: msg.content,
409
+ cache_control: { type: 'ephemeral' },
410
+ },
411
+ ],
412
+ };
413
+ }
414
+
415
+ return msg;
416
+ },
417
+ );
418
+ }
419
+
420
+ body = JSON.stringify(parsed);
421
+ } catch {
422
+ // If parsing fails, send as-is
423
+ }
424
+ }
425
+
426
+ const url = typeof input === 'string' ? input : input.toString();
427
+ return fetch(url, { ...init, body });
428
+ };
429
+
430
+ return createAnthropic({
431
+ fetch: customFetch as typeof fetch,
432
+ });
243
433
  }
244
434
 
245
435
  export async function resolveModel(