@kenkaiiii/gg-ai 4.11.2 → 4.12.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.cts CHANGED
@@ -255,6 +255,18 @@ declare class StreamResult implements AsyncIterable<StreamEvent> {
255
255
  private resolveResponse;
256
256
  private rejectResponse;
257
257
  private resolveWait;
258
+ /**
259
+ * High-water mark: when the buffer exceeds this many unconsumed events,
260
+ * the pump pauses until the consumer drains below the low-water mark.
261
+ * Prevents unbounded memory growth when a consumer is slow.
262
+ * Only active when someone IS iterating — if nobody iterates (the `then()`
263
+ * path), backpressure is skipped so the pump can complete and resolve.
264
+ */
265
+ private static readonly HIGH_WATER;
266
+ private static readonly LOW_WATER;
267
+ private iterating;
268
+ private paused;
269
+ private resolveDrain;
258
270
  constructor(generator: AsyncGenerator<StreamEvent, StreamResponse>, signal?: AbortSignal);
259
271
  private pump;
260
272
  private _nextWithAbort;
@@ -451,6 +463,28 @@ declare function toOpenAIMessages(messages: Message[], options?: {
451
463
  supportsImages?: boolean;
452
464
  }): OpenAI.ChatCompletionMessageParam[];
453
465
 
466
+ /**
467
+ * Fire a minimal `max_tokens: 1` request that populates the Anthropic prompt
468
+ * cache with the system prompt + tools prefix, so the first real user turn is
469
+ * a cache read instead of a cold cache write. Best-effort: any error is
470
+ * swallowed so a failed pre-warm never blocks the session.
471
+ *
472
+ * Called by AgentSession when speedProfile is "optimized", before the first
473
+ * real agent-loop turn. The cache TTL follows the `cacheRetention` option —
474
+ * pass "long" (1 h) so the pre-warm survives until the user's first message.
475
+ */
476
+ declare function prewarmAnthropicCache(options: {
477
+ apiKey: string;
478
+ model: string;
479
+ system: string;
480
+ tools?: StreamOptions["tools"];
481
+ serverTools?: StreamOptions["serverTools"];
482
+ baseUrl?: string;
483
+ userAgent?: string;
484
+ cacheRetention?: StreamOptions["cacheRetention"];
485
+ signal?: AbortSignal;
486
+ }): Promise<void>;
487
+
454
488
  interface PalsuProviderState {
455
489
  callCount: number;
456
490
  }
@@ -520,4 +554,4 @@ interface PalsuProviderConfig {
520
554
  */
521
555
  declare function registerPalsuProvider(config?: PalsuProviderConfig): PalsuProviderHandle;
522
556
 
523
- export { type AssistantMessage, type CacheRetention, type ContentPart, type DoneEvent, type ErrorEvent, type ErrorSource, EventStream, type FormattedError, GGAIError, type ImageContent, type Message, type PalsuModelConfig, type PalsuModelHandle, type PalsuProviderConfig, type PalsuProviderHandle, type PalsuProviderState, type PalsuResponse, type PalsuResponseFactory, type Provider, type ProviderDiagnosticFn, type ProviderEntry, ProviderError, type ProviderStreamFn, type RawContent, type ServerToolCall, type ServerToolCallEvent, type ServerToolDefinition, type ServerToolResult, type ServerToolResultEvent, type StopReason, type StreamEvent, type StreamOptions, type StreamResponse, StreamResult, type SystemMessage, type TextContent, type TextDeltaEvent, type ThinkingContent, type ThinkingDeltaEvent, type ThinkingLevel, type Tool, type ToolCall, type ToolCallDeltaEvent, type ToolCallDoneEvent, type ToolChoice, type ToolResult, type ToolResultContent, type ToolResultMessage, type Usage, type UserMessage, type VideoContent, classifyProviderError, formatError, formatErrorForDisplay, isHardBillingMessage, isUsageLimitError, palsuAssistantMessage, palsuText, palsuThinking, palsuToolCall, providerRegistry, registerPalsuProvider, setProviderDiagnostic, stream, toAnthropicMessages, toOpenAIMessages };
557
+ export { type AssistantMessage, type CacheRetention, type ContentPart, type DoneEvent, type ErrorEvent, type ErrorSource, EventStream, type FormattedError, GGAIError, type ImageContent, type Message, type PalsuModelConfig, type PalsuModelHandle, type PalsuProviderConfig, type PalsuProviderHandle, type PalsuProviderState, type PalsuResponse, type PalsuResponseFactory, type Provider, type ProviderDiagnosticFn, type ProviderEntry, ProviderError, type ProviderStreamFn, type RawContent, type ServerToolCall, type ServerToolCallEvent, type ServerToolDefinition, type ServerToolResult, type ServerToolResultEvent, type StopReason, type StreamEvent, type StreamOptions, type StreamResponse, StreamResult, type SystemMessage, type TextContent, type TextDeltaEvent, type ThinkingContent, type ThinkingDeltaEvent, type ThinkingLevel, type Tool, type ToolCall, type ToolCallDeltaEvent, type ToolCallDoneEvent, type ToolChoice, type ToolResult, type ToolResultContent, type ToolResultMessage, type Usage, type UserMessage, type VideoContent, classifyProviderError, formatError, formatErrorForDisplay, isHardBillingMessage, isUsageLimitError, palsuAssistantMessage, palsuText, palsuThinking, palsuToolCall, prewarmAnthropicCache, providerRegistry, registerPalsuProvider, setProviderDiagnostic, stream, toAnthropicMessages, toOpenAIMessages };
package/dist/index.d.ts CHANGED
@@ -255,6 +255,18 @@ declare class StreamResult implements AsyncIterable<StreamEvent> {
255
255
  private resolveResponse;
256
256
  private rejectResponse;
257
257
  private resolveWait;
258
+ /**
259
+ * High-water mark: when the buffer exceeds this many unconsumed events,
260
+ * the pump pauses until the consumer drains below the low-water mark.
261
+ * Prevents unbounded memory growth when a consumer is slow.
262
+ * Only active when someone IS iterating — if nobody iterates (the `then()`
263
+ * path), backpressure is skipped so the pump can complete and resolve.
264
+ */
265
+ private static readonly HIGH_WATER;
266
+ private static readonly LOW_WATER;
267
+ private iterating;
268
+ private paused;
269
+ private resolveDrain;
258
270
  constructor(generator: AsyncGenerator<StreamEvent, StreamResponse>, signal?: AbortSignal);
259
271
  private pump;
260
272
  private _nextWithAbort;
@@ -451,6 +463,28 @@ declare function toOpenAIMessages(messages: Message[], options?: {
451
463
  supportsImages?: boolean;
452
464
  }): OpenAI.ChatCompletionMessageParam[];
453
465
 
466
+ /**
467
+ * Fire a minimal `max_tokens: 1` request that populates the Anthropic prompt
468
+ * cache with the system prompt + tools prefix, so the first real user turn is
469
+ * a cache read instead of a cold cache write. Best-effort: any error is
470
+ * swallowed so a failed pre-warm never blocks the session.
471
+ *
472
+ * Called by AgentSession when speedProfile is "optimized", before the first
473
+ * real agent-loop turn. The cache TTL follows the `cacheRetention` option —
474
+ * pass "long" (1 h) so the pre-warm survives until the user's first message.
475
+ */
476
+ declare function prewarmAnthropicCache(options: {
477
+ apiKey: string;
478
+ model: string;
479
+ system: string;
480
+ tools?: StreamOptions["tools"];
481
+ serverTools?: StreamOptions["serverTools"];
482
+ baseUrl?: string;
483
+ userAgent?: string;
484
+ cacheRetention?: StreamOptions["cacheRetention"];
485
+ signal?: AbortSignal;
486
+ }): Promise<void>;
487
+
454
488
  interface PalsuProviderState {
455
489
  callCount: number;
456
490
  }
@@ -520,4 +554,4 @@ interface PalsuProviderConfig {
520
554
  */
521
555
  declare function registerPalsuProvider(config?: PalsuProviderConfig): PalsuProviderHandle;
522
556
 
523
- export { type AssistantMessage, type CacheRetention, type ContentPart, type DoneEvent, type ErrorEvent, type ErrorSource, EventStream, type FormattedError, GGAIError, type ImageContent, type Message, type PalsuModelConfig, type PalsuModelHandle, type PalsuProviderConfig, type PalsuProviderHandle, type PalsuProviderState, type PalsuResponse, type PalsuResponseFactory, type Provider, type ProviderDiagnosticFn, type ProviderEntry, ProviderError, type ProviderStreamFn, type RawContent, type ServerToolCall, type ServerToolCallEvent, type ServerToolDefinition, type ServerToolResult, type ServerToolResultEvent, type StopReason, type StreamEvent, type StreamOptions, type StreamResponse, StreamResult, type SystemMessage, type TextContent, type TextDeltaEvent, type ThinkingContent, type ThinkingDeltaEvent, type ThinkingLevel, type Tool, type ToolCall, type ToolCallDeltaEvent, type ToolCallDoneEvent, type ToolChoice, type ToolResult, type ToolResultContent, type ToolResultMessage, type Usage, type UserMessage, type VideoContent, classifyProviderError, formatError, formatErrorForDisplay, isHardBillingMessage, isUsageLimitError, palsuAssistantMessage, palsuText, palsuThinking, palsuToolCall, providerRegistry, registerPalsuProvider, setProviderDiagnostic, stream, toAnthropicMessages, toOpenAIMessages };
557
+ export { type AssistantMessage, type CacheRetention, type ContentPart, type DoneEvent, type ErrorEvent, type ErrorSource, EventStream, type FormattedError, GGAIError, type ImageContent, type Message, type PalsuModelConfig, type PalsuModelHandle, type PalsuProviderConfig, type PalsuProviderHandle, type PalsuProviderState, type PalsuResponse, type PalsuResponseFactory, type Provider, type ProviderDiagnosticFn, type ProviderEntry, ProviderError, type ProviderStreamFn, type RawContent, type ServerToolCall, type ServerToolCallEvent, type ServerToolDefinition, type ServerToolResult, type ServerToolResultEvent, type StopReason, type StreamEvent, type StreamOptions, type StreamResponse, StreamResult, type SystemMessage, type TextContent, type TextDeltaEvent, type ThinkingContent, type ThinkingDeltaEvent, type ThinkingLevel, type Tool, type ToolCall, type ToolCallDeltaEvent, type ToolCallDoneEvent, type ToolChoice, type ToolResult, type ToolResultContent, type ToolResultMessage, type Usage, type UserMessage, type VideoContent, classifyProviderError, formatError, formatErrorForDisplay, isHardBillingMessage, isUsageLimitError, palsuAssistantMessage, palsuText, palsuThinking, palsuToolCall, prewarmAnthropicCache, providerRegistry, registerPalsuProvider, setProviderDiagnostic, stream, toAnthropicMessages, toOpenAIMessages };
package/dist/index.js CHANGED
@@ -281,7 +281,7 @@ var EventStream = class {
281
281
  }
282
282
  }
283
283
  };
284
- var StreamResult = class {
284
+ var StreamResult = class _StreamResult {
285
285
  response;
286
286
  buffer = [];
287
287
  done = false;
@@ -289,6 +289,18 @@ var StreamResult = class {
289
289
  resolveResponse;
290
290
  rejectResponse;
291
291
  resolveWait = null;
292
+ /**
293
+ * High-water mark: when the buffer exceeds this many unconsumed events,
294
+ * the pump pauses until the consumer drains below the low-water mark.
295
+ * Prevents unbounded memory growth when a consumer is slow.
296
+ * Only active when someone IS iterating — if nobody iterates (the `then()`
297
+ * path), backpressure is skipped so the pump can complete and resolve.
298
+ */
299
+ static HIGH_WATER = 5e3;
300
+ static LOW_WATER = 1e3;
301
+ iterating = false;
302
+ paused = false;
303
+ resolveDrain = null;
292
304
  constructor(generator, signal) {
293
305
  this.response = new Promise((resolve, reject) => {
294
306
  this.resolveResponse = resolve;
@@ -303,6 +315,13 @@ var StreamResult = class {
303
315
  this.buffer.push(next.value);
304
316
  this.resolveWait?.();
305
317
  this.resolveWait = null;
318
+ if (this.iterating && this.buffer.length > _StreamResult.HIGH_WATER) {
319
+ this.paused = true;
320
+ await new Promise((r) => {
321
+ this.resolveDrain = r;
322
+ });
323
+ this.paused = false;
324
+ }
306
325
  next = await this._nextWithAbort(generator, signal);
307
326
  }
308
327
  this.done = true;
@@ -341,11 +360,20 @@ var StreamResult = class {
341
360
  }
342
361
  }
343
362
  async *[Symbol.asyncIterator]() {
363
+ this.iterating = true;
344
364
  let index = 0;
345
365
  while (true) {
346
366
  while (index < this.buffer.length) {
347
367
  yield this.buffer[index++];
348
368
  }
369
+ if (this.paused && index > _StreamResult.LOW_WATER) {
370
+ this.resolveDrain?.();
371
+ this.resolveDrain = null;
372
+ }
373
+ if (index > 0 && !this.paused) {
374
+ this.buffer.splice(0, index);
375
+ index = 0;
376
+ }
349
377
  if (this.error) throw this.error;
350
378
  if (this.done) return;
351
379
  await new Promise((r) => {
@@ -358,16 +386,26 @@ var StreamResult = class {
358
386
  }
359
387
  }
360
388
  then(onfulfilled, onrejected) {
389
+ if (this.paused) {
390
+ this.paused = false;
391
+ this.resolveDrain?.();
392
+ this.resolveDrain = null;
393
+ }
361
394
  return this.response.then(onfulfilled, onrejected);
362
395
  }
363
396
  };
364
397
 
365
398
  // src/utils/zod-to-json-schema.ts
366
399
  import { z } from "zod";
400
+ var schemaCache = /* @__PURE__ */ new WeakMap();
367
401
  function zodToJsonSchema(schema) {
402
+ const cached = schemaCache.get(schema);
403
+ if (cached) return cached;
368
404
  const jsonSchema = z.toJSONSchema(schema);
369
405
  const { $schema: _schema, ...rest } = jsonSchema;
370
- return normalizeRootForAnthropic(rest);
406
+ const normalized = normalizeRootForAnthropic(rest);
407
+ schemaCache.set(schema, normalized);
408
+ return normalized;
371
409
  }
372
410
  function resolveToolSchema(tool) {
373
411
  return tool.rawInputSchema ?? zodToJsonSchema(tool.parameters);
@@ -759,16 +797,17 @@ function toAnthropicThinking(level, maxTokens, model) {
759
797
  outputConfig: { effort }
760
798
  };
761
799
  }
800
+ const VISIBLE_FLOOR = 1024;
762
801
  const effectiveLevel = level === "xhigh" || level === "max" ? "high" : level;
763
802
  const budgetMap = {
764
- low: Math.max(1024, Math.floor(maxTokens * 0.25)),
765
- medium: Math.max(2048, Math.floor(maxTokens * 0.5)),
766
- high: Math.max(4096, maxTokens)
803
+ low: Math.max(1024, Math.floor(maxTokens * 0.2)),
804
+ medium: Math.max(2048, Math.floor(maxTokens * 0.45)),
805
+ high: Math.max(4096, Math.floor(maxTokens * 0.8))
767
806
  };
768
- const budget = budgetMap[effectiveLevel];
807
+ const budget = Math.max(0, Math.min(budgetMap[effectiveLevel], maxTokens - VISIBLE_FLOOR));
769
808
  return {
770
809
  thinking: { type: "enabled", budget_tokens: budget },
771
- maxTokens: maxTokens + budget
810
+ maxTokens
772
811
  };
773
812
  }
774
813
  function remapToolCallId(id, idMap) {
@@ -974,26 +1013,83 @@ function parseToolArguments(argsJson) {
974
1013
  }
975
1014
 
976
1015
  // src/providers/anthropic.ts
1016
+ var anthropicClientCache = /* @__PURE__ */ new Map();
977
1017
  function createClient(options) {
978
1018
  const isOAuth = options.apiKey?.startsWith("sk-ant-oat");
979
- return new Anthropic({
1019
+ const userAgent = isOAuth ? options.userAgent ?? "claude-cli/2.1.75 (external, cli)" : "";
1020
+ const cacheKey = `${options.apiKey ?? ""}|${options.baseUrl ?? ""}|${userAgent}`;
1021
+ if (!options.fetch) {
1022
+ const cached = anthropicClientCache.get(cacheKey);
1023
+ if (cached) return cached;
1024
+ }
1025
+ const client = new Anthropic({
980
1026
  ...isOAuth ? { apiKey: null, authToken: options.apiKey } : { apiKey: options.apiKey },
981
1027
  ...options.baseUrl ? { baseURL: options.baseUrl } : {},
982
1028
  ...options.fetch ? { fetch: options.fetch } : {},
983
- // Disable SDK retries — the agent loop has its own stall/overload retry
984
- // logic that surfaces errors properly. SDK retries on 429s can cause
985
- // multi-minute hangs when the provider stops responding mid-retry.
986
1029
  maxRetries: 0,
987
1030
  ...isOAuth ? {
988
1031
  defaultHeaders: {
989
- // Anthropic's OAuth edge validates the claude-cli version. Callers
990
- // (ggcoder) resolve the live version at runtime; the literal here
991
- // is the offline fallback for direct gg-ai consumers.
992
- "user-agent": options.userAgent ?? "claude-cli/2.1.75 (external, cli)",
1032
+ "user-agent": userAgent,
993
1033
  "x-app": "cli"
994
1034
  }
995
1035
  } : {}
996
1036
  });
1037
+ if (!options.fetch) {
1038
+ if (anthropicClientCache.size >= 8) {
1039
+ const oldest = anthropicClientCache.keys().next().value;
1040
+ if (oldest) anthropicClientCache.delete(oldest);
1041
+ }
1042
+ anthropicClientCache.set(cacheKey, client);
1043
+ }
1044
+ return client;
1045
+ }
1046
+ async function prewarmAnthropicCache(options) {
1047
+ try {
1048
+ const client = createClient({
1049
+ apiKey: options.apiKey,
1050
+ baseUrl: options.baseUrl,
1051
+ userAgent: options.userAgent
1052
+ });
1053
+ const cacheControl = toAnthropicCacheControl(options.cacheRetention ?? "long", options.baseUrl);
1054
+ const { system, messages } = toAnthropicMessages(
1055
+ [
1056
+ { role: "system", content: options.system },
1057
+ { role: "user", content: "." }
1058
+ ],
1059
+ cacheControl
1060
+ );
1061
+ const isOAuth = options.apiKey.startsWith("sk-ant-oat");
1062
+ const fullSystem = isOAuth ? [
1063
+ {
1064
+ type: "text",
1065
+ text: "You are Claude Code, Anthropic's official CLI for Claude."
1066
+ },
1067
+ ...system ?? []
1068
+ ] : system;
1069
+ const tools = options.tools?.length ? toAnthropicTools(options.tools, {
1070
+ cacheControl,
1071
+ enableFineGrainedToolStreaming: true
1072
+ }) : void 0;
1073
+ await client.messages.create(
1074
+ {
1075
+ model: options.model,
1076
+ max_tokens: 1,
1077
+ messages,
1078
+ ...fullSystem ? { system: fullSystem } : {},
1079
+ ...tools ? {
1080
+ tools: [
1081
+ ...tools,
1082
+ ...options.serverTools ?? []
1083
+ ]
1084
+ } : {}
1085
+ },
1086
+ {
1087
+ signal: options.signal ?? void 0,
1088
+ ...isOAuth ? { headers: { "anthropic-beta": "claude-code-20250219,oauth-2025-04-20" } } : {}
1089
+ }
1090
+ );
1091
+ } catch {
1092
+ }
997
1093
  }
998
1094
  function streamAnthropic(options) {
999
1095
  return new StreamResult(runStream(options), options.signal);
@@ -1573,13 +1669,27 @@ function extractOpenAIUsage(usage) {
1573
1669
  cacheRead
1574
1670
  };
1575
1671
  }
1672
+ var openaiClientCache = /* @__PURE__ */ new Map();
1576
1673
  function createClient2(options) {
1577
- return new OpenAI({
1674
+ const cacheKey = `${options.apiKey ?? ""}|${options.baseUrl ?? ""}|${JSON.stringify(options.defaultHeaders ?? {})}`;
1675
+ if (!options.fetch) {
1676
+ const cached = openaiClientCache.get(cacheKey);
1677
+ if (cached) return cached;
1678
+ }
1679
+ const client = new OpenAI({
1578
1680
  apiKey: options.apiKey,
1579
1681
  ...options.baseUrl ? { baseURL: options.baseUrl } : {},
1580
1682
  ...options.fetch ? { fetch: options.fetch } : {},
1581
1683
  ...options.defaultHeaders ? { defaultHeaders: options.defaultHeaders } : {}
1582
1684
  });
1685
+ if (!options.fetch) {
1686
+ if (openaiClientCache.size >= 8) {
1687
+ const oldest = openaiClientCache.keys().next().value;
1688
+ if (oldest) openaiClientCache.delete(oldest);
1689
+ }
1690
+ openaiClientCache.set(cacheKey, client);
1691
+ }
1692
+ return client;
1583
1693
  }
1584
1694
  function streamOpenAI(options) {
1585
1695
  return new StreamResult(runStream2(options), options.signal);
@@ -1994,9 +2104,6 @@ async function* runStream3(options) {
1994
2104
  body.tools = toCodexTools(options.tools);
1995
2105
  }
1996
2106
  body.prompt_cache_key = normalizePromptCacheKey(options.promptCacheKey ?? "ggcoder");
1997
- if (options.cacheRetention === "long") {
1998
- body.prompt_cache_retention = "24h";
1999
- }
2000
2107
  if (options.temperature != null && !options.thinking) {
2001
2108
  body.temperature = options.temperature;
2002
2109
  }
@@ -3309,6 +3416,7 @@ export {
3309
3416
  palsuText,
3310
3417
  palsuThinking,
3311
3418
  palsuToolCall,
3419
+ prewarmAnthropicCache,
3312
3420
  providerRegistry,
3313
3421
  registerPalsuProvider,
3314
3422
  setProviderDiagnostic,