npm - @kenkaiiii/gg-ai - Versions diffs - 4.11.2 → 4.12.1 - Mend

@kenkaiiii/gg-ai 4.11.2 → 4.12.1

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (7)

package/dist/index.d.cts CHANGED Viewed

@@ -255,6 +255,18 @@ declare class StreamResult implements AsyncIterable<StreamEvent> {
     private resolveResponse;
     private rejectResponse;
     private resolveWait;
+    /**
+     * High-water mark: when the buffer exceeds this many unconsumed events,
+     * the pump pauses until the consumer drains below the low-water mark.
+     * Prevents unbounded memory growth when a consumer is slow.
+     * Only active when someone IS iterating — if nobody iterates (the `then()`
+     * path), backpressure is skipped so the pump can complete and resolve.
+     */
+    private static readonly HIGH_WATER;
+    private static readonly LOW_WATER;
+    private iterating;
+    private paused;
+    private resolveDrain;
     constructor(generator: AsyncGenerator<StreamEvent, StreamResponse>, signal?: AbortSignal);
     private pump;
     private _nextWithAbort;
@@ -451,6 +463,28 @@ declare function toOpenAIMessages(messages: Message[], options?: {
     supportsImages?: boolean;
 }): OpenAI.ChatCompletionMessageParam[];
+/**
+ * Fire a minimal `max_tokens: 1` request that populates the Anthropic prompt
+ * cache with the system prompt + tools prefix, so the first real user turn is
+ * a cache read instead of a cold cache write. Best-effort: any error is
+ * swallowed so a failed pre-warm never blocks the session.
+ *
+ * Called by AgentSession when speedProfile is "optimized", before the first
+ * real agent-loop turn. The cache TTL follows the `cacheRetention` option —
+ * pass "long" (1 h) so the pre-warm survives until the user's first message.
+ */
+declare function prewarmAnthropicCache(options: {
+    apiKey: string;
+    model: string;
+    system: string;
+    tools?: StreamOptions["tools"];
+    serverTools?: StreamOptions["serverTools"];
+    baseUrl?: string;
+    userAgent?: string;
+    cacheRetention?: StreamOptions["cacheRetention"];
+    signal?: AbortSignal;
+}): Promise<void>;
 interface PalsuProviderState {
     callCount: number;
 }
@@ -520,4 +554,4 @@ interface PalsuProviderConfig {
  */
 declare function registerPalsuProvider(config?: PalsuProviderConfig): PalsuProviderHandle;
-export { type AssistantMessage, type CacheRetention, type ContentPart, type DoneEvent, type ErrorEvent, type ErrorSource, EventStream, type FormattedError, GGAIError, type ImageContent, type Message, type PalsuModelConfig, type PalsuModelHandle, type PalsuProviderConfig, type PalsuProviderHandle, type PalsuProviderState, type PalsuResponse, type PalsuResponseFactory, type Provider, type ProviderDiagnosticFn, type ProviderEntry, ProviderError, type ProviderStreamFn, type RawContent, type ServerToolCall, type ServerToolCallEvent, type ServerToolDefinition, type ServerToolResult, type ServerToolResultEvent, type StopReason, type StreamEvent, type StreamOptions, type StreamResponse, StreamResult, type SystemMessage, type TextContent, type TextDeltaEvent, type ThinkingContent, type ThinkingDeltaEvent, type ThinkingLevel, type Tool, type ToolCall, type ToolCallDeltaEvent, type ToolCallDoneEvent, type ToolChoice, type ToolResult, type ToolResultContent, type ToolResultMessage, type Usage, type UserMessage, type VideoContent, classifyProviderError, formatError, formatErrorForDisplay, isHardBillingMessage, isUsageLimitError, palsuAssistantMessage, palsuText, palsuThinking, palsuToolCall, providerRegistry, registerPalsuProvider, setProviderDiagnostic, stream, toAnthropicMessages, toOpenAIMessages };
+export { type AssistantMessage, type CacheRetention, type ContentPart, type DoneEvent, type ErrorEvent, type ErrorSource, EventStream, type FormattedError, GGAIError, type ImageContent, type Message, type PalsuModelConfig, type PalsuModelHandle, type PalsuProviderConfig, type PalsuProviderHandle, type PalsuProviderState, type PalsuResponse, type PalsuResponseFactory, type Provider, type ProviderDiagnosticFn, type ProviderEntry, ProviderError, type ProviderStreamFn, type RawContent, type ServerToolCall, type ServerToolCallEvent, type ServerToolDefinition, type ServerToolResult, type ServerToolResultEvent, type StopReason, type StreamEvent, type StreamOptions, type StreamResponse, StreamResult, type SystemMessage, type TextContent, type TextDeltaEvent, type ThinkingContent, type ThinkingDeltaEvent, type ThinkingLevel, type Tool, type ToolCall, type ToolCallDeltaEvent, type ToolCallDoneEvent, type ToolChoice, type ToolResult, type ToolResultContent, type ToolResultMessage, type Usage, type UserMessage, type VideoContent, classifyProviderError, formatError, formatErrorForDisplay, isHardBillingMessage, isUsageLimitError, palsuAssistantMessage, palsuText, palsuThinking, palsuToolCall, prewarmAnthropicCache, providerRegistry, registerPalsuProvider, setProviderDiagnostic, stream, toAnthropicMessages, toOpenAIMessages };

package/dist/index.d.ts CHANGED Viewed

@@ -255,6 +255,18 @@ declare class StreamResult implements AsyncIterable<StreamEvent> {
     private resolveResponse;
     private rejectResponse;
     private resolveWait;
+    /**
+     * High-water mark: when the buffer exceeds this many unconsumed events,
+     * the pump pauses until the consumer drains below the low-water mark.
+     * Prevents unbounded memory growth when a consumer is slow.
+     * Only active when someone IS iterating — if nobody iterates (the `then()`
+     * path), backpressure is skipped so the pump can complete and resolve.
+     */
+    private static readonly HIGH_WATER;
+    private static readonly LOW_WATER;
+    private iterating;
+    private paused;
+    private resolveDrain;
     constructor(generator: AsyncGenerator<StreamEvent, StreamResponse>, signal?: AbortSignal);
     private pump;
     private _nextWithAbort;
@@ -451,6 +463,28 @@ declare function toOpenAIMessages(messages: Message[], options?: {
     supportsImages?: boolean;
 }): OpenAI.ChatCompletionMessageParam[];
+/**
+ * Fire a minimal `max_tokens: 1` request that populates the Anthropic prompt
+ * cache with the system prompt + tools prefix, so the first real user turn is
+ * a cache read instead of a cold cache write. Best-effort: any error is
+ * swallowed so a failed pre-warm never blocks the session.
+ *
+ * Called by AgentSession when speedProfile is "optimized", before the first
+ * real agent-loop turn. The cache TTL follows the `cacheRetention` option —
+ * pass "long" (1 h) so the pre-warm survives until the user's first message.
+ */
+declare function prewarmAnthropicCache(options: {
+    apiKey: string;
+    model: string;
+    system: string;
+    tools?: StreamOptions["tools"];
+    serverTools?: StreamOptions["serverTools"];
+    baseUrl?: string;
+    userAgent?: string;
+    cacheRetention?: StreamOptions["cacheRetention"];
+    signal?: AbortSignal;
+}): Promise<void>;
 interface PalsuProviderState {
     callCount: number;
 }
@@ -520,4 +554,4 @@ interface PalsuProviderConfig {
  */
 declare function registerPalsuProvider(config?: PalsuProviderConfig): PalsuProviderHandle;
-export { type AssistantMessage, type CacheRetention, type ContentPart, type DoneEvent, type ErrorEvent, type ErrorSource, EventStream, type FormattedError, GGAIError, type ImageContent, type Message, type PalsuModelConfig, type PalsuModelHandle, type PalsuProviderConfig, type PalsuProviderHandle, type PalsuProviderState, type PalsuResponse, type PalsuResponseFactory, type Provider, type ProviderDiagnosticFn, type ProviderEntry, ProviderError, type ProviderStreamFn, type RawContent, type ServerToolCall, type ServerToolCallEvent, type ServerToolDefinition, type ServerToolResult, type ServerToolResultEvent, type StopReason, type StreamEvent, type StreamOptions, type StreamResponse, StreamResult, type SystemMessage, type TextContent, type TextDeltaEvent, type ThinkingContent, type ThinkingDeltaEvent, type ThinkingLevel, type Tool, type ToolCall, type ToolCallDeltaEvent, type ToolCallDoneEvent, type ToolChoice, type ToolResult, type ToolResultContent, type ToolResultMessage, type Usage, type UserMessage, type VideoContent, classifyProviderError, formatError, formatErrorForDisplay, isHardBillingMessage, isUsageLimitError, palsuAssistantMessage, palsuText, palsuThinking, palsuToolCall, providerRegistry, registerPalsuProvider, setProviderDiagnostic, stream, toAnthropicMessages, toOpenAIMessages };
+export { type AssistantMessage, type CacheRetention, type ContentPart, type DoneEvent, type ErrorEvent, type ErrorSource, EventStream, type FormattedError, GGAIError, type ImageContent, type Message, type PalsuModelConfig, type PalsuModelHandle, type PalsuProviderConfig, type PalsuProviderHandle, type PalsuProviderState, type PalsuResponse, type PalsuResponseFactory, type Provider, type ProviderDiagnosticFn, type ProviderEntry, ProviderError, type ProviderStreamFn, type RawContent, type ServerToolCall, type ServerToolCallEvent, type ServerToolDefinition, type ServerToolResult, type ServerToolResultEvent, type StopReason, type StreamEvent, type StreamOptions, type StreamResponse, StreamResult, type SystemMessage, type TextContent, type TextDeltaEvent, type ThinkingContent, type ThinkingDeltaEvent, type ThinkingLevel, type Tool, type ToolCall, type ToolCallDeltaEvent, type ToolCallDoneEvent, type ToolChoice, type ToolResult, type ToolResultContent, type ToolResultMessage, type Usage, type UserMessage, type VideoContent, classifyProviderError, formatError, formatErrorForDisplay, isHardBillingMessage, isUsageLimitError, palsuAssistantMessage, palsuText, palsuThinking, palsuToolCall, prewarmAnthropicCache, providerRegistry, registerPalsuProvider, setProviderDiagnostic, stream, toAnthropicMessages, toOpenAIMessages };

package/dist/index.js CHANGED Viewed

@@ -281,7 +281,7 @@ var EventStream = class {
     }
   }
 };
-var StreamResult = class {
+var StreamResult = class _StreamResult {
   response;
   buffer = [];
   done = false;
@@ -289,6 +289,18 @@ var StreamResult = class {
   resolveResponse;
   rejectResponse;
   resolveWait = null;
+  /**
+   * High-water mark: when the buffer exceeds this many unconsumed events,
+   * the pump pauses until the consumer drains below the low-water mark.
+   * Prevents unbounded memory growth when a consumer is slow.
+   * Only active when someone IS iterating — if nobody iterates (the `then()`
+   * path), backpressure is skipped so the pump can complete and resolve.
+   */
+  static HIGH_WATER = 5e3;
+  static LOW_WATER = 1e3;
+  iterating = false;
+  paused = false;
+  resolveDrain = null;
   constructor(generator, signal) {
     this.response = new Promise((resolve, reject) => {
       this.resolveResponse = resolve;
@@ -303,6 +315,13 @@ var StreamResult = class {
         this.buffer.push(next.value);
         this.resolveWait?.();
         this.resolveWait = null;
+        if (this.iterating && this.buffer.length > _StreamResult.HIGH_WATER) {
+          this.paused = true;
+          await new Promise((r) => {
+            this.resolveDrain = r;
+          });
+          this.paused = false;
+        }
         next = await this._nextWithAbort(generator, signal);
       }
       this.done = true;
@@ -341,11 +360,20 @@ var StreamResult = class {
     }
   }
   async *[Symbol.asyncIterator]() {
+    this.iterating = true;
     let index = 0;
     while (true) {
       while (index < this.buffer.length) {
         yield this.buffer[index++];
       }
+      if (this.paused && index > _StreamResult.LOW_WATER) {
+        this.resolveDrain?.();
+        this.resolveDrain = null;
+      }
+      if (index > 0 && !this.paused) {
+        this.buffer.splice(0, index);
+        index = 0;
+      }
       if (this.error) throw this.error;
       if (this.done) return;
       await new Promise((r) => {
@@ -358,16 +386,26 @@ var StreamResult = class {
     }
   }
   then(onfulfilled, onrejected) {
+    if (this.paused) {
+      this.paused = false;
+      this.resolveDrain?.();
+      this.resolveDrain = null;
+    }
     return this.response.then(onfulfilled, onrejected);
   }
 };
 // src/utils/zod-to-json-schema.ts
 import { z } from "zod";
+var schemaCache = /* @__PURE__ */ new WeakMap();
 function zodToJsonSchema(schema) {
+  const cached = schemaCache.get(schema);
+  if (cached) return cached;
   const jsonSchema = z.toJSONSchema(schema);
   const { $schema: _schema, ...rest } = jsonSchema;
-  return normalizeRootForAnthropic(rest);
+  const normalized = normalizeRootForAnthropic(rest);
+  schemaCache.set(schema, normalized);
+  return normalized;
 }
 function resolveToolSchema(tool) {
   return tool.rawInputSchema ?? zodToJsonSchema(tool.parameters);
@@ -759,16 +797,17 @@ function toAnthropicThinking(level, maxTokens, model) {
       outputConfig: { effort }
     };
   }
+  const VISIBLE_FLOOR = 1024;
   const effectiveLevel = level === "xhigh" || level === "max" ? "high" : level;
   const budgetMap = {
-    low: Math.max(1024, Math.floor(maxTokens * 0.25)),
-    medium: Math.max(2048, Math.floor(maxTokens * 0.5)),
-    high: Math.max(4096, maxTokens)
+    low: Math.max(1024, Math.floor(maxTokens * 0.2)),
+    medium: Math.max(2048, Math.floor(maxTokens * 0.45)),
+    high: Math.max(4096, Math.floor(maxTokens * 0.8))
   };
-  const budget = budgetMap[effectiveLevel];
+  const budget = Math.max(0, Math.min(budgetMap[effectiveLevel], maxTokens - VISIBLE_FLOOR));
   return {
     thinking: { type: "enabled", budget_tokens: budget },
-    maxTokens: maxTokens + budget
+    maxTokens
   };
 }
 function remapToolCallId(id, idMap) {
@@ -974,26 +1013,83 @@ function parseToolArguments(argsJson) {
 }
 // src/providers/anthropic.ts
+var anthropicClientCache = /* @__PURE__ */ new Map();
 function createClient(options) {
   const isOAuth = options.apiKey?.startsWith("sk-ant-oat");
-  return new Anthropic({
+  const userAgent = isOAuth ? options.userAgent ?? "claude-cli/2.1.75 (external, cli)" : "";
+  const cacheKey = `${options.apiKey ?? ""}|${options.baseUrl ?? ""}|${userAgent}`;
+  if (!options.fetch) {
+    const cached = anthropicClientCache.get(cacheKey);
+    if (cached) return cached;
+  }
+  const client = new Anthropic({
     ...isOAuth ? { apiKey: null, authToken: options.apiKey } : { apiKey: options.apiKey },
     ...options.baseUrl ? { baseURL: options.baseUrl } : {},
     ...options.fetch ? { fetch: options.fetch } : {},
-    // Disable SDK retries — the agent loop has its own stall/overload retry
-    // logic that surfaces errors properly. SDK retries on 429s can cause
-    // multi-minute hangs when the provider stops responding mid-retry.
     maxRetries: 0,
     ...isOAuth ? {
       defaultHeaders: {
-        // Anthropic's OAuth edge validates the claude-cli version. Callers
-        // (ggcoder) resolve the live version at runtime; the literal here
-        // is the offline fallback for direct gg-ai consumers.
-        "user-agent": options.userAgent ?? "claude-cli/2.1.75 (external, cli)",
+        "user-agent": userAgent,
         "x-app": "cli"
       }
     } : {}
   });
+  if (!options.fetch) {
+    if (anthropicClientCache.size >= 8) {
+      const oldest = anthropicClientCache.keys().next().value;
+      if (oldest) anthropicClientCache.delete(oldest);
+    }
+    anthropicClientCache.set(cacheKey, client);
+  }
+  return client;
+}
+async function prewarmAnthropicCache(options) {
+  try {
+    const client = createClient({
+      apiKey: options.apiKey,
+      baseUrl: options.baseUrl,
+      userAgent: options.userAgent
+    });
+    const cacheControl = toAnthropicCacheControl(options.cacheRetention ?? "long", options.baseUrl);
+    const { system, messages } = toAnthropicMessages(
+      [
+        { role: "system", content: options.system },
+        { role: "user", content: "." }
+      ],
+      cacheControl
+    );
+    const isOAuth = options.apiKey.startsWith("sk-ant-oat");
+    const fullSystem = isOAuth ? [
+      {
+        type: "text",
+        text: "You are Claude Code, Anthropic's official CLI for Claude."
+      },
+      ...system ?? []
+    ] : system;
+    const tools = options.tools?.length ? toAnthropicTools(options.tools, {
+      cacheControl,
+      enableFineGrainedToolStreaming: true
+    }) : void 0;
+    await client.messages.create(
+      {
+        model: options.model,
+        max_tokens: 1,
+        messages,
+        ...fullSystem ? { system: fullSystem } : {},
+        ...tools ? {
+          tools: [
+            ...tools,
+            ...options.serverTools ?? []
+          ]
+        } : {}
+      },
+      {
+        signal: options.signal ?? void 0,
+        ...isOAuth ? { headers: { "anthropic-beta": "claude-code-20250219,oauth-2025-04-20" } } : {}
+      }
+    );
+  } catch {
+  }
 }
 function streamAnthropic(options) {
   return new StreamResult(runStream(options), options.signal);
@@ -1573,13 +1669,27 @@ function extractOpenAIUsage(usage) {
     cacheRead
   };
 }
+var openaiClientCache = /* @__PURE__ */ new Map();
 function createClient2(options) {
-  return new OpenAI({
+  const cacheKey = `${options.apiKey ?? ""}|${options.baseUrl ?? ""}|${JSON.stringify(options.defaultHeaders ?? {})}`;
+  if (!options.fetch) {
+    const cached = openaiClientCache.get(cacheKey);
+    if (cached) return cached;
+  }
+  const client = new OpenAI({
     apiKey: options.apiKey,
     ...options.baseUrl ? { baseURL: options.baseUrl } : {},
     ...options.fetch ? { fetch: options.fetch } : {},
     ...options.defaultHeaders ? { defaultHeaders: options.defaultHeaders } : {}
   });
+  if (!options.fetch) {
+    if (openaiClientCache.size >= 8) {
+      const oldest = openaiClientCache.keys().next().value;
+      if (oldest) openaiClientCache.delete(oldest);
+    }
+    openaiClientCache.set(cacheKey, client);
+  }
+  return client;
 }
 function streamOpenAI(options) {
   return new StreamResult(runStream2(options), options.signal);
@@ -1994,9 +2104,6 @@ async function* runStream3(options) {
     body.tools = toCodexTools(options.tools);
   }
   body.prompt_cache_key = normalizePromptCacheKey(options.promptCacheKey ?? "ggcoder");
-  if (options.cacheRetention === "long") {
-    body.prompt_cache_retention = "24h";
-  }
   if (options.temperature != null && !options.thinking) {
     body.temperature = options.temperature;
   }
@@ -3309,6 +3416,7 @@ export {
   palsuText,
   palsuThinking,
   palsuToolCall,
+  prewarmAnthropicCache,
   providerRegistry,
   registerPalsuProvider,
   setProviderDiagnostic,