npm - opencode-gemini-auth - Versions diffs - 1.4.3 → 1.4.4 - Mend

opencode-gemini-auth 1.4.3 → 1.4.4

This diff shows the differences between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in the public registry.

Files changed (10)

package/package.json +1 -1
package/src/plugin/notify.test.ts +144 -0
package/src/plugin/notify.ts +89 -0
package/src/plugin/request/prepare.ts +42 -0
package/src/plugin/request.test.ts +30 -0
package/src/plugin/retry/index.ts +144 -1
package/src/plugin/retry/quota.ts +29 -10
package/src/plugin/retry.test.ts +162 -8
package/src/plugin/types.ts +10 -0
package/src/plugin.ts +8 -0

package/package.json CHANGED Viewed

@@ -1,7 +1,7 @@
 {
   "name": "opencode-gemini-auth",
   "module": "index.ts",
-  "version": "1.4.3",
+  "version": "1.4.4",
   "author": "jenslys",
   "repository": "https://github.com/jenslys/opencode-gemini-auth",
   "files": [

package/src/plugin/notify.test.ts ADDED Viewed

@@ -0,0 +1,144 @@
+import { afterEach, beforeEach, describe, expect, it, mock } from "bun:test";
+import { maybeShowGeminiCapacityToast, notifyInternals } from "./notify";
+import type { PluginClient } from "./types";
+function makeQuota429(reason: string): Response {
+  return new Response(
+    JSON.stringify([
+      {
+        error: {
+          message: "rate limited",
+          details: [
+            {
+              "@type": "type.googleapis.com/google.rpc.ErrorInfo",
+              reason,
+              domain: "cloudcode-pa.googleapis.com",
+            },
+          ],
+        },
+      },
+    ]),
+    {
+      status: 429,
+      headers: { "content-type": "application/json" },
+    },
+  );
+}
+describe("maybeShowGeminiCapacityToast", () => {
+  const originalTestToastFlag = process.env.OPENCODE_GEMINI_TEST_TOAST;
+  beforeEach(() => {
+    notifyInternals.resetCooldowns();
+    delete process.env.OPENCODE_GEMINI_TEST_TOAST;
+  });
+  afterEach(() => {
+    mock.restore();
+    if (originalTestToastFlag === undefined) {
+      delete process.env.OPENCODE_GEMINI_TEST_TOAST;
+    } else {
+      process.env.OPENCODE_GEMINI_TEST_TOAST = originalTestToastFlag;
+    }
+  });
+  it("shows toast for MODEL_CAPACITY_EXHAUSTED", async () => {
+    const showToast = mock(async (_input: unknown) => true);
+    const client = { auth: { set: async () => {} }, tui: { showToast } } as PluginClient;
+    const response = makeQuota429("MODEL_CAPACITY_EXHAUSTED");
+    await maybeShowGeminiCapacityToast(client, response, "project-1", "gemini-3-flash-preview");
+    expect(showToast.mock.calls.length).toBe(1);
+    const firstCall = showToast.mock.calls.at(0);
+    expect(firstCall?.[0]).toEqual({
+      body: {
+        title: "Gemini Capacity Unavailable",
+        message:
+          "Google reports temporary server capacity limits for gemini-3-flash-preview. Please retry in a few seconds.",
+        variant: "warning",
+        duration: 7000,
+      },
+    });
+  });
+  it("does not show toast for non-capacity 429 reasons", async () => {
+    const showToast = mock(async (_input: unknown) => true);
+    const client = { auth: { set: async () => {} }, tui: { showToast } } as PluginClient;
+    const response = makeQuota429("RATE_LIMIT_EXCEEDED");
+    await maybeShowGeminiCapacityToast(client, response, "project-1", "gemini-3-flash-preview");
+    expect(showToast.mock.calls.length).toBe(0);
+  });
+  it("dedupes toasts within cooldown window", async () => {
+    const showToast = mock(async (_input: unknown) => true);
+    const client = { auth: { set: async () => {} }, tui: { showToast } } as PluginClient;
+    await maybeShowGeminiCapacityToast(
+      client,
+      makeQuota429("MODEL_CAPACITY_EXHAUSTED"),
+      "project-1",
+      "gemini-3-flash-preview",
+    );
+    await maybeShowGeminiCapacityToast(
+      client,
+      makeQuota429("MODEL_CAPACITY_EXHAUSTED"),
+      "project-1",
+      "gemini-3-flash-preview",
+    );
+    expect(showToast.mock.calls.length).toBe(1);
+  });
+});
+describe("maybeShowGeminiTestToast", () => {
+  const originalTestToastFlag = process.env.OPENCODE_GEMINI_TEST_TOAST;
+  beforeEach(() => {
+    notifyInternals.resetCooldowns();
+    delete process.env.OPENCODE_GEMINI_TEST_TOAST;
+  });
+  afterEach(() => {
+    mock.restore();
+    if (originalTestToastFlag === undefined) {
+      delete process.env.OPENCODE_GEMINI_TEST_TOAST;
+    } else {
+      process.env.OPENCODE_GEMINI_TEST_TOAST = originalTestToastFlag;
+    }
+  });
+  it("does not show test toast when flag is not enabled", async () => {
+    const { maybeShowGeminiTestToast } = await import("./notify");
+    const showToast = mock(async (_input: unknown) => true);
+    const client = { auth: { set: async () => {} }, tui: { showToast } } as PluginClient;
+    await maybeShowGeminiTestToast(client, "project-1");
+    expect(showToast.mock.calls.length).toBe(0);
+  });
+  it("shows test toast once per project when flag is enabled", async () => {
+    process.env.OPENCODE_GEMINI_TEST_TOAST = "1";
+    const { maybeShowGeminiTestToast } = await import("./notify");
+    const showToast = mock(async (_input: unknown) => true);
+    const client = { auth: { set: async () => {} }, tui: { showToast } } as PluginClient;
+    await maybeShowGeminiTestToast(client, "project-1");
+    await maybeShowGeminiTestToast(client, "project-1");
+    expect(showToast.mock.calls.length).toBe(1);
+    const firstCall = showToast.mock.calls.at(0);
+    expect(firstCall?.[0]).toEqual({
+      body: {
+        title: "Gemini Toast Test",
+        message: "Temporary test toast from opencode-gemini-auth.",
+        variant: "info",
+        duration: 5000,
+      },
+    });
+  });
+});

package/src/plugin/notify.ts ADDED Viewed

@@ -0,0 +1,89 @@
+import { classifyQuotaResponse } from "./retry/quota";
+import { isGeminiDebugEnabled, logGeminiDebugMessage } from "./debug";
+import type { PluginClient } from "./types";
+const MODEL_CAPACITY_TOAST_COOLDOWN_MS = 30_000;
+const modelCapacityToastCooldownByKey = new Map<string, number>();
+const TEST_TOAST_FLAG = "OPENCODE_GEMINI_TEST_TOAST";
+const testToastShownByProject = new Set<string>();
+/**
+ * Emits a user-facing toast for server-side Gemini model capacity exhaustion.
+ *
+ * We deliberately notify only `MODEL_CAPACITY_EXHAUSTED` (not generic 429s)
+ * so we do not mislabel account-level quota limits as backend incidents.
+ */
+export async function maybeShowGeminiCapacityToast(
+  client: PluginClient,
+  response: Response,
+  projectId: string,
+  requestedModel?: string,
+): Promise<void> {
+  if (response.status !== 429 || !client.tui?.showToast) {
+    return;
+  }
+  const quotaContext = await classifyQuotaResponse(response);
+  if (quotaContext?.reason !== "MODEL_CAPACITY_EXHAUSTED") {
+    return;
+  }
+  const model = requestedModel ?? "the selected model";
+  const toastKey = `${projectId}|${model}|MODEL_CAPACITY_EXHAUSTED`;
+  const now = Date.now();
+  const cooldownUntil = modelCapacityToastCooldownByKey.get(toastKey) ?? 0;
+  if (cooldownUntil > now) {
+    return;
+  }
+  modelCapacityToastCooldownByKey.set(toastKey, now + MODEL_CAPACITY_TOAST_COOLDOWN_MS);
+  await client.tui.showToast({
+    body: {
+      title: "Gemini Capacity Unavailable",
+      message: `Google reports temporary server capacity limits for ${model}. Please retry in a few seconds.`,
+      variant: "warning",
+      duration: 7000,
+    },
+  });
+  if (isGeminiDebugEnabled()) {
+    logGeminiDebugMessage(`Toast: emitted capacity warning for model=${model} project=${projectId}`);
+  }
+}
+/**
+ * Temporary smoke-test toast, enabled only with OPENCODE_GEMINI_TEST_TOAST=1.
+ * Emits once per project per process lifetime to avoid toast spam.
+ */
+export async function maybeShowGeminiTestToast(
+  client: PluginClient,
+  projectId: string,
+): Promise<void> {
+  if (process.env[TEST_TOAST_FLAG]?.trim() !== "1" || !client.tui?.showToast) {
+    return;
+  }
+  const key = projectId || "global";
+  if (testToastShownByProject.has(key)) {
+    return;
+  }
+  testToastShownByProject.add(key);
+  await client.tui.showToast({
+    body: {
+      title: "Gemini Toast Test",
+      message: "Temporary test toast from opencode-gemini-auth.",
+      variant: "info",
+      duration: 5000,
+    },
+  });
+  if (isGeminiDebugEnabled()) {
+    logGeminiDebugMessage(`Toast: emitted test toast (project=${key})`);
+  }
+}
+export const notifyInternals = {
+  resetCooldowns() {
+    modelCapacityToastCooldownByKey.clear();
+    testToastShownByProject.clear();
+  },
+};

package/src/plugin/request/prepare.ts CHANGED Viewed

@@ -116,6 +116,7 @@ function transformRequestBody(
     normalizeThinking(requestPayload);
     normalizeSystemInstruction(requestPayload);
     normalizeCachedContent(requestPayload);
+    stripThoughtPartsFromHistory(requestPayload);
     if ("model" in requestPayload) {
       delete requestPayload.model;
@@ -188,3 +189,44 @@ function normalizeCachedContent(requestPayload: Record<string, unknown>): void {
     delete requestPayload.extra_body;
   }
 }
+function stripThoughtPartsFromHistory(requestPayload: Record<string, unknown>): void {
+  const contents = requestPayload.contents;
+  if (!Array.isArray(contents)) {
+    return;
+  }
+  const sanitizedContents: unknown[] = [];
+  for (const content of contents) {
+    if (!content || typeof content !== "object") {
+      sanitizedContents.push(content);
+      continue;
+    }
+    const record = content as Record<string, unknown>;
+    const parts = Array.isArray(record.parts) ? record.parts : undefined;
+    if (!parts) {
+      sanitizedContents.push(content);
+      continue;
+    }
+    const filteredParts = parts.filter((part) => {
+      if (!part || typeof part !== "object") {
+        return true;
+      }
+      return (part as Record<string, unknown>).thought !== true;
+    });
+    // Drop empty model turns produced by interrupted thought streaming.
+    if (filteredParts.length === 0 && record.role === "model") {
+      continue;
+    }
+    sanitizedContents.push({
+      ...record,
+      parts: filteredParts,
+    });
+  }
+  requestPayload.contents = sanitizedContents;
+}

package/src/plugin/request.test.ts CHANGED Viewed

@@ -55,6 +55,36 @@ describe("request helpers", () => {
     expect((parsed.request as Record<string, unknown>).system_instruction).toBeUndefined();
   });
+  it("drops thought-only model parts from replayed history", () => {
+    const input =
+      "https://generativelanguage.googleapis.com/v1beta/models/gemini-3-flash-preview:streamGenerateContent";
+    const init: RequestInit = {
+      method: "POST",
+      headers: {
+        "Content-Type": "application/json",
+      },
+      body: JSON.stringify({
+        contents: [
+          { role: "user", parts: [{ text: "give me a joke" }] },
+          {
+            role: "model",
+            parts: [{ text: "internal thought", thought: true }],
+          },
+          { role: "user", parts: [{ text: "well?" }] },
+        ],
+      }),
+    };
+    const result = prepareGeminiRequest(input, init, "token-123", "project-456");
+    const parsed = JSON.parse(result.init.body as string) as Record<string, unknown>;
+    const request = parsed.request as Record<string, unknown>;
+    const contents = request.contents as Array<Record<string, unknown>>;
+    expect(contents.length).toBe(2);
+    expect(contents[0]?.role).toBe("user");
+    expect(contents[1]?.role).toBe("user");
+  });
   it("maps traceId to responseId for JSON responses", async () => {
     const response = new Response(
       JSON.stringify({

package/src/plugin/retry/index.ts CHANGED Viewed

@@ -8,6 +8,11 @@ import {
   wait,
 } from "./helpers";
 import { classifyQuotaResponse, retryInternals } from "./quota";
+import { isGeminiDebugEnabled, logGeminiDebugMessage } from "../debug";
+const retryCooldownByKey = new Map<string, number>();
+const RETRY_IN_FLIGHT_LOG_INTERVAL_MS = 5000;
+const MODEL_CAPACITY_COOLDOWN_MS = 8000;
 /**
  * Sends requests with retry/backoff semantics aligned to Gemini CLI:
@@ -24,39 +29,74 @@ export async function fetchWithRetry(
   }
   const retryInit = cloneRetryableInit(init);
+  const throttleKey = buildRetryThrottleKey(input, retryInit);
+  await waitForRetryCooldown(throttleKey, retryInit.signal);
   let attempt = 1;
+  const url = readRequestUrl(input);
   while (attempt <= DEFAULT_MAX_ATTEMPTS) {
     let response: Response;
+    const stopInFlightLog = startInFlightLog(attempt, url);
     try {
+      debugRetry(
+        `attempt ${attempt}/${DEFAULT_MAX_ATTEMPTS} -> ${url}`,
+      );
       response = await fetch(input, retryInit);
     } catch (error) {
+      stopInFlightLog();
       if (attempt >= DEFAULT_MAX_ATTEMPTS || !isRetryableNetworkError(error)) {
+        debugRetry(
+          `attempt ${attempt} network error is non-retryable or maxed: ${formatErrorSummary(error)}`,
+        );
         throw error;
       }
       if (retryInit.signal?.aborted) {
+        debugRetry(`attempt ${attempt} aborted before retry`);
         throw error;
       }
-      await wait(getExponentialDelayWithJitter(attempt));
+      const delayMs = getExponentialDelayWithJitter(attempt);
+      debugRetry(
+        `attempt ${attempt} network retry scheduled in ${delayMs}ms (${formatErrorSummary(error)})`,
+      );
+      await wait(delayMs);
       attempt += 1;
       continue;
     }
+    stopInFlightLog();
     if (!isRetryableStatus(response.status)) {
+      debugRetry(`attempt ${attempt} success or non-retryable status: ${response.status}`);
       return response;
     }
     const quotaContext = response.status === 429 ? await classifyQuotaResponse(response) : null;
     if (response.status === 429 && quotaContext?.terminal) {
+      if (quotaContext.reason === "MODEL_CAPACITY_EXHAUSTED") {
+        const cooldownMs = quotaContext.retryDelayMs ?? MODEL_CAPACITY_COOLDOWN_MS;
+        setRetryCooldown(throttleKey, cooldownMs);
+        debugRetry(`terminal model capacity; cooldown ${cooldownMs}ms before next request`);
+      }
+      debugRetry(
+        `attempt ${attempt} terminal 429 (${quotaContext.reason ?? "unknown"}), returning without retry`,
+      );
       return response;
     }
     if (attempt >= DEFAULT_MAX_ATTEMPTS || retryInit.signal?.aborted) {
+      debugRetry(
+        `attempt ${attempt} reached retry boundary (status=${response.status})`,
+      );
       return response;
     }
     const delayMs = await resolveRetryDelayMs(response, attempt, quotaContext?.retryDelayMs);
+    debugRetry(
+      `attempt ${attempt} retrying status=${response.status} reason=${quotaContext?.reason ?? "n/a"} delay=${delayMs}ms`,
+    );
+    if (delayMs > 0 && response.status === 429) {
+      setRetryCooldown(throttleKey, delayMs);
+    }
     if (delayMs > 0) {
       await wait(delayMs);
     }
@@ -76,4 +116,107 @@ function cloneRetryableInit(init: RequestInit | undefined): RequestInit {
   };
 }
+function buildRetryThrottleKey(input: RequestInfo, init: RequestInit): string {
+  const url = readRequestUrl(input);
+  const body = typeof init.body === "string" ? safeParseBody(init.body) : null;
+  const project = readString(body?.project);
+  const model = readString(body?.model);
+  return `${url}|${project ?? ""}|${model ?? ""}`;
+}
+async function waitForRetryCooldown(key: string, signal?: AbortSignal | null): Promise<void> {
+  const until = retryCooldownByKey.get(key);
+  if (!until) {
+    return;
+  }
+  const remaining = until - Date.now();
+  if (remaining <= 0) {
+    retryCooldownByKey.delete(key);
+    return;
+  }
+  if (signal?.aborted) {
+    debugRetry(`cooldown skipped due to abort (key=${shortKey(key)})`);
+    return;
+  }
+  debugRetry(`cooldown wait ${remaining}ms (key=${shortKey(key)})`);
+  await wait(remaining);
+  retryCooldownByKey.delete(key);
+}
+function setRetryCooldown(key: string, delayMs: number): void {
+  const next = Date.now() + delayMs;
+  const current = retryCooldownByKey.get(key) ?? 0;
+  retryCooldownByKey.set(key, Math.max(current, next));
+  debugRetry(`cooldown set ${delayMs}ms (key=${shortKey(key)})`);
+}
+function readRequestUrl(input: RequestInfo): string {
+  if (typeof input === "string") {
+    return input;
+  }
+  if (input instanceof URL) {
+    return input.toString();
+  }
+  const request = input as Request;
+  if (request.url) {
+    return request.url;
+  }
+  return input.toString();
+}
+function safeParseBody(body: string): Record<string, unknown> | null {
+  if (!body) {
+    return null;
+  }
+  try {
+    const parsed = JSON.parse(body);
+    if (parsed && typeof parsed === "object") {
+      return parsed as Record<string, unknown>;
+    }
+  } catch {}
+  return null;
+}
+function readString(value: unknown): string | undefined {
+  return typeof value === "string" && value.trim() ? value : undefined;
+}
+function debugRetry(message: string): void {
+  if (!isGeminiDebugEnabled()) {
+    return;
+  }
+  logGeminiDebugMessage(`Retry: ${message}`);
+}
+function formatErrorSummary(error: unknown): string {
+  if (error instanceof Error) {
+    return error.message;
+  }
+  return String(error);
+}
+function shortKey(key: string): string {
+  return key.length <= 120 ? key : `${key.slice(0, 120)}...`;
+}
+function startInFlightLog(attempt: number, url: string): () => void {
+  if (!isGeminiDebugEnabled()) {
+    return () => {};
+  }
+  const startedAt = Date.now();
+  const interval = setInterval(() => {
+    const elapsed = Date.now() - startedAt;
+    debugRetry(`attempt ${attempt} still waiting for response (${elapsed}ms) -> ${url}`);
+  }, RETRY_IN_FLIGHT_LOG_INTERVAL_MS);
+  return () => {
+    clearInterval(interval);
+  };
+}
 export { retryInternals };

package/src/plugin/retry/quota.ts CHANGED Viewed

@@ -23,6 +23,7 @@ interface GoogleRpcRetryInfo {
 export interface QuotaContext {
   terminal: boolean;
   retryDelayMs?: number;
+  reason?: string;
 }
 const CLOUDCODE_DOMAINS = new Set([
@@ -65,10 +66,17 @@ export async function classifyQuotaResponse(response: Response): Promise<QuotaCo
     return null;
   }
   if (errorInfo?.reason === "QUOTA_EXHAUSTED") {
-    return { terminal: true, retryDelayMs };
+    return { terminal: true, retryDelayMs, reason: errorInfo.reason };
   }
   if (errorInfo?.reason === "RATE_LIMIT_EXCEEDED") {
-    return { terminal: false, retryDelayMs: retryDelayMs ?? 10_000 };
+    return { terminal: false, retryDelayMs: retryDelayMs ?? 10_000, reason: errorInfo.reason };
+  }
+  if (errorInfo?.reason === "MODEL_CAPACITY_EXHAUSTED") {
+    return {
+      terminal: retryDelayMs === undefined,
+      retryDelayMs,
+      reason: errorInfo.reason,
+    };
   }
   const quotaFailure = details.find(
@@ -83,20 +91,20 @@ export async function classifyQuotaResponse(response: Response): Promise<QuotaCo
       .toLowerCase();
     if (allTexts.includes("perday") || allTexts.includes("daily") || allTexts.includes("per day")) {
-      return { terminal: true, retryDelayMs };
+      return { terminal: true, retryDelayMs, reason: errorInfo?.reason };
     }
     if (allTexts.includes("perminute") || allTexts.includes("per minute")) {
-      return { terminal: false, retryDelayMs: retryDelayMs ?? 60_000 };
+      return { terminal: false, retryDelayMs: retryDelayMs ?? 60_000, reason: errorInfo?.reason };
     }
-    return { terminal: false, retryDelayMs };
+    return { terminal: false, retryDelayMs, reason: errorInfo?.reason };
   }
   const quotaLimit = errorInfo?.metadata?.quota_limit?.toLowerCase() ?? "";
   if (quotaLimit.includes("perminute") || quotaLimit.includes("per minute")) {
-    return { terminal: false, retryDelayMs: retryDelayMs ?? 60_000 };
+    return { terminal: false, retryDelayMs: retryDelayMs ?? 60_000, reason: errorInfo?.reason };
   }
-  return { terminal: false, retryDelayMs };
+  return { terminal: false, retryDelayMs, reason: errorInfo?.reason };
 }
 /**
@@ -191,12 +199,15 @@ async function parseErrorBody(
     return null;
   }
-  if (!isObject(parsed) || !isObject(parsed.error)) {
+  const normalized = normalizeErrorEnvelope(parsed);
+  if (!normalized || !isObject(normalized.error)) {
     return null;
   }
+  const error = normalized.error as Record<string, unknown>;
   return {
-    message: typeof parsed.error.message === "string" ? parsed.error.message : undefined,
-    details: Array.isArray(parsed.error.details) ? parsed.error.details : undefined,
+    message: typeof error.message === "string" ? error.message : undefined,
+    details: Array.isArray(error.details) ? error.details : undefined,
   };
 }
@@ -204,6 +215,14 @@ function isObject(value: unknown): value is Record<string, any> {
   return !!value && typeof value === "object";
 }
+function normalizeErrorEnvelope(parsed: unknown): Record<string, unknown> | null {
+  if (Array.isArray(parsed)) {
+    const first = parsed[0];
+    return isObject(first) ? first : null;
+  }
+  return isObject(parsed) ? parsed : null;
+}
 export const retryInternals = {
   parseRetryDelayValue,
   parseRetryDelayFromMessage,

package/src/plugin/retry.test.ts CHANGED Viewed

@@ -3,8 +3,13 @@ import { afterEach, beforeEach, describe, expect, it, mock } from "bun:test";
 import { fetchWithRetry, retryInternals } from "./retry";
 const originalSetTimeout = globalThis.setTimeout;
+const scheduledDelays: number[] = [];
-function makeQuota429(reason: "RATE_LIMIT_EXCEEDED" | "QUOTA_EXHAUSTED", retryDelay?: string): Response {
+function makeQuota429(
+  reason: "RATE_LIMIT_EXCEEDED" | "QUOTA_EXHAUSTED" | "MODEL_CAPACITY_EXHAUSTED",
+  retryDelay?: string,
+  wrappedAsArray = false,
+): Response {
   const details: Record<string, unknown>[] = [
     {
       "@type": "type.googleapis.com/google.rpc.ErrorInfo",
@@ -18,13 +23,49 @@ function makeQuota429(reason: "RATE_LIMIT_EXCEEDED" | "QUOTA_EXHAUSTED", retryDe
       retryDelay,
     });
   }
+  const payload = {
+    error: {
+      message: "rate limited",
+      details,
+    },
+  };
+  return new Response(
+    JSON.stringify(
+      wrappedAsArray
+        ? [payload]
+        : payload,
+    ),
+    {
+      status: 429,
+      headers: { "content-type": "application/json" },
+    },
+  );
+}
+function makeQuota429WithMessage(
+  reason: "RATE_LIMIT_EXCEEDED" | "QUOTA_EXHAUSTED" | "MODEL_CAPACITY_EXHAUSTED",
+  message: string,
+  wrappedAsArray = false,
+): Response {
+  const details: Record<string, unknown>[] = [
+    {
+      "@type": "type.googleapis.com/google.rpc.ErrorInfo",
+      reason,
+      domain: "cloudcode-pa.googleapis.com",
+    },
+  ];
+  const payload = {
+    error: {
+      message,
+      details,
+    },
+  };
   return new Response(
-    JSON.stringify({
-      error: {
-        message: "rate limited",
-        details,
-      },
-    }),
+    JSON.stringify(
+      wrappedAsArray
+        ? [payload]
+        : payload,
+    ),
     {
       status: 429,
       headers: { "content-type": "application/json" },
@@ -35,7 +76,12 @@ function makeQuota429(reason: "RATE_LIMIT_EXCEEDED" | "QUOTA_EXHAUSTED", retryDe
 describe("fetchWithRetry", () => {
   beforeEach(() => {
     mock.restore();
-    (globalThis as { setTimeout: typeof setTimeout }).setTimeout = ((fn: (...args: any[]) => void) => {
+    scheduledDelays.length = 0;
+    (globalThis as { setTimeout: typeof setTimeout }).setTimeout = ((
+      fn: (...args: any[]) => void,
+      delay?: number | undefined,
+    ) => {
+      scheduledDelays.push(typeof delay === "number" ? delay : 0);
       fn();
       return 0 as unknown as ReturnType<typeof setTimeout>;
     }) as typeof setTimeout;
@@ -96,6 +142,84 @@ describe("fetchWithRetry", () => {
     expect(fetchMock.mock.calls.length).toBe(1);
   });
+  it("fails fast on model capacity exhaustion when no retry hint is provided", async () => {
+    const fetchMock = mock(async () => makeQuota429("MODEL_CAPACITY_EXHAUSTED"));
+    (globalThis as { fetch: typeof fetch }).fetch = fetchMock as unknown as typeof fetch;
+    const response = await fetchWithRetry("https://example.com", {
+      method: "POST",
+      body: JSON.stringify({ hello: "world" }),
+    });
+    expect(response.status).toBe(429);
+    expect(fetchMock.mock.calls.length).toBe(1);
+  });
+  it("fails fast on array-wrapped model capacity exhaustion payload", async () => {
+    const fetchMock = mock(async () =>
+      makeQuota429WithMessage(
+        "MODEL_CAPACITY_EXHAUSTED",
+        "No capacity available for model gemini-3-flash-preview on the server",
+        true,
+      ),
+    );
+    (globalThis as { fetch: typeof fetch }).fetch = fetchMock as unknown as typeof fetch;
+    const response = await fetchWithRetry("https://example.com", {
+      method: "POST",
+      body: JSON.stringify({ hello: "world" }),
+    });
+    expect(response.status).toBe(429);
+    expect(fetchMock.mock.calls.length).toBe(1);
+  });
+  it("applies cooldown after terminal model capacity exhaustion", async () => {
+    const fetchMock = mock(async () => {
+      if (fetchMock.mock.calls.length === 1) {
+        return makeQuota429WithMessage(
+          "MODEL_CAPACITY_EXHAUSTED",
+          "No capacity available for model gemini-3-flash-preview on the server",
+          true,
+        );
+      }
+      return new Response("ok", { status: 200 });
+    });
+    (globalThis as { fetch: typeof fetch }).fetch = fetchMock as unknown as typeof fetch;
+    const firstResponse = await fetchWithRetry("https://example.com", {
+      method: "POST",
+      body: JSON.stringify({ project: "project-1", model: "gemini-3-flash-preview" }),
+    });
+    const secondResponse = await fetchWithRetry("https://example.com", {
+      method: "POST",
+      body: JSON.stringify({ project: "project-1", model: "gemini-3-flash-preview" }),
+    });
+    expect(firstResponse.status).toBe(429);
+    expect(secondResponse.status).toBe(200);
+    expect(fetchMock.mock.calls.length).toBe(2);
+    expect(scheduledDelays).toContain(8000);
+  });
+  it("retries model capacity exhaustion when server provides RetryInfo", async () => {
+    const fetchMock = mock(async () => {
+      if (fetchMock.mock.calls.length === 1) {
+        return makeQuota429("MODEL_CAPACITY_EXHAUSTED", "500ms");
+      }
+      return new Response("ok", { status: 200 });
+    });
+    (globalThis as { fetch: typeof fetch }).fetch = fetchMock as unknown as typeof fetch;
+    const response = await fetchWithRetry("https://example.com", {
+      method: "POST",
+      body: JSON.stringify({ hello: "world" }),
+    });
+    expect(response.status).toBe(200);
+    expect(fetchMock.mock.calls.length).toBe(2);
+  });
   it("retries immediately when server returns Retry-After: 0", async () => {
     const fetchMock = mock(async () => {
       if (fetchMock.mock.calls.length === 1) {
@@ -116,6 +240,36 @@ describe("fetchWithRetry", () => {
     expect(response.status).toBe(200);
     expect(fetchMock.mock.calls.length).toBe(2);
   });
+  it("applies cooldown across requests to avoid repeated initial 429s", async () => {
+    const fetchMock = mock(async () => {
+      const callNumber = fetchMock.mock.calls.length;
+      if (callNumber === 1) {
+        return makeQuota429("RATE_LIMIT_EXCEEDED", "1500ms");
+      }
+      if (callNumber === 3 && scheduledDelays.length < 2) {
+        return makeQuota429("RATE_LIMIT_EXCEEDED", "1500ms");
+      }
+      return new Response("ok", { status: 200 });
+    });
+    (globalThis as { fetch: typeof fetch }).fetch = fetchMock as unknown as typeof fetch;
+    const firstResponse = await fetchWithRetry("https://example.com", {
+      method: "POST",
+      body: JSON.stringify({ project: "project-1", model: "gemini-2.5-flash" }),
+    });
+    const secondResponse = await fetchWithRetry("https://example.com", {
+      method: "POST",
+      body: JSON.stringify({ project: "project-1", model: "gemini-2.5-flash" }),
+    });
+    expect(firstResponse.status).toBe(200);
+    expect(secondResponse.status).toBe(200);
+    expect(fetchMock.mock.calls.length).toBe(3);
+    expect(scheduledDelays.length).toBe(2);
+    expect(scheduledDelays[0]).toBe(1500);
+    expect(scheduledDelays[1]).toBe(1500);
+  });
 });
 describe("retryInternals", () => {

package/src/plugin/types.ts CHANGED Viewed

@@ -52,6 +52,16 @@ export interface PluginClient {
   auth: {
     set(input: { path: { id: string }; body: OAuthAuthDetails }): Promise<void>;
   };
+  tui?: {
+    showToast(input: {
+      body: {
+        title?: string;
+        message: string;
+        variant: "info" | "success" | "warning" | "error";
+        duration?: number;
+      };
+    }): Promise<unknown>;
+  };
 }
 export interface PluginContext {

package/src/plugin.ts CHANGED Viewed

@@ -8,6 +8,7 @@ import {
   GEMINI_QUOTA_TOOL_NAME,
 } from "./plugin/quota";
 import { isGeminiDebugEnabled, logGeminiDebugMessage, startGeminiDebugRequest } from "./plugin/debug";
+import { maybeShowGeminiCapacityToast, maybeShowGeminiTestToast } from "./plugin/notify";
 import {
   isGenerativeLanguageRequest,
   prepareGeminiRequest,
@@ -98,6 +99,7 @@ export const GeminiCLIOAuthPlugin = async (
             client,
             configuredProjectId,
           );
+          await maybeShowGeminiTestToast(client, projectContext.effectiveProjectId);
           await maybeLogAvailableQuotaModels(
             authRecord.access,
             projectContext.effectiveProjectId,
@@ -123,6 +125,12 @@ export const GeminiCLIOAuthPlugin = async (
            * We intentionally do not auto-downgrade model tiers to avoid misleading users.
            */
           const response = await fetchWithRetry(transformed.request, transformed.init);
+          await maybeShowGeminiCapacityToast(
+            client,
+            response,
+            projectContext.effectiveProjectId,
+            transformed.requestedModel,
+          );
           return transformGeminiResponse(
             response,
             transformed.streaming,