npm - @kylebrodeur/pi-model-router - Versions diffs - 0.1.3 → 0.1.4 - Mend

@kylebrodeur/pi-model-router 0.1.3 → 0.1.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (5)

package/CHANGELOG.md CHANGED Viewed

@@ -8,6 +8,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ## [Unreleased]
 ### Added
+- Transparent wait and retry interception for string-based rate limit errors (e.g., "quota will reset after X seconds")
 - Ollama auto-sync feature
 - Rate-limit fallback with transparent HTTP error handling (402, 429, 503, 529)
 - Feature toggles in config (`features` object)

package/LEARNINGS.md CHANGED Viewed

@@ -73,10 +73,11 @@ The fallback mechanism uses a user-configurable sequence of models: `fallbackSeq
 *   **Key benefit**: Prevents catastrophic failures when a primary model is unavailable.
 ### 3. Graceful Error Handling
-The extension transparently handles errors. For "out of credits" (`402`) or "rate limit" (`429`), it automatically switches to a fallback model and emits a custom session entry (`router-fallback`) for headless tooling to detect.
+The extension transparently handles errors. For "out of credits" (`402`) or "rate limit" (`429`), it automatically switches to a fallback model and emits a custom session entry (`router-fallback`) for headless tooling to detect.
+Additionally, for string-based 429 errors specifying a cooldown (e.g., "quota will reset after 58s"), the router can intercept the stream, pause for the required duration (if under `shortDelayThreshold`), and automatically retry the original request without failing the turn.
 *   **When to use**: For any extension exposed to external API services.
-*   **Key insight**: Never mask API errors; provide enough detail (status codes) in UI notifications for users to diagnose.
+*   **Key insight**: Never mask API errors; provide enough detail (status codes) in UI notifications for users to diagnose, but handle transient issues (like short rate limits) invisibly where possible.
 ## 🔌 Pi Integration Patterns

package/README.md CHANGED Viewed

@@ -121,6 +121,30 @@ Copy the example config to one of:
 **Priority:** Project config `.pi/model-router.json` overrides user config `~/.pi/agent/model-router.json`. Both override defaults.
+### Rate Limit Interception & Fallback
+The router can gracefully handle 429 Rate Limit and Quota errors. If the error specifies a wait time (e.g., "reset after 58s"), the router will pause and automatically retry the prompt if the wait time is under your threshold. If it exceeds the threshold or is unparseable, it fails over to the next available model in your fallback sequence.
+```json
+{
+  "rateLimitFallback": {
+    "enabled": true,
+    "shortDelayThreshold": 60,
+    "autoFallback": true,
+    "autoRestore": true,
+    "restoreCheckInterval": 300,
+    "fallbackSequence": ["anthropic/claude-3-haiku-20240307", "ollama/*"]
+  }
+}
+```
+| Field | Description |
+|-------|-------------|
+| `shortDelayThreshold` | Maximum time (in seconds) the router will pause and wait to retry when encountering a rate limit. If the cooldown is longer than this, it triggers a fallback. |
+| `fallbackSequence` | Array of model IDs (or wildcards like `ollama/*`) to try if the primary model fails or the wait time is too long. |
+| `autoFallback` | (Optional) Automatically switch session to the fallback model globally after a hard failure. |
+| `autoRestore` | (Optional) If fallback was triggered, automatically try to restore the original cloud model after `restoreCheckInterval` seconds. |
 ### Progressive Enhancement Configs
 After installing optional extensions, copy one of these to `.pi/model-router.json`:

package/extensions/provider.ts CHANGED Viewed

@@ -30,6 +30,20 @@ import {
   hasImageAttachment,
 } from './routing';
+const rateLimitRegex = /(?:429|rate limit|quota).*?(?:reset after|try again in|wait)\s*(\d+)\s*([smh])/i;
+function extractWaitTimeMs(errorText: string): number | null {
+  const match = errorText.match(rateLimitRegex);
+  if (!match) return null;
+  const value = parseInt(match[1], 10);
+  const unit = match[2].toLowerCase();
+  if (unit === 's') return value * 1000;
+  if (unit === 'm') return value * 60000;
+  if (unit === 'h') return value * 3600000;
+  return null;
+}
 export const createErrorMessage = (
   model: Model<Api>,
   message: string,
@@ -457,74 +471,109 @@ export const registerRouterProvider = (
             const apiKey = auth.apiKey;
             const headers = auth.headers;
-            try {
-              // HONESTY CHECK & AUTO-TRUNCATION
-              // If the picked model has a smaller context than what we reported, truncate now.
-              let effectiveContext = context;
-              const targetLimit = targetModel.contextWindow || 128_000;
-              if (targetLimit < model.contextWindow!) {
-                effectiveContext = truncateContext(context, targetLimit);
-              }
+            let retryCount = 0;
+            let modelSuccess = false;
-              const thinkingOverride = actions.getThinkingOverride(
-                model.id,
-                decision.tier,
-              );
-              const delegatedReasoning =
-                targetModel.reasoning &&
-                (thinkingOverride ?? decision.thinking) !== 'off'
-                  ? (thinkingOverride ?? decision.thinking)
-                  : undefined;
-              if (state.lastExtensionContext) {
-                if (delegatedReasoning) {
-                  state.lastExtensionContext.ui.setHiddenThinkingLabel?.(
-                    `Thinking (${targetProvider}/${targetModelId})...`,
-                  );
-                } else {
-                  state.lastExtensionContext.ui.setHiddenThinkingLabel?.();
+            while (retryCount < 2) {
+              let contentReceived = false;
+              try {
+                // HONESTY CHECK & AUTO-TRUNCATION
+                // If the picked model has a smaller context than what we reported, truncate now.
+                let effectiveContext = context;
+                const targetLimit = targetModel.contextWindow || 128_000;
+                if (targetLimit < model.contextWindow!) {
+                  effectiveContext = truncateContext(context, targetLimit);
                 }
-              }
-              const delegatedStream = streamSimple(
-                targetModel,
-                effectiveContext,
-                {
-                  ...options,
-                  apiKey,
-                  headers,
-                  ...(delegatedReasoning
-                    ? { reasoning: delegatedReasoning }
-                    : {}),
-                },
-              );
+                const thinkingOverride = actions.getThinkingOverride(
+                  model.id,
+                  decision.tier,
+                );
+                const delegatedReasoning =
+                  targetModel.reasoning &&
+                  (thinkingOverride ?? decision.thinking) !== 'off'
+                    ? (thinkingOverride ?? decision.thinking)
+                    : undefined;
+                if (state.lastExtensionContext) {
+                  if (delegatedReasoning) {
+                    state.lastExtensionContext.ui.setHiddenThinkingLabel?.(
+                      `Thinking (${targetProvider}/${targetModelId})...`,
+                    );
+                  } else {
+                    state.lastExtensionContext.ui.setHiddenThinkingLabel?.();
+                  }
+                }
-              let contentReceived = false;
-              for await (const event of delegatedStream) {
-                if (event.type === 'done') {
-                  const cost = event.message.usage?.cost?.total ?? 0;
-                  state.accumulatedCost += cost;
+                const delegatedStream = streamSimple(
+                  targetModel,
+                  effectiveContext,
+                  {
+                    ...options,
+                    apiKey,
+                    headers,
+                    ...(delegatedReasoning
+                      ? { reasoning: delegatedReasoning }
+                      : {}),
+                  },
+                );
+                for await (const event of delegatedStream) {
+                  if (event.type === 'done') {
+                    const cost = event.message.usage?.cost?.total ?? 0;
+                    state.accumulatedCost += cost;
+                  }
+                  if (event.type === 'error' && !contentReceived) {
+                    throw new Error(
+                      (event as any).error?.errorMessage ||
+                        'Model failed before sending content.',
+                    );
+                  }
+                  const isContent =
+                    event.type === 'text_delta' ||
+                    event.type === 'thinking_delta' ||
+                    event.type === 'toolcall_delta' ||
+                    event.type === 'toolcall_end';
+                  if (isContent) contentReceived = true;
+                  stream.push(event);
                 }
-                if (event.type === 'error' && !contentReceived) {
-                  throw new Error(
-                    (event as any).error?.errorMessage ||
-                      'Model failed before sending content.',
-                  );
+                modelSuccess = true;
+                success = true;
+                if (i > 0) decision.isFallback = true;
+                break; // break the retry loop
+              } catch (err) {
+                const errMsg = err instanceof Error ? err.message : String(err);
+                const waitMs = extractWaitTimeMs(errMsg);
+                const maxWaitMs = (state.currentConfig.rateLimitFallback?.shortDelayThreshold ?? 60) * 1000;
+                if (waitMs && waitMs <= maxWaitMs && retryCount === 0 && !contentReceived) {
+                  const partialMsg = {
+                    role: 'assistant',
+                    content: [],
+                    api: model.api,
+                    provider: targetProvider,
+                    model: targetModelId,
+                    usage: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0, totalTokens: 0, cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0, total: 0 } },
+                    timestamp: Date.now(),
+                  } as unknown as AssistantMessage;
+                  stream.push({
+                    type: 'text_delta',
+                    contentIndex: 0,
+                    delta: `\n_⏳ [Router] Rate limit reached on ${targetProvider}/${targetModelId}. Waiting ${Math.ceil(waitMs/1000)}s before retrying..._\n`,
+                    partial: partialMsg
+                  });
+                  await new Promise(resolve => setTimeout(resolve, waitMs + 1000)); // buffer 1s
+                  retryCount++;
+                  continue; // try the same model again
                 }
-                const isContent =
-                  event.type === 'text_delta' ||
-                  event.type === 'thinking_delta' ||
-                  event.type === 'toolcall_delta' ||
-                  event.type === 'toolcall_end';
-                if (isContent) contentReceived = true;
-                stream.push(event);
+                lastError = err;
+                break; // model failed completely, break retry loop to go to next fallback model
               }
-              success = true;
-              if (i > 0) decision.isFallback = true;
-              break;
-            } catch (err) {
-              lastError = err;
             }
+            if (modelSuccess) break; // break fallback loop
           }
           if (!success) {

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@kylebrodeur/pi-model-router",
-  "version": "0.1.3",
+  "version": "0.1.4",
   "type": "module",
   "description": "Intelligent per-turn model router extension for the pi coding agent (Enhanced Fork)",
   "keywords": [