@juspay/neurolink 9.59.0 → 9.59.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/neurolink.js CHANGED
@@ -194,6 +194,12 @@ function isNonRetryableProviderError(error) {
  if (error instanceof ModelAccessDeniedError) {
  return true;
  }
+ // Note: ContextBudgetExceededError is intentionally NOT non-retryable.
+ // Each provider has its own context window, so a budget rejection on
+ // one provider doesn't preclude another provider's window fitting the
+ // same payload. The directProviderGeneration loop should continue
+ // trying alternate providers; the after-loop rethrow preserves the
+ // typed error when all providers reject (see `directProviderGeneration`).
  // Check for HTTP status codes on error objects (e.g., from Vercel AI SDK)
  if (error && typeof error === "object") {
  const err = error;
@@ -297,6 +303,37 @@ function isNonRetryableProviderError(error) {
  * same NeuroLink instance would clobber each other's trace context.
  */
  const metricsTraceContextStorage = new AsyncLocalStorage();
+ /**
+ * Curator P2-4 dedup (concurrency-safe): native providers emit
+ * `generation:end` on the shared SDK emitter. We attach a fresh
+ * mutable `dedupContext` object directly to the per-call
+ * `StreamOptions` (under `_streamDedupContext`) so each stream gets
+ * its own instance — concurrent streams have different option objects
+ * and therefore different contexts, so they cannot interfere.
+ *
+ * Native provider emit sites read `options._streamDedupContext` and
+ * flip `.providerEmitted = true` before emitting; the orchestration's
+ * finally block reads the same closed-over reference and skips its
+ * own emit when the flag is set.
+ *
+ * This avoids the AsyncLocalStorage approach which doesn't reliably
+ * propagate through async-generator yield boundaries when iteration
+ * happens from outside the original `run()` scope (e.g. when the
+ * consumer drives `for await of result.stream` after `sdk.stream(...)`
+ * returns).
+ */
+ export const STREAM_DEDUP_CONTEXT_KEY = "_streamDedupContext";
+ /**
+ * Native providers call this from their `generation:end` emit sites,
+ * passing the same `options` object they received. Safe no-op when
+ * the field isn't set.
+ */
+ export function markStreamProviderEmittedGenerationEnd(options) {
+ const ctx = options?._streamDedupContext;
+ if (ctx) {
+ ctx.providerEmitted = true;
+ }
+ }
  export class NeuroLink {
  mcpInitialized = false;
  mcpSkipped = false;
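
Note: the dedup contract added above can be sketched outside the SDK as follows. This is an illustrative reduction under assumptions, not the library's actual orchestration; `StreamOptionsLike`, `markProviderEmitted`, and `runStream` are invented names for the sketch.

    // Sketch only: one fresh context per call, attached to the options object.
    type StreamGenerationEndContext = { providerEmitted: boolean };

    interface StreamOptionsLike {
      _streamDedupContext?: StreamGenerationEndContext;
    }

    // Provider-side helper (same shape as markStreamProviderEmittedGenerationEnd).
    function markProviderEmitted(options: StreamOptionsLike): void {
      const ctx = options._streamDedupContext;
      if (ctx) {
        ctx.providerEmitted = true; // flip before the provider's own emit
      }
    }

    // Orchestration side: create the context before streaming, read it in finally.
    async function runStream(options: StreamOptionsLike, providerEmits: boolean): Promise<void> {
      options._streamDedupContext = { providerEmitted: false };
      try {
        if (providerEmits) {
          markProviderEmitted(options); // native provider path emits generation:end itself
        }
      } finally {
        if (!options._streamDedupContext?.providerEmitted) {
          // the orchestration's single generation:end emit would go here
        }
      }
    }

Because each call gets its own options object, concurrent streams never observe each other's flag, which is the concurrency argument the diff's comments make.
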
@@ -3693,7 +3730,16 @@ Current user's request: ${currentInput}`;
  return null;
  }
  async tryRecoverGenerateTextOverflow(options, functionTag, error) {
- if (!isContextOverflowError(error) || !this.conversationMemory) {
+ // Reviewer Finding #3: drop the `!this.conversationMemory` gate so
+ // inline-conversationMessages callers also benefit from post-provider
+ // recovery when their pre-dispatch estimate happens to undershoot
+ // and the provider rejects at a higher real token count.
+ if (!isContextOverflowError(error)) {
+ return null;
+ }
+ const inlineMessages = options._originalConversationMessages;
+ const callerMessages = options.conversationMessages;
+ if (!this.conversationMemory && !inlineMessages && !callerMessages) {
  return null;
  }
  logger.warn(`[${functionTag}] Context overflow detected by provider, attempting smart recovery`, {
@@ -3702,8 +3748,11 @@ Current user's request: ${currentInput}`;
  });
  try {
  const actualOverflow = parseProviderOverflowDetails(error);
- const originalMessages = options._originalConversationMessages ??
- (await getConversationMessages(this.conversationMemory, options));
+ const originalMessages = inlineMessages ??
+ callerMessages ??
+ (this.conversationMemory
+ ? await getConversationMessages(this.conversationMemory, options)
+ : []);
  const recoveryBudget = checkContextBudget({
  provider: options.provider || "openai",
  model: options.model,
@@ -3717,49 +3766,129 @@ Current user's request: ${currentInput}`;
  const requiredReduction = actualTokens > 0
  ? (actualTokens - compactionTarget) / actualTokens
  : 0.5;
- const compactor = new ContextCompactor({
- enableSummarize: false,
- enablePrune: true,
- enableDeduplicate: true,
- enableTruncate: true,
- truncationFraction: Math.min(0.9, requiredReduction + 0.15),
- });
- const compactionResult = await compactor.compact(originalMessages, compactionTarget, undefined, options.context?.requestId);
- if (!compactionResult.compacted) {
- return null;
+ // Reviewer Finding #3: escalating truncation across attempts. The
+ // first attempt uses the budget-derived fraction (single-round
+ // compaction). If that still leaves the conversation over budget,
+ // subsequent attempts apply progressively harder truncation
+ // (0.5 → 0.75 → 0.9) before giving up. This replaces the previous
+ // single-pass behaviour where one undersized fraction guaranteed
+ // failure on the next provider call.
+ const escalationFractions = [
+ Math.min(0.9, requiredReduction + 0.15),
+ 0.5,
+ 0.75,
+ 0.9,
+ ];
+ let lastCompactionResult = null;
+ let compactedMessages = originalMessages;
+ let verifiedBudget = null;
+ let recoveredFraction = -1;
+ for (let i = 0; i < escalationFractions.length; i++) {
+ const fraction = escalationFractions[i];
+ const compactor = new ContextCompactor({
+ enableSummarize: false,
+ enablePrune: true,
+ enableDeduplicate: true,
+ enableTruncate: true,
+ truncationFraction: fraction,
+ });
+ const compactionResult = await compactor.compact(originalMessages, compactionTarget, undefined, options.context?.requestId);
+ if (!compactionResult.compacted) {
+ continue;
+ }
+ lastCompactionResult = compactionResult;
+ const repairedResult = repairToolPairs(compactionResult.messages);
+ const verifyBudget = checkContextBudget({
+ provider: options.provider || "openai",
+ model: options.model,
+ maxTokens: options.maxTokens,
+ systemPrompt: options.systemPrompt,
+ currentPrompt: options.prompt,
+ conversationMessages: repairedResult.messages,
+ });
+ if (verifyBudget.withinBudget) {
+ compactedMessages = repairedResult.messages;
+ verifiedBudget = verifyBudget;
+ recoveredFraction = fraction;
+ break;
+ }
+ verifiedBudget = verifyBudget;
+ }
+ if (!lastCompactionResult) {
+ // Reviewer follow-up: when no escalation fraction managed to
+ // compact the conversation, the request will hit the same
+ // provider 400 again on retry. Surface a typed
+ // ContextBudgetExceededError + `compaction.insufficient` event
+ // instead of returning null (which lets callers propagate the
+ // opaque provider error).
+ try {
+ this.emitter.emit("compaction.insufficient", {
+ stagesAttempted: [],
+ finalTokens: actualTokens,
+ budget: budgetTokens,
+ provider: options.provider || "openai",
+ model: options.model,
+ phase: "post-provider-recovery-no-compaction",
+ fractionsTried: escalationFractions,
+ timestamp: Date.now(),
+ });
+ }
+ catch {
+ /* listener errors are non-fatal */
+ }
+ throw new ContextBudgetExceededError(`Context overflow recovery: no compaction stage was able to ` +
+ `reduce conversation messages. Provider rejected at ` +
+ `~${actualTokens} tokens; budget is ${budgetTokens} tokens.`, {
+ estimatedTokens: actualTokens,
+ availableTokens: budgetTokens,
+ stagesUsed: [],
+ breakdown: {},
+ });
  }
- const repairedResult = repairToolPairs(compactionResult.messages);
- const verifyBudget = checkContextBudget({
- provider: options.provider || "openai",
- model: options.model,
- maxTokens: options.maxTokens,
- systemPrompt: options.systemPrompt,
- currentPrompt: options.prompt,
- conversationMessages: repairedResult.messages,
- });
- if (!verifyBudget.withinBudget) {
- logger.error(`[${functionTag}] Recovery compaction insufficient, aborting retry`, {
- estimatedTokens: verifyBudget.estimatedInputTokens,
- availableTokens: verifyBudget.availableInputTokens,
+ if (!verifiedBudget?.withinBudget) {
+ logger.error(`[${functionTag}] Recovery compaction insufficient after escalation, aborting retry`, {
+ estimatedTokens: verifiedBudget?.estimatedInputTokens,
+ availableTokens: verifiedBudget?.availableInputTokens,
+ stagesAttempted: lastCompactionResult.stagesUsed,
+ fractionsTried: escalationFractions,
  });
+ // Reviewer Finding #3: emit `compaction.insufficient` so
+ // cost / audit listeners record the specific failure mode.
+ try {
+ this.emitter.emit("compaction.insufficient", {
+ stagesAttempted: lastCompactionResult.stagesUsed,
+ finalTokens: verifiedBudget?.estimatedInputTokens,
+ budget: verifiedBudget?.availableInputTokens,
+ provider: options.provider || "openai",
+ model: options.model,
+ phase: "post-provider-recovery",
+ fractionsTried: escalationFractions,
+ timestamp: Date.now(),
+ });
+ }
+ catch {
+ /* listener errors are non-fatal */
+ }
  throw new ContextBudgetExceededError(`Context overflow recovery failed. Provider rejected at ~${actualTokens} tokens, ` +
- `recovery compaction achieved ${compactionResult.tokensAfter} tokens ` +
- `but budget is ${budgetTokens} tokens.`, {
- estimatedTokens: compactionResult.tokensAfter,
+ `recovery compaction achieved ${lastCompactionResult.tokensAfter} tokens ` +
+ `but budget is ${budgetTokens} tokens (after escalation through ` +
+ `${escalationFractions.length} fractions).`, {
+ estimatedTokens: lastCompactionResult.tokensAfter,
  availableTokens: budgetTokens,
- stagesUsed: compactionResult.stagesUsed,
- breakdown: verifyBudget.breakdown,
+ stagesUsed: lastCompactionResult.stagesUsed,
+ breakdown: verifiedBudget?.breakdown ?? {},
  });
  }
  logger.info(`[${functionTag}] Smart recovery verified, retrying generation`, {
- tokensSaved: compactionResult.tokensSaved,
+ tokensSaved: lastCompactionResult.tokensSaved,
  compactionTarget,
- verifiedTokens: verifyBudget.estimatedInputTokens,
- verifiedBudget: verifyBudget.availableInputTokens,
+ verifiedTokens: verifiedBudget.estimatedInputTokens,
+ verifiedBudget: verifiedBudget.availableInputTokens,
+ recoveredFraction,
  });
  return this.directProviderGeneration({
  ...options,
- conversationMessages: repairedResult.messages,
+ conversationMessages: compactedMessages,
  });
  }
  catch (retryError) {
@@ -4390,8 +4519,51 @@ Current user's request: ${currentInput}`;
  });
  const dpgMessageCount = conversationMessages?.length || 0;
  const dpgCompactionSessionId = this.getCompactionSessionId(options);
+ // Curator P1-2: pre-dispatch compaction must run for inline
+ // `conversationMessages` too (not just conversationMemory). Without
+ // this, a 1.3M-token caller-supplied conversation against a 128K
+ // window dispatches anyway and the provider returns
+ // "prompt is too long" — the bug Curator's report cited.
+ const dpgHasInlineMessages = !!optionsWithMessages.conversationMessages?.length;
+ // Reviewer follow-up: gate the hard cap on the *actual compactable
+ // history* rather than `this.conversationMemory`. A configured-but-
+ // empty memory store leaves nothing to compact yet still satisfies
+ // `!this.conversationMemory === false`, so the previous check
+ // skipped the hard cap and dispatched the oversized payload.
+ const dpgHasCompactableMessages = dpgMessageCount > 0;
+ // Reviewer Finding #4: pre-dispatch hard cap for the standalone
+ // oversized case. When the budget check shows the request is
+ // over budget but there's nothing to compact (no memory + no
+ // inline messages — e.g. a huge prompt or huge tool definitions
+ // alone), throw before dispatch instead of wasting a roundtrip.
+ if (!budgetCheck.withinBudget && !dpgHasCompactableMessages) {
+ try {
+ this.emitter.emit("compaction.insufficient", {
+ stagesAttempted: ["pre-dispatch hard cap"],
+ finalTokens: budgetCheck.estimatedInputTokens,
+ budget: budgetCheck.availableInputTokens,
+ provider: providerName,
+ model: options.model,
+ phase: "pre-dispatch-no-recovery",
+ timestamp: Date.now(),
+ });
+ }
+ catch {
+ /* listener errors are non-fatal */
+ }
+ throw new ContextBudgetExceededError(`Context exceeds model budget and no compaction is possible ` +
+ `(no conversationMemory, no inline conversationMessages — only ` +
+ `prompt + tools). Estimated: ${budgetCheck.estimatedInputTokens} ` +
+ `tokens, budget: ${budgetCheck.availableInputTokens} tokens. ` +
+ `Reduce prompt or tool-definition size, or trim the request.`, {
+ estimatedTokens: budgetCheck.estimatedInputTokens,
+ availableTokens: budgetCheck.availableInputTokens,
+ stagesUsed: [],
+ breakdown: budgetCheck.breakdown,
+ });
+ }
  if (budgetCheck.shouldCompact &&
- this.conversationMemory &&
+ (this.conversationMemory || dpgHasInlineMessages) &&
  dpgMessageCount >
  (this.lastCompactionMessageCount.get(dpgCompactionSessionId) ?? 0)) {
  const compactor = new ContextCompactor({
@@ -4425,6 +4597,26 @@ Current user's request: ${currentInput}`;
  availableTokens: postCompactBudget.availableInputTokens,
  overagePercent: Math.round((postCompactBudget.usageRatio - 1.0) * 100),
  });
+ // Curator P1-2: emit `compaction.insufficient` whenever a
+ // single round of compaction wasn't enough — even when
+ // emergency truncation will save the day. Lets cost / audit
+ // listeners track the "compaction was insufficient" signal
+ // separately from the eventual outcome.
+ try {
+ this.emitter.emit("compaction.insufficient", {
+ stagesAttempted: compactionResult.stagesUsed,
+ finalTokens: postCompactBudget.estimatedInputTokens,
+ budget: postCompactBudget.availableInputTokens,
+ provider: providerName,
+ model: options.model,
+ phase: "mid-compaction",
+ willEmergencyTruncate: true,
+ timestamp: Date.now(),
+ });
+ }
+ catch {
+ /* listener errors are non-fatal */
+ }
  conversationMessages = emergencyContentTruncation(conversationMessages, postCompactBudget.availableInputTokens, postCompactBudget.breakdown, providerName);
  const finalBudget = checkContextBudget({
  provider: providerName,
@@ -4440,6 +4632,23 @@ Current user's request: ${currentInput}`;
  if (!finalBudget.withinBudget) {
  // Clear watermark so handleContextOverflow recovery can re-compact
  this.lastCompactionMessageCount.delete(dpgCompactionSessionId);
+ // Curator P1-2: emit `compaction.insufficient` so cost / audit
+ // listeners can record the specific failure mode (separate
+ // from a generic provider error).
+ try {
+ this.emitter.emit("compaction.insufficient", {
+ stagesAttempted: compactionResult.stagesUsed,
+ finalTokens: finalBudget.estimatedInputTokens,
+ budget: finalBudget.availableInputTokens,
+ provider: providerName,
+ model: options.model,
+ phase: "post-emergency-truncation",
+ timestamp: Date.now(),
+ });
+ }
+ catch {
+ /* listener errors are non-fatal */
+ }
  throw new ContextBudgetExceededError(`Context exceeds model budget after all compaction stages. ` +
  `Estimated: ${finalBudget.estimatedInputTokens} tokens, ` +
  `Budget: ${finalBudget.availableInputTokens} tokens.`, {
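
Note: the `compaction.insufficient` payloads emitted in these hunks suggest a listener along the following lines. This is a hypothetical subscriber; the field types are inferred from the emits in this diff, and the `.on(...)` signature on the object returned by `getEventEmitter()` is an assumption.

    // Hypothetical audit listener; payload fields follow the emits in this diff.
    type CompactionInsufficientEvent = {
      stagesAttempted: string[];
      finalTokens?: number;
      budget?: number;
      provider: string;
      model?: string;
      phase: string;
      willEmergencyTruncate?: boolean;
      fractionsTried?: number[];
      timestamp: number;
    };

    const emitter = neurolink.getEventEmitter(); // `neurolink` is an existing NeuroLink instance
    emitter.on("compaction.insufficient", (event: CompactionInsufficientEvent) => {
      console.warn("compaction insufficient", {
        phase: event.phase, // e.g. "mid-compaction" vs "post-emergency-truncation"
        overBy: (event.finalTokens ?? 0) - (event.budget ?? 0),
        provider: event.provider,
        model: event.model,
      });
    });
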
@@ -4546,6 +4755,14 @@ Current user's request: ${currentInput}`;
  lastError: lastError?.message,
  responseTime,
  });
+ // Reviewer follow-up: preserve typed ContextBudgetExceededError after
+ // the per-provider fallback loop. Each provider's hard cap is
+ // per-window; we let the loop try them all, but if every provider
+ // rejected on budget the caller still needs the typed error to
+ // distinguish "context too large" from a generic provider failure.
+ if (lastError instanceof ContextBudgetExceededError) {
+ throw lastError;
+ }
  throw new Error(`Failed to generate text with all providers. Last error: ${lastError?.message || "Unknown error"}`);
  }
  /**
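
Note: for callers, the effect of this rethrow is that a context-too-large failure stays distinguishable from a generic provider failure after the fallback loop. A hypothetical consumer-side sketch follows; the root exports and the `generate()` option/result shapes are assumptions, not taken from this diff.

    import { NeuroLink, ContextBudgetExceededError } from "@juspay/neurolink"; // assumed root exports

    type GenerateOutcome =
      | { ok: true; content: string }
      | { ok: false; reason: "context-too-large" };

    async function generateWithBudgetCheck(sdk: NeuroLink, prompt: string): Promise<GenerateOutcome> {
      try {
        const result = await sdk.generate({ input: { text: prompt } }); // assumed option/result shape
        return { ok: true, content: result.content };
      } catch (err) {
        if (err instanceof ContextBudgetExceededError) {
          // Every provider rejected on context size; retrying the same payload
          // cannot succeed, so trim the prompt/history before calling again.
          return { ok: false, reason: "context-too-large" };
        }
        throw err; // other provider failures keep their original error
      }
    }
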
@@ -4984,8 +5201,23 @@ Current user's request: ${currentInput}`;
  const streamStartTime = Date.now();
  const sessionId = enhancedOptions.context
  ?.sessionId;
+ // Curator P2-4 dedup (concurrency-safe): native provider stream paths
+ // (Gemini 3 on Vertex / Google AI Studio) emit `generation:end`
+ // themselves. We attach a per-stream mutable flag directly to
+ // `enhancedOptions._streamDedupContext` — native providers receive
+ // these options and flip the flag before their emit; this finally
+ // block reads the same closed-over reference. Concurrent streams
+ // have different option objects so the contexts don't interfere.
+ const dedupContext = {
+ providerEmitted: false,
+ };
+ enhancedOptions._streamDedupContext = dedupContext;
  const processedStream = (async function* () {
  let streamError;
+ // Curator P2-4: hoist `resolvedUsage` so the finally block can emit a
+ // single `generation:end` event with cost data. Cost listeners
+ // subscribe here; previously the stream path never fired it.
+ let resolvedUsage;
  try {
  for await (const chunk of mcpStream) {
  chunkCount++;
@@ -5015,7 +5247,7 @@ Current user's request: ${currentInput}`;
  accumulatedContent += content;
  });
  }
- let resolvedUsage = streamUsage;
+ resolvedUsage = streamUsage;
  if (!resolvedUsage && streamAnalytics) {
  try {
  const resolved = await Promise.resolve(streamAnalytics);
@@ -5090,6 +5322,61 @@ Current user's request: ${currentInput}`;
  guardrailsBlocked: metadata.guardrailsBlocked,
  error: metadata.error,
  });
+ // Curator P2-4: emit `generation:end` exactly once per stream so
+ // cost listeners receive the same contract as for `generate()`.
+ // The previous implementation only fired `stream:complete`, leaving
+ // any subscriber to `generation:end` with zero events.
+ //
+ // Dedup: native provider stream paths (Gemini 3 on Vertex / Google
+ // AI Studio) already emit `generation:end` themselves so Pipeline B
+ // (Langfuse) records a GENERATION observation. Skip our emit when
+ // they already fired — preserves their Pipeline B observation
+ // source and keeps the "exactly once" contract. The per-stream flag
+ // is concurrency-safe because it lives on this call's own options object.
+ if (!dedupContext.providerEmitted) {
+ try {
+ const finalProvider = metadata.fallbackProvider ?? providerName ?? "unknown";
+ const finalModel = metadata.fallbackModel ??
+ streamModel ??
+ enhancedOptions.model ??
+ "unknown";
+ const finalFinishReason = streamError
+ ? "error"
+ : (streamState.finishReason ?? "stop");
+ self.emitter.emit("generation:end", {
+ provider: finalProvider,
+ model: finalModel,
+ responseTime: Date.now() - streamStartTime,
+ toolsUsed: streamState.toolCalls?.map((t) => t.toolName),
+ timestamp: Date.now(),
+ result: {
+ content: accumulatedContent,
+ usage: resolvedUsage,
+ model: finalModel,
+ provider: finalProvider,
+ finishReason: finalFinishReason,
+ },
+ prompt: enhancedOptions.input?.text ||
+ enhancedOptions.prompt,
+ temperature: enhancedOptions.temperature,
+ maxTokens: enhancedOptions.maxTokens,
+ success: !streamError,
+ error: streamError
+ ? streamError instanceof Error
+ ? streamError.message
+ : String(streamError)
+ : undefined,
+ pipelineAHandled: true,
+ });
+ }
+ catch (emitError) {
+ logger.debug("[NeuroLink.stream] generation:end listener threw — ignored", {
+ error: emitError instanceof Error
+ ? emitError.message
+ : String(emitError),
+ });
+ }
+ }
  self._disableToolCacheForCurrentRequest = false;
  cleanupListeners();
  streamSpan.setAttribute("neurolink.response_time_ms", Date.now() - spanStartTime);
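
Note: with this change a cost listener subscribed to `generation:end` fires for streams as well as `generate()`. A hypothetical subscriber using the payload shape from the emit above follows; the `usage` field layout and the `.on(...)` signature are assumptions.

    // Hypothetical cost listener; the payload mirrors the stream emit above.
    type GenerationEndEvent = {
      provider: string;
      model: string;
      responseTime: number;
      success: boolean;
      result?: { usage?: { inputTokens?: number; outputTokens?: number } };
    };

    const emitter = neurolink.getEventEmitter(); // `neurolink` is an existing NeuroLink instance
    emitter.on("generation:end", (event: GenerationEndEvent) => {
      const usage = event.result?.usage;
      console.info("generation:end", {
        provider: event.provider,
        model: event.model,
        latencyMs: event.responseTime,
        inputTokens: usage?.inputTokens ?? 0, // usage field names are an assumption
        outputTokens: usage?.outputTokens ?? 0,
      });
    });
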
@@ -5641,6 +5928,42 @@ Current user's request: ${currentInput}`;
  });
  const streamMessageCount = conversationMessages?.length || 0;
  const streamCompactionSessionId = this.getCompactionSessionId(options);
+ // Reviewer follow-up: gate the hard cap on the *actual compactable
+ // history* rather than `this.conversationMemory`. A configured-but-
+ // empty memory store leaves nothing to compact yet still satisfies
+ // `!this.conversationMemory === false`, so the previous check
+ // skipped the hard cap and dispatched the oversized payload.
+ const streamHasCompactableMessages = streamMessageCount > 0;
+ // Curator P1-2: pre-dispatch hard cap mirrors directProviderGeneration.
+ // When the budget check fails AND there's nothing to compact (no memory
+ // + no inline messages — only prompt + tools), throw before dispatch
+ // instead of wasting a roundtrip on a payload the provider will reject.
+ if (!streamBudget.withinBudget && !streamHasCompactableMessages) {
+ try {
+ this.emitter.emit("compaction.insufficient", {
+ stagesAttempted: ["pre-dispatch hard cap"],
+ finalTokens: streamBudget.estimatedInputTokens,
+ budget: streamBudget.availableInputTokens,
+ provider: providerName,
+ model: options.model,
+ phase: "pre-dispatch-no-recovery",
+ timestamp: Date.now(),
+ });
+ }
+ catch {
+ /* listener errors are non-fatal */
+ }
+ throw new ContextBudgetExceededError(`Stream context exceeds model budget and no compaction is possible ` +
+ `(no conversationMemory, no inline conversationMessages — only ` +
+ `prompt + tools). Estimated: ${streamBudget.estimatedInputTokens} ` +
+ `tokens, budget: ${streamBudget.availableInputTokens} tokens. ` +
+ `Reduce prompt or tool-definition size, or trim the request.`, {
+ estimatedTokens: streamBudget.estimatedInputTokens,
+ availableTokens: streamBudget.availableInputTokens,
+ stagesUsed: [],
+ breakdown: streamBudget.breakdown,
+ });
+ }
  if (streamBudget.shouldCompact &&
  (hasCallerConversationHistory || this.conversationMemory) &&
  streamMessageCount >
@@ -5677,6 +6000,26 @@ Current user's request: ${currentInput}`;
  availableTokens: postCompactBudget.availableInputTokens,
  overagePercent: Math.round((postCompactBudget.usageRatio - 1.0) * 100),
  });
+ // Curator P1-2: emit `compaction.insufficient` whenever a single
+ // round of compaction wasn't enough — even when emergency
+ // truncation will save the day. Lets cost / audit listeners track
+ // the "compaction was insufficient" signal separately from the
+ // eventual outcome.
+ try {
+ this.emitter.emit("compaction.insufficient", {
+ stagesAttempted: compactionResult.stagesUsed,
+ finalTokens: postCompactBudget.estimatedInputTokens,
+ budget: postCompactBudget.availableInputTokens,
+ provider: providerName,
+ model: options.model,
+ phase: "mid-compaction",
+ willEmergencyTruncate: true,
+ timestamp: Date.now(),
+ });
+ }
+ catch {
+ /* listener errors are non-fatal */
+ }
  conversationMessages = emergencyContentTruncation(conversationMessages, postCompactBudget.availableInputTokens, postCompactBudget.breakdown, providerName);
  // Keep options in sync after emergency truncation so fallback paths
  // use the truncated history.
@@ -5693,6 +6036,23 @@ Current user's request: ${currentInput}`;
  if (!finalBudget.withinBudget) {
  // Clear watermark so handleContextOverflow recovery can re-compact
  this.lastCompactionMessageCount.delete(streamCompactionSessionId);
+ // Curator P1-2: emit `compaction.insufficient` on the terminal
+ // failure path so cost / audit listeners can record the specific
+ // failure mode (compaction + emergency truncation both insufficient).
+ try {
+ this.emitter.emit("compaction.insufficient", {
+ stagesAttempted: compactionResult.stagesUsed,
+ finalTokens: finalBudget.estimatedInputTokens,
+ budget: finalBudget.availableInputTokens,
+ provider: providerName,
+ model: options.model,
+ phase: "post-emergency-truncation",
+ timestamp: Date.now(),
+ });
+ }
+ catch {
+ /* listener errors are non-fatal */
+ }
  throw new ContextBudgetExceededError(`Stream context exceeds model budget after all compaction stages. ` +
  `Estimated: ${finalBudget.estimatedInputTokens} tokens, ` +
  `Budget: ${finalBudget.availableInputTokens} tokens.`, {
@@ -5780,6 +6140,15 @@ Current user's request: ${currentInput}`;
  * Handle stream error with fallback
  */
  async handleStreamError(error, options, startTime, streamId, enhancedOptions, _factoryResult) {
+ // Curator P1-2: when the pre-dispatch hard cap or post-emergency
+ // truncation budget check throws ContextBudgetExceededError, the
+ // payload is too large for the model and a same-payload retry would
+ // just fail again at the provider — wasting the same tokens that
+ // the hard cap was meant to save. Rethrow so the caller sees the
+ // typed error instead of a fallback ProviderError that hides it.
+ if (error instanceof ContextBudgetExceededError) {
+ throw error;
+ }
  logger.error("Stream generation failed, attempting fallback", {
  error: error instanceof Error ? error.message : String(error),
  });
@@ -4,6 +4,7 @@ import { ErrorCategory, ErrorSeverity, GoogleAIModels, } from "../constants/enum
  import { BaseProvider } from "../core/baseProvider.js";
  import { DEFAULT_MAX_STEPS } from "../core/constants.js";
  import { streamAnalyticsCollector } from "../core/streamAnalytics.js";
+ import { markStreamProviderEmittedGenerationEnd, } from "../neurolink.js";
  import { SpanStatusCode } from "@opentelemetry/api";
  import { ATTR, tracers, withClientSpan } from "../telemetry/index.js";
  import { AuthenticationError, NetworkError, ProviderError, RateLimitError, } from "../types/index.js";
@@ -735,6 +736,9 @@ export class GoogleAIStudioProvider extends BaseProvider {
  // AI SDK so experimental_telemetry is never injected; we emit manually.
  const nativeStreamEmitter = this.neurolink?.getEventEmitter();
  if (nativeStreamEmitter) {
+ // Curator P2-4 dedup: flag the per-stream context attached
+ // to options so the orchestration skips its own emit.
+ markStreamProviderEmittedGenerationEnd(options);
  nativeStreamEmitter.emit("generation:end", {
  provider: this.providerName,
  responseTime,
@@ -767,6 +771,9 @@ export class GoogleAIStudioProvider extends BaseProvider {
  // Emit failure generation:end so Pipeline B records the failed stream
  const errorEmitter = this.neurolink?.getEventEmitter();
  if (errorEmitter) {
+ // Curator P2-4 dedup: flag the per-stream context attached
+ // to options so the orchestration skips its own emit.
+ markStreamProviderEmittedGenerationEnd(options);
  errorEmitter.emit("generation:end", {
  provider: this.providerName,
  responseTime: Date.now() - startTime,
@@ -10,6 +10,7 @@ import { ErrorCategory, ErrorSeverity, } from "../constants/enums.js";
  import { BaseProvider } from "../core/baseProvider.js";
  import { DEFAULT_MAX_STEPS, GLOBAL_LOCATION_MODELS, } from "../core/constants.js";
  import { ModelConfigurationManager } from "../core/modelConfiguration.js";
+ import { markStreamProviderEmittedGenerationEnd, } from "../neurolink.js";
  import { createProxyFetch } from "../proxy/proxyFetch.js";
  import { ATTR, tracers, withClientSpan } from "../telemetry/index.js";
  import { AuthenticationError, InvalidModelError, NetworkError, ProviderError, RateLimitError, } from "../types/index.js";
@@ -1630,8 +1631,12 @@ export class GoogleVertexProvider extends BaseProvider {
  // Emit generation:end so Pipeline B (Langfuse) creates a GENERATION
  // observation. The native @google/genai stream path on Vertex bypasses the
  // Vercel AI SDK so experimental_telemetry is never injected; we emit manually.
+ // Curator P2-4 dedup: flag the per-stream context attached to options
+ // so the orchestration in `runStandardStreamRequest` knows we already
+ // emitted and skips its own emit (preserving exactly-once).
  const vertexStreamEmitter = this.neurolink?.getEventEmitter();
  if (vertexStreamEmitter) {
+ markStreamProviderEmittedGenerationEnd(params.options);
  vertexStreamEmitter.emit("generation:end", {
  provider: this.providerName,
  responseTime,
@@ -57,3 +57,4 @@ export * from "./span.js";
  export * from "./imageGen.js";
  export * from "./elicitation.js";
  export * from "./dynamic.js";
+ export * from "./streamDedup.js";
@@ -60,3 +60,5 @@ export * from "./imageGen.js";
  export * from "./elicitation.js";
  // Dynamic Arguments types
  export * from "./dynamic.js";
+ // Curator P2-4 dedup: per-stream generation:end dedup context
+ export * from "./streamDedup.js";
@@ -0,0 +1,14 @@
+ /**
+ * Curator P2-4 dedup (concurrency-safe): per-stream context that lets
+ * the orchestration's `runStandardStreamRequest` finally block know
+ * whether a *native provider* path within THIS stream's async chain
+ * already emitted `generation:end`. Native providers (Vertex / Google
+ * AI Studio for Gemini 3, etc.) emit on the shared SDK emitter; without
+ * scoping, a concurrent unrelated stream's emit on the same NeuroLink
+ * instance would suppress the wrong stream's orchestration emit.
+ *
+ * Each stream's flag lives on its own StreamOptions (`_streamDedupContext`).
+ */
+ export type StreamGenerationEndContext = {
+ providerEmitted: boolean;
+ };
@@ -0,0 +1 @@
+ export {};