@juspay/neurolink 9.59.0 → 9.59.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/neurolink.js CHANGED
@@ -194,6 +194,12 @@ function isNonRetryableProviderError(error) {
  if (error instanceof ModelAccessDeniedError) {
  return true;
  }
+ // Note: ContextBudgetExceededError is intentionally NOT non-retryable.
+ // Each provider has its own context window, so a budget rejection on
+ // one provider doesn't preclude another provider's window fitting the
+ // same payload. The directProviderGeneration loop should continue
+ // trying alternate providers; the after-loop rethrow preserves the
+ // typed error when all providers reject (see `directProviderGeneration`).
  // Check for HTTP status codes on error objects (e.g., from Vercel AI SDK)
  if (error && typeof error === "object") {
  const err = error;
@@ -297,6 +303,37 @@ function isNonRetryableProviderError(error) {
  * same NeuroLink instance would clobber each other's trace context.
  */
  const metricsTraceContextStorage = new AsyncLocalStorage();
+ /**
+ * Curator P2-4 dedup (concurrency-safe): native providers emit
+ * `generation:end` on the shared SDK emitter. We attach a fresh
+ * mutable `dedupContext` object directly to the per-call
+ * `StreamOptions` (under `_streamDedupContext`) so each stream gets
+ * its own instance — concurrent streams have different option objects
+ * and therefore different contexts, so they cannot interfere.
+ *
+ * Native provider emit sites read `options._streamDedupContext` and
+ * flip `.providerEmitted = true` before emitting; the orchestration's
+ * finally block reads the same closed-over reference and skips its
+ * own emit when the flag is set.
+ *
+ * This avoids the AsyncLocalStorage approach which doesn't reliably
+ * propagate through async-generator yield boundaries when iteration
+ * happens from outside the original `run()` scope (e.g. when the
+ * consumer drives `for await of result.stream` after `sdk.stream(...)`
+ * returns).
+ */
+ export const STREAM_DEDUP_CONTEXT_KEY = "_streamDedupContext";
+ /**
+ * Native providers call this from their `generation:end` emit sites,
+ * passing the same `options` object they received. Safe no-op when
+ * the field isn't set.
+ */
+ export function markStreamProviderEmittedGenerationEnd(options) {
+ const ctx = options?._streamDedupContext;
+ if (ctx) {
+ ctx.providerEmitted = true;
+ }
+ }
  export class NeuroLink {
  mcpInitialized = false;
  mcpSkipped = false;
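
Note: the dedup contract added above can be sketched outside the SDK as follows. This is an illustrative reduction under assumptions, not the library's actual orchestration; `StreamOptionsLike`, `markProviderEmitted`, and `runStream` are invented names for the sketch.

    // Sketch only: one fresh context per call, attached to the options object.
    type StreamGenerationEndContext = { providerEmitted: boolean };

    interface StreamOptionsLike {
      _streamDedupContext?: StreamGenerationEndContext;
    }

    // Provider-side helper (same shape as markStreamProviderEmittedGenerationEnd).
    function markProviderEmitted(options: StreamOptionsLike): void {
      const ctx = options._streamDedupContext;
      if (ctx) {
        ctx.providerEmitted = true; // flip before the provider's own emit
      }
    }

    // Orchestration side: create the context before streaming, read it in finally.
    async function runStream(options: StreamOptionsLike, providerEmits: boolean): Promise<void> {
      options._streamDedupContext = { providerEmitted: false };
      try {
        if (providerEmits) {
          markProviderEmitted(options); // native provider path emits generation:end itself
        }
      } finally {
        if (!options._streamDedupContext?.providerEmitted) {
          // the orchestration's single generation:end emit would go here
        }
      }
    }

Because each call gets its own options object, concurrent streams never observe each other's flag, which is the concurrency argument the diff's comments make.
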
@@ -3693,7 +3730,16 @@ Current user's request: ${currentInput}`;
  return null;
  }
  async tryRecoverGenerateTextOverflow(options, functionTag, error) {
- if (!isContextOverflowError(error) || !this.conversationMemory) {
+ // Reviewer Finding #3: drop the `!this.conversationMemory` gate so
+ // inline-conversationMessages callers also benefit from post-provider
+ // recovery when their pre-dispatch estimate happens to undershoot
+ // and the provider rejects at a higher real token count.
+ if (!isContextOverflowError(error)) {
+ return null;
+ }
+ const inlineMessages = options._originalConversationMessages;
+ const callerMessages = options.conversationMessages;
+ if (!this.conversationMemory && !inlineMessages && !callerMessages) {
  return null;
  }
  logger.warn(`[${functionTag}] Context overflow detected by provider, attempting smart recovery`, {
@@ -3702,8 +3748,11 @@ Current user's request: ${currentInput}`;
  });
  try {
  const actualOverflow = parseProviderOverflowDetails(error);
- const originalMessages = options._originalConversationMessages ??
- (await getConversationMessages(this.conversationMemory, options));
+ const originalMessages = inlineMessages ??
+ callerMessages ??
+ (this.conversationMemory
+ ? await getConversationMessages(this.conversationMemory, options)
+ : []);
  const recoveryBudget = checkContextBudget({
  provider: options.provider || "openai",
  model: options.model,
@@ -3717,49 +3766,129 @@ Current user's request: ${currentInput}`;
  const requiredReduction = actualTokens > 0
  ? (actualTokens - compactionTarget) / actualTokens
  : 0.5;
- const compactor = new ContextCompactor({
- enableSummarize: false,
- enablePrune: true,
- enableDeduplicate: true,
- enableTruncate: true,
- truncationFraction: Math.min(0.9, requiredReduction + 0.15),
- });
- const compactionResult = await compactor.compact(originalMessages, compactionTarget, undefined, options.context?.requestId);
- if (!compactionResult.compacted) {
- return null;
+ // Reviewer Finding #3: escalating truncation across attempts. The
+ // first attempt uses the budget-derived fraction (single-round
+ // compaction). If that still leaves the conversation over budget,
+ // subsequent attempts apply progressively harder truncation
+ // (0.5 → 0.75 → 0.9) before giving up. This replaces the previous
+ // single-pass behaviour where one undersized fraction guaranteed
+ // failure on the next provider call.
+ const escalationFractions = [
+ Math.min(0.9, requiredReduction + 0.15),
+ 0.5,
+ 0.75,
+ 0.9,
+ ];
+ let lastCompactionResult = null;
+ let compactedMessages = originalMessages;
+ let verifiedBudget = null;
+ let recoveredFraction = -1;
+ for (let i = 0; i < escalationFractions.length; i++) {
+ const fraction = escalationFractions[i];
+ const compactor = new ContextCompactor({
+ enableSummarize: false,
+ enablePrune: true,
+ enableDeduplicate: true,
+ enableTruncate: true,
+ truncationFraction: fraction,
+ });
+ const compactionResult = await compactor.compact(originalMessages, compactionTarget, undefined, options.context?.requestId);
+ if (!compactionResult.compacted) {
+ continue;
+ }
+ lastCompactionResult = compactionResult;
+ const repairedResult = repairToolPairs(compactionResult.messages);
+ const verifyBudget = checkContextBudget({
+ provider: options.provider || "openai",
+ model: options.model,
+ maxTokens: options.maxTokens,
+ systemPrompt: options.systemPrompt,
+ currentPrompt: options.prompt,
+ conversationMessages: repairedResult.messages,
+ });
+ if (verifyBudget.withinBudget) {
+ compactedMessages = repairedResult.messages;
+ verifiedBudget = verifyBudget;
+ recoveredFraction = fraction;
+ break;
+ }
+ verifiedBudget = verifyBudget;
+ }
+ if (!lastCompactionResult) {
+ // Reviewer follow-up: when no escalation fraction managed to
+ // compact the conversation, the request will hit the same
+ // provider 400 again on retry. Surface a typed
+ // ContextBudgetExceededError + `compaction.insufficient` event
+ // instead of returning null (which lets callers propagate the
+ // opaque provider error).
+ try {
+ this.emitter.emit("compaction.insufficient", {
+ stagesAttempted: [],
+ finalTokens: actualTokens,
+ budget: budgetTokens,
+ provider: options.provider || "openai",
+ model: options.model,
+ phase: "post-provider-recovery-no-compaction",
+ fractionsTried: escalationFractions,
+ timestamp: Date.now(),
+ });
+ }
+ catch {
+ /* listener errors are non-fatal */
+ }
+ throw new ContextBudgetExceededError(`Context overflow recovery: no compaction stage was able to ` +
+ `reduce conversation messages. Provider rejected at ` +
+ `~${actualTokens} tokens; budget is ${budgetTokens} tokens.`, {
+ estimatedTokens: actualTokens,
+ availableTokens: budgetTokens,
+ stagesUsed: [],
+ breakdown: {},
+ });
  }
- const repairedResult = repairToolPairs(compactionResult.messages);
- const verifyBudget = checkContextBudget({
- provider: options.provider || "openai",
- model: options.model,
- maxTokens: options.maxTokens,
- systemPrompt: options.systemPrompt,
- currentPrompt: options.prompt,
- conversationMessages: repairedResult.messages,
- });
- if (!verifyBudget.withinBudget) {
- logger.error(`[${functionTag}] Recovery compaction insufficient, aborting retry`, {
- estimatedTokens: verifyBudget.estimatedInputTokens,
- availableTokens: verifyBudget.availableInputTokens,
+ if (!verifiedBudget?.withinBudget) {
+ logger.error(`[${functionTag}] Recovery compaction insufficient after escalation, aborting retry`, {
+ estimatedTokens: verifiedBudget?.estimatedInputTokens,
+ availableTokens: verifiedBudget?.availableInputTokens,
+ stagesAttempted: lastCompactionResult.stagesUsed,
+ fractionsTried: escalationFractions,
  });
+ // Reviewer Finding #3: emit `compaction.insufficient` so
+ // cost / audit listeners record the specific failure mode.
+ try {
+ this.emitter.emit("compaction.insufficient", {
+ stagesAttempted: lastCompactionResult.stagesUsed,
+ finalTokens: verifiedBudget?.estimatedInputTokens,
+ budget: verifiedBudget?.availableInputTokens,
+ provider: options.provider || "openai",
+ model: options.model,
+ phase: "post-provider-recovery",
+ fractionsTried: escalationFractions,
+ timestamp: Date.now(),
+ });
+ }
+ catch {
+ /* listener errors are non-fatal */
+ }
  throw new ContextBudgetExceededError(`Context overflow recovery failed. Provider rejected at ~${actualTokens} tokens, ` +
- `recovery compaction achieved ${compactionResult.tokensAfter} tokens ` +
- `but budget is ${budgetTokens} tokens.`, {
- estimatedTokens: compactionResult.tokensAfter,
+ `recovery compaction achieved ${lastCompactionResult.tokensAfter} tokens ` +
+ `but budget is ${budgetTokens} tokens (after escalation through ` +
+ `${escalationFractions.length} fractions).`, {
+ estimatedTokens: lastCompactionResult.tokensAfter,
  availableTokens: budgetTokens,
- stagesUsed: compactionResult.stagesUsed,
- breakdown: verifyBudget.breakdown,
+ stagesUsed: lastCompactionResult.stagesUsed,
+ breakdown: verifiedBudget?.breakdown ?? {},
  });
  }
  logger.info(`[${functionTag}] Smart recovery verified, retrying generation`, {
- tokensSaved: compactionResult.tokensSaved,
+ tokensSaved: lastCompactionResult.tokensSaved,
  compactionTarget,
- verifiedTokens: verifyBudget.estimatedInputTokens,
- verifiedBudget: verifyBudget.availableInputTokens,
+ verifiedTokens: verifiedBudget.estimatedInputTokens,
+ verifiedBudget: verifiedBudget.availableInputTokens,
+ recoveredFraction,
  });
  return this.directProviderGeneration({
  ...options,
- conversationMessages: repairedResult.messages,
+ conversationMessages: compactedMessages,
  });
  }
  catch (retryError) {
@@ -4390,8 +4519,51 @@ Current user's request: ${currentInput}`;
  });
  const dpgMessageCount = conversationMessages?.length || 0;
  const dpgCompactionSessionId = this.getCompactionSessionId(options);
+ // Curator P1-2: pre-dispatch compaction must run for inline
+ // `conversationMessages` too (not just conversationMemory). Without
+ // this, a 1.3M-token caller-supplied conversation against a 128K
+ // window dispatches anyway and the provider returns
+ // "prompt is too long" — the bug Curator's report cited.
+ const dpgHasInlineMessages = !!optionsWithMessages.conversationMessages?.length;
+ // Reviewer follow-up: gate the hard cap on the *actual compactable
+ // history* rather than `this.conversationMemory`. A configured-but-
+ // empty memory store leaves nothing to compact yet still satisfies
+ // `!this.conversationMemory === false`, so the previous check
+ // skipped the hard cap and dispatched the oversized payload.
+ const dpgHasCompactableMessages = dpgMessageCount > 0;
+ // Reviewer Finding #4: pre-dispatch hard cap for the standalone
+ // oversized case. When the budget check shows the request is
+ // over budget but there's nothing to compact (no memory + no
+ // inline messages — e.g. a huge prompt or huge tool definitions
+ // alone), throw before dispatch instead of wasting a roundtrip.
+ if (!budgetCheck.withinBudget && !dpgHasCompactableMessages) {
+ try {
+ this.emitter.emit("compaction.insufficient", {
+ stagesAttempted: ["pre-dispatch hard cap"],
+ finalTokens: budgetCheck.estimatedInputTokens,
+ budget: budgetCheck.availableInputTokens,
+ provider: providerName,
+ model: options.model,
+ phase: "pre-dispatch-no-recovery",
+ timestamp: Date.now(),
+ });
+ }
+ catch {
+ /* listener errors are non-fatal */
+ }
+ throw new ContextBudgetExceededError(`Context exceeds model budget and no compaction is possible ` +
+ `(no conversationMemory, no inline conversationMessages — only ` +
+ `prompt + tools). Estimated: ${budgetCheck.estimatedInputTokens} ` +
+ `tokens, budget: ${budgetCheck.availableInputTokens} tokens. ` +
+ `Reduce prompt or tool-definition size, or trim the request.`, {
+ estimatedTokens: budgetCheck.estimatedInputTokens,
+ availableTokens: budgetCheck.availableInputTokens,
+ stagesUsed: [],
+ breakdown: budgetCheck.breakdown,
+ });
+ }
  if (budgetCheck.shouldCompact &&
- this.conversationMemory &&
+ (this.conversationMemory || dpgHasInlineMessages) &&
  dpgMessageCount >
  (this.lastCompactionMessageCount.get(dpgCompactionSessionId) ?? 0)) {
  const compactor = new ContextCompactor({
@@ -4425,6 +4597,26 @@ Current user's request: ${currentInput}`;
  availableTokens: postCompactBudget.availableInputTokens,
  overagePercent: Math.round((postCompactBudget.usageRatio - 1.0) * 100),
  });
+ // Curator P1-2: emit `compaction.insufficient` whenever a
+ // single round of compaction wasn't enough — even when
+ // emergency truncation will save the day. Lets cost / audit
+ // listeners track the "compaction was insufficient" signal
+ // separately from the eventual outcome.
+ try {
+ this.emitter.emit("compaction.insufficient", {
+ stagesAttempted: compactionResult.stagesUsed,
+ finalTokens: postCompactBudget.estimatedInputTokens,
+ budget: postCompactBudget.availableInputTokens,
+ provider: providerName,
+ model: options.model,
+ phase: "mid-compaction",
+ willEmergencyTruncate: true,
+ timestamp: Date.now(),
+ });
+ }
+ catch {
+ /* listener errors are non-fatal */
+ }
  conversationMessages = emergencyContentTruncation(conversationMessages, postCompactBudget.availableInputTokens, postCompactBudget.breakdown, providerName);
  const finalBudget = checkContextBudget({
  provider: providerName,
@@ -4440,6 +4632,23 @@ Current user's request: ${currentInput}`;
  if (!finalBudget.withinBudget) {
  // Clear watermark so handleContextOverflow recovery can re-compact
  this.lastCompactionMessageCount.delete(dpgCompactionSessionId);
+ // Curator P1-2: emit `compaction.insufficient` so cost / audit
+ // listeners can record the specific failure mode (separate
+ // from a generic provider error).
+ try {
+ this.emitter.emit("compaction.insufficient", {
+ stagesAttempted: compactionResult.stagesUsed,
+ finalTokens: finalBudget.estimatedInputTokens,
+ budget: finalBudget.availableInputTokens,
+ provider: providerName,
+ model: options.model,
+ phase: "post-emergency-truncation",
+ timestamp: Date.now(),
+ });
+ }
+ catch {
+ /* listener errors are non-fatal */
+ }
  throw new ContextBudgetExceededError(`Context exceeds model budget after all compaction stages. ` +
  `Estimated: ${finalBudget.estimatedInputTokens} tokens, ` +
  `Budget: ${finalBudget.availableInputTokens} tokens.`, {
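
Note: the `compaction.insufficient` payloads emitted in these hunks suggest a listener along the following lines. This is a hypothetical subscriber; the field types are inferred from the emits in this diff, and the `.on(...)` signature on the object returned by `getEventEmitter()` is an assumption.

    // Hypothetical audit listener; payload fields follow the emits in this diff.
    type CompactionInsufficientEvent = {
      stagesAttempted: string[];
      finalTokens?: number;
      budget?: number;
      provider: string;
      model?: string;
      phase: string;
      willEmergencyTruncate?: boolean;
      fractionsTried?: number[];
      timestamp: number;
    };

    const emitter = neurolink.getEventEmitter(); // `neurolink` is an existing NeuroLink instance
    emitter.on("compaction.insufficient", (event: CompactionInsufficientEvent) => {
      console.warn("compaction insufficient", {
        phase: event.phase, // e.g. "mid-compaction" vs "post-emergency-truncation"
        overBy: (event.finalTokens ?? 0) - (event.budget ?? 0),
        provider: event.provider,
        model: event.model,
      });
    });
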
@@ -4546,6 +4755,14 @@ Current user's request: ${currentInput}`;
  lastError: lastError?.message,
  responseTime,
  });
+ // Reviewer follow-up: preserve typed ContextBudgetExceededError after
+ // the per-provider fallback loop. Each provider's hard cap is
+ // per-window; we let the loop try them all, but if every provider
+ // rejected on budget the caller still needs the typed error to
+ // distinguish "context too large" from a generic provider failure.
+ if (lastError instanceof ContextBudgetExceededError) {
+ throw lastError;
+ }
  throw new Error(`Failed to generate text with all providers. Last error: ${lastError?.message || "Unknown error"}`);
  }
  /**
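
Note: for callers, the effect of this rethrow is that a context-too-large failure stays distinguishable from a generic provider failure after the fallback loop. A hypothetical consumer-side sketch follows; the root exports and the `generate()` option/result shapes are assumptions, not taken from this diff.

    import { NeuroLink, ContextBudgetExceededError } from "@juspay/neurolink"; // assumed root exports

    type GenerateOutcome =
      | { ok: true; content: string }
      | { ok: false; reason: "context-too-large" };

    async function generateWithBudgetCheck(sdk: NeuroLink, prompt: string): Promise<GenerateOutcome> {
      try {
        const result = await sdk.generate({ input: { text: prompt } }); // assumed option/result shape
        return { ok: true, content: result.content };
      } catch (err) {
        if (err instanceof ContextBudgetExceededError) {
          // Every provider rejected on context size; retrying the same payload
          // cannot succeed, so trim the prompt/history before calling again.
          return { ok: false, reason: "context-too-large" };
        }
        throw err; // other provider failures keep their original error
      }
    }
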
@@ -4984,8 +5201,23 @@ Current user's request: ${currentInput}`;
  const streamStartTime = Date.now();
  const sessionId = enhancedOptions.context
  ?.sessionId;
+ // Curator P2-4 dedup (concurrency-safe): native provider stream paths
+ // (Gemini 3 on Vertex / Google AI Studio) emit `generation:end`
+ // themselves. We attach a per-stream mutable flag directly to
+ // `enhancedOptions._streamDedupContext` — native providers receive
+ // these options and flip the flag before their emit; this finally
+ // block reads the same closed-over reference. Concurrent streams
+ // have different option objects so the contexts don't interfere.
+ const dedupContext = {
+ providerEmitted: false,
+ };
+ enhancedOptions._streamDedupContext = dedupContext;
  const processedStream = (async function* () {
  let streamError;
+ // Curator P2-4: hoist `resolvedUsage` so the finally block can emit a
+ // single `generation:end` event with cost data. Cost listeners
+ // subscribe here; previously the stream path never fired it.
+ let resolvedUsage;
  try {
  for await (const chunk of mcpStream) {
  chunkCount++;
@@ -5015,7 +5247,7 @@ Current user's request: ${currentInput}`;
  accumulatedContent += content;
  });
  }
- let resolvedUsage = streamUsage;
+ resolvedUsage = streamUsage;
  if (!resolvedUsage && streamAnalytics) {
  try {
  const resolved = await Promise.resolve(streamAnalytics);
@@ -5090,6 +5322,61 @@ Current user's request: ${currentInput}`;
  guardrailsBlocked: metadata.guardrailsBlocked,
  error: metadata.error,
  });
+ // Curator P2-4: emit `generation:end` exactly once per stream so
+ // cost listeners receive the same contract as for `generate()`.
+ // The previous implementation only fired `stream:complete`, leaving
+ // any subscriber to `generation:end` with zero events.
+ //
+ // Dedup: native provider stream paths (Gemini 3 on Vertex / Google
+ // AI Studio) already emit `generation:end` themselves so Pipeline B
+ // (Langfuse) records a GENERATION observation. Skip our emit when
+ // they already fired — preserves their Pipeline B observation
+ // source and keeps the "exactly once" contract. The per-stream flag
+ // is concurrency-safe because it lives on this call's own options object.
+ if (!dedupContext.providerEmitted) {
+ try {
+ const finalProvider = metadata.fallbackProvider ?? providerName ?? "unknown";
+ const finalModel = metadata.fallbackModel ??
+ streamModel ??
+ enhancedOptions.model ??
+ "unknown";
+ const finalFinishReason = streamError
+ ? "error"
+ : (streamState.finishReason ?? "stop");
+ self.emitter.emit("generation:end", {
+ provider: finalProvider,
+ model: finalModel,
+ responseTime: Date.now() - streamStartTime,
+ toolsUsed: streamState.toolCalls?.map((t) => t.toolName),
+ timestamp: Date.now(),
+ result: {
+ content: accumulatedContent,
+ usage: resolvedUsage,
+ model: finalModel,
+ provider: finalProvider,
+ finishReason: finalFinishReason,
+ },
+ prompt: enhancedOptions.input?.text ||
+ enhancedOptions.prompt,
+ temperature: enhancedOptions.temperature,
+ maxTokens: enhancedOptions.maxTokens,
+ success: !streamError,
+ error: streamError
+ ? streamError instanceof Error
+ ? streamError.message
+ : String(streamError)
+ : undefined,
+ pipelineAHandled: true,
+ });
+ }
+ catch (emitError) {
+ logger.debug("[NeuroLink.stream] generation:end listener threw — ignored", {
+ error: emitError instanceof Error
+ ? emitError.message
+ : String(emitError),
+ });
+ }
+ }
  self._disableToolCacheForCurrentRequest = false;
  cleanupListeners();
  streamSpan.setAttribute("neurolink.response_time_ms", Date.now() - spanStartTime);
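
Note: with this change a cost listener subscribed to `generation:end` fires for streams as well as `generate()`. A hypothetical subscriber using the payload shape from the emit above follows; the `usage` field layout and the `.on(...)` signature are assumptions.

    // Hypothetical cost listener; the payload mirrors the stream emit above.
    type GenerationEndEvent = {
      provider: string;
      model: string;
      responseTime: number;
      success: boolean;
      result?: { usage?: { inputTokens?: number; outputTokens?: number } };
    };

    const emitter = neurolink.getEventEmitter(); // `neurolink` is an existing NeuroLink instance
    emitter.on("generation:end", (event: GenerationEndEvent) => {
      const usage = event.result?.usage;
      console.info("generation:end", {
        provider: event.provider,
        model: event.model,
        latencyMs: event.responseTime,
        inputTokens: usage?.inputTokens ?? 0, // usage field names are an assumption
        outputTokens: usage?.outputTokens ?? 0,
      });
    });
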
@@ -5641,6 +5928,42 @@ Current user's request: ${currentInput}`;
  });
  const streamMessageCount = conversationMessages?.length || 0;
  const streamCompactionSessionId = this.getCompactionSessionId(options);
+ // Reviewer follow-up: gate the hard cap on the *actual compactable
+ // history* rather than `this.conversationMemory`. A configured-but-
+ // empty memory store leaves nothing to compact yet still satisfies
+ // `!this.conversationMemory === false`, so the previous check
+ // skipped the hard cap and dispatched the oversized payload.
+ const streamHasCompactableMessages = streamMessageCount > 0;
+ // Curator P1-2: pre-dispatch hard cap mirrors directProviderGeneration.
+ // When the budget check fails AND there's nothing to compact (no memory
+ // + no inline messages — only prompt + tools), throw before dispatch
+ // instead of wasting a roundtrip on a payload the provider will reject.
+ if (!streamBudget.withinBudget && !streamHasCompactableMessages) {
+ try {
+ this.emitter.emit("compaction.insufficient", {
+ stagesAttempted: ["pre-dispatch hard cap"],
+ finalTokens: streamBudget.estimatedInputTokens,
+ budget: streamBudget.availableInputTokens,
+ provider: providerName,
+ model: options.model,
+ phase: "pre-dispatch-no-recovery",
+ timestamp: Date.now(),
+ });
+ }
+ catch {
+ /* listener errors are non-fatal */
+ }
+ throw new ContextBudgetExceededError(`Stream context exceeds model budget and no compaction is possible ` +
+ `(no conversationMemory, no inline conversationMessages — only ` +
+ `prompt + tools). Estimated: ${streamBudget.estimatedInputTokens} ` +
+ `tokens, budget: ${streamBudget.availableInputTokens} tokens. ` +
+ `Reduce prompt or tool-definition size, or trim the request.`, {
+ estimatedTokens: streamBudget.estimatedInputTokens,
+ availableTokens: streamBudget.availableInputTokens,
+ stagesUsed: [],
+ breakdown: streamBudget.breakdown,
+ });
+ }
  if (streamBudget.shouldCompact &&
  (hasCallerConversationHistory || this.conversationMemory) &&
  streamMessageCount >
@@ -5677,6 +6000,26 @@ Current user's request: ${currentInput}`;
  availableTokens: postCompactBudget.availableInputTokens,
  overagePercent: Math.round((postCompactBudget.usageRatio - 1.0) * 100),
  });
+ // Curator P1-2: emit `compaction.insufficient` whenever a single
+ // round of compaction wasn't enough — even when emergency
+ // truncation will save the day. Lets cost / audit listeners track
+ // the "compaction was insufficient" signal separately from the
+ // eventual outcome.
+ try {
+ this.emitter.emit("compaction.insufficient", {
+ stagesAttempted: compactionResult.stagesUsed,
+ finalTokens: postCompactBudget.estimatedInputTokens,
+ budget: postCompactBudget.availableInputTokens,
+ provider: providerName,
+ model: options.model,
+ phase: "mid-compaction",
+ willEmergencyTruncate: true,
+ timestamp: Date.now(),
+ });
+ }
+ catch {
+ /* listener errors are non-fatal */
+ }
  conversationMessages = emergencyContentTruncation(conversationMessages, postCompactBudget.availableInputTokens, postCompactBudget.breakdown, providerName);
  // Keep options in sync after emergency truncation so fallback paths
  // use the truncated history.
@@ -5693,6 +6036,23 @@ Current user's request: ${currentInput}`;
  if (!finalBudget.withinBudget) {
  // Clear watermark so handleContextOverflow recovery can re-compact
  this.lastCompactionMessageCount.delete(streamCompactionSessionId);
+ // Curator P1-2: emit `compaction.insufficient` on the terminal
+ // failure path so cost / audit listeners can record the specific
+ // failure mode (compaction + emergency truncation both insufficient).
+ try {
+ this.emitter.emit("compaction.insufficient", {
+ stagesAttempted: compactionResult.stagesUsed,
+ finalTokens: finalBudget.estimatedInputTokens,
+ budget: finalBudget.availableInputTokens,
+ provider: providerName,
+ model: options.model,
+ phase: "post-emergency-truncation",
+ timestamp: Date.now(),
+ });
+ }
+ catch {
+ /* listener errors are non-fatal */
+ }
  throw new ContextBudgetExceededError(`Stream context exceeds model budget after all compaction stages. ` +
  `Estimated: ${finalBudget.estimatedInputTokens} tokens, ` +
  `Budget: ${finalBudget.availableInputTokens} tokens.`, {
@@ -5780,6 +6140,15 @@ Current user's request: ${currentInput}`;
  * Handle stream error with fallback
  */
  async handleStreamError(error, options, startTime, streamId, enhancedOptions, _factoryResult) {
+ // Curator P1-2: when the pre-dispatch hard cap or post-emergency
+ // truncation budget check throws ContextBudgetExceededError, the
+ // payload is too large for the model and a same-payload retry would
+ // just fail again at the provider — wasting the same tokens that
+ // the hard cap was meant to save. Rethrow so the caller sees the
+ // typed error instead of a fallback ProviderError that hides it.
+ if (error instanceof ContextBudgetExceededError) {
+ throw error;
+ }
  logger.error("Stream generation failed, attempting fallback", {
  error: error instanceof Error ? error.message : String(error),
  });
@@ -4,6 +4,7 @@ import { ErrorCategory, ErrorSeverity, GoogleAIModels, } from "../constants/enum
  import { BaseProvider } from "../core/baseProvider.js";
  import { DEFAULT_MAX_STEPS } from "../core/constants.js";
  import { streamAnalyticsCollector } from "../core/streamAnalytics.js";
+ import { markStreamProviderEmittedGenerationEnd, } from "../neurolink.js";
  import { SpanStatusCode } from "@opentelemetry/api";
  import { ATTR, tracers, withClientSpan } from "../telemetry/index.js";
  import { AuthenticationError, NetworkError, ProviderError, RateLimitError, } from "../types/index.js";
@@ -735,6 +736,9 @@ export class GoogleAIStudioProvider extends BaseProvider {
  // AI SDK so experimental_telemetry is never injected; we emit manually.
  const nativeStreamEmitter = this.neurolink?.getEventEmitter();
  if (nativeStreamEmitter) {
+ // Curator P2-4 dedup: flag the per-stream context attached
+ // to options so the orchestration skips its own emit.
+ markStreamProviderEmittedGenerationEnd(options);
  nativeStreamEmitter.emit("generation:end", {
  provider: this.providerName,
  responseTime,
@@ -767,6 +771,9 @@ export class GoogleAIStudioProvider extends BaseProvider {
  // Emit failure generation:end so Pipeline B records the failed stream
  const errorEmitter = this.neurolink?.getEventEmitter();
  if (errorEmitter) {
+ // Curator P2-4 dedup: flag the per-stream context attached
+ // to options so the orchestration skips its own emit.
+ markStreamProviderEmittedGenerationEnd(options);
  errorEmitter.emit("generation:end", {
  provider: this.providerName,
  responseTime: Date.now() - startTime,
@@ -10,6 +10,7 @@ import { ErrorCategory, ErrorSeverity, } from "../constants/enums.js";
  import { BaseProvider } from "../core/baseProvider.js";
  import { DEFAULT_MAX_STEPS, GLOBAL_LOCATION_MODELS, } from "../core/constants.js";
  import { ModelConfigurationManager } from "../core/modelConfiguration.js";
+ import { markStreamProviderEmittedGenerationEnd, } from "../neurolink.js";
  import { createProxyFetch } from "../proxy/proxyFetch.js";
  import { ATTR, tracers, withClientSpan } from "../telemetry/index.js";
  import { AuthenticationError, InvalidModelError, NetworkError, ProviderError, RateLimitError, } from "../types/index.js";
@@ -1630,8 +1631,12 @@ export class GoogleVertexProvider extends BaseProvider {
  // Emit generation:end so Pipeline B (Langfuse) creates a GENERATION
  // observation. The native @google/genai stream path on Vertex bypasses the
  // Vercel AI SDK so experimental_telemetry is never injected; we emit manually.
+ // Curator P2-4 dedup: flag the per-stream context attached to options
+ // so the orchestration in `runStandardStreamRequest` knows we already
+ // emitted and skips its own emit (preserving exactly-once).
  const vertexStreamEmitter = this.neurolink?.getEventEmitter();
  if (vertexStreamEmitter) {
+ markStreamProviderEmittedGenerationEnd(params.options);
  vertexStreamEmitter.emit("generation:end", {
  provider: this.providerName,
  responseTime,
@@ -57,3 +57,4 @@ export * from "./span.js";
  export * from "./imageGen.js";
  export * from "./elicitation.js";
  export * from "./dynamic.js";
+ export * from "./streamDedup.js";
@@ -60,3 +60,5 @@ export * from "./imageGen.js";
  export * from "./elicitation.js";
  // Dynamic Arguments types
  export * from "./dynamic.js";
+ // Curator P2-4 dedup: per-stream generation:end dedup context
+ export * from "./streamDedup.js";
@@ -0,0 +1,14 @@
+ /**
+ * Curator P2-4 dedup (concurrency-safe): per-stream context that lets
+ * the orchestration's `runStandardStreamRequest` finally block know
+ * whether a *native provider* path within THIS stream's async chain
+ * already emitted `generation:end`. Native providers (Vertex / Google
+ * AI Studio for Gemini 3, etc.) emit on the shared SDK emitter; without
+ * scoping, a concurrent unrelated stream's emit on the same NeuroLink
+ * instance would suppress the wrong stream's orchestration emit.
+ *
+ * Each stream's flag lives on its own StreamOptions (`_streamDedupContext`).
+ */
+ export type StreamGenerationEndContext = {
+ providerEmitted: boolean;
+ };
@@ -0,0 +1 @@
+ export {};