npm - @adaptic/lumic-utils - Versions diffs - 1.0.18 → 1.0.20 - Mend

@adaptic/lumic-utils 1.0.18 → 1.0.20

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (22)

package/dist/{index-C3ihLNel.js → index-BVl0tRmx.js} RENAMED

@@ -2106,15 +2106,33 @@ function resetLLMCostTracker() {
 // llm-openai.ts
 /**
  * Determines if an LLM error should be retried.
- * Only retries on rate limit errors (429).
+ *
+ * Retries on:
+ * - 429 / rate limit errors (transient capacity)
+ * - "could not parse the JSON body" 400s — observed once in production for a
+ *   single symbol on the very first conversation turn (Wave 86, 2026-04-11).
+ *   The exact same call site succeeds millions of times before and after, and
+ *   the prior fix commit `6eaef52` in this repo already eliminated the only
+ *   known SDK-v5 cause (passing `tools: undefined/null`). The remaining cases
+ *   are virtually always proxy/network corruption of the request body in
+ *   flight (request truncated mid-flight, TLS renegotiation, edge proxy
+ *   buffer reset). Retrying once with a fresh connection has a high
+ *   probability of recovering, and a deterministic SDK-side defect would
+ *   re-fail on retry (so we still surface it).
  */
 const isRetryableLLMError = (error) => {
     if (error instanceof Error) {
         const message = error.message;
-        // Retry only on rate limits (429)
+        // Retry on rate limits (429)
         if (message.includes('429') || message.includes('rate limit') || message.includes('Rate limit')) {
             return true;
         }
+        // Retry on transient body-corruption 400s. Match the exact OpenAI error
+        // string to avoid retrying genuine client-side validation 400s (which
+        // would re-fail forever and waste retry budget).
+        if (message.includes('could not parse the JSON body of your request')) {
+            return true;
+        }
     }
     return false;
 };
@@ -2310,20 +2328,66 @@ async function createCompletion(content, responseFormat, options = DEFAULT_OPTIO
     if (responseFormatOption.type !== 'text') {
         queryOptions.response_format = responseFormatOption;
     }
-    const completion = await withRetry(() => openai.chat.completions.create(queryOptions), {
-        maxRetries: 3,
-        baseDelayMs: 2000,
-        maxDelayMs: 30000,
-        retryableErrors: isRetryableLLMError,
-    }, `OpenAI:${normalizedModel}`);
+    let completion;
+    try {
+        completion = await withRetry(() => openai.chat.completions.create(queryOptions), {
+            maxRetries: 3,
+            baseDelayMs: 2000,
+            maxDelayMs: 30000,
+            retryableErrors: isRetryableLLMError,
+        }, `OpenAI:${normalizedModel}`);
+    }
+    catch (error) {
+        // Defensive observability: when the OpenAI SDK rejects our request,
+        // emit a structured snapshot of the queryOptions shape (NOT content) so
+        // a future recurrence of the rare "could not parse JSON body" 400 can be
+        // diagnosed without having to reproduce locally. We deliberately log
+        // metadata only — no message content, no API key — so this is safe even
+        // for production prompts containing sensitive context.
+        const errorMessage = error instanceof Error ? error.message : String(error);
+        const totalContentChars = messages.reduce((sum, msg) => {
+            if (typeof msg.content === 'string')
+                return sum + msg.content.length;
+            if (Array.isArray(msg.content)) {
+                return sum + msg.content.reduce((s, part) => {
+                    if (typeof part === 'object' && part !== null && 'text' in part && typeof part.text === 'string') {
+                        return s + part.text.length;
+                    }
+                    return s;
+                }, 0);
+            }
+            return sum;
+        }, 0);
+        getLumicLogger().error(`OpenAI ChatCompletion call failed for model ${normalizedModel}`, {
+            model: normalizedModel,
+            errorMessage,
+            messageCount: messages.length,
+            roleBreakdown: messages.reduce((acc, msg) => {
+                acc[msg.role] = (acc[msg.role] ?? 0) + 1;
+                return acc;
+            }, {}),
+            totalContentChars,
+            toolCount: queryOptions.tools?.length ?? 0,
+            hasTemperature: queryOptions.temperature !== undefined,
+            hasResponseFormat: queryOptions.response_format !== undefined,
+            hasMaxCompletionTokens: queryOptions.max_completion_tokens !== undefined,
+        });
+        throw error;
+    }
+    // OpenAI returns cached input tokens under `prompt_tokens_details.cached_tokens`
+    // when prompts >1024 tokens hit the automatic prompt cache. We surface this
+    // as a first-class field so cost tracking and dashboards reflect the real
+    // (discounted) input cost rather than billing every input token at full rate.
+    const cachedTokens = completion.usage?.prompt_tokens_details?.cached_tokens ?? 0;
     const response = {
         id: completion.id,
         content: completion.choices[0]?.message?.content || '',
         tool_calls: completion.choices[0]?.message?.tool_calls,
-        usage: completion.usage || {
-            prompt_tokens: 0,
-            completion_tokens: 0,
-            total_tokens: 0,
+        usage: {
+            prompt_tokens: completion.usage?.prompt_tokens ?? 0,
+            completion_tokens: completion.usage?.completion_tokens ?? 0,
+            total_tokens: completion.usage?.total_tokens ?? 0,
+            cached_tokens: cachedTokens,
         },
         system_fingerprint: completion.system_fingerprint,
         service_tier: options.service_tier,
@@ -2346,8 +2410,10 @@ const makeOpenAIChatCompletionCall = async (content, responseFormat = 'text', op
         ...options,
     };
     const completion = await createCompletion(content, responseFormat, mergedOptions);
-    // Track cost in the global cost tracker
-    getLLMCostTracker().trackUsage('openai', completion.model, completion.usage.prompt_tokens, completion.usage.completion_tokens);
+    // Track cost in the global cost tracker. Pass cached tokens through so the
+    // tracker applies the discounted cached-input rate (typically ~50% of the
+    // standard input rate) instead of billing every input token at full price.
+    getLLMCostTracker().trackUsage('openai', completion.model, completion.usage.prompt_tokens, completion.usage.completion_tokens, 0, completion.usage.cached_tokens);
     // Handle tool calls differently
     if (completion.tool_calls && completion.tool_calls.length > 0) {
         const toolCallResponse = {
@@ -2365,7 +2431,8 @@ const makeOpenAIChatCompletionCall = async (content, responseFormat = 'text', op
                 reasoning_tokens: 0,
                 provider: 'openai',
                 model: completion.model,
-                cost: calculateCost('openai', completion.model, completion.usage.prompt_tokens, completion.usage.completion_tokens, 0),
+                cached_tokens: completion.usage.cached_tokens,
+                cost: calculateCost('openai', completion.model, completion.usage.prompt_tokens, completion.usage.completion_tokens, 0, completion.usage.cached_tokens),
             },
             tool_calls: completion.tool_calls,
         };
@@ -2383,7 +2450,8 @@ const makeOpenAIChatCompletionCall = async (content, responseFormat = 'text', op
             reasoning_tokens: 0,
             provider: 'openai',
             model: completion.model,
-            cost: calculateCost('openai', completion.model, completion.usage.prompt_tokens, completion.usage.completion_tokens, 0),
+            cached_tokens: completion.usage.cached_tokens,
+            cost: calculateCost('openai', completion.model, completion.usage.prompt_tokens, completion.usage.completion_tokens, 0, completion.usage.cached_tokens),
         },
         tool_calls: completion.tool_calls,
     };
@@ -2438,8 +2506,11 @@ const makeResponsesAPICall = async (input, options = {}) => {
         maxDelayMs: 30000,
         retryableErrors: isRetryableLLMError,
     }, `OpenAI-Responses:${normalizedModel}`);
+    // Responses API exposes cached input tokens under `input_tokens_details.cached_tokens`
+    // (the equivalent of Chat Completions' `prompt_tokens_details.cached_tokens`).
+    const responsesCachedTokens = response.usage?.input_tokens_details?.cached_tokens || 0;
     // Track cost in the global cost tracker
-    getLLMCostTracker().trackUsage('openai', normalizedModel, response.usage?.input_tokens || 0, response.usage?.output_tokens || 0, response.usage?.output_tokens_details?.reasoning_tokens || 0);
+    getLLMCostTracker().trackUsage('openai', normalizedModel, response.usage?.input_tokens || 0, response.usage?.output_tokens || 0, response.usage?.output_tokens_details?.reasoning_tokens || 0, responsesCachedTokens);
     // Extract tool calls from the output
     const toolCalls = response.output
         ?.filter((item) => item.type === 'function_call')
@@ -2480,7 +2551,8 @@ const makeResponsesAPICall = async (input, options = {}) => {
                 reasoning_tokens: response.usage?.output_tokens_details?.reasoning_tokens || 0,
                 provider: 'openai',
                 model: normalizedModel,
-                cost: calculateCost('openai', normalizedModel, response.usage?.input_tokens || 0, response.usage?.output_tokens || 0, response.usage?.output_tokens_details?.reasoning_tokens || 0),
+                cached_tokens: responsesCachedTokens,
+                cost: calculateCost('openai', normalizedModel, response.usage?.input_tokens || 0, response.usage?.output_tokens || 0, response.usage?.output_tokens_details?.reasoning_tokens || 0, responsesCachedTokens),
             },
             tool_calls: toolCalls,
             ...(codeInterpreterOutputs ? { code_interpreter_outputs: codeInterpreterOutputs } : {}),
@@ -2512,7 +2584,8 @@ const makeResponsesAPICall = async (input, options = {}) => {
             reasoning_tokens: response.usage?.output_tokens_details?.reasoning_tokens || 0,
             provider: 'openai',
             model: normalizedModel,
-            cost: calculateCost('openai', normalizedModel, response.usage?.input_tokens || 0, response.usage?.output_tokens || 0, response.usage?.output_tokens_details?.reasoning_tokens || 0),
+            cached_tokens: responsesCachedTokens,
+            cost: calculateCost('openai', normalizedModel, response.usage?.input_tokens || 0, response.usage?.output_tokens || 0, response.usage?.output_tokens_details?.reasoning_tokens || 0, responsesCachedTokens),
         },
         tool_calls: toolCalls,
         ...(codeInterpreterOutputs ? { code_interpreter_outputs: codeInterpreterOutputs } : {}),
@@ -7942,7 +8015,12 @@ function translateContextToAnthropic(context) {
 /** Convert string or content block array to a uniform content block array. */
 function toContentBlocks(content) {
     if (typeof content === 'string') {
-        return [{ type: 'text', text: content }];
+        const textBlock = {
+            type: 'text',
+            text: content,
+            citations: null,
+        };
+        return [textBlock];
     }
     return content;
 }
@@ -8699,14 +8777,25 @@ async function createDeepseekCompletion(content, responseFormat, options = {}) {
             maxDelayMs: 30000,
             retryableErrors: isRetryableDeepseekError,
         }, `Deepseek:${normalizedModel}`);
+        // DeepSeek surfaces cached input tokens in two places on the usage object:
+        //   - `prompt_cache_hit_tokens` (DeepSeek-native field, see
+        //     https://api-docs.deepseek.com/guides/kv_cache)
+        //   - `prompt_tokens_details.cached_tokens` (OpenAI-compatible alias)
+        // Prefer the OpenAI-compatible name so a single canonical field works for
+        // both providers; fall back to the DeepSeek-native name if absent.
+        const usageRaw = completion.usage;
+        const cachedTokens = usageRaw?.prompt_tokens_details?.cached_tokens ??
+            usageRaw?.prompt_cache_hit_tokens ??
+            0;
         return {
             id: completion.id,
             content: completion.choices[0]?.message?.content || '',
             tool_calls: completion.choices[0]?.message?.tool_calls,
-            usage: completion.usage || {
-                prompt_tokens: 0,
-                completion_tokens: 0,
-                total_tokens: 0,
+            usage: {
+                prompt_tokens: completion.usage?.prompt_tokens ?? 0,
+                completion_tokens: completion.usage?.completion_tokens ?? 0,
+                total_tokens: completion.usage?.total_tokens ?? 0,
+                cached_tokens: cachedTokens,
             },
             system_fingerprint: completion.system_fingerprint,
             provider: 'deepseek',
@@ -8748,7 +8837,7 @@ const makeDeepseekCall = async (content, responseFormat = 'json', options = {})
                 reasoning_tokens: 0,
                 provider: 'deepseek',
                 model: modelName,
-                cache_hit_tokens: 0,
+                cached_tokens: 0,
                 cost: 0,
             },
             tool_calls: undefined,
@@ -8767,7 +8856,7 @@ const makeDeepseekCall = async (content, responseFormat = 'json', options = {})
                 reasoning_tokens: 0,
                 provider: 'deepseek',
                 model: modelName,
-                cache_hit_tokens: 0,
+                cached_tokens: 0,
                 cost: 0,
             },
             tool_calls: undefined,
@@ -8775,8 +8864,9 @@ const makeDeepseekCall = async (content, responseFormat = 'json', options = {})
     }
     try {
         const completion = await createDeepseekCompletion(content, responseFormat, mergedOptions);
-        // Track cost in the global cost tracker
-        getLLMCostTracker().trackUsage('deepseek', completion.model, completion.usage.prompt_tokens, completion.usage.completion_tokens);
+        // Track cost in the global cost tracker. Pass cached tokens through so the
+        // discounted cached-input pricing tier is applied.
+        getLLMCostTracker().trackUsage('deepseek', completion.model, completion.usage.prompt_tokens, completion.usage.completion_tokens, 0, completion.usage.cached_tokens);
         // Handle tool calls similarly to OpenAI
         if (completion.tool_calls && completion.tool_calls.length > 0) {
             const toolCallResponse = {
@@ -8794,9 +8884,8 @@ const makeDeepseekCall = async (content, responseFormat = 'json', options = {})
                     reasoning_tokens: 0, // Deepseek doesn't provide reasoning tokens separately
                     provider: 'deepseek',
                     model: completion.model,
-                    cache_hit_tokens: 0, // Not provided directly in API response
-                    cost: calculateCost('deepseek', completion.model, completion.usage.prompt_tokens, completion.usage.completion_tokens, 0, 0 // Cache hit tokens (not provided in the response)
-                    ),
+                    cached_tokens: completion.usage.cached_tokens,
+                    cost: calculateCost('deepseek', completion.model, completion.usage.prompt_tokens, completion.usage.completion_tokens, 0, completion.usage.cached_tokens),
                 },
                 tool_calls: completion.tool_calls,
             };
@@ -8814,9 +8903,8 @@ const makeDeepseekCall = async (content, responseFormat = 'json', options = {})
                 reasoning_tokens: 0, // Deepseek doesn't provide reasoning tokens separately
                 provider: 'deepseek',
                 model: completion.model,
-                cache_hit_tokens: 0, // Not provided directly in API response
-                cost: calculateCost('deepseek', completion.model, completion.usage.prompt_tokens, completion.usage.completion_tokens, 0, 0 // Cache hit tokens (not provided in the response)
-                ),
+                cached_tokens: completion.usage.cached_tokens,
+                cost: calculateCost('deepseek', completion.model, completion.usage.prompt_tokens, completion.usage.completion_tokens, 0, completion.usage.cached_tokens),
             },
             tool_calls: completion.tool_calls,
         };
@@ -8834,7 +8922,7 @@ const makeDeepseekCall = async (content, responseFormat = 'json', options = {})
                 reasoning_tokens: 0,
                 provider: 'deepseek',
                 model: modelName,
-                cache_hit_tokens: 0,
+                cached_tokens: 0,
                 cost: 0,
             },
             tool_calls: undefined,
@@ -22733,11 +22821,11 @@ let poolConfig = DEFAULT_POOL_CONFIG;
 async function loadApolloModules() {
     if (typeof window === "undefined" || process.env.AWS_EXECUTION_ENV) {
         // Server-side (or Lambda): load the CommonJS‑based implementation.
-        return (await Promise.resolve().then(function () { return require('./apollo-client.server-BAuFJqgR.js'); }));
+        return (await Promise.resolve().then(function () { return require('./apollo-client.server-Djh4v__C.js'); }));
     }
     else {
         // Client-side: load the ESM‑based implementation.
-        return (await Promise.resolve().then(function () { return require('./apollo-client.client-Cz-ZMwuK.js'); }));
+        return (await Promise.resolve().then(function () { return require('./apollo-client.client-DVsbR05r.js'); }));
     }
 }
 /**
@@ -81448,4 +81536,4 @@ exports.withCorrelationId = withCorrelationId;
 exports.withMetrics = withMetrics;
 exports.withRateLimit = withRateLimit;
 exports.withRetry = withRetry;
-//# sourceMappingURL=index-C3ihLNel.js.map
+//# sourceMappingURL=index-BVl0tRmx.js.map