npm - @adaptic/lumic-utils - Versions diffs - 1.0.19 → 1.0.21 - Mend

@adaptic/lumic-utils 1.0.19 → 1.0.21

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.

Files changed (24)

package/dist/{index-Dr85zRZC.js → index-Ca3x8X5U.js} RENAMED

@@ -768,58 +768,79 @@ const DEFAULT_DEVELOPER_PROMPT = `
     Present complete, high-confidence, final answers only. Do not rephrase to be more brief or omit parts of answers.
     Respond only with final content (e.g. code, a json or yaml object, a formatted string, or a markdown document) and nothing else. Do not reply with a preamble, introduction, or conclusion.
 `;
-/** Token costs in USD per token. Last updated Mar 2026. */
+/**
+ * Token costs in USD per token. Last updated Apr 2026.
+ *
+ * `cacheHitCost` reflects OpenAI's cached-input billing rate (~50% of the
+ * standard input rate per OpenAI's prompt caching documentation). When set,
+ * `calculateCost` splits prompt tokens into cached vs non-cached buckets and
+ * applies the discount; when omitted, cached tokens are billed at full input
+ * rate (a silent ~50% cost overstatement for cache-friendly workloads).
+ */
 const openAiModelCosts = {
     'gpt-5.4': {
         inputCost: 2.5 / 1_000_000,
+        cacheHitCost: 1.25 / 1_000_000,
         outputCost: 15 / 1_000_000,
     },
     'gpt-5.4-mini': {
         inputCost: 0.75 / 1_000_000,
+        cacheHitCost: 0.375 / 1_000_000,
         outputCost: 4.5 / 1_000_000,
     },
     'gpt-5.4-nano': {
         inputCost: 0.2 / 1_000_000,
+        cacheHitCost: 0.1 / 1_000_000,
         outputCost: 1.25 / 1_000_000,
     },
     'gpt-5': {
         inputCost: 2.5 / 1_000_000,
+        cacheHitCost: 1.25 / 1_000_000,
         outputCost: 10 / 1_000_000,
     },
     'gpt-5-mini': {
         inputCost: 0.15 / 1_000_000,
+        cacheHitCost: 0.075 / 1_000_000,
         outputCost: 0.6 / 1_000_000,
     },
     'o1-mini': {
         inputCost: 1.1 / 1_000_000,
+        cacheHitCost: 0.55 / 1_000_000,
         outputCost: 4.4 / 1_000_000,
     },
     'o1': {
         inputCost: 15 / 1_000_000,
+        cacheHitCost: 7.5 / 1_000_000,
         outputCost: 60 / 1_000_000,
     },
     'o3-mini': {
         inputCost: 1.1 / 1_000_000,
+        cacheHitCost: 0.55 / 1_000_000,
         outputCost: 4.4 / 1_000_000,
     },
     'o3': {
         inputCost: 2 / 1_000_000,
+        cacheHitCost: 1 / 1_000_000,
         outputCost: 8 / 1_000_000,
     },
     'gpt-4.1': {
         inputCost: 2 / 1_000_000,
+        cacheHitCost: 1 / 1_000_000,
         outputCost: 8 / 1_000_000,
     },
     'gpt-4.1-mini': {
         inputCost: 0.4 / 1_000_000,
+        cacheHitCost: 0.2 / 1_000_000,
         outputCost: 1.6 / 1_000_000,
     },
     'gpt-4.1-nano': {
         inputCost: 0.1 / 1_000_000,
+        cacheHitCost: 0.05 / 1_000_000,
         outputCost: 0.4 / 1_000_000,
     },
     'o4-mini': {
         inputCost: 1.1 / 1_000_000,
+        cacheHitCost: 0.55 / 1_000_000,
         outputCost: 4.4 / 1_000_000,
     },
 };
@@ -1894,7 +1915,10 @@ class LLMCostTracker {
             timestamp: Date.now(),
         };
         this.usageRecords.push(record);
-        getLumicLogger().info(`LLM cost tracked: ${provider}/${model} - $${cost.toFixed(6)}`, { provider, model, inputTokens, outputTokens, cost });
+        // Emit cachedTokens and reasoningTokens explicitly so operators can
+        // verify cache effectiveness from logs alone (the prior log shape only
+        // surfaced inputTokens and outputTokens, hiding the cache discount).
+        getLumicLogger().info(`LLM cost tracked: ${provider}/${model} - $${cost.toFixed(6)}`, { provider, model, inputTokens, cachedTokens: cacheHitTokens, outputTokens, reasoningTokens, cost });
     }
     /**
      * Records usage from an image generation call.
@@ -1975,11 +1999,13 @@ class LLMCostTracker {
         const images = this.getImageCosts();
         let totalCost = 0;
         let totalInputTokens = 0;
+        let totalCacheHitTokens = 0;
         let totalOutputTokens = 0;
         let totalReasoningTokens = 0;
         for (const summary of Object.values(byModel)) {
             totalCost += summary.totalCost;
             totalInputTokens += summary.totalInputTokens;
+            totalCacheHitTokens += summary.totalCacheHitTokens;
             totalOutputTokens += summary.totalOutputTokens;
             totalReasoningTokens += summary.totalReasoningTokens;
         }
@@ -1996,6 +2022,7 @@ class LLMCostTracker {
             totalCost,
             totalCalls: this.usageRecords.length + this.imageRecords.length,
             totalInputTokens,
+            totalCacheHitTokens,
             totalOutputTokens,
             totalReasoningTokens,
             byModel,
@@ -2018,7 +2045,9 @@ class LLMCostTracker {
             cost: `$${m.totalCost.toFixed(6)}`,
             calls: m.callCount,
             inputTokens: m.totalInputTokens,
+            cachedTokens: m.totalCacheHitTokens,
             outputTokens: m.totalOutputTokens,
+            reasoningTokens: m.totalReasoningTokens,
         }));
         const images = Object.values(summary.images).map((img) => ({
             model: img.model,
@@ -2374,14 +2403,20 @@ async function createCompletion(content, responseFormat, options = DEFAULT_OPTIO
         });
         throw error;
     }
+    // OpenAI returns cached input tokens under `prompt_tokens_details.cached_tokens`
+    // when prompts >1024 tokens hit the automatic prompt cache. We surface this
+    // as a first-class field so cost tracking and dashboards reflect the real
+    // (discounted) input cost rather than billing every input token at full rate.
+    const cachedTokens = completion.usage?.prompt_tokens_details?.cached_tokens ?? 0;
     const response = {
         id: completion.id,
         content: completion.choices[0]?.message?.content || '',
         tool_calls: completion.choices[0]?.message?.tool_calls,
-        usage: completion.usage || {
-            prompt_tokens: 0,
-            completion_tokens: 0,
-            total_tokens: 0,
+        usage: {
+            prompt_tokens: completion.usage?.prompt_tokens ?? 0,
+            completion_tokens: completion.usage?.completion_tokens ?? 0,
+            total_tokens: completion.usage?.total_tokens ?? 0,
+            cached_tokens: cachedTokens,
         },
         system_fingerprint: completion.system_fingerprint,
         service_tier: options.service_tier,
@@ -2404,8 +2439,10 @@ const makeOpenAIChatCompletionCall = async (content, responseFormat = 'text', op
         ...options,
     };
     const completion = await createCompletion(content, responseFormat, mergedOptions);
-    // Track cost in the global cost tracker
-    getLLMCostTracker().trackUsage('openai', completion.model, completion.usage.prompt_tokens, completion.usage.completion_tokens);
+    // Track cost in the global cost tracker. Pass cached tokens through so the
+    // tracker applies the discounted cached-input rate (typically ~50% of the
+    // standard input rate) instead of billing every input token at full price.
+    getLLMCostTracker().trackUsage('openai', completion.model, completion.usage.prompt_tokens, completion.usage.completion_tokens, 0, completion.usage.cached_tokens);
     // Handle tool calls differently
     if (completion.tool_calls && completion.tool_calls.length > 0) {
         const toolCallResponse = {
@@ -2423,7 +2460,8 @@ const makeOpenAIChatCompletionCall = async (content, responseFormat = 'text', op
                 reasoning_tokens: 0,
                 provider: 'openai',
                 model: completion.model,
-                cost: calculateCost('openai', completion.model, completion.usage.prompt_tokens, completion.usage.completion_tokens, 0),
+                cached_tokens: completion.usage.cached_tokens,
+                cost: calculateCost('openai', completion.model, completion.usage.prompt_tokens, completion.usage.completion_tokens, 0, completion.usage.cached_tokens),
             },
             tool_calls: completion.tool_calls,
         };
@@ -2441,7 +2479,8 @@ const makeOpenAIChatCompletionCall = async (content, responseFormat = 'text', op
             reasoning_tokens: 0,
             provider: 'openai',
             model: completion.model,
-            cost: calculateCost('openai', completion.model, completion.usage.prompt_tokens, completion.usage.completion_tokens, 0),
+            cached_tokens: completion.usage.cached_tokens,
+            cost: calculateCost('openai', completion.model, completion.usage.prompt_tokens, completion.usage.completion_tokens, 0, completion.usage.cached_tokens),
         },
         tool_calls: completion.tool_calls,
     };
@@ -2496,8 +2535,11 @@ const makeResponsesAPICall = async (input, options = {}) => {
         maxDelayMs: 30000,
         retryableErrors: isRetryableLLMError,
     }, `OpenAI-Responses:${normalizedModel}`);
+    // Responses API exposes cached input tokens under `input_tokens_details.cached_tokens`
+    // (the equivalent of Chat Completions' `prompt_tokens_details.cached_tokens`).
+    const responsesCachedTokens = response.usage?.input_tokens_details?.cached_tokens || 0;
     // Track cost in the global cost tracker
-    getLLMCostTracker().trackUsage('openai', normalizedModel, response.usage?.input_tokens || 0, response.usage?.output_tokens || 0, response.usage?.output_tokens_details?.reasoning_tokens || 0);
+    getLLMCostTracker().trackUsage('openai', normalizedModel, response.usage?.input_tokens || 0, response.usage?.output_tokens || 0, response.usage?.output_tokens_details?.reasoning_tokens || 0, responsesCachedTokens);
     // Extract tool calls from the output
     const toolCalls = response.output
         ?.filter((item) => item.type === 'function_call')
@@ -2538,7 +2580,8 @@ const makeResponsesAPICall = async (input, options = {}) => {
                 reasoning_tokens: response.usage?.output_tokens_details?.reasoning_tokens || 0,
                 provider: 'openai',
                 model: normalizedModel,
-                cost: calculateCost('openai', normalizedModel, response.usage?.input_tokens || 0, response.usage?.output_tokens || 0, response.usage?.output_tokens_details?.reasoning_tokens || 0),
+                cached_tokens: responsesCachedTokens,
+                cost: calculateCost('openai', normalizedModel, response.usage?.input_tokens || 0, response.usage?.output_tokens || 0, response.usage?.output_tokens_details?.reasoning_tokens || 0, responsesCachedTokens),
             },
             tool_calls: toolCalls,
             ...(codeInterpreterOutputs ? { code_interpreter_outputs: codeInterpreterOutputs } : {}),
@@ -2570,7 +2613,8 @@ const makeResponsesAPICall = async (input, options = {}) => {
             reasoning_tokens: response.usage?.output_tokens_details?.reasoning_tokens || 0,
             provider: 'openai',
             model: normalizedModel,
-            cost: calculateCost('openai', normalizedModel, response.usage?.input_tokens || 0, response.usage?.output_tokens || 0, response.usage?.output_tokens_details?.reasoning_tokens || 0),
+            cached_tokens: responsesCachedTokens,
+            cost: calculateCost('openai', normalizedModel, response.usage?.input_tokens || 0, response.usage?.output_tokens || 0, response.usage?.output_tokens_details?.reasoning_tokens || 0, responsesCachedTokens),
         },
         tool_calls: toolCalls,
         ...(codeInterpreterOutputs ? { code_interpreter_outputs: codeInterpreterOutputs } : {}),
@@ -8762,14 +8806,25 @@ async function createDeepseekCompletion(content, responseFormat, options = {}) {
             maxDelayMs: 30000,
             retryableErrors: isRetryableDeepseekError,
         }, `Deepseek:${normalizedModel}`);
+        // DeepSeek surfaces cached input tokens in two places on the usage object:
+        //   - `prompt_cache_hit_tokens` (DeepSeek-native field, see
+        //     https://api-docs.deepseek.com/guides/kv_cache)
+        //   - `prompt_tokens_details.cached_tokens` (OpenAI-compatible alias)
+        // Prefer the OpenAI-compatible name so a single canonical field works for
+        // both providers; fall back to the DeepSeek-native name if absent.
+        const usageRaw = completion.usage;
+        const cachedTokens = usageRaw?.prompt_tokens_details?.cached_tokens ??
+            usageRaw?.prompt_cache_hit_tokens ??
+            0;
         return {
             id: completion.id,
             content: completion.choices[0]?.message?.content || '',
             tool_calls: completion.choices[0]?.message?.tool_calls,
-            usage: completion.usage || {
-                prompt_tokens: 0,
-                completion_tokens: 0,
-                total_tokens: 0,
+            usage: {
+                prompt_tokens: completion.usage?.prompt_tokens ?? 0,
+                completion_tokens: completion.usage?.completion_tokens ?? 0,
+                total_tokens: completion.usage?.total_tokens ?? 0,
+                cached_tokens: cachedTokens,
             },
             system_fingerprint: completion.system_fingerprint,
             provider: 'deepseek',
@@ -8811,7 +8866,7 @@ const makeDeepseekCall = async (content, responseFormat = 'json', options = {})
                 reasoning_tokens: 0,
                 provider: 'deepseek',
                 model: modelName,
-                cache_hit_tokens: 0,
+                cached_tokens: 0,
                 cost: 0,
             },
             tool_calls: undefined,
@@ -8830,7 +8885,7 @@ const makeDeepseekCall = async (content, responseFormat = 'json', options = {})
                 reasoning_tokens: 0,
                 provider: 'deepseek',
                 model: modelName,
-                cache_hit_tokens: 0,
+                cached_tokens: 0,
                 cost: 0,
             },
             tool_calls: undefined,
@@ -8838,8 +8893,9 @@ const makeDeepseekCall = async (content, responseFormat = 'json', options = {})
     }
     try {
         const completion = await createDeepseekCompletion(content, responseFormat, mergedOptions);
-        // Track cost in the global cost tracker
-        getLLMCostTracker().trackUsage('deepseek', completion.model, completion.usage.prompt_tokens, completion.usage.completion_tokens);
+        // Track cost in the global cost tracker. Pass cached tokens through so the
+        // discounted cached-input pricing tier is applied.
+        getLLMCostTracker().trackUsage('deepseek', completion.model, completion.usage.prompt_tokens, completion.usage.completion_tokens, 0, completion.usage.cached_tokens);
         // Handle tool calls similarly to OpenAI
         if (completion.tool_calls && completion.tool_calls.length > 0) {
             const toolCallResponse = {
@@ -8857,9 +8913,8 @@ const makeDeepseekCall = async (content, responseFormat = 'json', options = {})
                     reasoning_tokens: 0, // Deepseek doesn't provide reasoning tokens separately
                     provider: 'deepseek',
                     model: completion.model,
-                    cache_hit_tokens: 0, // Not provided directly in API response
-                    cost: calculateCost('deepseek', completion.model, completion.usage.prompt_tokens, completion.usage.completion_tokens, 0, 0 // Cache hit tokens (not provided in the response)
-                    ),
+                    cached_tokens: completion.usage.cached_tokens,
+                    cost: calculateCost('deepseek', completion.model, completion.usage.prompt_tokens, completion.usage.completion_tokens, 0, completion.usage.cached_tokens),
                 },
                 tool_calls: completion.tool_calls,
             };
@@ -8877,9 +8932,8 @@ const makeDeepseekCall = async (content, responseFormat = 'json', options = {})
                 reasoning_tokens: 0, // Deepseek doesn't provide reasoning tokens separately
                 provider: 'deepseek',
                 model: completion.model,
-                cache_hit_tokens: 0, // Not provided directly in API response
-                cost: calculateCost('deepseek', completion.model, completion.usage.prompt_tokens, completion.usage.completion_tokens, 0, 0 // Cache hit tokens (not provided in the response)
-                ),
+                cached_tokens: completion.usage.cached_tokens,
+                cost: calculateCost('deepseek', completion.model, completion.usage.prompt_tokens, completion.usage.completion_tokens, 0, completion.usage.cached_tokens),
             },
             tool_calls: completion.tool_calls,
         };
@@ -8897,7 +8951,7 @@ const makeDeepseekCall = async (content, responseFormat = 'json', options = {})
                 reasoning_tokens: 0,
                 provider: 'deepseek',
                 model: modelName,
-                cache_hit_tokens: 0,
+                cached_tokens: 0,
                 cost: 0,
             },
             tool_calls: undefined,
@@ -22796,11 +22850,11 @@ let poolConfig = DEFAULT_POOL_CONFIG;
 async function loadApolloModules() {
     if (typeof window === "undefined" || process.env.AWS_EXECUTION_ENV) {
         // Server-side (or Lambda): load the CommonJS‑based implementation.
-        return (await Promise.resolve().then(function () { return require('./apollo-client.server-HwHIFnVk.js'); }));
+        return (await Promise.resolve().then(function () { return require('./apollo-client.server-BnZhh39o.js'); }));
     }
     else {
         // Client-side: load the ESM‑based implementation.
-        return (await Promise.resolve().then(function () { return require('./apollo-client.client-guxMwplM.js'); }));
+        return (await Promise.resolve().then(function () { return require('./apollo-client.client-ByADDB46.js'); }));
     }
 }
 /**
@@ -81511,4 +81565,4 @@ exports.withCorrelationId = withCorrelationId;
 exports.withMetrics = withMetrics;
 exports.withRateLimit = withRateLimit;
 exports.withRetry = withRetry;
-//# sourceMappingURL=index-Dr85zRZC.js.map
+//# sourceMappingURL=index-Ca3x8X5U.js.map