mohdel 0.103.0 → 0.104.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/js/core/events.js CHANGED
@@ -55,8 +55,19 @@
55
55
  * @property {(string|null)} output
56
56
  * Final text (null when `status === 'tool_use'` with no text).
57
57
  * @property {number} inputTokens
58
+ * Regular (non-cached) input tokens. For OpenAI/cerebras/fireworks where
59
+ * `cached_tokens` is reported as a SUBSET of `prompt_tokens`, adapters
60
+ * subtract the cached portion before exposing it here so all providers
61
+ * produce additive token shapes.
58
62
  * @property {number} outputTokens
59
63
  * @property {number} thinkingTokens
64
+ * @property {number} [cacheWriteInputTokens]
65
+ * Input tokens written to a fresh prompt cache breakpoint, billed at
66
+ * `cacheWritePrice`. Absent when the provider has no separate
67
+ * cache-write counter.
68
+ * @property {number} [cacheReadInputTokens]
69
+ * Input tokens served from prompt cache, billed at `cacheReadPrice`.
70
+ * Absent when the provider has no prompt caching.
60
71
  * @property {number} cost
61
72
  * USD, computed from curated pricing. Single number (not a breakdown).
62
73
  * @property {Timestamps} timestamps
@@ -258,10 +258,15 @@ async function * runStreaming (envelope, client, args, config, start, deps) {
258
258
  */
259
259
  function finalize ({ envelope, content, toolCalls, usage, finishReason, start, first, reasoning = null }) {
260
260
  const end = String(process.hrtime.bigint())
261
- const inputTokens = usage.prompt_tokens || 0
261
+ const totalInputTokens = usage.prompt_tokens || 0
262
262
  const totalOutputTokens = usage.completion_tokens || 0
263
263
  const thinkingTokens = usage.completion_tokens_details?.reasoning_tokens || 0
264
+ const cachedInputTokens = usage.prompt_tokens_details?.cached_tokens || 0
264
265
  const visibleOutputTokens = Math.max(0, totalOutputTokens - thinkingTokens)
266
+ // OpenAI-shape APIs (cerebras/fireworks/...) report cached_tokens as a
267
+ // SUBSET of prompt_tokens. Convert to mohdel's additive convention so
268
+ // computeCost prices the cached portion at cacheReadPrice.
269
+ const inputTokens = Math.max(0, totalInputTokens - cachedInputTokens)
265
270
 
266
271
  const truncated = finishReason === 'length'
267
272
  let status = truncated ? STATUS_INCOMPLETE : STATUS_COMPLETED
@@ -276,9 +281,15 @@ function finalize ({ envelope, content, toolCalls, usage, finishReason, start, f
276
281
  inputTokens,
277
282
  outputTokens: visibleOutputTokens,
278
283
  thinkingTokens,
284
+ ...(cachedInputTokens > 0 && { cacheReadInputTokens: cachedInputTokens }),
279
285
  cost: costFor(
280
286
  catalogKey(envelope.model),
281
- { inputTokens, outputTokens: visibleOutputTokens, thinkingTokens }
287
+ {
288
+ inputTokens,
289
+ outputTokens: visibleOutputTokens,
290
+ thinkingTokens,
291
+ cacheReadInputTokens: cachedInputTokens
292
+ }
282
293
  ),
283
294
  timestamps: { start, first: first ?? end, end }
284
295
  }
@@ -14,8 +14,8 @@ import { getSpec, setCatalog } from './_catalog.js'
14
14
  /**
15
15
  * Pure cost computation from spec + usage.
16
16
  *
17
- * Each price field (`inputPrice` / `outputPrice` / `thinkingPrice`) is
18
- * one of:
17
+ * Each price field (`inputPrice` / `outputPrice` / `thinkingPrice` /
18
+ * `cacheWritePrice` / `cacheReadPrice`) is one of:
19
19
  *
20
20
  * - a `number` — flat per-million rate; or
21
21
  * - an object `{">N": number, ..., "default": number}` — tiered.
@@ -24,12 +24,19 @@ import { getSpec, setCatalog } from './_catalog.js'
24
24
  * nothing matches. Keys that aren't `">N"` or `"default"` are
25
25
  * ignored. `>` is strict — at exactly N, the default is used.
26
26
  *
27
- * `thinkingPrice` is optional and falls back to the resolved
28
- * `outputPrice` when absent.
27
+ * Optional fields fall back to other prices when absent:
28
+ * - `thinkingPrice` → `outputPrice`
29
+ * - `cacheWritePrice` → `inputPrice` (graceful for non-caching providers)
30
+ * - `cacheReadPrice` → `inputPrice`
29
31
  *
30
- * @param {any} spec Catalog entry (with `inputPrice`/`outputPrice`/`thinkingPrice`),
31
- * or `undefined`.
32
- * @param {{inputTokens?: number, outputTokens?: number, thinkingTokens?: number}} usage
32
+ * Token-counting convention: this function is purely additive across
33
+ * `inputTokens`, `cacheWriteInputTokens`, `cacheReadInputTokens`,
34
+ * `outputTokens`, and `thinkingTokens`. Adapters normalize provider-specific
35
+ * shapes (e.g. subset-of-input vs. additional-to-input) before calling here.
36
+ *
37
+ * @param {any} spec Catalog entry, or `undefined`.
38
+ * @param {{inputTokens?: number, outputTokens?: number, thinkingTokens?: number,
39
+ * cacheWriteInputTokens?: number, cacheReadInputTokens?: number}} usage
33
40
  * @returns {number}
34
41
  */
35
42
  export function computeCost (spec, usage) {
@@ -37,12 +44,18 @@ export function computeCost (spec, usage) {
37
44
  const i = usage.inputTokens ?? 0
38
45
  const o = usage.outputTokens ?? 0
39
46
  const t = usage.thinkingTokens ?? 0
47
+ const cw = usage.cacheWriteInputTokens ?? 0
48
+ const cr = usage.cacheReadInputTokens ?? 0
40
49
  const ip = resolveTier(spec.inputPrice, i)
41
50
  const op = resolveTier(spec.outputPrice, i)
42
51
  if (typeof ip !== 'number' || typeof op !== 'number') return 0
43
52
  const tp = resolveTier(spec.thinkingPrice, i)
44
53
  const tpFinal = typeof tp === 'number' ? tp : op
45
- const total = (i * ip + o * op + t * tpFinal) / 1_000_000
54
+ const cwp = resolveTier(spec.cacheWritePrice, i)
55
+ const cwpFinal = typeof cwp === 'number' ? cwp : ip
56
+ const crp = resolveTier(spec.cacheReadPrice, i)
57
+ const crpFinal = typeof crp === 'number' ? crp : ip
58
+ const total = (i * ip + cw * cwpFinal + cr * crpFinal + o * op + t * tpFinal) / 1_000_000
46
59
  return round(total)
47
60
  }
48
61
 
@@ -113,6 +113,8 @@ export async function * anthropic (envelope, deps = {}) {
113
113
  const currentOutput = () => outputParts.join('')
114
114
  let inputTokens = 0
115
115
  let outputTokens = 0
116
+ let cacheWriteTokens = 0
117
+ let cacheReadTokens = 0
116
118
  let thinkingChars = 0
117
119
  let status = STATUS_COMPLETED
118
120
  /** @type {string | undefined} */
@@ -135,6 +137,12 @@ export async function * anthropic (envelope, deps = {}) {
135
137
  if (event.message?.usage?.input_tokens) {
136
138
  inputTokens = event.message.usage.input_tokens
137
139
  }
140
+ if (event.message?.usage?.cache_creation_input_tokens) {
141
+ cacheWriteTokens = event.message.usage.cache_creation_input_tokens
142
+ }
143
+ if (event.message?.usage?.cache_read_input_tokens) {
144
+ cacheReadTokens = event.message.usage.cache_read_input_tokens
145
+ }
138
146
  break
139
147
 
140
148
  case 'content_block_start':
@@ -226,9 +234,17 @@ export async function * anthropic (envelope, deps = {}) {
226
234
  inputTokens,
227
235
  outputTokens: messageOutputTokens,
228
236
  thinkingTokens: estimatedThinkingTokens,
237
+ ...(cacheWriteTokens > 0 && { cacheWriteInputTokens: cacheWriteTokens }),
238
+ ...(cacheReadTokens > 0 && { cacheReadInputTokens: cacheReadTokens }),
229
239
  cost: costFor(
230
240
  catalogKey(envelope.model),
231
- { inputTokens, outputTokens: messageOutputTokens, thinkingTokens: estimatedThinkingTokens }
241
+ {
242
+ inputTokens,
243
+ outputTokens: messageOutputTokens,
244
+ thinkingTokens: estimatedThinkingTokens,
245
+ cacheWriteInputTokens: cacheWriteTokens,
246
+ cacheReadInputTokens: cacheReadTokens
247
+ }
232
248
  ),
233
249
  timestamps: { start, first: first ?? end, end }
234
250
  }
@@ -261,7 +277,9 @@ function safeParseJson (s) {
261
277
  /**
262
278
  * @param {import('#core/envelope.js').CallEnvelope} envelope
263
279
  * @param {Array<{role: string, content: any}>} conversation
264
- * @param {string} system
280
+ * @param {string | Array<{type: string, text: string, cache_control?: object}>} system
281
+ * Either a flat string (legacy) or an array of typed blocks with optional
282
+ * `cache_control` for prompt caching. The Anthropic SDK accepts both shapes.
265
283
  */
266
284
  function buildRequest (envelope, conversation, system) {
267
285
  const spec = getSpec(catalogKey(envelope.model))
@@ -273,7 +291,9 @@ function buildRequest (envelope, conversation, system) {
273
291
  max_tokens: envelope.outputBudget ?? outputTokenLimit ?? ANTHROPIC_DEFAULT_MAX_TOKENS,
274
292
  messages: conversation
275
293
  }
276
- if (system) request.system = system
294
+ if (typeof system === 'string' ? system : Array.isArray(system) && system.length) {
295
+ request.system = system
296
+ }
277
297
  if (envelope.tools?.length) {
278
298
  request.tools = toAnthropicTools(envelope.tools)
279
299
  }
@@ -314,11 +334,39 @@ function splitPrompt (prompt) {
314
334
  }
315
335
  /** @type {string[]} */
316
336
  const systemParts = []
337
+ /** @type {Array<{type: string, text: string, cache_control?: object}>} */
338
+ const systemBlocks = []
339
+ let hasCacheMarkers = false
317
340
  /** @type {Array<{role: string, content: any}>} */
318
341
  const conversation = []
319
342
  for (const m of prompt) {
320
343
  if (m.role === 'system') {
321
- systemParts.push(flattenText(m.content))
344
+ // Translate spore-style cache markers ({text, cache: '5m'|'1h'}) into
345
+ // Anthropic's cache_control. Preserves the block boundary that spore
346
+ // chose; collapsing into a single string would silently disable
347
+ // caching even when the upstream tier composed the prompt with
348
+ // explicit breakpoints.
349
+ if (Array.isArray(m.content)) {
350
+ for (const p of m.content) {
351
+ if (!p?.text) continue
352
+ const block = { type: 'text', text: p.text }
353
+ if (p.cache === '5m') {
354
+ block.cache_control = { type: 'ephemeral' }
355
+ hasCacheMarkers = true
356
+ } else if (p.cache === '1h') {
357
+ block.cache_control = { type: 'ephemeral', ttl: '1h' }
358
+ hasCacheMarkers = true
359
+ }
360
+ systemBlocks.push(block)
361
+ systemParts.push(p.text)
362
+ }
363
+ } else {
364
+ const text = flattenText(m.content)
365
+ if (text) {
366
+ systemBlocks.push({ type: 'text', text })
367
+ systemParts.push(text)
368
+ }
369
+ }
322
370
  } else if (m.role === 'tool') {
323
371
  // Tool results go in a user-role message with tool_result blocks.
324
372
  conversation.push({
@@ -351,7 +399,14 @@ function splitPrompt (prompt) {
351
399
  })
352
400
  }
353
401
  }
354
- return { system: systemParts.filter(Boolean).join('\n\n'), conversation }
402
+ // If cache markers are present, return the structured block array so
403
+ // buildRequest can pass it through to Anthropic's typed system field.
404
+ // Otherwise fall back to the legacy string concatenation path so
405
+ // non-cached calls don't change request shape.
406
+ const system = hasCacheMarkers
407
+ ? systemBlocks
408
+ : systemParts.filter(Boolean).join('\n\n')
409
+ return { system, conversation }
355
410
  }
356
411
 
357
412
  /** @param {string | import('#core/envelope.js').MessagePart[]} content */
@@ -76,6 +76,7 @@ export async function * openai (envelope, deps = {}) {
76
76
  let inputTokens = 0
77
77
  let outputTokens = 0
78
78
  let thinkingTokens = 0
79
+ let cachedInputTokens = 0
79
80
  let status = STATUS_COMPLETED
80
81
  /** @type {string | undefined} */
81
82
  let warning
@@ -130,6 +131,7 @@ export async function * openai (envelope, deps = {}) {
130
131
  inputTokens = event.response.usage.input_tokens ?? 0
131
132
  outputTokens = event.response.usage.output_tokens ?? 0
132
133
  thinkingTokens = event.response.usage.output_tokens_details?.reasoning_tokens ?? 0
134
+ cachedInputTokens = event.response.usage.input_tokens_details?.cached_tokens ?? 0
133
135
  }
134
136
  if (toolItems.size > 0) {
135
137
  status = STATUS_TOOL_USE
@@ -145,6 +147,7 @@ export async function * openai (envelope, deps = {}) {
145
147
  inputTokens = event.response.usage.input_tokens ?? 0
146
148
  outputTokens = event.response.usage.output_tokens ?? 0
147
149
  thinkingTokens = event.response.usage.output_tokens_details?.reasoning_tokens ?? 0
150
+ cachedInputTokens = event.response.usage.input_tokens_details?.cached_tokens ?? 0
148
151
  }
149
152
  break
150
153
 
@@ -174,18 +177,30 @@ export async function * openai (envelope, deps = {}) {
174
177
  // one from the other for the message-only count.
175
178
  const messageOutputTokens = Math.max(0, outputTokens - thinkingTokens)
176
179
 
180
+ // OpenAI counts cached_tokens as a SUBSET of input_tokens. Convert to
181
+ // mohdel's additive convention (cacheReadInputTokens is separate from
182
+ // inputTokens) by subtracting the cached portion before pricing. Both
183
+ // adapters and computeCost stay simpler with the additive shape.
184
+ const regularInputTokens = Math.max(0, inputTokens - cachedInputTokens)
185
+
177
186
  /** @type {import('#core/events.js').DoneEvent} */
178
187
  const done = {
179
188
  type: 'done',
180
189
  result: {
181
190
  status,
182
191
  output: currentOutput() || null,
183
- inputTokens,
192
+ inputTokens: regularInputTokens,
184
193
  outputTokens: messageOutputTokens,
185
194
  thinkingTokens,
195
+ ...(cachedInputTokens > 0 && { cacheReadInputTokens: cachedInputTokens }),
186
196
  cost: costFor(
187
197
  catalogKey(envelope.model),
188
- { inputTokens, outputTokens: messageOutputTokens, thinkingTokens }
198
+ {
199
+ inputTokens: regularInputTokens,
200
+ outputTokens: messageOutputTokens,
201
+ thinkingTokens,
202
+ cacheReadInputTokens: cachedInputTokens
203
+ }
189
204
  ),
190
205
  timestamps: { start, first: first ?? end, end }
191
206
  }
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "mohdel",
3
- "version": "0.103.0",
3
+ "version": "0.104.1",
4
4
  "license": "MIT",
5
5
  "author": {
6
6
  "name": "Christophe Le Bars",
@@ -87,12 +87,12 @@
87
87
  "@opentelemetry/exporter-trace-otlp-grpc": "^0.217.0",
88
88
  "@opentelemetry/sdk-node": "^0.217.0",
89
89
  "chalk": "^5.4.0",
90
- "mohdel-thin-gate-linux-x64-gnu": "0.103.0"
90
+ "mohdel-thin-gate-linux-x64-gnu": "0.104.1"
91
91
  },
92
92
  "dependencies": {
93
93
  "@anthropic-ai/sdk": "^0.95.1",
94
94
  "@cerebras/cerebras_cloud_sdk": "^1.61.1",
95
- "@google/genai": "^2.0.0",
95
+ "@google/genai": "^2.0.1",
96
96
  "@opentelemetry/api": "^1.9.1",
97
97
  "env-paths": "^4.0.0",
98
98
  "groq-sdk": "^1.1.2",