mohdel 0.102.0 → 0.104.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/js/core/events.js CHANGED
@@ -55,8 +55,21 @@
  * @property {(string|null)} output
  * Final text (null when `status === 'tool_use'` with no text).
  * @property {number} inputTokens
+ * Regular (non-cached) input tokens. For OpenAI/cerebras/fireworks, where
+ * `cached_tokens` is reported as a SUBSET of `prompt_tokens`, adapters
+ * subtract the cached portion before exposing it here so all providers
+ * produce additive token shapes.
  * @property {number} outputTokens
  * @property {number} thinkingTokens
+ * @property {number} [cacheWriteInputTokens]
+ * Tokens written to a fresh prompt cache breakpoint, billed at
+ * `cacheWritePrice` (typically 1.25× input on Anthropic). Absent on
+ * providers that don't surface this counter (OpenAI doesn't separately
+ * bill cache writes).
+ * @property {number} [cacheReadInputTokens]
+ * Tokens served from prompt cache, billed at `cacheReadPrice` (typically
+ * 0.1× input). Set by Anthropic directly and by OpenAI-shape adapters
+ * after subset→additive normalization of `prompt_tokens_details.cached_tokens`.
  * @property {number} cost
  * USD, computed from curated pricing. Single number (not a breakdown).
  * @property {Timestamps} timestamps
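
The subset→additive normalization these doc comments describe is a single subtraction. A minimal sketch with made-up numbers (the `usage` payload shape is OpenAI's; the values are hypothetical):

```js
// Hypothetical OpenAI-shape usage: cached_tokens is a SUBSET of prompt_tokens.
const usage = { prompt_tokens: 1000, prompt_tokens_details: { cached_tokens: 800 } }

// mohdel's additive shape: 200 regular + 800 cached covers all 1000 prompt tokens.
const cacheReadInputTokens = usage.prompt_tokens_details?.cached_tokens ?? 0
const inputTokens = Math.max(0, usage.prompt_tokens - cacheReadInputTokens)
console.log({ inputTokens, cacheReadInputTokens }) // { inputTokens: 200, cacheReadInputTokens: 800 }
```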
@@ -258,10 +258,15 @@ async function * runStreaming (envelope, client, args, config, start, deps) {
  */
 function finalize ({ envelope, content, toolCalls, usage, finishReason, start, first, reasoning = null }) {
   const end = String(process.hrtime.bigint())
-  const inputTokens = usage.prompt_tokens || 0
+  const totalInputTokens = usage.prompt_tokens || 0
   const totalOutputTokens = usage.completion_tokens || 0
   const thinkingTokens = usage.completion_tokens_details?.reasoning_tokens || 0
+  const cachedInputTokens = usage.prompt_tokens_details?.cached_tokens || 0
   const visibleOutputTokens = Math.max(0, totalOutputTokens - thinkingTokens)
+  // OpenAI-shape APIs (cerebras/fireworks/...) report cached_tokens as a
+  // SUBSET of prompt_tokens. Convert to mohdel's additive convention so
+  // computeCost prices the cached portion at cacheReadPrice.
+  const inputTokens = Math.max(0, totalInputTokens - cachedInputTokens)

   const truncated = finishReason === 'length'
   let status = truncated ? STATUS_INCOMPLETE : STATUS_COMPLETED
@@ -276,9 +281,15 @@ function finalize ({ envelope, content, toolCalls, usage, finishReason, start, first, reasoning = null }) {
       inputTokens,
       outputTokens: visibleOutputTokens,
       thinkingTokens,
+      ...(cachedInputTokens > 0 && { cacheReadInputTokens: cachedInputTokens }),
       cost: costFor(
         catalogKey(envelope.model),
-        { inputTokens, outputTokens: visibleOutputTokens, thinkingTokens }
+        {
+          inputTokens,
+          outputTokens: visibleOutputTokens,
+          thinkingTokens,
+          cacheReadInputTokens: cachedInputTokens
+        }
       ),
       timestamps: { start, first: first ?? end, end }
     }
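
Note the conditional spread used above: `...(cond && { key: value })` contributes nothing when `cond` is false, because spreading a non-object primitive into an object literal is a no-op. That keeps `cacheReadInputTokens` absent (rather than `0`) on non-cached calls:

```js
const cachedInputTokens = 0
const result = {
  inputTokens: 200,
  ...(cachedInputTokens > 0 && { cacheReadInputTokens: cachedInputTokens })
}
console.log('cacheReadInputTokens' in result) // false: the key is omitted entirely
```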
@@ -14,8 +14,8 @@ import { getSpec, setCatalog } from './_catalog.js'
 /**
  * Pure cost computation from spec + usage.
  *
- * Each price field (`inputPrice` / `outputPrice` / `thinkingPrice`) is
- * one of:
+ * Each price field (`inputPrice` / `outputPrice` / `thinkingPrice` /
+ * `cacheWritePrice` / `cacheReadPrice`) is one of:
  *
  * - a `number` — flat per-million rate; or
  * - an object `{">N": number, ..., "default": number}` — tiered.
@@ -24,12 +24,23 @@ import { getSpec, setCatalog } from './_catalog.js'
  * nothing matches. Keys that aren't `">N"` or `"default"` are
  * ignored. `>` is strict — at exactly N, the default is used.
  *
- * `thinkingPrice` is optional and falls back to the resolved
- * `outputPrice` when absent.
+ * Optional fields fall back to other prices when absent:
+ * - `thinkingPrice` → `outputPrice`
+ * - `cacheWritePrice` → `inputPrice` (graceful for non-caching providers)
+ * - `cacheReadPrice` → `inputPrice`
  *
- * @param {any} spec Catalog entry (with `inputPrice`/`outputPrice`/`thinkingPrice`),
- * or `undefined`.
- * @param {{inputTokens?: number, outputTokens?: number, thinkingTokens?: number}} usage
+ * Token-counting conventions:
+ * - Anthropic reports `cache_creation_input_tokens` and `cache_read_input_tokens`
+ *   as ADDITIONAL to `input_tokens` (separately billable). The adapter
+ *   surfaces them as `cacheWriteInputTokens` / `cacheReadInputTokens`
+ *   (write/read pair, matching catalog `cacheWritePrice`/`cacheReadPrice`).
+ * - OpenAI reports `prompt_tokens_details.cached_tokens` as a SUBSET of
+ *   `prompt_tokens` (already counted). Adapters subtract before passing
+ *   `inputTokens` to keep this function additive across providers.
+ *
+ * @param {any} spec Catalog entry, or `undefined`.
+ * @param {{inputTokens?: number, outputTokens?: number, thinkingTokens?: number,
+ *          cacheWriteInputTokens?: number, cacheReadInputTokens?: number}} usage
  * @returns {number}
  */
 export function computeCost (spec, usage) {
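
For readers without the source at hand, here is a sketch of tier resolution consistent with the doc comment above. This is a reconstruction from the documented `">N"`/`"default"` semantics, not the package's actual `resolveTier`:

```js
// Flat numeric prices pass through; tiered objects pick the highest ">N"
// threshold strictly below the input-token count, else "default"; other
// keys are ignored; an unresolvable price yields undefined.
function resolveTier (price, inputTokens) {
  if (typeof price === 'number') return price
  if (!price || typeof price !== 'object') return undefined
  let best
  for (const [key, rate] of Object.entries(price)) {
    if (!key.startsWith('>')) continue
    const n = Number(key.slice(1))
    if (Number.isFinite(n) && inputTokens > n && (!best || n > best.n)) best = { n, rate }
  }
  if (best) return best.rate
  return typeof price.default === 'number' ? price.default : undefined
}

resolveTier(3, 50_000)                             // 3 (flat rate)
resolveTier({ '>200000': 6, default: 3 }, 200_000) // 3 ('>' is strict at exactly N)
resolveTier({ '>200000': 6, default: 3 }, 200_001) // 6
```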
@@ -37,12 +48,18 @@ export function computeCost (spec, usage) {
   const i = usage.inputTokens ?? 0
   const o = usage.outputTokens ?? 0
   const t = usage.thinkingTokens ?? 0
+  const cw = usage.cacheWriteInputTokens ?? 0
+  const cr = usage.cacheReadInputTokens ?? 0
   const ip = resolveTier(spec.inputPrice, i)
   const op = resolveTier(spec.outputPrice, i)
   if (typeof ip !== 'number' || typeof op !== 'number') return 0
   const tp = resolveTier(spec.thinkingPrice, i)
   const tpFinal = typeof tp === 'number' ? tp : op
-  const total = (i * ip + o * op + t * tpFinal) / 1_000_000
+  const cwp = resolveTier(spec.cacheWritePrice, i)
+  const cwpFinal = typeof cwp === 'number' ? cwp : ip
+  const crp = resolveTier(spec.cacheReadPrice, i)
+  const crpFinal = typeof crp === 'number' ? crp : ip
+  const total = (i * ip + cw * cwpFinal + cr * crpFinal + o * op + t * tpFinal) / 1_000_000
   return round(total)
 }

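A worked example of the extended formula, using illustrative prices (not the catalog's) shaped like Anthropic's 1.25×-write / 0.1×-read multipliers:

```js
// $3/M input, $15/M output; cache write at 1.25x input, cache read at 0.1x input.
const spec = { inputPrice: 3, outputPrice: 15, cacheWritePrice: 3.75, cacheReadPrice: 0.3 }
const usage = {
  inputTokens: 1_000,           // regular (non-cached) prompt tokens
  cacheWriteInputTokens: 4_000, // written to a fresh cache breakpoint
  cacheReadInputTokens: 50_000, // served from cache at the discounted rate
  outputTokens: 2_000
}
// (1000*3 + 4000*3.75 + 50000*0.3 + 2000*15) / 1_000_000 = 0.063
computeCost(spec, usage) // 0.063
```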
@@ -113,6 +113,8 @@ export async function * anthropic (envelope, deps = {}) {
   const currentOutput = () => outputParts.join('')
   let inputTokens = 0
   let outputTokens = 0
+  let cacheWriteTokens = 0
+  let cacheReadTokens = 0
   let thinkingChars = 0
   let status = STATUS_COMPLETED
   /** @type {string | undefined} */
@@ -135,6 +137,12 @@ export async function * anthropic (envelope, deps = {}) {
         if (event.message?.usage?.input_tokens) {
           inputTokens = event.message.usage.input_tokens
         }
+        if (event.message?.usage?.cache_creation_input_tokens) {
+          cacheWriteTokens = event.message.usage.cache_creation_input_tokens
+        }
+        if (event.message?.usage?.cache_read_input_tokens) {
+          cacheReadTokens = event.message.usage.cache_read_input_tokens
+        }
         break

       case 'content_block_start':
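
For orientation, the usage block consumed above arrives on the stream's `message_start` event; on a cache hit it looks roughly like this (field names as read by the code, values invented):

```js
const event = {
  type: 'message_start',
  message: {
    usage: {
      input_tokens: 42,                // regular prompt tokens
      cache_creation_input_tokens: 0,  // ADDITIONAL tokens written to cache this call
      cache_read_input_tokens: 18000   // ADDITIONAL tokens served from cache
    }
  }
}
```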
@@ -203,12 +211,18 @@ export async function * anthropic (envelope, deps = {}) {
   }

   const end = String(process.hrtime.bigint())
-  // Estimate thinking tokens from streamed thinking_delta char count
-  // (Anthropic API doesn't report them separately). Cap at total
-  // output tokens reported by usage.
+  // Estimate thinking tokens. Primary path: count streamed thinking_delta
+  // chars (sonnet emits these). Fallback: gap between Anthropic's reported
+  // output_tokens and what actually streamed as visible output (text +
+  // tool input JSON) — catches redacted_thinking blocks (opus 4.7 default)
+  // that consume output tokens but emit no streaming deltas.
+  const streamedOutput = currentOutput()
+  const streamedOutputChars = streamedOutput.length +
+    [...toolBlocks.values()].reduce((s, b) => s + b.inputJson.length, 0)
+  const streamedOutputTokens = Math.ceil(streamedOutputChars / ANTHROPIC_THINKING_CHARS_PER_TOKEN)
   const estimatedThinkingTokens = thinkingChars > 0
     ? Math.min(Math.ceil(thinkingChars / ANTHROPIC_THINKING_CHARS_PER_TOKEN), outputTokens)
-    : 0
+    : Math.max(0, outputTokens - streamedOutputTokens)
   const messageOutputTokens = Math.max(0, outputTokens - estimatedThinkingTokens)

   /** @type {import('#core/events.js').DoneEvent} */
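
Concretely, the fallback arithmetic (made-up numbers; `ANTHROPIC_THINKING_CHARS_PER_TOKEN = 4` is assumed here for illustration):

```js
// Anthropic reports 900 output tokens, no thinking_delta events streamed
// (thinkingChars === 0), and 1200 chars of text + tool-input JSON arrived.
const streamedOutputTokens = Math.ceil(1200 / 4)       // 300
const estimatedThinkingTokens = Math.max(0, 900 - 300) // 600, attributed to redacted thinking
const messageOutputTokens = Math.max(0, 900 - 600)     // 300
```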
@@ -216,13 +230,21 @@ export async function * anthropic (envelope, deps = {}) {
     type: 'done',
     result: {
       status,
-      output: currentOutput() || null,
+      output: streamedOutput || null,
       inputTokens,
       outputTokens: messageOutputTokens,
       thinkingTokens: estimatedThinkingTokens,
+      ...(cacheWriteTokens > 0 && { cacheWriteInputTokens: cacheWriteTokens }),
+      ...(cacheReadTokens > 0 && { cacheReadInputTokens: cacheReadTokens }),
       cost: costFor(
         catalogKey(envelope.model),
-        { inputTokens, outputTokens: messageOutputTokens, thinkingTokens: estimatedThinkingTokens }
+        {
+          inputTokens,
+          outputTokens: messageOutputTokens,
+          thinkingTokens: estimatedThinkingTokens,
+          cacheWriteInputTokens: cacheWriteTokens,
+          cacheReadInputTokens: cacheReadTokens
+        }
       ),
       timestamps: { start, first: first ?? end, end }
     }
@@ -255,7 +277,9 @@ function safeParseJson (s) {
 /**
  * @param {import('#core/envelope.js').CallEnvelope} envelope
  * @param {Array<{role: string, content: any}>} conversation
- * @param {string} system
+ * @param {string | Array<{type: string, text: string, cache_control?: object}>} system
+ *   Either a flat string (legacy) or an array of typed blocks with optional
+ *   `cache_control` for prompt caching. The Anthropic SDK accepts both shapes.
  */
 function buildRequest (envelope, conversation, system) {
   const spec = getSpec(catalogKey(envelope.model))
@@ -267,7 +291,9 @@ function buildRequest (envelope, conversation, system) {
     max_tokens: envelope.outputBudget ?? outputTokenLimit ?? ANTHROPIC_DEFAULT_MAX_TOKENS,
     messages: conversation
   }
-  if (system) request.system = system
+  if (typeof system === 'string' ? system : Array.isArray(system) && system.length) {
+    request.system = system
+  }
   if (envelope.tools?.length) {
     request.tools = toAnthropicTools(envelope.tools)
   }
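
The two shapes `request.system` can now take, per the widened signature. A sketch; `LARGE_STABLE_PREFIX` is a placeholder, and the block form is what the caching path below produces:

```js
const request = {}
const LARGE_STABLE_PREFIX = '...long, shared instructions...' // placeholder content

// Legacy: flat string.
request.system = 'You are a careful assistant.'

// Structured: typed blocks, with a cache breakpoint after the stable prefix.
request.system = [
  { type: 'text', text: LARGE_STABLE_PREFIX, cache_control: { type: 'ephemeral' } },
  { type: 'text', text: 'Session-specific instructions.' }
]
```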
@@ -308,11 +334,39 @@ function splitPrompt (prompt) {
   }
   /** @type {string[]} */
   const systemParts = []
+  /** @type {Array<{type: string, text: string, cache_control?: object}>} */
+  const systemBlocks = []
+  let hasCacheMarkers = false
   /** @type {Array<{role: string, content: any}>} */
   const conversation = []
   for (const m of prompt) {
     if (m.role === 'system') {
-      systemParts.push(flattenText(m.content))
+      // Translate spore-style cache markers ({text, cache: '5m'|'1h'}) into
+      // Anthropic's cache_control. Preserves the block boundary that spore
+      // chose; collapsing into a single string would silently disable
+      // caching even when the upstream tier composed the prompt with
+      // explicit breakpoints.
+      if (Array.isArray(m.content)) {
+        for (const p of m.content) {
+          if (!p?.text) continue
+          const block = { type: 'text', text: p.text }
+          if (p.cache === '5m') {
+            block.cache_control = { type: 'ephemeral' }
+            hasCacheMarkers = true
+          } else if (p.cache === '1h') {
+            block.cache_control = { type: 'ephemeral', ttl: '1h' }
+            hasCacheMarkers = true
+          }
+          systemBlocks.push(block)
+          systemParts.push(p.text)
+        }
+      } else {
+        const text = flattenText(m.content)
+        if (text) {
+          systemBlocks.push({ type: 'text', text })
+          systemParts.push(text)
+        }
+      }
     } else if (m.role === 'tool') {
       // Tool results go in a user-role message with tool_result blocks.
       conversation.push({
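
So a spore-style system message like this hypothetical input comes out as two blocks, the first carrying the breakpoint:

```js
const m = {
  role: 'system',
  content: [
    { text: 'Big shared preamble…', cache: '5m' }, // breakpoint lands after this block
    { text: 'Per-run details.' }
  ]
}
// Resulting systemBlocks:
// [ { type: 'text', text: 'Big shared preamble…', cache_control: { type: 'ephemeral' } },
//   { type: 'text', text: 'Per-run details.' } ]
```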
@@ -345,7 +399,14 @@ function splitPrompt (prompt) {
       })
     }
   }
-  return { system: systemParts.filter(Boolean).join('\n\n'), conversation }
+  // If cache markers are present, return the structured block array so
+  // buildRequest can pass it through to Anthropic's typed system field.
+  // Otherwise fall back to the legacy string concatenation path so
+  // non-cached calls don't change request shape.
+  const system = hasCacheMarkers
+    ? systemBlocks
+    : systemParts.filter(Boolean).join('\n\n')
+  return { system, conversation }
 }

 /** @param {string | import('#core/envelope.js').MessagePart[]} content */
@@ -76,6 +76,7 @@ export async function * openai (envelope, deps = {}) {
   let inputTokens = 0
   let outputTokens = 0
   let thinkingTokens = 0
+  let cachedInputTokens = 0
   let status = STATUS_COMPLETED
   /** @type {string | undefined} */
   let warning
@@ -130,6 +131,7 @@ export async function * openai (envelope, deps = {}) {
         inputTokens = event.response.usage.input_tokens ?? 0
         outputTokens = event.response.usage.output_tokens ?? 0
         thinkingTokens = event.response.usage.output_tokens_details?.reasoning_tokens ?? 0
+        cachedInputTokens = event.response.usage.input_tokens_details?.cached_tokens ?? 0
       }
       if (toolItems.size > 0) {
         status = STATUS_TOOL_USE
@@ -145,6 +147,7 @@ export async function * openai (envelope, deps = {}) {
         inputTokens = event.response.usage.input_tokens ?? 0
         outputTokens = event.response.usage.output_tokens ?? 0
         thinkingTokens = event.response.usage.output_tokens_details?.reasoning_tokens ?? 0
+        cachedInputTokens = event.response.usage.input_tokens_details?.cached_tokens ?? 0
       }
       break

@@ -174,18 +177,30 @@ export async function * openai (envelope, deps = {}) {
   // one from the other for the message-only count.
   const messageOutputTokens = Math.max(0, outputTokens - thinkingTokens)

+  // OpenAI counts cached_tokens as a SUBSET of input_tokens. Convert to
+  // mohdel's additive convention (cacheReadInputTokens is separate from
+  // inputTokens) by subtracting the cached portion before pricing. Both
+  // adapters and computeCost stay simpler with the additive shape.
+  const regularInputTokens = Math.max(0, inputTokens - cachedInputTokens)
+
   /** @type {import('#core/events.js').DoneEvent} */
   const done = {
     type: 'done',
     result: {
       status,
       output: currentOutput() || null,
-      inputTokens,
+      inputTokens: regularInputTokens,
       outputTokens: messageOutputTokens,
       thinkingTokens,
+      ...(cachedInputTokens > 0 && { cacheReadInputTokens: cachedInputTokens }),
       cost: costFor(
         catalogKey(envelope.model),
-        { inputTokens, outputTokens: messageOutputTokens, thinkingTokens }
+        {
+          inputTokens: regularInputTokens,
+          outputTokens: messageOutputTokens,
+          thinkingTokens,
+          cacheReadInputTokens: cachedInputTokens
+        }
       ),
       timestamps: { start, first: first ?? end, end }
     }
package/js/session/run.js CHANGED
@@ -264,12 +264,16 @@ export async function * run (envelope, {
264
264
  function normalizeModelEffort (envelope, resolveSpec) {
265
265
  const candidate = effortOf(envelope.model)
266
266
  if (!candidate) return { envelope }
267
- if (envelope.outputEffort) return { envelope } // explicit wins
268
267
 
269
268
  const base = catalogKey(envelope.model)
270
269
  const baseSpec = resolveSpec(base)
271
270
  if (!baseSpec) return { envelope } // base not known — let full string fall through to not-found
272
271
 
272
+ // Explicit outputEffort wins; still strip the suffix so spans/logs see the canonical id.
273
+ if (envelope.outputEffort) {
274
+ return { envelope: { ...envelope, model: base } }
275
+ }
276
+
273
277
  if (!baseSpec.thinkingEffortLevels) {
274
278
  return {
275
279
  envelope,
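
Sketch of the behavioral change (the `:high` suffix syntax and model id are assumptions for illustration; the real suffix is whatever `effortOf` parses):

```js
// Before 0.104.0: an explicit outputEffort returned the envelope untouched,
// so spans/logs kept the suffixed model id. Now the suffix is stripped:
const out = normalizeModelEffort({ model: 'some-model:high', outputEffort: 'low' }, resolveSpec)
// out: { envelope: { model: 'some-model', outputEffort: 'low' } }
```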
package/package.json CHANGED
@@ -1,6 +1,6 @@
 {
   "name": "mohdel",
-  "version": "0.102.0",
+  "version": "0.104.0",
   "license": "MIT",
   "author": {
     "name": "Christophe Le Bars",
@@ -87,12 +87,12 @@
     "@opentelemetry/exporter-trace-otlp-grpc": "^0.217.0",
     "@opentelemetry/sdk-node": "^0.217.0",
     "chalk": "^5.4.0",
-    "mohdel-thin-gate-linux-x64-gnu": "0.102.0"
+    "mohdel-thin-gate-linux-x64-gnu": "0.104.0"
   },
   "dependencies": {
     "@anthropic-ai/sdk": "^0.95.1",
     "@cerebras/cerebras_cloud_sdk": "^1.61.1",
-    "@google/genai": "^2.0.0",
+    "@google/genai": "^2.0.1",
     "@opentelemetry/api": "^1.9.1",
     "env-paths": "^4.0.0",
     "groq-sdk": "^1.1.2",