mohdel 0.102.0 → 0.104.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/js/core/events.js CHANGED
@@ -55,8 +55,21 @@
  * @property {(string|null)} output
  * Final text (null when `status === 'tool_use'` with no text).
  * @property {number} inputTokens
+ * Regular (non-cached) input tokens. For OpenAI/cerebras/fireworks, where
+ * `cached_tokens` is reported as a SUBSET of `prompt_tokens`, adapters
+ * subtract the cached portion before exposing it here so all providers
+ * produce additive token shapes.
  * @property {number} outputTokens
  * @property {number} thinkingTokens
+ * @property {number} [cacheWriteInputTokens]
+ * Tokens written to a fresh prompt cache breakpoint, billed at
+ * `cacheWritePrice` (typically 1.25× input on Anthropic). Absent on
+ * providers that don't surface this counter (OpenAI doesn't separately
+ * bill cache writes).
+ * @property {number} [cacheReadInputTokens]
+ * Tokens served from prompt cache, billed at `cacheReadPrice` (typically
+ * 0.1× input). Set by Anthropic directly and by OpenAI-shape adapters
+ * after subset→additive normalization of `prompt_tokens_details.cached_tokens`.
  * @property {number} cost
  * USD, computed from curated pricing. Single number (not a breakdown).
  * @property {Timestamps} timestamps
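
The subset→additive normalization these doc comments describe is a single subtraction. A minimal sketch with made-up numbers (the `usage` payload shape is OpenAI's; the values are hypothetical):

```js
// Hypothetical OpenAI-shape usage: cached_tokens is a SUBSET of prompt_tokens.
const usage = { prompt_tokens: 1000, prompt_tokens_details: { cached_tokens: 800 } }

// mohdel's additive shape: 200 regular + 800 cached covers all 1000 prompt tokens.
const cacheReadInputTokens = usage.prompt_tokens_details?.cached_tokens ?? 0
const inputTokens = Math.max(0, usage.prompt_tokens - cacheReadInputTokens)
console.log({ inputTokens, cacheReadInputTokens }) // { inputTokens: 200, cacheReadInputTokens: 800 }
```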
@@ -258,10 +258,15 @@ async function * runStreaming (envelope, client, args, config, start, deps) {
  */
 function finalize ({ envelope, content, toolCalls, usage, finishReason, start, first, reasoning = null }) {
   const end = String(process.hrtime.bigint())
-  const inputTokens = usage.prompt_tokens || 0
+  const totalInputTokens = usage.prompt_tokens || 0
   const totalOutputTokens = usage.completion_tokens || 0
   const thinkingTokens = usage.completion_tokens_details?.reasoning_tokens || 0
+  const cachedInputTokens = usage.prompt_tokens_details?.cached_tokens || 0
   const visibleOutputTokens = Math.max(0, totalOutputTokens - thinkingTokens)
+  // OpenAI-shape APIs (cerebras/fireworks/...) report cached_tokens as a
+  // SUBSET of prompt_tokens. Convert to mohdel's additive convention so
+  // computeCost prices the cached portion at cacheReadPrice.
+  const inputTokens = Math.max(0, totalInputTokens - cachedInputTokens)

   const truncated = finishReason === 'length'
   let status = truncated ? STATUS_INCOMPLETE : STATUS_COMPLETED
@@ -276,9 +281,15 @@ function finalize ({ envelope, content, toolCalls, usage, finishReason, start, first, reasoning = null }) {
       inputTokens,
       outputTokens: visibleOutputTokens,
       thinkingTokens,
+      ...(cachedInputTokens > 0 && { cacheReadInputTokens: cachedInputTokens }),
       cost: costFor(
         catalogKey(envelope.model),
-        { inputTokens, outputTokens: visibleOutputTokens, thinkingTokens }
+        {
+          inputTokens,
+          outputTokens: visibleOutputTokens,
+          thinkingTokens,
+          cacheReadInputTokens: cachedInputTokens
+        }
       ),
       timestamps: { start, first: first ?? end, end }
     }
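
Note the conditional spread used above: `...(cond && { key: value })` contributes nothing when `cond` is false, because spreading a non-object primitive into an object literal is a no-op. That keeps `cacheReadInputTokens` absent (rather than `0`) on non-cached calls:

```js
const cachedInputTokens = 0
const result = {
  inputTokens: 200,
  ...(cachedInputTokens > 0 && { cacheReadInputTokens: cachedInputTokens })
}
console.log('cacheReadInputTokens' in result) // false: the key is omitted entirely
```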
@@ -14,8 +14,8 @@ import { getSpec, setCatalog } from './_catalog.js'
 /**
  * Pure cost computation from spec + usage.
  *
- * Each price field (`inputPrice` / `outputPrice` / `thinkingPrice`) is
- * one of:
+ * Each price field (`inputPrice` / `outputPrice` / `thinkingPrice` /
+ * `cacheWritePrice` / `cacheReadPrice`) is one of:
  *
  * - a `number` — flat per-million rate; or
  * - an object `{">N": number, ..., "default": number}` — tiered.
@@ -24,12 +24,23 @@ import { getSpec, setCatalog } from './_catalog.js'
  * nothing matches. Keys that aren't `">N"` or `"default"` are
  * ignored. `>` is strict — at exactly N, the default is used.
  *
- * `thinkingPrice` is optional and falls back to the resolved
- * `outputPrice` when absent.
+ * Optional fields fall back to other prices when absent:
+ * - `thinkingPrice` → `outputPrice`
+ * - `cacheWritePrice` → `inputPrice` (graceful for non-caching providers)
+ * - `cacheReadPrice` → `inputPrice`
  *
- * @param {any} spec Catalog entry (with `inputPrice`/`outputPrice`/`thinkingPrice`),
- * or `undefined`.
- * @param {{inputTokens?: number, outputTokens?: number, thinkingTokens?: number}} usage
+ * Token-counting conventions:
+ * - Anthropic reports `cache_creation_input_tokens` and `cache_read_input_tokens`
+ *   as ADDITIONAL to `input_tokens` (separately billable). The adapter
+ *   surfaces them as `cacheWriteInputTokens` / `cacheReadInputTokens`
+ *   (write/read pair, matching catalog `cacheWritePrice`/`cacheReadPrice`).
+ * - OpenAI reports `prompt_tokens_details.cached_tokens` as a SUBSET of
+ *   `prompt_tokens` (already counted). Adapters subtract before passing
+ *   `inputTokens` to keep this function additive across providers.
+ *
+ * @param {any} spec Catalog entry, or `undefined`.
+ * @param {{inputTokens?: number, outputTokens?: number, thinkingTokens?: number,
+ *          cacheWriteInputTokens?: number, cacheReadInputTokens?: number}} usage
  * @returns {number}
  */
 export function computeCost (spec, usage) {
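
For readers without the source at hand, here is a sketch of tier resolution consistent with the doc comment above. This is a reconstruction from the documented `">N"`/`"default"` semantics, not the package's actual `resolveTier`:

```js
// Flat numeric prices pass through; tiered objects pick the highest ">N"
// threshold strictly below the input-token count, else "default"; other
// keys are ignored; an unresolvable price yields undefined.
function resolveTier (price, inputTokens) {
  if (typeof price === 'number') return price
  if (!price || typeof price !== 'object') return undefined
  let best
  for (const [key, rate] of Object.entries(price)) {
    if (!key.startsWith('>')) continue
    const n = Number(key.slice(1))
    if (Number.isFinite(n) && inputTokens > n && (!best || n > best.n)) best = { n, rate }
  }
  if (best) return best.rate
  return typeof price.default === 'number' ? price.default : undefined
}

resolveTier(3, 50_000)                             // 3 (flat rate)
resolveTier({ '>200000': 6, default: 3 }, 200_000) // 3 ('>' is strict at exactly N)
resolveTier({ '>200000': 6, default: 3 }, 200_001) // 6
```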
@@ -37,12 +48,18 @@ export function computeCost (spec, usage) {
   const i = usage.inputTokens ?? 0
   const o = usage.outputTokens ?? 0
   const t = usage.thinkingTokens ?? 0
+  const cw = usage.cacheWriteInputTokens ?? 0
+  const cr = usage.cacheReadInputTokens ?? 0
   const ip = resolveTier(spec.inputPrice, i)
   const op = resolveTier(spec.outputPrice, i)
   if (typeof ip !== 'number' || typeof op !== 'number') return 0
   const tp = resolveTier(spec.thinkingPrice, i)
   const tpFinal = typeof tp === 'number' ? tp : op
-  const total = (i * ip + o * op + t * tpFinal) / 1_000_000
+  const cwp = resolveTier(spec.cacheWritePrice, i)
+  const cwpFinal = typeof cwp === 'number' ? cwp : ip
+  const crp = resolveTier(spec.cacheReadPrice, i)
+  const crpFinal = typeof crp === 'number' ? crp : ip
+  const total = (i * ip + cw * cwpFinal + cr * crpFinal + o * op + t * tpFinal) / 1_000_000
   return round(total)
 }

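A worked example of the extended formula, using illustrative prices (not the catalog's) shaped like Anthropic's 1.25×-write / 0.1×-read multipliers:

```js
// $3/M input, $15/M output; cache write at 1.25x input, cache read at 0.1x input.
const spec = { inputPrice: 3, outputPrice: 15, cacheWritePrice: 3.75, cacheReadPrice: 0.3 }
const usage = {
  inputTokens: 1_000,           // regular (non-cached) prompt tokens
  cacheWriteInputTokens: 4_000, // written to a fresh cache breakpoint
  cacheReadInputTokens: 50_000, // served from cache at the discounted rate
  outputTokens: 2_000
}
// (1000*3 + 4000*3.75 + 50000*0.3 + 2000*15) / 1_000_000 = 0.063
computeCost(spec, usage) // 0.063
```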
@@ -113,6 +113,8 @@ export async function * anthropic (envelope, deps = {}) {
   const currentOutput = () => outputParts.join('')
   let inputTokens = 0
   let outputTokens = 0
+  let cacheWriteTokens = 0
+  let cacheReadTokens = 0
   let thinkingChars = 0
   let status = STATUS_COMPLETED
   /** @type {string | undefined} */
@@ -135,6 +137,12 @@ export async function * anthropic (envelope, deps = {}) {
         if (event.message?.usage?.input_tokens) {
           inputTokens = event.message.usage.input_tokens
         }
+        if (event.message?.usage?.cache_creation_input_tokens) {
+          cacheWriteTokens = event.message.usage.cache_creation_input_tokens
+        }
+        if (event.message?.usage?.cache_read_input_tokens) {
+          cacheReadTokens = event.message.usage.cache_read_input_tokens
+        }
         break

       case 'content_block_start':
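
For orientation, the usage block consumed above arrives on the stream's `message_start` event; on a cache hit it looks roughly like this (field names as read by the code, values invented):

```js
const event = {
  type: 'message_start',
  message: {
    usage: {
      input_tokens: 42,                // regular prompt tokens
      cache_creation_input_tokens: 0,  // ADDITIONAL tokens written to cache this call
      cache_read_input_tokens: 18000   // ADDITIONAL tokens served from cache
    }
  }
}
```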
@@ -203,12 +211,18 @@ export async function * anthropic (envelope, deps = {}) {
   }

   const end = String(process.hrtime.bigint())
-  // Estimate thinking tokens from streamed thinking_delta char count
-  // (Anthropic API doesn't report them separately). Cap at total
-  // output tokens reported by usage.
+  // Estimate thinking tokens. Primary path: count streamed thinking_delta
+  // chars (sonnet emits these). Fallback: gap between Anthropic's reported
+  // output_tokens and what actually streamed as visible output (text +
+  // tool input JSON) — catches redacted_thinking blocks (opus 4.7 default)
+  // that consume output tokens but emit no streaming deltas.
+  const streamedOutput = currentOutput()
+  const streamedOutputChars = streamedOutput.length +
+    [...toolBlocks.values()].reduce((s, b) => s + b.inputJson.length, 0)
+  const streamedOutputTokens = Math.ceil(streamedOutputChars / ANTHROPIC_THINKING_CHARS_PER_TOKEN)
   const estimatedThinkingTokens = thinkingChars > 0
     ? Math.min(Math.ceil(thinkingChars / ANTHROPIC_THINKING_CHARS_PER_TOKEN), outputTokens)
-    : 0
+    : Math.max(0, outputTokens - streamedOutputTokens)
   const messageOutputTokens = Math.max(0, outputTokens - estimatedThinkingTokens)

   /** @type {import('#core/events.js').DoneEvent} */
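
Concretely, the fallback arithmetic (made-up numbers; `ANTHROPIC_THINKING_CHARS_PER_TOKEN = 4` is assumed here for illustration):

```js
// Anthropic reports 900 output tokens, no thinking_delta events streamed
// (thinkingChars === 0), and 1200 chars of text + tool-input JSON arrived.
const streamedOutputTokens = Math.ceil(1200 / 4)       // 300
const estimatedThinkingTokens = Math.max(0, 900 - 300) // 600, attributed to redacted thinking
const messageOutputTokens = Math.max(0, 900 - 600)     // 300
```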
@@ -216,13 +230,21 @@ export async function * anthropic (envelope, deps = {}) {
     type: 'done',
     result: {
       status,
-      output: currentOutput() || null,
+      output: streamedOutput || null,
       inputTokens,
       outputTokens: messageOutputTokens,
       thinkingTokens: estimatedThinkingTokens,
+      ...(cacheWriteTokens > 0 && { cacheWriteInputTokens: cacheWriteTokens }),
+      ...(cacheReadTokens > 0 && { cacheReadInputTokens: cacheReadTokens }),
       cost: costFor(
         catalogKey(envelope.model),
-        { inputTokens, outputTokens: messageOutputTokens, thinkingTokens: estimatedThinkingTokens }
+        {
+          inputTokens,
+          outputTokens: messageOutputTokens,
+          thinkingTokens: estimatedThinkingTokens,
+          cacheWriteInputTokens: cacheWriteTokens,
+          cacheReadInputTokens: cacheReadTokens
+        }
       ),
       timestamps: { start, first: first ?? end, end }
     }
@@ -255,7 +277,9 @@ function safeParseJson (s) {
 /**
  * @param {import('#core/envelope.js').CallEnvelope} envelope
  * @param {Array<{role: string, content: any}>} conversation
- * @param {string} system
+ * @param {string | Array<{type: string, text: string, cache_control?: object}>} system
+ *   Either a flat string (legacy) or an array of typed blocks with optional
+ *   `cache_control` for prompt caching. The Anthropic SDK accepts both shapes.
  */
 function buildRequest (envelope, conversation, system) {
   const spec = getSpec(catalogKey(envelope.model))
@@ -267,7 +291,9 @@ function buildRequest (envelope, conversation, system) {
     max_tokens: envelope.outputBudget ?? outputTokenLimit ?? ANTHROPIC_DEFAULT_MAX_TOKENS,
     messages: conversation
   }
-  if (system) request.system = system
+  if (typeof system === 'string' ? system : Array.isArray(system) && system.length) {
+    request.system = system
+  }
   if (envelope.tools?.length) {
     request.tools = toAnthropicTools(envelope.tools)
   }
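
The two shapes `request.system` can now take, per the widened signature. A sketch; `LARGE_STABLE_PREFIX` is a placeholder, and the block form is what the caching path below produces:

```js
const request = {}
const LARGE_STABLE_PREFIX = '...long, shared instructions...' // placeholder content

// Legacy: flat string.
request.system = 'You are a careful assistant.'

// Structured: typed blocks, with a cache breakpoint after the stable prefix.
request.system = [
  { type: 'text', text: LARGE_STABLE_PREFIX, cache_control: { type: 'ephemeral' } },
  { type: 'text', text: 'Session-specific instructions.' }
]
```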
@@ -308,11 +334,39 @@ function splitPrompt (prompt) {
   }
   /** @type {string[]} */
   const systemParts = []
+  /** @type {Array<{type: string, text: string, cache_control?: object}>} */
+  const systemBlocks = []
+  let hasCacheMarkers = false
   /** @type {Array<{role: string, content: any}>} */
   const conversation = []
   for (const m of prompt) {
     if (m.role === 'system') {
-      systemParts.push(flattenText(m.content))
+      // Translate spore-style cache markers ({text, cache: '5m'|'1h'}) into
+      // Anthropic's cache_control. Preserves the block boundary that spore
+      // chose; collapsing into a single string would silently disable
+      // caching even when the upstream tier composed the prompt with
+      // explicit breakpoints.
+      if (Array.isArray(m.content)) {
+        for (const p of m.content) {
+          if (!p?.text) continue
+          const block = { type: 'text', text: p.text }
+          if (p.cache === '5m') {
+            block.cache_control = { type: 'ephemeral' }
+            hasCacheMarkers = true
+          } else if (p.cache === '1h') {
+            block.cache_control = { type: 'ephemeral', ttl: '1h' }
+            hasCacheMarkers = true
+          }
+          systemBlocks.push(block)
+          systemParts.push(p.text)
+        }
+      } else {
+        const text = flattenText(m.content)
+        if (text) {
+          systemBlocks.push({ type: 'text', text })
+          systemParts.push(text)
+        }
+      }
     } else if (m.role === 'tool') {
       // Tool results go in a user-role message with tool_result blocks.
       conversation.push({
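
So a spore-style system message like this hypothetical input comes out as two blocks, the first carrying the breakpoint:

```js
const m = {
  role: 'system',
  content: [
    { text: 'Big shared preamble…', cache: '5m' }, // breakpoint lands after this block
    { text: 'Per-run details.' }
  ]
}
// Resulting systemBlocks:
// [ { type: 'text', text: 'Big shared preamble…', cache_control: { type: 'ephemeral' } },
//   { type: 'text', text: 'Per-run details.' } ]
```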
@@ -345,7 +399,14 @@ function splitPrompt (prompt) {
       })
     }
   }
-  return { system: systemParts.filter(Boolean).join('\n\n'), conversation }
+  // If cache markers are present, return the structured block array so
+  // buildRequest can pass it through to Anthropic's typed system field.
+  // Otherwise fall back to the legacy string concatenation path so
+  // non-cached calls don't change request shape.
+  const system = hasCacheMarkers
+    ? systemBlocks
+    : systemParts.filter(Boolean).join('\n\n')
+  return { system, conversation }
 }

 /** @param {string | import('#core/envelope.js').MessagePart[]} content */
@@ -76,6 +76,7 @@ export async function * openai (envelope, deps = {}) {
   let inputTokens = 0
   let outputTokens = 0
   let thinkingTokens = 0
+  let cachedInputTokens = 0
   let status = STATUS_COMPLETED
   /** @type {string | undefined} */
   let warning
@@ -130,6 +131,7 @@ export async function * openai (envelope, deps = {}) {
         inputTokens = event.response.usage.input_tokens ?? 0
         outputTokens = event.response.usage.output_tokens ?? 0
         thinkingTokens = event.response.usage.output_tokens_details?.reasoning_tokens ?? 0
+        cachedInputTokens = event.response.usage.input_tokens_details?.cached_tokens ?? 0
       }
       if (toolItems.size > 0) {
         status = STATUS_TOOL_USE
@@ -145,6 +147,7 @@ export async function * openai (envelope, deps = {}) {
         inputTokens = event.response.usage.input_tokens ?? 0
         outputTokens = event.response.usage.output_tokens ?? 0
         thinkingTokens = event.response.usage.output_tokens_details?.reasoning_tokens ?? 0
+        cachedInputTokens = event.response.usage.input_tokens_details?.cached_tokens ?? 0
       }
       break

@@ -174,18 +177,30 @@ export async function * openai (envelope, deps = {}) {
   // one from the other for the message-only count.
   const messageOutputTokens = Math.max(0, outputTokens - thinkingTokens)

+  // OpenAI counts cached_tokens as a SUBSET of input_tokens. Convert to
+  // mohdel's additive convention (cacheReadInputTokens is separate from
+  // inputTokens) by subtracting the cached portion before pricing. Both
+  // adapters and computeCost stay simpler with the additive shape.
+  const regularInputTokens = Math.max(0, inputTokens - cachedInputTokens)
+
   /** @type {import('#core/events.js').DoneEvent} */
   const done = {
     type: 'done',
     result: {
       status,
       output: currentOutput() || null,
-      inputTokens,
+      inputTokens: regularInputTokens,
       outputTokens: messageOutputTokens,
       thinkingTokens,
+      ...(cachedInputTokens > 0 && { cacheReadInputTokens: cachedInputTokens }),
       cost: costFor(
         catalogKey(envelope.model),
-        { inputTokens, outputTokens: messageOutputTokens, thinkingTokens }
+        {
+          inputTokens: regularInputTokens,
+          outputTokens: messageOutputTokens,
+          thinkingTokens,
+          cacheReadInputTokens: cachedInputTokens
+        }
       ),
       timestamps: { start, first: first ?? end, end }
     }
package/js/session/run.js CHANGED
@@ -264,12 +264,16 @@ export async function * run (envelope, {
264
264
  function normalizeModelEffort (envelope, resolveSpec) {
265
265
  const candidate = effortOf(envelope.model)
266
266
  if (!candidate) return { envelope }
267
- if (envelope.outputEffort) return { envelope } // explicit wins
268
267
 
269
268
  const base = catalogKey(envelope.model)
270
269
  const baseSpec = resolveSpec(base)
271
270
  if (!baseSpec) return { envelope } // base not known — let full string fall through to not-found
272
271
 
272
+ // Explicit outputEffort wins; still strip the suffix so spans/logs see the canonical id.
273
+ if (envelope.outputEffort) {
274
+ return { envelope: { ...envelope, model: base } }
275
+ }
276
+
273
277
  if (!baseSpec.thinkingEffortLevels) {
274
278
  return {
275
279
  envelope,
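
Sketch of the behavioral change (the `:high` suffix syntax and model id are assumptions for illustration; the real suffix is whatever `effortOf` parses):

```js
// Before 0.104.0: an explicit outputEffort returned the envelope untouched,
// so spans/logs kept the suffixed model id. Now the suffix is stripped:
const out = normalizeModelEffort({ model: 'some-model:high', outputEffort: 'low' }, resolveSpec)
// out: { envelope: { model: 'some-model', outputEffort: 'low' } }
```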
package/package.json CHANGED
@@ -1,6 +1,6 @@
 {
   "name": "mohdel",
-  "version": "0.102.0",
+  "version": "0.104.0",
   "license": "MIT",
   "author": {
     "name": "Christophe Le Bars",
@@ -87,12 +87,12 @@
     "@opentelemetry/exporter-trace-otlp-grpc": "^0.217.0",
     "@opentelemetry/sdk-node": "^0.217.0",
     "chalk": "^5.4.0",
-    "mohdel-thin-gate-linux-x64-gnu": "0.102.0"
+    "mohdel-thin-gate-linux-x64-gnu": "0.104.0"
   },
   "dependencies": {
     "@anthropic-ai/sdk": "^0.95.1",
     "@cerebras/cerebras_cloud_sdk": "^1.61.1",
-    "@google/genai": "^2.0.0",
+    "@google/genai": "^2.0.1",
     "@opentelemetry/api": "^1.9.1",
     "env-paths": "^4.0.0",
     "groq-sdk": "^1.1.2",