mohdel 0.103.0 → 0.104.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/js/core/events.js CHANGED
@@ -55,8 +55,19 @@
55
55
  * @property {(string|null)} output
56
56
  * Final text (null when `status === 'tool_use'` with no text).
57
57
  * @property {number} inputTokens
58
+ * Regular (non-cached) input tokens. For OpenAI/cerebras/fireworks where
59
+ * `cached_tokens` is reported as a SUBSET of `prompt_tokens`, adapters
60
+ * subtract the cached portion before exposing it here so all providers
61
+ * produce additive token shapes.
58
62
  * @property {number} outputTokens
59
63
  * @property {number} thinkingTokens
64
+ * @property {number} [cacheWriteInputTokens]
65
+ * Input tokens written to a fresh prompt cache breakpoint, billed at
66
+ * `cacheWritePrice`. Absent when the provider has no separate
67
+ * cache-write counter.
68
+ * @property {number} [cacheReadInputTokens]
69
+ * Input tokens served from prompt cache, billed at `cacheReadPrice`.
70
+ * Absent when the provider has no prompt caching.
60
71
  * @property {number} cost
61
72
  * USD, computed from curated pricing. Single number (not a breakdown).
62
73
  * @property {Timestamps} timestamps
@@ -258,10 +258,15 @@ async function * runStreaming (envelope, client, args, config, start, deps) {
258
258
  */
259
259
  function finalize ({ envelope, content, toolCalls, usage, finishReason, start, first, reasoning = null }) {
260
260
  const end = String(process.hrtime.bigint())
261
- const inputTokens = usage.prompt_tokens || 0
261
+ const totalInputTokens = usage.prompt_tokens || 0
262
262
  const totalOutputTokens = usage.completion_tokens || 0
263
263
  const thinkingTokens = usage.completion_tokens_details?.reasoning_tokens || 0
264
+ const cachedInputTokens = usage.prompt_tokens_details?.cached_tokens || 0
264
265
  const visibleOutputTokens = Math.max(0, totalOutputTokens - thinkingTokens)
266
+ // OpenAI-shape APIs (cerebras/fireworks/...) report cached_tokens as a
267
+ // SUBSET of prompt_tokens. Convert to mohdel's additive convention so
268
+ // computeCost prices the cached portion at cacheReadPrice.
269
+ const inputTokens = Math.max(0, totalInputTokens - cachedInputTokens)
265
270
 
266
271
  const truncated = finishReason === 'length'
267
272
  let status = truncated ? STATUS_INCOMPLETE : STATUS_COMPLETED
@@ -276,9 +281,15 @@ function finalize ({ envelope, content, toolCalls, usage, finishReason, start, f
276
281
  inputTokens,
277
282
  outputTokens: visibleOutputTokens,
278
283
  thinkingTokens,
284
+ ...(cachedInputTokens > 0 && { cacheReadInputTokens: cachedInputTokens }),
279
285
  cost: costFor(
280
286
  catalogKey(envelope.model),
281
- { inputTokens, outputTokens: visibleOutputTokens, thinkingTokens }
287
+ {
288
+ inputTokens,
289
+ outputTokens: visibleOutputTokens,
290
+ thinkingTokens,
291
+ cacheReadInputTokens: cachedInputTokens
292
+ }
282
293
  ),
283
294
  timestamps: { start, first: first ?? end, end }
284
295
  }
@@ -14,8 +14,8 @@ import { getSpec, setCatalog } from './_catalog.js'
14
14
  /**
15
15
  * Pure cost computation from spec + usage.
16
16
  *
17
- * Each price field (`inputPrice` / `outputPrice` / `thinkingPrice`) is
18
- * one of:
17
+ * Each price field (`inputPrice` / `outputPrice` / `thinkingPrice` /
18
+ * `cacheWritePrice` / `cacheReadPrice`) is one of:
19
19
  *
20
20
  * - a `number` — flat per-million rate; or
21
21
  * - an object `{">N": number, ..., "default": number}` — tiered.
@@ -24,12 +24,19 @@ import { getSpec, setCatalog } from './_catalog.js'
24
24
  * nothing matches. Keys that aren't `">N"` or `"default"` are
25
25
  * ignored. `>` is strict — at exactly N, the default is used.
26
26
  *
27
- * `thinkingPrice` is optional and falls back to the resolved
28
- * `outputPrice` when absent.
27
+ * Optional fields fall back to other prices when absent:
28
+ * - `thinkingPrice` → `outputPrice`
29
+ * - `cacheWritePrice` → `inputPrice` (graceful for non-caching providers)
30
+ * - `cacheReadPrice` → `inputPrice`
29
31
  *
30
- * @param {any} spec Catalog entry (with `inputPrice`/`outputPrice`/`thinkingPrice`),
31
- * or `undefined`.
32
- * @param {{inputTokens?: number, outputTokens?: number, thinkingTokens?: number}} usage
32
+ * Token-counting convention: this function is purely additive across
33
+ * `inputTokens`, `cacheWriteInputTokens`, `cacheReadInputTokens`,
34
+ * `outputTokens`, and `thinkingTokens`. Adapters normalize provider-specific
35
+ * shapes (e.g. subset-of-input vs. additional-to-input) before calling here.
36
+ *
37
+ * @param {any} spec Catalog entry, or `undefined`.
38
+ * @param {{inputTokens?: number, outputTokens?: number, thinkingTokens?: number,
39
+ * cacheWriteInputTokens?: number, cacheReadInputTokens?: number}} usage
33
40
  * @returns {number}
34
41
  */
35
42
  export function computeCost (spec, usage) {
@@ -37,12 +44,18 @@ export function computeCost (spec, usage) {
37
44
  const i = usage.inputTokens ?? 0
38
45
  const o = usage.outputTokens ?? 0
39
46
  const t = usage.thinkingTokens ?? 0
47
+ const cw = usage.cacheWriteInputTokens ?? 0
48
+ const cr = usage.cacheReadInputTokens ?? 0
40
49
  const ip = resolveTier(spec.inputPrice, i)
41
50
  const op = resolveTier(spec.outputPrice, i)
42
51
  if (typeof ip !== 'number' || typeof op !== 'number') return 0
43
52
  const tp = resolveTier(spec.thinkingPrice, i)
44
53
  const tpFinal = typeof tp === 'number' ? tp : op
45
- const total = (i * ip + o * op + t * tpFinal) / 1_000_000
54
+ const cwp = resolveTier(spec.cacheWritePrice, i)
55
+ const cwpFinal = typeof cwp === 'number' ? cwp : ip
56
+ const crp = resolveTier(spec.cacheReadPrice, i)
57
+ const crpFinal = typeof crp === 'number' ? crp : ip
58
+ const total = (i * ip + cw * cwpFinal + cr * crpFinal + o * op + t * tpFinal) / 1_000_000
46
59
  return round(total)
47
60
  }
48
61
 
@@ -113,6 +113,8 @@ export async function * anthropic (envelope, deps = {}) {
113
113
  const currentOutput = () => outputParts.join('')
114
114
  let inputTokens = 0
115
115
  let outputTokens = 0
116
+ let cacheWriteTokens = 0
117
+ let cacheReadTokens = 0
116
118
  let thinkingChars = 0
117
119
  let status = STATUS_COMPLETED
118
120
  /** @type {string | undefined} */
@@ -135,6 +137,12 @@ export async function * anthropic (envelope, deps = {}) {
135
137
  if (event.message?.usage?.input_tokens) {
136
138
  inputTokens = event.message.usage.input_tokens
137
139
  }
140
+ if (event.message?.usage?.cache_creation_input_tokens) {
141
+ cacheWriteTokens = event.message.usage.cache_creation_input_tokens
142
+ }
143
+ if (event.message?.usage?.cache_read_input_tokens) {
144
+ cacheReadTokens = event.message.usage.cache_read_input_tokens
145
+ }
138
146
  break
139
147
 
140
148
  case 'content_block_start':
@@ -226,9 +234,17 @@ export async function * anthropic (envelope, deps = {}) {
226
234
  inputTokens,
227
235
  outputTokens: messageOutputTokens,
228
236
  thinkingTokens: estimatedThinkingTokens,
237
+ ...(cacheWriteTokens > 0 && { cacheWriteInputTokens: cacheWriteTokens }),
238
+ ...(cacheReadTokens > 0 && { cacheReadInputTokens: cacheReadTokens }),
229
239
  cost: costFor(
230
240
  catalogKey(envelope.model),
231
- { inputTokens, outputTokens: messageOutputTokens, thinkingTokens: estimatedThinkingTokens }
241
+ {
242
+ inputTokens,
243
+ outputTokens: messageOutputTokens,
244
+ thinkingTokens: estimatedThinkingTokens,
245
+ cacheWriteInputTokens: cacheWriteTokens,
246
+ cacheReadInputTokens: cacheReadTokens
247
+ }
232
248
  ),
233
249
  timestamps: { start, first: first ?? end, end }
234
250
  }
@@ -261,7 +277,9 @@ function safeParseJson (s) {
261
277
  /**
262
278
  * @param {import('#core/envelope.js').CallEnvelope} envelope
263
279
  * @param {Array<{role: string, content: any}>} conversation
264
- * @param {string} system
280
+ * @param {string | Array<{type: string, text: string, cache_control?: object}>} system
281
+ * Either a flat string (legacy) or an array of typed blocks with optional
282
+ * `cache_control` for prompt caching. The Anthropic SDK accepts both shapes.
265
283
  */
266
284
  function buildRequest (envelope, conversation, system) {
267
285
  const spec = getSpec(catalogKey(envelope.model))
@@ -273,7 +291,9 @@ function buildRequest (envelope, conversation, system) {
273
291
  max_tokens: envelope.outputBudget ?? outputTokenLimit ?? ANTHROPIC_DEFAULT_MAX_TOKENS,
274
292
  messages: conversation
275
293
  }
276
- if (system) request.system = system
294
+ if (typeof system === 'string' ? system : Array.isArray(system) && system.length) {
295
+ request.system = system
296
+ }
277
297
  if (envelope.tools?.length) {
278
298
  request.tools = toAnthropicTools(envelope.tools)
279
299
  }
@@ -314,11 +334,39 @@ function splitPrompt (prompt) {
314
334
  }
315
335
  /** @type {string[]} */
316
336
  const systemParts = []
337
+ /** @type {Array<{type: string, text: string, cache_control?: object}>} */
338
+ const systemBlocks = []
339
+ let hasCacheMarkers = false
317
340
  /** @type {Array<{role: string, content: any}>} */
318
341
  const conversation = []
319
342
  for (const m of prompt) {
320
343
  if (m.role === 'system') {
321
- systemParts.push(flattenText(m.content))
344
+ // Translate spore-style cache markers ({text, cache: '5m'|'1h'}) into
345
+ // Anthropic's cache_control. Preserves the block boundary that spore
346
+ // chose; collapsing into a single string would silently disable
347
+ // caching even when the upstream tier composed the prompt with
348
+ // explicit breakpoints.
349
+ if (Array.isArray(m.content)) {
350
+ for (const p of m.content) {
351
+ if (!p?.text) continue
352
+ const block = { type: 'text', text: p.text }
353
+ if (p.cache === '5m') {
354
+ block.cache_control = { type: 'ephemeral' }
355
+ hasCacheMarkers = true
356
+ } else if (p.cache === '1h') {
357
+ block.cache_control = { type: 'ephemeral', ttl: '1h' }
358
+ hasCacheMarkers = true
359
+ }
360
+ systemBlocks.push(block)
361
+ systemParts.push(p.text)
362
+ }
363
+ } else {
364
+ const text = flattenText(m.content)
365
+ if (text) {
366
+ systemBlocks.push({ type: 'text', text })
367
+ systemParts.push(text)
368
+ }
369
+ }
322
370
  } else if (m.role === 'tool') {
323
371
  // Tool results go in a user-role message with tool_result blocks.
324
372
  conversation.push({
@@ -351,7 +399,14 @@ function splitPrompt (prompt) {
351
399
  })
352
400
  }
353
401
  }
354
- return { system: systemParts.filter(Boolean).join('\n\n'), conversation }
402
+ // If cache markers are present, return the structured block array so
403
+ // buildRequest can pass it through to Anthropic's typed system field.
404
+ // Otherwise fall back to the legacy string concatenation path so
405
+ // non-cached calls don't change request shape.
406
+ const system = hasCacheMarkers
407
+ ? systemBlocks
408
+ : systemParts.filter(Boolean).join('\n\n')
409
+ return { system, conversation }
355
410
  }
356
411
 
357
412
  /** @param {string | import('#core/envelope.js').MessagePart[]} content */
@@ -76,6 +76,7 @@ export async function * openai (envelope, deps = {}) {
76
76
  let inputTokens = 0
77
77
  let outputTokens = 0
78
78
  let thinkingTokens = 0
79
+ let cachedInputTokens = 0
79
80
  let status = STATUS_COMPLETED
80
81
  /** @type {string | undefined} */
81
82
  let warning
@@ -130,6 +131,7 @@ export async function * openai (envelope, deps = {}) {
130
131
  inputTokens = event.response.usage.input_tokens ?? 0
131
132
  outputTokens = event.response.usage.output_tokens ?? 0
132
133
  thinkingTokens = event.response.usage.output_tokens_details?.reasoning_tokens ?? 0
134
+ cachedInputTokens = event.response.usage.input_tokens_details?.cached_tokens ?? 0
133
135
  }
134
136
  if (toolItems.size > 0) {
135
137
  status = STATUS_TOOL_USE
@@ -145,6 +147,7 @@ export async function * openai (envelope, deps = {}) {
145
147
  inputTokens = event.response.usage.input_tokens ?? 0
146
148
  outputTokens = event.response.usage.output_tokens ?? 0
147
149
  thinkingTokens = event.response.usage.output_tokens_details?.reasoning_tokens ?? 0
150
+ cachedInputTokens = event.response.usage.input_tokens_details?.cached_tokens ?? 0
148
151
  }
149
152
  break
150
153
 
@@ -174,18 +177,30 @@ export async function * openai (envelope, deps = {}) {
174
177
  // one from the other for the message-only count.
175
178
  const messageOutputTokens = Math.max(0, outputTokens - thinkingTokens)
176
179
 
180
+ // OpenAI counts cached_tokens as a SUBSET of input_tokens. Convert to
181
+ // mohdel's additive convention (cacheReadInputTokens is separate from
182
+ // inputTokens) by subtracting the cached portion before pricing. Both
183
+ // adapters and computeCost stay simpler with the additive shape.
184
+ const regularInputTokens = Math.max(0, inputTokens - cachedInputTokens)
185
+
177
186
  /** @type {import('#core/events.js').DoneEvent} */
178
187
  const done = {
179
188
  type: 'done',
180
189
  result: {
181
190
  status,
182
191
  output: currentOutput() || null,
183
- inputTokens,
192
+ inputTokens: regularInputTokens,
184
193
  outputTokens: messageOutputTokens,
185
194
  thinkingTokens,
195
+ ...(cachedInputTokens > 0 && { cacheReadInputTokens: cachedInputTokens }),
186
196
  cost: costFor(
187
197
  catalogKey(envelope.model),
188
- { inputTokens, outputTokens: messageOutputTokens, thinkingTokens }
198
+ {
199
+ inputTokens: regularInputTokens,
200
+ outputTokens: messageOutputTokens,
201
+ thinkingTokens,
202
+ cacheReadInputTokens: cachedInputTokens
203
+ }
189
204
  ),
190
205
  timestamps: { start, first: first ?? end, end }
191
206
  }
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "mohdel",
3
- "version": "0.103.0",
3
+ "version": "0.104.1",
4
4
  "license": "MIT",
5
5
  "author": {
6
6
  "name": "Christophe Le Bars",
@@ -87,12 +87,12 @@
87
87
  "@opentelemetry/exporter-trace-otlp-grpc": "^0.217.0",
88
88
  "@opentelemetry/sdk-node": "^0.217.0",
89
89
  "chalk": "^5.4.0",
90
- "mohdel-thin-gate-linux-x64-gnu": "0.103.0"
90
+ "mohdel-thin-gate-linux-x64-gnu": "0.104.1"
91
91
  },
92
92
  "dependencies": {
93
93
  "@anthropic-ai/sdk": "^0.95.1",
94
94
  "@cerebras/cerebras_cloud_sdk": "^1.61.1",
95
- "@google/genai": "^2.0.0",
95
+ "@google/genai": "^2.0.1",
96
96
  "@opentelemetry/api": "^1.9.1",
97
97
  "env-paths": "^4.0.0",
98
98
  "groq-sdk": "^1.1.2",