mohdel 0.102.0 → 0.104.0
This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between versions as they appear in their public registries.
package/js/core/events.js
CHANGED
@@ -55,8 +55,21 @@
  * @property {(string|null)} output
  * Final text (null when `status === 'tool_use'` with no text).
  * @property {number} inputTokens
+ * Regular (non-cached) input tokens. For OpenAI/cerebras/fireworks where
+ * `cached_tokens` is reported as a SUBSET of `prompt_tokens`, adapters
+ * subtract the cached portion before exposing it here so all providers
+ * produce additive token shapes.
  * @property {number} outputTokens
  * @property {number} thinkingTokens
+ * @property {number} [cacheWriteInputTokens]
+ * Tokens written to a fresh prompt cache breakpoint, billed at
+ * `cacheWritePrice` (typically 1.25× input on Anthropic). Absent on
+ * providers that don't surface this counter (OpenAI doesn't separately
+ * bill cache writes).
+ * @property {number} [cacheReadInputTokens]
+ * Tokens served from prompt cache, billed at `cacheReadPrice` (typically
+ * 0.1× input). Set by Anthropic directly and by OpenAI-shape adapters
+ * after subset→additive normalization of `prompt_tokens_details.cached_tokens`.
  * @property {number} cost
  * USD, computed from curated pricing. Single number (not a breakdown).
  * @property {Timestamps} timestamps
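A note on the additive convention the new JSDoc leans on: OpenAI-shape APIs report the cached count inside `prompt_tokens`, while Anthropic reports its cache counters beside `input_tokens`. A minimal sketch of the subset→additive step, with `normalizeUsage` as a hypothetical helper (not a package export):

    // Hypothetical helper, illustrating the convention rather than the package's API.
    // OpenAI-shape usage: cached_tokens is a SUBSET of prompt_tokens.
    function normalizeUsage (usage) {
      const cached = usage.prompt_tokens_details?.cached_tokens ?? 0
      return {
        inputTokens: Math.max(0, (usage.prompt_tokens ?? 0) - cached),
        ...(cached > 0 && { cacheReadInputTokens: cached })
      }
    }

    // normalizeUsage({ prompt_tokens: 1000, prompt_tokens_details: { cached_tokens: 600 } })
    //   → { inputTokens: 400, cacheReadInputTokens: 600 }   // the parts sum back to 1000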
@@ -258,10 +258,15 @@ async function * runStreaming (envelope, client, args, config, start, deps) {
  */
 function finalize ({ envelope, content, toolCalls, usage, finishReason, start, first, reasoning = null }) {
   const end = String(process.hrtime.bigint())
-  const
+  const totalInputTokens = usage.prompt_tokens || 0
   const totalOutputTokens = usage.completion_tokens || 0
   const thinkingTokens = usage.completion_tokens_details?.reasoning_tokens || 0
+  const cachedInputTokens = usage.prompt_tokens_details?.cached_tokens || 0
   const visibleOutputTokens = Math.max(0, totalOutputTokens - thinkingTokens)
+  // OpenAI-shape APIs (cerebras/fireworks/...) report cached_tokens as a
+  // SUBSET of prompt_tokens. Convert to mohdel's additive convention so
+  // computeCost prices the cached portion at cacheReadPrice.
+  const inputTokens = Math.max(0, totalInputTokens - cachedInputTokens)
 
   const truncated = finishReason === 'length'
   let status = truncated ? STATUS_INCOMPLETE : STATUS_COMPLETED
@@ -276,9 +281,15 @@ function finalize ({ envelope, content, toolCalls, usage, finishReason, start, f
       inputTokens,
       outputTokens: visibleOutputTokens,
       thinkingTokens,
+      ...(cachedInputTokens > 0 && { cacheReadInputTokens: cachedInputTokens }),
       cost: costFor(
         catalogKey(envelope.model),
-        {
+        {
+          inputTokens,
+          outputTokens: visibleOutputTokens,
+          thinkingTokens,
+          cacheReadInputTokens: cachedInputTokens
+        }
       ),
       timestamps: { start, first: first ?? end, end }
     }
@@ -14,8 +14,8 @@ import { getSpec, setCatalog } from './_catalog.js'
 /**
  * Pure cost computation from spec + usage.
  *
- * Each price field (`inputPrice` / `outputPrice` / `thinkingPrice`
- * one of:
+ * Each price field (`inputPrice` / `outputPrice` / `thinkingPrice` /
+ * `cacheWritePrice` / `cacheReadPrice`) is one of:
  *
  * - a `number` — flat per-million rate; or
  * - an object `{">N": number, ..., "default": number}` — tiered.
@@ -24,12 +24,23 @@
  * nothing matches. Keys that aren't `">N"` or `"default"` are
  * ignored. `>` is strict — at exactly N, the default is used.
  *
- *
- * `
+ * Optional fields fall back to other prices when absent:
+ * - `thinkingPrice` → `outputPrice`
+ * - `cacheWritePrice` → `inputPrice` (graceful for non-caching providers)
+ * - `cacheReadPrice` → `inputPrice`
  *
- *
- *
- *
+ * Token-counting conventions:
+ * - Anthropic reports `cache_creation_input_tokens` and `cache_read_input_tokens`
+ *   as ADDITIONAL to `input_tokens` (separately billable). The adapter
+ *   surfaces them as `cacheWriteInputTokens` / `cacheReadInputTokens`
+ *   (write/read pair, matching catalog `cacheWritePrice`/`cacheReadPrice`).
+ * - OpenAI reports `prompt_tokens_details.cached_tokens` as a SUBSET of
+ *   `prompt_tokens` (already counted). Adapters subtract before passing
+ *   `inputTokens` to keep this function additive across providers.
+ *
+ * @param {any} spec Catalog entry, or `undefined`.
+ * @param {{inputTokens?: number, outputTokens?: number, thinkingTokens?: number,
+ *   cacheWriteInputTokens?: number, cacheReadInputTokens?: number}} usage
  * @returns {number}
  */
 export function computeCost (spec, usage) {
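`resolveTier` itself isn't touched by this diff, but the doc above pins down its contract. A sketch consistent with the stated rules (the package's actual implementation may differ in detail):

    // Sketch only: a flat number passes through; tiered objects pick the
    // largest ">N" bound strictly below `tokens`; keys that aren't ">N" or
    // "default" are ignored; "default" applies when no tier matches.
    function resolveTierSketch (price, tokens) {
      if (typeof price === 'number') return price
      if (!price || typeof price !== 'object') return undefined
      let best
      let bestN = -Infinity
      for (const key of Object.keys(price)) {
        if (!key.startsWith('>')) continue
        const n = Number(key.slice(1))
        if (Number.isFinite(n) && tokens > n && n > bestN) {
          bestN = n
          best = price[key]
        }
      }
      return best ?? price.default
    }

    // resolveTierSketch({ '>200000': 6, default: 3 }, 200000)  → 3  (strict `>`)
    // resolveTierSketch({ '>200000': 6, default: 3 }, 200001)  → 6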
@@ -37,12 +48,18 @@ export function computeCost (spec, usage) {
   const i = usage.inputTokens ?? 0
   const o = usage.outputTokens ?? 0
   const t = usage.thinkingTokens ?? 0
+  const cw = usage.cacheWriteInputTokens ?? 0
+  const cr = usage.cacheReadInputTokens ?? 0
   const ip = resolveTier(spec.inputPrice, i)
   const op = resolveTier(spec.outputPrice, i)
   if (typeof ip !== 'number' || typeof op !== 'number') return 0
   const tp = resolveTier(spec.thinkingPrice, i)
   const tpFinal = typeof tp === 'number' ? tp : op
-  const
+  const cwp = resolveTier(spec.cacheWritePrice, i)
+  const cwpFinal = typeof cwp === 'number' ? cwp : ip
+  const crp = resolveTier(spec.cacheReadPrice, i)
+  const crpFinal = typeof crp === 'number' ? crp : ip
+  const total = (i * ip + cw * cwpFinal + cr * crpFinal + o * op + t * tpFinal) / 1_000_000
   return round(total)
 }
 
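A worked example of the new total, using assumed prices (illustrative, not catalog values):

    // spec: $3/M input, $0.30/M cache read, $15/M output. No thinkingPrice,
    // so thinking tokens fall back to outputPrice; no cache write in this usage.
    const spec = { inputPrice: 3, cacheReadPrice: 0.30, outputPrice: 15 }
    const usage = { inputTokens: 400, cacheReadInputTokens: 600, outputTokens: 500, thinkingTokens: 200 }
    // (400*3 + 600*0.30 + 500*15 + 200*15) / 1_000_000
    //   = (1200 + 180 + 7500 + 3000) / 1_000_000 = 0.01188
    computeCost(spec, usage) // → 0.01188, modulo round()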
@@ -113,6 +113,8 @@ export async function * anthropic (envelope, deps = {}) {
   const currentOutput = () => outputParts.join('')
   let inputTokens = 0
   let outputTokens = 0
+  let cacheWriteTokens = 0
+  let cacheReadTokens = 0
   let thinkingChars = 0
   let status = STATUS_COMPLETED
   /** @type {string | undefined} */
@@ -135,6 +137,12 @@ export async function * anthropic (envelope, deps = {}) {
         if (event.message?.usage?.input_tokens) {
           inputTokens = event.message.usage.input_tokens
         }
+        if (event.message?.usage?.cache_creation_input_tokens) {
+          cacheWriteTokens = event.message.usage.cache_creation_input_tokens
+        }
+        if (event.message?.usage?.cache_read_input_tokens) {
+          cacheReadTokens = event.message.usage.cache_read_input_tokens
+        }
         break
 
       case 'content_block_start':
@@ -203,12 +211,18 @@ export async function * anthropic (envelope, deps = {}) {
   }
 
   const end = String(process.hrtime.bigint())
-  // Estimate thinking tokens
-  // (
-  //
+  // Estimate thinking tokens. Primary path: count streamed thinking_delta
+  // chars (sonnet emits these). Fallback: gap between Anthropic's reported
+  // output_tokens and what actually streamed as visible output (text +
+  // tool input JSON) — catches redacted_thinking blocks (opus 4.7 default)
+  // that consume output tokens but emit no streaming deltas.
+  const streamedOutput = currentOutput()
+  const streamedOutputChars = streamedOutput.length +
+    [...toolBlocks.values()].reduce((s, b) => s + b.inputJson.length, 0)
+  const streamedOutputTokens = Math.ceil(streamedOutputChars / ANTHROPIC_THINKING_CHARS_PER_TOKEN)
   const estimatedThinkingTokens = thinkingChars > 0
     ? Math.min(Math.ceil(thinkingChars / ANTHROPIC_THINKING_CHARS_PER_TOKEN), outputTokens)
-    : 0
+    : Math.max(0, outputTokens - streamedOutputTokens)
   const messageOutputTokens = Math.max(0, outputTokens - estimatedThinkingTokens)
 
   /** @type {import('#core/events.js').DoneEvent} */
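The fallback arithmetic is easiest to check with numbers. An illustrative trace (the chars-per-token divisor is whatever ANTHROPIC_THINKING_CHARS_PER_TOKEN is set to; 4 is assumed here purely for the arithmetic):

    // Anthropic reports output_tokens = 900; no thinking_delta events streamed
    // (thinkingChars === 0); 1400 chars of text + 600 chars of tool-input JSON
    // actually streamed:
    //   streamedOutputChars     = 1400 + 600        = 2000
    //   streamedOutputTokens    = ceil(2000 / 4)    = 500   // divisor assumed 4
    //   estimatedThinkingTokens = max(0, 900 - 500) = 400   // the redacted gap
    //   messageOutputTokens     = max(0, 900 - 400) = 500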
@@ -216,13 +230,21 @@ export async function * anthropic (envelope, deps = {}) {
     type: 'done',
     result: {
       status,
-      output:
+      output: streamedOutput || null,
       inputTokens,
       outputTokens: messageOutputTokens,
       thinkingTokens: estimatedThinkingTokens,
+      ...(cacheWriteTokens > 0 && { cacheWriteInputTokens: cacheWriteTokens }),
+      ...(cacheReadTokens > 0 && { cacheReadInputTokens: cacheReadTokens }),
       cost: costFor(
         catalogKey(envelope.model),
-        {
+        {
+          inputTokens,
+          outputTokens: messageOutputTokens,
+          thinkingTokens: estimatedThinkingTokens,
+          cacheWriteInputTokens: cacheWriteTokens,
+          cacheReadInputTokens: cacheReadTokens
+        }
       ),
       timestamps: { start, first: first ?? end, end }
     }
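The `...(count > 0 && { key: count })` pattern used in both result objects hinges on object spread silently ignoring falsy non-object operands, so a zero counter yields no key at all rather than an explicit `undefined`:

    const cacheReadTokens = 0
    const result = {
      status: 'completed',
      ...(cacheReadTokens > 0 && { cacheReadInputTokens: cacheReadTokens })
    }
    // The spread operand is `false`, and `{ ...false }` spreads nothing:
    'cacheReadInputTokens' in result // → false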
@@ -255,7 +277,9 @@ function safeParseJson (s) {
 /**
  * @param {import('#core/envelope.js').CallEnvelope} envelope
  * @param {Array<{role: string, content: any}>} conversation
- * @param {string} system
+ * @param {string | Array<{type: string, text: string, cache_control?: object}>} system
+ *   Either a flat string (legacy) or an array of typed blocks with optional
+ *   `cache_control` for prompt caching. The Anthropic SDK accepts both shapes.
  */
 function buildRequest (envelope, conversation, system) {
   const spec = getSpec(catalogKey(envelope.model))
@@ -267,7 +291,9 @@ function buildRequest (envelope, conversation, system) {
     max_tokens: envelope.outputBudget ?? outputTokenLimit ?? ANTHROPIC_DEFAULT_MAX_TOKENS,
     messages: conversation
   }
-  if (system
+  if (typeof system === 'string' ? system : Array.isArray(system) && system.length) {
+    request.system = system
+  }
   if (envelope.tools?.length) {
     request.tools = toAnthropicTools(envelope.tools)
   }
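The new guard packs two truthiness rules into one ternary; spelling out what it admits (values below are illustrative):

    // typeof system === 'string' ? system : Array.isArray(system) && system.length
    //   ''                              → falsy string → system field omitted
    //   'You are a careful editor.'     → truthy       → request.system = the string
    //   []                              → length 0     → system field omitted
    //   [{ type: 'text', text: 'hi' }]  → length 1     → request.system = the block array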
@@ -308,11 +334,39 @@ function splitPrompt (prompt) {
   }
   /** @type {string[]} */
   const systemParts = []
+  /** @type {Array<{type: string, text: string, cache_control?: object}>} */
+  const systemBlocks = []
+  let hasCacheMarkers = false
   /** @type {Array<{role: string, content: any}>} */
   const conversation = []
   for (const m of prompt) {
     if (m.role === 'system') {
-
+      // Translate spore-style cache markers ({text, cache: '5m'|'1h'}) into
+      // Anthropic's cache_control. Preserves the block boundary that spore
+      // chose; collapsing into a single string would silently disable
+      // caching even when the upstream tier composed the prompt with
+      // explicit breakpoints.
+      if (Array.isArray(m.content)) {
+        for (const p of m.content) {
+          if (!p?.text) continue
+          const block = { type: 'text', text: p.text }
+          if (p.cache === '5m') {
+            block.cache_control = { type: 'ephemeral' }
+            hasCacheMarkers = true
+          } else if (p.cache === '1h') {
+            block.cache_control = { type: 'ephemeral', ttl: '1h' }
+            hasCacheMarkers = true
+          }
+          systemBlocks.push(block)
+          systemParts.push(p.text)
+        }
+      } else {
+        const text = flattenText(m.content)
+        if (text) {
+          systemBlocks.push({ type: 'text', text })
+          systemParts.push(text)
+        }
+      }
     } else if (m.role === 'tool') {
       // Tool results go in a user-role message with tool_result blocks.
       conversation.push({
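End to end, a spore-style system message with a cache marker maps to a typed block array. An illustrative input/output pair, tracing the loop above:

    // Input message (spore-style):
    //   { role: 'system', content: [
    //       { text: 'Long shared preamble…', cache: '1h' },
    //       { text: 'Per-call instructions' } ] }
    //
    // systemBlocks after the loop:
    //   [ { type: 'text', text: 'Long shared preamble…',
    //       cache_control: { type: 'ephemeral', ttl: '1h' } },
    //     { type: 'text', text: 'Per-call instructions' } ]
    // hasCacheMarkers === true, so splitPrompt returns the block array,
    // not the joined string.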
@@ -345,7 +399,14 @@ function splitPrompt (prompt) {
       })
     }
   }
-
+  // If cache markers are present, return the structured block array so
+  // buildRequest can pass it through to Anthropic's typed system field.
+  // Otherwise fall back to the legacy string concatenation path so
+  // non-cached calls don't change request shape.
+  const system = hasCacheMarkers
+    ? systemBlocks
+    : systemParts.filter(Boolean).join('\n\n')
+  return { system, conversation }
 }
 
 /** @param {string | import('#core/envelope.js').MessagePart[]} content */
@@ -76,6 +76,7 @@ export async function * openai (envelope, deps = {}) {
   let inputTokens = 0
   let outputTokens = 0
   let thinkingTokens = 0
+  let cachedInputTokens = 0
   let status = STATUS_COMPLETED
   /** @type {string | undefined} */
   let warning
@@ -130,6 +131,7 @@ export async function * openai (envelope, deps = {}) {
           inputTokens = event.response.usage.input_tokens ?? 0
           outputTokens = event.response.usage.output_tokens ?? 0
           thinkingTokens = event.response.usage.output_tokens_details?.reasoning_tokens ?? 0
+          cachedInputTokens = event.response.usage.input_tokens_details?.cached_tokens ?? 0
         }
         if (toolItems.size > 0) {
           status = STATUS_TOOL_USE
@@ -145,6 +147,7 @@ export async function * openai (envelope, deps = {}) {
           inputTokens = event.response.usage.input_tokens ?? 0
           outputTokens = event.response.usage.output_tokens ?? 0
           thinkingTokens = event.response.usage.output_tokens_details?.reasoning_tokens ?? 0
+          cachedInputTokens = event.response.usage.input_tokens_details?.cached_tokens ?? 0
         }
         break
 
@@ -174,18 +177,30 @@ export async function * openai (envelope, deps = {}) {
   // one from the other for the message-only count.
   const messageOutputTokens = Math.max(0, outputTokens - thinkingTokens)
 
+  // OpenAI counts cached_tokens as a SUBSET of input_tokens. Convert to
+  // mohdel's additive convention (cacheReadInputTokens is separate from
+  // inputTokens) by subtracting the cached portion before pricing. Both
+  // adapters and computeCost stay simpler with the additive shape.
+  const regularInputTokens = Math.max(0, inputTokens - cachedInputTokens)
+
   /** @type {import('#core/events.js').DoneEvent} */
   const done = {
     type: 'done',
     result: {
       status,
       output: currentOutput() || null,
-      inputTokens,
+      inputTokens: regularInputTokens,
       outputTokens: messageOutputTokens,
       thinkingTokens,
+      ...(cachedInputTokens > 0 && { cacheReadInputTokens: cachedInputTokens }),
       cost: costFor(
         catalogKey(envelope.model),
-        {
+        {
+          inputTokens: regularInputTokens,
+          outputTokens: messageOutputTokens,
+          thinkingTokens,
+          cacheReadInputTokens: cachedInputTokens
+        }
      ),
       timestamps: { start, first: first ?? end, end }
     }
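Why subtract before pricing: a quick check with assumed rates ($3/M input, $0.30/M cache read; illustrative, not catalog values):

    // prompt_tokens = 1000, cached_tokens = 600 (a subset of the 1000)
    // additive shape: inputTokens = 400, cacheReadInputTokens = 600
    //   cost = (400 * 3 + 600 * 0.30) / 1_000_000 = 0.00138
    // Without the subtraction, 1000 * 3 + 600 * 0.30 would bill the cached
    // 600 tokens twice: once at inputPrice and again at cacheReadPrice.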
package/js/session/run.js
CHANGED
@@ -264,12 +264,16 @@ export async function * run (envelope, {
 function normalizeModelEffort (envelope, resolveSpec) {
   const candidate = effortOf(envelope.model)
   if (!candidate) return { envelope }
-  if (envelope.outputEffort) return { envelope } // explicit wins
 
   const base = catalogKey(envelope.model)
   const baseSpec = resolveSpec(base)
   if (!baseSpec) return { envelope } // base not known — let full string fall through to not-found
 
+  // Explicit outputEffort wins; still strip the suffix so spans/logs see the canonical id.
+  if (envelope.outputEffort) {
+    return { envelope: { ...envelope, model: base } }
+  }
+
   if (!baseSpec.thinkingEffortLevels) {
     return {
       envelope,
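For the suffix-stripping change, an illustrative walk-through (the `:high` suffix syntax is an assumption for the example; the real separator is whatever effortOf/catalogKey parse):

    // Hypothetical envelope: { model: 'anthropic/claude-sonnet:high', outputEffort: 'low' }
    //   effortOf(model)   → 'high'                     (candidate present)
    //   catalogKey(model) → 'anthropic/claude-sonnet'  (suffix stripped)
    // Explicit outputEffort still wins, but the returned envelope now carries
    // the canonical base id, so spans/logs no longer see the suffixed string.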
package/package.json
CHANGED
@@ -1,6 +1,6 @@
 {
   "name": "mohdel",
-  "version": "0.102.0",
+  "version": "0.104.0",
   "license": "MIT",
   "author": {
     "name": "Christophe Le Bars",
@@ -87,12 +87,12 @@
     "@opentelemetry/exporter-trace-otlp-grpc": "^0.217.0",
     "@opentelemetry/sdk-node": "^0.217.0",
     "chalk": "^5.4.0",
-    "mohdel-thin-gate-linux-x64-gnu": "0.102.0"
+    "mohdel-thin-gate-linux-x64-gnu": "0.104.0"
   },
   "dependencies": {
     "@anthropic-ai/sdk": "^0.95.1",
     "@cerebras/cerebras_cloud_sdk": "^1.61.1",
-    "@google/genai": "^2.0.
+    "@google/genai": "^2.0.1",
     "@opentelemetry/api": "^1.9.1",
     "env-paths": "^4.0.0",
     "groq-sdk": "^1.1.2",