@oh-my-pi/pi-ai 14.5.8 → 14.5.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +12 -0
- package/package.json +3 -3
- package/src/providers/anthropic.ts +38 -0
- package/src/providers/google-gemini-cli.ts +3 -3
- package/src/providers/google-vertex.ts +3 -2
- package/src/providers/google.ts +3 -2
- package/src/providers/openai-codex-responses.ts +3 -0
- package/src/providers/openai-completions.ts +17 -5
- package/src/providers/openai-responses-shared.ts +2 -0
- package/src/types.ts +31 -0
package/CHANGELOG.md
CHANGED
|
@@ -2,6 +2,18 @@
|
|
|
2
2
|
|
|
3
3
|
## [Unreleased]
|
|
4
4
|
|
|
5
|
+
## [14.5.9] - 2026-04-30
|
|
6
|
+
### Added
|
|
7
|
+
|
|
8
|
+
- Added `usage.reasoningTokens` to OpenAI and Google usage output when providers report reasoning/thinking tokens
|
|
9
|
+
- Added `usage.cttl.ephemeral5m` and `usage.cttl.ephemeral1h` to report Anthropic cache-write TTL token buckets
|
|
10
|
+
- Added `usage.server.webSearch` and `usage.server.webFetch` to report Anthropic server tool-call request counts
|
|
11
|
+
|
|
12
|
+
### Fixed
|
|
13
|
+
|
|
14
|
+
- Fixed OpenAI usage attribution to avoid double-counting `reasoning_tokens` in output totals
|
|
15
|
+
- Fixed Anthropic streaming usage handling so a previously populated cache TTL breakdown is preserved when later events omit `cache_creation`
|
|
16
|
+
|
|
5
17
|
## [14.5.4] - 2026-04-28
|
|
6
18
|
### Changed
|
|
7
19
|
|
package/package.json
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"type": "module",
|
|
3
3
|
"name": "@oh-my-pi/pi-ai",
|
|
4
|
-
"version": "14.5.8",
|
|
4
|
+
"version": "14.5.9",
|
|
5
5
|
"description": "Unified LLM API with automatic model discovery and provider configuration",
|
|
6
6
|
"homepage": "https://github.com/can1357/oh-my-pi",
|
|
7
7
|
"author": "Can Boluk",
|
|
@@ -46,8 +46,8 @@
|
|
|
46
46
|
"@aws-sdk/credential-provider-node": "^3.972.36",
|
|
47
47
|
"@bufbuild/protobuf": "^2.12.0",
|
|
48
48
|
"@google/genai": "^1.50.1",
|
|
49
|
-
"@oh-my-pi/pi-natives": "14.5.8",
|
|
50
|
-
"@oh-my-pi/pi-utils": "14.5.8",
|
|
49
|
+
"@oh-my-pi/pi-natives": "14.5.9",
|
|
50
|
+
"@oh-my-pi/pi-utils": "14.5.9",
|
|
51
51
|
"@sinclair/typebox": "^0.34.49",
|
|
52
52
|
"@smithy/node-http-handler": "^4.6.1",
|
|
53
53
|
"ajv": "^8.20.0",
|
package/src/providers/anthropic.ts
CHANGED
|
@@ -698,6 +698,42 @@ function createEmptyUsage(premiumRequests?: number): Usage {
|
|
|
698
698
|
};
|
|
699
699
|
}
|
|
700
700
|
|
|
701
|
+
export type AnthropicUsageLike = {
|
|
702
|
+
cache_creation?: { ephemeral_5m_input_tokens?: number | null; ephemeral_1h_input_tokens?: number | null } | null;
|
|
703
|
+
server_tool_use?: { web_search_requests?: number | null; web_fetch_requests?: number | null } | null;
|
|
704
|
+
};
|
|
705
|
+
|
|
706
|
+
/**
|
|
707
|
+
* Capture Anthropic's optional cache-creation TTL breakdown and server-tool-use
|
|
708
|
+
* counters into the harness Usage shape. Only sets fields that were reported, so
|
|
709
|
+
* a `message_delta` that omits `cache_creation` does not clobber the breakdown
|
|
710
|
+
* established at `message_start`.
|
|
711
|
+
*/
|
|
712
|
+
export function applyAnthropicUsageExtras(usage: Usage, source: AnthropicUsageLike): void {
|
|
713
|
+
const cacheCreation = source.cache_creation;
|
|
714
|
+
if (cacheCreation) {
|
|
715
|
+
const fiveMinute = cacheCreation.ephemeral_5m_input_tokens ?? 0;
|
|
716
|
+
const oneHour = cacheCreation.ephemeral_1h_input_tokens ?? 0;
|
|
717
|
+
if (fiveMinute > 0 || oneHour > 0) {
|
|
718
|
+
usage.cttl = {
|
|
719
|
+
...(fiveMinute > 0 ? { ephemeral5m: fiveMinute } : {}),
|
|
720
|
+
...(oneHour > 0 ? { ephemeral1h: oneHour } : {}),
|
|
721
|
+
};
|
|
722
|
+
}
|
|
723
|
+
}
|
|
724
|
+
const serverToolUse = source.server_tool_use;
|
|
725
|
+
if (serverToolUse) {
|
|
726
|
+
const webSearch = serverToolUse.web_search_requests ?? 0;
|
|
727
|
+
const webFetch = serverToolUse.web_fetch_requests ?? 0;
|
|
728
|
+
if (webSearch > 0 || webFetch > 0) {
|
|
729
|
+
usage.server = {
|
|
730
|
+
...(webSearch > 0 ? { webSearch } : {}),
|
|
731
|
+
...(webFetch > 0 ? { webFetch } : {}),
|
|
732
|
+
};
|
|
733
|
+
}
|
|
734
|
+
}
|
|
735
|
+
}
|
|
736
|
+
|
|
701
737
|
export const streamAnthropic: StreamFunction<"anthropic-messages"> = (
|
|
702
738
|
model: Model<"anthropic-messages">,
|
|
703
739
|
context: Context,
|
|
@@ -824,6 +860,7 @@ export const streamAnthropic: StreamFunction<"anthropic-messages"> = (
|
|
|
824
860
|
continue;
|
|
825
861
|
}
|
|
826
862
|
sawMessageStart = true;
|
|
863
|
+
applyAnthropicUsageExtras(output.usage, event.message.usage);
|
|
827
864
|
output.responseId = event.message.id;
|
|
828
865
|
output.usage.input = event.message.usage.input_tokens || 0;
|
|
829
866
|
output.usage.output = event.message.usage.output_tokens || 0;
|
|
@@ -989,6 +1026,7 @@ export const streamAnthropic: StreamFunction<"anthropic-messages"> = (
|
|
|
989
1026
|
if (event.usage.cache_creation_input_tokens != null) {
|
|
990
1027
|
output.usage.cacheWrite = event.usage.cache_creation_input_tokens;
|
|
991
1028
|
}
|
|
1029
|
+
applyAnthropicUsageExtras(output.usage, event.usage);
|
|
992
1030
|
output.usage.totalTokens =
|
|
993
1031
|
output.usage.input + output.usage.output + output.usage.cacheRead + output.usage.cacheWrite;
|
|
994
1032
|
calculateCost(model, output.usage);
|
package/src/providers/google-gemini-cli.ts
CHANGED
|
@@ -804,14 +804,14 @@ export const streamGoogleGeminiCli: StreamFunction<"google-gemini-cli"> = (
|
|
|
804
804
|
// promptTokenCount includes cachedContentTokenCount, so subtract to get fresh input
|
|
805
805
|
const promptTokens = responseData.usageMetadata.promptTokenCount || 0;
|
|
806
806
|
const cacheReadTokens = responseData.usageMetadata.cachedContentTokenCount || 0;
|
|
807
|
+
const thinkingTokens = responseData.usageMetadata.thoughtsTokenCount || 0;
|
|
807
808
|
output.usage = {
|
|
808
809
|
input: promptTokens - cacheReadTokens,
|
|
809
|
-
output:
|
|
810
|
-
(responseData.usageMetadata.candidatesTokenCount || 0) +
|
|
811
|
-
(responseData.usageMetadata.thoughtsTokenCount || 0),
|
|
810
|
+
output: (responseData.usageMetadata.candidatesTokenCount || 0) + thinkingTokens,
|
|
812
811
|
cacheRead: cacheReadTokens,
|
|
813
812
|
cacheWrite: 0,
|
|
814
813
|
totalTokens: responseData.usageMetadata.totalTokenCount || 0,
|
|
814
|
+
...(thinkingTokens > 0 ? { reasoningTokens: thinkingTokens } : {}),
|
|
815
815
|
cost: {
|
|
816
816
|
input: 0,
|
|
817
817
|
output: 0,
|
package/src/providers/google-vertex.ts
CHANGED
|
@@ -243,13 +243,14 @@ export const streamGoogleVertex: StreamFunction<"google-vertex"> = (
|
|
|
243
243
|
// input + cacheRead = total prompt tokens (no double-counting).
|
|
244
244
|
// Ref: https://ai.google.dev/api/generate-content#v1beta.GenerateContentResponse.UsageMetadata
|
|
245
245
|
const cachedTokens = chunk.usageMetadata.cachedContentTokenCount || 0;
|
|
246
|
+
const thinkingTokens = chunk.usageMetadata.thoughtsTokenCount || 0;
|
|
246
247
|
output.usage = {
|
|
247
248
|
input: (chunk.usageMetadata.promptTokenCount || 0) - cachedTokens,
|
|
248
|
-
output:
|
|
249
|
-
(chunk.usageMetadata.candidatesTokenCount || 0) + (chunk.usageMetadata.thoughtsTokenCount || 0),
|
|
249
|
+
output: (chunk.usageMetadata.candidatesTokenCount || 0) + thinkingTokens,
|
|
250
250
|
cacheRead: cachedTokens,
|
|
251
251
|
cacheWrite: 0,
|
|
252
252
|
totalTokens: chunk.usageMetadata.totalTokenCount || 0,
|
|
253
|
+
...(thinkingTokens > 0 ? { reasoningTokens: thinkingTokens } : {}),
|
|
253
254
|
cost: {
|
|
254
255
|
input: 0,
|
|
255
256
|
output: 0,
|
package/src/providers/google.ts
CHANGED
|
@@ -223,13 +223,14 @@ export const streamGoogle: StreamFunction<"google-generative-ai"> = (
|
|
|
223
223
|
// input + cacheRead = total prompt tokens (no double-counting).
|
|
224
224
|
// Ref: https://ai.google.dev/api/generate-content#v1beta.GenerateContentResponse.UsageMetadata
|
|
225
225
|
const cachedTokens = chunk.usageMetadata.cachedContentTokenCount || 0;
|
|
226
|
+
const thinkingTokens = chunk.usageMetadata.thoughtsTokenCount || 0;
|
|
226
227
|
output.usage = {
|
|
227
228
|
input: (chunk.usageMetadata.promptTokenCount || 0) - cachedTokens,
|
|
228
|
-
output:
|
|
229
|
-
(chunk.usageMetadata.candidatesTokenCount || 0) + (chunk.usageMetadata.thoughtsTokenCount || 0),
|
|
229
|
+
output: (chunk.usageMetadata.candidatesTokenCount || 0) + thinkingTokens,
|
|
230
230
|
cacheRead: cachedTokens,
|
|
231
231
|
cacheWrite: 0,
|
|
232
232
|
totalTokens: chunk.usageMetadata.totalTokenCount || 0,
|
|
233
|
+
...(thinkingTokens > 0 ? { reasoningTokens: thinkingTokens } : {}),
|
|
233
234
|
cost: {
|
|
234
235
|
input: 0,
|
|
235
236
|
output: 0,
|
package/src/providers/openai-codex-responses.ts
CHANGED
|
@@ -1119,6 +1119,7 @@ function handleResponseCompleted(
|
|
|
1119
1119
|
output_tokens?: number;
|
|
1120
1120
|
total_tokens?: number;
|
|
1121
1121
|
input_tokens_details?: { cached_tokens?: number };
|
|
1122
|
+
output_tokens_details?: { reasoning_tokens?: number };
|
|
1122
1123
|
};
|
|
1123
1124
|
status?: string;
|
|
1124
1125
|
};
|
|
@@ -1127,12 +1128,14 @@ function handleResponseCompleted(
|
|
|
1127
1128
|
|
|
1128
1129
|
if (response?.usage) {
|
|
1129
1130
|
const cachedTokens = response.usage.input_tokens_details?.cached_tokens || 0;
|
|
1131
|
+
const reasoningTokens = response.usage.output_tokens_details?.reasoning_tokens || 0;
|
|
1130
1132
|
output.usage = {
|
|
1131
1133
|
input: (response.usage.input_tokens || 0) - cachedTokens,
|
|
1132
1134
|
output: response.usage.output_tokens || 0,
|
|
1133
1135
|
cacheRead: cachedTokens,
|
|
1134
1136
|
cacheWrite: 0,
|
|
1135
1137
|
totalTokens: response.usage.total_tokens || 0,
|
|
1138
|
+
...(reasoningTokens > 0 ? { reasoningTokens } : {}),
|
|
1136
1139
|
cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0, total: 0 },
|
|
1137
1140
|
};
|
|
1138
1141
|
}
|
package/src/providers/openai-completions.ts
CHANGED
|
@@ -959,7 +959,7 @@ function getChoiceUsage(choice: ChatCompletionChunk.Choice): object | undefined
|
|
|
959
959
|
return getOptionalObjectProperty(choice, "usage");
|
|
960
960
|
}
|
|
961
961
|
|
|
962
|
-
function parseChunkUsage(
|
|
962
|
+
export function parseChunkUsage(
|
|
963
963
|
rawUsage: object,
|
|
964
964
|
model: Model<"openai-completions">,
|
|
965
965
|
copilotPremiumRequests: number | undefined,
|
|
@@ -970,16 +970,28 @@ function parseChunkUsage(
|
|
|
970
970
|
getOptionalNumberProperty(rawUsage, "cached_tokens") ??
|
|
971
971
|
(promptTokenDetails ? getOptionalNumberProperty(promptTokenDetails, "cached_tokens") : undefined) ??
|
|
972
972
|
0;
|
|
973
|
+
// OpenRouter exposes cache writes via `prompt_tokens_details.cache_write_tokens`
|
|
974
|
+
// and INCLUDES them in `prompt_tokens`. Without subtracting, cache-write tokens
|
|
975
|
+
// leak into `input` (e.g. GLM/Anthropic via OpenRouter on a fresh cache).
|
|
976
|
+
// Ref: https://openrouter.ai/docs/guides/best-practices/prompt-caching
|
|
977
|
+
const cacheWriteTokens = promptTokenDetails
|
|
978
|
+
? (getOptionalNumberProperty(promptTokenDetails, "cache_write_tokens") ?? 0)
|
|
979
|
+
: 0;
|
|
973
980
|
const reasoningTokens =
|
|
974
981
|
(completionTokenDetails ? getOptionalNumberProperty(completionTokenDetails, "reasoning_tokens") : undefined) ?? 0;
|
|
975
|
-
const
|
|
976
|
-
const
|
|
982
|
+
const promptTokens = getOptionalNumberProperty(rawUsage, "prompt_tokens") ?? 0;
|
|
983
|
+
const input = Math.max(0, promptTokens - cachedTokens - cacheWriteTokens);
|
|
984
|
+
// Per OpenAI's CompletionUsage spec, `reasoning_tokens` is a subset of
|
|
985
|
+
// `completion_tokens` (which is the total billed output). Adding them would
|
|
986
|
+
// double-count.
|
|
987
|
+
const outputTokens = getOptionalNumberProperty(rawUsage, "completion_tokens") ?? 0;
|
|
977
988
|
const usage: AssistantMessage["usage"] = {
|
|
978
989
|
input,
|
|
979
990
|
output: outputTokens,
|
|
980
991
|
cacheRead: cachedTokens,
|
|
981
|
-
cacheWrite: 0,
|
|
982
|
-
totalTokens: input + outputTokens + cachedTokens,
|
|
992
|
+
cacheWrite: cacheWriteTokens,
|
|
993
|
+
totalTokens: input + outputTokens + cachedTokens + cacheWriteTokens,
|
|
994
|
+
...(reasoningTokens > 0 ? { reasoningTokens } : {}),
|
|
983
995
|
cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0, total: 0 },
|
|
984
996
|
...(copilotPremiumRequests !== undefined ? { premiumRequests: copilotPremiumRequests } : {}),
|
|
985
997
|
};
|
package/src/providers/openai-responses-shared.ts
CHANGED
|
@@ -527,12 +527,14 @@ export async function processResponsesStream<TApi extends Api>(
|
|
|
527
527
|
}
|
|
528
528
|
if (response?.usage) {
|
|
529
529
|
const cachedTokens = response.usage.input_tokens_details?.cached_tokens || 0;
|
|
530
|
+
const reasoningTokens = response.usage.output_tokens_details?.reasoning_tokens || 0;
|
|
530
531
|
output.usage = {
|
|
531
532
|
input: (response.usage.input_tokens || 0) - cachedTokens,
|
|
532
533
|
output: response.usage.output_tokens || 0,
|
|
533
534
|
cacheRead: cachedTokens,
|
|
534
535
|
cacheWrite: 0,
|
|
535
536
|
totalTokens: response.usage.total_tokens || 0,
|
|
537
|
+
...(reasoningTokens > 0 ? { reasoningTokens } : {}),
|
|
536
538
|
cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0, total: 0 },
|
|
537
539
|
};
|
|
538
540
|
}
|
package/src/types.ts
CHANGED
|
@@ -306,12 +306,43 @@ export interface ToolCall {
|
|
|
306
306
|
}
|
|
307
307
|
|
|
308
308
|
export interface Usage {
|
|
309
|
+
/** Non-cached input tokens (matches the bucket the provider bills as new input). */
|
|
309
310
|
input: number;
|
|
311
|
+
/** Total output tokens for the turn, including thinking, assistant text, and tool-call argument tokens. */
|
|
310
312
|
output: number;
|
|
313
|
+
/** Tokens read from the prompt cache. */
|
|
311
314
|
cacheRead: number;
|
|
315
|
+
/** Tokens written to the prompt cache (cache creation). */
|
|
312
316
|
cacheWrite: number;
|
|
317
|
+
/** Sum of input + output + cacheRead + cacheWrite. */
|
|
313
318
|
totalTokens: number;
|
|
319
|
+
/** Copilot premium-request counter, when applicable. */
|
|
314
320
|
premiumRequests?: number;
|
|
321
|
+
/**
|
|
322
|
+
* Reasoning/thinking tokens included in `output`, when the provider reports them
|
|
323
|
+
* (OpenAI `output_tokens_details.reasoning_tokens`, Google `thoughtsTokenCount`).
|
|
324
|
+
* Always a subset of `output` — non-reasoning output is `output - reasoningTokens`.
|
|
325
|
+
*
|
|
326
|
+
* Providers that don't expose this leave it undefined rather than guessing;
|
|
327
|
+
* `undefined` means unknown, NOT zero.
|
|
328
|
+
*/
|
|
329
|
+
reasoningTokens?: number;
|
|
330
|
+
/**
|
|
331
|
+
* Cache-write TTL breakdown (Anthropic only). When set, the components sum to
|
|
332
|
+
* `cacheWrite`. Absent providers do not populate this.
|
|
333
|
+
*/
|
|
334
|
+
cttl?: {
|
|
335
|
+
ephemeral5m?: number;
|
|
336
|
+
ephemeral1h?: number;
|
|
337
|
+
};
|
|
338
|
+
/**
|
|
339
|
+
* Server-side tool invocations made during this turn (Anthropic web_search /
|
|
340
|
+
* web_fetch, OpenAI built-in tools when reported). Counts requests, not tokens.
|
|
341
|
+
*/
|
|
342
|
+
server?: {
|
|
343
|
+
webSearch?: number;
|
|
344
|
+
webFetch?: number;
|
|
345
|
+
};
|
|
315
346
|
cost: {
|
|
316
347
|
input: number;
|
|
317
348
|
output: number;
|