@oh-my-pi/pi-ai 14.5.7 → 14.5.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md CHANGED
@@ -2,6 +2,18 @@
2
2
 
3
3
  ## [Unreleased]
4
4
 
5
+ ## [14.5.9] - 2026-04-30
6
+ ### Added
7
+
8
+ - Added `usage.reasoningTokens` to OpenAI and Google usage output when providers report reasoning/thinking tokens
9
+ - Added `usage.cttl.ephemeral5m` and `usage.cttl.ephemeral1h` to report Anthropic cache-write TTL token buckets
10
+ - Added `usage.server.webSearch` and `usage.server.webFetch` to report Anthropic server tool-call request counts
11
+
12
+ ### Fixed
13
+
14
+ - Fixed OpenAI usage attribution to avoid double-counting `reasoning_tokens` in output totals
15
+ - Fixed Anthropic streaming usage handling so a previously populated cache TTL breakdown is preserved when later events omit `cache_creation`
16
+
5
17
  ## [14.5.4] - 2026-04-28
6
18
  ### Changed
7
19
 
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "type": "module",
3
3
  "name": "@oh-my-pi/pi-ai",
4
- "version": "14.5.7",
4
+ "version": "14.5.9",
5
5
  "description": "Unified LLM API with automatic model discovery and provider configuration",
6
6
  "homepage": "https://github.com/can1357/oh-my-pi",
7
7
  "author": "Can Boluk",
@@ -46,8 +46,8 @@
46
46
  "@aws-sdk/credential-provider-node": "^3.972.36",
47
47
  "@bufbuild/protobuf": "^2.12.0",
48
48
  "@google/genai": "^1.50.1",
49
- "@oh-my-pi/pi-natives": "14.5.7",
50
- "@oh-my-pi/pi-utils": "14.5.7",
49
+ "@oh-my-pi/pi-natives": "14.5.9",
50
+ "@oh-my-pi/pi-utils": "14.5.9",
51
51
  "@sinclair/typebox": "^0.34.49",
52
52
  "@smithy/node-http-handler": "^4.6.1",
53
53
  "ajv": "^8.20.0",
@@ -698,6 +698,42 @@ function createEmptyUsage(premiumRequests?: number): Usage {
698
698
  };
699
699
  }
700
700
 
701
+ export type AnthropicUsageLike = {
702
+ cache_creation?: { ephemeral_5m_input_tokens?: number | null; ephemeral_1h_input_tokens?: number | null } | null;
703
+ server_tool_use?: { web_search_requests?: number | null; web_fetch_requests?: number | null } | null;
704
+ };
705
+
706
+ /**
707
+ * Capture Anthropic's optional cache-creation TTL breakdown and server-tool-use
708
+ * counters into the harness Usage shape. Only sets a group when at least one of
709
+ * its counters is reported as non-zero, so a `message_delta` that omits (or zeroes)
710
+ * `cache_creation` does not clobber the breakdown established at `message_start`.
711
+ */
712
+ export function applyAnthropicUsageExtras(usage: Usage, source: AnthropicUsageLike): void {
713
+ const cacheCreation = source.cache_creation;
714
+ if (cacheCreation) {
715
+ const fiveMinute = cacheCreation.ephemeral_5m_input_tokens ?? 0;
716
+ const oneHour = cacheCreation.ephemeral_1h_input_tokens ?? 0;
717
+ if (fiveMinute > 0 || oneHour > 0) {
718
+ usage.cttl = {
719
+ ...(fiveMinute > 0 ? { ephemeral5m: fiveMinute } : {}),
720
+ ...(oneHour > 0 ? { ephemeral1h: oneHour } : {}),
721
+ };
722
+ }
723
+ }
724
+ const serverToolUse = source.server_tool_use;
725
+ if (serverToolUse) {
726
+ const webSearch = serverToolUse.web_search_requests ?? 0;
727
+ const webFetch = serverToolUse.web_fetch_requests ?? 0;
728
+ if (webSearch > 0 || webFetch > 0) {
729
+ usage.server = {
730
+ ...(webSearch > 0 ? { webSearch } : {}),
731
+ ...(webFetch > 0 ? { webFetch } : {}),
732
+ };
733
+ }
734
+ }
735
+ }
736
+
701
737
  export const streamAnthropic: StreamFunction<"anthropic-messages"> = (
702
738
  model: Model<"anthropic-messages">,
703
739
  context: Context,
@@ -824,6 +860,7 @@ export const streamAnthropic: StreamFunction<"anthropic-messages"> = (
824
860
  continue;
825
861
  }
826
862
  sawMessageStart = true;
863
+ applyAnthropicUsageExtras(output.usage, event.message.usage);
827
864
  output.responseId = event.message.id;
828
865
  output.usage.input = event.message.usage.input_tokens || 0;
829
866
  output.usage.output = event.message.usage.output_tokens || 0;
@@ -989,6 +1026,7 @@ export const streamAnthropic: StreamFunction<"anthropic-messages"> = (
989
1026
  if (event.usage.cache_creation_input_tokens != null) {
990
1027
  output.usage.cacheWrite = event.usage.cache_creation_input_tokens;
991
1028
  }
1029
+ applyAnthropicUsageExtras(output.usage, event.usage);
992
1030
  output.usage.totalTokens =
993
1031
  output.usage.input + output.usage.output + output.usage.cacheRead + output.usage.cacheWrite;
994
1032
  calculateCost(model, output.usage);
@@ -804,14 +804,14 @@ export const streamGoogleGeminiCli: StreamFunction<"google-gemini-cli"> = (
804
804
  // promptTokenCount includes cachedContentTokenCount, so subtract to get fresh input
805
805
  const promptTokens = responseData.usageMetadata.promptTokenCount || 0;
806
806
  const cacheReadTokens = responseData.usageMetadata.cachedContentTokenCount || 0;
807
+ const thinkingTokens = responseData.usageMetadata.thoughtsTokenCount || 0;
807
808
  output.usage = {
808
809
  input: promptTokens - cacheReadTokens,
809
- output:
810
- (responseData.usageMetadata.candidatesTokenCount || 0) +
811
- (responseData.usageMetadata.thoughtsTokenCount || 0),
810
+ output: (responseData.usageMetadata.candidatesTokenCount || 0) + thinkingTokens,
812
811
  cacheRead: cacheReadTokens,
813
812
  cacheWrite: 0,
814
813
  totalTokens: responseData.usageMetadata.totalTokenCount || 0,
814
+ ...(thinkingTokens > 0 ? { reasoningTokens: thinkingTokens } : {}),
815
815
  cost: {
816
816
  input: 0,
817
817
  output: 0,
@@ -243,13 +243,14 @@ export const streamGoogleVertex: StreamFunction<"google-vertex"> = (
243
243
  // input + cacheRead = total prompt tokens (no double-counting).
244
244
  // Ref: https://ai.google.dev/api/generate-content#v1beta.GenerateContentResponse.UsageMetadata
245
245
  const cachedTokens = chunk.usageMetadata.cachedContentTokenCount || 0;
246
+ const thinkingTokens = chunk.usageMetadata.thoughtsTokenCount || 0;
246
247
  output.usage = {
247
248
  input: (chunk.usageMetadata.promptTokenCount || 0) - cachedTokens,
248
- output:
249
- (chunk.usageMetadata.candidatesTokenCount || 0) + (chunk.usageMetadata.thoughtsTokenCount || 0),
249
+ output: (chunk.usageMetadata.candidatesTokenCount || 0) + thinkingTokens,
250
250
  cacheRead: cachedTokens,
251
251
  cacheWrite: 0,
252
252
  totalTokens: chunk.usageMetadata.totalTokenCount || 0,
253
+ ...(thinkingTokens > 0 ? { reasoningTokens: thinkingTokens } : {}),
253
254
  cost: {
254
255
  input: 0,
255
256
  output: 0,
@@ -223,13 +223,14 @@ export const streamGoogle: StreamFunction<"google-generative-ai"> = (
223
223
  // input + cacheRead = total prompt tokens (no double-counting).
224
224
  // Ref: https://ai.google.dev/api/generate-content#v1beta.GenerateContentResponse.UsageMetadata
225
225
  const cachedTokens = chunk.usageMetadata.cachedContentTokenCount || 0;
226
+ const thinkingTokens = chunk.usageMetadata.thoughtsTokenCount || 0;
226
227
  output.usage = {
227
228
  input: (chunk.usageMetadata.promptTokenCount || 0) - cachedTokens,
228
- output:
229
- (chunk.usageMetadata.candidatesTokenCount || 0) + (chunk.usageMetadata.thoughtsTokenCount || 0),
229
+ output: (chunk.usageMetadata.candidatesTokenCount || 0) + thinkingTokens,
230
230
  cacheRead: cachedTokens,
231
231
  cacheWrite: 0,
232
232
  totalTokens: chunk.usageMetadata.totalTokenCount || 0,
233
+ ...(thinkingTokens > 0 ? { reasoningTokens: thinkingTokens } : {}),
233
234
  cost: {
234
235
  input: 0,
235
236
  output: 0,
@@ -1119,6 +1119,7 @@ function handleResponseCompleted(
1119
1119
  output_tokens?: number;
1120
1120
  total_tokens?: number;
1121
1121
  input_tokens_details?: { cached_tokens?: number };
1122
+ output_tokens_details?: { reasoning_tokens?: number };
1122
1123
  };
1123
1124
  status?: string;
1124
1125
  };
@@ -1127,12 +1128,14 @@ function handleResponseCompleted(
1127
1128
 
1128
1129
  if (response?.usage) {
1129
1130
  const cachedTokens = response.usage.input_tokens_details?.cached_tokens || 0;
1131
+ const reasoningTokens = response.usage.output_tokens_details?.reasoning_tokens || 0;
1130
1132
  output.usage = {
1131
1133
  input: (response.usage.input_tokens || 0) - cachedTokens,
1132
1134
  output: response.usage.output_tokens || 0,
1133
1135
  cacheRead: cachedTokens,
1134
1136
  cacheWrite: 0,
1135
1137
  totalTokens: response.usage.total_tokens || 0,
1138
+ ...(reasoningTokens > 0 ? { reasoningTokens } : {}),
1136
1139
  cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0, total: 0 },
1137
1140
  };
1138
1141
  }
@@ -959,7 +959,7 @@ function getChoiceUsage(choice: ChatCompletionChunk.Choice): object | undefined
959
959
  return getOptionalObjectProperty(choice, "usage");
960
960
  }
961
961
 
962
- function parseChunkUsage(
962
+ export function parseChunkUsage(
963
963
  rawUsage: object,
964
964
  model: Model<"openai-completions">,
965
965
  copilotPremiumRequests: number | undefined,
@@ -970,16 +970,28 @@ function parseChunkUsage(
970
970
  getOptionalNumberProperty(rawUsage, "cached_tokens") ??
971
971
  (promptTokenDetails ? getOptionalNumberProperty(promptTokenDetails, "cached_tokens") : undefined) ??
972
972
  0;
973
+ // OpenRouter exposes cache writes via `prompt_tokens_details.cache_write_tokens`
974
+ // and INCLUDES them in `prompt_tokens`. Without subtracting, cache-write tokens
975
+ // leak into `input` (e.g. GLM/Anthropic via OpenRouter on a fresh cache).
976
+ // Ref: https://openrouter.ai/docs/guides/best-practices/prompt-caching
977
+ const cacheWriteTokens = promptTokenDetails
978
+ ? (getOptionalNumberProperty(promptTokenDetails, "cache_write_tokens") ?? 0)
979
+ : 0;
973
980
  const reasoningTokens =
974
981
  (completionTokenDetails ? getOptionalNumberProperty(completionTokenDetails, "reasoning_tokens") : undefined) ?? 0;
975
- const input = (getOptionalNumberProperty(rawUsage, "prompt_tokens") ?? 0) - cachedTokens;
976
- const outputTokens = (getOptionalNumberProperty(rawUsage, "completion_tokens") ?? 0) + reasoningTokens;
982
+ const promptTokens = getOptionalNumberProperty(rawUsage, "prompt_tokens") ?? 0;
983
+ const input = Math.max(0, promptTokens - cachedTokens - cacheWriteTokens);
984
+ // Per OpenAI's CompletionUsage spec, `reasoning_tokens` is a subset of
985
+ // `completion_tokens` (which is the total billed output). Adding them would
986
+ // double-count.
987
+ const outputTokens = getOptionalNumberProperty(rawUsage, "completion_tokens") ?? 0;
977
988
  const usage: AssistantMessage["usage"] = {
978
989
  input,
979
990
  output: outputTokens,
980
991
  cacheRead: cachedTokens,
981
- cacheWrite: 0,
982
- totalTokens: input + outputTokens + cachedTokens,
992
+ cacheWrite: cacheWriteTokens,
993
+ totalTokens: input + outputTokens + cachedTokens + cacheWriteTokens,
994
+ ...(reasoningTokens > 0 ? { reasoningTokens } : {}),
983
995
  cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0, total: 0 },
984
996
  ...(copilotPremiumRequests !== undefined ? { premiumRequests: copilotPremiumRequests } : {}),
985
997
  };
@@ -527,12 +527,14 @@ export async function processResponsesStream<TApi extends Api>(
527
527
  }
528
528
  if (response?.usage) {
529
529
  const cachedTokens = response.usage.input_tokens_details?.cached_tokens || 0;
530
+ const reasoningTokens = response.usage.output_tokens_details?.reasoning_tokens || 0;
530
531
  output.usage = {
531
532
  input: (response.usage.input_tokens || 0) - cachedTokens,
532
533
  output: response.usage.output_tokens || 0,
533
534
  cacheRead: cachedTokens,
534
535
  cacheWrite: 0,
535
536
  totalTokens: response.usage.total_tokens || 0,
537
+ ...(reasoningTokens > 0 ? { reasoningTokens } : {}),
536
538
  cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0, total: 0 },
537
539
  };
538
540
  }
package/src/stream.ts CHANGED
@@ -196,8 +196,13 @@ export function stream<TApi extends Api>(
196
196
 
197
197
  const api: Api = model.api;
198
198
  switch (api) {
199
- case "anthropic-messages":
200
- return streamAnthropic(model as Model<"anthropic-messages">, context, providerOptions);
199
+ case "anthropic-messages": {
200
+ const anthropicOptions = providerOptions as AnthropicOptions;
201
+ return streamAnthropic(model as Model<"anthropic-messages">, context, {
202
+ ...anthropicOptions,
203
+ isOAuth: anthropicOptions.isOAuth ?? model.isOAuth,
204
+ });
205
+ }
201
206
 
202
207
  case "openai-completions":
203
208
  return streamOpenAICompletions(model as Model<"openai-completions">, context, providerOptions as any);
package/src/types.ts CHANGED
@@ -306,12 +306,43 @@ export interface ToolCall {
306
306
  }
307
307
 
308
308
  export interface Usage {
309
+ /** Non-cached input tokens (matches the bucket the provider bills as new input). */
309
310
  input: number;
311
+ /** Total output tokens for the turn, including thinking, assistant text, and tool-call argument tokens. */
310
312
  output: number;
313
+ /** Tokens read from the prompt cache. */
311
314
  cacheRead: number;
315
+ /** Tokens written to the prompt cache (cache creation). */
312
316
  cacheWrite: number;
317
+ /** Sum of input + output + cacheRead + cacheWrite. */
313
318
  totalTokens: number;
319
+ /** Copilot premium-request counter, when applicable. */
314
320
  premiumRequests?: number;
321
+ /**
322
+ * Reasoning/thinking tokens included in `output`, when the provider reports them
323
+ * (OpenAI `output_tokens_details.reasoning_tokens`, Google `thoughtsTokenCount`).
324
+ * Always a subset of `output` — non-reasoning output is `output - reasoningTokens`.
325
+ *
326
+ * Omitted when the provider does not report it or reports zero, so
327
+ * `undefined` may mean either unknown or zero.
328
+ */
329
+ reasoningTokens?: number;
330
+ /**
331
+ * Cache-write TTL breakdown (Anthropic only). When set, the components sum to
332
+ * `cacheWrite`. Other providers do not populate this.
333
+ */
334
+ cttl?: {
335
+ ephemeral5m?: number;
336
+ ephemeral1h?: number;
337
+ };
338
+ /**
339
+ * Server-side tool invocations made during this turn (Anthropic web_search /
340
+ * web_fetch, OpenAI built-in tools when reported). Counts requests, not tokens.
341
+ */
342
+ server?: {
343
+ webSearch?: number;
344
+ webFetch?: number;
345
+ };
315
346
  cost: {
316
347
  input: number;
317
348
  output: number;
@@ -587,4 +618,11 @@ export interface Model<TApi extends Api = any> {
587
618
  * - `"function"` or undefined: JSON function-tool with `{input: string}` (spec §1.2).
588
619
  */
589
620
  applyPatchToolType?: "freeform" | "function";
621
+ /**
622
+ * Force OAuth-style request shaping for providers whose API key prefix doesn't
623
+ * match an OAuth token (e.g. routing Anthropic traffic through a proxy that
624
+ * expects Claude Code framing). When set, the streaming layer uses it as the
625
+ * default for `options.isOAuth`; an explicit `options.isOAuth` takes precedence.
626
+ */
627
+ isOAuth?: boolean;
590
628
  }