@oh-my-pi/pi-ai 14.7.4 → 14.7.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md CHANGED
@@ -2,6 +2,25 @@
2
2
 
3
3
  ## [Unreleased]
4
4
 
5
+ ## [14.7.6] - 2026-05-07
6
+
7
+ ### Added
8
+
9
+ - Added `hideThinkingSummary` option to `SimpleStreamOptions`. When true, `streamSimple` requests that the underlying provider omit reasoning/thinking summaries: Anthropic receives `thinking.display = "omitted"` (where supported), and OpenAI Responses / Azure / Codex providers leave `reasoning.summary` unset so the server skips emitting the human-readable summary stream entirely.
10
+
11
+ ### Changed
12
+
13
+ - Changed OpenAI Responses, Azure OpenAI Responses, and OpenAI Codex providers to omit `reasoning.summary` from requests when `reasoningSummary` is explicitly `null` (previously fell back to `"auto"`).
14
+
+ ## [14.7.5] - 2026-05-07
15
+
16
+ ### Added
17
+
18
+ - Added `OpenAICompat.supportsMultipleSystemMessages` so chat-completions hosts can opt out of separate leading system blocks. Auto-detected as `true` for OpenAI, Azure, OpenRouter, Cerebras, Together, Fireworks, Groq, DeepSeek, Mistral, xAI, Z.ai, GitHub Copilot, and Zenmux; `false` for MiniMax, Alibaba Dashscope, and Qwen Portal whose chat templates reject follow-up system messages. Unknown OpenAI-compatible hosts (custom vLLM/local) default to `false`; users can opt back in via `compat.supportsMultipleSystemMessages: true`.
19
+
20
+ ### Fixed
21
+
22
+ - Fixed strict-template OpenAI-compatible hosts (e.g. Qwen 3.5+ via vLLM, MiniMax) rejecting follow-up `system`/`developer` messages by coalescing ordered system prompts into a single block joined by `\n\n` when `compat.supportsMultipleSystemMessages` is false. Canonical hosts continue to receive separate blocks so KV-cache reuse stays effective when only the trailing prompt changes ([#958](https://github.com/can1357/oh-my-pi/issues/958)).
23
+
5
24
  ## [14.7.2] - 2026-05-06
6
25
 
7
26
  ### Fixed
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "type": "module",
3
3
  "name": "@oh-my-pi/pi-ai",
4
- "version": "14.7.4",
4
+ "version": "14.7.6",
5
5
  "description": "Unified LLM API with automatic model discovery and provider configuration",
6
6
  "homepage": "https://github.com/can1357/oh-my-pi",
7
7
  "author": "Can Boluk",
@@ -46,8 +46,8 @@
46
46
  "@aws-sdk/credential-provider-node": "^3.972.39",
47
47
  "@bufbuild/protobuf": "^2.12.0",
48
48
  "@google/genai": "^1.52.0",
49
- "@oh-my-pi/pi-natives": "14.7.4",
50
- "@oh-my-pi/pi-utils": "14.7.4",
49
+ "@oh-my-pi/pi-natives": "14.7.6",
50
+ "@oh-my-pi/pi-utils": "14.7.6",
51
51
  "@sinclair/typebox": "^0.34.49",
52
52
  "@smithy/node-http-handler": "^4.6.1",
53
53
  "ajv": "^8.20.0",
@@ -317,11 +317,14 @@ function buildParams(
317
317
  // See: https://github.com/can1357/oh-my-pi/issues/41
318
318
  params.include = ["reasoning.encrypted_content"];
319
319
 
320
- if (options?.reasoning || options?.reasoningSummary) {
321
- params.reasoning = {
320
+ if (options?.reasoning || options?.reasoningSummary !== undefined) {
321
+ const reasoningParams: NonNullable<typeof params.reasoning> = {
322
322
  effort: options?.reasoning || "medium",
323
- summary: options?.reasoningSummary || "auto",
324
323
  };
324
+ if (options?.reasoningSummary !== null) {
325
+ reasoningParams.summary = options?.reasoningSummary || "auto";
326
+ }
327
+ params.reasoning = reasoningParams;
325
328
  } else {
326
329
  if (model.name.toLowerCase().startsWith("gpt-5")) {
327
330
  // Jesus Christ, see https://community.openai.com/t/need-reasoning-false-option-for-gpt-5/1351588/7
@@ -4,7 +4,7 @@ import type { Api, Model } from "../../types";
4
4
 
5
5
  export interface ReasoningConfig {
6
6
  effort: "none" | "minimal" | "low" | "medium" | "high" | "xhigh";
7
- summary: "auto" | "concise" | "detailed" | null;
7
+ summary?: "auto" | "concise" | "detailed";
8
8
  }
9
9
 
10
10
  export interface CodexRequestOptions {
@@ -52,11 +52,14 @@ export interface RequestBody {
52
52
  }
53
53
 
54
54
  function getReasoningConfig(model: Model<Api>, options: CodexRequestOptions): ReasoningConfig {
55
- return {
55
+ const config: ReasoningConfig = {
56
56
  effort:
57
57
  options.reasoningEffort === "none" ? "none" : requireSupportedEffort(model, options.reasoningEffort as Effort),
58
- summary: options.reasoningSummary ?? "detailed",
59
58
  };
59
+ if (options.reasoningSummary !== null) {
60
+ config.summary = options.reasoningSummary ?? "detailed";
61
+ }
62
+ return config;
60
63
  }
61
64
 
62
65
  function filterInput(input: InputItem[] | undefined): InputItem[] | undefined {
@@ -99,6 +99,52 @@ export function detectOpenAICompat(model: Model<"openai-completions">, resolvedB
99
99
  const isGrok = provider === "xai" || baseUrl.includes("api.x.ai");
100
100
  const isMistral = provider === "mistral" || baseUrl.includes("mistral.ai");
101
101
 
102
+ // Hosts whose chat-completions endpoints are known to accept multiple
103
+ // leading `system`/`developer` messages (preferred for KV-cache reuse).
104
+ // Anything outside this allowlist defaults to coalescing because
105
+ // strict chat templates (Qwen 3.5+ via vLLM, MiniMax, etc.) reject
106
+ // follow-up system messages with a 400.
107
+ const isOpenAIHost = provider === "openai" || baseUrl.includes("api.openai.com");
108
+ const isAzureHost =
109
+ provider === "azure" ||
110
+ baseUrl.includes(".openai.azure.com") ||
111
+ baseUrl.includes("models.inference.ai.azure.com") ||
112
+ baseUrl.includes("azure.com/openai");
113
+ const isOpenRouter = provider === "openrouter" || baseUrl.includes("openrouter.ai");
114
+ const isTogether = provider === "together" || baseUrl.includes("api.together.xyz");
115
+ const isFireworks = baseUrl.includes("fireworks.ai");
116
+ const isGroqHost = provider === "groq" || baseUrl.includes("api.groq.com");
117
+ const isCopilotHost = provider === "github-copilot";
118
+ const isZenmuxHost = provider === "zenmux";
119
+ // Endpoints that MUST receive a single system block. MiniMax's OpenAI
120
+ // endpoint returns error 2013 on multiple system messages; Alibaba's
121
+ // Dashscope and Qwen Portal serve Qwen models whose chat template
122
+ // raises "System message must be at the beginning" if any system
123
+ // message appears past index 0.
124
+ const isMiniMaxHost =
125
+ provider === "minimax-code" ||
126
+ provider === "minimax-code-cn" ||
127
+ baseUrl.includes("api.minimax.io") ||
128
+ baseUrl.includes("api.minimaxi.com");
129
+ const isQwenPortal = provider === "qwen-portal" || baseUrl.includes("portal.qwen.ai");
130
+ const supportsMultipleSystemMessagesDefault =
131
+ !isMiniMaxHost &&
132
+ !isAlibaba &&
133
+ !isQwenPortal &&
134
+ (isOpenAIHost ||
135
+ isAzureHost ||
136
+ isOpenRouter ||
137
+ isCerebras ||
138
+ isTogether ||
139
+ isFireworks ||
140
+ isGroqHost ||
141
+ isDeepseekFamily ||
142
+ isMistral ||
143
+ isGrok ||
144
+ isZai ||
145
+ isCopilotHost ||
146
+ isZenmuxHost);
147
+
102
148
  const reasoningEffortMap: NonNullable<OpenAICompat["reasoningEffortMap"]> =
103
149
  provider === "groq" && model.id === "qwen/qwen3-32b"
104
150
  ? ({
@@ -115,6 +161,7 @@ export function detectOpenAICompat(model: Model<"openai-completions">, resolvedB
115
161
  return {
116
162
  supportsStore: !isNonStandard,
117
163
  supportsDeveloperRole: !isNonStandard,
164
+ supportsMultipleSystemMessages: supportsMultipleSystemMessagesDefault,
118
165
  supportsReasoningEffort: !isGrok && !isZai,
119
166
  reasoningEffortMap,
120
167
  supportsUsageInStreaming: !isCerebras,
@@ -175,6 +222,8 @@ export function resolveOpenAICompat(
175
222
  return {
176
223
  supportsStore: model.compat.supportsStore ?? detected.supportsStore,
177
224
  supportsDeveloperRole: model.compat.supportsDeveloperRole ?? detected.supportsDeveloperRole,
225
+ supportsMultipleSystemMessages:
226
+ model.compat.supportsMultipleSystemMessages ?? detected.supportsMultipleSystemMessages,
178
227
  supportsReasoningEffort: model.compat.supportsReasoningEffort ?? detected.supportsReasoningEffort,
179
228
  reasoningEffortMap: model.compat.reasoningEffortMap ?? detected.reasoningEffortMap,
180
229
  supportsUsageInStreaming: model.compat.supportsUsageInStreaming ?? detected.supportsUsageInStreaming,
@@ -1191,8 +1191,18 @@ export function convertMessages(
1191
1191
  if (systemPrompts.length > 0) {
1192
1192
  const useDeveloperRole = model.reasoning && compat.supportsDeveloperRole;
1193
1193
  const role = useDeveloperRole ? "developer" : "system";
1194
- for (const systemPrompt of systemPrompts) {
1195
- params.push({ role, content: systemPrompt });
1194
+ // Default to one block per ordered system prompt so the leading prefix
1195
+ // stays byte-identical between turns and the provider's KV cache can
1196
+ // reuse it. Hosts whose chat templates reject follow-up system messages
1197
+ // (Qwen via vLLM, MiniMax, Alibaba Dashscope, Qwen Portal, …) opt out
1198
+ // via `compat.supportsMultipleSystemMessages = false`; in that mode we
1199
+ // coalesce into a single message joined by `\n\n`.
1200
+ if (compat.supportsMultipleSystemMessages) {
1201
+ for (const systemPrompt of systemPrompts) {
1202
+ params.push({ role, content: systemPrompt });
1203
+ }
1204
+ } else {
1205
+ params.push({ role, content: systemPrompts.join("\n\n") });
1196
1206
  }
1197
1207
  }
1198
1208
 
@@ -430,13 +430,16 @@ function buildParams(
430
430
  // See: https://github.com/can1357/oh-my-pi/issues/41
431
431
  params.include = ["reasoning.encrypted_content"];
432
432
 
433
- if (options?.reasoning || options?.reasoningSummary) {
434
- params.reasoning = {
433
+ if (options?.reasoning || options?.reasoningSummary !== undefined) {
434
+ const reasoningParams: NonNullable<typeof params.reasoning> = {
435
435
  effort: mapReasoningEffort(options?.reasoning || "medium", model.compat?.reasoningEffortMap) as NonNullable<
436
436
  OpenAIResponsesSamplingParams["reasoning"]
437
437
  >["effort"],
438
- summary: options?.reasoningSummary || "auto",
439
438
  };
439
+ if (options?.reasoningSummary !== null) {
440
+ reasoningParams.summary = options?.reasoningSummary || "auto";
441
+ }
442
+ params.reasoning = reasoningParams;
440
443
  } else if (model.name.startsWith("gpt-5")) {
441
444
  // Jesus Christ, see https://community.openai.com/t/need-reasoning-false-option-for-gpt-5/1351588/7
442
445
  messages.push({
package/src/stream.ts CHANGED
@@ -462,6 +462,7 @@ function mapOptionsForApi<TApi extends Api>(
462
462
  ...base,
463
463
  thinkingEnabled: false,
464
464
  toolChoice: mapAnthropicToolChoice(options?.toolChoice),
465
+ thinkingDisplay: options?.hideThinkingSummary ? "omitted" : undefined,
465
466
  });
466
467
  }
467
468
 
@@ -471,6 +472,7 @@ function mapOptionsForApi<TApi extends Api>(
471
472
  ...base,
472
473
  thinkingEnabled: false,
473
474
  toolChoice: mapAnthropicToolChoice(options?.toolChoice),
475
+ thinkingDisplay: options?.hideThinkingSummary ? "omitted" : undefined,
474
476
  });
475
477
  }
476
478
 
@@ -483,6 +485,7 @@ function mapOptionsForApi<TApi extends Api>(
483
485
  thinkingEnabled: true,
484
486
  effort,
485
487
  toolChoice: mapAnthropicToolChoice(options?.toolChoice),
488
+ thinkingDisplay: options?.hideThinkingSummary ? "omitted" : undefined,
486
489
  });
487
490
  }
488
491
 
@@ -492,6 +495,7 @@ function mapOptionsForApi<TApi extends Api>(
492
495
  thinkingEnabled: true,
493
496
  thinkingBudgetTokens: thinkingBudget,
494
497
  toolChoice: mapAnthropicToolChoice(options?.toolChoice),
498
+ thinkingDisplay: options?.hideThinkingSummary ? "omitted" : undefined,
495
499
  });
496
500
  }
497
501
 
@@ -509,6 +513,7 @@ function mapOptionsForApi<TApi extends Api>(
509
513
  ...base,
510
514
  thinkingEnabled: false,
511
515
  toolChoice: mapAnthropicToolChoice(options?.toolChoice),
516
+ thinkingDisplay: options?.hideThinkingSummary ? "omitted" : undefined,
512
517
  });
513
518
  } else {
514
519
  return castApi<"anthropic-messages">({
@@ -517,6 +522,7 @@ function mapOptionsForApi<TApi extends Api>(
517
522
  thinkingEnabled: true,
518
523
  thinkingBudgetTokens: thinkingBudget,
519
524
  toolChoice: mapAnthropicToolChoice(options?.toolChoice),
525
+ thinkingDisplay: options?.hideThinkingSummary ? "omitted" : undefined,
520
526
  });
521
527
  }
522
528
  }
@@ -564,6 +570,7 @@ function mapOptionsForApi<TApi extends Api>(
564
570
  reasoning: resolveOpenAiReasoningEffort(model, options),
565
571
  toolChoice: mapOpenAiToolChoice(options?.toolChoice),
566
572
  serviceTier: options?.serviceTier,
573
+ reasoningSummary: options?.hideThinkingSummary ? null : undefined,
567
574
  });
568
575
 
569
576
  case "azure-openai-responses":
@@ -572,6 +579,7 @@ function mapOptionsForApi<TApi extends Api>(
572
579
  reasoning: resolveOpenAiReasoningEffort(model, options),
573
580
  toolChoice: mapOpenAiToolChoice(options?.toolChoice),
574
581
  serviceTier: options?.serviceTier,
582
+ reasoningSummary: options?.hideThinkingSummary ? null : undefined,
575
583
  });
576
584
 
577
585
  case "openai-codex-responses":
@@ -581,6 +589,7 @@ function mapOptionsForApi<TApi extends Api>(
581
589
  toolChoice: mapOpenAiToolChoice(options?.toolChoice),
582
590
  serviceTier: options?.serviceTier,
583
591
  preferWebsockets: options?.preferWebsockets,
592
+ reasoningSummary: options?.hideThinkingSummary ? null : undefined,
584
593
  });
585
594
 
586
595
  case "google-generative-ai": {
package/src/types.ts CHANGED
@@ -255,6 +255,14 @@ export interface SimpleStreamOptions extends StreamOptions {
255
255
  * this way when `reasoning` is undefined.
256
256
  */
257
257
  disableReasoning?: boolean;
258
+ /**
259
+ * If true, request that the provider omit thinking/reasoning summaries
260
+ * from the response (e.g. Anthropic `thinking.display = "omitted"`,
261
+ * OpenAI Responses `reasoning.summary` left unset). The model still
262
+ * reasons internally; only the human-readable summary stream is dropped.
263
+ * Useful when the UI hides thinking blocks anyway and the summary is wasted bandwidth.
264
+ */
265
+ hideThinkingSummary?: boolean;
258
266
  /** Custom token budgets for thinking levels (token-based providers only) */
259
267
  thinkingBudgets?: ThinkingBudgets;
260
268
  /** Cursor exec handlers for local tool execution */
@@ -540,6 +548,19 @@ export interface OpenAICompat {
540
548
  supportsStore?: boolean;
541
549
  /** Whether the provider supports the `developer` role (vs `system`). Default: auto-detected from URL. */
542
550
  supportsDeveloperRole?: boolean;
551
+ /**
552
+ * Whether the provider's chat-completions endpoint accepts multiple
553
+ * leading `system`/`developer` messages. When false, ordered system
554
+ * prompts are coalesced into a single message joined by `\n\n` so
555
+ * strict chat templates (e.g. Qwen-served via vLLM, MiniMax) accept
556
+ * the request. Default: detected per provider/baseUrl. Canonical
557
+ * OpenAI/Azure/OpenRouter/Cerebras/Together/Fireworks/Groq/DeepSeek/
558
+ * Mistral/xAI/Z.ai/GitHub Copilot/Zenmux are treated as `true`;
559
+ * unknown or strict-template hosts default to `false`. Setting this
560
+ * to `true` preserves separate blocks, which is preferred for
561
+ * KV-cache reuse when the trailing prompt changes between calls.
562
+ */
563
+ supportsMultipleSystemMessages?: boolean;
543
564
  /** Whether the provider supports `reasoning_effort`. Default: auto-detected from URL. */
544
565
  supportsReasoningEffort?: boolean;
545
566
  /** Optional mapping from pi-ai reasoning levels to provider/model-specific `reasoning_effort` values. */