@oh-my-pi/pi-ai 14.7.3 → 14.7.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md CHANGED
@@ -2,6 +2,16 @@
2
2
 
3
3
  ## [Unreleased]
4
4
 
5
+ ## [14.7.5] - 2026-05-07
6
+
7
+ ### Added
8
+
9
+ - Added `OpenAICompat.supportsMultipleSystemMessages` so chat-completions hosts can opt out of separate leading system blocks. Auto-detected as `true` for OpenAI, Azure, OpenRouter, Cerebras, Together, Fireworks, Groq, DeepSeek, Mistral, xAI, Z.ai, GitHub Copilot, and Zenmux; `false` for MiniMax, Alibaba Dashscope, and Qwen Portal whose chat templates reject follow-up system messages. Unknown OpenAI-compatible hosts (custom vLLM/local) default to `false`; users can opt back in via `compat.supportsMultipleSystemMessages: true`.
10
+
11
+ ### Fixed
12
+
13
+ - Fixed strict-template OpenAI-compatible hosts (e.g. Qwen 3.5+ via vLLM, MiniMax) rejecting follow-up `system`/`developer` messages by coalescing ordered system prompts into a single block joined by `\n\n` when `compat.supportsMultipleSystemMessages` is false. Canonical hosts continue to receive separate blocks so KV-cache reuse stays effective when only the trailing prompt changes ([#958](https://github.com/can1357/oh-my-pi/issues/958)).
14
+
5
15
  ## [14.7.2] - 2026-05-06
6
16
 
7
17
  ### Fixed
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "type": "module",
3
3
  "name": "@oh-my-pi/pi-ai",
4
- "version": "14.7.3",
4
+ "version": "14.7.5",
5
5
  "description": "Unified LLM API with automatic model discovery and provider configuration",
6
6
  "homepage": "https://github.com/can1357/oh-my-pi",
7
7
  "author": "Can Boluk",
@@ -46,8 +46,8 @@
46
46
  "@aws-sdk/credential-provider-node": "^3.972.39",
47
47
  "@bufbuild/protobuf": "^2.12.0",
48
48
  "@google/genai": "^1.52.0",
49
- "@oh-my-pi/pi-natives": "14.7.3",
50
- "@oh-my-pi/pi-utils": "14.7.3",
49
+ "@oh-my-pi/pi-natives": "14.7.5",
50
+ "@oh-my-pi/pi-utils": "14.7.5",
51
51
  "@sinclair/typebox": "^0.34.49",
52
52
  "@smithy/node-http-handler": "^4.6.1",
53
53
  "ajv": "^8.20.0",
package/src/(TypeScript source — file header missing from this diff rendering; not part of package.json) CHANGED
@@ -99,6 +99,52 @@ export function detectOpenAICompat(model: Model<"openai-completions">, resolvedB
99
99
  const isGrok = provider === "xai" || baseUrl.includes("api.x.ai");
100
100
  const isMistral = provider === "mistral" || baseUrl.includes("mistral.ai");
101
101
 
102
+ // Hosts whose chat-completions endpoints are known to accept multiple
103
+ // leading `system`/`developer` messages (preferred for KV-cache reuse).
104
+ // Anything outside this allowlist defaults to coalescing because
105
+ // strict chat templates (Qwen 3.5+ via vLLM, MiniMax, etc.) reject
106
+ // follow-up system messages with a 400.
107
+ const isOpenAIHost = provider === "openai" || baseUrl.includes("api.openai.com");
108
+ const isAzureHost =
109
+ provider === "azure" ||
110
+ baseUrl.includes(".openai.azure.com") ||
111
+ baseUrl.includes("models.inference.ai.azure.com") ||
112
+ baseUrl.includes("azure.com/openai");
113
+ const isOpenRouter = provider === "openrouter" || baseUrl.includes("openrouter.ai");
114
+ const isTogether = provider === "together" || baseUrl.includes("api.together.xyz");
115
+ const isFireworks = baseUrl.includes("fireworks.ai");
116
+ const isGroqHost = provider === "groq" || baseUrl.includes("api.groq.com");
117
+ const isCopilotHost = provider === "github-copilot";
118
+ const isZenmuxHost = provider === "zenmux";
119
+ // Endpoints that MUST receive a single system block. MiniMax's OpenAI
120
+ // endpoint returns error 2013 on multiple system messages; Alibaba's
121
+ // Dashscope and Qwen Portal serve Qwen models whose chat template
122
+ // raises "System message must be at the beginning" if any system
123
+ // message appears past index 0.
124
+ const isMiniMaxHost =
125
+ provider === "minimax-code" ||
126
+ provider === "minimax-code-cn" ||
127
+ baseUrl.includes("api.minimax.io") ||
128
+ baseUrl.includes("api.minimaxi.com");
129
+ const isQwenPortal = provider === "qwen-portal" || baseUrl.includes("portal.qwen.ai");
130
+ const supportsMultipleSystemMessagesDefault =
131
+ !isMiniMaxHost &&
132
+ !isAlibaba &&
133
+ !isQwenPortal &&
134
+ (isOpenAIHost ||
135
+ isAzureHost ||
136
+ isOpenRouter ||
137
+ isCerebras ||
138
+ isTogether ||
139
+ isFireworks ||
140
+ isGroqHost ||
141
+ isDeepseekFamily ||
142
+ isMistral ||
143
+ isGrok ||
144
+ isZai ||
145
+ isCopilotHost ||
146
+ isZenmuxHost);
147
+
102
148
  const reasoningEffortMap: NonNullable<OpenAICompat["reasoningEffortMap"]> =
103
149
  provider === "groq" && model.id === "qwen/qwen3-32b"
104
150
  ? ({
@@ -115,6 +161,7 @@ export function detectOpenAICompat(model: Model<"openai-completions">, resolvedB
115
161
  return {
116
162
  supportsStore: !isNonStandard,
117
163
  supportsDeveloperRole: !isNonStandard,
164
+ supportsMultipleSystemMessages: supportsMultipleSystemMessagesDefault,
118
165
  supportsReasoningEffort: !isGrok && !isZai,
119
166
  reasoningEffortMap,
120
167
  supportsUsageInStreaming: !isCerebras,
@@ -175,6 +222,8 @@ export function resolveOpenAICompat(
175
222
  return {
176
223
  supportsStore: model.compat.supportsStore ?? detected.supportsStore,
177
224
  supportsDeveloperRole: model.compat.supportsDeveloperRole ?? detected.supportsDeveloperRole,
225
+ supportsMultipleSystemMessages:
226
+ model.compat.supportsMultipleSystemMessages ?? detected.supportsMultipleSystemMessages,
178
227
  supportsReasoningEffort: model.compat.supportsReasoningEffort ?? detected.supportsReasoningEffort,
179
228
  reasoningEffortMap: model.compat.reasoningEffortMap ?? detected.reasoningEffortMap,
180
229
  supportsUsageInStreaming: model.compat.supportsUsageInStreaming ?? detected.supportsUsageInStreaming,
@@ -1191,8 +1191,18 @@ export function convertMessages(
1191
1191
  if (systemPrompts.length > 0) {
1192
1192
  const useDeveloperRole = model.reasoning && compat.supportsDeveloperRole;
1193
1193
  const role = useDeveloperRole ? "developer" : "system";
1194
- for (const systemPrompt of systemPrompts) {
1195
- params.push({ role, content: systemPrompt });
1194
+ // Default to one block per ordered system prompt so the leading prefix
1195
+ // stays byte-identical between turns and the provider's KV cache can
1196
+ // reuse it. Hosts whose chat templates reject follow-up system messages
1197
+ // (Qwen via vLLM, MiniMax, Alibaba Dashscope, Qwen Portal, …) opt out
1198
+ // via `compat.supportsMultipleSystemMessages = false`; in that mode we
1199
+ // coalesce into a single message joined by `\n\n`.
1200
+ if (compat.supportsMultipleSystemMessages) {
1201
+ for (const systemPrompt of systemPrompts) {
1202
+ params.push({ role, content: systemPrompt });
1203
+ }
1204
+ } else {
1205
+ params.push({ role, content: systemPrompts.join("\n\n") });
1196
1206
  }
1197
1207
  }
1198
1208
 
package/src/types.ts CHANGED
@@ -540,6 +540,19 @@ export interface OpenAICompat {
540
540
  supportsStore?: boolean;
541
541
  /** Whether the provider supports the `developer` role (vs `system`). Default: auto-detected from URL. */
542
542
  supportsDeveloperRole?: boolean;
543
+ /**
544
+ * Whether the provider's chat-completions endpoint accepts multiple
545
+ * leading `system`/`developer` messages. When false, ordered system
546
+ * prompts are coalesced into a single message joined by `\n\n` so
547
+ * strict chat templates (e.g. Qwen-served via vLLM, MiniMax) accept
548
+ * the request. Default: detected per provider/baseUrl. Canonical
549
+ * OpenAI/Azure/OpenRouter/Cerebras/Together/Fireworks/Groq/DeepSeek/
550
+ * Mistral/xAI/Z.ai/GitHub Copilot/Zenmux are treated as `true`;
551
+ * unknown or strict-template hosts default to `false`. Setting this
552
+ * to `true` preserves separate blocks, which is preferred for
553
+ * KV-cache reuse when the trailing prompt changes between calls.
554
+ */
555
+ supportsMultipleSystemMessages?: boolean;
543
556
  /** Whether the provider supports `reasoning_effort`. Default: auto-detected from URL. */
544
557
  supportsReasoningEffort?: boolean;
545
558
  /** Optional mapping from pi-ai reasoning levels to provider/model-specific `reasoning_effort` values. */