@oh-my-pi/pi-ai 14.7.4 → 14.7.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md CHANGED
@@ -2,6 +2,25 @@
2
2
 
3
3
  ## [Unreleased]
4
4
 
5
+ ## [14.7.6] - 2026-05-07
6
+
7
+ ### Added
8
+
9
+ - Added `hideThinkingSummary` option to `SimpleStreamOptions`. When true, `streamSimple` requests that the underlying provider omit reasoning/thinking summaries: Anthropic receives `thinking.display = "omitted"` (where supported), and OpenAI Responses / Azure / Codex providers leave `reasoning.summary` unset so the server skips emitting the human-readable summary stream entirely.
10
+
11
+ ### Changed
12
+
13
+ - Changed OpenAI Responses, Azure OpenAI Responses, and OpenAI Codex providers to omit `reasoning.summary` from requests when `reasoningSummary` is explicitly `null` (previously fell back to `"auto"`).
14
+
+ ## [14.7.5] - 2026-05-07
15
+
16
+ ### Added
17
+
18
+ - Added `OpenAICompat.supportsMultipleSystemMessages` so chat-completions hosts can opt out of separate leading system blocks. Auto-detected as `true` for OpenAI, Azure, OpenRouter, Cerebras, Together, Fireworks, Groq, DeepSeek, Mistral, xAI, Z.ai, GitHub Copilot, and Zenmux; `false` for MiniMax, Alibaba Dashscope, and Qwen Portal whose chat templates reject follow-up system messages. Unknown OpenAI-compatible hosts (custom vLLM/local) default to `false`; users can opt back in via `compat.supportsMultipleSystemMessages: true`.
19
+
20
+ ### Fixed
21
+
22
+ - Fixed strict-template OpenAI-compatible hosts (e.g. Qwen 3.5+ via vLLM, MiniMax) rejecting follow-up `system`/`developer` messages by coalescing ordered system prompts into a single block joined by `\n\n` when `compat.supportsMultipleSystemMessages` is false. Canonical hosts continue to receive separate blocks so KV-cache reuse stays effective when only the trailing prompt changes ([#958](https://github.com/can1357/oh-my-pi/issues/958)).
23
+
5
24
  ## [14.7.2] - 2026-05-06
6
25
 
7
26
  ### Fixed
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "type": "module",
3
3
  "name": "@oh-my-pi/pi-ai",
4
- "version": "14.7.4",
4
+ "version": "14.7.6",
5
5
  "description": "Unified LLM API with automatic model discovery and provider configuration",
6
6
  "homepage": "https://github.com/can1357/oh-my-pi",
7
7
  "author": "Can Boluk",
@@ -46,8 +46,8 @@
46
46
  "@aws-sdk/credential-provider-node": "^3.972.39",
47
47
  "@bufbuild/protobuf": "^2.12.0",
48
48
  "@google/genai": "^1.52.0",
49
- "@oh-my-pi/pi-natives": "14.7.4",
50
- "@oh-my-pi/pi-utils": "14.7.4",
49
+ "@oh-my-pi/pi-natives": "14.7.6",
50
+ "@oh-my-pi/pi-utils": "14.7.6",
51
51
  "@sinclair/typebox": "^0.34.49",
52
52
  "@smithy/node-http-handler": "^4.6.1",
53
53
  "ajv": "^8.20.0",
@@ -317,11 +317,14 @@ function buildParams(
317
317
  // See: https://github.com/can1357/oh-my-pi/issues/41
318
318
  params.include = ["reasoning.encrypted_content"];
319
319
 
320
- if (options?.reasoning || options?.reasoningSummary) {
321
- params.reasoning = {
320
+ if (options?.reasoning || options?.reasoningSummary !== undefined) {
321
+ const reasoningParams: NonNullable<typeof params.reasoning> = {
322
322
  effort: options?.reasoning || "medium",
323
- summary: options?.reasoningSummary || "auto",
324
323
  };
324
+ if (options?.reasoningSummary !== null) {
325
+ reasoningParams.summary = options?.reasoningSummary || "auto";
326
+ }
327
+ params.reasoning = reasoningParams;
325
328
  } else {
326
329
  if (model.name.toLowerCase().startsWith("gpt-5")) {
327
330
  // Jesus Christ, see https://community.openai.com/t/need-reasoning-false-option-for-gpt-5/1351588/7
@@ -4,7 +4,7 @@ import type { Api, Model } from "../../types";
4
4
 
5
5
  export interface ReasoningConfig {
6
6
  effort: "none" | "minimal" | "low" | "medium" | "high" | "xhigh";
7
- summary: "auto" | "concise" | "detailed" | null;
7
+ summary?: "auto" | "concise" | "detailed";
8
8
  }
9
9
 
10
10
  export interface CodexRequestOptions {
@@ -52,11 +52,14 @@ export interface RequestBody {
52
52
  }
53
53
 
54
54
  function getReasoningConfig(model: Model<Api>, options: CodexRequestOptions): ReasoningConfig {
55
- return {
55
+ const config: ReasoningConfig = {
56
56
  effort:
57
57
  options.reasoningEffort === "none" ? "none" : requireSupportedEffort(model, options.reasoningEffort as Effort),
58
- summary: options.reasoningSummary ?? "detailed",
59
58
  };
59
+ if (options.reasoningSummary !== null) {
60
+ config.summary = options.reasoningSummary ?? "detailed";
61
+ }
62
+ return config;
60
63
  }
61
64
 
62
65
  function filterInput(input: InputItem[] | undefined): InputItem[] | undefined {
@@ -99,6 +99,52 @@ export function detectOpenAICompat(model: Model<"openai-completions">, resolvedB
99
99
  const isGrok = provider === "xai" || baseUrl.includes("api.x.ai");
100
100
  const isMistral = provider === "mistral" || baseUrl.includes("mistral.ai");
101
101
 
102
+ // Hosts whose chat-completions endpoints are known to accept multiple
103
+ // leading `system`/`developer` messages (preferred for KV-cache reuse).
104
+ // Anything outside this allowlist defaults to coalescing because
105
+ // strict chat templates (Qwen 3.5+ via vLLM, MiniMax, etc.) reject
106
+ // follow-up system messages with a 400.
107
+ const isOpenAIHost = provider === "openai" || baseUrl.includes("api.openai.com");
108
+ const isAzureHost =
109
+ provider === "azure" ||
110
+ baseUrl.includes(".openai.azure.com") ||
111
+ baseUrl.includes("models.inference.ai.azure.com") ||
112
+ baseUrl.includes("azure.com/openai");
113
+ const isOpenRouter = provider === "openrouter" || baseUrl.includes("openrouter.ai");
114
+ const isTogether = provider === "together" || baseUrl.includes("api.together.xyz");
115
+ const isFireworks = baseUrl.includes("fireworks.ai");
116
+ const isGroqHost = provider === "groq" || baseUrl.includes("api.groq.com");
117
+ const isCopilotHost = provider === "github-copilot";
118
+ const isZenmuxHost = provider === "zenmux";
119
+ // Endpoints that MUST receive a single system block. MiniMax's OpenAI
120
+ // endpoint returns error 2013 on multiple system messages; Alibaba's
121
+ // Dashscope and Qwen Portal serve Qwen models whose chat template
122
+ // raises "System message must be at the beginning" if any system
123
+ // message appears past index 0.
124
+ const isMiniMaxHost =
125
+ provider === "minimax-code" ||
126
+ provider === "minimax-code-cn" ||
127
+ baseUrl.includes("api.minimax.io") ||
128
+ baseUrl.includes("api.minimaxi.com");
129
+ const isQwenPortal = provider === "qwen-portal" || baseUrl.includes("portal.qwen.ai");
130
+ const supportsMultipleSystemMessagesDefault =
131
+ !isMiniMaxHost &&
132
+ !isAlibaba &&
133
+ !isQwenPortal &&
134
+ (isOpenAIHost ||
135
+ isAzureHost ||
136
+ isOpenRouter ||
137
+ isCerebras ||
138
+ isTogether ||
139
+ isFireworks ||
140
+ isGroqHost ||
141
+ isDeepseekFamily ||
142
+ isMistral ||
143
+ isGrok ||
144
+ isZai ||
145
+ isCopilotHost ||
146
+ isZenmuxHost);
147
+
102
148
  const reasoningEffortMap: NonNullable<OpenAICompat["reasoningEffortMap"]> =
103
149
  provider === "groq" && model.id === "qwen/qwen3-32b"
104
150
  ? ({
@@ -115,6 +161,7 @@ export function detectOpenAICompat(model: Model<"openai-completions">, resolvedB
115
161
  return {
116
162
  supportsStore: !isNonStandard,
117
163
  supportsDeveloperRole: !isNonStandard,
164
+ supportsMultipleSystemMessages: supportsMultipleSystemMessagesDefault,
118
165
  supportsReasoningEffort: !isGrok && !isZai,
119
166
  reasoningEffortMap,
120
167
  supportsUsageInStreaming: !isCerebras,
@@ -175,6 +222,8 @@ export function resolveOpenAICompat(
175
222
  return {
176
223
  supportsStore: model.compat.supportsStore ?? detected.supportsStore,
177
224
  supportsDeveloperRole: model.compat.supportsDeveloperRole ?? detected.supportsDeveloperRole,
225
+ supportsMultipleSystemMessages:
226
+ model.compat.supportsMultipleSystemMessages ?? detected.supportsMultipleSystemMessages,
178
227
  supportsReasoningEffort: model.compat.supportsReasoningEffort ?? detected.supportsReasoningEffort,
179
228
  reasoningEffortMap: model.compat.reasoningEffortMap ?? detected.reasoningEffortMap,
180
229
  supportsUsageInStreaming: model.compat.supportsUsageInStreaming ?? detected.supportsUsageInStreaming,
@@ -1191,8 +1191,18 @@ export function convertMessages(
1191
1191
  if (systemPrompts.length > 0) {
1192
1192
  const useDeveloperRole = model.reasoning && compat.supportsDeveloperRole;
1193
1193
  const role = useDeveloperRole ? "developer" : "system";
1194
- for (const systemPrompt of systemPrompts) {
1195
- params.push({ role, content: systemPrompt });
1194
+ // Default to one block per ordered system prompt so the leading prefix
1195
+ // stays byte-identical between turns and the provider's KV cache can
1196
+ // reuse it. Hosts whose chat templates reject follow-up system messages
1197
+ // (Qwen via vLLM, MiniMax, Alibaba Dashscope, Qwen Portal, …) opt out
1198
+ // via `compat.supportsMultipleSystemMessages = false`; in that mode we
1199
+ // coalesce into a single message joined by `\n\n`.
1200
+ if (compat.supportsMultipleSystemMessages) {
1201
+ for (const systemPrompt of systemPrompts) {
1202
+ params.push({ role, content: systemPrompt });
1203
+ }
1204
+ } else {
1205
+ params.push({ role, content: systemPrompts.join("\n\n") });
1196
1206
  }
1197
1207
  }
1198
1208
 
@@ -430,13 +430,16 @@ function buildParams(
430
430
  // See: https://github.com/can1357/oh-my-pi/issues/41
431
431
  params.include = ["reasoning.encrypted_content"];
432
432
 
433
- if (options?.reasoning || options?.reasoningSummary) {
434
- params.reasoning = {
433
+ if (options?.reasoning || options?.reasoningSummary !== undefined) {
434
+ const reasoningParams: NonNullable<typeof params.reasoning> = {
435
435
  effort: mapReasoningEffort(options?.reasoning || "medium", model.compat?.reasoningEffortMap) as NonNullable<
436
436
  OpenAIResponsesSamplingParams["reasoning"]
437
437
  >["effort"],
438
- summary: options?.reasoningSummary || "auto",
439
438
  };
439
+ if (options?.reasoningSummary !== null) {
440
+ reasoningParams.summary = options?.reasoningSummary || "auto";
441
+ }
442
+ params.reasoning = reasoningParams;
440
443
  } else if (model.name.startsWith("gpt-5")) {
441
444
  // Jesus Christ, see https://community.openai.com/t/need-reasoning-false-option-for-gpt-5/1351588/7
442
445
  messages.push({
package/src/stream.ts CHANGED
@@ -462,6 +462,7 @@ function mapOptionsForApi<TApi extends Api>(
462
462
  ...base,
463
463
  thinkingEnabled: false,
464
464
  toolChoice: mapAnthropicToolChoice(options?.toolChoice),
465
+ thinkingDisplay: options?.hideThinkingSummary ? "omitted" : undefined,
465
466
  });
466
467
  }
467
468
 
@@ -471,6 +472,7 @@ function mapOptionsForApi<TApi extends Api>(
471
472
  ...base,
472
473
  thinkingEnabled: false,
473
474
  toolChoice: mapAnthropicToolChoice(options?.toolChoice),
475
+ thinkingDisplay: options?.hideThinkingSummary ? "omitted" : undefined,
474
476
  });
475
477
  }
476
478
 
@@ -483,6 +485,7 @@ function mapOptionsForApi<TApi extends Api>(
483
485
  thinkingEnabled: true,
484
486
  effort,
485
487
  toolChoice: mapAnthropicToolChoice(options?.toolChoice),
488
+ thinkingDisplay: options?.hideThinkingSummary ? "omitted" : undefined,
486
489
  });
487
490
  }
488
491
 
@@ -492,6 +495,7 @@ function mapOptionsForApi<TApi extends Api>(
492
495
  thinkingEnabled: true,
493
496
  thinkingBudgetTokens: thinkingBudget,
494
497
  toolChoice: mapAnthropicToolChoice(options?.toolChoice),
498
+ thinkingDisplay: options?.hideThinkingSummary ? "omitted" : undefined,
495
499
  });
496
500
  }
497
501
 
@@ -509,6 +513,7 @@ function mapOptionsForApi<TApi extends Api>(
509
513
  ...base,
510
514
  thinkingEnabled: false,
511
515
  toolChoice: mapAnthropicToolChoice(options?.toolChoice),
516
+ thinkingDisplay: options?.hideThinkingSummary ? "omitted" : undefined,
512
517
  });
513
518
  } else {
514
519
  return castApi<"anthropic-messages">({
@@ -517,6 +522,7 @@ function mapOptionsForApi<TApi extends Api>(
517
522
  thinkingEnabled: true,
518
523
  thinkingBudgetTokens: thinkingBudget,
519
524
  toolChoice: mapAnthropicToolChoice(options?.toolChoice),
525
+ thinkingDisplay: options?.hideThinkingSummary ? "omitted" : undefined,
520
526
  });
521
527
  }
522
528
  }
@@ -564,6 +570,7 @@ function mapOptionsForApi<TApi extends Api>(
564
570
  reasoning: resolveOpenAiReasoningEffort(model, options),
565
571
  toolChoice: mapOpenAiToolChoice(options?.toolChoice),
566
572
  serviceTier: options?.serviceTier,
573
+ reasoningSummary: options?.hideThinkingSummary ? null : undefined,
567
574
  });
568
575
 
569
576
  case "azure-openai-responses":
@@ -572,6 +579,7 @@ function mapOptionsForApi<TApi extends Api>(
572
579
  reasoning: resolveOpenAiReasoningEffort(model, options),
573
580
  toolChoice: mapOpenAiToolChoice(options?.toolChoice),
574
581
  serviceTier: options?.serviceTier,
582
+ reasoningSummary: options?.hideThinkingSummary ? null : undefined,
575
583
  });
576
584
 
577
585
  case "openai-codex-responses":
@@ -581,6 +589,7 @@ function mapOptionsForApi<TApi extends Api>(
581
589
  toolChoice: mapOpenAiToolChoice(options?.toolChoice),
582
590
  serviceTier: options?.serviceTier,
583
591
  preferWebsockets: options?.preferWebsockets,
592
+ reasoningSummary: options?.hideThinkingSummary ? null : undefined,
584
593
  });
585
594
 
586
595
  case "google-generative-ai": {
package/src/types.ts CHANGED
@@ -255,6 +255,14 @@ export interface SimpleStreamOptions extends StreamOptions {
255
255
  * this way when `reasoning` is undefined.
256
256
  */
257
257
  disableReasoning?: boolean;
258
+ /**
259
+ * If true, request that the provider omit thinking/reasoning summaries
260
+ * from the response (e.g. Anthropic `thinking.display = "omitted"`,
261
+ * OpenAI Responses `reasoning.summary` left unset). The model still
262
+ * reasons internally; only the human-readable summary stream is dropped.
263
+ * Useful when the UI hides thinking blocks anyway and the summary is wasted bandwidth.
264
+ */
265
+ hideThinkingSummary?: boolean;
258
266
  /** Custom token budgets for thinking levels (token-based providers only) */
259
267
  thinkingBudgets?: ThinkingBudgets;
260
268
  /** Cursor exec handlers for local tool execution */
@@ -540,6 +548,19 @@ export interface OpenAICompat {
540
548
  supportsStore?: boolean;
541
549
  /** Whether the provider supports the `developer` role (vs `system`). Default: auto-detected from URL. */
542
550
  supportsDeveloperRole?: boolean;
551
+ /**
552
+ * Whether the provider's chat-completions endpoint accepts multiple
553
+ * leading `system`/`developer` messages. When false, ordered system
554
+ * prompts are coalesced into a single message joined by `\n\n` so
555
+ * strict chat templates (e.g. Qwen-served via vLLM, MiniMax) accept
556
+ * the request. Default: detected per provider/baseUrl. Canonical
557
+ * OpenAI/Azure/OpenRouter/Cerebras/Together/Fireworks/Groq/DeepSeek/
558
+ * Mistral/xAI/Z.ai/GitHub Copilot/Zenmux are treated as `true`;
559
+ * unknown or strict-template hosts default to `false`. Setting this
560
+ * to `true` preserves separate blocks, which is preferred for
561
+ * KV-cache reuse when the trailing prompt changes between calls.
562
+ */
563
+ supportsMultipleSystemMessages?: boolean;
543
564
  /** Whether the provider supports `reasoning_effort`. Default: auto-detected from URL. */
544
565
  supportsReasoningEffort?: boolean;
545
566
  /** Optional mapping from pi-ai reasoning levels to provider/model-specific `reasoning_effort` values. */