@oh-my-pi/pi-ai 12.4.0 → 12.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@oh-my-pi/pi-ai",
3
- "version": "12.4.0",
3
+ "version": "12.5.0",
4
4
  "description": "Unified LLM API with automatic model discovery and provider configuration",
5
5
  "type": "module",
6
6
  "main": "./src/index.ts",
@@ -57,19 +57,19 @@
57
57
  },
58
58
  "dependencies": {
59
59
  "@anthropic-ai/sdk": "^0.74.0",
60
- "@aws-sdk/client-bedrock-runtime": "^3.982.0",
60
+ "@aws-sdk/client-bedrock-runtime": "^3.990.0",
61
61
  "@bufbuild/protobuf": "^2.11.0",
62
62
  "@connectrpc/connect": "^2.1.1",
63
63
  "@connectrpc/connect-node": "^2.1.1",
64
- "@google/genai": "^1.39.0",
65
- "@mistralai/mistralai": "^1.13.0",
66
- "@oh-my-pi/pi-utils": "12.4.0",
64
+ "@google/genai": "^1.41.0",
65
+ "@mistralai/mistralai": "^1.14.0",
66
+ "@oh-my-pi/pi-utils": "12.5.0",
67
67
  "@sinclair/typebox": "^0.34.48",
68
- "@smithy/node-http-handler": "^4.4.9",
69
- "ajv": "^8.17.1",
68
+ "@smithy/node-http-handler": "^4.4.10",
69
+ "ajv": "^8.18.0",
70
70
  "ajv-formats": "^3.0.1",
71
71
  "chalk": "^5.6.2",
72
- "openai": "^6.17.0",
72
+ "openai": "^6.22.0",
73
73
  "partial-json": "^0.1.7",
74
74
  "zod-to-json-schema": "^3.25.1"
75
75
  },
@@ -83,6 +83,44 @@ export interface OpenAICompletionsOptions extends StreamOptions {
83
83
  reasoningEffort?: "minimal" | "low" | "medium" | "high" | "xhigh";
84
84
  }
85
85
 
86
+ // LIMITATION: The think tag parser uses naive string matching for <think>/<thinking> tags.
87
+ // If MiniMax models output these literal strings in code blocks, XML examples, or explanations,
88
+ // they will be incorrectly consumed as thinking delimiters, truncating visible output.
89
+ // A streaming parser with arbitrary chunk boundaries cannot reliably detect code block context.
90
+ // This is acceptable because: (1) only enabled for minimax-code providers, (2) MiniMax models
91
+ // use these tags as their actual thinking format, and (3) false positives are rare in practice.
92
/** Literal strings that switch the MiniMax tag parser from visible text into thinking content. */
const MINIMAX_THINK_OPEN_TAGS = ["<think>", "<thinking>"] as const;

/** Literal strings that switch the MiniMax tag parser from thinking content back to visible text. */
const MINIMAX_THINK_CLOSE_TAGS = ["</think>", "</thinking>"] as const;
94
+
95
/**
 * Locates the earliest occurrence of any of `tags` inside `text`.
 *
 * Empty tags are ignored: `indexOf("")` always reports index 0, which would
 * shadow every real tag and hand the caller a zero-length match.
 * When two tags start at the same index, the one listed first in `tags` wins.
 *
 * @param text - The text to scan.
 * @param tags - Candidate tag literals to search for.
 * @returns The matched tag and its start index, or `undefined` when no tag occurs in `text`.
 */
function findFirstTag(text: string, tags: readonly string[]): { index: number; tag: string } | undefined {
  let earliest: { index: number; tag: string } | undefined;
  for (const tag of tags) {
    if (tag.length === 0) continue;
    const index = text.indexOf(tag);
    if (index === -1) continue;
    // Strict comparison keeps the first-listed tag on ties.
    if (earliest === undefined || index < earliest.index) {
      earliest = { index, tag };
    }
  }
  return earliest;
}
108
+
109
/**
 * Returns the longest suffix of `text` that is a proper prefix of one of `tags`,
 * i.e. the start of a tag that has not been fully received yet.
 *
 * @param text - The text whose tail is inspected.
 * @param tags - Tag literals that might be partially present at the end of `text`.
 * @returns The trailing partial tag, or `""` when the tail matches no tag prefix.
 */
function getTrailingPartialTag(text: string, tags: readonly string[]): string {
  // Longest proper prefix of `tag` (never the whole tag) that `text` ends with; <= 0 means none.
  const longestPrefixAtEnd = (tag: string): number => {
    let size = Math.min(tag.length - 1, text.length);
    while (size > 0 && !text.endsWith(tag.slice(0, size))) {
      size--;
    }
    return size;
  };

  const best = tags.reduce((acc, tag) => Math.max(acc, longestPrefixAtEnd(tag)), 0);
  return best > 0 ? text.slice(text.length - best) : "";
}
123
+
86
124
  export const streamOpenAICompletions: StreamFunction<"openai-completions"> = (
87
125
  model: Model<"openai-completions">,
88
126
  context: Context,
@@ -152,6 +190,93 @@ export const streamOpenAICompletions: StreamFunction<"openai-completions"> = (
152
190
  }
153
191
  };
154
192
 
193
+ const parseMiniMaxThinkTags = model.provider === "minimax-code" || model.provider === "minimax-code-cn";
194
+ let taggedTextBuffer = "";
195
+ let insideTaggedThinking = false;
196
+
197
// Appends `delta` to the current text block and emits a `text_delta` event.
// If the current block is missing or is not a text block, it is finished first and a new
// text block is opened (announced with `text_start`). Empty deltas are ignored.
const appendTextDelta = (delta: string) => {
  if (delta === "") return;

  if (!currentBlock || currentBlock.type !== "text") {
    finishCurrentBlock(currentBlock);
    currentBlock = { type: "text", text: "" };
    output.content.push(currentBlock);
    const startIndex = blockIndex();
    stream.push({ type: "text_start", contentIndex: startIndex, partial: output });
  }

  // Always a text block at this point; the check is presumably kept for type narrowing.
  if (currentBlock.type !== "text") return;

  currentBlock.text += delta;
  const contentIndex = blockIndex();
  stream.push({ type: "text_delta", contentIndex, delta, partial: output });
};
215
+
216
// Appends `delta` to the current thinking block and emits a `thinking_delta` event.
//
// A new thinking block is opened (after finishing the current one, and announced with
// `thinking_start`) when any of these holds:
//   - there is no current block,
//   - the current block is not a thinking block,
//   - a `signature` is supplied and it differs from the current block's `thinkingSignature`.
// Consequently a delta without a signature continues whatever thinking block is open, while a
// signed delta never merges into a block that has a different signature or none at all.
//
// The former "adopt the signature if the block has none" step was removed: the condition above
// already opens a fresh block, created with that signature, in every case where it could have
// changed anything.
//
// Empty deltas are ignored.
const appendThinkingDelta = (delta: string, signature?: string) => {
  if (delta.length === 0) return;

  if (
    !currentBlock ||
    currentBlock.type !== "thinking" ||
    (signature !== undefined && currentBlock.thinkingSignature !== signature)
  ) {
    finishCurrentBlock(currentBlock);
    currentBlock = {
      type: "thinking",
      thinking: "",
      thinkingSignature: signature,
    };
    output.content.push(currentBlock);
    stream.push({ type: "thinking_start", contentIndex: blockIndex(), partial: output });
  }

  // Always a thinking block at this point; the check is presumably kept for type narrowing.
  if (currentBlock.type === "thinking") {
    currentBlock.thinking += delta;
    stream.push({
      type: "thinking_delta",
      contentIndex: blockIndex(),
      delta,
      partial: output,
    });
  }
};
245
+
246
// Drains `taggedTextBuffer`, routing its content to text or thinking output.
//
// While outside a thinking section the buffer is scanned for an opening tag, and while inside
// one for a closing tag. Content before a complete tag is emitted in the current mode, the tag
// itself is dropped, and the mode flips. When no complete tag remains, everything is emitted
// except a trailing fragment that could be the start of a tag; that fragment stays in the
// buffer until more input arrives.
const flushTaggedTextBuffer = () => {
  while (taggedTextBuffer.length > 0) {
    // The tag set that ends the current mode, and the emitter for content in the current mode.
    const boundaryTags = insideTaggedThinking ? MINIMAX_THINK_CLOSE_TAGS : MINIMAX_THINK_OPEN_TAGS;
    const emit = insideTaggedThinking ? appendThinkingDelta : appendTextDelta;

    const boundary = findFirstTag(taggedTextBuffer, boundaryTags);
    if (boundary) {
      emit(taggedTextBuffer.slice(0, boundary.index));
      taggedTextBuffer = taggedTextBuffer.slice(boundary.index + boundary.tag.length);
      insideTaggedThinking = !insideTaggedThinking;
      continue;
    }

    // No complete tag: hold back only a possible partial tag at the very end.
    const heldBack = getTrailingPartialTag(taggedTextBuffer, boundaryTags);
    emit(taggedTextBuffer.slice(0, taggedTextBuffer.length - heldBack.length));
    taggedTextBuffer = heldBack;
    break;
  }
};
279
+
155
280
  for await (const chunk of openaiStream) {
156
281
  if (chunk.usage) {
157
282
  // Check for cached_tokens at root level (Kimi) or in prompt_tokens_details (OpenAI)
@@ -196,21 +321,11 @@ export const streamOpenAICompletions: StreamFunction<"openai-completions"> = (
196
321
  choice.delta.content.length > 0
197
322
  ) {
198
323
  if (!firstTokenTime) firstTokenTime = Date.now();
199
- if (!currentBlock || currentBlock.type !== "text") {
200
- finishCurrentBlock(currentBlock);
201
- currentBlock = { type: "text", text: "" };
202
- output.content.push(currentBlock);
203
- stream.push({ type: "text_start", contentIndex: blockIndex(), partial: output });
204
- }
205
-
206
- if (currentBlock.type === "text") {
207
- currentBlock.text += choice.delta.content;
208
- stream.push({
209
- type: "text_delta",
210
- contentIndex: blockIndex(),
211
- delta: choice.delta.content,
212
- partial: output,
213
- });
324
+ if (parseMiniMaxThinkTags) {
325
+ taggedTextBuffer += choice.delta.content;
326
+ flushTaggedTextBuffer();
327
+ } else {
328
+ appendTextDelta(choice.delta.content);
214
329
  }
215
330
  }
216
331
 
@@ -234,27 +349,8 @@ export const streamOpenAICompletions: StreamFunction<"openai-completions"> = (
234
349
  }
235
350
 
236
351
  if (foundReasoningField) {
237
- if (!currentBlock || currentBlock.type !== "thinking") {
238
- finishCurrentBlock(currentBlock);
239
- currentBlock = {
240
- type: "thinking",
241
- thinking: "",
242
- thinkingSignature: foundReasoningField,
243
- };
244
- output.content.push(currentBlock);
245
- stream.push({ type: "thinking_start", contentIndex: blockIndex(), partial: output });
246
- }
247
-
248
- if (currentBlock.type === "thinking") {
249
- const delta = (choice.delta as any)[foundReasoningField];
250
- currentBlock.thinking += delta;
251
- stream.push({
252
- type: "thinking_delta",
253
- contentIndex: blockIndex(),
254
- delta,
255
- partial: output,
256
- });
257
- }
352
+ const delta = (choice.delta as any)[foundReasoningField];
353
+ appendThinkingDelta(delta, foundReasoningField);
258
354
  }
259
355
 
260
356
  if (choice?.delta?.tool_calls) {
@@ -311,6 +407,15 @@ export const streamOpenAICompletions: StreamFunction<"openai-completions"> = (
311
407
  }
312
408
  }
313
409
 
410
+ if (parseMiniMaxThinkTags && taggedTextBuffer.length > 0) {
411
+ if (insideTaggedThinking) {
412
+ appendThinkingDelta(taggedTextBuffer);
413
+ } else {
414
+ appendTextDelta(taggedTextBuffer);
415
+ }
416
+ taggedTextBuffer = "";
417
+ }
418
+
314
419
  finishCurrentBlock(currentBlock);
315
420
 
316
421
  if (options?.signal?.aborted) {
@@ -35,6 +35,10 @@ const OVERFLOW_PATTERNS = [
35
35
  /maximum context length is \d+ tokens/i, // OpenRouter (all backends)
36
36
  /exceeds the limit of \d+/i, // GitHub Copilot
37
37
  /exceeds the available context size/i, // llama.cpp server
38
+ /requested tokens?.*exceed.*context (window|length|size)/i, // llama.cpp / OpenAI-compatible local servers
39
+ /context (window|length|size).*(exceeded|overflow|too small)/i, // Generic local server variants
40
+ /(prompt|input).*(too long|too large).*(context|n_ctx)/i, // llama.cpp phrasing variants
41
+ /requested tokens?.*(exceeds?|greater than).*(n_ctx|context)/i, // llama.cpp n_ctx variants
38
42
  /greater than the context length/i, // LM Studio
39
43
  /context window exceeds limit/i, // MiniMax
40
44
  /exceeded model token limit/i, // Kimi For Coding
@@ -105,8 +109,8 @@ export function isContextOverflow(message: AssistantMessage, contextWindow?: num
105
109
  }
106
110
  }
107
111
 
108
- // Case 2: Silent overflow (z.ai style) - successful but usage exceeds context
109
- if (contextWindow && message.stopReason === "stop") {
112
+ // Case 2: Usage-based overflow (silent or provider-specific)
113
+ if (contextWindow) {
110
114
  const inputTokens = message.usage.input + message.usage.cacheRead;
111
115
  if (inputTokens > contextWindow) {
112
116
  return true;