@oh-my-pi/pi-ai 8.13.0 → 9.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
 {
   "name": "@oh-my-pi/pi-ai",
-  "version": "8.13.0",
+  "version": "9.1.0",
   "description": "Unified LLM API with automatic model discovery and provider configuration",
   "type": "module",
   "main": "./src/index.ts",
@@ -63,7 +63,7 @@
     "@connectrpc/connect-node": "^2.1.1",
     "@google/genai": "^1.38.0",
     "@mistralai/mistralai": "^1.13.0",
-    "@oh-my-pi/pi-utils": "8.13.0",
+    "@oh-my-pi/pi-utils": "9.1.0",
     "@sinclair/typebox": "^0.34.48",
     "@smithy/node-http-handler": "^4.4.8",
     "ajv": "^8.17.1",
@@ -914,7 +914,7 @@ function convertTools(tools: Tool[], isOAuthToken: boolean): Anthropic.Messages.
 	});
 }
 
-function mapStopReason(reason: Anthropic.Messages.StopReason): StopReason {
+function mapStopReason(reason: Anthropic.Messages.StopReason | string): StopReason {
 	switch (reason) {
 		case "end_turn":
 			return "stop";
@@ -928,9 +928,10 @@ function mapStopReason(reason: Anthropic.Messages.StopReason): StopReason {
 			return "stop";
 		case "stop_sequence":
 			return "stop"; // We don't supply stop sequences, so this should never happen
-		default: {
-			const _exhaustive: never = reason;
-			throw new Error(`Unhandled stop reason: ${_exhaustive}`);
-		}
+		case "sensitive": // Content flagged by safety filters (not yet in SDK types)
+			return "error";
+		default:
+			// Handle unknown stop reasons gracefully (API may add new values)
+			throw new Error(`Unhandled stop reason: ${reason}`);
 	}
 }
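
For reference, a minimal sketch of the behavior after this hunk; the literal argument values below are illustrative, not taken from the SDK:

	mapStopReason("end_turn");      // "stop"
	mapStopReason("stop_sequence"); // "stop"
	mapStopReason("sensitive");     // "error" (new case for safety-filtered content)
	mapStopReason("made_up_value"); // throws: Unhandled stop reason: made_up_value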
@@ -70,7 +70,7 @@ const GEMINI_CLI_HEADERS = {
 
 // Headers for Antigravity (sandbox endpoint) - requires specific User-Agent
 const ANTIGRAVITY_HEADERS = {
-	"User-Agent": "antigravity/1.11.5 darwin/arm64",
+	"User-Agent": "antigravity/1.15.8 darwin/arm64",
 	"X-Goog-Api-Client": "google-cloud-sdk vscode_cloudshelleditor/0.1",
 	"Client-Metadata": JSON.stringify({
 		ideType: "IDE_UNSPECIFIED",
@@ -789,6 +789,7 @@ function detectCompat(model: Model<"openai-completions">): ResolvedOpenAICompat
 		provider === "mistral" ||
 		baseUrl.includes("mistral.ai") ||
 		baseUrl.includes("chutes.ai") ||
+		baseUrl.includes("deepseek.com") ||
 		isZai ||
 		provider === "opencode" ||
 		baseUrl.includes("opencode.ai");
@@ -33,6 +33,22 @@ import { sanitizeSurrogates } from "../utils/sanitize-unicode";
 import { mapToOpenAIResponsesToolChoice } from "../utils/tool-choice";
 import { transformMessages } from "./transform-messages";
 
+/**
+ * Get prompt cache retention based on PI_CACHE_RETENTION env var.
+ * Only applies to direct OpenAI API calls (api.openai.com).
+ * Returns '24h' for long retention, undefined for default (in-memory).
+ */
+function getPromptCacheRetention(baseUrl: string): "24h" | undefined {
+	if (
+		typeof process !== "undefined" &&
+		process.env.PI_CACHE_RETENTION === "long" &&
+		baseUrl.includes("api.openai.com")
+	) {
+		return "24h";
+	}
+	return undefined;
+}
+
 // OpenAI Responses-specific options
 export interface OpenAIResponsesOptions extends StreamOptions {
 	reasoningEffort?: "minimal" | "low" | "medium" | "high" | "xhigh";
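
A rough usage sketch of the new helper; the base URLs below are illustrative:

	process.env.PI_CACHE_RETENTION = "long";
	getPromptCacheRetention("https://api.openai.com/v1");   // "24h"
	getPromptCacheRetention("https://my-proxy.example/v1"); // undefined (default in-memory cache)

	delete process.env.PI_CACHE_RETENTION;
	getPromptCacheRetention("https://api.openai.com/v1");   // undefined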
@@ -395,6 +411,7 @@ function buildParams(model: Model<"openai-responses">, context: Context, options
 		input: messages,
 		stream: true,
 		prompt_cache_key: options?.sessionId,
+		prompt_cache_retention: getPromptCacheRetention(model.baseUrl),
 	};
 
 	if (options?.maxTokens) {
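
Because an undefined property is dropped during JSON serialization, the field only reaches the wire for direct api.openai.com calls with PI_CACHE_RETENTION=long; a hypothetical request body would then look roughly like:

	// { "input": [...], "stream": true, "prompt_cache_key": "<sessionId>", "prompt_cache_retention": "24h" }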
@@ -6,13 +6,33 @@ import type { Api, AssistantMessage, Message, Model, ToolCall, ToolResultMessage
  * Anthropic APIs require IDs matching ^[a-zA-Z0-9_-]+$ (max 64 chars).
  */
 function normalizeToolCallId(id: string): string {
+	// Handle pipe-separated IDs from OpenAI Responses API
+	// Format: {call_id}|{item_id} where {item_id} can be 400+ chars with special chars (+, /, =)
+	// Extract just the call_id part and normalize it
+	if (id.includes("|")) {
+		const [callId] = id.split("|");
+		// Sanitize to allowed chars and truncate to 40 chars (OpenAI limit)
+		return callId.replace(/[^a-zA-Z0-9_-]/g, "_").slice(0, 40);
+	}
 	return id.replace(/[^a-zA-Z0-9_-]/g, "").slice(0, 40);
 }
 
 function normalizeResponsesToolCallId(id: string): string {
 	const [callId, itemId] = id.split("|");
 	if (callId && itemId) {
-		return id;
+		// Sanitize invalid characters and ensure proper format
+		const sanitizedCallId = callId.replace(/[^a-zA-Z0-9_-]/g, "_");
+		let sanitizedItemId = itemId.replace(/[^a-zA-Z0-9_-]/g, "_");
+		// OpenAI Responses API requires item id to start with "fc"
+		if (!sanitizedItemId.startsWith("fc")) {
+			sanitizedItemId = `fc_${sanitizedItemId}`;
+		}
+		// Truncate to 64 chars and strip trailing underscores (OpenAI Codex rejects them)
+		let normalizedCallId = sanitizedCallId.length > 64 ? sanitizedCallId.slice(0, 64) : sanitizedCallId;
+		let normalizedItemId = sanitizedItemId.length > 64 ? sanitizedItemId.slice(0, 64) : sanitizedItemId;
+		normalizedCallId = normalizedCallId.replace(/_+$/, "");
+		normalizedItemId = normalizedItemId.replace(/_+$/, "");
+		return `${normalizedCallId}|${normalizedItemId}`;
 	}
 	const hash = Bun.hash.xxHash64(id).toString(36);
 	return `call_${hash}|item_${hash}`;
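
A worked example of the new normalization; the pipe-separated IDs below are made up to show the sanitization rules:

	normalizeToolCallId("call_abc+123|fc_item/xyz=");
	// -> "call_abc_123"               (call_id part only, sanitized, truncated to 40 chars)

	normalizeResponsesToolCallId("call_abc+123|fc_item/xyz=");
	// -> "call_abc_123|fc_item_xyz"   (both parts sanitized, trailing "_" from "=" stripped)

	normalizeResponsesToolCallId("call_abc|item_123");
	// -> "call_abc|fc_item_123"       (item id gains the required "fc" prefix)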
@@ -17,13 +17,16 @@ import type { AssistantMessage } from "../types";
  * - llama.cpp: "the request exceeds the available context size, try increasing it"
  * - LM Studio: "tokens to keep from the initial prompt is greater than the context length"
  * - GitHub Copilot: "prompt token count of X exceeds the limit of Y"
- * - Cerebras: Returns "400 status code (no body)" - handled separately below
- * - Mistral: Returns "400 status code (no body)" - handled separately below
+ * - MiniMax: "invalid params, context window exceeds limit"
+ * - Kimi For Coding: "Your request exceeded model token limit: X (requested: Y)"
+ * - Cerebras: Returns "400/413 status code (no body)" - handled separately below
+ * - Mistral: Returns "400/413 status code (no body)" - handled separately below
  * - z.ai: Does NOT error, accepts overflow silently - handled via usage.input > contextWindow
  * - Ollama: Silently truncates input - not detectable via error message
  */
 const OVERFLOW_PATTERNS = [
 	/prompt is too long/i, // Anthropic
+	/input is too long for requested model/i, // Amazon Bedrock
 	/exceeds the context window/i, // OpenAI (Completions & Responses API)
 	/input token count.*exceeds the maximum/i, // Google (Gemini)
 	/maximum prompt length is \d+/i, // xAI (Grok)
@@ -32,6 +35,8 @@ const OVERFLOW_PATTERNS = [
 	/exceeds the limit of \d+/i, // GitHub Copilot
 	/exceeds the available context size/i, // llama.cpp server
 	/greater than the context length/i, // LM Studio
+	/context window exceeds limit/i, // MiniMax
+	/exceeded model token limit/i, // Kimi For Coding
 	/context[_ ]length[_ ]exceeded/i, // Generic fallback
 	/too many tokens/i, // Generic fallback
 	/token limit exceeded/i, // Generic fallback
@@ -54,11 +59,12 @@ const OVERFLOW_PATTERNS = [
  * - Google Gemini: "input token count exceeds the maximum"
  * - xAI (Grok): "maximum prompt length is X but request contains Y"
  * - Groq: "reduce the length of the messages"
- * - Cerebras: 400/413/429 status code (no body)
- * - Mistral: 400/413/429 status code (no body)
+ * - Cerebras: 400/413 status code (no body)
+ * - Mistral: 400/413 status code (no body)
  * - OpenRouter (all backends): "maximum context length is X tokens"
  * - llama.cpp: "exceeds the available context size"
  * - LM Studio: "greater than the context length"
+ * - Kimi For Coding: "exceeded model token limit: X (requested: Y)"
  *
  * **Unreliable detection:**
  * - z.ai: Sometimes accepts overflow silently (detectable via usage.input > contextWindow),
@@ -89,9 +95,9 @@ export function isContextOverflow(message: AssistantMessage, contextWindow?: num
 			return true;
 		}
 
-		// Cerebras and Mistral return 400/413/429 with no body - check for status code pattern
-		// 429 can indicate token-based rate limiting which correlates with context overflow
-		if (/^4(00|13|29)\s*(status code)?\s*\(no body\)/i.test(message.errorMessage)) {
+		// Cerebras and Mistral return 400/413 with no body for context overflow
+		// Note: 429 is rate limiting (requests/tokens per time), NOT context overflow
+		if (/^4(00|13)\s*(status code)?\s*\(no body\)/i.test(message.errorMessage)) {
 			return true;
 		}
 	}
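
A minimal sketch of how these checks are exercised, assuming isContextOverflow ultimately tests message.errorMessage against OVERFLOW_PATTERNS and the status-code regex; the error strings below are illustrative:

	const overflowMessages = [
		"Your request exceeded model token limit: 262144 (requested: 300000)", // Kimi For Coding
		"invalid params, context window exceeds limit",                        // MiniMax
		"input is too long for requested model",                               // Amazon Bedrock
		"400 status code (no body)",                                           // Cerebras / Mistral
	];
	overflowMessages.every((m) =>
		OVERFLOW_PATTERNS.some((p) => p.test(m)) ||
		/^4(00|13)\s*(status code)?\s*\(no body\)/i.test(m),
	); // true

	// 429 with no body is now treated as rate limiting, not context overflow
	/^4(00|13)\s*(status code)?\s*\(no body\)/i.test("429 status code (no body)"); // false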