@oh-my-pi/pi-ai 14.7.1 → 14.7.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md CHANGED
@@ -2,6 +2,14 @@
 
  ## [Unreleased]
 
+ ## [14.7.2] - 2026-05-06
+
+ ### Fixed
+
+ - Fixed VLLM model discovery to use `max_model_len` as the context window when the endpoint reports it.
+ - Fixed custom Ollama Cloud/local-proxy model aliases (for example `deepseek-v4-pro:cloud`) to inherit bundled cache-pricing metadata when the upstream model is known ([#937](https://github.com/can1357/oh-my-pi/issues/937)).
+ - Fixed local Ollama model discovery to apply `/api/show` thinking and vision capabilities in addition to native context windows ([#928](https://github.com/can1357/oh-my-pi/issues/928)).
+
  ## [14.7.0] - 2026-05-04
  ### Breaking Changes
 
package/package.json CHANGED
@@ -1,7 +1,7 @@
  {
  "type": "module",
  "name": "@oh-my-pi/pi-ai",
- "version": "14.7.1",
+ "version": "14.7.3",
  "description": "Unified LLM API with automatic model discovery and provider configuration",
  "homepage": "https://github.com/can1357/oh-my-pi",
  "author": "Can Boluk",
@@ -46,8 +46,8 @@
  "@aws-sdk/credential-provider-node": "^3.972.39",
  "@bufbuild/protobuf": "^2.12.0",
  "@google/genai": "^1.52.0",
- "@oh-my-pi/pi-natives": "14.7.1",
- "@oh-my-pi/pi-utils": "14.7.1",
+ "@oh-my-pi/pi-natives": "14.7.3",
+ "@oh-my-pi/pi-utils": "14.7.3",
  "@sinclair/typebox": "^0.34.49",
  "@smithy/node-http-handler": "^4.6.1",
  "ajv": "^8.20.0",
@@ -1,6 +1,7 @@
  import type { ModelManagerOptions } from "../model-manager";
+ import { Effort } from "../model-thinking";
  import { getBundledModels } from "../models";
- import type { Api, Model } from "../types";
+ import type { Api, Model, ThinkingConfig } from "../types";
  import { isAnthropicOAuthToken, isRecord, toBoolean, toNumber, toPositiveNumber } from "../utils";
  import {
  fetchOpenAICompatibleModels,
@@ -192,7 +193,7 @@ function toOllamaNativeBaseUrl(baseUrl: string): string {
 
  async function fetchOllamaNativeModels(
  baseUrl: string,
- resolveLimits: (modelId: string) => Promise<OllamaModelLimits>,
+ resolveMetadata: (modelId: string) => Promise<OllamaResolvedMetadata>,
  ): Promise<Model<"openai-responses">[] | null> {
  const nativeBaseUrl = toOllamaNativeBaseUrl(baseUrl);
  let response: Response;
@@ -213,18 +214,19 @@ async function fetchOllamaNativeModels(
  entries.map(async (entry): Promise<Model<"openai-responses"> | null> => {
  const id = entry.model ?? entry.name;
  if (!id) return null;
- const { contextWindow, maxTokens } = await resolveLimits(id);
+ const metadata = await resolveMetadata(id);
  return {
  id,
  name: entry.name ?? id,
  api: "openai-responses",
  provider: "ollama",
  baseUrl,
- reasoning: false,
- input: ["text"],
+ reasoning: metadata.reasoning ?? false,
+ thinking: metadata.thinking,
+ input: metadata.input ?? ["text"],
  cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0 },
- contextWindow,
- maxTokens,
+ contextWindow: metadata.contextWindow,
+ maxTokens: metadata.maxTokens,
  };
  }),
  );
@@ -241,18 +243,65 @@ const OLLAMA_FALLBACK_CONTEXT_WINDOW = 128_000;
  /** Cap max output tokens at a value that matches OMP's other openai-responses defaults. */
  const OLLAMA_DEFAULT_MAX_TOKENS = 8192;
 
- interface OllamaModelLimits {
+ interface OllamaResolvedMetadata {
  contextWindow: number;
  maxTokens: number;
+ capabilities?: string[];
+ reasoning?: boolean;
+ thinking?: ThinkingConfig;
+ input?: ("text" | "image")[];
+ }
+
+ interface OllamaShowMetadata {
+ contextWindow?: number;
+ maxTokens?: number;
+ capabilities?: string[];
+ reasoning?: boolean;
+ thinking?: ThinkingConfig;
+ input?: ("text" | "image")[];
+ }
+
+ function getOllamaContextWindow(modelInfo: Record<string, unknown> | undefined): number | undefined {
+ if (!modelInfo) {
+ return undefined;
+ }
+ for (const [key, value] of Object.entries(modelInfo)) {
+ if (typeof value !== "number" || value <= 0) {
+ continue;
+ }
+ if (key.endsWith(".context_length") || key.endsWith(".num_ctx") || key.endsWith(".context_window")) {
+ return value;
+ }
+ }
+ }
+
+ function getOllamaCapabilities(value: unknown): string[] | undefined {
+ if (!Array.isArray(value)) {
+ return undefined;
+ }
+ return value.filter((item): item is string => typeof item === "string");
+ }
+
+ function getOllamaThinkingConfig(capabilities: string[] | undefined): ThinkingConfig | undefined {
+ if (!capabilities?.includes("thinking")) {
+ return undefined;
+ }
+ return {
+ mode: "effort",
+ minLevel: Effort.Minimal,
+ maxLevel: Effort.High,
+ };
  }
 
  /**
- * Query Ollama's `/api/show` endpoint for a single model and pull its native
- * context length out of `model_info.<arch>.context_length`. Returns the
- * discovered limits, or `undefined` when the endpoint or field is
- * unavailable so callers can layer their own fallback.
+ * Query Ollama's `/api/show` endpoint for a single model and pull native
+ * context and capability metadata from the response. Returns `undefined` when
+ * the endpoint is unavailable so callers can layer their own fallback.
  */
- async function fetchOllamaShowLimits(nativeBaseUrl: string, modelId: string): Promise<OllamaModelLimits | undefined> {
+ async function fetchOllamaShowMetadata(
+ nativeBaseUrl: string,
+ modelId: string,
+ ): Promise<OllamaShowMetadata | undefined> {
  try {
  const response = await fetch(`${nativeBaseUrl}/api/show`, {
  method: "POST",
@@ -262,13 +311,21 @@ async function fetchOllamaShowLimits(nativeBaseUrl: string, modelId: string): Pr
  if (!response.ok) {
  return undefined;
  }
- const payload = (await response.json()) as { model_info?: Record<string, unknown> };
- const info = payload.model_info ?? {};
- for (const [key, value] of Object.entries(info)) {
- if (key.endsWith(".context_length") && typeof value === "number" && value > 0) {
- return { contextWindow: value, maxTokens: OLLAMA_DEFAULT_MAX_TOKENS };
- }
- }
+ const payload = (await response.json()) as { capabilities?: unknown; model_info?: Record<string, unknown> };
+ const capabilities = getOllamaCapabilities(payload.capabilities);
+ const contextWindow = getOllamaContextWindow(payload.model_info);
+ return {
+ contextWindow,
+ maxTokens: contextWindow ? OLLAMA_DEFAULT_MAX_TOKENS : undefined,
+ capabilities,
+ reasoning: capabilities ? capabilities.includes("thinking") : undefined,
+ thinking: getOllamaThinkingConfig(capabilities),
+ input: capabilities
+ ? capabilities.includes("vision")
+ ? (["text", "image"] as Array<"text" | "image">)
+ : (["text"] as Array<"text">)
+ : undefined,
+ };
  } catch {
  // fall through; caller decides on the fallback
  }
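
For reference, the `/api/show` fields the parsing above relies on are the `capabilities` array (for `thinking`/`vision`) and any `model_info` key ending in `.context_length`. A minimal standalone sketch of that probe, assuming Ollama's default local endpoint `http://localhost:11434` and a hypothetical helper name; the request body field is not visible in the diff and is an assumption:

```ts
// Sketch only (not from the package): probe a local Ollama instance for the same
// fields the discovery code above reads. The endpoint default and the request
// body field name ("model") are assumptions; the diff does not show them.
async function probeOllamaModel(modelId: string, baseUrl = "http://localhost:11434"): Promise<void> {
  const response = await fetch(`${baseUrl}/api/show`, {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: JSON.stringify({ model: modelId }),
  });
  if (!response.ok) return;
  const payload = (await response.json()) as {
    capabilities?: string[];
    model_info?: Record<string, unknown>;
  };
  // Capabilities drive the thinking/vision flags; `*.context_length` drives the context window.
  const contextEntry = Object.entries(payload.model_info ?? {}).find(
    ([key, value]) => key.endsWith(".context_length") && typeof value === "number" && value > 0,
  );
  console.log({
    thinking: payload.capabilities?.includes("thinking") ?? false,
    vision: payload.capabilities?.includes("vision") ?? false,
    contextWindow: contextEntry?.[1],
  });
}
```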
@@ -276,23 +333,27 @@ async function fetchOllamaShowLimits(nativeBaseUrl: string, modelId: string): Pr
  }
 
  /**
- * Build a resolver that fetches `/api/show` limits per model id and caches the
- * result in-memory for the lifetime of the manager. Successful lookups are
+ * Build a resolver that fetches `/api/show` metadata per model id and caches
+ * the result in-memory for the lifetime of the manager. Successful lookups are
  * cached so repeated `fetchDynamicModels` calls do not refetch; failed
  * lookups stay uncached so a later refresh can recover.
  */
- function createOllamaLimitsResolver(nativeBaseUrl: string): (modelId: string) => Promise<OllamaModelLimits> {
- const cache = new Map<string, Promise<OllamaModelLimits>>();
+ function createOllamaMetadataResolver(nativeBaseUrl: string): (modelId: string) => Promise<OllamaResolvedMetadata> {
+ const cache = new Map<string, Promise<OllamaResolvedMetadata>>();
  return modelId => {
  const cached = cache.get(modelId);
  if (cached) return cached;
  const pending = (async () => {
- const limits = await fetchOllamaShowLimits(nativeBaseUrl, modelId);
- if (!limits) {
+ const metadata = await fetchOllamaShowMetadata(nativeBaseUrl, modelId);
+ if (!metadata) {
  cache.delete(modelId);
  return { contextWindow: OLLAMA_FALLBACK_CONTEXT_WINDOW, maxTokens: OLLAMA_DEFAULT_MAX_TOKENS };
  }
- return limits;
+ return {
+ ...metadata,
+ contextWindow: metadata.contextWindow ?? OLLAMA_FALLBACK_CONTEXT_WINDOW,
+ maxTokens: metadata.maxTokens ?? OLLAMA_DEFAULT_MAX_TOKENS,
+ };
  })();
  cache.set(modelId, pending);
  void pending.catch(() => cache.delete(modelId));
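
The resolver above keeps the in-flight promise per model id and evicts it when the lookup fails or rejects, so a later refresh can retry. A generic sketch of that caching pattern, with hypothetical names and a simplified metadata shape:

```ts
// Sketch only: the promise-cache pattern used above, under assumed names/types.
type Metadata = { contextWindow: number; maxTokens: number };

function createCachedResolver(
  lookup: (id: string) => Promise<Metadata | undefined>,
  fallback: Metadata,
): (id: string) => Promise<Metadata> {
  const cache = new Map<string, Promise<Metadata>>();
  return id => {
    const cached = cache.get(id);
    if (cached) return cached;
    const pending = (async () => {
      const found = await lookup(id);
      if (!found) {
        cache.delete(id); // failed lookups stay uncached so a refresh can recover
        return fallback;
      }
      return found;
    })();
    cache.set(id, pending);
    void pending.catch(() => cache.delete(id)); // rejected promises are evicted too
    return pending;
  };
}
```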
@@ -702,7 +763,7 @@ export function ollamaModelManagerOptions(config?: OllamaModelManagerConfig): Mo
  const baseUrl = normalizeOllamaBaseUrl(config?.baseUrl);
  const nativeBaseUrl = toOllamaNativeBaseUrl(baseUrl);
  const references = createBundledReferenceMap<"openai-responses">("ollama" as Parameters<typeof getBundledModels>[0]);
- const resolveLimits = createOllamaLimitsResolver(nativeBaseUrl);
+ const resolveMetadata = createOllamaMetadataResolver(nativeBaseUrl);
  return {
  providerId: "ollama",
  fetchDynamicModels: async () => {
@@ -727,13 +788,20 @@ export function ollamaModelManagerOptions(config?: OllamaModelManagerConfig): Mo
  if (openAiCompatible && openAiCompatible.length > 0) {
  await Promise.all(
  openAiCompatible.map(async model => {
- const limits = await resolveLimits(model.id);
- model.contextWindow = limits.contextWindow;
+ const metadata = await resolveMetadata(model.id);
+ model.contextWindow = metadata.contextWindow;
+ if (metadata.reasoning !== undefined) {
+ model.reasoning = metadata.reasoning;
+ model.thinking = metadata.thinking;
+ }
+ if (metadata.input) {
+ model.input = metadata.input;
+ }
  }),
  );
  return openAiCompatible;
  }
- const nativeFallback = await fetchOllamaNativeModels(baseUrl, resolveLimits);
+ const nativeFallback = await fetchOllamaNativeModels(baseUrl, resolveMetadata);
  if (nativeFallback && nativeFallback.length > 0) {
  return nativeFallback;
  }
@@ -1407,8 +1475,11 @@ export function vllmModelManagerOptions(config?: VllmModelManagerConfig): ModelM
  baseUrl,
  apiKey,
  mapModel: (entry, defaults) => {
- const reference = references.get(defaults.id);
- return mapWithBundledReference(entry, defaults, reference);
+ const model = mapWithBundledReference(entry, defaults, references.get(defaults.id));
+ return {
+ ...model,
+ contextWindow: toPositiveNumber(entry.max_model_len, model.contextWindow),
+ };
  },
  }),
  };
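
The vLLM mapping above prefers the `max_model_len` the endpoint reports and keeps the bundled context window otherwise. A small illustration of that precedence, with `toPositiveNumber`'s behavior approximated since only its call site appears in the diff:

```ts
// Sketch: approximate behaviour of the context-window override above.
// `toPositiveNumber` is assumed to return the fallback for missing or non-positive values.
function toPositiveNumberSketch(value: unknown, fallback: number): number {
  return typeof value === "number" && Number.isFinite(value) && value > 0 ? value : fallback;
}

// An entry reporting max_model_len: 32768 overrides the bundled default,
// while an entry without the field keeps the bundled context window.
console.log(toPositiveNumberSketch(32768, 128_000)); // 32768
console.log(toPositiveNumberSketch(undefined, 128_000)); // 128000
```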
@@ -8,7 +8,7 @@ import type {
  MessageParam,
  RawMessageStreamEvent,
  } from "@anthropic-ai/sdk/resources/messages";
- import { $env, abortableSleep, isEnoent } from "@oh-my-pi/pi-utils";
+ import { $env, abortableSleep, isEnoent, readSseEvents } from "@oh-my-pi/pi-utils";
  import { hasOpus47ApiRestrictions, mapEffortToAnthropicAdaptiveEffort } from "../model-thinking";
  import { calculateCost } from "../models";
  import { getEnvApiKey, OUTPUT_FALLBACK_BUFFER } from "../stream";
@@ -658,18 +658,6 @@ function mergeHeaders(...headerSources: (Record<string, string> | undefined)[]):
  // We surface the resulting provider error ourselves, so keep the SDK quiet.
  const ANTHROPIC_SDK_LOG_LEVEL = "off" as const;
 
- interface ServerSentEvent {
- event: string | null;
- data: string;
- raw: string[];
- }
-
- interface SseDecoderState {
- event: string | null;
- data: string[];
- raw: string[];
- }
-
  const ANTHROPIC_MESSAGE_EVENTS: ReadonlySet<string> = new Set([
  "message_start",
  "message_delta",
@@ -679,136 +667,6 @@ const ANTHROPIC_MESSAGE_EVENTS: ReadonlySet<string> = new Set([
  "content_block_stop",
  ]);
 
- function flushSseEvent(state: SseDecoderState): ServerSentEvent | null {
- if (!state.event && state.data.length === 0) {
- return null;
- }
-
- const event: ServerSentEvent = {
- event: state.event,
- data: state.data.join("\n"),
- raw: [...state.raw],
- };
- state.event = null;
- state.data = [];
- state.raw = [];
- return event;
- }
-
- function decodeSseLine(line: string, state: SseDecoderState): ServerSentEvent | null {
- if (line === "") {
- return flushSseEvent(state);
- }
-
- state.raw.push(line);
- if (line.startsWith(":")) {
- return null;
- }
-
- const delimiterIndex = line.indexOf(":");
- const fieldName = delimiterIndex === -1 ? line : line.slice(0, delimiterIndex);
- let value = delimiterIndex === -1 ? "" : line.slice(delimiterIndex + 1);
- if (value.startsWith(" ")) {
- value = value.slice(1);
- }
-
- if (fieldName === "event") {
- state.event = value;
- } else if (fieldName === "data") {
- state.data.push(value);
- }
-
- return null;
- }
-
- function nextLineBreakIndex(text: string): number {
- const carriageReturnIndex = text.indexOf("\r");
- const newlineIndex = text.indexOf("\n");
- if (carriageReturnIndex === -1) {
- return newlineIndex;
- }
- if (newlineIndex === -1) {
- return carriageReturnIndex;
- }
- return Math.min(carriageReturnIndex, newlineIndex);
- }
-
- function consumeLine(text: string): { line: string; rest: string } | null {
- const lineBreakIndex = nextLineBreakIndex(text);
- if (lineBreakIndex === -1) {
- return null;
- }
-
- let nextIndex = lineBreakIndex + 1;
- if (text[lineBreakIndex] === "\r" && text[nextIndex] === "\n") {
- nextIndex += 1;
- }
-
- return {
- line: text.slice(0, lineBreakIndex),
- rest: text.slice(nextIndex),
- };
- }
-
- async function* iterateSseMessages(
- body: ReadableStream<Uint8Array>,
- signal?: AbortSignal,
- ): AsyncGenerator<ServerSentEvent> {
- const reader = body.getReader();
- const decoder = new TextDecoder();
- const state: SseDecoderState = { event: null, data: [], raw: [] };
- let buffer = "";
-
- try {
- while (true) {
- if (signal?.aborted) {
- throw new Error("Request was aborted");
- }
-
- const { value, done } = await reader.read();
- if (done) {
- break;
- }
-
- buffer += decoder.decode(value, { stream: true });
- let consumed = consumeLine(buffer);
- while (consumed) {
- buffer = consumed.rest;
- const event = decodeSseLine(consumed.line, state);
- if (event) {
- yield event;
- }
- consumed = consumeLine(buffer);
- }
- }
-
- buffer += decoder.decode();
- let consumed = consumeLine(buffer);
- while (consumed) {
- buffer = consumed.rest;
- const event = decodeSseLine(consumed.line, state);
- if (event) {
- yield event;
- }
- consumed = consumeLine(buffer);
- }
-
- if (buffer.length > 0) {
- const event = decodeSseLine(buffer, state);
- if (event) {
- yield event;
- }
- }
-
- const trailingEvent = flushSseEvent(state);
- if (trailingEvent) {
- yield trailingEvent;
- }
- } finally {
- reader.releaseLock();
- }
- }
-
  async function* iterateAnthropicEvents(
  response: Response,
  signal?: AbortSignal,
@@ -820,7 +678,7 @@ async function* iterateAnthropicEvents(
  let sawMessageStart = false;
  let sawMessageEnd = false;
 
- for await (const sse of iterateSseMessages(response.body, signal)) {
+ for await (const sse of readSseEvents(response.body, signal)) {
  if (sse.event === "error") {
  throw new Error(sse.data);
  }
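
Only the call site constrains `readSseEvents` here: it takes the response body stream plus an optional abort signal and yields events exposing `event` and `data`. A hedged consumer sketch under that assumption:

```ts
// Sketch only: how a caller might consume readSseEvents, inferred from the call site above.
import { readSseEvents } from "@oh-my-pi/pi-utils";

async function collectSseData(response: Response, signal?: AbortSignal): Promise<string[]> {
  if (!response.body) return [];
  const chunks: string[] = [];
  for await (const sse of readSseEvents(response.body, signal)) {
    if (sse.event === "error") throw new Error(sse.data);
    chunks.push(sse.data);
  }
  return chunks;
}
```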
@@ -105,23 +105,7 @@ export class EventStream<T, R = T> implements AsyncIterable<T> {
  }
  }
 
- // Delta events that can be batched for throttling
- type DeltaEvent =
- | { type: "text_delta"; contentIndex: number; delta: string; partial: AssistantMessage }
- | { type: "thinking_delta"; contentIndex: number; delta: string; partial: AssistantMessage }
- | { type: "toolcall_delta"; contentIndex: number; delta: string; partial: AssistantMessage };
-
- function isDeltaEvent(event: AssistantMessageEvent): event is DeltaEvent {
- return event.type === "text_delta" || event.type === "thinking_delta" || event.type === "toolcall_delta";
- }
-
  export class AssistantMessageEventStream extends EventStream<AssistantMessageEvent, AssistantMessage> {
- // Throttling state
- #deltaBuffer: DeltaEvent[] = [];
- #flushTimer?: NodeJS.Timeout;
- #lastFlushTime = 0;
- readonly #throttleMs = 50; // 20 updates/sec
-
  constructor() {
  super(
  event => event.type === "done" || event.type === "error",
@@ -139,103 +123,20 @@ export class AssistantMessageEventStream extends EventStream<AssistantMessageEve
  override push(event: AssistantMessageEvent): void {
  if (this.done) return;
 
- // Check for completion first
+ // Completion resolves the final result and still emits the terminal event.
  if (this.isComplete(event)) {
- this.#flushDeltas(); // Flush any pending deltas before completing
  this.done = true;
  this.resolveFinalResult(this.extractResult(event));
  }
 
- // Delta events get batched and throttled
- if (isDeltaEvent(event)) {
- this.#deltaBuffer.push(event);
- this.#scheduleFlush();
- return;
- }
-
- // Non-delta events flush pending deltas immediately, then emit
- this.#flushDeltas();
  this.deliver(event);
  }
 
  override end(result?: AssistantMessage): void {
- this.#flushDeltas();
  this.done = true;
  if (result !== undefined) {
  this.resolveFinalResult(result);
  }
  this.endWaiting();
  }
-
- override fail(err: unknown): void {
- if (this.#flushTimer) {
- clearTimeout(this.#flushTimer);
- this.#flushTimer = undefined;
- }
- this.#deltaBuffer = [];
- super.fail(err);
- }
-
- #scheduleFlush(): void {
- if (this.#flushTimer) return; // Already scheduled
-
- const now = Bun.nanoseconds();
- const timeSinceLastFlush = (now - this.#lastFlushTime) / 1e6;
-
- if (timeSinceLastFlush >= this.#throttleMs) {
- // Flush immediately if throttle window has passed
- this.#flushDeltas();
- } else {
- // Schedule flush for when throttle window expires
- const delay = this.#throttleMs - timeSinceLastFlush;
- this.#flushTimer = setTimeout(() => {
- this.#flushTimer = undefined;
- this.#flushDeltas();
- }, delay);
- }
- }
-
- #flushDeltas(): void {
- if (this.#flushTimer) {
- clearTimeout(this.#flushTimer);
- this.#flushTimer = undefined;
- }
-
- if (this.#deltaBuffer.length === 0) return;
-
- // Merge consecutive deltas for the same content block and type
- const merged = this.#mergeDeltas(this.#deltaBuffer);
- this.#deltaBuffer = [];
- this.#lastFlushTime = Bun.nanoseconds();
-
- for (const event of merged) {
- this.deliver(event);
- }
- }
-
- #mergeDeltas(deltas: DeltaEvent[]): AssistantMessageEvent[] {
- if (deltas.length === 0) return [];
- if (deltas.length === 1) return [deltas[0]];
-
- const result: AssistantMessageEvent[] = [];
- let current = deltas[0];
-
- for (let i = 1; i < deltas.length; i++) {
- const next = deltas[i];
- // Can merge if same type, same content index
- if (next.type === current.type && next.contentIndex === current.contentIndex) {
- current = {
- ...current,
- delta: current.delta + next.delta,
- partial: next.partial, // Use latest partial
- } as DeltaEvent;
- } else {
- result.push(current);
- current = next;
- }
- }
- result.push(current);
-
- return result;
- }
  }