@oh-my-pi/pi-ai 14.7.1 → 14.7.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md CHANGED
@@ -2,6 +2,14 @@
2
2
 
3
3
  ## [Unreleased]
4
4
 
5
+ ## [14.7.2] - 2026-05-06
6
+
7
+ ### Fixed
8
+
9
+ - Fixed VLLM model discovery to use `max_model_len` as the context window when the endpoint reports it.
10
+ - Fixed custom Ollama Cloud/local-proxy model aliases (for example `deepseek-v4-pro:cloud`) to inherit bundled cache-pricing metadata when the upstream model is known ([#937](https://github.com/can1357/oh-my-pi/issues/937)).
11
+ - Fixed local Ollama model discovery to apply the thinking and vision capabilities reported by `/api/show` in addition to native context windows ([#928](https://github.com/can1357/oh-my-pi/issues/928)).
12
+
5
13
  ## [14.7.0] - 2026-05-04
6
14
  ### Breaking Changes
7
15
 
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "type": "module",
3
3
  "name": "@oh-my-pi/pi-ai",
4
- "version": "14.7.1",
4
+ "version": "14.7.2",
5
5
  "description": "Unified LLM API with automatic model discovery and provider configuration",
6
6
  "homepage": "https://github.com/can1357/oh-my-pi",
7
7
  "author": "Can Boluk",
@@ -46,8 +46,8 @@
46
46
  "@aws-sdk/credential-provider-node": "^3.972.39",
47
47
  "@bufbuild/protobuf": "^2.12.0",
48
48
  "@google/genai": "^1.52.0",
49
- "@oh-my-pi/pi-natives": "14.7.1",
50
- "@oh-my-pi/pi-utils": "14.7.1",
49
+ "@oh-my-pi/pi-natives": "14.7.2",
50
+ "@oh-my-pi/pi-utils": "14.7.2",
51
51
  "@sinclair/typebox": "^0.34.49",
52
52
  "@smithy/node-http-handler": "^4.6.1",
53
53
  "ajv": "^8.20.0",
@@ -1,6 +1,7 @@
1
1
  import type { ModelManagerOptions } from "../model-manager";
2
+ import { Effort } from "../model-thinking";
2
3
  import { getBundledModels } from "../models";
3
- import type { Api, Model } from "../types";
4
+ import type { Api, Model, ThinkingConfig } from "../types";
4
5
  import { isAnthropicOAuthToken, isRecord, toBoolean, toNumber, toPositiveNumber } from "../utils";
5
6
  import {
6
7
  fetchOpenAICompatibleModels,
@@ -192,7 +193,7 @@ function toOllamaNativeBaseUrl(baseUrl: string): string {
192
193
 
193
194
  async function fetchOllamaNativeModels(
194
195
  baseUrl: string,
195
- resolveLimits: (modelId: string) => Promise<OllamaModelLimits>,
196
+ resolveMetadata: (modelId: string) => Promise<OllamaResolvedMetadata>,
196
197
  ): Promise<Model<"openai-responses">[] | null> {
197
198
  const nativeBaseUrl = toOllamaNativeBaseUrl(baseUrl);
198
199
  let response: Response;
@@ -213,18 +214,19 @@ async function fetchOllamaNativeModels(
213
214
  entries.map(async (entry): Promise<Model<"openai-responses"> | null> => {
214
215
  const id = entry.model ?? entry.name;
215
216
  if (!id) return null;
216
- const { contextWindow, maxTokens } = await resolveLimits(id);
217
+ const metadata = await resolveMetadata(id);
217
218
  return {
218
219
  id,
219
220
  name: entry.name ?? id,
220
221
  api: "openai-responses",
221
222
  provider: "ollama",
222
223
  baseUrl,
223
- reasoning: false,
224
- input: ["text"],
224
+ reasoning: metadata.reasoning ?? false,
225
+ thinking: metadata.thinking,
226
+ input: metadata.input ?? ["text"],
225
227
  cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0 },
226
- contextWindow,
227
- maxTokens,
228
+ contextWindow: metadata.contextWindow,
229
+ maxTokens: metadata.maxTokens,
228
230
  };
229
231
  }),
230
232
  );
@@ -241,18 +243,65 @@ const OLLAMA_FALLBACK_CONTEXT_WINDOW = 128_000;
241
243
  /** Cap max output tokens at a value that matches OMP's other openai-responses defaults. */
242
244
  const OLLAMA_DEFAULT_MAX_TOKENS = 8192;
243
245
 
244
- interface OllamaModelLimits {
246
+ interface OllamaResolvedMetadata {
245
247
  contextWindow: number;
246
248
  maxTokens: number;
249
+ capabilities?: string[];
250
+ reasoning?: boolean;
251
+ thinking?: ThinkingConfig;
252
+ input?: ("text" | "image")[];
253
+ }
254
+
255
+ interface OllamaShowMetadata {
256
+ contextWindow?: number;
257
+ maxTokens?: number;
258
+ capabilities?: string[];
259
+ reasoning?: boolean;
260
+ thinking?: ThinkingConfig;
261
+ input?: ("text" | "image")[];
262
+ }
263
+
264
+ function getOllamaContextWindow(modelInfo: Record<string, unknown> | undefined): number | undefined {
265
+ if (!modelInfo) {
266
+ return undefined;
267
+ }
268
+ for (const [key, value] of Object.entries(modelInfo)) {
269
+ if (typeof value !== "number" || value <= 0) {
270
+ continue;
271
+ }
272
+ if (key.endsWith(".context_length") || key.endsWith(".num_ctx") || key.endsWith(".context_window")) {
273
+ return value;
274
+ }
275
+ }
276
+ }
277
+
278
+ function getOllamaCapabilities(value: unknown): string[] | undefined {
279
+ if (!Array.isArray(value)) {
280
+ return undefined;
281
+ }
282
+ return value.filter((item): item is string => typeof item === "string");
283
+ }
284
+
285
+ function getOllamaThinkingConfig(capabilities: string[] | undefined): ThinkingConfig | undefined {
286
+ if (!capabilities?.includes("thinking")) {
287
+ return undefined;
288
+ }
289
+ return {
290
+ mode: "effort",
291
+ minLevel: Effort.Minimal,
292
+ maxLevel: Effort.High,
293
+ };
247
294
  }
248
295
 
249
296
  /**
250
- * Query Ollama's `/api/show` endpoint for a single model and pull its native
251
- * context length out of `model_info.<arch>.context_length`. Returns the
252
- * discovered limits, or `undefined` when the endpoint or field is
253
- * unavailable so callers can layer their own fallback.
297
+ * Query Ollama's `/api/show` endpoint for a single model and pull native
298
+ * context and capability metadata from the response. Returns `undefined` when
299
+ * the endpoint is unavailable so callers can layer their own fallback.
254
300
  */
255
- async function fetchOllamaShowLimits(nativeBaseUrl: string, modelId: string): Promise<OllamaModelLimits | undefined> {
301
+ async function fetchOllamaShowMetadata(
302
+ nativeBaseUrl: string,
303
+ modelId: string,
304
+ ): Promise<OllamaShowMetadata | undefined> {
256
305
  try {
257
306
  const response = await fetch(`${nativeBaseUrl}/api/show`, {
258
307
  method: "POST",
@@ -262,13 +311,21 @@ async function fetchOllamaShowLimits(nativeBaseUrl: string, modelId: string): Pr
262
311
  if (!response.ok) {
263
312
  return undefined;
264
313
  }
265
- const payload = (await response.json()) as { model_info?: Record<string, unknown> };
266
- const info = payload.model_info ?? {};
267
- for (const [key, value] of Object.entries(info)) {
268
- if (key.endsWith(".context_length") && typeof value === "number" && value > 0) {
269
- return { contextWindow: value, maxTokens: OLLAMA_DEFAULT_MAX_TOKENS };
270
- }
271
- }
314
+ const payload = (await response.json()) as { capabilities?: unknown; model_info?: Record<string, unknown> };
315
+ const capabilities = getOllamaCapabilities(payload.capabilities);
316
+ const contextWindow = getOllamaContextWindow(payload.model_info);
317
+ return {
318
+ contextWindow,
319
+ maxTokens: contextWindow ? OLLAMA_DEFAULT_MAX_TOKENS : undefined,
320
+ capabilities,
321
+ reasoning: capabilities ? capabilities.includes("thinking") : undefined,
322
+ thinking: getOllamaThinkingConfig(capabilities),
323
+ input: capabilities
324
+ ? capabilities.includes("vision")
325
+ ? (["text", "image"] as Array<"text" | "image">)
326
+ : (["text"] as Array<"text">)
327
+ : undefined,
328
+ };
272
329
  } catch {
273
330
  // fall through; caller decides on the fallback
274
331
  }
@@ -276,23 +333,27 @@ async function fetchOllamaShowLimits(nativeBaseUrl: string, modelId: string): Pr
276
333
  }
277
334
 
278
335
  /**
279
- * Build a resolver that fetches `/api/show` limits per model id and caches the
280
- * result in-memory for the lifetime of the manager. Successful lookups are
336
+ * Build a resolver that fetches `/api/show` metadata per model id and caches
337
+ * the result in-memory for the lifetime of the manager. Successful lookups are
281
338
  * cached so repeated `fetchDynamicModels` calls do not refetch; failed
282
339
  * lookups stay uncached so a later refresh can recover.
283
340
  */
284
- function createOllamaLimitsResolver(nativeBaseUrl: string): (modelId: string) => Promise<OllamaModelLimits> {
285
- const cache = new Map<string, Promise<OllamaModelLimits>>();
341
+ function createOllamaMetadataResolver(nativeBaseUrl: string): (modelId: string) => Promise<OllamaResolvedMetadata> {
342
+ const cache = new Map<string, Promise<OllamaResolvedMetadata>>();
286
343
  return modelId => {
287
344
  const cached = cache.get(modelId);
288
345
  if (cached) return cached;
289
346
  const pending = (async () => {
290
- const limits = await fetchOllamaShowLimits(nativeBaseUrl, modelId);
291
- if (!limits) {
347
+ const metadata = await fetchOllamaShowMetadata(nativeBaseUrl, modelId);
348
+ if (!metadata) {
292
349
  cache.delete(modelId);
293
350
  return { contextWindow: OLLAMA_FALLBACK_CONTEXT_WINDOW, maxTokens: OLLAMA_DEFAULT_MAX_TOKENS };
294
351
  }
295
- return limits;
352
+ return {
353
+ ...metadata,
354
+ contextWindow: metadata.contextWindow ?? OLLAMA_FALLBACK_CONTEXT_WINDOW,
355
+ maxTokens: metadata.maxTokens ?? OLLAMA_DEFAULT_MAX_TOKENS,
356
+ };
296
357
  })();
297
358
  cache.set(modelId, pending);
298
359
  void pending.catch(() => cache.delete(modelId));
@@ -702,7 +763,7 @@ export function ollamaModelManagerOptions(config?: OllamaModelManagerConfig): Mo
702
763
  const baseUrl = normalizeOllamaBaseUrl(config?.baseUrl);
703
764
  const nativeBaseUrl = toOllamaNativeBaseUrl(baseUrl);
704
765
  const references = createBundledReferenceMap<"openai-responses">("ollama" as Parameters<typeof getBundledModels>[0]);
705
- const resolveLimits = createOllamaLimitsResolver(nativeBaseUrl);
766
+ const resolveMetadata = createOllamaMetadataResolver(nativeBaseUrl);
706
767
  return {
707
768
  providerId: "ollama",
708
769
  fetchDynamicModels: async () => {
@@ -727,13 +788,20 @@ export function ollamaModelManagerOptions(config?: OllamaModelManagerConfig): Mo
727
788
  if (openAiCompatible && openAiCompatible.length > 0) {
728
789
  await Promise.all(
729
790
  openAiCompatible.map(async model => {
730
- const limits = await resolveLimits(model.id);
731
- model.contextWindow = limits.contextWindow;
791
+ const metadata = await resolveMetadata(model.id);
792
+ model.contextWindow = metadata.contextWindow;
793
+ if (metadata.reasoning !== undefined) {
794
+ model.reasoning = metadata.reasoning;
795
+ model.thinking = metadata.thinking;
796
+ }
797
+ if (metadata.input) {
798
+ model.input = metadata.input;
799
+ }
732
800
  }),
733
801
  );
734
802
  return openAiCompatible;
735
803
  }
736
- const nativeFallback = await fetchOllamaNativeModels(baseUrl, resolveLimits);
804
+ const nativeFallback = await fetchOllamaNativeModels(baseUrl, resolveMetadata);
737
805
  if (nativeFallback && nativeFallback.length > 0) {
738
806
  return nativeFallback;
739
807
  }
@@ -1407,8 +1475,11 @@ export function vllmModelManagerOptions(config?: VllmModelManagerConfig): ModelM
1407
1475
  baseUrl,
1408
1476
  apiKey,
1409
1477
  mapModel: (entry, defaults) => {
1410
- const reference = references.get(defaults.id);
1411
- return mapWithBundledReference(entry, defaults, reference);
1478
+ const model = mapWithBundledReference(entry, defaults, references.get(defaults.id));
1479
+ return {
1480
+ ...model,
1481
+ contextWindow: toPositiveNumber(entry.max_model_len, model.contextWindow),
1482
+ };
1412
1483
  },
1413
1484
  }),
1414
1485
  };