npm - pi-omlx-picker - Versions diffs - 0.2.9 → 0.3.1 - Mend

pi-omlx-picker 0.2.9 → 0.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (11)

package/index.ts CHANGED Viewed

@@ -20,6 +20,7 @@ import {
 } from "./src/catalog.ts";
 import {
 	DEFAULT_OMLX_BASE_URL,
+	hasOmlxTarget,
 	loadConfig,
 	type OmlxConfig,
 	resolveConfiguredApiKey,
@@ -115,9 +116,18 @@ function registerModels(
 	models: OmlxModel[],
 	modelSettingsPath?: string,
 ): void {
+	const keyless = !resolveConfiguredApiKey();
 	pi.registerProvider(PROVIDER, {
 		name: "OMLX",
-		...toProviderConfig(config.apiRoot, config.apiKeyEnvVar, models),
+		...toProviderConfig(
+			config.apiRoot,
+			config.apiKeyEnvVar,
+			models,
+			undefined,
+			{
+				keyless,
+			},
+		),
 	});
 	state.config = config;
 	state.catalog = models;
@@ -152,8 +162,9 @@ function registerCachedOrSetupModels(pi: ExtensionAPI, state: State): void {
 		apiRoot: DEFAULT_OMLX_BASE_URL,
 		apiKeyEnvVar: "OMLX_API_KEY",
 	};
+	const configured = resolveConfiguredApiKey() || hasOmlxTarget();
 	const cached = registrableCachedModels(readCatalogCache(config.apiRoot));
-	const fallbackCached = resolveConfiguredApiKey()
+	const fallbackCached = configured
 		? undefined
 		: registrableCachedModels(readLastCatalogCache());
 	const models = cached ?? fallbackCached;
@@ -161,7 +172,7 @@ function registerCachedOrSetupModels(pi: ExtensionAPI, state: State): void {
 		state.config = config;
 		state.catalog = [];
 		state.registered = false;
-		state.lastError = resolveConfiguredApiKey()
+		state.lastError = configured
 			? "No cached OMLX models with real max_context_window/max_tokens; waiting for live catalog refresh."
 			: "OMLX credentials are not set. Run /login and choose OMLX.";
 		state.lastRefreshAt = new Date().toISOString();
@@ -169,7 +180,9 @@ function registerCachedOrSetupModels(pi: ExtensionAPI, state: State): void {
 		return;
 	}
-	if (resolveConfiguredApiKey()) {
+	// A key OR a configured base URL (keyless server) is enough to register the
+	// real provider. Pi omits the auth header when the resolved key is empty.
+	if (configured) {
 		registerModels(pi, state, config, models);
 		return;
 	}
@@ -226,7 +239,7 @@ async function refreshProvider(
 ): Promise<RefreshResult> {
 	const config = loadConfig();
 	const apiKey = resolveConfiguredApiKey();
-	if (!apiKey) {
+	if (!apiKey && !hasOmlxTarget()) {
 		state.lastError = "OMLX credentials are not set";
 		return "not_configured";
 	}
@@ -237,7 +250,7 @@ async function refreshProvider(
 	let models: OmlxModel[];
 	try {
-		models = await fetchModels(config.apiRoot, apiKey, {
+		models = await fetchModels(config.apiRoot, apiKey ?? "", {
 			modelSettingsPath,
 			timeoutMs: opts.timeoutMs,
 		});

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
 	"name": "pi-omlx-picker",
-	"version": "0.2.9",
+	"version": "0.3.1",
 	"type": "module",
 	"description": "Pi extension that discovers models from a local OMLX server and registers them as a native Pi provider.",
 	"license": "MIT",

package/src/catalog.ts CHANGED Viewed

@@ -10,6 +10,8 @@ export interface OmlxModel {
 	modelAlias?: string;
 	contextWindow?: number;
 	maxTokens?: number;
+	/** Model architectural ceiling (`max_model_len`). Prio-3 fallback and clamp limit. */
+	archContextWindow?: number;
 	thinkingDefault?: boolean | null;
 	taskBudgetTokens?: number;
 	maxToolResultTokens?: number;
@@ -36,7 +38,7 @@ export interface CatalogDebugEvent {
 interface OpenAIModelsResponse {
 	object: string;
-	data: Array<{ id: string; object?: string }>;
+	data: Array<{ id: string; object?: string; max_model_len?: number | null }>;
 }
 interface OmlxModelsStatusResponse {
@@ -92,7 +94,10 @@ export function parseModelsResponse(json: unknown): OmlxModel[] {
 		if (!entry || typeof entry.id !== "string" || !entry.id) continue;
 		if (seen.has(entry.id)) continue;
 		seen.add(entry.id);
-		out.push({ id: entry.id });
+		const m: OmlxModel = { id: entry.id };
+		if (typeof entry.max_model_len === "number" && entry.max_model_len > 0)
+			m.archContextWindow = entry.max_model_len;
+		out.push(m);
 	}
 	return out;
 }
@@ -208,13 +213,15 @@ export async function fetchModels(
 			opts.modelSettingsPath,
 			opts.onDebug,
 		);
-		return applyApiGlobalDefaultsIfNeeded(
-			models,
-			apiRoot,
-			apiKey,
-			opts.signal,
-			timeoutMs,
-			opts.onDebug,
+		return resolveArchContextLimits(
+			await applyApiGlobalDefaultsIfNeeded(
+				models,
+				apiRoot,
+				apiKey,
+				opts.signal,
+				timeoutMs,
+				opts.onDebug,
+			),
 		);
 	} catch (err) {
 		if (err instanceof Error && err.name === "AbortError") throw err;
@@ -245,16 +252,35 @@ export async function fetchModels(
 		opts.modelSettingsPath,
 		opts.onDebug,
 	);
-	return applyApiGlobalDefaultsIfNeeded(
-		models,
-		apiRoot,
-		apiKey,
-		opts.signal,
-		timeoutMs,
-		opts.onDebug,
+	return resolveArchContextLimits(
+		await applyApiGlobalDefaultsIfNeeded(
+			models,
+			apiRoot,
+			apiKey,
+			opts.signal,
+			timeoutMs,
+			opts.onDebug,
+		),
 	);
 }
+/**
+ * Final context-window resolution, applied after model-specific (prio 1) and
+ * global (prio 2) settings. The model's architectural ceiling
+ * (`archContextWindow`, from `max_model_len`) is the prio-3 fallback when no
+ * user setting exists, and the hard clamp when a user setting exceeds it.
+ */
+export function resolveArchContextLimits(models: OmlxModel[]): OmlxModel[] {
+	return models.map((model) => {
+		const arch = model.archContextWindow;
+		if (arch == null) return model;
+		const next: OmlxModel = { ...model };
+		if (next.contextWindow == null) next.contextWindow = arch;
+		else if (next.contextWindow > arch) next.contextWindow = arch;
+		return next;
+	});
+}
 async function applyApiGlobalDefaultsIfNeeded(
 	models: OmlxModel[],
 	apiRoot: string,
@@ -263,7 +289,8 @@ async function applyApiGlobalDefaultsIfNeeded(
 	timeoutMs: number,
 	onDebug?: (event: CatalogDebugEvent) => void,
 ): Promise<OmlxModel[]> {
-	if (!models.some((m) => !m.contextWindow || !m.maxTokens)) return models;
+	if (!models.some((m) => m.contextWindow == null || m.maxTokens == null))
+		return models;
 	let defaults: OmlxGlobalDefaults | undefined;
 	try {
 		defaults = await fetchGlobalDefaults(apiRoot, apiKey, signal, timeoutMs);
@@ -272,6 +299,7 @@ async function applyApiGlobalDefaultsIfNeeded(
 			details: { apiRoot, defaults },
 		});
 	} catch (err) {
+		if (signal?.aborted) throw err;
 		onDebug?.({
 			kind: "catalog_global_settings_failed",
 			details: {
@@ -281,12 +309,13 @@ async function applyApiGlobalDefaultsIfNeeded(
 		});
 		return models;
 	}
-	if (!defaults.contextWindow && !defaults.maxTokens) return models;
+	if (defaults.contextWindow == null && defaults.maxTokens == null)
+		return models;
 	return models.map((model) => {
 		const next: OmlxModel = { ...model };
-		if (!next.contextWindow && defaults.contextWindow)
+		if (next.contextWindow == null && defaults.contextWindow != null)
 			next.contextWindow = defaults.contextWindow;
-		if (!next.maxTokens && defaults.maxTokens)
+		if (next.maxTokens == null && defaults.maxTokens != null)
 			next.maxTokens = defaults.maxTokens;
 		return next;
 	});
@@ -325,8 +354,10 @@ async function getJson(
 	timeoutMs: number,
 ): Promise<unknown> {
 	const signal = withTimeout(parent, timeoutMs);
+	// Empty key => keyless server (skip_api_key_verification): omit the header.
+	const headers = apiKey ? { Authorization: `Bearer ${apiKey}` } : undefined;
 	const res = await fetch(url, {
-		headers: { Authorization: `Bearer ${apiKey}` },
+		headers,
 		signal,
 	}).catch((err) => {
 		if (err instanceof Error && err.name === "AbortError") {

package/src/config.ts CHANGED Viewed

@@ -34,6 +34,18 @@ export function resolveConfiguredApiKey(
 	return loadOmlxCredential()?.apiKey;
 }
+/**
+ * True when the user has pointed us at a server even without an API key.
+ * OMLX servers run with `skip_api_key_verification: true` need no key; an
+ * explicit base URL (env or stored) is the signal that a keyless server is
+ * intended. With neither key nor base URL there is nothing to talk to.
+ */
+export function hasOmlxTarget(env: NodeJS.ProcessEnv = process.env): boolean {
+	if (env.OMLX_API_KEY || env.OMLX_BASE_URL) return true;
+	const stored = loadOmlxCredential();
+	return Boolean(stored?.apiKey || stored?.baseUrl);
+}
 // Legacy helper for older stored api_key credentials. Never fills only one side
 // of the env pair; partial shell overrides remain explicit shell state.
 export function applyStoredCredentialToEnv(

package/src/overflow.ts CHANGED Viewed

@@ -4,6 +4,7 @@ const OMLX_OVERFLOW_RE =
 	/prompt too long[:.]?\s*(\d[\d,]*)\s*tokens?\s*exceeds\s*max(?:imum)?\s*context window of\s*(\d[\d,]*)\s*tokens?/i;
 export function normalizeOverflowMessage(errorMessage: string): string {
+	if (errorMessage.startsWith("prompt is too long:")) return errorMessage;
 	const match = OMLX_OVERFLOW_RE.exec(errorMessage);
 	if (!match) return errorMessage;
 	const used = match[1];

package/src/provider.ts CHANGED Viewed

@@ -23,22 +23,27 @@ export function toProviderConfig(
 	apiKeyEnvVar: string,
 	models: OmlxModel[],
 	onStreamTimeout?: (event: StreamTimeoutEvent) => void,
+	options: { keyless?: boolean } = {},
 ): ProviderConfig {
-	return {
+	const config: ProviderConfig = {
 		baseUrl: apiRoot,
-		apiKey: `$${apiKeyEnvVar}`,
 		api: "openai-completions",
-		authHeader: true,
-		streamSimple: (model, context, options) =>
+		// Keyless server (skip_api_key_verification): no auth header. Pi rejects
+		// authHeader:true with no key, and resolveConfigValueOrThrow would throw
+		// on an unset $OMLX_API_KEY — so both apiKey and authHeader stay off.
+		authHeader: !options.keyless,
+		streamSimple: (model, context, streamOptions) =>
 			streamOmlxOpenAICompletions(
 				model,
 				context,
-				options,
+				streamOptions,
 				resolveFirstDeltaTimeoutMs(),
 				onStreamTimeout,
 			),
 		models: models.map(toProviderModel),
 	};
+	if (!options.keyless) config.apiKey = `$${apiKeyEnvVar}`;
+	return config;
 }
 function requirePositive(
@@ -59,11 +64,11 @@ function toProviderModel(m: OmlxModel): ProviderModelConfig {
 		name: m.displayName ?? m.id,
 		reasoning,
 		input: m.modelType === "vlm" ? ["text", "image"] : ["text"],
-		cost: FREE_COST,
+		cost: { ...FREE_COST },
 		contextWindow: requirePositive(m.contextWindow, m.id, "max_context_window"),
 		maxTokens: requirePositive(m.maxTokens, m.id, "max_tokens"),
 		compat: reasoning
 			? { ...BASE_COMPAT, thinkingFormat: thinkingFormatFor(m.reasoningParser) }
-			: BASE_COMPAT,
+			: { ...BASE_COMPAT },
 	};
 }

package/src/repeat-stop.ts CHANGED Viewed

@@ -29,11 +29,8 @@ function extractAssistantParts(message: AssistantMessage): AssistantParts {
 function lastAssistantMessage(
 	messages: Message[],
 ): AssistantMessage | undefined {
-	for (let i = messages.length - 1; i >= 0; i--) {
-		const m = messages[i];
-		if (m.role === "assistant") return m;
-	}
-	return undefined;
+	const last = messages.at(-1);
+	return last?.role === "assistant" ? last : undefined;
 }
 function bigramCounts(s: string): Map<string, number> {

package/src/stream-events.ts CHANGED Viewed

@@ -27,19 +27,6 @@ export function isThinkingEvent(event: AssistantMessageEvent): boolean {
 	);
 }
-export function mergeAbortSignals(
-	parent: AbortSignal | undefined,
-	child: AbortSignal,
-): AbortSignal {
-	if (!parent) return child;
-	if (parent.aborted) return parent;
-	const controller = new AbortController();
-	const abort = () => controller.abort();
-	parent.addEventListener("abort", abort, { once: true });
-	child.addEventListener("abort", abort, { once: true });
-	return controller.signal;
-}
 export function eventPartial(
 	event: AssistantMessageEvent,
 	model: Model<Api>,
@@ -64,7 +51,7 @@ export function errorAssistantMessage(
 		api: model.api,
 		provider: model.provider,
 		model: model.id,
-		usage: ZERO_USAGE,
+		usage: { ...ZERO_USAGE, cost: { ...ZERO_USAGE.cost } },
 		stopReason,
 		errorMessage,
 		timestamp: Date.now(),

package/src/stream-writer.ts CHANGED Viewed

@@ -21,6 +21,13 @@ export class StreamWriter {
 	}
 	push(event: AssistantMessageEvent): void {
+		if (event.type === "start") {
+			if (!this.startPushed) {
+				this.stream.push(this.startEvent ?? event);
+				this.startPushed = true;
+			}
+			return;
+		}
 		if (!this.startPushed) {
 			this.stream.push(
 				this.startEvent ?? {

package/src/stream.ts CHANGED Viewed

@@ -7,18 +7,31 @@ import {
 	type Model,
 	type SimpleStreamOptions,
 } from "@earendil-works/pi-ai";
-import { streamSimple as streamSimpleOpenAICompletions } from "@earendil-works/pi-ai/api/openai-completions";
+// Resolve the concrete OpenAI Completions stream once, via the lazy API factory
+// re-exported from compat. Calling the compat `streamSimple` dispatcher from
+// inside this wrapper would re-resolve through the api-provider registry; and
+// because this extension registers itself as the openai-completions handler,
+// that routes dispatch -> wrapper -> dispatch -> ... and overflows the stack.
+// The lazy factory returns a closure over the concrete implementation that
+// loads the module on first call and calls it directly — no registry, no
+// re-dispatch. It is captured here at module load, BEFORE the wrapper is
+// registered, so it can never be the wrapper itself.
+//
+// Note: pi's extension loader (jiti) only aliases a fixed set of pi-ai
+// subpaths (root, /compat, /oauth). Importing `@earendil-works/pi-ai/api/...`
+// is not resolvable there, so the concrete module is reached through compat.
+import { openAICompletionsApi } from "@earendil-works/pi-ai/compat";
+import { PROVIDER_KEY } from "./auth-storage.ts";
 import { normalizeErrorEvent } from "./overflow.ts";
 import { isRepeatStop } from "./repeat-stop.ts";
-import {
-	isMeaningfulBodyEvent,
-	isThinkingEvent,
-	mergeAbortSignals,
-} from "./stream-events.ts";
+import { isMeaningfulBodyEvent, isThinkingEvent } from "./stream-events.ts";
 import { StreamWriter } from "./stream-writer.ts";
+const streamOpenAICompletionsImpl = openAICompletionsApi().streamSimple;
 const DEFAULT_FIRST_DELTA_TIMEOUT_MS = 120_000;
 const FIRST_DELTA_MAX_ATTEMPTS = 2;
+const MAX_REISSUES = 1;
 export type StreamTimeoutEvent = {
 	model: string;
@@ -45,6 +58,32 @@ export function resolveFirstDeltaTimeoutMs(): number {
 		: DEFAULT_FIRST_DELTA_TIMEOUT_MS;
 }
+// Merge the parent (caller) signal with our own timeout signal. We compose the
+// raw source signals directly via AbortSignal.any rather than chaining through
+// a freshly-created controller per call: chaining previously-merged signals
+// accumulates abort listeners across a long session and, when abort fires,
+// propagates through N recursive .abort() calls that overflow the stack.
+// AbortSignal.any keeps the merged signal detached from either source's
+// listener set, and returns an already-aborted signal if either input is
+// aborted (so a pre-aborted parent propagates immediately). Node >=22
+// guarantees AbortSignal.any is available (engines).
+function mergeTimeoutSignal(
+	parent: AbortSignal | undefined,
+	own: AbortSignal,
+): AbortSignal {
+	if (!parent) return own;
+	return AbortSignal.any([parent, own]) as AbortSignal;
+}
+// Always flush buffered thinking events before leaving runAttempt, including
+// the timed-out and thrown paths. Previously these were dropped silently.
+function flushThinking(
+	writer: StreamWriter,
+	events: AssistantMessageEvent[],
+): void {
+	for (const held of events) writer.push(held);
+}
 async function runAttempt(
 	writer: StreamWriter,
 	model: Model<Api>,
@@ -56,11 +95,14 @@ async function runAttempt(
 	allowReissue: boolean,
 ): Promise<AttemptResult> {
 	const controller = new AbortController();
-	const signal = mergeAbortSignals(options?.signal, controller.signal);
+	const signal = mergeTimeoutSignal(options?.signal, controller.signal);
 	let timedOut = false;
 	let firstMeaningfulEvent = false;
 	const timer = setTimeout(() => {
+		// clearTimeout below does not stop a callback already queued on the
+		// event loop; firstMeaningfulEvent is a load-bearing guard against the
+		// timer firing just after the first body event cleared it.
 		if (writer.closed || firstMeaningfulEvent) return;
 		timedOut = true;
 		onTimeout?.({
@@ -76,7 +118,9 @@ async function runAttempt(
 	let bufferedThinking: AssistantMessageEvent[] = [];
 	try {
-		const inner = streamSimpleOpenAICompletions(
+		// Direct call into the concrete implementation: no dispatch, so a model
+		// whose provider registered this wrapper cannot recurse back into it.
+		const inner = streamOpenAICompletionsImpl(
 			model as Model<"openai-completions">,
 			context,
 			{ ...options, signal },
@@ -96,18 +140,27 @@ async function runAttempt(
 				continue;
 			}
 			if (allowReissue && isRepeatStop(event, context)) {
+				// Tear down the inner stream's network connection immediately so it
+				// does not drain unpredictably while the caller reissues.
+				controller.abort();
 				bufferedThinking = [];
 				return "reissue";
 			}
-			for (const held of bufferedThinking) writer.push(held);
+			flushThinking(writer, bufferedThinking);
 			bufferedThinking = [];
 			writer.push(normalizeErrorEvent(event));
 			if (event.type === "done" || event.type === "error") break;
 		}
+	} catch (err) {
+		flushThinking(writer, bufferedThinking);
+		if (timedOut) return "timed-out";
+		throw err;
 	} finally {
 		clearTimeout(timer);
 	}
+	flushThinking(writer, bufferedThinking);
 	return timedOut ? "timed-out" : "completed";
 }
@@ -118,13 +171,32 @@ export function streamOmlxOpenAICompletions(
 	firstDeltaTimeoutMs: number,
 	onTimeout: OnStreamTimeout | undefined,
 ): AssistantMessageEventStream {
+	// Pi dispatches stream handlers by api id, not provider. Registering this
+	// wrapper as the openai-completions streamSimple handler (the mechanism pi
+	// exposes) replaces the shared entry, so non-oMLX OpenAI-compatible models
+	// (groq, zai, glm, ...) also arrive here. Pass them straight through to the
+	// concrete implementation with no oMLX-specific timeout/reissue logic, so the
+	// extension cannot perturb unrelated providers.
+	if (model.provider !== PROVIDER_KEY) {
+		return streamOpenAICompletionsImpl(
+			model as Model<"openai-completions">,
+			context,
+			options,
+		);
+	}
 	const stream = createAssistantMessageEventStream();
 	const writer = new StreamWriter(stream, model);
 	(async () => {
 		try {
-			let reissued = false;
-			for (let attempt = 1; attempt <= FIRST_DELTA_MAX_ATTEMPTS; attempt++) {
+			// Separate budgets so a reissue never consumes a timeout attempt and
+			// the loop index is never mutated: timeoutAttemptsLeft governs how
+			// many timed-out retries remain, reissueLeft governs reissues.
+			let timeoutAttemptsLeft = FIRST_DELTA_MAX_ATTEMPTS;
+			let reissueLeft = MAX_REISSUES;
+			while (true) {
+				const attempt = FIRST_DELTA_MAX_ATTEMPTS - timeoutAttemptsLeft + 1;
 				const result = await runAttempt(
 					writer,
 					model,
@@ -133,20 +205,22 @@ export function streamOmlxOpenAICompletions(
 					firstDeltaTimeoutMs,
 					attempt,
 					onTimeout,
-					!reissued,
+					reissueLeft > 0,
 				);
 				if (writer.closed) return;
 				if (result === "reissue") {
-					reissued = true;
-					attempt--; // a re-issue doesn't consume a timeout attempt
+					reissueLeft--;
 					continue;
 				}
 				if (result === "completed") {
 					writer.end();
 					return;
 				}
-				if (attempt >= FIRST_DELTA_MAX_ATTEMPTS) {
+				// timed-out
+				timeoutAttemptsLeft--;
+				if (timeoutAttemptsLeft <= 0) {
 					writer.pushError(
 						firstDeltaTimeoutMessage(
 							firstDeltaTimeoutMs,

package/src/thinking-format.ts CHANGED Viewed

@@ -19,6 +19,6 @@ export function thinkingFormatFor(
 	if (!reasoningParser) return NO_THINKING_FORMAT;
 	return (
 		REASONING_PARSER_FORMATS[reasoningParser.toLowerCase()] ??
-		OMLX_CHAT_TEMPLATE_FORMAT
+		NO_THINKING_FORMAT
 	);
 }