npm - pi-llama-cpp - Versions diffs - 0.5.1 → 0.7.0 - Mend

pi-llama-cpp 0.5.1 → 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (36) hide show

package/README.md +96 -30
package/package.json +6 -5
package/src/constants.ts +27 -5
package/src/enums/action.ts +3 -2
package/src/enums/mode.ts +1 -0
package/src/enums/serverStatus.ts +6 -0
package/src/enums/status.ts +1 -0
package/src/index.ts +53 -31
package/src/interfaces/auth.ts +1 -5
package/src/interfaces/endpoints/props.ts +1 -0
package/src/interfaces/levels.ts +7 -0
package/src/managers/command.ts +290 -0
package/src/managers/events.ts +101 -0
package/src/managers/server.ts +136 -0
package/src/models/baseModel.ts +75 -20
package/src/models/legacyModel.ts +45 -0
package/src/models/routerModel.ts +7 -30
package/src/models/singleModel.ts +9 -6
package/src/resolver.ts +152 -0
package/src/server.ts +187 -0
package/tests/commandManager.test.ts +182 -133
package/tests/events.test.ts +256 -0
package/tests/legacyModel.test.ts +112 -0
package/tests/mocks.ts +100 -0
package/tests/resolver.test.ts +143 -106
package/tests/routerModel.test.ts +46 -68
package/tests/server.test.ts +176 -0
package/tests/serverManager.test.ts +130 -0
package/tests/singleModel.test.ts +21 -29
package/src/commands/models.ts +0 -228
package/src/events.ts +0 -26
package/src/manager.ts +0 -96
package/src/tools/resolver.ts +0 -136
package/src/tools/retriever.ts +0 -71
package/tests/handlers.test.ts +0 -164
package/tests/modelsCommand.test.ts +0 -270

package/README.md CHANGED Viewed

@@ -1,6 +1,6 @@
 # pi-llama-cpp
-A [Pi Coding Agent](https://pi.dev/) extension that integrates with a running [llama.cpp server](https://github.com/ggml-org/llama.cpp) to provide live model browsing, loading, and switching directly from Pi.
+A [Pi Coding Agent](https://pi.dev/) extension that integrates with running [llama.cpp servers](https://github.com/ggml-org/llama.cpp) to provide live model browsing, loading, and switching directly from Pi.
 ## Features
@@ -10,20 +10,26 @@ A [Pi Coding Agent](https://pi.dev/) extension that integrates with a running [l
 - **Multi-model router support** — works with both single-model and multi-model llama.cpp server configurations
 - **Image capabilities detection** — detects multimodal models automatically
 - **Flexible URL resolution** — configures the server URL via project config, environment variable, or global settings
+- **Auth support** — lets you log in to a llama.cpp server that is secured with an API key
+- **Multiple server support** — connect to multiple llama.cpp servers simultaneously by separating URLs with semicolons
+- **Thinking budget support** — configurable token budgets for model reasoning/thinking, mapped to Pi's thinking levels
 ### Status Indicators
-| Icon | Status | Description |
-|------|--------|-------------|
-| 🟢 | Loaded | Model is active and ready to use |
-| 🟡 | Loading | Model is currently being loaded |
-| 🔴 | Failed | Model failed to load |
-| 🔵 | Sleeping | Model is available, but inactive |
-| ⚪ | Unloaded | Model is not loaded on the server |
+| Icon | Status       | Description                            |
+| ---- | ------------ | -------------------------------------- |
+| 🟢   | Loaded       | Model is active and ready to use       |
+| 🟡   | Loading      | Model is currently being loaded        |
+| 🔴   | Failed       | Model failed to load                   |
+| 🔵   | Sleeping     | Model is available, but inactive       |
+| ⚪   | Unloaded     | Model is not loaded on the server      |
+| ⛔   | Unauthorized | Model can't be used (API key required) |
 > **Note**: The `Sleeping` status only shows when you start your server with `llama-server --sleep-idle-seconds <n> ...`.
-This is a **llama.cpp server flag** that tells the server to put idle models to sleep after `n` seconds.
-The model awakens automatically when you send a message.
+> This is a **llama.cpp server flag** that tells the server to put idle models to sleep after `n` seconds.
+> The model awakens automatically when you send a message.
+> **Note:** To require API-key authentication, start your server with `llama-server --api-key <your key> ...`.
 ## Installation
@@ -41,13 +47,13 @@ pi install https://github.com/gsanhueza/pi-llama-cpp
 ## Configuration
-The extension resolves the llama.cpp server URL using the following priority order:
+The extension resolves the llama.cpp server URL(s) using the following priority order:
-1. **Per-project config** — `.pi/llama-server.json` in your project root:
+1. **Per-project config** — `.pi/settings.json` in your project root:
    ```json
    {
-     "url": "http://127.0.0.1:8080"
+     "llamaServerUrl": "http://127.0.0.1:8080"
    }
    ```
@@ -63,19 +69,33 @@ The extension resolves the llama.cpp server URL using the following priority ord
 4. **Default** — `http://127.0.0.1:8080`
-### API Key
+### Multiple Servers
+To connect to multiple llama.cpp servers simultaneously, provide your URLs as a single string **separated by semicolons**, using any of the configuration methods above:
-If your llama.cpp server requires authentication, use `/login` in Pi, select the "API key" option, and choose the `Llama.cpp` provider from the list.
+```bash
+# Example for env, but you can use any of the other methods
+LLAMA_SERVER_URL="http://127.0.0.1:8080;http://127.0.0.1:8081;http://10.0.0.5:8080"
+```
-Alternatively, configure the API key in `~/.pi/agent/auth.json` using the provider ID `llama-server`:
+Each server gets its own provider (e.g., **Llama.cpp (http://127.0.0.1:8080)**) and its own set of models. The `/models` command lists all models from all servers, labeled with their server URL.
+### API Key
-> **Note**: The provider is displayed as **Llama.cpp** in the Pi UI, but its internal identifier is `llama-server` — use this ID when configuring `auth.json` or other programmatic access.
+If your llama.cpp server requires authentication, use `/login` in Pi, select the "API key" option, and choose the provider from the list that corresponds to the server needing the API key.
+Alternatively, configure the API key in `~/.pi/agent/auth.json`:
+Use the provider ID `llama-server=<url>`:
 ```json
 {
-  "llama-server": {
+  "llama-server=http://127.0.0.1:8080": {
+    "type": "api_key",
+    "key": "<key-for-server-1>"
+  },
+  "llama-server=https://some-url-for-llama-cpp": {
     "type": "api_key",
-    "key": "<your-api-key-here>"
+    "key": "<key-for-server-2>"
   }
 }
 ```
@@ -98,22 +118,34 @@ llama-server --models-preset path/to/presets.ini ...
 llama-server --model path/to/model.gguf ...
 ```
+- For legacy-model mode (e.g., [ik_llama.cpp](https://github.com/ikawrakow/ik_llama.cpp)), the extension auto-detects and handles it transparently.
+> **Note:** This extension is focused on llama.cpp, not on ik_llama.cpp. Nonetheless, since I found a way to make it work with this extension, I added the option.
+> **Note:** The ik_llama.cpp fork is not legacy at all, but it uses an old way of describing models compared to llama.cpp.
 The extension determines the context size as follows:
 - **Router mode**
   - When loaded, reads `meta.n_ctx` from the `/models` endpoint
-  - When not loaded, reads `--ctx-size` and/or `--fit-ctx` from the server arguments, or `ctx-size` and/or `fit-ctx` keys from the **presets.ini** file.
+  - When not loaded, reads `--ctx-size` and/or `--fit-ctx` from the server arguments (which can also originate from the **presets.ini** file the llama.cpp server uses to load its models).
 - **Single mode** — reads `meta.n_ctx` from the `/models` endpoint
+- **Legacy mode** — reads `max_model_len` from `/models`, falling back to `n_ctx` from `/props`
 - Falls back to `128000` if not available
 ### Commands
-| Command          | Description                                                                                |
-| ---------------- | ------------------------------------------------------------------------------------------ |
-| `/models`        | Browse your models with live status. Select a model to load, switch, or unload it.         |
-| `/models info`   | Show detailed information for all available models at once.                                |
-| `/models unload` | Unload all loaded models at once (Note: this only makes sense in router mode).             |
+| Command          | Description                                                                        |
+| ---------------- | ---------------------------------------------------------------------------------- |
+| `/models`        | Browse your models with live status. Select a model to load, switch, or unload it. |
+| `/models info`   | Show detailed information for all available models at once.                        |
+| `/models unload` | Unload all loaded models at once.                                                  |
+> **Note:** When a llama.cpp server is slow to respond, it will be skipped at startup with a warning. Run `/models` to retry without timeout and see all models.
-> **Note:** When the llama.cpp server is unreachable, `/models` displays an error notification with the configured server URL.
+> **Note:** When a llama.cpp server is unreachable, `/models` displays an error notification with the configured server URL, but healthy servers continue to show their models.
+> **Note:** The `/models unload` command only makes sense in router mode.
 ### Model Actions
@@ -126,7 +158,37 @@ When browsing models via the `/models` command, you can:
 - **Info** — View model details (ID, capabilities, context size)
 - **Cancel** — Cancel the current operation
-> **Note:** In single-model mode, only **Info** and **Cancel** are available, since there is only one model loaded on the server.
+> **Note:** In single-model and legacy-model modes, **Unload** is not available, since there is only one model on the server.
+### Thinking Budgets
+The extension supports configurable **thinking budgets** that control how many tokens the model allocates to its reasoning/thinking process.
+This is tied to Pi's thinking level selector (off, minimal, low, medium, high, xhigh).
+| Level     | Tokens | Description                  |
+| --------- | ------ | ---------------------------- |
+| `off`     | 0      | Thinking disabled            |
+| `minimal` | 1,024  | Short reasoning steps        |
+| `low`     | 2,048  | Light reasoning              |
+| `medium`  | 8,192  | Balanced reasoning (default) |
+| `high`    | 16,384 | Extended reasoning           |
+| `xhigh`   | -1     | Unlimited reasoning          |
+User-defined budgets can override the defaults by adding a `thinkingBudgets` object to `~/.pi/agent/settings.json` (global) or `.pi/settings.json` (per-project):
+```json
+{
+  "thinkingBudgets": {
+    "minimal": 256,
+    "low": 1024,
+    "medium": 2048,
+    "high": 4096
+  }
+}
+```
+Only `minimal`, `low`, `medium`, and `high` are configurable — `off` (0) and `xhigh` (-1, unlimited) are fixed.
+The extension automatically injects the appropriate `thinking_budget_tokens` into each request payload based on the selected level.
 ### Model Selection Event
@@ -134,9 +196,12 @@ When you switch models via Pi's model picker (instead of using the `/models` com
 This keeps the server in sync with the active model in Pi, regardless of how the switch was initiated — you don't need to manually load models before using them.
+> **Note:** If you switch sessions while a model load is in-flight, you'll see a warning, but the load continues in the background. Use `/models` in the new session to verify the model status.
 ### Loading Models
 When you trigger a load, switch, or retry action, the extension polls the server to track progress. If a model takes longer than **60 seconds** to load, the polling times out with an error.
 > **Note:** The timeout is only for the polling. The model might still be loading.
 ### Model Configuration
@@ -149,6 +214,7 @@ Each model exposed to Pi includes the following defaults:
 ## Dependencies
-| Dependency                        | Purpose                               |
-| --------------------------------- | ------------------------------------- |
-| `@earendil-works/pi-coding-agent` | Pi Coding Agent SDK (peer dependency) |
+| Peer dependency                   | Purpose             |
+| --------------------------------- | ------------------- |
+| `@earendil-works/pi-coding-agent` | Pi Coding Agent SDK |
+| `@earendil-works/pi-tui`          | Pi TUI SDK          |

package/package.json CHANGED Viewed

@@ -1,7 +1,7 @@
 {
   "name": "pi-llama-cpp",
-  "version": "0.5.1",
-  "description": "Pi extension for llama.cpp integration. Supports both router and single modes.",
+  "version": "0.7.0",
+  "description": "Pi extension for llama.cpp integration. Supports router, single and legacy models. Supports multiple servers.",
   "keywords": [
     "pi",
     "pi-package",
@@ -32,11 +32,12 @@
     ]
   },
   "peerDependencies": {
-    "@earendil-works/pi-coding-agent": "*"
+    "@earendil-works/pi-coding-agent": "*",
+    "@earendil-works/pi-tui": "*"
   },
   "devDependencies": {
-    "@types/node": "^25.9.1",
+    "@types/node": "^25.9.3",
     "prettier-plugin-organize-imports": "^4.3.0",
-    "vitest": "^4.1.7"
+    "vitest": "^4.1.8"
   }
 }

package/src/constants.ts CHANGED Viewed

@@ -1,7 +1,7 @@
 /**
- * This provider's id
+ * This provider's base ID
  */
-export const PROVIDER_ID = "llama-server";
+export const PROVIDER_PREFIX = "llama-server";
 /**
  * This provider's name
@@ -9,15 +9,20 @@ export const PROVIDER_ID = "llama-server";
 export const PROVIDER_NAME = "Llama.cpp";
 /**
- * The default URL if the resolver couldn't find it
+ * The default API type used in Pi
  */
-export const DEFAULT_LLAMA_SERVER_URL = "http://127.0.0.1:8080";
+export const API_TYPE = "openai-completions";
 /**
  * The placeholder api-key if it couldn't be resolved
  */
 export const API_KEY_PLACEHOLDER = "sk-placeholder";
+/**
+ * The default URL if the resolver couldn't find it
+ */
+export const DEFAULT_LLAMA_SERVER_URL = "http://127.0.0.1:8080";
 /**
  * The default context if the server didn't expose it
  */
@@ -34,6 +39,23 @@ export const POLLING_INTERVAL = 500;
 export const POLLING_TIMEOUT = 60000;
 /**
- * Reasonable time to read notifications if context goes stale
+ * Reasonable time (ms) to read notifications if context goes stale
  */
 export const READABLE_TIMEOUT = 15000;
+/**
+ * Timeout (ms) for server verification before assuming failure
+ */
+export const SERVER_TIMEOUT = 1000;
+/**
+ * Thinking budgets to send to the server, depending on user-selected level in Pi.
+ */
+export const DEFAULT_THINKING_BUDGETS = {
+  off: 0,
+  minimal: 1024,
+  low: 2048,
+  medium: 8192,
+  high: 16384,
+  xhigh: -1,
+};

package/src/enums/action.ts CHANGED Viewed

@@ -1,9 +1,10 @@
 /** The possible actions for the /models command */
 export enum Action {
+  LOAD_AND_SWITCH = "Load & switch",
   SWITCH = "Switch model",
-  RETRY = "Retry",
-  LOAD = "Load & switch",
+  LOAD = "Load only",
   UNLOAD = "Unload",
+  RETRY = "Retry",
   INFO = "Info",
   CANCEL = "Cancel",
 }

package/src/enums/mode.ts CHANGED Viewed

@@ -2,4 +2,5 @@
 export enum Mode {
   ROUTER = "router",
   SINGLE = "single",
+  LEGACY = "legacy",
 }

package/src/enums/serverStatus.ts ADDED Viewed

@@ -0,0 +1,6 @@
+/** The possible states of a llama.cpp server */
+export enum ServerStatus {
+  READY = "ready",
+  TIMEOUT = "timeout",
+  UNREACHABLE = "unreachable",
+}

package/src/enums/status.ts CHANGED Viewed

@@ -5,4 +5,5 @@ export enum Status {
   FAILED = "failed",
   SLEEPING = "sleeping",
   UNLOADED = "unloaded",
+  UNAUTHORIZED = "unauthorized",
 }

package/src/index.ts CHANGED Viewed

@@ -1,42 +1,64 @@
-import type {
-  ExtensionAPI,
-  ExtensionCommandContext,
+import {
+  type BeforeProviderRequestEvent,
+  type ExtensionAPI,
+  type ExtensionCommandContext,
+  type ExtensionContext,
+  type SessionBeforeSwitchEvent,
+  type SessionStartEvent,
 } from "@earendil-works/pi-coding-agent";
-import type { AutocompleteItem } from "@earendil-works/pi-tui";
-import { onSessionBeforeSwitch } from "./commands/models";
 import { PROVIDER_NAME } from "./constants";
-import { onModelSelect } from "./events";
-import { CommandManager } from "./manager";
+import { ModelSelectEvent } from "./interfaces/events";
+import { CommandManager } from "./managers/command";
+import { EventManager } from "./managers/events";
+import { ServerManager } from "./managers/server";
+import { ConfigResolver } from "./resolver";
+import { Server } from "./server";
 export default async function (pi: ExtensionAPI) {
-  const manager = new CommandManager(pi);
-  await manager.initialize();
+  const resolver = new ConfigResolver();
+  const urls = await resolver.resolveUrls();
+  const servers = urls.map((url) => new Server(url));
-  // Command: /models
+  const eventManager = new EventManager(servers);
+  const serverManager = new ServerManager(servers);
+  const commandManager = new CommandManager(serverManager);
+  // Register providers once at startup
+  await serverManager.initialize(pi);
+  // Single global /models command
   pi.registerCommand("models", {
     description: `Browse ${PROVIDER_NAME} models`,
-    getArgumentCompletions: (prefix: string): AutocompleteItem[] | null => {
-      const available = [
-        {
-          value: "info",
-          label: "info",
-          description: "Show information of all models",
-        },
-        {
-          value: "unload",
-          label: "unload",
-          description: "Unload all models",
-        },
-      ];
-      const filtered = available.filter((a) => a.value.startsWith(prefix));
-      return filtered.length > 0 ? filtered : null;
+    getArgumentCompletions: commandManager.getArgumentCompletions,
+    handler: async (args: string, ctx: ExtensionCommandContext) => {
+      await commandManager.handleCommand(args, ctx, pi);
     },
-    handler: async (args: string, ctx: ExtensionCommandContext) =>
-      await manager.run(args, ctx),
   });
-  // Events registration
-  pi.on("model_select", onModelSelect);
-  pi.on("session_before_switch", onSessionBeforeSwitch);
+  // Events
+  pi.on("session_start", (event: SessionStartEvent, ctx: ExtensionContext) => {
+    if (event.reason !== "startup") return;
+    for (const warning of serverManager.getWarnings())
+      ctx.ui.notify(warning, "warning");
+    for (const warning of resolver.getWarnings())
+      ctx.ui.notify(warning, "warning");
+  });
+  pi.on(
+    "before_provider_request",
+    async (event: BeforeProviderRequestEvent) =>
+      await eventManager.onBeforeProviderRequest(event),
+  );
+  pi.on(
+    "model_select",
+    async (event: ModelSelectEvent, ctx: ExtensionContext) =>
+      await eventManager.onModelSelect(event, ctx),
+  );
+  pi.on(
+    "session_before_switch",
+    async (_: SessionBeforeSwitchEvent, ctx: ExtensionContext) =>
+      await eventManager.onSessionBeforeSwitch(ctx),
+  );
 }

package/src/interfaces/auth.ts CHANGED Viewed

@@ -1,10 +1,6 @@
-import { PROVIDER_ID } from "../constants";
 interface Auth {
   type: string;
   key: string;
 }
-export interface AuthFile {
-  [PROVIDER_ID]: Auth;
-}
+export type AuthFile = Record<string, Auth>;

package/src/interfaces/endpoints/props.ts CHANGED Viewed

@@ -2,6 +2,7 @@
  * The structure of llama-server's /props endpoint
  */
 export interface PropsEndpoint {
+  role?: "router";
   error?: PropsError;
   default_generation_settings: Record<string, any>;
   total_slots: number;

package/src/interfaces/levels.ts ADDED Viewed

@@ -0,0 +1,7 @@
+export type ThinkingLevel =
+  | "off"
+  | "minimal"
+  | "low"
+  | "medium"
+  | "high"
+  | "xhigh";