pi-llama-cpp 0.5.1 → 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -1,6 +1,6 @@
1
1
  # pi-llama-cpp
2
2
 
3
- A [Pi Coding Agent](https://pi.dev/) extension that integrates with a running [llama.cpp server](https://github.com/ggml-org/llama.cpp) to provide live model browsing, loading, and switching directly from Pi.
3
+ A [Pi Coding Agent](https://pi.dev/) extension that integrates with running [llama.cpp servers](https://github.com/ggml-org/llama.cpp) to provide live model browsing, loading, and switching directly from Pi.
4
4
 
5
5
  ## Features
6
6
 
@@ -10,20 +10,26 @@ A [Pi Coding Agent](https://pi.dev/) extension that integrates with a running [l
10
10
  - **Multi-model router support** — works with both single-model and multi-model llama.cpp server configurations
11
11
  - **Image capabilities detection** — detects multimodal models automatically
12
12
  - **Flexible URL resolution** — configures the server URL via project config, environment variable, or global settings
13
+ - **Auth support** — lets you log in to a llama.cpp server that is secured with an API key
14
+ - **Multiple server support** — connect to multiple llama.cpp servers simultaneously by separating URLs with semicolons
15
+ - **Thinking budget support** — configurable token budgets for model reasoning/thinking, mapped to Pi's thinking levels
13
16
 
14
17
  ### Status Indicators
15
18
 
16
- | Icon | Status | Description |
17
- |------|--------|-------------|
18
- | 🟢 | Loaded | Model is active and ready to use |
19
- | 🟡 | Loading | Model is currently being loaded |
20
- | 🔴 | Failed | Model failed to load |
21
- | 🔵 | Sleeping | Model is available, but inactive |
22
- | ⚪ | Unloaded | Model is not loaded on the server |
19
+ | Icon | Status | Description |
20
+ | ---- | ------------ | -------------------------------------- |
21
+ | 🟢 | Loaded | Model is active and ready to use |
22
+ | 🟡 | Loading | Model is currently being loaded |
23
+ | 🔴 | Failed | Model failed to load |
24
+ | 🔵 | Sleeping | Model is available, but inactive |
25
+ | ⚪ | Unloaded | Model is not loaded on the server |
26
+ | ⛔ | Unauthorized | Model can't be used (API key required) |
23
27
 
24
28
  > **Note**: The `Sleeping` status only shows when you start your server with `llama-server --sleep-idle-seconds <n> ...`.
25
- This is a **llama.cpp server flag** that tells the server to put idle models to sleep after `n` seconds.
26
- The model awakens automatically when you send a message.
29
+ > This is a **llama.cpp server flag** that tells the server to put idle models to sleep after `n` seconds.
30
+ > The model awakens automatically when you send a message.
31
+
32
+ > **Note:** You can enable API authentication on your server by starting it with `llama-server --api-key <your key> ...`.
27
33
 
28
34
  ## Installation
29
35
 
@@ -41,13 +47,13 @@ pi install https://github.com/gsanhueza/pi-llama-cpp
41
47
 
42
48
  ## Configuration
43
49
 
44
- The extension resolves the llama.cpp server URL using the following priority order:
50
+ The extension resolves the llama.cpp server URL(s) using the following priority order:
45
51
 
46
- 1. **Per-project config** — `.pi/llama-server.json` in your project root:
52
+ 1. **Per-project config** — `.pi/settings.json` in your project root:
47
53
 
48
54
  ```json
49
55
  {
50
- "url": "http://127.0.0.1:8080"
56
+ "llamaServerUrl": "http://127.0.0.1:8080"
51
57
  }
52
58
  ```
53
59
 
@@ -63,19 +69,33 @@ The extension resolves the llama.cpp server URL using the following priority ord
63
69
 
64
70
  4. **Default** — `http://127.0.0.1:8080`
65
71
 
66
- ### API Key
72
+ ### Multiple Servers
73
+
74
+ To connect to multiple llama.cpp servers simultaneously, add your URLs as a single string **separated with semicolons** in any of the examples above:
67
75
 
68
- If your llama.cpp server requires authentication, use `/login` in Pi, select the "API key" option, and choose the `Llama.cpp` provider from the list.
76
+ ```bash
77
+ # Example for env, but you can use any of the other methods
78
+ LLAMA_SERVER_URL="http://127.0.0.1:8080;http://127.0.0.1:8081;http://10.0.0.5:8080"
79
+ ```
69
80
 
70
- Alternatively, configure the API key in `~/.pi/agent/auth.json` using the provider ID `llama-server`:
81
+ Each server gets its own provider (e.g., **Llama.cpp (http://127.0.0.1:8080)**) and its own set of models. The `/models` command lists all models from all servers, labeled with their server URL.
82
+
83
+ ### API Key
71
84
 
72
- > **Note**: The provider is displayed as **Llama.cpp** in the Pi UI, but its internal identifier is `llama-server` — use this ID when configuring `auth.json` or other programmatic access.
85
+ If your llama.cpp server requires authentication, use `/login` in Pi, select the "API key" option, and choose the provider from the list that corresponds to the server needing the API key.
86
+
87
+ Alternatively, configure the API key in `~/.pi/agent/auth.json`:
88
+ Use the provider ID `llama-server=<url>`:
73
89
 
74
90
  ```json
75
91
  {
76
- "llama-server": {
92
+ "llama-server=http://127.0.0.1:8080": {
93
+ "type": "api_key",
94
+ "key": "<key-for-server-1>"
95
+ },
96
+ "llama-server=https://some-url-for-llama-cpp": {
77
97
  "type": "api_key",
78
- "key": "<your-api-key-here>"
98
+ "key": "<key-for-server-2>"
79
99
  }
80
100
  }
81
101
  ```
@@ -98,22 +118,34 @@ llama-server --models-preset path/to/presets.ini ...
98
118
  llama-server --model path/to/model.gguf ...
99
119
  ```
100
120
 
121
+ - For legacy-model mode (e.g., [ik_llama.cpp](https://github.com/ikawrakow/ik_llama.cpp)), the extension auto-detects and handles it transparently.
122
+
123
+ > **Note:** This extension is focused on llama.cpp, not on ik_llama.cpp. Nonetheless, since I found a way to make it work with this extension, I added the option.
124
+
125
+ > **Note:** The ik_llama.cpp fork is not legacy at all, but it uses an old way of describing models compared to llama.cpp.
126
+
101
127
  The extension determines the context size as follows:
128
+
102
129
  - **Router mode**
103
130
  - When loaded, reads `meta.n_ctx` from the `/models` endpoint
104
- - When not loaded, reads `--ctx-size` and/or `--fit-ctx` from the server arguments, or `ctx-size` and/or `fit-ctx` keys from the **presets.ini** file.
131
+ - When not loaded, reads `--ctx-size` and/or `--fit-ctx` from the server arguments (which can also originate from the **presets.ini** file the llama.cpp server uses to load its models).
105
132
  - **Single mode** — reads `meta.n_ctx` from the `/models` endpoint
133
+ - **Legacy mode** — reads `max_model_len` from `/models`, falling back to `n_ctx` from `/props`
106
134
  - Falls back to `128000` if not available
107
135
 
108
136
  ### Commands
109
137
 
110
- | Command | Description |
111
- | ---------------- | ------------------------------------------------------------------------------------------ |
112
- | `/models` | Browse your models with live status. Select a model to load, switch, or unload it. |
113
- | `/models info` | Show detailed information for all available models at once. |
114
- | `/models unload` | Unload all loaded models at once (Note: this only makes sense in router mode). |
138
+ | Command | Description |
139
+ | ---------------- | ---------------------------------------------------------------------------------- |
140
+ | `/models` | Browse your models with live status. Select a model to load, switch, or unload it. |
141
+ | `/models info` | Show detailed information for all available models at once. |
142
+ | `/models unload` | Unload all loaded models at once. |
143
+
144
+ > **Note:** When a llama.cpp server is slow to respond, it will be skipped at startup with a warning. Run `/models` to retry without a timeout and see all models.
115
145
 
116
- > **Note:** When the llama.cpp server is unreachable, `/models` displays an error notification with the configured server URL.
146
+ > **Note:** When a llama.cpp server is unreachable, `/models` displays an error notification with the configured server URL, but healthy servers continue to show their models.
147
+
148
+ > **Note:** The `/models unload` command only makes sense in router mode.
117
149
 
118
150
  ### Model Actions
119
151
 
@@ -126,7 +158,37 @@ When browsing models via the `/models` command, you can:
126
158
  - **Info** — View model details (ID, capabilities, context size)
127
159
  - **Cancel** — Cancel the current operation
128
160
 
129
- > **Note:** In single-model mode, only **Info** and **Cancel** are available, since there is only one model loaded on the server.
161
+ > **Note:** In single-model and legacy-model mode, **Unload** is not available, since there is only one model on the server.
162
+
163
+ ### Thinking Budgets
164
+
165
+ The extension supports configurable **thinking budgets** that control how many tokens the model allocates to its reasoning/thinking process.
166
+ This is tied to Pi's thinking level selector (off, minimal, low, medium, high, xhigh).
167
+
168
+ | Level | Tokens | Description |
169
+ | --------- | ------ | ---------------------------- |
170
+ | `off` | 0 | Thinking disabled |
171
+ | `minimal` | 1,024 | Short reasoning steps |
172
+ | `low` | 2,048 | Light reasoning |
173
+ | `medium` | 8,192 | Balanced reasoning (default) |
174
+ | `high` | 16,384 | Extended reasoning |
175
+ | `xhigh` | -1 | Unlimited reasoning |
176
+
177
+ User-defined budgets can override the defaults by adding a `thinkingBudgets` object to `~/.pi/agent/settings.json` (global) or `.pi/settings.json` (per-project):
178
+
179
+ ```json
180
+ {
181
+ "thinkingBudgets": {
182
+ "minimal": 256,
183
+ "low": 1024,
184
+ "medium": 2048,
185
+ "high": 4096
186
+ }
187
+ }
188
+ ```
189
+
190
+ Only `minimal`, `low`, `medium`, and `high` are configurable — `off` (0) and `xhigh` (-1, unlimited) are fixed.
191
+ The extension automatically injects the appropriate `thinking_budget_tokens` into each request payload based on the selected level.
130
192
 
131
193
  ### Model Selection Event
132
194
 
@@ -134,9 +196,12 @@ When you switch models via Pi's model picker (instead of using the `/models` com
134
196
 
135
197
  This keeps the server in sync with the active model in Pi, regardless of how the switch was initiated — you don't need to manually load models before using them.
136
198
 
199
+ > **Note:** If you switch sessions while a model load is in-flight, you'll see a warning, but the load continues in the background. Use `/models` in the new session to verify the model status.
200
+
137
201
  ### Loading Models
138
202
 
139
203
  When you trigger a load, switch, or retry action, the extension polls the server to track progress. If a model takes longer than **60 seconds** to load, the polling times out with an error.
204
+
140
205
  > **Note:** The timeout is only for the polling. The model might still be loading.
141
206
 
142
207
  ### Model Configuration
@@ -149,6 +214,7 @@ Each model exposed to Pi includes the following defaults:
149
214
 
150
215
  ## Dependencies
151
216
 
152
- | Dependency | Purpose |
153
- | --------------------------------- | ------------------------------------- |
154
- | `@earendil-works/pi-coding-agent` | Pi Coding Agent SDK (peer dependency) |
217
+ | Peer dependency | Purpose |
218
+ | --------------------------------- | ------------------- |
219
+ | `@earendil-works/pi-coding-agent` | Pi Coding Agent SDK |
220
+ | `@earendil-works/pi-tui` | Pi TUI SDK |
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "pi-llama-cpp",
3
- "version": "0.5.1",
4
- "description": "Pi extension for llama.cpp integration. Supports both router and single modes.",
3
+ "version": "0.7.0",
4
+ "description": "Pi extension for llama.cpp integration. Supports router, single and legacy models. Supports multiple servers.",
5
5
  "keywords": [
6
6
  "pi",
7
7
  "pi-package",
@@ -32,11 +32,12 @@
32
32
  ]
33
33
  },
34
34
  "peerDependencies": {
35
- "@earendil-works/pi-coding-agent": "*"
35
+ "@earendil-works/pi-coding-agent": "*",
36
+ "@earendil-works/pi-tui": "*"
36
37
  },
37
38
  "devDependencies": {
38
- "@types/node": "^25.9.1",
39
+ "@types/node": "^25.9.3",
39
40
  "prettier-plugin-organize-imports": "^4.3.0",
40
- "vitest": "^4.1.7"
41
+ "vitest": "^4.1.8"
41
42
  }
42
43
  }
package/src/constants.ts CHANGED
@@ -1,7 +1,7 @@
1
1
  /**
2
- * This provider's id
2
+ * This provider's base ID
3
3
  */
4
- export const PROVIDER_ID = "llama-server";
4
+ export const PROVIDER_PREFIX = "llama-server";
5
5
 
6
6
  /**
7
7
  * This provider's name
@@ -9,15 +9,20 @@ export const PROVIDER_ID = "llama-server";
9
9
  export const PROVIDER_NAME = "Llama.cpp";
10
10
 
11
11
  /**
12
- * The default URL if the resolver couldn't find it
12
+ * The default API type used in Pi
13
13
  */
14
- export const DEFAULT_LLAMA_SERVER_URL = "http://127.0.0.1:8080";
14
+ export const API_TYPE = "openai-completions";
15
15
 
16
16
  /**
17
17
  * The placeholder api-key if it couldn't be resolved
18
18
  */
19
19
  export const API_KEY_PLACEHOLDER = "sk-placeholder";
20
20
 
21
+ /**
22
+ * The default URL if the resolver couldn't find it
23
+ */
24
+ export const DEFAULT_LLAMA_SERVER_URL = "http://127.0.0.1:8080";
25
+
21
26
  /**
22
27
  * The default context if the server didn't expose it
23
28
  */
@@ -34,6 +39,23 @@ export const POLLING_INTERVAL = 500;
34
39
  export const POLLING_TIMEOUT = 60000;
35
40
 
36
41
  /**
37
- * Reasonable time to read notifications if context goes stale
42
+ * Reasonable time (ms) to read notifications if context goes stale
38
43
  */
39
44
  export const READABLE_TIMEOUT = 15000;
45
+
46
+ /**
47
+ * Timeout (ms) for server verification before assuming failure
48
+ */
49
+ export const SERVER_TIMEOUT = 1000;
50
+
51
+ /**
52
+ * Thinking budgets to send to the server, depending on user-selected level in Pi.
53
+ */
54
+ export const DEFAULT_THINKING_BUDGETS = {
55
+ off: 0,
56
+ minimal: 1024,
57
+ low: 2048,
58
+ medium: 8192,
59
+ high: 16384,
60
+ xhigh: -1,
61
+ };
@@ -1,9 +1,10 @@
1
1
  /** The possible actions for the /models command */
2
2
  export enum Action {
3
+ LOAD_AND_SWITCH = "Load & switch",
3
4
  SWITCH = "Switch model",
4
- RETRY = "Retry",
5
- LOAD = "Load & switch",
5
+ LOAD = "Load only",
6
6
  UNLOAD = "Unload",
7
+ RETRY = "Retry",
7
8
  INFO = "Info",
8
9
  CANCEL = "Cancel",
9
10
  }
package/src/enums/mode.ts CHANGED
@@ -2,4 +2,5 @@
2
2
  export enum Mode {
3
3
  ROUTER = "router",
4
4
  SINGLE = "single",
5
+ LEGACY = "legacy",
5
6
  }
@@ -0,0 +1,6 @@
1
+ /** The possible states of a llama.cpp server */
2
+ export enum ServerStatus {
3
+ READY = "ready",
4
+ TIMEOUT = "timeout",
5
+ UNREACHABLE = "unreachable",
6
+ }
@@ -5,4 +5,5 @@ export enum Status {
5
5
  FAILED = "failed",
6
6
  SLEEPING = "sleeping",
7
7
  UNLOADED = "unloaded",
8
+ UNAUTHORIZED = "unauthorized",
8
9
  }
package/src/index.ts CHANGED
@@ -1,42 +1,64 @@
1
- import type {
2
- ExtensionAPI,
3
- ExtensionCommandContext,
1
+ import {
2
+ type BeforeProviderRequestEvent,
3
+ type ExtensionAPI,
4
+ type ExtensionCommandContext,
5
+ type ExtensionContext,
6
+ type SessionBeforeSwitchEvent,
7
+ type SessionStartEvent,
4
8
  } from "@earendil-works/pi-coding-agent";
5
- import type { AutocompleteItem } from "@earendil-works/pi-tui";
6
- import { onSessionBeforeSwitch } from "./commands/models";
7
9
  import { PROVIDER_NAME } from "./constants";
8
- import { onModelSelect } from "./events";
9
- import { CommandManager } from "./manager";
10
+ import { ModelSelectEvent } from "./interfaces/events";
11
+ import { CommandManager } from "./managers/command";
12
+ import { EventManager } from "./managers/events";
13
+ import { ServerManager } from "./managers/server";
14
+ import { ConfigResolver } from "./resolver";
15
+ import { Server } from "./server";
10
16
 
11
17
  export default async function (pi: ExtensionAPI) {
12
- const manager = new CommandManager(pi);
13
- await manager.initialize();
18
+ const resolver = new ConfigResolver();
19
+ const urls = await resolver.resolveUrls();
20
+ const servers = urls.map((url) => new Server(url));
14
21
 
15
- // Command: /models
22
+ const eventManager = new EventManager(servers);
23
+ const serverManager = new ServerManager(servers);
24
+ const commandManager = new CommandManager(serverManager);
25
+
26
+ // Register providers once at startup
27
+ await serverManager.initialize(pi);
28
+
29
+ // Single global /models command
16
30
  pi.registerCommand("models", {
17
31
  description: `Browse ${PROVIDER_NAME} models`,
18
- getArgumentCompletions: (prefix: string): AutocompleteItem[] | null => {
19
- const available = [
20
- {
21
- value: "info",
22
- label: "info",
23
- description: "Show information of all models",
24
- },
25
- {
26
- value: "unload",
27
- label: "unload",
28
- description: "Unload all models",
29
- },
30
- ];
31
-
32
- const filtered = available.filter((a) => a.value.startsWith(prefix));
33
- return filtered.length > 0 ? filtered : null;
32
+ getArgumentCompletions: commandManager.getArgumentCompletions,
33
+ handler: async (args: string, ctx: ExtensionCommandContext) => {
34
+ await commandManager.handleCommand(args, ctx, pi);
34
35
  },
35
- handler: async (args: string, ctx: ExtensionCommandContext) =>
36
- await manager.run(args, ctx),
37
36
  });
38
37
 
39
- // Events registration
40
- pi.on("model_select", onModelSelect);
41
- pi.on("session_before_switch", onSessionBeforeSwitch);
38
+ // Events
39
+ pi.on("session_start", (event: SessionStartEvent, ctx: ExtensionContext) => {
40
+ if (event.reason !== "startup") return;
41
+ for (const warning of serverManager.getWarnings())
42
+ ctx.ui.notify(warning, "warning");
43
+
44
+ for (const warning of resolver.getWarnings())
45
+ ctx.ui.notify(warning, "warning");
46
+ });
47
+
48
+ pi.on(
49
+ "before_provider_request",
50
+ async (event: BeforeProviderRequestEvent) =>
51
+ await eventManager.onBeforeProviderRequest(event),
52
+ );
53
+
54
+ pi.on(
55
+ "model_select",
56
+ async (event: ModelSelectEvent, ctx: ExtensionContext) =>
57
+ await eventManager.onModelSelect(event, ctx),
58
+ );
59
+ pi.on(
60
+ "session_before_switch",
61
+ async (_: SessionBeforeSwitchEvent, ctx: ExtensionContext) =>
62
+ await eventManager.onSessionBeforeSwitch(ctx),
63
+ );
42
64
  }
@@ -1,10 +1,6 @@
1
- import { PROVIDER_ID } from "../constants";
2
-
3
1
  interface Auth {
4
2
  type: string;
5
3
  key: string;
6
4
  }
7
5
 
8
- export interface AuthFile {
9
- [PROVIDER_ID]: Auth;
10
- }
6
+ export type AuthFile = Record<string, Auth>;
@@ -2,6 +2,7 @@
2
2
  * The structure of llama-server's /props endpoint
3
3
  */
4
4
  export interface PropsEndpoint {
5
+ role?: "router";
5
6
  error?: PropsError;
6
7
  default_generation_settings: Record<string, any>;
7
8
  total_slots: number;
@@ -0,0 +1,7 @@
1
+ export type ThinkingLevel =
2
+ | "off"
3
+ | "minimal"
4
+ | "low"
5
+ | "medium"
6
+ | "high"
7
+ | "xhigh";