pi-llama-cpp 0.3.1 → 0.3.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/commands/models.ts +6 -1
- package/src/interfaces/endpoints/props.ts +7 -4
- package/src/models/baseModel.ts +32 -5
- package/src/models/routerModel.ts +37 -5
- package/src/models/singleModel.ts +11 -7
- package/tests/routerModel.test.ts +59 -9
- package/tests/singleModel.test.ts +8 -31
package/package.json
CHANGED
package/src/commands/models.ts
CHANGED
|
@@ -38,7 +38,12 @@ const getActionsForModel = async (model: BaseModel): Promise<Array<Action>> => {
|
|
|
38
38
|
[Status.LOADED]: [Action.SWITCH, Action.UNLOAD, Action.INFO, Action.CANCEL],
|
|
39
39
|
[Status.LOADING]: [Action.INFO, Action.CANCEL],
|
|
40
40
|
[Status.FAILED]: [Action.RETRY, Action.CANCEL],
|
|
41
|
-
[Status.SLEEPING]: [
|
|
41
|
+
[Status.SLEEPING]: [
|
|
42
|
+
Action.SWITCH,
|
|
43
|
+
Action.UNLOAD,
|
|
44
|
+
Action.INFO,
|
|
45
|
+
Action.CANCEL,
|
|
46
|
+
],
|
|
42
47
|
[Status.UNLOADED]: [Action.LOAD, Action.CANCEL],
|
|
43
48
|
};
|
|
44
49
|
|
|
@@ -1,11 +1,8 @@
|
|
|
1
|
-
|
|
2
1
|
/**
|
|
3
2
|
* The structure of llama-server's /props endpoint
|
|
4
|
-
*
|
|
5
|
-
* In single mode, applies to /props
|
|
6
|
-
* In router mode, applies to /props?model=<id>
|
|
7
3
|
*/
|
|
8
4
|
export interface PropsEndpoint {
|
|
5
|
+
error?: PropsError;
|
|
9
6
|
default_generation_settings: Record<string, any>;
|
|
10
7
|
total_slots: number;
|
|
11
8
|
model_alias: string;
|
|
@@ -27,3 +24,9 @@ export interface PropsEndpoint {
|
|
|
27
24
|
build_info: string;
|
|
28
25
|
is_sleeping: boolean;
|
|
29
26
|
}
|
|
27
|
+
|
|
28
|
+
export interface PropsError {
|
|
29
|
+
code: number;
|
|
30
|
+
message: string;
|
|
31
|
+
type: string;
|
|
32
|
+
}
|
package/src/models/baseModel.ts
CHANGED
|
@@ -69,11 +69,31 @@ export abstract class BaseModel {
|
|
|
69
69
|
|
|
70
70
|
/**
|
|
71
71
|
* Gets the load status of the model
|
|
72
|
+
*
|
|
73
|
+
* @returns The current status
|
|
72
74
|
*/
|
|
73
|
-
|
|
75
|
+
public async getStatus(): Promise<Status> {
|
|
76
|
+
try {
|
|
77
|
+
const { is_sleeping, error } = await rpc<PropsEndpoint>(
|
|
78
|
+
`/props?model=${this.id}`,
|
|
79
|
+
);
|
|
80
|
+
|
|
81
|
+
if (is_sleeping) return Status.SLEEPING;
|
|
82
|
+
if (!error) return Status.LOADED;
|
|
83
|
+
if (error.code === 503) return Status.LOADING;
|
|
84
|
+
if (error.code === 400 && error.message === "model is not loaded")
|
|
85
|
+
return Status.UNLOADED;
|
|
86
|
+
|
|
87
|
+
return Status.FAILED;
|
|
88
|
+
} catch (err) {
|
|
89
|
+
return Status.FAILED;
|
|
90
|
+
}
|
|
91
|
+
}
|
|
74
92
|
|
|
75
93
|
/**
|
|
76
94
|
* Gets the context size of a particular model
|
|
95
|
+
*
|
|
96
|
+
* @returns The detected context size
|
|
77
97
|
*/
|
|
78
98
|
async getContextSize(): Promise<number> {
|
|
79
99
|
try {
|
|
@@ -116,6 +136,7 @@ export abstract class BaseModel {
|
|
|
116
136
|
|
|
117
137
|
/**
|
|
118
138
|
* Converts the llama-server model into a configuration object used by Pi
|
|
139
|
+
*
|
|
119
140
|
* @returns A Pi configuration object
|
|
120
141
|
*/
|
|
121
142
|
async toProviderConfig(): Promise<ProviderModelConfig> {
|
|
@@ -153,15 +174,21 @@ export abstract class BaseModel {
|
|
|
153
174
|
* Polls llama-server to check when the model is loaded
|
|
154
175
|
*
|
|
155
176
|
* @param startTime The initial polling timestamp
|
|
177
|
+
* @param timeout The maximum amount of ms before timeout. Defaults to POLLING_TIMEOUT
|
|
178
|
+
* @param interval The polling interval. Defaults to POLLING_INTERVAL
|
|
156
179
|
*/
|
|
157
|
-
async pollStatus(
|
|
180
|
+
async pollStatus(
|
|
181
|
+
startTime: number = Date.now(),
|
|
182
|
+
timeout: number = POLLING_TIMEOUT,
|
|
183
|
+
interval: number = POLLING_INTERVAL,
|
|
184
|
+
): Promise<void> {
|
|
158
185
|
while ((await this.getStatus()) === Status.LOADING) {
|
|
159
186
|
// Force a timeout if we wasted too much time polling
|
|
160
|
-
if (Date.now() - startTime >
|
|
161
|
-
const message = `Model loading timed out after ${
|
|
187
|
+
if (Date.now() - startTime > timeout) {
|
|
188
|
+
const message = `Model loading timed out after ${timeout} ms: ${this.id}`;
|
|
162
189
|
throw new Error(message);
|
|
163
190
|
}
|
|
164
|
-
await new Promise((r) => setTimeout(r,
|
|
191
|
+
await new Promise((r) => setTimeout(r, interval));
|
|
165
192
|
}
|
|
166
193
|
}
|
|
167
194
|
}
|
|
@@ -1,7 +1,8 @@
|
|
|
1
|
-
import { DEFAULT_CTX } from "../constants";
|
|
1
|
+
import { DEFAULT_CTX, POLLING_INTERVAL, POLLING_TIMEOUT } from "../constants";
|
|
2
2
|
import { Mode } from "../enums/mode";
|
|
3
3
|
import { Status } from "../enums/status";
|
|
4
4
|
import { ModelsEndpoint } from "../interfaces/endpoints/models";
|
|
5
|
+
import { PropsEndpoint } from "../interfaces/endpoints/props";
|
|
5
6
|
import { rpc } from "../tools/retriever";
|
|
6
7
|
import { BaseModel } from "./baseModel";
|
|
7
8
|
|
|
@@ -21,15 +22,46 @@ export class RouterModel extends BaseModel {
|
|
|
21
22
|
if (!model) return Status.FAILED;
|
|
22
23
|
|
|
23
24
|
const status = this.statusMapper[model.status!.value];
|
|
24
|
-
if (status === Status.UNLOADED) {
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
return Status.UNLOADED;
|
|
25
|
+
if (status === Status.UNLOADED || status === Status.LOADING) {
|
|
26
|
+
return super.getStatus();
|
|
28
27
|
}
|
|
29
28
|
|
|
30
29
|
return status;
|
|
31
30
|
}
|
|
32
31
|
|
|
32
|
+
/**
|
|
33
|
+
* Workaround for the currently-bugged /models status detection
|
|
34
|
+
* (I suspect it was introduced in PR #22683 of llama.cpp)
|
|
35
|
+
*
|
|
36
|
+
* When a model is loaded for the very first time,
|
|
37
|
+
* this workaround will try to poll to /props instead of /models
|
|
38
|
+
* for up to 5 seconds to try to detect if the model is really loading,
|
|
39
|
+
* or if it definitely failed.
|
|
40
|
+
*
|
|
41
|
+
* The tradeoff is that we'll have to wait for 5 seconds
|
|
42
|
+
* while the model is "loading", while not really loading.
|
|
43
|
+
*
|
|
44
|
+
* In exchange, it will allow unloaded models to be correctly shown as "unloaded".
|
|
45
|
+
*/
|
|
46
|
+
async pollStatus(startTime = Date.now()): Promise<void> {
|
|
47
|
+
let elapsed = 0;
|
|
48
|
+
const limit = 5000;
|
|
49
|
+
|
|
50
|
+
// Grab the glitch
|
|
51
|
+
while (Date.now() - startTime <= limit) {
|
|
52
|
+
try {
|
|
53
|
+
await rpc<PropsEndpoint>(`/props?model=${this.id}`);
|
|
54
|
+
break;
|
|
55
|
+
} catch {
|
|
56
|
+
elapsed += POLLING_INTERVAL;
|
|
57
|
+
await new Promise((r) => setTimeout(r, POLLING_INTERVAL));
|
|
58
|
+
}
|
|
59
|
+
}
|
|
60
|
+
|
|
61
|
+
const timeout = POLLING_TIMEOUT - elapsed;
|
|
62
|
+
return await super.pollStatus(startTime, timeout);
|
|
63
|
+
}
|
|
64
|
+
|
|
33
65
|
async getCapabilities(): Promise<["text"] | ["image"]> {
|
|
34
66
|
// We can get the real capabilities if the model is already loaded
|
|
35
67
|
if ((await this.getStatus()) === Status.LOADED) {
|
|
@@ -1,5 +1,5 @@
|
|
|
1
|
+
import { DEFAULT_CTX } from "../constants";
|
|
1
2
|
import { Mode } from "../enums/mode";
|
|
2
|
-
import { Status } from "../enums/status";
|
|
3
3
|
import { PropsEndpoint } from "../interfaces/endpoints/props";
|
|
4
4
|
import { rpc } from "../tools/retriever";
|
|
5
5
|
import { BaseModel } from "./baseModel";
|
|
@@ -9,11 +9,15 @@ export class SingleModel extends BaseModel {
|
|
|
9
9
|
return Mode.SINGLE;
|
|
10
10
|
}
|
|
11
11
|
|
|
12
|
-
async
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
12
|
+
async getContextSize(): Promise<number> {
|
|
13
|
+
try {
|
|
14
|
+
const { default_generation_settings } = await rpc<PropsEndpoint>(
|
|
15
|
+
`/props?model=${this.id}`,
|
|
16
|
+
);
|
|
17
|
+
const { n_ctx } = default_generation_settings;
|
|
18
|
+
return n_ctx;
|
|
19
|
+
} catch {
|
|
20
|
+
return DEFAULT_CTX;
|
|
21
|
+
}
|
|
18
22
|
}
|
|
19
23
|
}
|
|
@@ -115,7 +115,18 @@ describe("RouterModel context size extraction", () => {
|
|
|
115
115
|
data: [
|
|
116
116
|
{
|
|
117
117
|
id: "test-model",
|
|
118
|
-
status: {
|
|
118
|
+
status: {
|
|
119
|
+
value: "loaded",
|
|
120
|
+
args: [
|
|
121
|
+
"--model",
|
|
122
|
+
"gguf",
|
|
123
|
+
"--ctx-size",
|
|
124
|
+
"4096",
|
|
125
|
+
"--fit-ctx",
|
|
126
|
+
"8192",
|
|
127
|
+
],
|
|
128
|
+
preset: "default",
|
|
129
|
+
},
|
|
119
130
|
},
|
|
120
131
|
],
|
|
121
132
|
});
|
|
@@ -149,7 +160,11 @@ describe("RouterModel context size extraction", () => {
|
|
|
149
160
|
data: [
|
|
150
161
|
{
|
|
151
162
|
id: "test-model",
|
|
152
|
-
status: {
|
|
163
|
+
status: {
|
|
164
|
+
value: "loaded",
|
|
165
|
+
args: ["--model", "gguf"],
|
|
166
|
+
preset: "default",
|
|
167
|
+
},
|
|
153
168
|
},
|
|
154
169
|
],
|
|
155
170
|
});
|
|
@@ -186,7 +201,12 @@ describe("RouterModel capabilities detection", () => {
|
|
|
186
201
|
data: [
|
|
187
202
|
{
|
|
188
203
|
id: "test-model",
|
|
189
|
-
status: {
|
|
204
|
+
status: {
|
|
205
|
+
value: "loaded",
|
|
206
|
+
args: [],
|
|
207
|
+
preset: "default",
|
|
208
|
+
failed: false,
|
|
209
|
+
},
|
|
190
210
|
},
|
|
191
211
|
],
|
|
192
212
|
});
|
|
@@ -206,7 +226,12 @@ describe("RouterModel capabilities detection", () => {
|
|
|
206
226
|
data: [
|
|
207
227
|
{
|
|
208
228
|
id: "test-model",
|
|
209
|
-
status: {
|
|
229
|
+
status: {
|
|
230
|
+
value: "loaded",
|
|
231
|
+
args: [],
|
|
232
|
+
preset: "default",
|
|
233
|
+
failed: false,
|
|
234
|
+
},
|
|
210
235
|
},
|
|
211
236
|
],
|
|
212
237
|
});
|
|
@@ -225,7 +250,12 @@ describe("RouterModel capabilities detection", () => {
|
|
|
225
250
|
data: [
|
|
226
251
|
{
|
|
227
252
|
id: "test-model",
|
|
228
|
-
status: {
|
|
253
|
+
status: {
|
|
254
|
+
value: "loaded",
|
|
255
|
+
args: [],
|
|
256
|
+
preset: "default",
|
|
257
|
+
failed: false,
|
|
258
|
+
},
|
|
229
259
|
},
|
|
230
260
|
],
|
|
231
261
|
});
|
|
@@ -244,14 +274,24 @@ describe("RouterModel capabilities detection", () => {
|
|
|
244
274
|
data: [
|
|
245
275
|
{
|
|
246
276
|
id: "test-model",
|
|
247
|
-
status: {
|
|
277
|
+
status: {
|
|
278
|
+
value: "unloaded",
|
|
279
|
+
args: ["--model", "gguf", "--mmproj", "mmproj.gguf"],
|
|
280
|
+
preset: "default",
|
|
281
|
+
failed: false,
|
|
282
|
+
},
|
|
248
283
|
},
|
|
249
284
|
],
|
|
250
285
|
});
|
|
251
286
|
|
|
252
287
|
const model = new RouterModel(
|
|
253
288
|
createModel({
|
|
254
|
-
status: {
|
|
289
|
+
status: {
|
|
290
|
+
value: "unloaded",
|
|
291
|
+
args: ["--model", "gguf", "--mmproj", "mmproj.gguf"],
|
|
292
|
+
preset: "default",
|
|
293
|
+
failed: false,
|
|
294
|
+
},
|
|
255
295
|
}),
|
|
256
296
|
);
|
|
257
297
|
const capabilities = await model.getCapabilities();
|
|
@@ -265,14 +305,24 @@ describe("RouterModel capabilities detection", () => {
|
|
|
265
305
|
data: [
|
|
266
306
|
{
|
|
267
307
|
id: "test-model",
|
|
268
|
-
status: {
|
|
308
|
+
status: {
|
|
309
|
+
value: "unloaded",
|
|
310
|
+
args: ["--model", "gguf"],
|
|
311
|
+
preset: "default",
|
|
312
|
+
failed: false,
|
|
313
|
+
},
|
|
269
314
|
},
|
|
270
315
|
],
|
|
271
316
|
});
|
|
272
317
|
|
|
273
318
|
const model = new RouterModel(
|
|
274
319
|
createModel({
|
|
275
|
-
status: {
|
|
320
|
+
status: {
|
|
321
|
+
value: "unloaded",
|
|
322
|
+
args: ["--model", "gguf"],
|
|
323
|
+
preset: "default",
|
|
324
|
+
failed: false,
|
|
325
|
+
},
|
|
276
326
|
}),
|
|
277
327
|
);
|
|
278
328
|
const capabilities = await model.getCapabilities();
|
|
@@ -18,36 +18,13 @@ beforeEach(() => {
|
|
|
18
18
|
});
|
|
19
19
|
|
|
20
20
|
const createModel = (extra: Partial<ModelProperty> = {}): SingleModel =>
|
|
21
|
-
new SingleModel(
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
},
|
|
29
|
-
{
|
|
30
|
-
name: "test",
|
|
31
|
-
model: "test.gguf",
|
|
32
|
-
modified_at: new Date().toISOString(),
|
|
33
|
-
size: "1B",
|
|
34
|
-
digest: "abc123",
|
|
35
|
-
type: "model",
|
|
36
|
-
description: "test",
|
|
37
|
-
tags: [],
|
|
38
|
-
capabilities: [],
|
|
39
|
-
parameters: "",
|
|
40
|
-
details: {
|
|
41
|
-
parent_model: "",
|
|
42
|
-
format: "",
|
|
43
|
-
family: "",
|
|
44
|
-
families: [],
|
|
45
|
-
parameter_size: "",
|
|
46
|
-
quantization_level: "",
|
|
47
|
-
},
|
|
48
|
-
...extra,
|
|
49
|
-
},
|
|
50
|
-
);
|
|
21
|
+
new SingleModel({
|
|
22
|
+
id: "test",
|
|
23
|
+
tags: [],
|
|
24
|
+
object: "model",
|
|
25
|
+
owned_by: "test",
|
|
26
|
+
created: Date.now(),
|
|
27
|
+
});
|
|
51
28
|
|
|
52
29
|
describe("SingleModel mode", () => {
|
|
53
30
|
it("should always return SINGLE mode", () => {
|
|
@@ -94,7 +71,7 @@ describe("SingleModel getStatus", () => {
|
|
|
94
71
|
const status = await model.getStatus();
|
|
95
72
|
|
|
96
73
|
expect(status).toBe(Status.LOADED);
|
|
97
|
-
expect(mockRpc).toHaveBeenCalledWith(
|
|
74
|
+
expect(mockRpc).toHaveBeenCalledWith(`/props?model=${model.id}`);
|
|
98
75
|
});
|
|
99
76
|
|
|
100
77
|
it("should return SLEEPING when is_sleeping is true", async () => {
|