@fugood/llama.node 1.2.6 → 1.3.0-rc.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +1 -0
- package/lib/binding.ts +96 -1
- package/lib/index.js +4 -2
- package/lib/index.ts +4 -1
- package/lib/parallel.js +214 -0
- package/lib/parallel.ts +273 -0
- package/package.json +14 -14
- package/src/LlamaContext.cpp +34 -1
- package/src/LlamaContext.h +16 -0
- package/src/common.hpp +4 -3
- package/src/llama.cpp/common/arg.cpp +1 -1
- package/src/llama.cpp/ggml/include/ggml.h +44 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +10 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +16 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.cpp +32 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.h +4 -0
- package/src/llama.cpp/src/llama-arch.cpp +5 -0
- package/src/llama.cpp/src/llama-arch.h +1 -0
- package/src/llama.cpp/src/llama-model.cpp +3 -1
- package/src/llama.cpp/src/llama-quant.cpp +7 -1
- package/src/llama.cpp/src/llama.cpp +3 -0
package/CMakeLists.txt
CHANGED
package/lib/binding.ts
CHANGED
@@ -25,6 +25,12 @@ export type LlamaModelOptions = {
   n_ctx?: number
   n_batch?: number
   n_ubatch?: number
+  /**
+   * Number of parallel sequences to support (sets n_seq_max).
+   * This determines the maximum number of parallel slots that can be used.
+   * Default: 8
+   */
+  n_parallel?: number
   n_threads?: number
   n_gpu_layers?: number
   flash_attn_type?: 'auto' | 'on' | 'off'
@@ -157,6 +163,36 @@ export type LlamaCompletionOptions = {
   n_probs?: number
 }

+/**
+ * Parameters for parallel completion requests (queueCompletion).
+ * Extends LlamaCompletionOptions with parallel-mode specific options.
+ */
+export type LlamaParallelCompletionOptions = LlamaCompletionOptions & {
+  /**
+   * File path to load session state from before processing.
+   * This allows you to resume from a previously saved completion state.
+   * Use with `save_state_path` to enable conversation continuity across requests.
+   * Example: `'/path/to/session.bin'` or `'file:///path/to/session.bin'`
+   */
+  load_state_path?: string
+
+  /**
+   * File path to save session state to after completion.
+   * The session state will be saved to this file path when the completion finishes.
+   * You can then pass this path to `load_state_path` in a subsequent request to resume.
+   * Example: `'/path/to/session.bin'` or `'file:///path/to/session.bin'`
+   */
+  save_state_path?: string
+
+  /**
+   * Number of tokens to save when saving session state.
+   * If not specified or <= 0, all tokens will be saved.
+   * Use this to limit the size of saved session files.
+   * Example: `512` to save only the last 512 tokens
+   */
+  save_state_size?: number
+}
+
 export type TokenProbability = {
   tok_str: string
   prob: number
@@ -271,7 +307,7 @@ export type JinjaFormattedChatResult = {
   prompt: string
   chat_format: number
   grammar: string
-
+  grammar_lazy: boolean
   grammar_triggers: Array<{
     type: number
     value: string
@@ -404,6 +440,65 @@ export interface LlamaContext {
   */
  decodeAudioTokens(tokens: number[]|Int32Array): Promise<Float32Array>

+  // Parallel decoding methods
+
+  /**
+   * Enable parallel decoding mode
+   * @param params Configuration for parallel mode
+   * @returns boolean indicating if successful
+   */
+  enableParallelMode(params: { n_parallel?: number, n_batch?: number }): boolean
+
+  /**
+   * Disable parallel decoding mode
+   */
+  disableParallelMode(): void
+
+  /**
+   * Queue a completion request for parallel processing
+   * @param options Completion options with parallel-specific state management
+   * @param callback Optional token callback
+   * @returns Object with requestId
+   */
+  queueCompletion(
+    options: LlamaParallelCompletionOptions,
+    callback?: (error: any, result: any) => void,
+  ): { requestId: number }
+
+  /**
+   * Queue an embedding request for parallel processing
+   * @param text Text to embed
+   * @param params Optional embedding parameters
+   * @param callback Optional result callback
+   * @returns Object with requestId
+   */
+  queueEmbedding(
+    text: string,
+    params?: { embd_normalize?: number },
+    callback?: (error: any, result: any) => void,
+  ): { requestId: number }
+
+  /**
+   * Queue a rerank request for parallel processing
+   * @param query Query text
+   * @param documents Documents to rank
+   * @param params Optional rerank parameters
+   * @param callback Optional result callback
+   * @returns Object with requestId
+   */
+  queueRerank(
+    query: string,
+    documents: string[],
+    params?: RerankParams,
+    callback?: (error: any, result: any) => void,
+  ): { requestId: number }
+
+  /**
+   * Cancel a queued request
+   * @param requestId Request ID to cancel
+   */
+  cancelRequest(requestId: number): void
+
  // static
  loadModelInfo(path: string, skip: string[]): Promise<GGUFModelInfo>
  toggleNativeLog(
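The declarations above only define the native surface. As a minimal, non-authoritative sketch of how `queueCompletion` and the new `LlamaParallelCompletionOptions` state fields might be used from TypeScript (the `prompt` field, file paths, and the assumption that parallel mode has already been enabled are illustrative, not taken from this diff):

// Hedged sketch: queue one completion with the new state-persistence options.
// Assumes `ctx` is a LlamaContext created elsewhere and that parallel mode is enabled.
import type { LlamaContext, LlamaParallelCompletionOptions } from '@fugood/llama.node'

function queueWithState(ctx: LlamaContext): number {
  const options: LlamaParallelCompletionOptions = {
    prompt: 'Hello',                     // assumed LlamaCompletionOptions field
    load_state_path: '/tmp/session.bin', // resume from a previously saved state
    save_state_path: '/tmp/session.bin', // persist state when this request finishes
    save_state_size: 512,                // keep only the last 512 tokens in the file
  }
  const { requestId } = ctx.queueCompletion(options, (error, result) => {
    if (error) console.error(error)
    else if (result?.text !== undefined) console.log('final:', result.text)
  })
  return requestId
}

Pairing `save_state_path` on one request with `load_state_path` on a later one is what the documented options are intended to enable.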
package/lib/index.js
CHANGED
@@ -23,10 +23,12 @@ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, ge
 });
 };
 Object.defineProperty(exports, "__esModule", { value: true });
-exports.BuildInfo = exports.loadLlamaModelInfo = exports.initLlama = exports.loadModel = exports.toggleNativeLog = exports.MTMD_DEFAULT_MEDIA_MARKER = void 0;
+exports.BuildInfo = exports.loadLlamaModelInfo = exports.initLlama = exports.loadModel = exports.toggleNativeLog = exports.MTMD_DEFAULT_MEDIA_MARKER = exports.LlamaParallelAPI = void 0;
 exports.addNativeLogListener = addNativeLogListener;
 const binding_1 = require("./binding");
 const version_1 = require("./version");
+const parallel_1 = require("./parallel");
+Object.defineProperty(exports, "LlamaParallelAPI", { enumerable: true, get: function () { return parallel_1.LlamaParallelAPI; } });
 __exportStar(require("./binding"), exports);
 exports.MTMD_DEFAULT_MEDIA_MARKER = '<__media__>';
 const mods = {};
@@ -66,6 +68,7 @@ const getJsonSchema = (responseFormat) => {
 class LlamaContextWrapper {
     constructor(nativeCtx) {
         this.ctx = nativeCtx;
+        this.parallel = new parallel_1.LlamaParallelAPI(nativeCtx);
     }
     getSystemInfo() {
         return this.ctx.getSystemInfo();
@@ -138,7 +141,6 @@ class LlamaContextWrapper {
         let tmpl;
         if (template)
             tmpl = template; // Force replace if provided
-        const jsonSchema = getJsonSchema(params === null || params === void 0 ? void 0 : params.response_format);
         const result = this.ctx.getFormattedChat(chat, tmpl, {
             jinja: useJinja,
             response_format: params === null || params === void 0 ? void 0 : params.response_format,
package/lib/index.ts
CHANGED
@@ -18,8 +18,10 @@ import type {
   GGUFModelInfo,
 } from './binding'
 import { BUILD_NUMBER, BUILD_COMMIT } from './version'
+import { LlamaParallelAPI } from './parallel'

 export * from './binding'
+export { LlamaParallelAPI }

 export const MTMD_DEFAULT_MEDIA_MARKER = '<__media__>'

@@ -78,9 +80,11 @@ export type FormattedChatResult = {

 class LlamaContextWrapper {
   ctx: LlamaContext
+  parallel: LlamaParallelAPI

   constructor(nativeCtx: LlamaContext) {
     this.ctx = nativeCtx
+    this.parallel = new LlamaParallelAPI(nativeCtx)
   }

   getSystemInfo(): string {
@@ -181,7 +185,6 @@ class LlamaContextWrapper {
     const useJinja = this.isJinjaSupported() && params?.jinja
     let tmpl
     if (template) tmpl = template // Force replace if provided
-    const jsonSchema = getJsonSchema(params?.response_format)

     const result = this.ctx.getFormattedChat(chat!, tmpl, {
       jinja: useJinja,
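With the wrapper change above, a context returned by `loadModel` should also carry a `parallel` property bound to the native context. A minimal sketch, assuming the usual `model` path option and that `loadModel` resolves to this wrapper (option values are placeholders):

// Hedged sketch: reach the parallel API through the context wrapper.
import { loadModel } from '@fugood/llama.node'

async function main() {
  const ctx = await loadModel({
    model: '/path/to/model.gguf', // assumed model-path option name
    n_ctx: 4096,
    n_parallel: 4, // new option: maximum number of parallel sequences (n_seq_max)
  })
  // The wrapper constructs a LlamaParallelAPI around the same native context.
  await ctx.parallel.enable({ n_parallel: 4, n_batch: 512 })
  console.log('parallel mode enabled:', ctx.parallel.isEnabled())
}

main()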
package/lib/parallel.js
ADDED
@@ -0,0 +1,214 @@
+"use strict";
+var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
+    function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
+    return new (P || (P = Promise))(function (resolve, reject) {
+        function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
+        function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
+        function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
+        step((generator = generator.apply(thisArg, _arguments || [])).next());
+    });
+};
+Object.defineProperty(exports, "__esModule", { value: true });
+exports.LlamaParallelAPI = void 0;
+class LlamaParallelAPI {
+    constructor(context) {
+        this.enabled = false;
+        this.pendingRequests = new Map();
+        this.context = context;
+    }
+    /**
+     * Enable parallel decoding mode
+     * @param config Configuration for parallel mode
+     * @returns boolean indicating if successful
+     */
+    enable(config) {
+        return __awaiter(this, void 0, void 0, function* () {
+            const defaultConfig = { n_parallel: 2, n_batch: 512 };
+            const result = this.context.enableParallelMode(Object.assign(Object.assign({}, defaultConfig), config));
+            this.enabled = result;
+            return result;
+        });
+    }
+    /**
+     * Disable parallel decoding mode
+     */
+    disable() {
+        this.context.disableParallelMode();
+        this.enabled = false;
+    }
+    /**
+     * Configure parallel decoding mode (enables if not already enabled)
+     * @param config Configuration for parallel mode
+     * @returns boolean indicating if successful
+     */
+    configure(config) {
+        return __awaiter(this, void 0, void 0, function* () {
+            return this.enable(config);
+        });
+    }
+    /**
+     * Queue a completion request for parallel processing
+     * @param options Completion options
+     * @param onToken Optional callback for each token
+     * @returns Object with requestId, promise for result, and stop function
+     */
+    completion(options, onToken) {
+        return __awaiter(this, void 0, void 0, function* () {
+            if (!this.enabled) {
+                throw new Error('Parallel mode is not enabled. Call enable() first.');
+            }
+            const tokenCallback = onToken
+                ? (error, result) => {
+                    if (error) {
+                        console.error('Token callback error:', error);
+                        // Handle completion error
+                        const pendingReq = this.pendingRequests.get(result === null || result === void 0 ? void 0 : result.requestId);
+                        if (pendingReq) {
+                            pendingReq.reject(error);
+                            this.pendingRequests.delete(result === null || result === void 0 ? void 0 : result.requestId);
+                        }
+                        return;
+                    }
+                    // Check if this is a token callback or final result
+                    if (result) {
+                        if (result.token !== undefined) {
+                            // This is a token callback
+                            onToken(result.requestId, result);
+                        }
+                        else if (result.text !== undefined ||
+                            result.content !== undefined) {
+                            // This is the final result
+                            const pendingReq = this.pendingRequests.get(result.requestId);
+                            if (pendingReq) {
+                                pendingReq.resolve(result);
+                                this.pendingRequests.delete(result.requestId);
+                            }
+                        }
+                    }
+                }
+                : undefined;
+            // Queue the completion immediately (this is synchronous!)
+            const { requestId } = this.context.queueCompletion(options, tokenCallback ||
+                ((error, result) => {
+                    if (error) {
+                        const pendingReq = this.pendingRequests.get(result === null || result === void 0 ? void 0 : result.requestId);
+                        if (pendingReq) {
+                            pendingReq.reject(error);
+                            this.pendingRequests.delete(result === null || result === void 0 ? void 0 : result.requestId);
+                        }
+                    }
+                    else if (result &&
+                        (result.text !== undefined || result.content !== undefined)) {
+                        // Final result for non-streaming
+                        const pendingReq = this.pendingRequests.get(result.requestId);
+                        if (pendingReq) {
+                            pendingReq.resolve(result);
+                            this.pendingRequests.delete(result.requestId);
+                        }
+                    }
+                }));
+            // Create promise for final result
+            const promise = new Promise((resolveResult, rejectResult) => {
+                this.pendingRequests.set(requestId, {
+                    resolve: resolveResult,
+                    reject: rejectResult,
+                });
+            });
+            // Create stop function
+            const stop = () => {
+                this.context.cancelRequest(requestId);
+                const pendingReq = this.pendingRequests.get(requestId);
+                if (pendingReq) {
+                    pendingReq.reject(new Error('Request cancelled'));
+                    this.pendingRequests.delete(requestId);
+                }
+            };
+            // Return immediately without wrapping in a Promise
+            return {
+                requestId,
+                promise,
+                stop,
+            };
+        });
+    }
+    /**
+     * Queue an embedding request for parallel processing
+     * @param text Text to embed
+     * @param params Optional embedding parameters
+     * @returns Object with requestId and promise for result
+     */
+    embedding(text, params) {
+        return __awaiter(this, void 0, void 0, function* () {
+            if (!this.enabled) {
+                throw new Error('Parallel mode is not enabled. Call enable() first.');
+            }
+            // Create promise for result
+            let resolveResult;
+            let rejectResult;
+            const promise = new Promise((res, rej) => {
+                resolveResult = res;
+                rejectResult = rej;
+            });
+            // Queue the embedding immediately (this is synchronous!)
+            const { requestId } = this.context.queueEmbedding(text, params, (error, result) => {
+                if (error) {
+                    rejectResult(error);
+                }
+                else {
+                    resolveResult(result);
+                }
+            });
+            // Return immediately without wrapping in a Promise
+            return {
+                requestId,
+                promise,
+            };
+        });
+    }
+    /**
+     * Queue a rerank request for parallel processing
+     * @param query Query text
+     * @param documents Documents to rank
+     * @param params Optional rerank parameters
+     * @returns Object with requestId and promise for results
+     */
+    rerank(query, documents, params) {
+        return __awaiter(this, void 0, void 0, function* () {
+            if (!this.enabled) {
+                throw new Error('Parallel mode is not enabled. Call enable() first.');
+            }
+            // Create promise for result
+            let resolveResult;
+            let rejectResult;
+            const promise = new Promise((res, rej) => {
+                resolveResult = res;
+                rejectResult = rej;
+            });
+            // Queue the rerank immediately (this is synchronous!)
+            const { requestId } = this.context.queueRerank(query, documents, params, (error, result) => {
+                if (error) {
+                    rejectResult(error);
+                }
+                else {
+                    // Add document text to results and sort by score
+                    const enrichedResults = result.results
+                        .map((r) => (Object.assign(Object.assign({}, r), { document: documents[r.index] })))
+                        .sort((a, b) => b.score - a.score);
+                    resolveResult(enrichedResults);
+                }
+            });
+            // Return immediately without wrapping in a Promise
+            return {
+                requestId,
+                promise,
+            };
+        });
+    }
+    /**
+     * Check if parallel mode is enabled
+     */
+    isEnabled() {
+        return this.enabled;
+    }
+}
+exports.LlamaParallelAPI = LlamaParallelAPI;
package/lib/parallel.ts
ADDED
@@ -0,0 +1,273 @@
+// Parallel decoding API implementation for llama.node
+import type {
+  LlamaContext,
+  LlamaCompletionOptions,
+  LlamaCompletionToken,
+  RerankParams,
+} from './binding'
+
+export class LlamaParallelAPI {
+  private context: LlamaContext
+  private enabled: boolean = false
+  private pendingRequests = new Map<
+    number,
+    {
+      resolve: (value: any) => void
+      reject: (reason?: any) => void
+    }
+  >()
+
+  constructor(context: LlamaContext) {
+    this.context = context
+  }
+
+  /**
+   * Enable parallel decoding mode
+   * @param config Configuration for parallel mode
+   * @returns boolean indicating if successful
+   */
+  async enable(config?: {
+    n_parallel?: number
+    n_batch?: number
+  }): Promise<boolean> {
+    const defaultConfig = { n_parallel: 2, n_batch: 512 }
+    const result = this.context.enableParallelMode({
+      ...defaultConfig,
+      ...config,
+    })
+    this.enabled = result
+    return result
+  }
+
+  /**
+   * Disable parallel decoding mode
+   */
+  disable(): void {
+    this.context.disableParallelMode()
+    this.enabled = false
+  }
+
+  /**
+   * Configure parallel decoding mode (enables if not already enabled)
+   * @param config Configuration for parallel mode
+   * @returns boolean indicating if successful
+   */
+  async configure(config: {
+    n_parallel?: number
+    n_batch?: number
+  }): Promise<boolean> {
+    return this.enable(config)
+  }
+
+  /**
+   * Queue a completion request for parallel processing
+   * @param options Completion options
+   * @param onToken Optional callback for each token
+   * @returns Object with requestId, promise for result, and stop function
+   */
+  async completion(
+    options: LlamaCompletionOptions,
+    onToken?: (requestId: number, data: LlamaCompletionToken) => void,
+  ): Promise<{
+    requestId: number
+    promise: Promise<any>
+    stop: () => void
+  }> {
+    if (!this.enabled) {
+      throw new Error('Parallel mode is not enabled. Call enable() first.')
+    }
+
+    const tokenCallback = onToken
+      ? (error: any, result: any) => {
+          if (error) {
+            console.error('Token callback error:', error)
+            // Handle completion error
+            const pendingReq = this.pendingRequests.get(result?.requestId)
+            if (pendingReq) {
+              pendingReq.reject(error)
+              this.pendingRequests.delete(result?.requestId)
+            }
+            return
+          }
+          // Check if this is a token callback or final result
+          if (result) {
+            if (result.token !== undefined) {
+              // This is a token callback
+              onToken(result.requestId, result)
+            } else if (
+              result.text !== undefined ||
+              result.content !== undefined
+            ) {
+              // This is the final result
+              const pendingReq = this.pendingRequests.get(result.requestId)
+              if (pendingReq) {
+                pendingReq.resolve(result)
+                this.pendingRequests.delete(result.requestId)
+              }
+            }
+          }
+        }
+      : undefined
+
+    // Queue the completion immediately (this is synchronous!)
+    const { requestId } = this.context.queueCompletion(
+      options,
+      tokenCallback ||
+        ((error, result) => {
+          if (error) {
+            const pendingReq = this.pendingRequests.get(result?.requestId)
+            if (pendingReq) {
+              pendingReq.reject(error)
+              this.pendingRequests.delete(result?.requestId)
+            }
+          } else if (
+            result &&
+            (result.text !== undefined || result.content !== undefined)
+          ) {
+            // Final result for non-streaming
+            const pendingReq = this.pendingRequests.get(result.requestId)
+            if (pendingReq) {
+              pendingReq.resolve(result)
+              this.pendingRequests.delete(result.requestId)
+            }
+          }
+        }),
+    )
+
+    // Create promise for final result
+    const promise = new Promise((resolveResult, rejectResult) => {
+      this.pendingRequests.set(requestId, {
+        resolve: resolveResult,
+        reject: rejectResult,
+      })
+    })
+
+    // Create stop function
+    const stop = () => {
+      this.context.cancelRequest(requestId)
+      const pendingReq = this.pendingRequests.get(requestId)
+      if (pendingReq) {
+        pendingReq.reject(new Error('Request cancelled'))
+        this.pendingRequests.delete(requestId)
+      }
+    }
+
+    // Return immediately without wrapping in a Promise
+    return {
+      requestId,
+      promise,
+      stop,
+    }
+  }
+
+  /**
+   * Queue an embedding request for parallel processing
+   * @param text Text to embed
+   * @param params Optional embedding parameters
+   * @returns Object with requestId and promise for result
+   */
+  async embedding(
+    text: string,
+    params?: { embd_normalize?: number },
+  ): Promise<{
+    requestId: number
+    promise: Promise<{ embedding: number[] }>
+  }> {
+    if (!this.enabled) {
+      throw new Error('Parallel mode is not enabled. Call enable() first.')
+    }
+
+    // Create promise for result
+    let resolveResult: (value: any) => void
+    let rejectResult: (reason?: any) => void
+
+    const promise = new Promise<{ embedding: number[] }>((res, rej) => {
+      resolveResult = res
+      rejectResult = rej
+    })
+
+    // Queue the embedding immediately (this is synchronous!)
+    const { requestId } = this.context.queueEmbedding(
+      text,
+      params,
+      (error, result) => {
+        if (error) {
+          rejectResult(error)
+        } else {
+          resolveResult(result)
+        }
+      },
+    )
+
+    // Return immediately without wrapping in a Promise
+    return {
+      requestId,
+      promise,
+    }
+  }
+
+  /**
+   * Queue a rerank request for parallel processing
+   * @param query Query text
+   * @param documents Documents to rank
+   * @param params Optional rerank parameters
+   * @returns Object with requestId and promise for results
+   */
+  async rerank(
+    query: string,
+    documents: string[],
+    params?: RerankParams,
+  ): Promise<{
+    requestId: number
+    promise: Promise<Array<{ score: number; index: number; document: string }>>
+  }> {
+    if (!this.enabled) {
+      throw new Error('Parallel mode is not enabled. Call enable() first.')
+    }
+
+    // Create promise for result
+    let resolveResult: (value: any) => void
+    let rejectResult: (reason?: any) => void
+
+    const promise = new Promise<
+      Array<{ score: number; index: number; document: string }>
+    >((res, rej) => {
+      resolveResult = res
+      rejectResult = rej
+    })
+
+    // Queue the rerank immediately (this is synchronous!)
+    const { requestId } = this.context.queueRerank(
+      query,
+      documents,
+      params,
+      (error, result) => {
+        if (error) {
+          rejectResult(error)
+        } else {
+          // Add document text to results and sort by score
+          const enrichedResults = result.results
+            .map((r: any) => ({
+              ...r,
+              document: documents[r.index],
+            }))
+            .sort((a: any, b: any) => b.score - a.score)
+          resolveResult(enrichedResults)
+        }
+      },
+    )
+
+    // Return immediately without wrapping in a Promise
+    return {
+      requestId,
+      promise,
+    }
+  }
+
+  /**
+   * Check if parallel mode is enabled
+   */
+  isEnabled(): boolean {
+    return this.enabled
+  }
+}
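For orientation, a hedged usage sketch of the new LlamaParallelAPI follows. The prompts, documents, and result fields (`text`, `token`) are assumptions inferred from the checks in the code above, not documented guarantees; `ctx` is the context wrapper from the earlier sketch.

// Hedged sketch: run several requests concurrently through one context.
import type { LlamaParallelAPI } from '@fugood/llama.node'

async function runParallel(ctx: { parallel: LlamaParallelAPI }) {
  await ctx.parallel.enable({ n_parallel: 2, n_batch: 512 })

  // Each completion returns its requestId, a promise for the final result,
  // and a stop() that cancels the queued request.
  const a = await ctx.parallel.completion({ prompt: 'Write a haiku.' }, (id, data) => {
    process.stdout.write((data as any).token ?? '') // streamed token (assumed field)
  })
  const b = await ctx.parallel.completion({ prompt: 'Summarize llama.cpp.' })

  const [resultA, resultB] = await Promise.all([a.promise, b.promise])
  console.log(resultA.text, resultB.text)

  // Embedding and rerank requests go through the same queue.
  const { promise: embedPromise } = await ctx.parallel.embedding('hello world')
  const { embedding } = await embedPromise

  const { promise: rerankPromise } = await ctx.parallel.rerank('query', ['doc a', 'doc b'])
  const ranked = await rerankPromise // sorted by score, with `document` attached
  console.log(embedding.length, ranked[0].document)

  ctx.parallel.disable()
}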
package/package.json
CHANGED
@@ -1,7 +1,7 @@
 {
   "name": "@fugood/llama.node",
   "access": "public",
-  "version": "1.2.6",
+  "version": "1.3.0-rc.0",
   "description": "An another Node binding of llama.cpp",
   "main": "lib/index.js",
   "scripts": {
@@ -72,19 +72,19 @@
     "CMakeLists.txt"
   ],
   "optionalDependencies": {
-    "@fugood/node-llama-linux-x64": "1.
-    "@fugood/node-llama-linux-x64-vulkan": "1.
-    "@fugood/node-llama-linux-x64-cuda": "1.
-    "@fugood/node-llama-linux-arm64": "1.
-    "@fugood/node-llama-linux-arm64-vulkan": "1.
-    "@fugood/node-llama-linux-arm64-cuda": "1.
-    "@fugood/node-llama-win32-x64": "1.
-    "@fugood/node-llama-win32-x64-vulkan": "1.
-    "@fugood/node-llama-win32-x64-cuda": "1.
-    "@fugood/node-llama-win32-arm64": "1.
-    "@fugood/node-llama-win32-arm64-vulkan": "1.
-    "@fugood/node-llama-darwin-x64": "1.
-    "@fugood/node-llama-darwin-arm64": "1.
+    "@fugood/node-llama-linux-x64": "1.3.0-rc.0",
+    "@fugood/node-llama-linux-x64-vulkan": "1.3.0-rc.0",
+    "@fugood/node-llama-linux-x64-cuda": "1.3.0-rc.0",
+    "@fugood/node-llama-linux-arm64": "1.3.0-rc.0",
+    "@fugood/node-llama-linux-arm64-vulkan": "1.3.0-rc.0",
+    "@fugood/node-llama-linux-arm64-cuda": "1.3.0-rc.0",
+    "@fugood/node-llama-win32-x64": "1.3.0-rc.0",
+    "@fugood/node-llama-win32-x64-vulkan": "1.3.0-rc.0",
+    "@fugood/node-llama-win32-x64-cuda": "1.3.0-rc.0",
+    "@fugood/node-llama-win32-arm64": "1.3.0-rc.0",
+    "@fugood/node-llama-win32-arm64-vulkan": "1.3.0-rc.0",
+    "@fugood/node-llama-darwin-x64": "1.3.0-rc.0",
+    "@fugood/node-llama-darwin-arm64": "1.3.0-rc.0"
   },
   "devDependencies": {
     "@babel/preset-env": "^7.24.4",
package/src/LlamaContext.cpp
CHANGED
@@ -168,6 +168,25 @@ void LlamaContext::Init(Napi::Env env, Napi::Object &exports) {
            static_cast<napi_property_attributes>(napi_enumerable)),
        InstanceMethod<&LlamaContext::DecodeAudioTokens>(
            "decodeAudioTokens",
+           static_cast<napi_property_attributes>(napi_enumerable)),
+       // Parallel decoding methods
+       InstanceMethod<&LlamaContext::EnableParallelMode>(
+           "enableParallelMode",
+           static_cast<napi_property_attributes>(napi_enumerable)),
+       InstanceMethod<&LlamaContext::DisableParallelMode>(
+           "disableParallelMode",
+           static_cast<napi_property_attributes>(napi_enumerable)),
+       InstanceMethod<&LlamaContext::QueueCompletion>(
+           "queueCompletion",
+           static_cast<napi_property_attributes>(napi_enumerable)),
+       InstanceMethod<&LlamaContext::QueueEmbedding>(
+           "queueEmbedding",
+           static_cast<napi_property_attributes>(napi_enumerable)),
+       InstanceMethod<&LlamaContext::QueueRerank>(
+           "queueRerank",
+           static_cast<napi_property_attributes>(napi_enumerable)),
+       InstanceMethod<&LlamaContext::CancelRequest>(
+           "cancelRequest",
            static_cast<napi_property_attributes>(napi_enumerable))});
   Napi::FunctionReference *constructor = new Napi::FunctionReference();
   *constructor = Napi::Persistent(func);
@@ -217,6 +236,7 @@ LlamaContext::LlamaContext(const Napi::CallbackInfo &info)
   params.n_ctx = get_option<int32_t>(options, "n_ctx", 512);
   params.n_batch = get_option<int32_t>(options, "n_batch", 2048);
   params.n_ubatch = get_option<int32_t>(options, "n_ubatch", 512);
+  params.n_parallel = get_option<int32_t>(options, "n_parallel", 1); // Default to 1 for compatibility
   params.embedding = get_option<bool>(options, "embedding", false);
   if (params.embedding) {
     // For non-causal models, batch size must be equal to ubatch size
@@ -288,6 +308,9 @@ LlamaContext::LlamaContext(const Napi::CallbackInfo &info)
       }
     }
   }
+  // Initialize validity flag for async callback safety
+  _context_valid = std::make_shared<std::atomic<bool>>(true);
+
   // Use rn-llama context instead of direct session
   _rn_ctx = new llama_rn_context();
   if (!_rn_ctx->loadModel(params)) {
@@ -305,6 +328,11 @@ LlamaContext::LlamaContext(const Napi::CallbackInfo &info)
 }

 LlamaContext::~LlamaContext() {
+  // Invalidate the context to prevent use-after-free in async callbacks
+  if (_context_valid) {
+    _context_valid->store(false);
+  }
+
   // The DisposeWorker is responsible for cleanup of _rn_ctx
   // If _rn_ctx is still not null here, it means disposal was not properly initiated
   if (_rn_ctx) {
@@ -579,7 +607,7 @@ Napi::Value LlamaContext::GetFormattedChat(const Napi::CallbackInfo &info) {
   // grammar: string
   result.Set("grammar", chatParams.grammar);
   // grammar_lazy: boolean
-  result.Set("
+  result.Set("grammar_lazy", chatParams.grammar_lazy);
   // grammar_triggers: [{ value: string, token: number }]
   Napi::Array grammar_triggers = Napi::Array::New(env);
   for (size_t i = 0; i < chatParams.grammar_triggers.size(); i++) {
@@ -1135,6 +1163,11 @@ Napi::Value LlamaContext::Release(const Napi::CallbackInfo &info) {
     _wip->SetStop();
   }

+  // stop_processing_loop
+  if (_rn_ctx && _rn_ctx->slot_manager) {
+    _rn_ctx->slot_manager->stop_processing_loop();
+  }
+
   if (_rn_ctx == nullptr) {
     auto promise = Napi::Promise::Deferred(env);
     promise.Resolve(env.Undefined());
package/src/LlamaContext.h
CHANGED
@@ -4,6 +4,10 @@
 #include "rn-llama/rn-llama.h"
 #include "rn-llama/rn-completion.h"
 #include "rn-llama/rn-tts.h"
+#include "rn-llama/rn-slot.h"
+#include "rn-llama/rn-slot-manager.h"
+#include <atomic>
+#include <memory>

 using namespace rnllama;

@@ -55,10 +59,22 @@ private:
   Napi::Value GetAudioCompletionGuideTokens(const Napi::CallbackInfo &info);
   Napi::Value DecodeAudioTokens(const Napi::CallbackInfo &info);

+  // Parallel decoding methods
+  Napi::Value EnableParallelMode(const Napi::CallbackInfo &info);
+  void DisableParallelMode(const Napi::CallbackInfo &info);
+  Napi::Value QueueCompletion(const Napi::CallbackInfo &info);
+  Napi::Value QueueEmbedding(const Napi::CallbackInfo &info);
+  Napi::Value QueueRerank(const Napi::CallbackInfo &info);
+  void CancelRequest(const Napi::CallbackInfo &info);
+
   std::string _info;
   Napi::Object _meta;
   LlamaCompletionWorker *_wip = nullptr;

   // Use rn-llama context instead of direct llama.cpp types
   llama_rn_context *_rn_ctx = nullptr;
+
+  // Validity flag for async callbacks to prevent use-after-free
+  // Shared pointer ensures callbacks can safely check if context is still alive
+  std::shared_ptr<std::atomic<bool>> _context_valid;
 };
package/src/common.hpp
CHANGED
@@ -16,11 +16,12 @@ static bool is_nil(const Napi::Value &value) {
   return value.IsNull() || value.IsUndefined();
 }

-
-
+// Overload for Napi::Value to handle both arrays and objects
+static std::string json_stringify(const Napi::Value &value) {
+  Napi::Env env = value.Env();
   Napi::Object json = env.Global().Get("JSON").As<Napi::Object>();
   Napi::Function stringify = json.Get("stringify").As<Napi::Function>();
-  return stringify.Call(json, {
+  return stringify.Call(json, {value}).As<Napi::String>().ToString();
 }

 static void console_log(Napi::Env env, const std::string &message) {
package/src/llama.cpp/common/arg.cpp
CHANGED

@@ -1760,7 +1760,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP}));
     add_opt(common_arg(
         {"-t", "--threads"}, "N",
-        string_format("number of threads to use during generation (default: %d)", params.cpuparams.n_threads),
+        string_format("number of CPU threads to use during generation (default: %d)", params.cpuparams.n_threads),
         [](common_params & params, int value) {
             params.cpuparams.n_threads = value;
             if (params.cpuparams.n_threads <= 0) {
package/src/llama.cpp/ggml/include/ggml.h
CHANGED

@@ -577,6 +577,10 @@ extern "C" {
         GGML_UNARY_OP_EXP,
         GGML_UNARY_OP_GELU_ERF,
         GGML_UNARY_OP_XIELU,
+        GGML_UNARY_OP_FLOOR,
+        GGML_UNARY_OP_CEIL,
+        GGML_UNARY_OP_ROUND,
+        GGML_UNARY_OP_TRUNC,

         GGML_UNARY_OP_COUNT,
     };
@@ -1151,6 +1155,46 @@ extern "C" {
             struct ggml_context * ctx,
             struct ggml_tensor  * a);

+    GGML_API struct ggml_tensor * ggml_floor(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
+    GGML_API struct ggml_tensor * ggml_floor_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
+    GGML_API struct ggml_tensor * ggml_ceil(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
+    GGML_API struct ggml_tensor * ggml_ceil_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
+    GGML_API struct ggml_tensor * ggml_round(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
+    GGML_API struct ggml_tensor * ggml_round_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
+    /**
+     * Truncates the fractional part of each element in the tensor (towards zero).
+     * For example: trunc(3.7) = 3.0, trunc(-2.9) = -2.0
+     * Similar to std::trunc in C/C++.
+     */
+
+    GGML_API struct ggml_tensor * ggml_trunc(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
+    GGML_API struct ggml_tensor * ggml_trunc_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
+
+
     // xIELU activation function
     // x = x * (c_a(alpha_n) + c_b(alpha_p, beta) * sigmoid(beta * x)) + eps * (x > 0)
     // where c_a = softplus and c_b(a, b) = softplus(a) + b are constraining functions
package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c
CHANGED

@@ -2184,6 +2184,10 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
                 case GGML_UNARY_OP_HARDSWISH:
                 case GGML_UNARY_OP_HARDSIGMOID:
                 case GGML_UNARY_OP_EXP:
+                case GGML_UNARY_OP_FLOOR:
+                case GGML_UNARY_OP_CEIL:
+                case GGML_UNARY_OP_ROUND:
+                case GGML_UNARY_OP_TRUNC:
                     {
                         n_tasks = 1;
                     } break;
@@ -3563,13 +3567,17 @@ void ggml_cpu_init(void) {
 #ifdef GGML_USE_OPENMP
         //if (!getenv("OMP_WAIT_POLICY")) {
         //    // set the wait policy to active, so that OpenMP threads don't sleep
-        //
+        //    setenv("OMP_WAIT_POLICY", "active", 0)
         //}

         if (!getenv("KMP_BLOCKTIME")) {
             // set the time to wait before sleeping a thread
             // this is less aggressive than setting the wait policy to active, but should achieve similar results in most cases
-
+#ifdef _WIN32
+            _putenv_s("KMP_BLOCKTIME", "200"); // 200ms
+#else
+            setenv("KMP_BLOCKTIME", "200", 0); // 200ms
+#endif
         }
 #endif
 }
package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp
CHANGED

@@ -8993,6 +8993,22 @@ void ggml_compute_forward_unary(
             {
                 ggml_compute_forward_exp(params, dst);
             } break;
+        case GGML_UNARY_OP_FLOOR:
+            {
+                ggml_compute_forward_floor(params, dst);
+            } break;
+        case GGML_UNARY_OP_CEIL:
+            {
+                ggml_compute_forward_ceil(params, dst);
+            } break;
+        case GGML_UNARY_OP_ROUND:
+            {
+                ggml_compute_forward_round(params, dst);
+            } break;
+        case GGML_UNARY_OP_TRUNC:
+            {
+                ggml_compute_forward_trunc(params, dst);
+            } break;
         case GGML_UNARY_OP_XIELU:
             {
                 ggml_compute_forward_xielu(params, dst);
package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.cpp
CHANGED

@@ -73,6 +73,22 @@ static inline float op_log(float x) {
     return logf(x);
 }

+static inline float op_floor(float x) {
+    return floorf(x);
+}
+
+static inline float op_ceil(float x) {
+    return ceilf(x);
+}
+
+static inline float op_round(float x) {
+    return roundf(x);
+}
+
+static inline float op_trunc(float x) {
+    return truncf(x);
+}
+
 template <float (*op)(float), typename src0_t, typename dst_t>
 static inline void vec_unary_op(int64_t n, dst_t * y, const src0_t * x) {
     constexpr auto src0_to_f32 = type_conversion_table<src0_t>::to_f32;
@@ -274,6 +290,22 @@ void ggml_compute_forward_log(const ggml_compute_params * params, ggml_tensor *
     unary_op<op_log>(params, dst);
 }

+void ggml_compute_forward_floor(const ggml_compute_params * params, ggml_tensor * dst) {
+    unary_op<op_floor>(params, dst);
+}
+
+void ggml_compute_forward_ceil(const ggml_compute_params * params, ggml_tensor * dst) {
+    unary_op<op_ceil>(params, dst);
+}
+
+void ggml_compute_forward_round(const ggml_compute_params * params, ggml_tensor * dst) {
+    unary_op<op_round>(params, dst);
+}
+
+void ggml_compute_forward_trunc(const ggml_compute_params * params, ggml_tensor * dst) {
+    unary_op<op_trunc>(params, dst);
+}
+
 void ggml_compute_forward_xielu(const ggml_compute_params * params, ggml_tensor * dst) {
     const float alpha_n = ggml_get_op_params_f32(dst, 1);
     const float alpha_p = ggml_get_op_params_f32(dst, 2);
package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.h
CHANGED

@@ -22,6 +22,10 @@ void ggml_compute_forward_sqrt(const struct ggml_compute_params * params, struct
 void ggml_compute_forward_sin(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_cos(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_log(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_floor(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_ceil(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_round(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_trunc(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_xielu(const struct ggml_compute_params * params, struct ggml_tensor * dst);

 #ifdef __cplusplus
package/src/llama.cpp/src/llama-arch.cpp
CHANGED

@@ -5,6 +5,7 @@
 #include <map>

 static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
+    { LLM_ARCH_CLIP, "clip" }, // dummy, only used by llama-quantize
     { LLM_ARCH_LLAMA, "llama" },
     { LLM_ARCH_LLAMA4, "llama4" },
     { LLM_ARCH_DECI, "deci" },
@@ -275,6 +276,10 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
 };

 static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_NAMES = {
+    {
+        LLM_ARCH_CLIP,
+        {},
+    },
     {
         LLM_ARCH_LLAMA,
         {
package/src/llama.cpp/src/llama-model.cpp
CHANGED

@@ -478,7 +478,8 @@ void llama_model::load_hparams(llama_model_loader & ml) {
     ml.get_key(LLM_KV_GENERAL_NAME, name, false);

     // everything past this point is not vocab-related
-    if (hparams.vocab_only) {
+    // for CLIP models, we only need to load tensors, no hparams
+    if (hparams.vocab_only || ml.get_arch() == LLM_ARCH_CLIP) {
         return;
     }

@@ -20013,6 +20014,7 @@ int32_t llama_n_head(const llama_model * model) {
 llama_rope_type llama_model_rope_type(const llama_model * model) {
     switch (model->arch) {
         // these models do not use RoPE
+        case LLM_ARCH_CLIP:
         case LLM_ARCH_GPT2:
         case LLM_ARCH_GPTJ:
         case LLM_ARCH_MPT:
package/src/llama.cpp/src/llama-quant.cpp
CHANGED

@@ -701,6 +701,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
         });
     }

+    bool is_clip_model = false;
     for (const auto * it : tensors) {
         const struct ggml_tensor * tensor = it->tensor;

@@ -714,12 +715,14 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
         } else if (name == LLM_TN(model.arch)(LLM_TENSOR_OUTPUT, "weight")) {
             qs.has_output = true;
         }
+
+        is_clip_model |= name.rfind("mm.", 0) == 0; // check the "mm." prefix
     }

     qs.n_ffn_down = qs.n_ffn_gate = qs.n_ffn_up = (int)model.hparams.n_layer;

     // sanity checks for models that have attention layers
-    if (qs.n_attention_wv != 0)
+    if (qs.n_attention_wv != 0 && !is_clip_model)
     {
         const auto & n_head_kv_iter = model.hparams.n_head_kv_arr.begin();
         // attention layers have a non-zero number of kv heads
@@ -881,6 +884,9 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
         // do not quantize relative position bias (T5)
         quantize &= name.find("attn_rel_b.weight") == std::string::npos;

+        // do not quantize specific multimodal tensors
+        quantize &= name.find(".position_embd.") == std::string::npos;
+
         ggml_type new_type;
         void * new_data;
         size_t new_size;
package/src/llama.cpp/src/llama.cpp
CHANGED

@@ -124,6 +124,9 @@ static int llama_model_load(const std::string & fname, std::vector<std::string>
     } catch(const std::exception & e) {
         throw std::runtime_error("error loading model hyperparameters: " + std::string(e.what()));
     }
+    if (model.arch == LLM_ARCH_CLIP) {
+        throw std::runtime_error("CLIP cannot be used as main model, use it with --mmproj instead");
+    }
     try {
         model.load_vocab(ml);
     } catch(const std::exception & e) {