@fugood/llama.node 1.2.5 → 1.3.0-rc.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +1 -0
- package/lib/binding.ts +96 -1
- package/lib/index.js +4 -2
- package/lib/index.ts +4 -1
- package/lib/parallel.js +214 -0
- package/lib/parallel.ts +273 -0
- package/package.json +14 -14
- package/src/LlamaContext.cpp +34 -1
- package/src/LlamaContext.h +16 -0
- package/src/common.hpp +4 -3
- package/src/llama.cpp/common/arg.cpp +1 -1
- package/src/llama.cpp/ggml/include/ggml.h +44 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +1 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +16 -3
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +16 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.cpp +32 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.h +4 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +1 -1
- package/src/llama.cpp/src/llama-arch.cpp +5 -0
- package/src/llama.cpp/src/llama-arch.h +1 -0
- package/src/llama.cpp/src/llama-graph.cpp +74 -43
- package/src/llama.cpp/src/llama-graph.h +7 -3
- package/src/llama.cpp/src/llama-model.cpp +8 -7
- package/src/llama.cpp/src/llama-quant.cpp +7 -1
- package/src/llama.cpp/src/llama.cpp +4 -0
package/CMakeLists.txt
CHANGED
package/lib/binding.ts
CHANGED
@@ -25,6 +25,12 @@ export type LlamaModelOptions = {
   n_ctx?: number
   n_batch?: number
   n_ubatch?: number
+  /**
+   * Number of parallel sequences to support (sets n_seq_max).
+   * This determines the maximum number of parallel slots that can be used.
+   * Default: 8
+   */
+  n_parallel?: number
   n_threads?: number
   n_gpu_layers?: number
   flash_attn_type?: 'auto' | 'on' | 'off'
@@ -157,6 +163,36 @@ export type LlamaCompletionOptions = {
   n_probs?: number
 }
 
+/**
+ * Parameters for parallel completion requests (queueCompletion).
+ * Extends LlamaCompletionOptions with parallel-mode specific options.
+ */
+export type LlamaParallelCompletionOptions = LlamaCompletionOptions & {
+  /**
+   * File path to load session state from before processing.
+   * This allows you to resume from a previously saved completion state.
+   * Use with `save_state_path` to enable conversation continuity across requests.
+   * Example: `'/path/to/session.bin'` or `'file:///path/to/session.bin'`
+   */
+  load_state_path?: string
+
+  /**
+   * File path to save session state to after completion.
+   * The session state will be saved to this file path when the completion finishes.
+   * You can then pass this path to `load_state_path` in a subsequent request to resume.
+   * Example: `'/path/to/session.bin'` or `'file:///path/to/session.bin'`
+   */
+  save_state_path?: string
+
+  /**
+   * Number of tokens to save when saving session state.
+   * If not specified or <= 0, all tokens will be saved.
+   * Use this to limit the size of saved session files.
+   * Example: `512` to save only the last 512 tokens
+   */
+  save_state_size?: number
+}
+
 export type TokenProbability = {
   tok_str: string
   prob: number
@@ -271,7 +307,7 @@ export type JinjaFormattedChatResult = {
   prompt: string
   chat_format: number
   grammar: string
-
+  grammar_lazy: boolean
   grammar_triggers: Array<{
     type: number
     value: string
@@ -404,6 +440,65 @@ export interface LlamaContext {
    */
   decodeAudioTokens(tokens: number[]|Int32Array): Promise<Float32Array>
 
+  // Parallel decoding methods
+
+  /**
+   * Enable parallel decoding mode
+   * @param params Configuration for parallel mode
+   * @returns boolean indicating if successful
+   */
+  enableParallelMode(params: { n_parallel?: number, n_batch?: number }): boolean
+
+  /**
+   * Disable parallel decoding mode
+   */
+  disableParallelMode(): void
+
+  /**
+   * Queue a completion request for parallel processing
+   * @param options Completion options with parallel-specific state management
+   * @param callback Optional token callback
+   * @returns Object with requestId
+   */
+  queueCompletion(
+    options: LlamaParallelCompletionOptions,
+    callback?: (error: any, result: any) => void,
+  ): { requestId: number }
+
+  /**
+   * Queue an embedding request for parallel processing
+   * @param text Text to embed
+   * @param params Optional embedding parameters
+   * @param callback Optional result callback
+   * @returns Object with requestId
+   */
+  queueEmbedding(
+    text: string,
+    params?: { embd_normalize?: number },
+    callback?: (error: any, result: any) => void,
+  ): { requestId: number }
+
+  /**
+   * Queue a rerank request for parallel processing
+   * @param query Query text
+   * @param documents Documents to rank
+   * @param params Optional rerank parameters
+   * @param callback Optional result callback
+   * @returns Object with requestId
+   */
+  queueRerank(
+    query: string,
+    documents: string[],
+    params?: RerankParams,
+    callback?: (error: any, result: any) => void,
+  ): { requestId: number }
+
+  /**
+   * Cancel a queued request
+   * @param requestId Request ID to cancel
+   */
+  cancelRequest(requestId: number): void
+
   // static
   loadModelInfo(path: string, skip: string[]): Promise<GGUFModelInfo>
   toggleNativeLog(
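Taken together, the additions to `LlamaContext` form the low-level surface of the new parallel mode: enable it once, then queue completions (or embeddings/reranks) and correlate results by `requestId`. A minimal sketch of that flow, assuming `ctx` is a live context obtained elsewhere (e.g. from `loadModel`); the prompt text and session path are purely illustrative:

```ts
import type { LlamaContext, LlamaParallelCompletionOptions } from '@fugood/llama.node'

// Assumed: a live context obtained elsewhere; not constructed here.
declare const ctx: LlamaContext

if (ctx.enableParallelMode({ n_parallel: 4, n_batch: 512 })) {
  const options: LlamaParallelCompletionOptions = {
    prompt: 'Hello',
    save_state_path: '/tmp/session.bin', // hypothetical path; resume later via load_state_path
  }
  const { requestId } = ctx.queueCompletion(options, (error, result) => {
    if (error) console.error(error)
    // A payload carrying text/content is the final result for this request.
    else if (result?.text !== undefined) console.log('final:', result.text)
  })
  // A queued request can be aborted before it finishes:
  // ctx.cancelRequest(requestId)
}
```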
package/lib/index.js
CHANGED
@@ -23,10 +23,12 @@ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, ge
 });
 };
 Object.defineProperty(exports, "__esModule", { value: true });
-exports.BuildInfo = exports.loadLlamaModelInfo = exports.initLlama = exports.loadModel = exports.toggleNativeLog = exports.MTMD_DEFAULT_MEDIA_MARKER = void 0;
+exports.BuildInfo = exports.loadLlamaModelInfo = exports.initLlama = exports.loadModel = exports.toggleNativeLog = exports.MTMD_DEFAULT_MEDIA_MARKER = exports.LlamaParallelAPI = void 0;
 exports.addNativeLogListener = addNativeLogListener;
 const binding_1 = require("./binding");
 const version_1 = require("./version");
+const parallel_1 = require("./parallel");
+Object.defineProperty(exports, "LlamaParallelAPI", { enumerable: true, get: function () { return parallel_1.LlamaParallelAPI; } });
 __exportStar(require("./binding"), exports);
 exports.MTMD_DEFAULT_MEDIA_MARKER = '<__media__>';
 const mods = {};
@@ -66,6 +68,7 @@ const getJsonSchema = (responseFormat) => {
 class LlamaContextWrapper {
     constructor(nativeCtx) {
         this.ctx = nativeCtx;
+        this.parallel = new parallel_1.LlamaParallelAPI(nativeCtx);
     }
     getSystemInfo() {
         return this.ctx.getSystemInfo();
@@ -138,7 +141,6 @@ class LlamaContextWrapper {
         let tmpl;
         if (template)
             tmpl = template; // Force replace if provided
-        const jsonSchema = getJsonSchema(params === null || params === void 0 ? void 0 : params.response_format);
         const result = this.ctx.getFormattedChat(chat, tmpl, {
             jinja: useJinja,
             response_format: params === null || params === void 0 ? void 0 : params.response_format,
package/lib/index.ts
CHANGED
@@ -18,8 +18,10 @@ import type {
   GGUFModelInfo,
 } from './binding'
 import { BUILD_NUMBER, BUILD_COMMIT } from './version'
+import { LlamaParallelAPI } from './parallel'
 
 export * from './binding'
+export { LlamaParallelAPI }
 
 export const MTMD_DEFAULT_MEDIA_MARKER = '<__media__>'
 
@@ -78,9 +80,11 @@ export type FormattedChatResult = {
 
 class LlamaContextWrapper {
   ctx: LlamaContext
+  parallel: LlamaParallelAPI
 
   constructor(nativeCtx: LlamaContext) {
     this.ctx = nativeCtx
+    this.parallel = new LlamaParallelAPI(nativeCtx)
   }
 
   getSystemInfo(): string {
@@ -181,7 +185,6 @@ class LlamaContextWrapper {
     const useJinja = this.isJinjaSupported() && params?.jinja
     let tmpl
     if (template) tmpl = template // Force replace if provided
-    const jsonSchema = getJsonSchema(params?.response_format)
 
     const result = this.ctx.getFormattedChat(chat!, tmpl, {
       jinja: useJinja,
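With these wrapper changes, every context created through the package carries a ready-made `parallel` facade, so callers don't need to construct `LlamaParallelAPI` by hand. A short sketch, assuming `loadModel` resolves to the wrapper shown above and accepts `LlamaModelOptions`; the model path is hypothetical:

```ts
import { loadModel } from '@fugood/llama.node'

// Model path is hypothetical; n_parallel is the new LlamaModelOptions field.
const llama = await loadModel({ model: './model.gguf', n_parallel: 4 })
await llama.parallel.enable({ n_parallel: 4, n_batch: 512 })
console.log(llama.parallel.isEnabled()) // true if the native call succeeded
llama.parallel.disable()
```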
package/lib/parallel.js
ADDED
@@ -0,0 +1,214 @@
+"use strict";
+var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
+    function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
+    return new (P || (P = Promise))(function (resolve, reject) {
+        function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
+        function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
+        function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
+        step((generator = generator.apply(thisArg, _arguments || [])).next());
+    });
+};
+Object.defineProperty(exports, "__esModule", { value: true });
+exports.LlamaParallelAPI = void 0;
+class LlamaParallelAPI {
+    constructor(context) {
+        this.enabled = false;
+        this.pendingRequests = new Map();
+        this.context = context;
+    }
+    /**
+     * Enable parallel decoding mode
+     * @param config Configuration for parallel mode
+     * @returns boolean indicating if successful
+     */
+    enable(config) {
+        return __awaiter(this, void 0, void 0, function* () {
+            const defaultConfig = { n_parallel: 2, n_batch: 512 };
+            const result = this.context.enableParallelMode(Object.assign(Object.assign({}, defaultConfig), config));
+            this.enabled = result;
+            return result;
+        });
+    }
+    /**
+     * Disable parallel decoding mode
+     */
+    disable() {
+        this.context.disableParallelMode();
+        this.enabled = false;
+    }
+    /**
+     * Configure parallel decoding mode (enables if not already enabled)
+     * @param config Configuration for parallel mode
+     * @returns boolean indicating if successful
+     */
+    configure(config) {
+        return __awaiter(this, void 0, void 0, function* () {
+            return this.enable(config);
+        });
+    }
+    /**
+     * Queue a completion request for parallel processing
+     * @param options Completion options
+     * @param onToken Optional callback for each token
+     * @returns Object with requestId, promise for result, and stop function
+     */
+    completion(options, onToken) {
+        return __awaiter(this, void 0, void 0, function* () {
+            if (!this.enabled) {
+                throw new Error('Parallel mode is not enabled. Call enable() first.');
+            }
+            const tokenCallback = onToken
+                ? (error, result) => {
+                    if (error) {
+                        console.error('Token callback error:', error);
+                        // Handle completion error
+                        const pendingReq = this.pendingRequests.get(result === null || result === void 0 ? void 0 : result.requestId);
+                        if (pendingReq) {
+                            pendingReq.reject(error);
+                            this.pendingRequests.delete(result === null || result === void 0 ? void 0 : result.requestId);
+                        }
+                        return;
+                    }
+                    // Check if this is a token callback or final result
+                    if (result) {
+                        if (result.token !== undefined) {
+                            // This is a token callback
+                            onToken(result.requestId, result);
+                        }
+                        else if (result.text !== undefined ||
+                            result.content !== undefined) {
+                            // This is the final result
+                            const pendingReq = this.pendingRequests.get(result.requestId);
+                            if (pendingReq) {
+                                pendingReq.resolve(result);
+                                this.pendingRequests.delete(result.requestId);
+                            }
+                        }
+                    }
+                }
+                : undefined;
+            // Queue the completion immediately (this is synchronous!)
+            const { requestId } = this.context.queueCompletion(options, tokenCallback ||
+                ((error, result) => {
+                    if (error) {
+                        const pendingReq = this.pendingRequests.get(result === null || result === void 0 ? void 0 : result.requestId);
+                        if (pendingReq) {
+                            pendingReq.reject(error);
+                            this.pendingRequests.delete(result === null || result === void 0 ? void 0 : result.requestId);
+                        }
+                    }
+                    else if (result &&
+                        (result.text !== undefined || result.content !== undefined)) {
+                        // Final result for non-streaming
+                        const pendingReq = this.pendingRequests.get(result.requestId);
+                        if (pendingReq) {
+                            pendingReq.resolve(result);
+                            this.pendingRequests.delete(result.requestId);
+                        }
+                    }
+                }));
+            // Create promise for final result
+            const promise = new Promise((resolveResult, rejectResult) => {
+                this.pendingRequests.set(requestId, {
+                    resolve: resolveResult,
+                    reject: rejectResult,
+                });
+            });
+            // Create stop function
+            const stop = () => {
+                this.context.cancelRequest(requestId);
+                const pendingReq = this.pendingRequests.get(requestId);
+                if (pendingReq) {
+                    pendingReq.reject(new Error('Request cancelled'));
+                    this.pendingRequests.delete(requestId);
+                }
+            };
+            // Return immediately without wrapping in a Promise
+            return {
+                requestId,
+                promise,
+                stop,
+            };
+        });
+    }
+    /**
+     * Queue an embedding request for parallel processing
+     * @param text Text to embed
+     * @param params Optional embedding parameters
+     * @returns Object with requestId and promise for result
+     */
+    embedding(text, params) {
+        return __awaiter(this, void 0, void 0, function* () {
+            if (!this.enabled) {
+                throw new Error('Parallel mode is not enabled. Call enable() first.');
+            }
+            // Create promise for result
+            let resolveResult;
+            let rejectResult;
+            const promise = new Promise((res, rej) => {
+                resolveResult = res;
+                rejectResult = rej;
+            });
+            // Queue the embedding immediately (this is synchronous!)
+            const { requestId } = this.context.queueEmbedding(text, params, (error, result) => {
+                if (error) {
+                    rejectResult(error);
+                }
+                else {
+                    resolveResult(result);
+                }
+            });
+            // Return immediately without wrapping in a Promise
+            return {
+                requestId,
+                promise,
+            };
+        });
+    }
+    /**
+     * Queue a rerank request for parallel processing
+     * @param query Query text
+     * @param documents Documents to rank
+     * @param params Optional rerank parameters
+     * @returns Object with requestId and promise for results
+     */
+    rerank(query, documents, params) {
+        return __awaiter(this, void 0, void 0, function* () {
+            if (!this.enabled) {
+                throw new Error('Parallel mode is not enabled. Call enable() first.');
+            }
+            // Create promise for result
+            let resolveResult;
+            let rejectResult;
+            const promise = new Promise((res, rej) => {
+                resolveResult = res;
+                rejectResult = rej;
+            });
+            // Queue the rerank immediately (this is synchronous!)
+            const { requestId } = this.context.queueRerank(query, documents, params, (error, result) => {
+                if (error) {
+                    rejectResult(error);
+                }
+                else {
+                    // Add document text to results and sort by score
+                    const enrichedResults = result.results
+                        .map((r) => (Object.assign(Object.assign({}, r), { document: documents[r.index] })))
+                        .sort((a, b) => b.score - a.score);
+                    resolveResult(enrichedResults);
+                }
+            });
+            // Return immediately without wrapping in a Promise
+            return {
+                requestId,
+                promise,
+            };
+        });
+    }
+    /**
+     * Check if parallel mode is enabled
+     */
+    isEnabled() {
+        return this.enabled;
+    }
+}
+exports.LlamaParallelAPI = LlamaParallelAPI;
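The class above wires each queued request through `pendingRequests`: the native callback fires repeatedly with token payloads and once with a final `text`/`content` payload, and `completion()` hands back the request id, a promise for that final payload, and a `stop()` canceller. A streaming sketch of driving it (`api` is assumed to be a `LlamaParallelAPI` on which `enable()` already succeeded; the prompt is illustrative):

```ts
// Streamed completion with a timeout-based cancel.
const { requestId, promise, stop } = await api.completion(
  { prompt: 'Write a haiku about batching.' },
  (id, data) => process.stdout.write(data.token ?? ''), // per-token stream
)
console.log('queued request', requestId)

// Abort if still running after 5s; the promise then rejects with 'Request cancelled'.
const timer = setTimeout(stop, 5000)
try {
  const result = await promise
  console.log('\nfinal:', result.text ?? result.content)
} finally {
  clearTimeout(timer)
}
```

One design point visible in the source: `queueCompletion` is invoked before the promise is registered in `pendingRequests`, so the implementation relies on the native side delivering the final result asynchronously.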
package/lib/parallel.ts
ADDED
@@ -0,0 +1,273 @@
+// Parallel decoding API implementation for llama.node
+import type {
+  LlamaContext,
+  LlamaCompletionOptions,
+  LlamaCompletionToken,
+  RerankParams,
+} from './binding'
+
+export class LlamaParallelAPI {
+  private context: LlamaContext
+  private enabled: boolean = false
+  private pendingRequests = new Map<
+    number,
+    {
+      resolve: (value: any) => void
+      reject: (reason?: any) => void
+    }
+  >()
+
+  constructor(context: LlamaContext) {
+    this.context = context
+  }
+
+  /**
+   * Enable parallel decoding mode
+   * @param config Configuration for parallel mode
+   * @returns boolean indicating if successful
+   */
+  async enable(config?: {
+    n_parallel?: number
+    n_batch?: number
+  }): Promise<boolean> {
+    const defaultConfig = { n_parallel: 2, n_batch: 512 }
+    const result = this.context.enableParallelMode({
+      ...defaultConfig,
+      ...config,
+    })
+    this.enabled = result
+    return result
+  }
+
+  /**
+   * Disable parallel decoding mode
+   */
+  disable(): void {
+    this.context.disableParallelMode()
+    this.enabled = false
+  }
+
+  /**
+   * Configure parallel decoding mode (enables if not already enabled)
+   * @param config Configuration for parallel mode
+   * @returns boolean indicating if successful
+   */
+  async configure(config: {
+    n_parallel?: number
+    n_batch?: number
+  }): Promise<boolean> {
+    return this.enable(config)
+  }
+
+  /**
+   * Queue a completion request for parallel processing
+   * @param options Completion options
+   * @param onToken Optional callback for each token
+   * @returns Object with requestId, promise for result, and stop function
+   */
+  async completion(
+    options: LlamaCompletionOptions,
+    onToken?: (requestId: number, data: LlamaCompletionToken) => void,
+  ): Promise<{
+    requestId: number
+    promise: Promise<any>
+    stop: () => void
+  }> {
+    if (!this.enabled) {
+      throw new Error('Parallel mode is not enabled. Call enable() first.')
+    }
+
+    const tokenCallback = onToken
+      ? (error: any, result: any) => {
+          if (error) {
+            console.error('Token callback error:', error)
+            // Handle completion error
+            const pendingReq = this.pendingRequests.get(result?.requestId)
+            if (pendingReq) {
+              pendingReq.reject(error)
+              this.pendingRequests.delete(result?.requestId)
+            }
+            return
+          }
+          // Check if this is a token callback or final result
+          if (result) {
+            if (result.token !== undefined) {
+              // This is a token callback
+              onToken(result.requestId, result)
+            } else if (
+              result.text !== undefined ||
+              result.content !== undefined
+            ) {
+              // This is the final result
+              const pendingReq = this.pendingRequests.get(result.requestId)
+              if (pendingReq) {
+                pendingReq.resolve(result)
+                this.pendingRequests.delete(result.requestId)
+              }
+            }
+          }
+        }
+      : undefined
+
+    // Queue the completion immediately (this is synchronous!)
+    const { requestId } = this.context.queueCompletion(
+      options,
+      tokenCallback ||
+        ((error, result) => {
+          if (error) {
+            const pendingReq = this.pendingRequests.get(result?.requestId)
+            if (pendingReq) {
+              pendingReq.reject(error)
+              this.pendingRequests.delete(result?.requestId)
+            }
+          } else if (
+            result &&
+            (result.text !== undefined || result.content !== undefined)
+          ) {
+            // Final result for non-streaming
+            const pendingReq = this.pendingRequests.get(result.requestId)
+            if (pendingReq) {
+              pendingReq.resolve(result)
+              this.pendingRequests.delete(result.requestId)
+            }
+          }
+        }),
+    )
+
+    // Create promise for final result
+    const promise = new Promise((resolveResult, rejectResult) => {
+      this.pendingRequests.set(requestId, {
+        resolve: resolveResult,
+        reject: rejectResult,
+      })
+    })
+
+    // Create stop function
+    const stop = () => {
+      this.context.cancelRequest(requestId)
+      const pendingReq = this.pendingRequests.get(requestId)
+      if (pendingReq) {
+        pendingReq.reject(new Error('Request cancelled'))
+        this.pendingRequests.delete(requestId)
+      }
+    }
+
+    // Return immediately without wrapping in a Promise
+    return {
+      requestId,
+      promise,
+      stop,
+    }
+  }
+
+  /**
+   * Queue an embedding request for parallel processing
+   * @param text Text to embed
+   * @param params Optional embedding parameters
+   * @returns Object with requestId and promise for result
+   */
+  async embedding(
+    text: string,
+    params?: { embd_normalize?: number },
+  ): Promise<{
+    requestId: number
+    promise: Promise<{ embedding: number[] }>
+  }> {
+    if (!this.enabled) {
+      throw new Error('Parallel mode is not enabled. Call enable() first.')
+    }
+
+    // Create promise for result
+    let resolveResult: (value: any) => void
+    let rejectResult: (reason?: any) => void
+
+    const promise = new Promise<{ embedding: number[] }>((res, rej) => {
+      resolveResult = res
+      rejectResult = rej
+    })
+
+    // Queue the embedding immediately (this is synchronous!)
+    const { requestId } = this.context.queueEmbedding(
+      text,
+      params,
+      (error, result) => {
+        if (error) {
+          rejectResult(error)
+        } else {
+          resolveResult(result)
+        }
+      },
+    )
+
+    // Return immediately without wrapping in a Promise
+    return {
+      requestId,
+      promise,
+    }
+  }
+
+  /**
+   * Queue a rerank request for parallel processing
+   * @param query Query text
+   * @param documents Documents to rank
+   * @param params Optional rerank parameters
+   * @returns Object with requestId and promise for results
+   */
+  async rerank(
+    query: string,
+    documents: string[],
+    params?: RerankParams,
+  ): Promise<{
+    requestId: number
+    promise: Promise<Array<{ score: number; index: number; document: string }>>
+  }> {
+    if (!this.enabled) {
+      throw new Error('Parallel mode is not enabled. Call enable() first.')
+    }
+
+    // Create promise for result
+    let resolveResult: (value: any) => void
+    let rejectResult: (reason?: any) => void
+
+    const promise = new Promise<
+      Array<{ score: number; index: number; document: string }>
+    >((res, rej) => {
+      resolveResult = res
+      rejectResult = rej
+    })
+
+    // Queue the rerank immediately (this is synchronous!)
+    const { requestId } = this.context.queueRerank(
+      query,
+      documents,
+      params,
+      (error, result) => {
+        if (error) {
+          rejectResult(error)
+        } else {
+          // Add document text to results and sort by score
+          const enrichedResults = result.results
+            .map((r: any) => ({
+              ...r,
+              document: documents[r.index],
+            }))
+            .sort((a: any, b: any) => b.score - a.score)
+          resolveResult(enrichedResults)
+        }
+      },
+    )
+
+    // Return immediately without wrapping in a Promise
+    return {
+      requestId,
+      promise,
+    }
+  }
+
+  /**
+   * Check if parallel mode is enabled
+   */
+  isEnabled(): boolean {
+    return this.enabled
+  }
+}
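Since `LlamaParallelAPI` is exported from the package entry point (see the index.ts change above), the embedding and rerank queues can also be driven directly; both resolve through single-shot callbacks, and rerank results come back enriched with the document text and sorted by descending score. A sketch under those assumptions (`ctx` is an assumed live `LlamaContext`; the texts are illustrative):

```ts
import { LlamaParallelAPI, type LlamaContext } from '@fugood/llama.node'

declare const ctx: LlamaContext // assumed: obtained from loadModel()/initLlama()

const api = new LlamaParallelAPI(ctx)
await api.enable({ n_parallel: 4, n_batch: 512 })

const docs = ['llamas graze on grass', 'GPUs batch matmuls', 'the sky is blue']
const [emb, ranked] = await Promise.all([
  api.embedding('grazing animals').then((r) => r.promise),
  api.rerank('what do llamas eat?', docs).then((r) => r.promise),
])
console.log(emb.embedding.length) // embedding dimension
console.log(ranked[0].document)   // highest-scoring document first

api.disable()
```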