@fugood/llama.node 0.4.7 → 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +4 -0
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-cuda/arm64/llama-node.node +0 -0
- package/bin/linux-cuda/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/lib/binding.ts +66 -6
- package/lib/index.js +59 -17
- package/lib/index.ts +74 -23
- package/package.json +1 -1
- package/src/DecodeAudioTokenWorker.cpp +40 -0
- package/src/DecodeAudioTokenWorker.h +22 -0
- package/src/EmbeddingWorker.cpp +7 -5
- package/src/LlamaCompletionWorker.cpp +68 -54
- package/src/LlamaCompletionWorker.h +7 -8
- package/src/LlamaContext.cpp +551 -235
- package/src/LlamaContext.h +26 -4
- package/src/LoadSessionWorker.cpp +4 -2
- package/src/SaveSessionWorker.cpp +10 -6
- package/src/TokenizeWorker.cpp +23 -14
- package/src/TokenizeWorker.h +2 -2
- package/src/addons.cc +8 -11
- package/src/common.hpp +129 -126
- package/src/llama.cpp/.github/workflows/build.yml +2 -2
- package/src/llama.cpp/.github/workflows/release.yml +152 -129
- package/src/llama.cpp/.github/workflows/winget.yml +42 -0
- package/src/llama.cpp/common/arg.cpp +14 -13
- package/src/llama.cpp/common/common.cpp +4 -75
- package/src/llama.cpp/common/common.h +7 -12
- package/src/llama.cpp/examples/lookahead/lookahead.cpp +0 -13
- package/src/llama.cpp/examples/lookup/lookup.cpp +0 -11
- package/src/llama.cpp/examples/parallel/parallel.cpp +0 -9
- package/src/llama.cpp/examples/retrieval/retrieval.cpp +6 -6
- package/src/llama.cpp/examples/simple/simple.cpp +1 -1
- package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +2 -2
- package/src/llama.cpp/examples/sycl/run-llama2.sh +4 -4
- package/src/llama.cpp/examples/sycl/run-llama3.sh +28 -0
- package/src/llama.cpp/examples/sycl/win-run-llama2.bat +1 -1
- package/src/llama.cpp/examples/sycl/win-run-llama3.bat +9 -0
- package/src/llama.cpp/ggml/include/ggml-opt.h +2 -0
- package/src/llama.cpp/ggml/include/ggml.h +11 -0
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +274 -0
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +27 -0
- package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +18 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +107 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +16 -0
- package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +8 -2
- package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +315 -155
- package/src/llama.cpp/ggml/src/ggml-opt.cpp +5 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +43 -12
- package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +171 -112
- package/src/llama.cpp/ggml/src/ggml.c +64 -18
- package/src/llama.cpp/include/llama.h +24 -124
- package/src/llama.cpp/requirements/requirements-convert_hf_to_gguf.txt +5 -1
- package/src/llama.cpp/requirements/requirements-convert_hf_to_gguf_update.txt +5 -1
- package/src/llama.cpp/requirements/requirements-convert_lora_to_gguf.txt +2 -0
- package/src/llama.cpp/src/llama-batch.cpp +3 -1
- package/src/llama.cpp/src/llama-context.cpp +60 -110
- package/src/llama.cpp/src/llama-graph.cpp +137 -233
- package/src/llama.cpp/src/llama-graph.h +49 -7
- package/src/llama.cpp/src/llama-hparams.cpp +17 -1
- package/src/llama.cpp/src/llama-hparams.h +34 -5
- package/src/llama.cpp/src/llama-kv-cache.cpp +654 -321
- package/src/llama.cpp/src/llama-kv-cache.h +201 -85
- package/src/llama.cpp/src/llama-memory.h +3 -2
- package/src/llama.cpp/src/llama-model.cpp +273 -94
- package/src/llama.cpp/src/llama-model.h +4 -1
- package/src/llama.cpp/tests/test-arg-parser.cpp +1 -1
- package/src/llama.cpp/tools/llama-bench/llama-bench.cpp +1 -0
- package/src/llama.cpp/tools/mtmd/CMakeLists.txt +13 -2
- package/src/llama.cpp/tools/mtmd/clip-impl.h +108 -11
- package/src/llama.cpp/tools/mtmd/clip.cpp +466 -88
- package/src/llama.cpp/tools/mtmd/clip.h +6 -4
- package/src/llama.cpp/tools/mtmd/miniaudio.h +93468 -0
- package/src/llama.cpp/tools/mtmd/mtmd-audio.cpp +855 -0
- package/src/llama.cpp/tools/mtmd/mtmd-audio.h +62 -0
- package/src/llama.cpp/tools/mtmd/mtmd-cli.cpp +21 -14
- package/src/llama.cpp/tools/mtmd/mtmd-helper.cpp +36 -49
- package/src/llama.cpp/tools/mtmd/mtmd.cpp +362 -98
- package/src/llama.cpp/tools/mtmd/mtmd.h +52 -21
- package/src/llama.cpp/tools/run/run.cpp +2 -2
- package/src/llama.cpp/tools/server/server.cpp +158 -47
- package/src/llama.cpp/tools/server/utils.hpp +71 -43
- package/src/llama.cpp/tools/tts/tts.cpp +4 -2
- package/src/tts_utils.cpp +342 -0
- package/src/tts_utils.h +62 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
package/CMakeLists.txt
CHANGED
@@ -102,6 +102,10 @@ file(
   "src/LoadSessionWorker.h"
   "src/SaveSessionWorker.cpp"
   "src/SaveSessionWorker.h"
+  "src/DecodeAudioTokenWorker.cpp"
+  "src/DecodeAudioTokenWorker.h"
+  "src/tts_utils.cpp"
+  "src/tts_utils.h"
 )
 
 add_library(${PROJECT_NAME} SHARED ${SOURCE_FILES} ${CMAKE_JS_SRC})
package/bin/{darwin,linux,linux-cuda,linux-vulkan}/{arm64,x64}/llama-node.node
CHANGED
Binary files; no textual diff.
package/lib/binding.ts
CHANGED
@@ -6,6 +6,11 @@ export type MessagePart = {
   text?: string,
   image_url?: {
     url?: string
+  },
+  input_audio?: {
+    format: string
+    data?: string
+    url?: string
   }
 }
 
@@ -103,12 +108,17 @@ export type LlamaCompletionOptions = {
   grammar_triggers?: { type: number; word: string; at_start: boolean }[]
   preserved_tokens?: string[]
   /**
-   * Path(s) to
-   * When provided, the
+   * Path(s) to media file(s) to process before generating text.
+   * When provided, the media will be processed and added to the context.
    * Requires multimodal support to be enabled via initMultimodal.
    * Supports both file paths and base64 data URLs.
    */
-
+  media_paths?: string | string[]
+  /**
+   * Guide tokens to use for audio completion.
+   * Help prevent hallucinations by forcing the TTS to use the correct words.
+   */
+  guide_tokens?: Int32Array
 }
 
 export type LlamaCompletionResult = {
@@ -137,10 +147,10 @@ export type LlamaCompletionToken = {
 
 export type TokenizeResult = {
   tokens: Int32Array
-
+  has_media: boolean
   bitmap_hashes: string[]
   chunk_pos: number[]
-
+  chunk_pos_media: number[]
 }
 
 export type EmbeddingResult = {
@@ -167,7 +177,7 @@ export interface LlamaContext {
     callback?: (token: LlamaCompletionToken) => void,
   ): Promise<LlamaCompletionResult>
   stopCompletion(): void
-  tokenize(text: string,
+  tokenize(text: string, media_paths?: string[]): Promise<TokenizeResult>
   detokenize(tokens: number[]): Promise<string>
   embedding(text: string): Promise<EmbeddingResult>
   saveSession(path: string): Promise<void>
@@ -189,11 +199,61 @@ export interface LlamaContext {
    */
   isMultimodalEnabled(): Promise<boolean>
 
+  /**
+   * Get multimodal support capabilities
+   * @returns Promise resolving to an object with vision and audio support
+   */
+  getMultimodalSupport(): Promise<{
+    vision: boolean
+    audio: boolean
+  }>
+
   /**
    * Release multimodal support
    */
   releaseMultimodal(): Promise<void>
 
+  /**
+   * Load a vocoder model
+   * @param path Path to the vocoder model
+   * @returns Promise resolving to true if loading was successful
+   */
+  initVocoder(path: string): Promise<boolean>
+
+  /**
+   * Unload the vocoder model
+   * @returns Promise resolving to true if unloading was successful
+   */
+  releaseVocoder(): Promise<void>
+
+  /**
+   * Check if the vocoder model is enabled
+   * @returns Promise resolving to true if the vocoder model is enabled
+   */
+  isVocoderEnabled(): boolean
+
+  /**
+   * Get the formatted prompt for audio completion
+   * @param speaker Speaker name or null
+   * @param text Text to complete
+   * @returns Formatted audio completion
+   */
+  getFormattedAudioCompletion(speaker: string|null, text: string): string
+
+  /**
+   * Get guide tokens for audio completion
+   * @param text Text to complete
+   * @returns Guide tokens
+   */
+  getAudioCompletionGuideTokens(text: string): Int32Array
+
+  /**
+   * Decode audio tokens to audio data
+   * @param tokens Tokens to decode
+   * @returns Decoded audio tokens
+   */
+  decodeAudioTokens(tokens: Int32Array): Promise<Float32Array>
+
   // static
   loadModelInfo(path: string, skip: string[]): Promise<Object>
   toggleNativeLog(
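Taken together, the vocoder additions to the LlamaContext interface describe a text-to-speech flow: load a vocoder model, build the audio prompt and guide tokens, run a completion with guide_tokens, then decode the generated audio tokens into PCM samples. A minimal TypeScript sketch of that flow, assuming the completion options also accept a plain prompt string and that the caller collects the generated audio token ids from the completion output (neither of which is shown in this diff):

import type { LlamaContext } from '@fugood/llama.node'

// Sketch only: the vocoder path is hypothetical, and `audioTokens` is a
// placeholder for ids collected from the completion output.
async function synthesize(ctx: LlamaContext, text: string): Promise<Float32Array> {
  // Load a WavTokenizer-style vocoder model.
  if (!(await ctx.initVocoder('./wavtokenizer.gguf'))) {
    throw new Error('Failed to load vocoder')
  }

  // Build the TTS prompt and the guide tokens that keep generation on-script.
  const prompt = ctx.getFormattedAudioCompletion(null, text)
  const guideTokens = ctx.getAudioCompletionGuideTokens(text)

  // Run the completion; guide_tokens is the new LlamaCompletionOptions field.
  await ctx.completion({ prompt, guide_tokens: guideTokens })

  // Placeholder: real ids come from the completion output (not shown in this diff).
  const audioTokens = new Int32Array([])

  // Decode audio tokens into Float32 PCM samples, then free the vocoder.
  const pcm = await ctx.decodeAudioTokens(audioTokens)
  await ctx.releaseVocoder()
  return pcm
}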
package/lib/index.js
CHANGED
@@ -23,10 +23,11 @@ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
 });
 };
 Object.defineProperty(exports, "__esModule", { value: true });
-exports.loadLlamaModelInfo = exports.initLlama = exports.loadModel = exports.toggleNativeLog = void 0;
+exports.loadLlamaModelInfo = exports.initLlama = exports.loadModel = exports.toggleNativeLog = exports.MTMD_DEFAULT_MEDIA_MARKER = void 0;
 exports.addNativeLogListener = addNativeLogListener;
 const binding_1 = require("./binding");
 __exportStar(require("./binding"), exports);
+exports.MTMD_DEFAULT_MEDIA_MARKER = '<__media__>';
 const mods = {};
 const logListeners = [];
 const logCallback = (level, text) => {
@@ -78,13 +79,13 @@ class LlamaContextWrapper {
     isLlamaChatSupported() {
         return !!this.ctx.getModelInfo().chatTemplates.llamaChat;
     }
-
+    _formatMediaChat(messages) {
         if (!messages)
             return {
                 messages,
-
+                has_media: false,
             };
-        const
+        const mediaPaths = [];
        return {
            messages: messages.map((msg) => {
                if (Array.isArray(msg.content)) {
@@ -93,10 +94,30 @@
                     // Handle multimodal content
                     if (part.type === 'image_url') {
                         let path = ((_a = part.image_url) === null || _a === void 0 ? void 0 : _a.url) || '';
-
+                        mediaPaths.push(path);
                         return {
                             type: 'text',
-                            text:
+                            text: exports.MTMD_DEFAULT_MEDIA_MARKER,
+                        };
+                    }
+                    else if (part.type === 'input_audio') {
+                        const { input_audio: audio } = part;
+                        if (!audio)
+                            throw new Error('input_audio is required');
+                        const { format } = audio;
+                        if (format != 'wav' && format != 'mp3') {
+                            throw new Error(`Unsupported audio format: ${format}`);
+                        }
+                        if (audio.url) {
+                            const path = audio.url.replace(/file:\/\//, '');
+                            mediaPaths.push(path);
+                        }
+                        else if (audio.data) {
+                            mediaPaths.push(audio.data);
+                        }
+                        return {
+                            type: 'text',
+                            text: exports.MTMD_DEFAULT_MEDIA_MARKER,
                         };
                     }
                     return part;
@@ -105,12 +126,12 @@
                 }
                 return msg;
             }),
-
-
+            has_media: mediaPaths.length > 0,
+            media_paths: mediaPaths,
         };
     }
     getFormattedChat(messages, template, params) {
-        const { messages: chat,
+        const { messages: chat, has_media, media_paths, } = this._formatMediaChat(messages);
         const useJinja = this.isJinjaSupported() && (params === null || params === void 0 ? void 0 : params.jinja);
         let tmpl;
         if (template)
@@ -127,25 +148,25 @@
             return {
                 type: 'llama-chat',
                 prompt: result,
-
-
+                has_media,
+                media_paths,
             };
         }
         const jinjaResult = result;
         jinjaResult.type = 'jinja';
-        jinjaResult.
-        jinjaResult.
+        jinjaResult.has_media = has_media;
+        jinjaResult.media_paths = media_paths;
        return jinjaResult;
    }
    completion(options, callback) {
-        const { messages,
-        return this.ctx.completion(Object.assign(Object.assign({}, options), { messages,
+        const { messages, media_paths = options.media_paths } = this._formatMediaChat(options.messages);
+        return this.ctx.completion(Object.assign(Object.assign({}, options), { messages, media_paths: options.media_paths || media_paths }), callback || (() => { }));
    }
    stopCompletion() {
        return this.ctx.stopCompletion();
    }
-    tokenize(text, {
-        return this.ctx.tokenize(text,
+    tokenize(text, { media_paths } = {}) {
+        return this.ctx.tokenize(text, media_paths);
    }
    detokenize(tokens) {
        return this.ctx.detokenize(tokens);
@@ -180,6 +201,27 @@
     releaseMultimodal() {
         return this.ctx.releaseMultimodal();
     }
+    getMultimodalSupport() {
+        return this.ctx.getMultimodalSupport();
+    }
+    initVocoder(path) {
+        return this.ctx.initVocoder(path);
+    }
+    releaseVocoder() {
+        return this.ctx.releaseVocoder();
+    }
+    isVocoderEnabled() {
+        return this.ctx.isVocoderEnabled();
+    }
+    getFormattedAudioCompletion(speaker, text) {
+        return this.ctx.getFormattedAudioCompletion(speaker, text);
+    }
+    getAudioCompletionGuideTokens(text) {
+        return this.ctx.getAudioCompletionGuideTokens(text);
+    }
+    decodeAudioTokens(tokens) {
+        return this.ctx.decodeAudioTokens(tokens);
+    }
 }
 const loadModel = (options) => __awaiter(void 0, void 0, void 0, function* () {
     var _a, _b;
package/lib/index.ts
CHANGED
@@ -14,6 +14,8 @@ import type {
 
 export * from './binding'
 
+export const MTMD_DEFAULT_MEDIA_MARKER = '<__media__>'
+
 export interface LlamaModelOptionsExtended extends LlamaModelOptions {
   lib_variant?: LibVariant
 }
@@ -63,8 +65,8 @@ const getJsonSchema = (responseFormat?: CompletionResponseFormat) => {
 export type FormattedChatResult = {
   type: 'jinja' | 'llama-chat'
   prompt: string
-
-
+  has_media: boolean
+  media_paths?: Array<string>
 }
 
 class LlamaContextWrapper {
@@ -91,17 +93,17 @@ class LlamaContextWrapper {
     return !!this.ctx.getModelInfo().chatTemplates.llamaChat
   }
 
-
+  _formatMediaChat(messages: ChatMessage[] | undefined): {
     messages: ChatMessage[] | undefined
-
-
+    has_media: boolean
+    media_paths?: string[]
   } {
     if (!messages)
       return {
         messages,
-
+        has_media: false,
      }
-    const
+    const mediaPaths: string[] = []
    return {
      messages: messages.map((msg) => {
        if (Array.isArray(msg.content)) {
@@ -109,10 +111,28 @@
           // Handle multimodal content
           if (part.type === 'image_url') {
             let path = part.image_url?.url || ''
-
+            mediaPaths.push(path)
             return {
               type: 'text',
-              text:
+              text: MTMD_DEFAULT_MEDIA_MARKER,
+            }
+          } else if (part.type === 'input_audio') {
+            const { input_audio: audio } = part
+            if (!audio) throw new Error('input_audio is required')
+
+            const { format } = audio
+            if (format != 'wav' && format != 'mp3') {
+              throw new Error(`Unsupported audio format: ${format}`)
+            }
+            if (audio.url) {
+              const path = audio.url.replace(/file:\/\//, '')
+              mediaPaths.push(path)
+            } else if (audio.data) {
+              mediaPaths.push(audio.data)
+            }
+            return {
+              type: 'text',
+              text: MTMD_DEFAULT_MEDIA_MARKER,
             }
           }
           return part
@@ -125,8 +145,8 @@ class LlamaContextWrapper {
         }
         return msg
       }),
-
-
+      has_media: mediaPaths.length > 0,
+      media_paths: mediaPaths,
     }
   }
 
@@ -143,9 +163,9 @@
   ): FormattedChatResult {
     const {
       messages: chat,
-
-
-    } = this.
+      has_media,
+      media_paths,
+    } = this._formatMediaChat(messages)
 
     const useJinja = this.isJinjaSupported() && params?.jinja
     let tmpl
@@ -164,14 +184,14 @@
       return {
         type: 'llama-chat',
         prompt: result as string,
-
-
+        has_media,
+        media_paths,
       }
     }
     const jinjaResult = result
     jinjaResult.type = 'jinja'
-    jinjaResult.
-    jinjaResult.
+    jinjaResult.has_media = has_media
+    jinjaResult.media_paths = media_paths
    return jinjaResult
  }
 
@@ -179,12 +199,12 @@
     options: LlamaCompletionOptions,
     callback?: (token: LlamaCompletionToken) => void,
   ): Promise<LlamaCompletionResult> {
-    const { messages,
-      this.
+    const { messages, media_paths = options.media_paths } =
+      this._formatMediaChat(options.messages)
     return this.ctx.completion({
       ...options,
       messages,
-
+      media_paths: options.media_paths || media_paths,
     }, callback || (() => {}))
   }
 
@@ -192,8 +212,8 @@
     return this.ctx.stopCompletion()
   }
 
-  tokenize(text: string, {
-    return this.ctx.tokenize(text,
+  tokenize(text: string, { media_paths }: { media_paths?: string[] } = {}): Promise<TokenizeResult> {
+    return this.ctx.tokenize(text, media_paths)
   }
 
   detokenize(tokens: number[]): Promise<string> {
@@ -242,6 +262,37 @@
   releaseMultimodal(): Promise<void> {
     return this.ctx.releaseMultimodal()
   }
+
+  getMultimodalSupport(): Promise<{
+    vision: boolean
+    audio: boolean
+  }> {
+    return this.ctx.getMultimodalSupport()
+  }
+
+  initVocoder(path: string): Promise<boolean> {
+    return this.ctx.initVocoder(path)
+  }
+
+  releaseVocoder(): Promise<void> {
+    return this.ctx.releaseVocoder()
+  }
+
+  isVocoderEnabled(): boolean {
+    return this.ctx.isVocoderEnabled()
+  }
+
+  getFormattedAudioCompletion(speaker: string|null, text: string): string {
+    return this.ctx.getFormattedAudioCompletion(speaker, text)
+  }
+
+  getAudioCompletionGuideTokens(text: string): Int32Array {
+    return this.ctx.getAudioCompletionGuideTokens(text)
+  }
+
+  decodeAudioTokens(tokens: number[]|Int32Array): Promise<Float32Array> {
+    return this.ctx.decodeAudioTokens(tokens)
+  }
 }
 
 export const loadModel = async (
package/package.json
CHANGED
(version bumped 0.4.7 → 0.6.0; full diff not shown)
package/src/DecodeAudioTokenWorker.cpp
ADDED
@@ -0,0 +1,40 @@
+#include "DecodeAudioTokenWorker.h"
+#include "tts_utils.h"
+#include <vector>
+
+DecodeAudioTokenWorker::DecodeAudioTokenWorker(
+    const Napi::CallbackInfo &info, llama_model *model, llama_context *ctx,
+    int n_threads, const std::vector<llama_token> &tokens)
+    : AsyncWorker(info.Env()), Deferred(info.Env()), _model(model), _ctx(ctx),
+      _n_threads(n_threads), _tokens(tokens) {}
+
+void DecodeAudioTokenWorker::Execute() {
+  const int n_codes = _tokens.size();
+  llama_batch batch = llama_batch_init(n_codes, 0, 1);
+  for (size_t i = 0; i < _tokens.size(); ++i) {
+    common_batch_add(batch, _tokens[i], i, {0}, true);
+  }
+  if (batch.n_tokens != n_codes) {
+    SetError("batch.n_tokens != n_codes");
+    return;
+  }
+  if (llama_encode(_ctx, batch) != 0) {
+    SetError("llama_encode() failed");
+    return;
+  }
+  llama_synchronize(_ctx);
+  const int n_embd = llama_model_n_embd(_model);
+  const float *embd = llama_get_embeddings(_ctx);
+  _result = embd_to_audio(embd, n_codes, n_embd, _n_threads);
+}
+
+void DecodeAudioTokenWorker::OnOK() {
+  auto result =
+      Napi::Float32Array::New(Napi::AsyncWorker::Env(), _result.size());
+  memcpy(result.Data(), _result.data(), _result.size() * sizeof(float));
+  Napi::Promise::Deferred::Resolve(result);
+}
+
+void DecodeAudioTokenWorker::OnError(const Napi::Error &err) {
+  Napi::Promise::Deferred::Reject(err.Value());
+}
package/src/DecodeAudioTokenWorker.h
ADDED
@@ -0,0 +1,22 @@
+#include "common.hpp"
+#include <vector>
+
+class DecodeAudioTokenWorker : public Napi::AsyncWorker,
+                               public Napi::Promise::Deferred {
+public:
+  DecodeAudioTokenWorker(const Napi::CallbackInfo &info, llama_model *model,
+                         llama_context *ctx, int n_threads,
+                         const std::vector<llama_token> &tokens);
+
+protected:
+  void Execute();
+  void OnOK();
+  void OnError(const Napi::Error &err);
+
+private:
+  llama_model *_model;
+  llama_context *_ctx;
+  int _n_threads;
+  std::vector<llama_token> _tokens;
+  std::vector<float> _result;
+};
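This worker is what backs decodeAudioTokens on the JS side: it resolves with raw Float32 PCM samples (whatever embd_to_audio produces), so the data still needs a container before it can be played. A small TypeScript sketch that writes the samples to a 16-bit mono WAV file; the 24 kHz sample rate is an assumption (typical for WavTokenizer-style vocoders) and is not stated anywhere in this diff:

import { writeFileSync } from 'fs'

// Write Float32 PCM samples to a 16-bit mono WAV file.
// sampleRate = 24000 Hz is an assumption, not taken from the diff.
function writeWav(path: string, pcm: Float32Array, sampleRate = 24000): void {
  const bytesPerSample = 2
  const dataSize = pcm.length * bytesPerSample
  const buf = Buffer.alloc(44 + dataSize)

  buf.write('RIFF', 0)
  buf.writeUInt32LE(36 + dataSize, 4)
  buf.write('WAVE', 8)
  buf.write('fmt ', 12)
  buf.writeUInt32LE(16, 16)                          // fmt chunk size
  buf.writeUInt16LE(1, 20)                           // audio format: PCM
  buf.writeUInt16LE(1, 22)                           // channels: mono
  buf.writeUInt32LE(sampleRate, 24)
  buf.writeUInt32LE(sampleRate * bytesPerSample, 28) // byte rate
  buf.writeUInt16LE(bytesPerSample, 32)              // block align
  buf.writeUInt16LE(16, 34)                          // bits per sample
  buf.write('data', 36)
  buf.writeUInt32LE(dataSize, 40)

  // Clamp each float sample to [-1, 1] and convert to signed 16-bit.
  for (let i = 0; i < pcm.length; i += 1) {
    const s = Math.max(-1, Math.min(1, pcm[i]))
    buf.writeInt16LE(Math.round(s * 32767), 44 + i * bytesPerSample)
  }
  writeFileSync(path, buf)
}

// Usage: const pcm = await ctx.decodeAudioTokens(audioTokens); writeWav('out.wav', pcm)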
package/src/EmbeddingWorker.cpp
CHANGED
@@ -2,8 +2,10 @@
 #include "LlamaContext.h"
 
 EmbeddingWorker::EmbeddingWorker(const Napi::CallbackInfo &info,
-                                 LlamaSessionPtr &sess, std::string text,
-
+                                 LlamaSessionPtr &sess, std::string text,
+                                 common_params &params)
+    : AsyncWorker(info.Env()), Deferred(info.Env()), _sess(sess), _text(text),
+      _params(params) {}
 
 void EmbeddingWorker::Execute() {
   llama_kv_self_clear(_sess->context());
@@ -17,8 +19,7 @@ void EmbeddingWorker::Execute() {
   do {
     auto ctx = _sess->context();
     int ret =
-        llama_decode(ctx,
-                     llama_batch_get_one(tokens.data(), tokens.size()));
+        llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size()));
     if (ret < 0) {
       SetError("Failed to inference, code: " + std::to_string(ret));
       break;
@@ -37,7 +38,8 @@ void EmbeddingWorker::Execute() {
     }
     _result.embedding.resize(n_embd);
     std::vector<float> embedding(embd, embd + n_embd), out(embd, embd + n_embd);
-
+    common_embd_normalize(embedding.data(), out.data(), n_embd,
+                          _params.embd_normalize);
     memcpy(_result.embedding.data(), out.data(), n_embd * sizeof(float));
   } while (false);
 }
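The extra common_params argument lets the embedding worker normalize its output with common_embd_normalize, so the vectors handed back to JavaScript are ready for cosine similarity via a plain dot product. A short sketch, assuming the result object exposes the vector as `embedding` (the field name used by the C++ worker; the exact TypeScript shape is not visible in this diff):

import type { LlamaContext } from '@fugood/llama.node'

// Cosine similarity between two texts; with normalized embeddings the
// dot product already is the cosine similarity.
async function similarity(ctx: LlamaContext, a: string, b: string): Promise<number> {
  const { embedding: ea } = await ctx.embedding(a)
  const { embedding: eb } = await ctx.embedding(b)
  let dot = 0
  for (let i = 0; i < ea.length; i += 1) dot += ea[i] * eb[i]
  return dot
}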