@fugood/llama.node 0.4.7 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (98)
  1. package/CMakeLists.txt +4 -0
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-cuda/arm64/llama-node.node +0 -0
  7. package/bin/linux-cuda/x64/llama-node.node +0 -0
  8. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  9. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  10. package/lib/binding.ts +66 -6
  11. package/lib/index.js +59 -17
  12. package/lib/index.ts +74 -23
  13. package/package.json +1 -1
  14. package/src/DecodeAudioTokenWorker.cpp +40 -0
  15. package/src/DecodeAudioTokenWorker.h +22 -0
  16. package/src/EmbeddingWorker.cpp +7 -5
  17. package/src/LlamaCompletionWorker.cpp +68 -54
  18. package/src/LlamaCompletionWorker.h +7 -8
  19. package/src/LlamaContext.cpp +551 -235
  20. package/src/LlamaContext.h +26 -4
  21. package/src/LoadSessionWorker.cpp +4 -2
  22. package/src/SaveSessionWorker.cpp +10 -6
  23. package/src/TokenizeWorker.cpp +23 -14
  24. package/src/TokenizeWorker.h +2 -2
  25. package/src/addons.cc +8 -11
  26. package/src/common.hpp +129 -126
  27. package/src/llama.cpp/.github/workflows/build.yml +2 -2
  28. package/src/llama.cpp/.github/workflows/release.yml +152 -129
  29. package/src/llama.cpp/.github/workflows/winget.yml +42 -0
  30. package/src/llama.cpp/common/arg.cpp +14 -13
  31. package/src/llama.cpp/common/common.cpp +4 -75
  32. package/src/llama.cpp/common/common.h +7 -12
  33. package/src/llama.cpp/examples/lookahead/lookahead.cpp +0 -13
  34. package/src/llama.cpp/examples/lookup/lookup.cpp +0 -11
  35. package/src/llama.cpp/examples/parallel/parallel.cpp +0 -9
  36. package/src/llama.cpp/examples/retrieval/retrieval.cpp +6 -6
  37. package/src/llama.cpp/examples/simple/simple.cpp +1 -1
  38. package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +2 -2
  39. package/src/llama.cpp/examples/sycl/run-llama2.sh +4 -4
  40. package/src/llama.cpp/examples/sycl/run-llama3.sh +28 -0
  41. package/src/llama.cpp/examples/sycl/win-run-llama2.bat +1 -1
  42. package/src/llama.cpp/examples/sycl/win-run-llama3.bat +9 -0
  43. package/src/llama.cpp/ggml/include/ggml-opt.h +2 -0
  44. package/src/llama.cpp/ggml/include/ggml.h +11 -0
  45. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +274 -0
  46. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +27 -0
  47. package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +18 -2
  48. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1 -0
  49. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +107 -0
  50. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +16 -0
  51. package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +8 -2
  52. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +315 -155
  53. package/src/llama.cpp/ggml/src/ggml-opt.cpp +5 -0
  54. package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +43 -12
  55. package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +171 -112
  56. package/src/llama.cpp/ggml/src/ggml.c +64 -18
  57. package/src/llama.cpp/include/llama.h +24 -124
  58. package/src/llama.cpp/requirements/requirements-convert_hf_to_gguf.txt +5 -1
  59. package/src/llama.cpp/requirements/requirements-convert_hf_to_gguf_update.txt +5 -1
  60. package/src/llama.cpp/requirements/requirements-convert_lora_to_gguf.txt +2 -0
  61. package/src/llama.cpp/src/llama-batch.cpp +3 -1
  62. package/src/llama.cpp/src/llama-context.cpp +60 -110
  63. package/src/llama.cpp/src/llama-graph.cpp +137 -233
  64. package/src/llama.cpp/src/llama-graph.h +49 -7
  65. package/src/llama.cpp/src/llama-hparams.cpp +17 -1
  66. package/src/llama.cpp/src/llama-hparams.h +34 -5
  67. package/src/llama.cpp/src/llama-kv-cache.cpp +654 -321
  68. package/src/llama.cpp/src/llama-kv-cache.h +201 -85
  69. package/src/llama.cpp/src/llama-memory.h +3 -2
  70. package/src/llama.cpp/src/llama-model.cpp +273 -94
  71. package/src/llama.cpp/src/llama-model.h +4 -1
  72. package/src/llama.cpp/tests/test-arg-parser.cpp +1 -1
  73. package/src/llama.cpp/tools/llama-bench/llama-bench.cpp +1 -0
  74. package/src/llama.cpp/tools/mtmd/CMakeLists.txt +13 -2
  75. package/src/llama.cpp/tools/mtmd/clip-impl.h +108 -11
  76. package/src/llama.cpp/tools/mtmd/clip.cpp +466 -88
  77. package/src/llama.cpp/tools/mtmd/clip.h +6 -4
  78. package/src/llama.cpp/tools/mtmd/miniaudio.h +93468 -0
  79. package/src/llama.cpp/tools/mtmd/mtmd-audio.cpp +855 -0
  80. package/src/llama.cpp/tools/mtmd/mtmd-audio.h +62 -0
  81. package/src/llama.cpp/tools/mtmd/mtmd-cli.cpp +21 -14
  82. package/src/llama.cpp/tools/mtmd/mtmd-helper.cpp +36 -49
  83. package/src/llama.cpp/tools/mtmd/mtmd.cpp +362 -98
  84. package/src/llama.cpp/tools/mtmd/mtmd.h +52 -21
  85. package/src/llama.cpp/tools/run/run.cpp +2 -2
  86. package/src/llama.cpp/tools/server/server.cpp +158 -47
  87. package/src/llama.cpp/tools/server/utils.hpp +71 -43
  88. package/src/llama.cpp/tools/tts/tts.cpp +4 -2
  89. package/src/tts_utils.cpp +342 -0
  90. package/src/tts_utils.h +62 -0
  91. package/bin/win32/arm64/llama-node.node +0 -0
  92. package/bin/win32/arm64/node.lib +0 -0
  93. package/bin/win32/x64/llama-node.node +0 -0
  94. package/bin/win32/x64/node.lib +0 -0
  95. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  96. package/bin/win32-vulkan/arm64/node.lib +0 -0
  97. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  98. package/bin/win32-vulkan/x64/node.lib +0 -0
package/CMakeLists.txt CHANGED
@@ -102,6 +102,10 @@ file(
    "src/LoadSessionWorker.h"
    "src/SaveSessionWorker.cpp"
    "src/SaveSessionWorker.h"
+   "src/DecodeAudioTokenWorker.cpp"
+   "src/DecodeAudioTokenWorker.h"
+   "src/tts_utils.cpp"
+   "src/tts_utils.h"
  )
 
  add_library(${PROJECT_NAME} SHARED ${SOURCE_FILES} ${CMAKE_JS_SRC})
Binary files under package/bin/ changed (no textual diff shown).
package/lib/binding.ts CHANGED
@@ -6,6 +6,11 @@ export type MessagePart = {
   text?: string,
   image_url?: {
     url?: string
+  },
+  input_audio?: {
+    format: string
+    data?: string
+    url?: string
   }
 }
 
@@ -103,12 +108,17 @@ export type LlamaCompletionOptions = {
   grammar_triggers?: { type: number; word: string; at_start: boolean }[]
   preserved_tokens?: string[]
   /**
-   * Path(s) to image file(s) to process before generating text.
-   * When provided, the image(s) will be processed and added to the context.
+   * Path(s) to media file(s) to process before generating text.
+   * When provided, the media will be processed and added to the context.
    * Requires multimodal support to be enabled via initMultimodal.
    * Supports both file paths and base64 data URLs.
    */
-  image_paths?: string | string[]
+  media_paths?: string | string[]
+  /**
+   * Guide tokens to use for audio completion.
+   * Help prevent hallucinations by forcing the TTS to use the correct words.
+   */
+  guide_tokens?: Int32Array
 }
 
 export type LlamaCompletionResult = {
@@ -137,10 +147,10 @@ export type LlamaCompletionToken = {
 
 export type TokenizeResult = {
   tokens: Int32Array
-  has_image: boolean
+  has_media: boolean
   bitmap_hashes: string[]
   chunk_pos: number[]
-  chunk_pos_images: number[]
+  chunk_pos_media: number[]
 }
 
 export type EmbeddingResult = {
@@ -167,7 +177,7 @@ export interface LlamaContext {
     callback?: (token: LlamaCompletionToken) => void,
   ): Promise<LlamaCompletionResult>
   stopCompletion(): void
-  tokenize(text: string, image_paths?: string[]): Promise<TokenizeResult>
+  tokenize(text: string, media_paths?: string[]): Promise<TokenizeResult>
   detokenize(tokens: number[]): Promise<string>
   embedding(text: string): Promise<EmbeddingResult>
   saveSession(path: string): Promise<void>
@@ -189,11 +199,61 @@ export interface LlamaContext {
    */
   isMultimodalEnabled(): Promise<boolean>
 
+  /**
+   * Get multimodal support capabilities
+   * @returns Promise resolving to an object with vision and audio support
+   */
+  getMultimodalSupport(): Promise<{
+    vision: boolean
+    audio: boolean
+  }>
+
   /**
    * Release multimodal support
    */
   releaseMultimodal(): Promise<void>
 
+  /**
+   * Load a vocoder model
+   * @param path Path to the vocoder model
+   * @returns Promise resolving to true if loading was successful
+   */
+  initVocoder(path: string): Promise<boolean>
+
+  /**
+   * Unload the vocoder model
+   * @returns Promise resolving to true if unloading was successful
+   */
+  releaseVocoder(): Promise<void>
+
+  /**
+   * Check if the vocoder model is enabled
+   * @returns Promise resolving to true if the vocoder model is enabled
+   */
+  isVocoderEnabled(): boolean
+
+  /**
+   * Get the formatted prompt for audio completion
+   * @param speaker Speaker name or null
+   * @param text Text to complete
+   * @returns Formatted audio completion
+   */
+  getFormattedAudioCompletion(speaker: string|null, text: string): string
+
+  /**
+   * Get guide tokens for audio completion
+   * @param text Text to complete
+   * @returns Guide tokens
+   */
+  getAudioCompletionGuideTokens(text: string): Int32Array
+
+  /**
+   * Decode audio tokens to audio data
+   * @param tokens Tokens to decode
+   * @returns Decoded audio tokens
+   */
+  decodeAudioTokens(tokens: Int32Array): Promise<Float32Array>
+
   // static
   loadModelInfo(path: string, skip: string[]): Promise<Object>
   toggleNativeLog(
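Note: the interface additions above form the new text-to-speech flow (vocoder load, audio prompt formatting, guide tokens, audio-token decoding). A minimal sketch of how they could be wired together is shown below; the model file names, the `prompt` completion option, and the step that extracts audio tokens from the completion result are assumptions for illustration and are not defined by this diff.

```ts
import { loadModel } from '@fugood/llama.node'

async function ttsSketch() {
  // Hypothetical paths: a TTS-capable LLM plus a vocoder model.
  const ctx = await loadModel({ model: './tts-model.gguf' })
  await ctx.initVocoder('./vocoder-model.gguf')
  console.log('vocoder ready:', ctx.isVocoderEnabled())

  const text = 'Hello from llama.node'
  const prompt = ctx.getFormattedAudioCompletion(null, text)   // null = default speaker
  const guide_tokens = ctx.getAudioCompletionGuideTokens(text) // keeps the TTS on-script

  // Assumption: completion accepts a raw prompt string alongside guide_tokens.
  const result = await ctx.completion({ prompt, guide_tokens })
  console.log(result)

  // Assumption: the audio codes are collected from the generated tokens, then
  // decoded to PCM samples via the vocoder.
  // const audio: Float32Array = await ctx.decodeAudioTokens(audioTokens)

  await ctx.releaseVocoder()
}
```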
package/lib/index.js CHANGED
@@ -23,10 +23,11 @@ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, ge
     });
 };
 Object.defineProperty(exports, "__esModule", { value: true });
-exports.loadLlamaModelInfo = exports.initLlama = exports.loadModel = exports.toggleNativeLog = void 0;
+exports.loadLlamaModelInfo = exports.initLlama = exports.loadModel = exports.toggleNativeLog = exports.MTMD_DEFAULT_MEDIA_MARKER = void 0;
 exports.addNativeLogListener = addNativeLogListener;
 const binding_1 = require("./binding");
 __exportStar(require("./binding"), exports);
+exports.MTMD_DEFAULT_MEDIA_MARKER = '<__media__>';
 const mods = {};
 const logListeners = [];
 const logCallback = (level, text) => {
@@ -78,13 +79,13 @@ class LlamaContextWrapper {
     isLlamaChatSupported() {
         return !!this.ctx.getModelInfo().chatTemplates.llamaChat;
     }
-    _formatImageChat(messages) {
+    _formatMediaChat(messages) {
         if (!messages)
             return {
                 messages,
-                has_image: false,
+                has_media: false,
             };
-        const imagePaths = [];
+        const mediaPaths = [];
         return {
             messages: messages.map((msg) => {
                 if (Array.isArray(msg.content)) {
@@ -93,10 +94,30 @@ class LlamaContextWrapper {
                     // Handle multimodal content
                     if (part.type === 'image_url') {
                         let path = ((_a = part.image_url) === null || _a === void 0 ? void 0 : _a.url) || '';
-                        imagePaths.push(path);
+                        mediaPaths.push(path);
                         return {
                             type: 'text',
-                            text: '<__image__>',
+                            text: exports.MTMD_DEFAULT_MEDIA_MARKER,
+                        };
+                    }
+                    else if (part.type === 'input_audio') {
+                        const { input_audio: audio } = part;
+                        if (!audio)
+                            throw new Error('input_audio is required');
+                        const { format } = audio;
+                        if (format != 'wav' && format != 'mp3') {
+                            throw new Error(`Unsupported audio format: ${format}`);
+                        }
+                        if (audio.url) {
+                            const path = audio.url.replace(/file:\/\//, '');
+                            mediaPaths.push(path);
+                        }
+                        else if (audio.data) {
+                            mediaPaths.push(audio.data);
+                        }
+                        return {
+                            type: 'text',
+                            text: exports.MTMD_DEFAULT_MEDIA_MARKER,
                         };
                     }
                     return part;
@@ -105,12 +126,12 @@ class LlamaContextWrapper {
                 }
                 return msg;
             }),
-            has_image: imagePaths.length > 0,
-            image_paths: imagePaths,
+            has_media: mediaPaths.length > 0,
+            media_paths: mediaPaths,
         };
     }
     getFormattedChat(messages, template, params) {
-        const { messages: chat, has_image, image_paths, } = this._formatImageChat(messages);
+        const { messages: chat, has_media, media_paths, } = this._formatMediaChat(messages);
         const useJinja = this.isJinjaSupported() && (params === null || params === void 0 ? void 0 : params.jinja);
         let tmpl;
         if (template)
@@ -127,25 +148,25 @@
             return {
                 type: 'llama-chat',
                 prompt: result,
-                has_image,
-                image_paths,
+                has_media,
+                media_paths,
             };
         }
         const jinjaResult = result;
         jinjaResult.type = 'jinja';
-        jinjaResult.has_image = has_image;
-        jinjaResult.image_paths = image_paths;
+        jinjaResult.has_media = has_media;
+        jinjaResult.media_paths = media_paths;
         return jinjaResult;
     }
     completion(options, callback) {
-        const { messages, image_paths = options.image_paths } = this._formatImageChat(options.messages);
-        return this.ctx.completion(Object.assign(Object.assign({}, options), { messages, image_paths: options.image_paths || image_paths }), callback || (() => { }));
+        const { messages, media_paths = options.media_paths } = this._formatMediaChat(options.messages);
+        return this.ctx.completion(Object.assign(Object.assign({}, options), { messages, media_paths: options.media_paths || media_paths }), callback || (() => { }));
    }
    stopCompletion() {
        return this.ctx.stopCompletion();
    }
-    tokenize(text, { image_paths } = {}) {
-        return this.ctx.tokenize(text, image_paths);
+    tokenize(text, { media_paths } = {}) {
+        return this.ctx.tokenize(text, media_paths);
    }
    detokenize(tokens) {
        return this.ctx.detokenize(tokens);
@@ -180,6 +201,27 @@
    releaseMultimodal() {
        return this.ctx.releaseMultimodal();
    }
+    getMultimodalSupport() {
+        return this.ctx.getMultimodalSupport();
+    }
+    initVocoder(path) {
+        return this.ctx.initVocoder(path);
+    }
+    releaseVocoder() {
+        return this.ctx.releaseVocoder();
+    }
+    isVocoderEnabled() {
+        return this.ctx.isVocoderEnabled();
+    }
+    getFormattedAudioCompletion(speaker, text) {
+        return this.ctx.getFormattedAudioCompletion(speaker, text);
+    }
+    getAudioCompletionGuideTokens(text) {
+        return this.ctx.getAudioCompletionGuideTokens(text);
+    }
+    decodeAudioTokens(tokens) {
+        return this.ctx.decodeAudioTokens(tokens);
+    }
 }
 const loadModel = (options) => __awaiter(void 0, void 0, void 0, function* () {
     var _a, _b;
package/lib/index.ts CHANGED
@@ -14,6 +14,8 @@ import type {
 
 export * from './binding'
 
+export const MTMD_DEFAULT_MEDIA_MARKER = '<__media__>'
+
 export interface LlamaModelOptionsExtended extends LlamaModelOptions {
   lib_variant?: LibVariant
 }
@@ -63,8 +65,8 @@ const getJsonSchema = (responseFormat?: CompletionResponseFormat) => {
 export type FormattedChatResult = {
   type: 'jinja' | 'llama-chat'
   prompt: string
-  has_image: boolean
-  image_paths?: Array<string>
+  has_media: boolean
+  media_paths?: Array<string>
 }
 
 class LlamaContextWrapper {
@@ -91,17 +93,17 @@ class LlamaContextWrapper {
     return !!this.ctx.getModelInfo().chatTemplates.llamaChat
   }
 
-  _formatImageChat(messages: ChatMessage[] | undefined): {
+  _formatMediaChat(messages: ChatMessage[] | undefined): {
     messages: ChatMessage[] | undefined
-    has_image: boolean
-    image_paths?: string[]
+    has_media: boolean
+    media_paths?: string[]
   } {
     if (!messages)
       return {
         messages,
-        has_image: false,
+        has_media: false,
       }
-    const imagePaths: string[] = []
+    const mediaPaths: string[] = []
     return {
       messages: messages.map((msg) => {
         if (Array.isArray(msg.content)) {
@@ -109,10 +111,28 @@ class LlamaContextWrapper {
             // Handle multimodal content
             if (part.type === 'image_url') {
               let path = part.image_url?.url || ''
-              imagePaths.push(path)
+              mediaPaths.push(path)
               return {
                 type: 'text',
-                text: '<__image__>',
+                text: MTMD_DEFAULT_MEDIA_MARKER,
+              }
+            } else if (part.type === 'input_audio') {
+              const { input_audio: audio } = part
+              if (!audio) throw new Error('input_audio is required')
+
+              const { format } = audio
+              if (format != 'wav' && format != 'mp3') {
+                throw new Error(`Unsupported audio format: ${format}`)
+              }
+              if (audio.url) {
+                const path = audio.url.replace(/file:\/\//, '')
+                mediaPaths.push(path)
+              } else if (audio.data) {
+                mediaPaths.push(audio.data)
+              }
+              return {
+                type: 'text',
+                text: MTMD_DEFAULT_MEDIA_MARKER,
              }
            }
            return part
@@ -125,8 +145,8 @@ class LlamaContextWrapper {
        }
        return msg
      }),
-      has_image: imagePaths.length > 0,
-      image_paths: imagePaths,
+      has_media: mediaPaths.length > 0,
+      media_paths: mediaPaths,
    }
  }
 
@@ -143,9 +163,9 @@ class LlamaContextWrapper {
  ): FormattedChatResult {
    const {
      messages: chat,
-      has_image,
-      image_paths,
-    } = this._formatImageChat(messages)
+      has_media,
+      media_paths,
+    } = this._formatMediaChat(messages)
 
    const useJinja = this.isJinjaSupported() && params?.jinja
    let tmpl
@@ -164,14 +184,14 @@ class LlamaContextWrapper {
      return {
        type: 'llama-chat',
        prompt: result as string,
-        has_image,
-        image_paths,
+        has_media,
+        media_paths,
      }
    }
    const jinjaResult = result
    jinjaResult.type = 'jinja'
-    jinjaResult.has_image = has_image
-    jinjaResult.image_paths = image_paths
+    jinjaResult.has_media = has_media
+    jinjaResult.media_paths = media_paths
    return jinjaResult
  }
 
@@ -179,12 +199,12 @@ class LlamaContextWrapper {
    options: LlamaCompletionOptions,
    callback?: (token: LlamaCompletionToken) => void,
  ): Promise<LlamaCompletionResult> {
-    const { messages, image_paths = options.image_paths } =
-      this._formatImageChat(options.messages)
+    const { messages, media_paths = options.media_paths } =
+      this._formatMediaChat(options.messages)
    return this.ctx.completion({
      ...options,
      messages,
-      image_paths: options.image_paths || image_paths,
+      media_paths: options.media_paths || media_paths,
    }, callback || (() => {}))
  }
 
@@ -192,8 +212,8 @@ class LlamaContextWrapper {
    return this.ctx.stopCompletion()
  }
 
-  tokenize(text: string, { image_paths }: { image_paths?: string[] } = {}): Promise<TokenizeResult> {
-    return this.ctx.tokenize(text, image_paths)
+  tokenize(text: string, { media_paths }: { media_paths?: string[] } = {}): Promise<TokenizeResult> {
+    return this.ctx.tokenize(text, media_paths)
  }
 
  detokenize(tokens: number[]): Promise<string> {
@@ -242,6 +262,37 @@ class LlamaContextWrapper {
  releaseMultimodal(): Promise<void> {
    return this.ctx.releaseMultimodal()
  }
+
+  getMultimodalSupport(): Promise<{
+    vision: boolean
+    audio: boolean
+  }> {
+    return this.ctx.getMultimodalSupport()
+  }
+
+  initVocoder(path: string): Promise<boolean> {
+    return this.ctx.initVocoder(path)
+  }
+
+  releaseVocoder(): Promise<void> {
+    return this.ctx.releaseVocoder()
+  }
+
+  isVocoderEnabled(): boolean {
+    return this.ctx.isVocoderEnabled()
+  }
+
+  getFormattedAudioCompletion(speaker: string|null, text: string): string {
+    return this.ctx.getFormattedAudioCompletion(speaker, text)
+  }
+
+  getAudioCompletionGuideTokens(text: string): Int32Array {
+    return this.ctx.getAudioCompletionGuideTokens(text)
+  }
+
+  decodeAudioTokens(tokens: number[]|Int32Array): Promise<Float32Array> {
+    return this.ctx.decodeAudioTokens(tokens)
+  }
 }
 
 export const loadModel = async (
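Note: `_formatMediaChat` above maps `image_url` and `input_audio` message parts to `media_paths` entries and replaces each part with the `MTMD_DEFAULT_MEDIA_MARKER` ('<__media__>') placeholder text. A hedged usage sketch follows; the `initMultimodal` argument shape, the model and mmproj file names, and the shape of the completion result are placeholders for illustration, not something this diff defines.

```ts
import { loadModel } from '@fugood/llama.node'

async function audioChatSketch() {
  const ctx = await loadModel({ model: './model.gguf' }) // placeholder path
  // Assumption: initMultimodal takes the multimodal projector path.
  await ctx.initMultimodal({ path: './mmproj.gguf' })
  console.log(await ctx.getMultimodalSupport()) // { vision: ..., audio: ... }

  const result = await ctx.completion({
    messages: [
      {
        role: 'user',
        content: [
          { type: 'text', text: 'Transcribe this clip.' },
          // Becomes a media_paths entry plus the '<__media__>' marker.
          { type: 'input_audio', input_audio: { format: 'wav', url: 'file:///tmp/clip.wav' } },
        ],
      },
    ],
  })
  console.log(result)
}
```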
package/package.json CHANGED
@@ -1,7 +1,7 @@
 {
   "name": "@fugood/llama.node",
   "access": "public",
-  "version": "0.4.7",
+  "version": "0.6.0",
   "description": "An another Node binding of llama.cpp",
   "main": "lib/index.js",
   "scripts": {
package/src/DecodeAudioTokenWorker.cpp ADDED
@@ -0,0 +1,40 @@
+#include "DecodeAudioTokenWorker.h"
+#include "tts_utils.h"
+#include <vector>
+
+DecodeAudioTokenWorker::DecodeAudioTokenWorker(
+    const Napi::CallbackInfo &info, llama_model *model, llama_context *ctx,
+    int n_threads, const std::vector<llama_token> &tokens)
+    : AsyncWorker(info.Env()), Deferred(info.Env()), _model(model), _ctx(ctx),
+      _n_threads(n_threads), _tokens(tokens) {}
+
+void DecodeAudioTokenWorker::Execute() {
+  const int n_codes = _tokens.size();
+  llama_batch batch = llama_batch_init(n_codes, 0, 1);
+  for (size_t i = 0; i < _tokens.size(); ++i) {
+    common_batch_add(batch, _tokens[i], i, {0}, true);
+  }
+  if (batch.n_tokens != n_codes) {
+    SetError("batch.n_tokens != n_codes");
+    return;
+  }
+  if (llama_encode(_ctx, batch) != 0) {
+    SetError("llama_encode() failed");
+    return;
+  }
+  llama_synchronize(_ctx);
+  const int n_embd = llama_model_n_embd(_model);
+  const float *embd = llama_get_embeddings(_ctx);
+  _result = embd_to_audio(embd, n_codes, n_embd, _n_threads);
+}
+
+void DecodeAudioTokenWorker::OnOK() {
+  auto result =
+      Napi::Float32Array::New(Napi::AsyncWorker::Env(), _result.size());
+  memcpy(result.Data(), _result.data(), _result.size() * sizeof(float));
+  Napi::Promise::Deferred::Resolve(result);
+}
+
+void DecodeAudioTokenWorker::OnError(const Napi::Error &err) {
+  Napi::Promise::Deferred::Reject(err.Value());
+}
package/src/DecodeAudioTokenWorker.h ADDED
@@ -0,0 +1,22 @@
+#include "common.hpp"
+#include <vector>
+
+class DecodeAudioTokenWorker : public Napi::AsyncWorker,
+                               public Napi::Promise::Deferred {
+public:
+  DecodeAudioTokenWorker(const Napi::CallbackInfo &info, llama_model *model,
+                         llama_context *ctx, int n_threads,
+                         const std::vector<llama_token> &tokens);
+
+protected:
+  void Execute();
+  void OnOK();
+  void OnError(const Napi::Error &err);
+
+private:
+  llama_model *_model;
+  llama_context *_ctx;
+  int _n_threads;
+  std::vector<llama_token> _tokens;
+  std::vector<float> _result;
+};
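Note: the worker above batches the audio codes, runs them through the vocoder context with `llama_encode`, and converts the resulting embeddings to PCM samples via `embd_to_audio`, resolving the promise with a `Float32Array` on the JavaScript side. A hedged sketch of writing that buffer to a playable WAV file is below; the mono 24 kHz output format is an assumption about the vocoder, not something stated in this diff.

```ts
import { writeFileSync } from 'node:fs'

// Assumption: mono float PCM at 24 kHz (typical for WavTokenizer-style vocoders).
function floatPcmToWav(samples: Float32Array, sampleRate = 24000): Buffer {
  const header = Buffer.alloc(44)
  const data = Buffer.alloc(samples.length * 2)
  for (let i = 0; i < samples.length; i++) {
    const s = Math.max(-1, Math.min(1, samples[i]))
    data.writeInt16LE(Math.round(s * 32767), i * 2) // float -> 16-bit PCM
  }
  header.write('RIFF', 0)
  header.writeUInt32LE(36 + data.length, 4)
  header.write('WAVEfmt ', 8)
  header.writeUInt32LE(16, 16)              // fmt chunk size
  header.writeUInt16LE(1, 20)               // PCM format
  header.writeUInt16LE(1, 22)               // mono
  header.writeUInt32LE(sampleRate, 24)
  header.writeUInt32LE(sampleRate * 2, 28)  // byte rate
  header.writeUInt16LE(2, 32)               // block align
  header.writeUInt16LE(16, 34)              // bits per sample
  header.write('data', 36)
  header.writeUInt32LE(data.length, 40)
  return Buffer.concat([header, data])
}

// Usage sketch, assuming `audioTokens` was collected from a completion:
// const audio = await ctx.decodeAudioTokens(audioTokens)
// writeFileSync('out.wav', floatPcmToWav(audio))
```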
package/src/EmbeddingWorker.cpp CHANGED
@@ -2,8 +2,10 @@
 #include "LlamaContext.h"
 
 EmbeddingWorker::EmbeddingWorker(const Napi::CallbackInfo &info,
-                                 LlamaSessionPtr &sess, std::string text, common_params &params)
-    : AsyncWorker(info.Env()), Deferred(info.Env()), _sess(sess), _text(text), _params(params) {}
+                                 LlamaSessionPtr &sess, std::string text,
+                                 common_params &params)
+    : AsyncWorker(info.Env()), Deferred(info.Env()), _sess(sess), _text(text),
+      _params(params) {}
 
 void EmbeddingWorker::Execute() {
   llama_kv_self_clear(_sess->context());
@@ -17,8 +19,7 @@ void EmbeddingWorker::Execute() {
   do {
     auto ctx = _sess->context();
     int ret =
-        llama_decode(ctx,
-                     llama_batch_get_one(tokens.data(), tokens.size()));
+        llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size()));
     if (ret < 0) {
       SetError("Failed to inference, code: " + std::to_string(ret));
       break;
@@ -37,7 +38,8 @@ void EmbeddingWorker::Execute() {
     }
     _result.embedding.resize(n_embd);
     std::vector<float> embedding(embd, embd + n_embd), out(embd, embd + n_embd);
-    common_embd_normalize(embedding.data(), out.data(), n_embd, _params.embd_normalize);
+    common_embd_normalize(embedding.data(), out.data(), n_embd,
+                          _params.embd_normalize);
     memcpy(_result.embedding.data(), out.data(), n_embd * sizeof(float));
   } while (false);