@fugood/llama.node 1.0.6 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CMakeLists.txt CHANGED
@@ -73,9 +73,9 @@ if(CMAKE_BUILD_TYPE STREQUAL "Release")
   set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /O1 /Ob1 /bigobj")
   endif()
 else()
-  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -funroll-loops -flto")
-  set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -O3 -funroll-loops -flto")
-  set(CMAKE_LINKER_FLAGS "${CMAKE_LINKER_FLAGS} -flto")
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -funroll-loops -flto=auto")
+  set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -O3 -funroll-loops -flto=auto")
+  set(CMAKE_LINKER_FLAGS "${CMAKE_LINKER_FLAGS} -flto=auto")
 endif()
 endif()

package/lib/binding.js CHANGED
@@ -15,23 +15,13 @@ var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (
 }) : function(o, v) {
     o["default"] = v;
 });
-var __importStar = (this && this.__importStar) || (function () {
-    var ownKeys = function(o) {
-        ownKeys = Object.getOwnPropertyNames || function (o) {
-            var ar = [];
-            for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
-            return ar;
-        };
-        return ownKeys(o);
-    };
-    return function (mod) {
-        if (mod && mod.__esModule) return mod;
-        var result = {};
-        if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
-        __setModuleDefault(result, mod);
-        return result;
-    };
-})();
+var __importStar = (this && this.__importStar) || function (mod) {
+    if (mod && mod.__esModule) return mod;
+    var result = {};
+    if (mod != null) for (var k in mod) if (k !== "default" && Object.prototype.hasOwnProperty.call(mod, k)) __createBinding(result, mod, k);
+    __setModuleDefault(result, mod);
+    return result;
+};
 var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
     function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
     return new (P || (P = Promise))(function (resolve, reject) {
package/lib/binding.ts CHANGED
@@ -1,6 +1,3 @@
-import * as path from 'path'
-
-
 export type MessagePart = {
   type: string,
   text?: string,
@@ -53,6 +50,11 @@ export type LlamaModelOptions = {
    * Enable context shifting to handle prompts larger than context size
    */
   ctx_shift?: boolean
+  /**
+   * Use a unified buffer across the input sequences when computing the attention.
+   * Try to disable when n_seq_max > 1 for improved performance when the sequences do not share a large prefix.
+   */
+  kv_unified?: boolean
   use_mlock?: boolean
   use_mmap?: boolean
   vocab_only?: boolean
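
The new `kv_unified` flag on `LlamaModelOptions` surfaces llama.cpp's unified KV-cache buffer toggle (the native default is `true`, as the `LlamaContext` constructor hunk later in this diff shows). A minimal sketch of passing it at load time; the `loadModel` entry point and the `model` path field are assumed from the package's public API and are not part of this diff:

```ts
import { loadModel } from '@fugood/llama.node'

// Sketch only: loadModel and the model field are assumptions; ctx_shift and
// kv_unified are the options shown in this release's LlamaModelOptions.
const ctx = await loadModel({
  model: './models/example-q4_k_m.gguf', // hypothetical path
  ctx_shift: true,
  // Defaults to true natively; try false when n_seq_max > 1 and the
  // sequences do not share a large prefix.
  kv_unified: false,
})
```
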
@@ -65,9 +67,9 @@ export type CompletionResponseFormat = {
   type: 'text' | 'json_object' | 'json_schema'
   json_schema?: {
     strict?: boolean
-    schema: object
+    schema: Record<string, any>
   }
-  schema?: object // for json_object type
+  schema?: Record<string, any> // for json_object type
 }

 export type LlamaCompletionOptions = {
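
With `schema` typed as `Record<string, any>` rather than `object`, a JSON-Schema literal can be assigned without casting. A small sketch using only the fields shown in the type above:

```ts
import type { CompletionResponseFormat } from '@fugood/llama.node'

// schema is now Record<string, any>, so a plain JSON-Schema literal type-checks directly.
const response_format: CompletionResponseFormat = {
  type: 'json_schema',
  json_schema: {
    strict: true,
    schema: {
      type: 'object',
      properties: { answer: { type: 'string' } },
      required: ['answer'],
    },
  },
}
```
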
@@ -76,7 +78,7 @@ export type LlamaCompletionOptions = {
   reasoning_format?: string
   chat_template?: string
   response_format?: CompletionResponseFormat
-  tools?: object
+  tools?: Tool[]
   parallel_tool_calls?: boolean
   tool_choice?: string
   enable_thinking?: boolean
@@ -107,7 +109,7 @@ export type LlamaCompletionOptions = {
   stop?: string[]
   grammar?: string
   grammar_lazy?: boolean
-  grammar_triggers?: { type: number; word: string; at_start: boolean }[]
+  grammar_triggers?: { type: number; value: string; token?: number }[]
   preserved_tokens?: string[]
   /**
    * Path(s) to media file(s) to process before generating text.
@@ -120,7 +122,7 @@ export type LlamaCompletionOptions = {
    * Guide tokens to use for audio completion.
    * Help prevent hallucinations by forcing the TTS to use the correct words.
    */
-  guide_tokens?: Int32Array
+  guide_tokens?: number[] | Int32Array
 }

 export type LlamaCompletionResult = {
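
`guide_tokens` now accepts a plain `number[]` in addition to an `Int32Array`, matching the widened native handling in `LlamaContext::Completion` later in this diff. A sketch; how the context is obtained and the `prompt` field on the completion options are assumptions here:

```ts
import type { LlamaContext } from '@fugood/llama.node'

declare const ctx: LlamaContext // obtained from the package's model loader

// Int32Array from the native helper, or a plain number[]: both are accepted in 1.1.0.
const guide = ctx.getAudioCompletionGuideTokens('Hello world')
await ctx.completion({ prompt: '...', guide_tokens: guide })             // Int32Array
await ctx.completion({ prompt: '...', guide_tokens: Array.from(guide) }) // number[]
```
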
@@ -169,21 +171,101 @@ export type RerankResult = {
   index: number
 }

+export type ModelInfo = {
+  desc: string
+  nEmbd: number
+  nParams: number
+  size: number
+  chatTemplates: {
+    llamaChat: boolean
+    minja: {
+      default: boolean
+      defaultCaps: {
+        tools: boolean
+        toolCalls: boolean
+        toolResponses: boolean
+        systemRole: boolean
+        parallelToolCalls: boolean
+        toolCallId: boolean
+      }
+      toolUse: boolean
+      toolUseCaps?: {
+        tools: boolean
+        toolCalls: boolean
+        toolResponses: boolean
+        systemRole: boolean
+        parallelToolCalls: boolean
+        toolCallId: boolean
+      }
+    }
+  }
+  metadata: Record<string, string>
+  isChatTemplateSupported: boolean
+}
+
+export type GGUFModelInfo = {
+  version?: number
+  alignment?: number
+  data_offset?: number
+  [key: string]: string | number | undefined
+}
+
+export type FormattedChatResult = {
+  type: 'jinja' | 'llama-chat'
+  prompt: string
+  has_media: boolean
+  media_paths?: Array<string>
+}
+
+export type JinjaFormattedChatResult = {
+  prompt: string
+  chat_format: number
+  grammar: string
+  grammea_lazy: boolean
+  grammar_triggers: Array<{
+    type: number
+    value: string
+    token: number
+  }>
+  thinking_forced_open: boolean
+  preserved_tokens: string[]
+  additional_stops: string[]
+}
+
+export type Tool = {
+  type: 'function'
+  function: {
+    name: string
+    description: string
+    parameters: Record<string, any>
+  }
+}
+
+export type ToolCall = {
+  type: 'function'
+  function: {
+    name: string
+    arguments: string
+  }
+  id?: string
+}
+
 export interface LlamaContext {
   new (options: LlamaModelOptions): LlamaContext
   getSystemInfo(): string
-  getModelInfo(): object
+  getModelInfo(): ModelInfo
   getFormattedChat(
     messages: ChatMessage[],
     chat_template?: string,
     params?: {
       jinja?: boolean
       response_format?: CompletionResponseFormat
-      tools?: object
-      parallel_tool_calls?: object
+      tools?: Tool[]
+      parallel_tool_calls?: boolean
       tool_choice?: string
+      enable_thinking?: boolean
     },
-  ): object | string
+  ): JinjaFormattedChatResult | string
   completion(
     options: LlamaCompletionOptions,
     callback?: (token: LlamaCompletionToken) => void,
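
The new `Tool` type replaces the untyped `tools?: object` on both `LlamaCompletionOptions` and `getFormattedChat`. A sketch of a tool definition passed through `getFormattedChat`; the plain role/content message shape is assumed to satisfy `ChatMessage`:

```ts
import type { LlamaContext, Tool } from '@fugood/llama.node'

declare const ctx: LlamaContext

// tools is now Tool[] instead of a bare object, so definitions are checked at compile time.
const tools: Tool[] = [
  {
    type: 'function',
    function: {
      name: 'get_weather',
      description: 'Look up the current weather for a city',
      parameters: {
        type: 'object',
        properties: { city: { type: 'string' } },
        required: ['city'],
      },
    },
  },
]

// With jinja enabled, the result is the typed JinjaFormattedChatResult rather than a bare object.
const formatted = ctx.getFormattedChat(
  [{ role: 'user', content: 'What is the weather in Tokyo?' }],
  undefined,
  { jinja: true, tools, tool_choice: 'auto', parallel_tool_calls: false },
)
console.log(formatted)
```
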
@@ -197,51 +279,50 @@ export interface LlamaContext {
   loadSession(path: string): Promise<void>
   release(): Promise<void>
   applyLoraAdapters(adapters: { path: string; scaled: number }[]): void
-  removeLoraAdapters(adapters: { path: string }[]): void
+  removeLoraAdapters(): void
   getLoadedLoraAdapters(): { path: string; scaled: number }[]
   /**
    * Initialize multimodal support with a mmproj file
-   * @param mmproj_path Path to the multimodal projector file
-   * @returns Promise resolving to true if initialization was successful
+   * @param options Object containing path and optional use_gpu flag
+   * @returns boolean indicating if initialization was successful
    */
-  initMultimodal(options: { path: string; use_gpu?: boolean }): Promise<boolean>
+  initMultimodal(options: { path: string; use_gpu?: boolean }): boolean

   /**
    * Check if multimodal support is enabled
-   * @returns Promise resolving to true if multimodal is enabled
+   * @returns boolean indicating if multimodal is enabled
    */
-  isMultimodalEnabled(): Promise<boolean>
+  isMultimodalEnabled(): boolean

   /**
    * Get multimodal support capabilities
-   * @returns Promise resolving to an object with vision and audio support
+   * @returns Object with vision and audio support
    */
-  getMultimodalSupport(): Promise<{
+  getMultimodalSupport(): {
     vision: boolean
     audio: boolean
-  }>
+  }

   /**
    * Release multimodal support
    */
-  releaseMultimodal(): Promise<void>
+  releaseMultimodal(): void

   /**
    * Load a vocoder model
-   * @param path Path to the vocoder model
-   * @returns Promise resolving to true if loading was successful
+   * @param options Object containing path and optional n_batch
+   * @returns boolean indicating if loading was successful
    */
-  initVocoder(options: { path: string, n_batch?: number }): Promise<boolean>
+  initVocoder(options: { path: string, n_batch?: number }): boolean

   /**
    * Unload the vocoder model
-   * @returns Promise resolving to true if unloading was successful
    */
-  releaseVocoder(): Promise<void>
+  releaseVocoder(): void

   /**
    * Check if the vocoder model is enabled
-   * @returns Promise resolving to true if the vocoder model is enabled
+   * @returns boolean indicating if the vocoder model is enabled
    */
   isVocoderEnabled(): boolean

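
The multimodal and vocoder lifecycle methods are synchronous in this release, and `removeLoraAdapters` no longer takes an adapter list. A sketch of the updated calls; the mmproj and LoRA paths are placeholders:

```ts
import type { LlamaContext } from '@fugood/llama.node'

declare const ctx: LlamaContext

// These calls are synchronous in 1.1.0; in 1.0.x they returned Promises.
const ok = ctx.initMultimodal({ path: './mmproj-example.gguf', use_gpu: true }) // hypothetical mmproj path
if (ok && ctx.isMultimodalEnabled()) {
  const { vision, audio } = ctx.getMultimodalSupport()
  console.log({ vision, audio })
}
ctx.releaseMultimodal()

// removeLoraAdapters() now takes no arguments and clears whatever was applied.
ctx.applyLoraAdapters([{ path: './example-lora.gguf', scaled: 1.0 }]) // hypothetical adapter path
ctx.removeLoraAdapters()
```
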
@@ -251,7 +332,10 @@ export interface LlamaContext {
    * @param text Text to complete
    * @returns Formatted audio completion
    */
-  getFormattedAudioCompletion(speaker: string|null, text: string): string
+  getFormattedAudioCompletion(speaker: string|null, text: string): {
+    prompt: string
+    grammar?: string
+  }

   /**
    * Get guide tokens for audio completion
@@ -263,12 +347,12 @@ export interface LlamaContext {
   /**
    * Decode audio tokens to audio data
    * @param tokens Tokens to decode
-   * @returns Decoded audio tokens
+   * @returns Promise resolving to decoded audio tokens
    */
-  decodeAudioTokens(tokens: Int32Array): Promise<Float32Array>
+  decodeAudioTokens(tokens: number[]|Int32Array): Promise<Float32Array>

   // static
-  loadModelInfo(path: string, skip: string[]): Promise<Object>
+  loadModelInfo(path: string, skip: string[]): Promise<GGUFModelInfo>
   toggleNativeLog(
     enable: boolean,
     callback: (level: string, text: string) => void,
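
`getFormattedAudioCompletion` now returns an object whose optional `grammar` can be fed back to constrain TTS generation, and `decodeAudioTokens` accepts `number[]` as well as `Int32Array`. A sketch of the flow; the `prompt` field on the completion options and the source of the collected audio tokens are assumptions, not shown in this diff:

```ts
import type { LlamaContext } from '@fugood/llama.node'

declare const ctx: LlamaContext
declare const audioTokens: number[] // audio token ids collected from a completion

// The vocoder lifecycle is synchronous now; the formatted audio completion is an
// object rather than a bare prompt string.
ctx.initVocoder({ path: './vocoder-example.gguf' }) // hypothetical vocoder path
const { prompt, grammar } = ctx.getFormattedAudioCompletion(null, 'Hello there!')
await ctx.completion({ prompt, grammar }) // prompt field assumed on LlamaCompletionOptions

// decodeAudioTokens accepts number[] as well as Int32Array and still resolves asynchronously.
const samples: Float32Array = await ctx.decodeAudioTokens(audioTokens)
console.log(samples.length)
```
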
package/lib/index.js CHANGED
@@ -140,7 +140,7 @@ class LlamaContextWrapper {
         const jsonSchema = getJsonSchema(params === null || params === void 0 ? void 0 : params.response_format);
         const result = this.ctx.getFormattedChat(chat, tmpl, {
             jinja: useJinja,
-            json_schema: jsonSchema,
+            response_format: params === null || params === void 0 ? void 0 : params.response_format,
             tools: params === null || params === void 0 ? void 0 : params.tools,
             parallel_tool_calls: params === null || params === void 0 ? void 0 : params.parallel_tool_calls,
             tool_choice: params === null || params === void 0 ? void 0 : params.tool_choice,
@@ -155,10 +155,8 @@ class LlamaContextWrapper {
             };
         }
         const jinjaResult = result;
-        jinjaResult.type = 'jinja';
-        jinjaResult.has_media = has_media;
-        jinjaResult.media_paths = media_paths;
-        return jinjaResult;
+        return Object.assign({ type: 'jinja', has_media,
+            media_paths }, jinjaResult);
     }
     completion(options, callback) {
         const { messages, media_paths = options.media_paths } = this._formatMediaChat(options.messages);
@@ -196,8 +194,8 @@ class LlamaContextWrapper {
     applyLoraAdapters(adapters) {
         return this.ctx.applyLoraAdapters(adapters);
     }
-    removeLoraAdapters(adapters) {
-        return this.ctx.removeLoraAdapters(adapters);
+    removeLoraAdapters() {
+        this.ctx.removeLoraAdapters();
     }
     getLoadedLoraAdapters() {
         return this.ctx.getLoadedLoraAdapters();
@@ -209,7 +207,7 @@ class LlamaContextWrapper {
         return this.ctx.isMultimodalEnabled();
     }
     releaseMultimodal() {
-        return this.ctx.releaseMultimodal();
+        this.ctx.releaseMultimodal();
     }
     getMultimodalSupport() {
         return this.ctx.getMultimodalSupport();
@@ -218,7 +216,7 @@ class LlamaContextWrapper {
         return this.ctx.initVocoder(options);
     }
     releaseVocoder() {
-        return this.ctx.releaseVocoder();
+        this.ctx.releaseVocoder();
     }
     isVocoderEnabled() {
         return this.ctx.isVocoderEnabled();
package/lib/index.ts CHANGED
@@ -12,6 +12,10 @@ import type {
   RerankParams,
   RerankResult,
   CompletionResponseFormat,
+  ModelInfo,
+  JinjaFormattedChatResult,
+  Tool,
+  GGUFModelInfo,
 } from './binding'

 export * from './binding'
@@ -72,9 +76,9 @@ export type FormattedChatResult = {
 }

 class LlamaContextWrapper {
-  ctx: any
+  ctx: LlamaContext

-  constructor(nativeCtx: any) {
+  constructor(nativeCtx: LlamaContext) {
     this.ctx = nativeCtx
   }

@@ -82,7 +86,7 @@ class LlamaContextWrapper {
     return this.ctx.getSystemInfo()
   }

-  getModelInfo(): object {
+  getModelInfo(): ModelInfo {
     return this.ctx.getModelInfo()
   }

@@ -158,8 +162,8 @@ class LlamaContextWrapper {
     params?: {
       jinja?: boolean
       response_format?: CompletionResponseFormat
-      tools?: object
-      parallel_tool_calls?: object
+      tools?: Tool[]
+      parallel_tool_calls?: boolean
       tool_choice?: string,
       enable_thinking?: boolean,
     },
@@ -175,9 +179,9 @@ class LlamaContextWrapper {
     if (template) tmpl = template // Force replace if provided
     const jsonSchema = getJsonSchema(params?.response_format)

-    const result = this.ctx.getFormattedChat(chat, tmpl, {
+    const result = this.ctx.getFormattedChat(chat!, tmpl, {
       jinja: useJinja,
-      json_schema: jsonSchema,
+      response_format: params?.response_format,
       tools: params?.tools,
       parallel_tool_calls: params?.parallel_tool_calls,
       tool_choice: params?.tool_choice,
@@ -192,11 +196,13 @@ class LlamaContextWrapper {
         media_paths,
       }
     }
-    const jinjaResult = result
-    jinjaResult.type = 'jinja'
-    jinjaResult.has_media = has_media
-    jinjaResult.media_paths = media_paths
-    return jinjaResult
+    const jinjaResult = result as JinjaFormattedChatResult
+    return {
+      type: 'jinja',
+      has_media,
+      media_paths,
+      ...jinjaResult,
+    }
   }

   completion(
@@ -256,8 +262,8 @@ class LlamaContextWrapper {
     return this.ctx.applyLoraAdapters(adapters)
   }

-  removeLoraAdapters(adapters: { path: string }[]): void {
-    return this.ctx.removeLoraAdapters(adapters)
+  removeLoraAdapters(): void {
+    this.ctx.removeLoraAdapters()
   }

   getLoadedLoraAdapters(): { path: string; scaled: number }[] {
@@ -267,38 +273,41 @@ class LlamaContextWrapper {
   initMultimodal(options: {
     path: string
     use_gpu?: boolean
-  }): Promise<boolean> {
+  }): boolean {
     return this.ctx.initMultimodal(options)
   }

-  isMultimodalEnabled(): Promise<boolean> {
+  isMultimodalEnabled(): boolean {
     return this.ctx.isMultimodalEnabled()
   }

-  releaseMultimodal(): Promise<void> {
-    return this.ctx.releaseMultimodal()
+  releaseMultimodal(): void {
+    this.ctx.releaseMultimodal()
   }

-  getMultimodalSupport(): Promise<{
+  getMultimodalSupport(): {
     vision: boolean
     audio: boolean
-  }> {
+  } {
     return this.ctx.getMultimodalSupport()
   }

-  initVocoder(options: { path: string, n_batch?: number }): Promise<boolean> {
+  initVocoder(options: { path: string, n_batch?: number }): boolean {
     return this.ctx.initVocoder(options)
   }

-  releaseVocoder(): Promise<void> {
-    return this.ctx.releaseVocoder()
+  releaseVocoder(): void {
+    this.ctx.releaseVocoder()
   }

   isVocoderEnabled(): boolean {
     return this.ctx.isVocoderEnabled()
   }

-  getFormattedAudioCompletion(speaker: string|null, text: string): string {
+  getFormattedAudioCompletion(speaker: string|null, text: string): {
+    prompt: string
+    grammar?: string
+  } {
     return this.ctx.getFormattedAudioCompletion(speaker, text)
   }

@@ -332,7 +341,7 @@ const modelInfoSkip = [
   'tokenizer.ggml.scores',
 ]

-export const loadLlamaModelInfo = async (path: string): Promise<Object> => {
+export const loadLlamaModelInfo = async (path: string): Promise<GGUFModelInfo> => {
   const variant = 'default'
   mods[variant] ??= await loadModule(variant)
   refreshNativeLogSetup()
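
`loadLlamaModelInfo` now resolves to a typed `GGUFModelInfo`; arbitrary GGUF metadata keys remain reachable through the string index signature. A short usage sketch with a placeholder model path:

```ts
import { loadLlamaModelInfo } from '@fugood/llama.node'

// Resolves to GGUFModelInfo instead of a bare Object; 'general.architecture' is an
// example GGUF metadata key read through the index signature.
const info = await loadLlamaModelInfo('./models/example-q4_k_m.gguf') // hypothetical path
console.log(info.version, info.alignment, info['general.architecture'])
```
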
package/package.json CHANGED
@@ -1,7 +1,7 @@
 {
   "name": "@fugood/llama.node",
   "access": "public",
-  "version": "1.0.6",
+  "version": "1.1.0",
   "description": "An another Node binding of llama.cpp",
   "main": "lib/index.js",
   "scripts": {
@@ -9,6 +9,7 @@
     "postinstall": "node scripts/check.js",
     "pretest": "node scripts/download-test-models.js",
     "test": "jest",
+    "typecheck": "tsc --noEmit",
     "build": "npx cmake-js build",
     "build-js": "tsc",
     "prepack": "npm run build-js",
@@ -70,19 +71,19 @@
     "CMakeLists.txt"
   ],
   "optionalDependencies": {
-    "@fugood/node-llama-linux-x64": "1.0.6",
-    "@fugood/node-llama-linux-x64-vulkan": "1.0.6",
-    "@fugood/node-llama-linux-x64-cuda": "1.0.6",
-    "@fugood/node-llama-linux-arm64": "1.0.6",
-    "@fugood/node-llama-linux-arm64-vulkan": "1.0.6",
-    "@fugood/node-llama-linux-arm64-cuda": "1.0.6",
-    "@fugood/node-llama-win32-x64": "1.0.6",
-    "@fugood/node-llama-win32-x64-vulkan": "1.0.6",
-    "@fugood/node-llama-win32-x64-cuda": "1.0.6",
-    "@fugood/node-llama-win32-arm64": "1.0.6",
-    "@fugood/node-llama-win32-arm64-vulkan": "1.0.6",
-    "@fugood/node-llama-darwin-x64": "1.0.6",
-    "@fugood/node-llama-darwin-arm64": "1.0.6"
+    "@fugood/node-llama-linux-x64": "1.1.0",
+    "@fugood/node-llama-linux-x64-vulkan": "1.1.0",
+    "@fugood/node-llama-linux-x64-cuda": "1.1.0",
+    "@fugood/node-llama-linux-arm64": "1.1.0",
+    "@fugood/node-llama-linux-arm64-vulkan": "1.1.0",
+    "@fugood/node-llama-linux-arm64-cuda": "1.1.0",
+    "@fugood/node-llama-win32-x64": "1.1.0",
+    "@fugood/node-llama-win32-x64-vulkan": "1.1.0",
+    "@fugood/node-llama-win32-x64-cuda": "1.1.0",
+    "@fugood/node-llama-win32-arm64": "1.1.0",
+    "@fugood/node-llama-win32-arm64-vulkan": "1.1.0",
+    "@fugood/node-llama-darwin-x64": "1.1.0",
+    "@fugood/node-llama-darwin-arm64": "1.1.0"
   },
   "devDependencies": {
     "@babel/preset-env": "^7.24.4",
@@ -91,10 +92,12 @@
     "@commitlint/config-conventional": "^19.2.2",
     "@types/jest": "^29.5.12",
     "@types/node": "^22.0.0",
+    "@types/node-wav": "^0.0.4",
     "cmake-js": "^7.3.0",
     "husky": "^9.0.11",
     "jest": "^29.7.0",
     "node-addon-api": "^8.0.0",
+    "node-wav": "^0.0.2",
     "release-it": "^17.7.0",
     "rimraf": "^6.0.1",
     "typescript": "^5.4.5",
@@ -110,7 +110,7 @@ void LlamaCompletionWorker::Execute() {
   } else {
     // Text-only path
     std::vector<llama_token> prompt_tokens =
-        ::common_tokenize(ctx, _params.prompt, add_bos);
+        ::common_tokenize(ctx, _params.prompt, add_bos, true);
     n_input = prompt_tokens.size();

     if (_sess->tokens_ptr()->size() > 0) {
@@ -177,7 +177,7 @@ void LlamaCompletionWorker::Execute() {

     // Collect audio tokens for TTS if vocoder is enabled
     if (_has_vocoder) {
-      if ((_tts_type == OUTETTS_V0_2 || _tts_type == OUTETTS_V0_3) &&
+      if ((_tts_type == OUTETTS_V0_1 || _tts_type == OUTETTS_V0_2 || _tts_type == OUTETTS_V0_3) &&
           (new_token_id >= 151672 && new_token_id <= 155772)) {
         _result.audio_tokens.push_back(new_token_id);
       }
@@ -247,6 +247,7 @@ LlamaContext::LlamaContext(const Napi::CallbackInfo &info)
   params.cache_type_v = kv_cache_type_from_str(
       get_option<std::string>(options, "cache_type_v", "f16").c_str());
   params.ctx_shift = get_option<bool>(options, "ctx_shift", true);
+  params.kv_unified = get_option<bool>(options, "kv_unified", true);

   params.use_mlock = get_option<bool>(options, "use_mlock", false);
   params.use_mmap = get_option<bool>(options, "use_mmap", true);
@@ -904,9 +905,27 @@ Napi::Value LlamaContext::Completion(const Napi::CallbackInfo &info) {
   // guide_tokens
   std::vector<llama_token> guide_tokens;
   if (options.Has("guide_tokens")) {
-    auto guide_tokens_array = options.Get("guide_tokens").As<Napi::Array>();
-    for (size_t i = 0; i < guide_tokens_array.Length(); i++) {
-      guide_tokens.push_back(guide_tokens_array.Get(i).ToNumber().Int32Value());
+    auto guide_tokens_value = options.Get("guide_tokens");
+    if (guide_tokens_value.IsArray()) {
+      auto guide_tokens_array = guide_tokens_value.As<Napi::Array>();
+      for (size_t i = 0; i < guide_tokens_array.Length(); i++) {
+        guide_tokens.push_back(guide_tokens_array.Get(i).ToNumber().Int32Value());
+      }
+    } else if (guide_tokens_value.IsTypedArray()) {
+      auto guide_tokens_typed_array = guide_tokens_value.As<Napi::TypedArray>();
+      if (guide_tokens_typed_array.TypedArrayType() == napi_int32_array) {
+        auto guide_tokens_int32_array = guide_tokens_value.As<Napi::Int32Array>();
+        size_t length = guide_tokens_int32_array.ElementLength();
+        const int32_t* data = guide_tokens_int32_array.Data();
+        guide_tokens.resize(length);
+        memcpy(guide_tokens.data(), data, length * sizeof(int32_t));
+      } else {
+        Napi::TypeError::New(env, "guide_tokens must be Array<number> or Int32Array").ThrowAsJavaScriptException();
+        return env.Undefined();
+      }
+    } else {
+      Napi::TypeError::New(env, "guide_tokens must be Array<number> or Int32Array").ThrowAsJavaScriptException();
+      return env.Undefined();
     }
   }

@@ -1345,7 +1364,7 @@ Napi::Value LlamaContext::IsVocoderEnabled(const Napi::CallbackInfo &info) {
   return Napi::Boolean::New(env, _has_vocoder);
 }

-// getFormattedAudioCompletion(speaker: string|null, text: string): string
+// getFormattedAudioCompletion(speaker: string|null, text: string): object
 Napi::Value
 LlamaContext::GetFormattedAudioCompletion(const Napi::CallbackInfo &info) {
   Napi::Env env = info.Env();
@@ -1372,9 +1391,16 @@ LlamaContext::GetFormattedAudioCompletion(const Napi::CallbackInfo &info) {
     audio_text = audio_text_from_speaker(speaker, type);
     audio_data = audio_data_from_speaker(speaker, type);
   }
-  return Napi::String::New(env, "<|im_start|>\n" + audio_text +
-                                    process_text(text, type) +
-                                    "<|text_end|>\n" + audio_data + "\n");
+  std::string prompt = "<|im_start|>\n" + audio_text +
+                       process_text(text, type) +
+                       "<|text_end|>\n" + audio_data + "\n";
+  Napi::Object result = Napi::Object::New(env);
+  result.Set("prompt", prompt);
+  const char *grammar = get_tts_grammar(type);
+  if (grammar != nullptr) {
+    result.Set("grammar", grammar);
+  }
+  return result;
 }

 // getAudioCompletionGuideTokens(text: string): Int32Array
@@ -1415,6 +1441,10 @@ LlamaContext::GetAudioCompletionGuideTokens(const Napi::CallbackInfo &info) {
   if (tmp.size() > 0) {
     result.push_back(tmp[0]);
   }
+
+  // Add Audio End, forcing stop generation
+  result.push_back(common_tokenize(vocab, "<|audio_end|>", false, true)[0]);
+
   auto tokens = Napi::Int32Array::New(env, result.size());
   memcpy(tokens.Data(), result.data(), result.size() * sizeof(int32_t));
   return tokens;
@@ -1449,7 +1479,7 @@ Napi::Value LlamaContext::DecodeAudioTokens(const Napi::CallbackInfo &info) {
         .ThrowAsJavaScriptException();
     return env.Undefined();
   }
-  if (type == OUTETTS_V0_3 || type == OUTETTS_V0_2) {
+  if (type == OUTETTS_V0_1 || type == OUTETTS_V0_2 || type == OUTETTS_V0_3) {
     tokens.erase(
         std::remove_if(tokens.begin(), tokens.end(),
                        [](llama_token t) { return t < 151672 || t > 155772; }),