@fugood/llama.node 1.0.6 → 1.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CMakeLists.txt CHANGED
@@ -73,9 +73,9 @@ if(CMAKE_BUILD_TYPE STREQUAL "Release")
  set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /O1 /Ob1 /bigobj")
  endif()
  else()
- set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -funroll-loops -flto")
- set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -O3 -funroll-loops -flto")
- set(CMAKE_LINKER_FLAGS "${CMAKE_LINKER_FLAGS} -flto")
+ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -funroll-loops -flto=auto")
+ set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -O3 -funroll-loops -flto=auto")
+ set(CMAKE_LINKER_FLAGS "${CMAKE_LINKER_FLAGS} -flto=auto")
  endif()
  endif()
package/lib/binding.ts CHANGED
@@ -1,6 +1,3 @@
- import * as path from 'path'
-
-
  export type MessagePart = {
  type: string,
  text?: string,
@@ -53,6 +50,11 @@ export type LlamaModelOptions = {
  * Enable context shifting to handle prompts larger than context size
  */
  ctx_shift?: boolean
+ /**
+ * Use a unified buffer across the input sequences when computing the attention.
+ * Try to disable when n_seq_max > 1 for improved performance when the sequences do not share a large prefix.
+ */
+ kv_unified?: boolean
  use_mlock?: boolean
  use_mmap?: boolean
  vocab_only?: boolean
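
A minimal sketch of passing the new kv_unified option when creating a context. The loadModel entry point and the model/n_ctx fields are assumptions from outside this diff; only kv_unified itself is introduced here, and the native constructor hunk near the end of this diff defaults it to false.

    import { loadModel } from '@fugood/llama.node'

    const ctx = await loadModel({
      model: './models/model.gguf', // hypothetical path
      n_ctx: 4096,                  // assumed existing option
      // Unified KV buffer across input sequences; per the JSDoc above, consider
      // leaving this off when n_seq_max > 1 and sequences share little prefix.
      kv_unified: true,
    })
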
@@ -65,9 +67,9 @@ export type CompletionResponseFormat = {
  type: 'text' | 'json_object' | 'json_schema'
  json_schema?: {
  strict?: boolean
- schema: object
+ schema: Record<string, any>
  }
- schema?: object // for json_object type
+ schema?: Record<string, any> // for json_object type
  }

  export type LlamaCompletionOptions = {
@@ -76,7 +78,7 @@ export type LlamaCompletionOptions = {
  reasoning_format?: string
  chat_template?: string
  response_format?: CompletionResponseFormat
- tools?: object
+ tools?: Tool[]
  parallel_tool_calls?: boolean
  tool_choice?: string
  enable_thinking?: boolean
@@ -107,7 +109,7 @@ export type LlamaCompletionOptions = {
  stop?: string[]
  grammar?: string
  grammar_lazy?: boolean
- grammar_triggers?: { type: number; word: string; at_start: boolean }[]
+ grammar_triggers?: { type: number; value: string; token?: number }[]
  preserved_tokens?: string[]
  /**
  * Path(s) to media file(s) to process before generating text.
@@ -120,13 +122,14 @@ export type LlamaCompletionOptions = {
  * Guide tokens to use for audio completion.
  * Help prevent hallucinations by forcing the TTS to use the correct words.
  */
- guide_tokens?: Int32Array
+ guide_tokens?: number[] | Int32Array
  }

  export type LlamaCompletionResult = {
  text: string
  reasoning_content?: string
  content?: string
+ chat_format: number
  tokens_predicted: number
  tokens_evaluated: number
  truncated: boolean
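
A sketch of a completion call exercising the tightened typings above. The message shape and any option names not shown in these hunks are assumptions; tools, tool_choice, response_format, and guide_tokens follow the type definitions in this file, and concrete values are placeholders.

    import type { Tool, LlamaCompletionOptions } from '@fugood/llama.node'

    const tools: Tool[] = [{
      type: 'function',
      function: {
        name: 'get_weather', // hypothetical tool
        description: 'Look up the weather for a city',
        parameters: { type: 'object', properties: { city: { type: 'string' } } },
      },
    }]

    const options: LlamaCompletionOptions = {
      messages: [{ role: 'user', content: 'What is the weather in Tokyo?' }],
      tools,              // now Tool[] instead of object
      tool_choice: 'auto',
      response_format: {
        type: 'json_schema',
        json_schema: {
          strict: true,
          schema: { type: 'object' }, // Record<string, any> instead of object
        },
      },
    }
    // The completion result now also carries chat_format: number alongside text/content.
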
@@ -169,21 +172,101 @@ export type RerankResult = {
  index: number
  }

+ export type ModelInfo = {
+ desc: string
+ nEmbd: number
+ nParams: number
+ size: number
+ chatTemplates: {
+ llamaChat: boolean
+ minja: {
+ default: boolean
+ defaultCaps: {
+ tools: boolean
+ toolCalls: boolean
+ toolResponses: boolean
+ systemRole: boolean
+ parallelToolCalls: boolean
+ toolCallId: boolean
+ }
+ toolUse: boolean
+ toolUseCaps?: {
+ tools: boolean
+ toolCalls: boolean
+ toolResponses: boolean
+ systemRole: boolean
+ parallelToolCalls: boolean
+ toolCallId: boolean
+ }
+ }
+ }
+ metadata: Record<string, string>
+ isChatTemplateSupported: boolean
+ }
+
+ export type GGUFModelInfo = {
+ version?: number
+ alignment?: number
+ data_offset?: number
+ [key: string]: string | number | undefined
+ }
+
+ export type FormattedChatResult = {
+ type: 'jinja' | 'llama-chat'
+ prompt: string
+ has_media: boolean
+ media_paths?: Array<string>
+ }
+
+ export type JinjaFormattedChatResult = {
+ prompt: string
+ chat_format: number
+ grammar: string
+ grammea_lazy: boolean
+ grammar_triggers: Array<{
+ type: number
+ value: string
+ token: number
+ }>
+ thinking_forced_open: boolean
+ preserved_tokens: string[]
+ additional_stops: string[]
+ }
+
+ export type Tool = {
+ type: 'function'
+ function: {
+ name: string
+ description: string
+ parameters: Record<string, any>
+ }
+ }
+
+ export type ToolCall = {
+ type: 'function'
+ function: {
+ name: string
+ arguments: string
+ }
+ id?: string
+ }
+
  export interface LlamaContext {
  new (options: LlamaModelOptions): LlamaContext
  getSystemInfo(): string
- getModelInfo(): object
+ getModelInfo(): ModelInfo
  getFormattedChat(
  messages: ChatMessage[],
  chat_template?: string,
  params?: {
  jinja?: boolean
  response_format?: CompletionResponseFormat
- tools?: object
- parallel_tool_calls?: object
+ tools?: Tool[]
+ parallel_tool_calls?: boolean
  tool_choice?: string
+ enable_thinking?: boolean
  },
- ): object | string
+ ): JinjaFormattedChatResult | string
  completion(
  options: LlamaCompletionOptions,
  callback?: (token: LlamaCompletionToken) => void,
@@ -197,51 +280,50 @@ export interface LlamaContext {
  loadSession(path: string): Promise<void>
  release(): Promise<void>
  applyLoraAdapters(adapters: { path: string; scaled: number }[]): void
- removeLoraAdapters(adapters: { path: string }[]): void
+ removeLoraAdapters(): void
  getLoadedLoraAdapters(): { path: string; scaled: number }[]
  /**
  * Initialize multimodal support with a mmproj file
- * @param mmproj_path Path to the multimodal projector file
- * @returns Promise resolving to true if initialization was successful
+ * @param options Object containing path and optional use_gpu flag
+ * @returns boolean indicating if initialization was successful
  */
- initMultimodal(options: { path: string; use_gpu?: boolean }): Promise<boolean>
+ initMultimodal(options: { path: string; use_gpu?: boolean }): boolean

  /**
  * Check if multimodal support is enabled
- * @returns Promise resolving to true if multimodal is enabled
+ * @returns boolean indicating if multimodal is enabled
  */
- isMultimodalEnabled(): Promise<boolean>
+ isMultimodalEnabled(): boolean

  /**
  * Get multimodal support capabilities
- * @returns Promise resolving to an object with vision and audio support
+ * @returns Object with vision and audio support
  */
- getMultimodalSupport(): Promise<{
+ getMultimodalSupport(): {
  vision: boolean
  audio: boolean
- }>
+ }

  /**
  * Release multimodal support
  */
- releaseMultimodal(): Promise<void>
+ releaseMultimodal(): void

  /**
  * Load a vocoder model
- * @param path Path to the vocoder model
- * @returns Promise resolving to true if loading was successful
+ * @param options Object containing path and optional n_batch
+ * @returns boolean indicating if loading was successful
  */
- initVocoder(options: { path: string, n_batch?: number }): Promise<boolean>
+ initVocoder(options: { path: string, n_batch?: number }): boolean

  /**
  * Unload the vocoder model
- * @returns Promise resolving to true if unloading was successful
  */
- releaseVocoder(): Promise<void>
+ releaseVocoder(): void

  /**
  * Check if the vocoder model is enabled
- * @returns Promise resolving to true if the vocoder model is enabled
+ * @returns boolean indicating if the vocoder model is enabled
  */
  isVocoderEnabled(): boolean

@@ -251,7 +333,10 @@ export interface LlamaContext {
  * @param text Text to complete
  * @returns Formatted audio completion
  */
- getFormattedAudioCompletion(speaker: string|null, text: string): string
+ getFormattedAudioCompletion(speaker: string|null, text: string): {
+ prompt: string
+ grammar?: string
+ }

  /**
  * Get guide tokens for audio completion
@@ -263,12 +348,12 @@ export interface LlamaContext {
  /**
  * Decode audio tokens to audio data
  * @param tokens Tokens to decode
- * @returns Decoded audio tokens
+ * @returns Promise resolving to decoded audio tokens
  */
- decodeAudioTokens(tokens: Int32Array): Promise<Float32Array>
+ decodeAudioTokens(tokens: number[]|Int32Array): Promise<Float32Array>

  // static
- loadModelInfo(path: string, skip: string[]): Promise<Object>
+ loadModelInfo(path: string, skip: string[]): Promise<GGUFModelInfo>
  toggleNativeLog(
  enable: boolean,
  callback: (level: string, text: string) => void,
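
A usage sketch for the methods whose return types changed from Promises to plain values. The loadModel helper and the file paths are assumptions; the method names and shapes follow the interface above and are mirrored by the wrapper in lib/index.ts further down.

    import { loadModel } from '@fugood/llama.node'

    const ctx = await loadModel({ model: './models/model.gguf' }) // hypothetical path

    // Multimodal setup is now synchronous.
    if (ctx.initMultimodal({ path: './models/mmproj.gguf', use_gpu: true })) {
      const { vision, audio } = ctx.getMultimodalSupport()
      console.log('vision:', vision, 'audio:', audio)
      ctx.releaseMultimodal() // returns void, no await needed
    }

    // removeLoraAdapters() no longer takes a list; it clears whatever is loaded.
    ctx.applyLoraAdapters([{ path: './loras/adapter.gguf', scaled: 1.0 }])
    ctx.removeLoraAdapters()

    // getModelInfo() is now typed as ModelInfo rather than object.
    const { nParams, chatTemplates } = ctx.getModelInfo()
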
package/lib/index.js CHANGED
@@ -140,7 +140,7 @@ class LlamaContextWrapper {
  const jsonSchema = getJsonSchema(params === null || params === void 0 ? void 0 : params.response_format);
  const result = this.ctx.getFormattedChat(chat, tmpl, {
  jinja: useJinja,
- json_schema: jsonSchema,
+ response_format: params === null || params === void 0 ? void 0 : params.response_format,
  tools: params === null || params === void 0 ? void 0 : params.tools,
  parallel_tool_calls: params === null || params === void 0 ? void 0 : params.parallel_tool_calls,
  tool_choice: params === null || params === void 0 ? void 0 : params.tool_choice,
@@ -155,10 +155,8 @@ class LlamaContextWrapper {
  };
  }
  const jinjaResult = result;
- jinjaResult.type = 'jinja';
- jinjaResult.has_media = has_media;
- jinjaResult.media_paths = media_paths;
- return jinjaResult;
+ return Object.assign({ type: 'jinja', has_media,
+ media_paths }, jinjaResult);
  }
  completion(options, callback) {
  const { messages, media_paths = options.media_paths } = this._formatMediaChat(options.messages);
@@ -196,8 +194,8 @@ class LlamaContextWrapper {
  applyLoraAdapters(adapters) {
  return this.ctx.applyLoraAdapters(adapters);
  }
- removeLoraAdapters(adapters) {
- return this.ctx.removeLoraAdapters(adapters);
+ removeLoraAdapters() {
+ this.ctx.removeLoraAdapters();
  }
  getLoadedLoraAdapters() {
  return this.ctx.getLoadedLoraAdapters();
@@ -209,7 +207,7 @@ class LlamaContextWrapper {
  return this.ctx.isMultimodalEnabled();
  }
  releaseMultimodal() {
- return this.ctx.releaseMultimodal();
+ this.ctx.releaseMultimodal();
  }
  getMultimodalSupport() {
  return this.ctx.getMultimodalSupport();
@@ -218,7 +216,7 @@ class LlamaContextWrapper {
  return this.ctx.initVocoder(options);
  }
  releaseVocoder() {
- return this.ctx.releaseVocoder();
+ this.ctx.releaseVocoder();
  }
  isVocoderEnabled() {
  return this.ctx.isVocoderEnabled();
package/lib/index.ts CHANGED
@@ -12,6 +12,10 @@ import type {
  RerankParams,
  RerankResult,
  CompletionResponseFormat,
+ ModelInfo,
+ JinjaFormattedChatResult,
+ Tool,
+ GGUFModelInfo,
  } from './binding'

  export * from './binding'
@@ -72,9 +76,9 @@ export type FormattedChatResult = {
  }

  class LlamaContextWrapper {
- ctx: any
+ ctx: LlamaContext

- constructor(nativeCtx: any) {
+ constructor(nativeCtx: LlamaContext) {
  this.ctx = nativeCtx
  }

@@ -82,7 +86,7 @@ class LlamaContextWrapper {
  return this.ctx.getSystemInfo()
  }

- getModelInfo(): object {
+ getModelInfo(): ModelInfo {
  return this.ctx.getModelInfo()
  }

@@ -158,8 +162,8 @@ class LlamaContextWrapper {
  params?: {
  jinja?: boolean
  response_format?: CompletionResponseFormat
- tools?: object
- parallel_tool_calls?: object
+ tools?: Tool[]
+ parallel_tool_calls?: boolean
  tool_choice?: string,
  enable_thinking?: boolean,
  },
@@ -175,9 +179,9 @@ class LlamaContextWrapper {
  if (template) tmpl = template // Force replace if provided
  const jsonSchema = getJsonSchema(params?.response_format)

- const result = this.ctx.getFormattedChat(chat, tmpl, {
+ const result = this.ctx.getFormattedChat(chat!, tmpl, {
  jinja: useJinja,
- json_schema: jsonSchema,
+ response_format: params?.response_format,
  tools: params?.tools,
  parallel_tool_calls: params?.parallel_tool_calls,
  tool_choice: params?.tool_choice,
@@ -192,11 +196,13 @@ class LlamaContextWrapper {
  media_paths,
  }
  }
- const jinjaResult = result
- jinjaResult.type = 'jinja'
- jinjaResult.has_media = has_media
- jinjaResult.media_paths = media_paths
- return jinjaResult
+ const jinjaResult = result as JinjaFormattedChatResult
+ return {
+ type: 'jinja',
+ has_media,
+ media_paths,
+ ...jinjaResult,
+ }
  }

  completion(
@@ -256,8 +262,8 @@ class LlamaContextWrapper {
  return this.ctx.applyLoraAdapters(adapters)
  }

- removeLoraAdapters(adapters: { path: string }[]): void {
- return this.ctx.removeLoraAdapters(adapters)
+ removeLoraAdapters(): void {
+ this.ctx.removeLoraAdapters()
  }

  getLoadedLoraAdapters(): { path: string; scaled: number }[] {
@@ -267,38 +273,41 @@ class LlamaContextWrapper {
  initMultimodal(options: {
  path: string
  use_gpu?: boolean
- }): Promise<boolean> {
+ }): boolean {
  return this.ctx.initMultimodal(options)
  }

- isMultimodalEnabled(): Promise<boolean> {
+ isMultimodalEnabled(): boolean {
  return this.ctx.isMultimodalEnabled()
  }

- releaseMultimodal(): Promise<void> {
- return this.ctx.releaseMultimodal()
+ releaseMultimodal(): void {
+ this.ctx.releaseMultimodal()
  }

- getMultimodalSupport(): Promise<{
+ getMultimodalSupport(): {
  vision: boolean
  audio: boolean
- }> {
+ } {
  return this.ctx.getMultimodalSupport()
  }

- initVocoder(options: { path: string, n_batch?: number }): Promise<boolean> {
+ initVocoder(options: { path: string, n_batch?: number }): boolean {
  return this.ctx.initVocoder(options)
  }

- releaseVocoder(): Promise<void> {
- return this.ctx.releaseVocoder()
+ releaseVocoder(): void {
+ this.ctx.releaseVocoder()
  }

  isVocoderEnabled(): boolean {
  return this.ctx.isVocoderEnabled()
  }

- getFormattedAudioCompletion(speaker: string|null, text: string): string {
+ getFormattedAudioCompletion(speaker: string|null, text: string): {
+ prompt: string
+ grammar?: string
+ } {
  return this.ctx.getFormattedAudioCompletion(speaker, text)
  }

@@ -332,7 +341,7 @@ const modelInfoSkip = [
  'tokenizer.ggml.scores',
  ]

- export const loadLlamaModelInfo = async (path: string): Promise<Object> => {
+ export const loadLlamaModelInfo = async (path: string): Promise<GGUFModelInfo> => {
  const variant = 'default'
  mods[variant] ??= await loadModule(variant)
  refreshNativeLogSetup()
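
A sketch of the now-typed loadLlamaModelInfo helper. The GGUF path and the metadata key are placeholders; the return type GGUFModelInfo comes from binding.ts above, with an index signature for arbitrary GGUF metadata keys.

    import { loadLlamaModelInfo } from '@fugood/llama.node'

    const info = await loadLlamaModelInfo('./models/model.gguf')
    console.log(info.version, info.alignment, info.data_offset)
    console.log(info['general.architecture']) // key availability depends on the model
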
package/package.json CHANGED
@@ -1,7 +1,7 @@
  {
  "name": "@fugood/llama.node",
  "access": "public",
- "version": "1.0.6",
+ "version": "1.1.1",
  "description": "An another Node binding of llama.cpp",
  "main": "lib/index.js",
  "scripts": {
@@ -9,6 +9,7 @@
  "postinstall": "node scripts/check.js",
  "pretest": "node scripts/download-test-models.js",
  "test": "jest",
+ "typecheck": "tsc --noEmit",
  "build": "npx cmake-js build",
  "build-js": "tsc",
  "prepack": "npm run build-js",
@@ -70,19 +71,19 @@
  "CMakeLists.txt"
  ],
  "optionalDependencies": {
- "@fugood/node-llama-linux-x64": "1.0.6",
- "@fugood/node-llama-linux-x64-vulkan": "1.0.6",
- "@fugood/node-llama-linux-x64-cuda": "1.0.6",
- "@fugood/node-llama-linux-arm64": "1.0.6",
- "@fugood/node-llama-linux-arm64-vulkan": "1.0.6",
- "@fugood/node-llama-linux-arm64-cuda": "1.0.6",
- "@fugood/node-llama-win32-x64": "1.0.6",
- "@fugood/node-llama-win32-x64-vulkan": "1.0.6",
- "@fugood/node-llama-win32-x64-cuda": "1.0.6",
- "@fugood/node-llama-win32-arm64": "1.0.6",
- "@fugood/node-llama-win32-arm64-vulkan": "1.0.6",
- "@fugood/node-llama-darwin-x64": "1.0.6",
- "@fugood/node-llama-darwin-arm64": "1.0.6"
+ "@fugood/node-llama-linux-x64": "1.1.1",
+ "@fugood/node-llama-linux-x64-vulkan": "1.1.1",
+ "@fugood/node-llama-linux-x64-cuda": "1.1.1",
+ "@fugood/node-llama-linux-arm64": "1.1.1",
+ "@fugood/node-llama-linux-arm64-vulkan": "1.1.1",
+ "@fugood/node-llama-linux-arm64-cuda": "1.1.1",
+ "@fugood/node-llama-win32-x64": "1.1.1",
+ "@fugood/node-llama-win32-x64-vulkan": "1.1.1",
+ "@fugood/node-llama-win32-x64-cuda": "1.1.1",
+ "@fugood/node-llama-win32-arm64": "1.1.1",
+ "@fugood/node-llama-win32-arm64-vulkan": "1.1.1",
+ "@fugood/node-llama-darwin-x64": "1.1.1",
+ "@fugood/node-llama-darwin-arm64": "1.1.1"
  },
  "devDependencies": {
  "@babel/preset-env": "^7.24.4",
@@ -91,10 +92,12 @@
  "@commitlint/config-conventional": "^19.2.2",
  "@types/jest": "^29.5.12",
  "@types/node": "^22.0.0",
+ "@types/node-wav": "^0.0.4",
  "cmake-js": "^7.3.0",
  "husky": "^9.0.11",
  "jest": "^29.7.0",
  "node-addon-api": "^8.0.0",
+ "node-wav": "^0.0.2",
  "release-it": "^17.7.0",
  "rimraf": "^6.0.1",
  "typescript": "^5.4.5",
@@ -110,7 +110,7 @@ void LlamaCompletionWorker::Execute() {
  } else {
  // Text-only path
  std::vector<llama_token> prompt_tokens =
- ::common_tokenize(ctx, _params.prompt, add_bos);
+ ::common_tokenize(ctx, _params.prompt, add_bos, true);
  n_input = prompt_tokens.size();

  if (_sess->tokens_ptr()->size() > 0) {
@@ -157,10 +157,26 @@ void LlamaCompletionWorker::Execute() {
  // For multimodal input, n_past might already be set
  // Only decode text tokens if we have any input left
  if (n_input > 0) {
- int ret = llama_decode(ctx, llama_batch_get_one(embd->data() + n_cur, n_input));
- if (ret < 0) {
- SetError("Failed to decode token, code: " + std::to_string(ret));
- break;
+ // Decode tokens in batches using n_batch as chunk size
+ int n_past_batch = n_cur;
+ int n_remaining = n_input;
+
+ while (n_remaining > 0) {
+ int n_eval = n_remaining;
+ if (n_eval > _params.n_batch) {
+ n_eval = _params.n_batch;
+ }
+
+ int ret = llama_decode(ctx, llama_batch_get_one(embd->data() + n_past_batch, n_eval));
+ if (ret < 0) {
+ SetError("Failed to decode token batch, code: " + std::to_string(ret) +
+ ", n_eval: " + std::to_string(n_eval) +
+ ", n_past_batch: " + std::to_string(n_past_batch));
+ break;
+ }
+
+ n_past_batch += n_eval;
+ n_remaining -= n_eval;
  }
  }

@@ -177,7 +193,7 @@ void LlamaCompletionWorker::Execute() {

  // Collect audio tokens for TTS if vocoder is enabled
  if (_has_vocoder) {
- if ((_tts_type == OUTETTS_V0_2 || _tts_type == OUTETTS_V0_3) &&
+ if ((_tts_type == OUTETTS_V0_1 || _tts_type == OUTETTS_V0_2 || _tts_type == OUTETTS_V0_3) &&
  (new_token_id >= 151672 && new_token_id <= 155772)) {
  _result.audio_tokens.push_back(new_token_id);
  }
@@ -255,6 +271,8 @@ void LlamaCompletionWorker::OnOK() {
  try {
  common_chat_syntax chat_syntax;
  chat_syntax.format = static_cast<common_chat_format>(_chat_format);
+ result.Set("chat_format", Napi::Number::New(env, _chat_format));
+
  chat_syntax.thinking_forced_open = _thinking_forced_open;

  if (_reasoning_format == "deepseek") {
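
On the JS side, the chat_format value set here surfaces as the new chat_format field on LlamaCompletionResult. A small sketch (ctx and options are the same assumed setup as in the earlier sketches):

    const result = await ctx.completion(options)
    // chat_format identifies which chat format the native side used when
    // producing and parsing the output (tool calls, reasoning content).
    console.log(result.chat_format, result.content ?? result.text)
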
@@ -247,6 +247,7 @@ LlamaContext::LlamaContext(const Napi::CallbackInfo &info)
  params.cache_type_v = kv_cache_type_from_str(
  get_option<std::string>(options, "cache_type_v", "f16").c_str());
  params.ctx_shift = get_option<bool>(options, "ctx_shift", true);
+ params.kv_unified = get_option<bool>(options, "kv_unified", false);

  params.use_mlock = get_option<bool>(options, "use_mlock", false);
  params.use_mmap = get_option<bool>(options, "use_mmap", true);
@@ -904,9 +905,27 @@ Napi::Value LlamaContext::Completion(const Napi::CallbackInfo &info) {
  // guide_tokens
  std::vector<llama_token> guide_tokens;
  if (options.Has("guide_tokens")) {
- auto guide_tokens_array = options.Get("guide_tokens").As<Napi::Array>();
- for (size_t i = 0; i < guide_tokens_array.Length(); i++) {
- guide_tokens.push_back(guide_tokens_array.Get(i).ToNumber().Int32Value());
+ auto guide_tokens_value = options.Get("guide_tokens");
+ if (guide_tokens_value.IsArray()) {
+ auto guide_tokens_array = guide_tokens_value.As<Napi::Array>();
+ for (size_t i = 0; i < guide_tokens_array.Length(); i++) {
+ guide_tokens.push_back(guide_tokens_array.Get(i).ToNumber().Int32Value());
+ }
+ } else if (guide_tokens_value.IsTypedArray()) {
+ auto guide_tokens_typed_array = guide_tokens_value.As<Napi::TypedArray>();
+ if (guide_tokens_typed_array.TypedArrayType() == napi_int32_array) {
+ auto guide_tokens_int32_array = guide_tokens_value.As<Napi::Int32Array>();
+ size_t length = guide_tokens_int32_array.ElementLength();
+ const int32_t* data = guide_tokens_int32_array.Data();
+ guide_tokens.resize(length);
+ memcpy(guide_tokens.data(), data, length * sizeof(int32_t));
+ } else {
+ Napi::TypeError::New(env, "guide_tokens must be Array<number> or Int32Array").ThrowAsJavaScriptException();
+ return env.Undefined();
+ }
+ } else {
+ Napi::TypeError::New(env, "guide_tokens must be Array<number> or Int32Array").ThrowAsJavaScriptException();
+ return env.Undefined();
  }
  }

@@ -1345,7 +1364,7 @@ Napi::Value LlamaContext::IsVocoderEnabled(const Napi::CallbackInfo &info) {
  return Napi::Boolean::New(env, _has_vocoder);
  }

- // getFormattedAudioCompletion(speaker: string|null, text: string): string
+ // getFormattedAudioCompletion(speaker: string|null, text: string): object
  Napi::Value
  LlamaContext::GetFormattedAudioCompletion(const Napi::CallbackInfo &info) {
  Napi::Env env = info.Env();
@@ -1372,9 +1391,16 @@ LlamaContext::GetFormattedAudioCompletion(const Napi::CallbackInfo &info) {
  audio_text = audio_text_from_speaker(speaker, type);
  audio_data = audio_data_from_speaker(speaker, type);
  }
- return Napi::String::New(env, "<|im_start|>\n" + audio_text +
- process_text(text, type) +
- "<|text_end|>\n" + audio_data + "\n");
+ std::string prompt = "<|im_start|>\n" + audio_text +
+ process_text(text, type) +
+ "<|text_end|>\n" + audio_data + "\n";
+ Napi::Object result = Napi::Object::New(env);
+ result.Set("prompt", prompt);
+ const char *grammar = get_tts_grammar(type);
+ if (grammar != nullptr) {
+ result.Set("grammar", grammar);
+ }
+ return result;
  }

  // getAudioCompletionGuideTokens(text: string): Int32Array
@@ -1415,6 +1441,10 @@ LlamaContext::GetAudioCompletionGuideTokens(const Napi::CallbackInfo &info) {
  if (tmp.size() > 0) {
  result.push_back(tmp[0]);
  }
+
+ // Add Audio End, forcing stop generation
+ result.push_back(common_tokenize(vocab, "<|audio_end|>", false, true)[0]);
+
  auto tokens = Napi::Int32Array::New(env, result.size());
  memcpy(tokens.Data(), result.data(), result.size() * sizeof(int32_t));
  return tokens;
@@ -1449,7 +1479,7 @@ Napi::Value LlamaContext::DecodeAudioTokens(const Napi::CallbackInfo &info) {
  .ThrowAsJavaScriptException();
  return env.Undefined();
  }
- if (type == OUTETTS_V0_3 || type == OUTETTS_V0_2) {
+ if (type == OUTETTS_V0_1 || type == OUTETTS_V0_2 || type == OUTETTS_V0_3) {
  tokens.erase(
  std::remove_if(tokens.begin(), tokens.end(),
  [](llama_token t) { return t < 151672 || t > 155772; }),
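
Putting the TTS-related changes together, a sketch of the flow they enable on the JS side. Whether these calls go through the wrapper or the raw native context, the vocoder path, the prompt-based completion option, and the audio_tokens field on the result are assumptions; method names and return shapes follow the typings earlier in this diff.

    // ctx obtained via loadModel as in the earlier sketches.
    ctx.initVocoder({ path: './models/vocoder.gguf', n_batch: 512 }) // now returns boolean synchronously

    const { prompt, grammar } = ctx.getFormattedAudioCompletion(null, 'Hello world')
    // Guide tokens now end with an <|audio_end|> token to force generation to stop.
    const guideTokens = ctx.getAudioCompletionGuideTokens('Hello world')

    const result = await ctx.completion({
      prompt,                    // prompt-based completion assumed to exist alongside messages
      grammar,                   // only returned for TTS types that define one
      guide_tokens: guideTokens, // Int32Array or plain number[] are both accepted now
    })

    // audio_tokens is assumed to be populated when the vocoder is enabled.
    const samples = await ctx.decodeAudioTokens(result.audio_tokens)
    ctx.releaseVocoder() // now returns void
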