@fugood/llama.node 1.3.3 → 1.3.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32)
  1. package/CMakeLists.txt +7 -3
  2. package/lib/binding.js +1 -1
  3. package/lib/binding.ts +40 -14
  4. package/lib/index.js +4 -1
  5. package/lib/index.ts +13 -9
  6. package/package.json +14 -14
  7. package/scripts/llama.cpp.patch +10 -10
  8. package/src/LlamaCompletionWorker.cpp +33 -33
  9. package/src/LlamaContext.cpp +53 -16
  10. package/src/LlamaContext.h +2 -0
  11. package/src/llama.cpp/common/CMakeLists.txt +2 -0
  12. package/src/llama.cpp/common/chat-parser-xml-toolcall.cpp +861 -0
  13. package/src/llama.cpp/common/chat-parser-xml-toolcall.h +45 -0
  14. package/src/llama.cpp/common/chat-parser.h +10 -0
  15. package/src/llama.cpp/common/chat.cpp +461 -87
  16. package/src/llama.cpp/common/chat.h +6 -0
  17. package/src/llama.cpp/common/common.cpp +8 -1
  18. package/src/llama.cpp/common/common.h +12 -5
  19. package/src/llama.cpp/common/json-partial.cpp +19 -2
  20. package/src/llama.cpp/common/json-schema-to-grammar.cpp +2 -0
  21. package/src/llama.cpp/common/json-schema-to-grammar.h +2 -0
  22. package/src/llama.cpp/common/sampling.cpp +60 -6
  23. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +31 -38
  24. package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp +6 -6
  25. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +15 -5
  26. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +2 -3
  27. package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +16 -14
  28. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +49 -48
  29. package/src/llama.cpp/src/llama-grammar.cpp +17 -9
  30. package/src/llama.cpp/src/llama-impl.cpp +3 -3
  31. package/src/llama.cpp/src/llama-sampling.cpp +3 -6
  32. package/src/llama.cpp/src/llama-vocab.cpp +1 -0
package/CMakeLists.txt CHANGED
@@ -120,16 +120,20 @@ if (CMAKE_CXX_COMPILER_ID STREQUAL "Clang" AND NOT DEFINED GGML_OPENMP OR GGML_O
  endif()

  set(LLAMA_BUILD_COMMON ON CACHE BOOL "Build common")
-
- set(LLAMA_BUILD_TOOLS ON CACHE BOOL "Build tools")
-
+ set(LLAMA_BUILD_TOOLS OFF CACHE BOOL "Build tools")
+ set(LLAMA_BUILD_TESTS OFF CACHE BOOL "Build tests")
+ set(LLAMA_BUILD_SERVER OFF CACHE BOOL "Build server")
+ set(LLAMA_BUILD_EXAMPLES OFF CACHE BOOL "Build examples")
  set(LLAMA_CURL OFF CACHE BOOL "Build curl")

+ set(LLAMA_INSTALL_VERSION "0.0.0") # TODO: Set the version number (0.0.<BUILD_NUMBER>)
+
  set(BUILD_SHARED_LIBS OFF CACHE BOOL "Build shared libraries")

  add_definitions(-DGGML_MAX_NAME=80)

  add_subdirectory("src/llama.cpp")
+ add_subdirectory("src/llama.cpp/tools/mtmd")

  include_directories(
  ${CMAKE_JS_INC}
package/lib/binding.js CHANGED
@@ -51,7 +51,7 @@ const getPlatformPackageName = (variant) => {
  };
  const loadPlatformPackage = (packageName) => __awaiter(void 0, void 0, void 0, function* () {
  try {
- return yield Promise.resolve(`${packageName}`).then(s => __importStar(require(s)));
+ return (yield Promise.resolve(`${packageName}`).then(s => __importStar(require(s))));
  }
  catch (error) {
  return null;
package/lib/binding.ts CHANGED
@@ -1,9 +1,9 @@
  export type MessagePart = {
- type: string,
- text?: string,
+ type: string
+ text?: string
  image_url?: {
  url?: string
- },
+ }
  input_audio?: {
  format: string
  data?: string
@@ -70,6 +70,12 @@ export type LlamaModelOptions = {
  * Number of layers to keep MoE weights on CPU
  */
  n_cpu_moe?: number
+ /**
+ * List of device names to use for offloading
+ * Device names can be obtained from getBackendDevicesInfo()
+ * Example: ['Metal', 'BLAS', 'CPU']
+ */
+ devices?: string[]
  use_mlock?: boolean
  use_mmap?: boolean
  vocab_only?: boolean
@@ -375,9 +381,13 @@ export type ToolCall = {
  }

  export interface LlamaContext {
- new (options: LlamaModelOptions, onProgress?: (progress: number) => void): LlamaContext
+ new (
+ options: LlamaModelOptions,
+ onProgress?: (progress: number) => void,
+ ): LlamaContext
  getSystemInfo(): string
  getModelInfo(): ModelInfo
+ getUsedDevices(): string[]
  getFormattedChat(
  messages: ChatMessage[],
  chat_template?: string,
@@ -400,8 +410,15 @@ export interface LlamaContext {
  stopCompletion(): void
  tokenize(text: string, media_paths?: string[]): Promise<TokenizeResult>
  detokenize(tokens: number[]): Promise<string>
- embedding(text: string, params?: { embd_normalize?: number }): Promise<EmbeddingResult>
- rerank(query: string, documents: string[], params?: RerankParams): Promise<RerankResult[]>
+ embedding(
+ text: string,
+ params?: { embd_normalize?: number },
+ ): Promise<EmbeddingResult>
+ rerank(
+ query: string,
+ documents: string[],
+ params?: RerankParams,
+ ): Promise<RerankResult[]>
  saveSession(path: string): Promise<void>
  loadSession(path: string): Promise<void>
  release(): Promise<void>
@@ -440,7 +457,7 @@ export interface LlamaContext {
  * @param options Object containing path and optional n_batch
  * @returns boolean indicating if loading was successful
  */
- initVocoder(options: { path: string, n_batch?: number }): boolean
+ initVocoder(options: { path: string; n_batch?: number }): boolean

  /**
  * Unload the vocoder model
@@ -459,7 +476,10 @@ export interface LlamaContext {
  * @param text Text to complete
  * @returns Formatted audio completion
  */
- getFormattedAudioCompletion(speaker: string|null, text: string): {
+ getFormattedAudioCompletion(
+ speaker: string | null,
+ text: string,
+ ): {
  prompt: string
  grammar?: string
  }
@@ -476,7 +496,7 @@ export interface LlamaContext {
  * @param tokens Tokens to decode
  * @returns Promise resolving to decoded audio tokens
  */
- decodeAudioTokens(tokens: number[]|Int32Array): Promise<Float32Array>
+ decodeAudioTokens(tokens: number[] | Int32Array): Promise<Float32Array>

  // Parallel decoding methods

@@ -485,7 +505,7 @@ export interface LlamaContext {
  * @param params Configuration for parallel mode
  * @returns boolean indicating if successful
  */
- enableParallelMode(params: { n_parallel?: number, n_batch?: number }): boolean
+ enableParallelMode(params: { n_parallel?: number; n_batch?: number }): boolean

  /**
  * Disable parallel decoding mode
@@ -563,9 +583,11 @@ const getPlatformPackageName = (variant?: LibVariant): string => {
  return `@fugood/node-llama-${platform}-${arch}${variantSuffix}`
  }

- const loadPlatformPackage = async (packageName: string): Promise<Module | null> => {
+ const loadPlatformPackage = async (
+ packageName: string,
+ ): Promise<Module | null> => {
  try {
- return await import(packageName) as Module
+ return (await import(packageName)) as Module
  } catch (error) {
  return null
  }
@@ -579,7 +601,9 @@ export const loadModule = async (variant?: LibVariant): Promise<Module> => {

  module = await loadPlatformPackage(getPlatformPackageName())
  if (module) {
- console.warn(`Not found package for variant "${variant}", fallback to default`)
+ console.warn(
+ `Not found package for variant "${variant}", fallback to default`,
+ )
  return module
  }

@@ -588,7 +612,9 @@ export const loadModule = async (variant?: LibVariant): Promise<Module> => {
  return (await import('../build/Release/index.node')) as Module
  }

- export const isLibVariantAvailable = async (variant?: LibVariant): Promise<boolean> => {
+ export const isLibVariantAvailable = async (
+ variant?: LibVariant,
+ ): Promise<boolean> => {
  if (variant && variant !== 'default') {
  const module = await loadPlatformPackage(getPlatformPackageName(variant))
  return module != null
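Note: the new devices field and getUsedDevices() introduced above can be combined roughly as follows. This is a minimal sketch based only on the type declarations in this diff; it assumes the binding types are re-exported from the package entry point, the model-path field name is an assumption, and how the context itself is constructed is not shown here.

    import type { LlamaContext, LlamaModelOptions } from '@fugood/llama.node'

    // Restrict offloading to specific backends; valid names come from
    // getBackendDevicesInfo() (see lib/index.ts below).
    const options: LlamaModelOptions = {
      model: 'path/to/model.gguf', // assumed field name for the model path
      devices: ['Metal', 'CPU'],
    }

    // Once a context has been created from these options, getUsedDevices()
    // reports which backend devices were actually selected.
    const logUsedDevices = (ctx: LlamaContext) => {
      console.log(ctx.getUsedDevices()) // e.g. ['Metal']
    }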
package/lib/index.js CHANGED
@@ -76,6 +76,9 @@ class LlamaContextWrapper {
  getModelInfo() {
  return this.ctx.getModelInfo();
  }
+ getUsedDevices() {
+ return this.ctx.getUsedDevices();
+ }
  isJinjaSupported() {
  const { minja } = this.ctx.getModelInfo().chatTemplates;
  return !!(minja === null || minja === void 0 ? void 0 : minja.toolUse) || !!(minja === null || minja === void 0 ? void 0 : minja.default);
@@ -85,7 +88,7 @@ class LlamaContextWrapper {
  }
  getFormattedChat(messages, template, params) {
  var _a;
- const { messages: chat, has_media, media_paths, } = (0, utils_1.formatMediaChat)(messages);
+ const { messages: chat, has_media, media_paths } = (0, utils_1.formatMediaChat)(messages);
  const useJinja = this.isJinjaSupported() && (params === null || params === void 0 ? void 0 : params.jinja);
  let tmpl;
  if (template)
package/lib/index.ts CHANGED
@@ -94,6 +94,10 @@ class LlamaContextWrapper {
  return this.ctx.getModelInfo()
  }

+ getUsedDevices(): string[] {
+ return this.ctx.getUsedDevices()
+ }
+
  isJinjaSupported(): boolean {
  const { minja } = this.ctx.getModelInfo().chatTemplates
  return !!minja?.toolUse || !!minja?.default
@@ -118,11 +122,7 @@ class LlamaContextWrapper {
  chat_template_kwargs?: Record<string, string>
  },
  ): FormattedChatResult {
- const {
- messages: chat,
- has_media,
- media_paths,
- } = formatMediaChat(messages)
+ const { messages: chat, has_media, media_paths } = formatMediaChat(messages)

  const useJinja = this.isJinjaSupported() && params?.jinja
  let tmpl
@@ -169,8 +169,9 @@ class LlamaContextWrapper {
  options: LlamaCompletionOptions,
  callback?: (token: LlamaCompletionToken) => void,
  ): Promise<LlamaCompletionResult> {
- const { messages, media_paths = options.media_paths } =
- formatMediaChat(options.messages)
+ const { messages, media_paths = options.media_paths } = formatMediaChat(
+ options.messages,
+ )
  return this.ctx.completion(
  {
  ...options,
@@ -196,7 +197,10 @@ class LlamaContextWrapper {
  return this.ctx.detokenize(tokens)
  }

- embedding(text: string, params?: { embd_normalize?: number }): Promise<EmbeddingResult> {
+ embedding(
+ text: string,
+ params?: { embd_normalize?: number },
+ ): Promise<EmbeddingResult> {
  return this.ctx.embedding(text, params)
  }

@@ -329,7 +333,7 @@ export const loadLlamaModelInfo = async (
  }

  export const getBackendDevicesInfo = async (
- variant: LibVariant = 'default'
+ variant: LibVariant = 'default',
  ): Promise<import('./binding').BackendDeviceInfo[]> => {
  mods[variant] ??= await loadModule(variant)
  refreshNativeLogSetup()
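Note: the getBackendDevicesInfo export adjusted above is the discovery side of the devices option; a short hedged sketch (again assuming it is available from the package root, and without relying on the exact BackendDeviceInfo shape):

    import { getBackendDevicesInfo } from '@fugood/llama.node'

    const listDevices = async () => {
      // Enumerate backend devices for the default variant; each entry is a
      // BackendDeviceInfo as declared in lib/binding.ts.
      const devices = await getBackendDevicesInfo()
      console.log(devices)
    }

    listDevices()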
package/package.json CHANGED
@@ -1,7 +1,7 @@
  {
  "name": "@fugood/llama.node",
  "access": "public",
- "version": "1.3.3",
+ "version": "1.3.5",
  "description": "An another Node binding of llama.cpp",
  "main": "lib/index.js",
  "scripts": {
@@ -72,19 +72,19 @@
  "CMakeLists.txt"
  ],
  "optionalDependencies": {
- "@fugood/node-llama-linux-x64": "1.3.3",
- "@fugood/node-llama-linux-x64-vulkan": "1.3.3",
- "@fugood/node-llama-linux-x64-cuda": "1.3.3",
- "@fugood/node-llama-linux-arm64": "1.3.3",
- "@fugood/node-llama-linux-arm64-vulkan": "1.3.3",
- "@fugood/node-llama-linux-arm64-cuda": "1.3.3",
- "@fugood/node-llama-win32-x64": "1.3.3",
- "@fugood/node-llama-win32-x64-vulkan": "1.3.3",
- "@fugood/node-llama-win32-x64-cuda": "1.3.3",
- "@fugood/node-llama-win32-arm64": "1.3.3",
- "@fugood/node-llama-win32-arm64-vulkan": "1.3.3",
- "@fugood/node-llama-darwin-x64": "1.3.3",
- "@fugood/node-llama-darwin-arm64": "1.3.3"
+ "@fugood/node-llama-linux-x64": "1.3.5",
+ "@fugood/node-llama-linux-x64-vulkan": "1.3.5",
+ "@fugood/node-llama-linux-x64-cuda": "1.3.5",
+ "@fugood/node-llama-linux-arm64": "1.3.5",
+ "@fugood/node-llama-linux-arm64-vulkan": "1.3.5",
+ "@fugood/node-llama-linux-arm64-cuda": "1.3.5",
+ "@fugood/node-llama-win32-x64": "1.3.5",
+ "@fugood/node-llama-win32-x64-vulkan": "1.3.5",
+ "@fugood/node-llama-win32-x64-cuda": "1.3.5",
+ "@fugood/node-llama-win32-arm64": "1.3.5",
+ "@fugood/node-llama-win32-arm64-vulkan": "1.3.5",
+ "@fugood/node-llama-darwin-x64": "1.3.5",
+ "@fugood/node-llama-darwin-arm64": "1.3.5"
  },
  "devDependencies": {
  "@babel/preset-env": "^7.24.4",
package/scripts/llama.cpp.patch CHANGED
@@ -1,8 +1,8 @@
  diff --git a/src/llama.cpp/common/CMakeLists.txt b/src/llama.cpp/common/CMakeLists.txt
- index 706fa32ee..248459903 100644
+ index bb168e835..cfc0e2c2e 100644
  --- a/src/llama.cpp/common/CMakeLists.txt
  +++ b/src/llama.cpp/common/CMakeLists.txt
- @@ -141,9 +141,16 @@ if (LLAMA_LLGUIDANCE)
+ @@ -143,9 +143,16 @@ if (LLAMA_LLGUIDANCE)
  set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} llguidance ${LLGUIDANCE_PLATFORM_LIBS})
  endif ()

@@ -21,7 +21,7 @@ index 706fa32ee..248459903 100644

  #
  diff --git a/src/llama.cpp/common/chat.cpp b/src/llama.cpp/common/chat.cpp
- index 938872e82..6364f173f 100644
+ index 6fa05a604..87dfa7a8b 100644
  --- a/src/llama.cpp/common/chat.cpp
  +++ b/src/llama.cpp/common/chat.cpp
  @@ -6,9 +6,6 @@
@@ -51,7 +51,7 @@ index 938872e82..6364f173f 100644
  struct templates_params {
  json messages;
  json tools;
- @@ -811,7 +798,7 @@ static std::string apply(
+ @@ -817,7 +804,7 @@ static std::string apply(
  tmpl_inputs.extra_context.merge_patch(*additional_context);
  }
  // TODO: add flag to control date/time, if only for testing purposes.
@@ -61,7 +61,7 @@ index 938872e82..6364f173f 100644
  minja::chat_template_options tmpl_opts;
  // To avoid double BOS / EOS tokens, we're manually removing begining / trailing tokens
  diff --git a/src/llama.cpp/common/chat.h b/src/llama.cpp/common/chat.h
- index 50efb0d4e..f471a84c7 100644
+ index 754c411e2..71241a6cc 100644
  --- a/src/llama.cpp/common/chat.h
  +++ b/src/llama.cpp/common/chat.h
  @@ -9,7 +9,18 @@
@@ -85,10 +85,10 @@ index 50efb0d4e..f471a84c7 100644
  struct common_chat_tool_call {
  std::string name;
  diff --git a/src/llama.cpp/common/common.cpp b/src/llama.cpp/common/common.cpp
- index 4dc95dcba..ea0ea86c0 100644
+ index f3cc55247..65398844f 100644
  --- a/src/llama.cpp/common/common.cpp
  +++ b/src/llama.cpp/common/common.cpp
- @@ -1155,6 +1155,7 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
+ @@ -1162,6 +1162,7 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
  mparams.n_gpu_layers = params.n_gpu_layers;
  }

@@ -97,10 +97,10 @@ index 4dc95dcba..ea0ea86c0 100644
  mparams.split_mode = params.split_mode;
  mparams.tensor_split = params.tensor_split;
  diff --git a/src/llama.cpp/common/common.h b/src/llama.cpp/common/common.h
- index f42c083fa..c573cc812 100644
+ index de5b404dd..d30d252c9 100644
  --- a/src/llama.cpp/common/common.h
  +++ b/src/llama.cpp/common/common.h
- @@ -274,6 +274,7 @@ struct lr_opt {
+ @@ -281,6 +281,7 @@ struct lr_opt {
  struct ggml_opt_optimizer_params common_opt_lr_pars(void * userdata);

  struct common_params {
@@ -109,7 +109,7 @@ index f42c083fa..c573cc812 100644
  int32_t n_ctx = 4096; // context size
  int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
  diff --git a/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt b/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt
- index e52e050a8..c1000c162 100644
+ index d0cab0bcb..48d532838 100644
  --- a/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt
  +++ b/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt
  @@ -106,7 +106,7 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
package/src/LlamaCompletionWorker.cpp CHANGED
@@ -9,10 +9,10 @@ Napi::Array TokenProbsToArray(Napi::Env env, llama_context* ctx, const std::vect
  for (size_t i = 0; i < probs.size(); i++) {
  const auto &prob = probs[i];
  Napi::Object token_obj = Napi::Object::New(env);
-
+
  std::string token_str = common_token_to_piece(ctx, prob.tok);
  token_obj.Set("content", Napi::String::New(env, token_str));
-
+
  Napi::Array token_probs = Napi::Array::New(env);
  for (size_t j = 0; j < prob.probs.size(); j++) {
  const auto &p = prob.probs[j];
@@ -83,10 +83,10 @@ void LlamaCompletionWorker::Execute() {
  }

  auto completion = _rn_ctx->completion;
-
+
  // Prepare completion context
  completion->rewind();
-
+
  // Set up parameters
  _rn_ctx->params.prompt = _params.prompt;
  _rn_ctx->params.sampling = _params.sampling;
@@ -95,50 +95,50 @@ void LlamaCompletionWorker::Execute() {
  _rn_ctx->params.n_ctx = _params.n_ctx;
  _rn_ctx->params.n_batch = _params.n_batch;
  _rn_ctx->params.ctx_shift = _params.ctx_shift;
-
+
  // Set prefill text
  completion->prefill_text = _prefill_text;
-
+
  // Set up TTS guide tokens if enabled
  if (_has_vocoder && _rn_ctx->tts_wrapper != nullptr) {
  _rn_ctx->tts_wrapper->guide_tokens = _guide_tokens;
  _rn_ctx->tts_wrapper->next_token_uses_guide_token = true;
  }
-
+
  // Initialize sampling
  if (!completion->initSampling()) {
  SetError("Failed to initialize sampling");
  return;
  }
-
+
  // Load prompt (handles both text-only and multimodal)
  completion->loadPrompt(_media_paths);
-
+
  // Check if context is full after loading prompt
  if (completion->context_full) {
  _result.context_full = true;
  return;
  }
-
+
  // Begin completion with chat format and reasoning settings
  completion->beginCompletion(_chat_format, common_reasoning_format_from_name(_reasoning_format), _thinking_forced_open);
-
+
  // Main completion loop
  int token_count = 0;
  const int max_tokens = _params.n_predict < 0 ? std::numeric_limits<int>::max() : _params.n_predict;
  while (completion->has_next_token && !_interrupted && token_count < max_tokens) {
  // Get next token using rn-llama completion
  rnllama::completion_token_output token_output = completion->doCompletion();
-
+
  if (token_output.tok == -1) {
  break;
  }
-
+
  token_count++;
-
+
  std::string token_text = common_token_to_piece(_rn_ctx->ctx, token_output.tok);
  _result.text += token_text;
-
+
  // Check for stopping strings after adding the token
  if (!_stop_words.empty()) {
  size_t stop_pos = completion->findStoppingStrings(_result.text, token_text.size(), rnllama::STOP_FULL);
@@ -148,7 +148,7 @@ void LlamaCompletionWorker::Execute() {
  break;
  }
  }
-
+
  // Handle streaming callback
  if (_has_callback && !completion->incomplete) {
  struct TokenData {
@@ -160,9 +160,9 @@ void LlamaCompletionWorker::Execute() {
  std::vector<rnllama::completion_token_output> completion_probabilities;
  llama_context* ctx;
  };
-
+
  auto partial_output = completion->parseChatOutput(true);
-
+
  // Extract completion probabilities if n_probs > 0, similar to iOS implementation
  std::vector<rnllama::completion_token_output> probs_output;
  if (_rn_ctx->params.sampling.n_probs > 0) {
@@ -171,23 +171,23 @@ void LlamaCompletionWorker::Execute() {
  size_t probs_stop_pos = std::min(_sent_token_probs_index + to_send_toks.size(), completion->generated_token_probs.size());
  if (probs_pos < probs_stop_pos) {
  probs_output = std::vector<rnllama::completion_token_output>(
- completion->generated_token_probs.begin() + probs_pos,
+ completion->generated_token_probs.begin() + probs_pos,
  completion->generated_token_probs.begin() + probs_stop_pos
  );
  }
  _sent_token_probs_index = probs_stop_pos;
  }
-
+
  TokenData *token_data = new TokenData{
- token_text,
- partial_output.content,
- partial_output.reasoning_content,
- partial_output.tool_calls,
+ token_text,
+ partial_output.content,
+ partial_output.reasoning_content,
+ partial_output.tool_calls,
  partial_output.accumulated_text,
  probs_output,
  _rn_ctx->ctx
  };
-
+
  _tsfn.BlockingCall(token_data, [](Napi::Env env, Napi::Function jsCallback,
  TokenData *data) {
  auto obj = Napi::Object::New(env);
@@ -216,25 +216,25 @@ void LlamaCompletionWorker::Execute() {
  obj.Set("tool_calls", tool_calls);
  }
  obj.Set("accumulated_text", Napi::String::New(env, data->accumulated_text));
-
+
  // Add completion_probabilities if available
  if (!data->completion_probabilities.empty()) {
  obj.Set("completion_probabilities", TokenProbsToArray(env, data->ctx, data->completion_probabilities));
  }
-
+
  delete data;
  jsCallback.Call({obj});
  });
  }
  }
-
+
  // Check stopping conditions
  if (token_count >= max_tokens) {
  _result.stopped_limited = true;
  } else if (!completion->has_next_token && completion->n_remain == 0) {
  _result.stopped_limited = true;
  }
-
+
  // Set completion results from rn-llama completion context
  // tokens_evaluated should include both prompt tokens and generated tokens that were processed
  _result.tokens_evaluated = completion->num_prompt_tokens + completion->num_tokens_predicted;
@@ -245,20 +245,20 @@ void LlamaCompletionWorker::Execute() {
  _result.stopped_words = completion->stopped_word;
  _result.stopping_word = completion->stopping_word;
  _result.stopped_limited = completion->stopped_limit;
-
+
  // Get audio tokens if TTS is enabled
  if (_has_vocoder && _rn_ctx->tts_wrapper != nullptr) {
  _result.audio_tokens = _rn_ctx->tts_wrapper->audio_tokens;
  }
-
+ common_perf_print(_rn_ctx->ctx, _rn_ctx->completion->ctx_sampling);
  // End completion
  completion->endCompletion();
-
+
  } catch (const std::exception &e) {
  SetError(e.what());
  return;
  }
-
+
  if (_onComplete) {
  _onComplete();
  }