@fugood/llama.node 1.1.8 → 1.1.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47)
  1. package/lib/binding.ts +9 -0
  2. package/lib/index.js +9 -2
  3. package/lib/index.ts +57 -30
  4. package/lib/version.js +2 -2
  5. package/lib/version.ts +2 -2
  6. package/package.json +14 -14
  7. package/scripts/llama.cpp.patch +15 -5
  8. package/src/LlamaCompletionWorker.cpp +12 -3
  9. package/src/LlamaCompletionWorker.h +3 -1
  10. package/src/LlamaContext.cpp +14 -1
  11. package/src/llama.cpp/common/arg.cpp +6 -4
  12. package/src/llama.cpp/common/chat.cpp +34 -3
  13. package/src/llama.cpp/common/common.cpp +0 -15
  14. package/src/llama.cpp/common/common.h +1 -2
  15. package/src/llama.cpp/ggml/CMakeLists.txt +0 -1
  16. package/src/llama.cpp/ggml/include/ggml.h +25 -0
  17. package/src/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +316 -0
  18. package/src/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +0 -2
  19. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +8 -0
  20. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +6 -0
  21. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +142 -0
  22. package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +1 -0
  23. package/src/llama.cpp/include/llama.h +1 -110
  24. package/src/llama.cpp/src/CMakeLists.txt +2 -2
  25. package/src/llama.cpp/src/llama-arch.cpp +19 -0
  26. package/src/llama.cpp/src/llama-arch.h +1 -0
  27. package/src/llama.cpp/src/llama-chat.cpp +13 -2
  28. package/src/llama.cpp/src/llama-chat.h +1 -0
  29. package/src/llama.cpp/src/llama-context.cpp +5 -192
  30. package/src/llama.cpp/src/llama-context.h +2 -7
  31. package/src/llama.cpp/src/llama-cparams.h +0 -1
  32. package/src/llama.cpp/src/llama-graph.cpp +35 -57
  33. package/src/llama.cpp/src/llama-graph.h +36 -46
  34. package/src/llama.cpp/src/llama-hparams.cpp +25 -0
  35. package/src/llama.cpp/src/llama-hparams.h +6 -0
  36. package/src/llama.cpp/src/{llama-kv-cache-unified-iswa.cpp → llama-kv-cache-iswa.cpp} +69 -52
  37. package/src/llama.cpp/src/{llama-kv-cache-unified-iswa.h → llama-kv-cache-iswa.h} +28 -26
  38. package/src/llama.cpp/src/{llama-kv-cache-unified.cpp → llama-kv-cache.cpp} +123 -474
  39. package/src/llama.cpp/src/{llama-kv-cache-unified.h → llama-kv-cache.h} +34 -59
  40. package/src/llama.cpp/src/llama-kv-cells.h +21 -21
  41. package/src/llama.cpp/src/llama-memory-hybrid.cpp +34 -33
  42. package/src/llama.cpp/src/llama-memory-hybrid.h +24 -28
  43. package/src/llama.cpp/src/llama-memory-recurrent.cpp +7 -7
  44. package/src/llama.cpp/src/llama-memory-recurrent.h +8 -12
  45. package/src/llama.cpp/src/llama-memory.h +11 -8
  46. package/src/llama.cpp/src/llama-model.cpp +396 -187
  47. package/src/llama.cpp/src/llama-model.h +1 -0
package/lib/binding.ts CHANGED
@@ -59,6 +59,10 @@ export type LlamaModelOptions = {
  * Use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
  */
  swa_full?: boolean
+ /**
+ * Number of layers to keep MoE weights on CPU
+ */
+ n_cpu_moe?: number
  use_mlock?: boolean
  use_mmap?: boolean
  vocab_only?: boolean
@@ -96,6 +100,11 @@ export type LlamaCompletionOptions = {
  enable_thinking?: boolean
  thinking_forced_open?: boolean
  prompt?: string
+ /**
+ * Text to prefill the response with.
+ * This text will be added to the beginning of the generated response.
+ */
+ prefill_text?: string
  temperature?: number
  top_k?: number
  top_p?: number
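A minimal usage sketch for the two new options typed above (n_cpu_moe on LlamaModelOptions, prefill_text on LlamaCompletionOptions). The loadModel entry point, the model path field, and the result's text property are assumptions for illustration, not part of this diff; adjust them to the package's actual exports:

  import { loadModel } from '@fugood/llama.node' // assumed entry point

  const ctx = await loadModel({
    model: './model.gguf', // hypothetical local GGUF path; field name assumed
    n_cpu_moe: 8,          // keep MoE expert weights of the first 8 layers on the CPU
  })

  const result = await ctx.completion({
    messages: [{ role: 'user', content: 'Hello!' }],
    prefill_text: 'Sure, ', // prepended to the generated response before chat parsing
  })
  console.log(result.text)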
package/lib/index.js CHANGED
@@ -148,7 +148,12 @@ class LlamaContextWrapper {
  enable_thinking: (_a = params === null || params === void 0 ? void 0 : params.enable_thinking) !== null && _a !== void 0 ? _a : true,
  add_generation_prompt: params === null || params === void 0 ? void 0 : params.add_generation_prompt,
  now: params === null || params === void 0 ? void 0 : params.now,
- chat_template_kwargs: params === null || params === void 0 ? void 0 : params.chat_template_kwargs,
+ chat_template_kwargs: (params === null || params === void 0 ? void 0 : params.chat_template_kwargs)
+ ? Object.entries(params.chat_template_kwargs).reduce((acc, [key, value]) => {
+ acc[key] = JSON.stringify(value); // Each value is a stringified JSON object
+ return acc;
+ }, {})
+ : undefined,
  });
  if (!useJinja) {
  return {
@@ -179,7 +184,9 @@ class LlamaContextWrapper {
  return this.ctx.embedding(text);
  }
  rerank(query, documents, params) {
- return this.ctx.rerank(query, documents, params).then((results) => {
+ return this.ctx
+ .rerank(query, documents, params)
+ .then((results) => {
  // Sort by score descending and add document text for convenience
  return results
  .map((result) => (Object.assign(Object.assign({}, result), { document: documents[result.index] })))
package/lib/index.ts CHANGED
@@ -165,11 +165,11 @@ class LlamaContextWrapper {
  response_format?: CompletionResponseFormat
  tools?: Tool[]
  parallel_tool_calls?: boolean
- tool_choice?: string,
- enable_thinking?: boolean,
- add_generation_prompt?: boolean,
- now?: string | number,
- chat_template_kwargs?: Record<string, string>,
+ tool_choice?: string
+ enable_thinking?: boolean
+ add_generation_prompt?: boolean
+ now?: string | number
+ chat_template_kwargs?: Record<string, string>
  },
  ): FormattedChatResult {
  const {
@@ -192,7 +192,15 @@ class LlamaContextWrapper {
  enable_thinking: params?.enable_thinking ?? true,
  add_generation_prompt: params?.add_generation_prompt,
  now: params?.now,
- chat_template_kwargs: params?.chat_template_kwargs,
+ chat_template_kwargs: params?.chat_template_kwargs
+ ? Object.entries(params.chat_template_kwargs).reduce(
+ (acc, [key, value]) => {
+ acc[key] = JSON.stringify(value) // Each value is a stringified JSON object
+ return acc
+ },
+ {} as Record<string, any>,
+ )
+ : undefined,
  })

  if (!useJinja) {
@@ -218,18 +226,24 @@
  ): Promise<LlamaCompletionResult> {
  const { messages, media_paths = options.media_paths } =
  this._formatMediaChat(options.messages)
- return this.ctx.completion({
- ...options,
- messages,
- media_paths: options.media_paths || media_paths,
- }, callback || (() => {}))
+ return this.ctx.completion(
+ {
+ ...options,
+ messages,
+ media_paths: options.media_paths || media_paths,
+ },
+ callback || (() => {}),
+ )
  }

  stopCompletion(): void {
  return this.ctx.stopCompletion()
  }

- tokenize(text: string, { media_paths }: { media_paths?: string[] } = {}): Promise<TokenizeResult> {
+ tokenize(
+ text: string,
+ { media_paths }: { media_paths?: string[] } = {},
+ ): Promise<TokenizeResult> {
  return this.ctx.tokenize(text, media_paths)
  }

@@ -241,16 +255,27 @@
  return this.ctx.embedding(text)
  }

- rerank(query: string, documents: string[], params?: RerankParams): Promise<Array<RerankResult & { document: string }>> {
- return this.ctx.rerank(query, documents, params).then((results: RerankResult[]) => {
- // Sort by score descending and add document text for convenience
- return results
- .map((result: RerankResult) => ({
- ...result,
- document: documents[result.index],
- }))
- .sort((a: RerankResult & { document: string }, b: RerankResult & { document: string }) => b.score - a.score)
- })
+ rerank(
+ query: string,
+ documents: string[],
+ params?: RerankParams,
+ ): Promise<Array<RerankResult & { document: string }>> {
+ return this.ctx
+ .rerank(query, documents, params)
+ .then((results: RerankResult[]) => {
+ // Sort by score descending and add document text for convenience
+ return results
+ .map((result: RerankResult) => ({
+ ...result,
+ document: documents[result.index],
+ }))
+ .sort(
+ (
+ a: RerankResult & { document: string },
+ b: RerankResult & { document: string },
+ ) => b.score - a.score,
+ )
+ })
  }

  saveSession(path: string): Promise<void> {
@@ -277,10 +302,7 @@ class LlamaContextWrapper {
  return this.ctx.getLoadedLoraAdapters()
  }

- initMultimodal(options: {
- path: string
- use_gpu?: boolean
- }): boolean {
+ initMultimodal(options: { path: string; use_gpu?: boolean }): boolean {
  return this.ctx.initMultimodal(options)
  }

@@ -299,7 +321,7 @@ class LlamaContextWrapper {
  return this.ctx.getMultimodalSupport()
  }

- initVocoder(options: { path: string, n_batch?: number }): boolean {
+ initVocoder(options: { path: string; n_batch?: number }): boolean {
  return this.ctx.initVocoder(options)
  }

@@ -311,7 +333,10 @@ class LlamaContextWrapper {
  return this.ctx.isVocoderEnabled()
  }

- getFormattedAudioCompletion(speaker: string|null, text: string): {
+ getFormattedAudioCompletion(
+ speaker: string | null,
+ text: string,
+ ): {
  prompt: string
  grammar?: string
  } {
@@ -322,7 +347,7 @@ class LlamaContextWrapper {
  return this.ctx.getAudioCompletionGuideTokens(text)
  }

- decodeAudioTokens(tokens: number[]|Int32Array): Promise<Float32Array> {
+ decodeAudioTokens(tokens: number[] | Int32Array): Promise<Float32Array> {
  return this.ctx.decodeAudioTokens(tokens)
  }
  }
@@ -348,7 +373,9 @@ const modelInfoSkip = [
  'tokenizer.ggml.scores',
  ]

- export const loadLlamaModelInfo = async (path: string): Promise<GGUFModelInfo> => {
+ export const loadLlamaModelInfo = async (
+ path: string,
+ ): Promise<GGUFModelInfo> => {
  const variant = 'default'
  mods[variant] ??= await loadModule(variant)
  refreshNativeLogSetup()
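The chat_template_kwargs handling in formatMessages above now JSON-stringifies each value before it is passed to the native layer. A standalone sketch of that transformation (the example keys and values are hypothetical):

  const kwargs = { enable_thinking: false, reasoning_effort: 'high' }

  const serialized = Object.entries(kwargs).reduce(
    (acc, [key, value]) => {
      acc[key] = JSON.stringify(value) // mirrors the reducer added above
      return acc
    },
    {} as Record<string, string>,
  )

  // serialized => { enable_thinking: 'false', reasoning_effort: '"high"' }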
package/lib/version.js CHANGED
@@ -1,5 +1,5 @@
  "use strict";
  Object.defineProperty(exports, "__esModule", { value: true });
  exports.BUILD_COMMIT = exports.BUILD_NUMBER = void 0;
- exports.BUILD_NUMBER = '6096';
- exports.BUILD_COMMIT = 'fd1234cb';
+ exports.BUILD_NUMBER = '6250';
+ exports.BUILD_COMMIT = 'e92734d51';
package/lib/version.ts CHANGED
@@ -1,2 +1,2 @@
- export const BUILD_NUMBER = '6096';
- export const BUILD_COMMIT = 'fd1234cb';
+ export const BUILD_NUMBER = '6250';
+ export const BUILD_COMMIT = 'e92734d51';
package/package.json CHANGED
@@ -1,7 +1,7 @@
  {
  "name": "@fugood/llama.node",
  "access": "public",
- "version": "1.1.8",
+ "version": "1.1.10",
  "description": "An another Node binding of llama.cpp",
  "main": "lib/index.js",
  "scripts": {
@@ -71,19 +71,19 @@
  "CMakeLists.txt"
  ],
  "optionalDependencies": {
- "@fugood/node-llama-linux-x64": "1.1.8",
- "@fugood/node-llama-linux-x64-vulkan": "1.1.8",
- "@fugood/node-llama-linux-x64-cuda": "1.1.8",
- "@fugood/node-llama-linux-arm64": "1.1.8",
- "@fugood/node-llama-linux-arm64-vulkan": "1.1.8",
- "@fugood/node-llama-linux-arm64-cuda": "1.1.8",
- "@fugood/node-llama-win32-x64": "1.1.8",
- "@fugood/node-llama-win32-x64-vulkan": "1.1.8",
- "@fugood/node-llama-win32-x64-cuda": "1.1.8",
- "@fugood/node-llama-win32-arm64": "1.1.8",
- "@fugood/node-llama-win32-arm64-vulkan": "1.1.8",
- "@fugood/node-llama-darwin-x64": "1.1.8",
- "@fugood/node-llama-darwin-arm64": "1.1.8"
+ "@fugood/node-llama-linux-x64": "1.1.10",
+ "@fugood/node-llama-linux-x64-vulkan": "1.1.10",
+ "@fugood/node-llama-linux-x64-cuda": "1.1.10",
+ "@fugood/node-llama-linux-arm64": "1.1.10",
+ "@fugood/node-llama-linux-arm64-vulkan": "1.1.10",
+ "@fugood/node-llama-linux-arm64-cuda": "1.1.10",
+ "@fugood/node-llama-win32-x64": "1.1.10",
+ "@fugood/node-llama-win32-x64-vulkan": "1.1.10",
+ "@fugood/node-llama-win32-x64-cuda": "1.1.10",
+ "@fugood/node-llama-win32-arm64": "1.1.10",
+ "@fugood/node-llama-win32-arm64-vulkan": "1.1.10",
+ "@fugood/node-llama-darwin-x64": "1.1.10",
+ "@fugood/node-llama-darwin-arm64": "1.1.10"
  },
  "devDependencies": {
  "@babel/preset-env": "^7.24.4",
package/scripts/llama.cpp.patch CHANGED
@@ -1,5 +1,5 @@
  diff --git a/src/llama.cpp/common/chat.cpp b/src/llama.cpp/common/chat.cpp
- index 23d3828f9..ca48af00c 100644
+ index 111b4a21b..16ce87672 100644
  --- a/src/llama.cpp/common/chat.cpp
  +++ b/src/llama.cpp/common/chat.cpp
  @@ -6,9 +6,6 @@
@@ -29,6 +29,16 @@ index 23d3828f9..ca48af00c 100644
  struct templates_params {
  json messages;
  json tools;
+ @@ -784,8 +771,7 @@ static std::string apply(
+ if (additional_context) {
+ tmpl_inputs.extra_context.merge_patch(*additional_context);
+ }
+ - // TODO: add flag to control date/time, if only for testing purposes.
+ - // tmpl_inputs.now = std::chrono::system_clock::now();
+ + tmpl_inputs.now = inputs.now;
+
+ minja::chat_template_options tmpl_opts;
+ // To avoid double BOS / EOS tokens, we're manually removing begining / trailing tokens
  diff --git a/src/llama.cpp/common/chat.h b/src/llama.cpp/common/chat.h
  index d1e480c91..437e64e29 100644
  --- a/src/llama.cpp/common/chat.h
@@ -54,10 +64,10 @@ index d1e480c91..437e64e29 100644
  struct common_chat_tool_call {
  std::string name;
  diff --git a/src/llama.cpp/common/common.cpp b/src/llama.cpp/common/common.cpp
- index 67dd5404f..909a97c66 100644
+ index fdce1dcde..55aac3412 100644
  --- a/src/llama.cpp/common/common.cpp
  +++ b/src/llama.cpp/common/common.cpp
- @@ -1117,6 +1117,7 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
+ @@ -1103,6 +1103,7 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
  mparams.n_gpu_layers = params.n_gpu_layers;
  }

@@ -66,10 +76,10 @@ index 67dd5404f..909a97c66 100644
  mparams.split_mode = params.split_mode;
  mparams.tensor_split = params.tensor_split;
  diff --git a/src/llama.cpp/common/common.h b/src/llama.cpp/common/common.h
- index 75596e6b3..0e04694c8 100644
+ index 390dda5e5..f259ca785 100644
  --- a/src/llama.cpp/common/common.h
  +++ b/src/llama.cpp/common/common.h
- @@ -267,6 +267,7 @@ struct lr_opt {
+ @@ -270,6 +270,7 @@ struct lr_opt {
  struct ggml_opt_optimizer_params common_opt_lr_pars(void * userdata);

  struct common_params {
package/src/LlamaCompletionWorker.cpp CHANGED
@@ -35,12 +35,14 @@ LlamaCompletionWorker::LlamaCompletionWorker(
  const std::vector<std::string> &media_paths,
  const std::vector<llama_token> &guide_tokens,
  bool has_vocoder,
- tts_type tts_type_val)
+ tts_type tts_type_val,
+ const std::string &prefill_text)
  : AsyncWorker(info.Env()), Deferred(info.Env()), _sess(sess),
  _params(params), _stop_words(stop_words), _chat_format(chat_format),
  _thinking_forced_open(thinking_forced_open),
  _reasoning_format(reasoning_format),
  _media_paths(media_paths), _guide_tokens(guide_tokens),
+ _prefill_text(prefill_text),
  _has_vocoder(has_vocoder), _tts_type(tts_type_val) {
  if (!callback.IsEmpty()) {
  _tsfn = Napi::ThreadSafeFunction::New(info.Env(), callback,
@@ -68,8 +70,11 @@ LlamaCompletionWorker::PartialOutput LlamaCompletionWorker::getPartialOutput(con

  chat_syntax.parse_tool_calls = true;

+ // Combine prefill_text with generated_text for parsing
+ std::string full_text = _prefill_text + generated_text;
+
  // Use is_partial=true for streaming partial output
- common_chat_msg parsed_msg = common_chat_parse(generated_text, true, chat_syntax);
+ common_chat_msg parsed_msg = common_chat_parse(full_text, true, chat_syntax);

  result.content = parsed_msg.content;
  result.reasoning_content = parsed_msg.reasoning_content;
@@ -156,6 +161,7 @@ void LlamaCompletionWorker::Execute() {
  auto embd = _sess->tokens_ptr();
  embd->reserve(embd->size() + max_len);

+
  if (is_enc_dec) {
  if (n_input > 0) {
  // Decode tokens in batches using n_batch as chunk size
@@ -378,8 +384,11 @@ void LlamaCompletionWorker::OnOK() {
  chat_syntax.thinking_forced_open = _thinking_forced_open;

  chat_syntax.reasoning_format = common_reasoning_format_from_name(_reasoning_format);
+
+ // Combine prefill_text with generated_text for final parsing
+ std::string full_text = _prefill_text + _result.text;
  common_chat_msg message = common_chat_parse(
- _result.text,
+ full_text,
  false,
  chat_syntax
  );
package/src/LlamaCompletionWorker.h CHANGED
@@ -26,7 +26,8 @@ public:
  const std::vector<std::string> &media_paths = {},
  const std::vector<llama_token> &guide_tokens = {},
  bool has_vocoder = false,
- tts_type tts_type_val = UNKNOWN);
+ tts_type tts_type_val = UNKNOWN,
+ const std::string &prefill_text = "");

  ~LlamaCompletionWorker();

@@ -58,6 +59,7 @@ private:
  std::string _reasoning_format;
  std::vector<std::string> _media_paths;
  std::vector<llama_token> _guide_tokens;
+ std::string _prefill_text;
  std::function<void()> _onComplete;
  bool _has_callback = false;
  bool _interrupted = false;
package/src/LlamaContext.cpp CHANGED
@@ -15,6 +15,7 @@
  #include "llama-impl.h"

  #include <atomic>
+ #include <list>
  #include <mutex>
  #include <queue>

@@ -258,6 +259,16 @@ LlamaContext::LlamaContext(const Napi::CallbackInfo &info)
  params.numa =
  static_cast<ggml_numa_strategy>(get_option<uint32_t>(options, "numa", 0));

+ int n_cpu_moe = get_option<int32_t>(options, "n_cpu_moe", 0);
+ if (n_cpu_moe > 0) {
+ static std::list<std::string> buft_overrides;
+ for (int i = 0; i < n_cpu_moe; ++i) {
+ buft_overrides.push_back(string_format("blk\\.%d\\.ffn_(up|down|gate)_exps", i));
+ params.tensor_buft_overrides.push_back({buft_overrides.back().c_str(), ggml_backend_cpu_buffer_type()});
+ }
+ params.tensor_buft_overrides.push_back({nullptr, nullptr});
+ }
+
  llama_backend_init();
  llama_numa_init(params.numa);

@@ -924,6 +935,8 @@ Napi::Value LlamaContext::Completion(const Napi::CallbackInfo &info) {
  json_schema_to_grammar(json::parse(json_schema_str));
  }

+ std::string prefill_text = get_option<std::string>(options, "prefill_text", "");
+
  params.n_predict = get_option<int32_t>(options, "n_predict", -1);
  params.sampling.temp = get_option<float>(options, "temperature", 0.80f);
  params.sampling.top_k = get_option<int32_t>(options, "top_k", 40);
@@ -996,7 +1009,7 @@ Napi::Value LlamaContext::Completion(const Napi::CallbackInfo &info) {
  auto *worker =
  new LlamaCompletionWorker(info, _sess, callback, params, stop_words,
  chat_format, thinking_forced_open, reasoning_format, media_paths, guide_tokens,
- _has_vocoder, _tts_type);
+ _has_vocoder, _tts_type, prefill_text);
  worker->Queue();
  _wip = worker;
  worker->OnComplete([this]() { _wip = nullptr; });
package/src/llama.cpp/common/arg.cpp CHANGED
@@ -1532,7 +1532,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY}).set_env("LLAMA_ARG_NO_CONTEXT_SHIFT"));
  add_opt(common_arg(
  {"--context-shift"},
- string_format("enables context shift on infinite text generation (default: %s)", params.ctx_shift ? "disabled" : "enabled"),
+ string_format("enables context shift on infinite text generation (default: %s)", params.ctx_shift ? "enabled" : "disabled"),
  [](common_params & params) {
  params.ctx_shift = true;
  }
@@ -1755,7 +1755,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  [](common_params & params) {
  params.warmup = false;
  }
- ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL}));
+ ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL, LLAMA_EXAMPLE_PERPLEXITY}));
  add_opt(common_arg(
  {"--spm-infill"},
  string_format(
@@ -2254,9 +2254,11 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  ).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
  add_opt(common_arg(
  {"-dt", "--defrag-thold"}, "N",
- string_format("KV cache defragmentation threshold (default: %.1f, < 0 - disabled)", (double)params.defrag_thold),
+ string_format("KV cache defragmentation threshold (DEPRECATED)"),
  [](common_params & params, const std::string & value) {
- params.defrag_thold = std::stof(value);
+ GGML_UNUSED(params);
+ GGML_UNUSED(value);
+ LOG_WRN("DEPRECATED: --defrag-thold is deprecated and no longer necessary to specify\n");
  }
  ).set_env("LLAMA_ARG_DEFRAG_THOLD"));
  add_opt(common_arg(
package/src/llama.cpp/common/chat.cpp CHANGED
@@ -134,6 +134,7 @@ struct templates_params {
  json extra_context;
  bool add_bos;
  bool add_eos;
+ bool is_inference = true;
  };

  common_chat_tool_choice common_chat_tool_choice_parse_oaicompat(const std::string & tool_choice) {
@@ -770,8 +771,7 @@ static std::string apply(
  if (additional_context) {
  tmpl_inputs.extra_context.merge_patch(*additional_context);
  }
- // TODO: add flag to control date/time, if only for testing purposes.
- // tmpl_inputs.now = std::chrono::system_clock::now();
+ tmpl_inputs.now = inputs.now;

  minja::chat_template_options tmpl_opts;
  // To avoid double BOS / EOS tokens, we're manually removing begining / trailing tokens
@@ -1323,6 +1323,17 @@ static common_chat_params common_chat_params_init_gpt_oss(const common_chat_temp
  common_chat_params data;
  auto prompt = apply(tmpl, inputs);

+ // Check if we need to replace the return token with end token during
+ // inference and without generation prompt. For more details see:
+ // https://github.com/ggml-org/llama.cpp/issues/15417
+ if (inputs.is_inference && !inputs.add_generation_prompt) {
+ static constexpr std::string_view return_token = "<|return|>";
+ static constexpr std::string_view end_token = "<|end|>";
+ if (size_t pos = prompt.rfind(return_token); pos != std::string::npos) {
+ prompt.replace(pos, return_token.length(), end_token);
+ }
+ }
+
  data.prompt = prompt;
  data.format = COMMON_CHAT_FORMAT_GPT_OSS;

@@ -1336,6 +1347,26 @@ static common_chat_params common_chat_params_init_gpt_oss(const common_chat_temp
  "<|end|>",
  };

+ if (!inputs.json_schema.is_null()) {
+ data.grammar_lazy = false;
+ data.grammar = build_grammar([&](const common_grammar_builder & builder) {
+ auto schema = inputs.json_schema;
+ builder.resolve_refs(schema);
+
+ auto not_end = builder.add_rule("not-end",
+ "[^<] | \"<\" [^|] | \"<|\" [^e] | \"<|e\" [^n] | \"<|en\" [^d] | \"<|end\" [^|] | \"<|end|\" [^>]");
+ auto analysis = builder.add_rule("analysis",
+ "\"<|channel|>analysis<|message|>\" ( " + not_end + " )* \"<|end|>\"");
+ auto constraint = builder.add_rule("constraint", "\"<|constrain|>\"? [a-zA-Z0-9_-]+");
+ auto final = builder.add_rule("final",
+ "\"<|channel|>final\" ( \" \" " + constraint + " )? \"<|message|>\" " +
+ builder.add_schema("response", schema)
+ );
+
+ builder.add_rule("root", "( " + analysis + " \"<|start|>assistant\" )? " + final);
+ });
+ }
+
  if (inputs.tools.is_array() && !inputs.tools.empty()) {
  data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED;
  data.grammar = build_grammar([&](const common_grammar_builder & builder) {
@@ -2096,7 +2127,7 @@ static common_chat_params common_chat_templates_apply_jinja(
  }

  // GPT-OSS
- if (src.find("<|channel|>") != std::string::npos && params.json_schema.is_null()) {
+ if (src.find("<|channel|>") != std::string::npos) {
  return common_chat_params_init_gpt_oss(tmpl, params);
  }

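With the two GPT-OSS changes above, a request that carries a JSON schema is no longer diverted away from the GPT-OSS handler; instead common_chat_params_init_gpt_oss builds a grammar that constrains the final channel to that schema. From the Node API this path is reached via response_format in the completion options. A sketch reusing the ctx from the earlier example; the OpenAI-style field names are an assumption and should be checked against CompletionResponseFormat in lib/binding.ts:

  const structured = await ctx.completion({
    messages: [{ role: 'user', content: 'Pick a color.' }],
    response_format: {
      type: 'json_schema', // assumed OpenAI-compatible shape
      json_schema: {
        schema: {
          type: 'object',
          properties: { color: { type: 'string' } },
          required: ['color'],
        },
      },
    },
  })
  console.log(structured.text) // final-channel output constrained to the schema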
package/src/llama.cpp/common/common.cpp CHANGED
@@ -558,13 +558,6 @@ std::string string_from(const struct llama_context * ctx, const std::vector<llam

  auto detokenized = common_token_to_piece(ctx, token);

- detokenized.erase(
- std::remove_if(
- detokenized.begin(),
- detokenized.end(),
- [](const unsigned char c) { return !std::isprint(c); }),
- detokenized.end());
-
  buf << "'" << detokenized << "'"
  << ":" << std::to_string(token);
  }
@@ -589,13 +582,6 @@ std::string string_from(const struct llama_context * ctx, const struct llama_bat

  auto detokenized = common_token_to_piece(ctx, batch.token[i]);

- detokenized.erase(
- std::remove_if(
- detokenized.begin(),
- detokenized.end(),
- [](const unsigned char c) { return !std::isprint(c); }),
- detokenized.end());
-
  buf << "\n" << std::to_string(i)
  << ", token '" << detokenized << "'"
  << ", pos " << std::to_string(batch.pos[i])
@@ -1167,7 +1153,6 @@ struct llama_context_params common_context_params_to_llama(const common_params &
  cparams.yarn_orig_ctx = params.yarn_orig_ctx;
  cparams.pooling_type = params.pooling_type;
  cparams.attention_type = params.attention_type;
- cparams.defrag_thold = params.defrag_thold;
  cparams.cb_eval = params.cb_eval;
  cparams.cb_eval_user_data = params.cb_eval_user_data;
  cparams.offload_kqv = !params.no_kv_offload;
package/src/llama.cpp/common/common.h CHANGED
@@ -289,7 +289,6 @@ struct common_params {
  float yarn_beta_fast = 32.0f; // YaRN low correction dim
  float yarn_beta_slow = 1.0f; // YaRN high correction dim
  int32_t yarn_orig_ctx = 0; // YaRN original context length
- float defrag_thold = 0.1f; // KV cache defragmentation threshold

  // offload params
  std::vector<ggml_backend_dev_t> devices; // devices to use for offloading
@@ -376,7 +375,7 @@ struct common_params {
  bool cont_batching = true; // insert new sequences for decoding on-the-fly
  bool flash_attn = false; // flash attention
  bool no_perf = false; // disable performance metrics
- bool ctx_shift = false; // context shift on inifinite text generation
+ bool ctx_shift = false; // context shift on infinite text generation
  bool swa_full = false; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
  bool kv_unified = false; // enable unified KV cache

package/src/llama.cpp/ggml/CMakeLists.txt CHANGED
@@ -158,7 +158,6 @@ option(GGML_CUDA "ggml: use CUDA"
  option(GGML_MUSA "ggml: use MUSA" OFF)
  option(GGML_CUDA_FORCE_MMQ "ggml: use mmq kernels instead of cuBLAS" OFF)
  option(GGML_CUDA_FORCE_CUBLAS "ggml: always use cuBLAS instead of mmq kernels" OFF)
- option(GGML_CUDA_F16 "ggml: use 16 bit floats for some calculations" OFF)
  set (GGML_CUDA_PEER_MAX_BATCH_SIZE "128" CACHE STRING
  "ggml: max. batch size for using peer access")
  option(GGML_CUDA_NO_PEER_COPY "ggml: do not use peer to peer copies" OFF)
package/src/llama.cpp/ggml/include/ggml.h CHANGED
@@ -244,6 +244,13 @@
  #define GGML_MROPE_SECTIONS 4

  #define GGML_UNUSED(x) (void)(x)
+ #ifdef __CUDACC__
+ template<typename... Args>
+ __host__ __device__ constexpr inline void ggml_unused_vars_impl(Args&&...) noexcept {}
+ #define GGML_UNUSED_VARS(...) ggml_unused_vars_impl(__VA_ARGS__)
+ #else
+ #define GGML_UNUSED_VARS(...) do { (void)sizeof((__VA_ARGS__, 0)); } while(0)
+ #endif // __CUDACC__

  #define GGML_PAD(x, n) (((x) + (n) - 1) & ~((n) - 1))

@@ -505,6 +512,7 @@ extern "C" {
  GGML_OP_IM2COL,
  GGML_OP_IM2COL_BACK,
  GGML_OP_CONV_2D,
+ GGML_OP_CONV_3D,
  GGML_OP_CONV_2D_DW,
  GGML_OP_CONV_TRANSPOSE_2D,
  GGML_OP_POOL_1D,
@@ -1933,6 +1941,23 @@ extern "C" {
  int d0, // dilation dimension 0
  int d1); // dilation dimension 1

+ GGML_API struct ggml_tensor * ggml_conv_3d(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a, // kernel [KW, KH, KD, IC * OC]
+ struct ggml_tensor * b, // input [W, H, D, C * N]
+ int s0, // stride
+ int s1,
+ int s2,
+ int p0, // padding
+ int p1,
+ int p2,
+ int d0, // dilation
+ int d1,
+ int d2,
+ int n_channels,
+ int n_batch,
+ int n_channels_out);
+
  enum ggml_op_pool {
  GGML_OP_POOL_MAX,
  GGML_OP_POOL_AVG,