@fugood/llama.node 1.2.5 → 1.3.0-rc.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,7 +1,7 @@
  {
  "name": "@fugood/llama.node",
  "access": "public",
- "version": "1.2.5",
+ "version": "1.3.0-rc.0",
  "description": "An another Node binding of llama.cpp",
  "main": "lib/index.js",
  "scripts": {
@@ -72,19 +72,19 @@
  "CMakeLists.txt"
  ],
  "optionalDependencies": {
- "@fugood/node-llama-linux-x64": "1.2.5",
- "@fugood/node-llama-linux-x64-vulkan": "1.2.5",
- "@fugood/node-llama-linux-x64-cuda": "1.2.5",
- "@fugood/node-llama-linux-arm64": "1.2.5",
- "@fugood/node-llama-linux-arm64-vulkan": "1.2.5",
- "@fugood/node-llama-linux-arm64-cuda": "1.2.5",
- "@fugood/node-llama-win32-x64": "1.2.5",
- "@fugood/node-llama-win32-x64-vulkan": "1.2.5",
- "@fugood/node-llama-win32-x64-cuda": "1.2.5",
- "@fugood/node-llama-win32-arm64": "1.2.5",
- "@fugood/node-llama-win32-arm64-vulkan": "1.2.5",
- "@fugood/node-llama-darwin-x64": "1.2.5",
- "@fugood/node-llama-darwin-arm64": "1.2.5"
+ "@fugood/node-llama-linux-x64": "1.3.0-rc.0",
+ "@fugood/node-llama-linux-x64-vulkan": "1.3.0-rc.0",
+ "@fugood/node-llama-linux-x64-cuda": "1.3.0-rc.0",
+ "@fugood/node-llama-linux-arm64": "1.3.0-rc.0",
+ "@fugood/node-llama-linux-arm64-vulkan": "1.3.0-rc.0",
+ "@fugood/node-llama-linux-arm64-cuda": "1.3.0-rc.0",
+ "@fugood/node-llama-win32-x64": "1.3.0-rc.0",
+ "@fugood/node-llama-win32-x64-vulkan": "1.3.0-rc.0",
+ "@fugood/node-llama-win32-x64-cuda": "1.3.0-rc.0",
+ "@fugood/node-llama-win32-arm64": "1.3.0-rc.0",
+ "@fugood/node-llama-win32-arm64-vulkan": "1.3.0-rc.0",
+ "@fugood/node-llama-darwin-x64": "1.3.0-rc.0",
+ "@fugood/node-llama-darwin-arm64": "1.3.0-rc.0"
  },
  "devDependencies": {
  "@babel/preset-env": "^7.24.4",
@@ -168,6 +168,25 @@ void LlamaContext::Init(Napi::Env env, Napi::Object &exports) {
  static_cast<napi_property_attributes>(napi_enumerable)),
  InstanceMethod<&LlamaContext::DecodeAudioTokens>(
  "decodeAudioTokens",
+ static_cast<napi_property_attributes>(napi_enumerable)),
+ // Parallel decoding methods
+ InstanceMethod<&LlamaContext::EnableParallelMode>(
+ "enableParallelMode",
+ static_cast<napi_property_attributes>(napi_enumerable)),
+ InstanceMethod<&LlamaContext::DisableParallelMode>(
+ "disableParallelMode",
+ static_cast<napi_property_attributes>(napi_enumerable)),
+ InstanceMethod<&LlamaContext::QueueCompletion>(
+ "queueCompletion",
+ static_cast<napi_property_attributes>(napi_enumerable)),
+ InstanceMethod<&LlamaContext::QueueEmbedding>(
+ "queueEmbedding",
+ static_cast<napi_property_attributes>(napi_enumerable)),
+ InstanceMethod<&LlamaContext::QueueRerank>(
+ "queueRerank",
+ static_cast<napi_property_attributes>(napi_enumerable)),
+ InstanceMethod<&LlamaContext::CancelRequest>(
+ "cancelRequest",
  static_cast<napi_property_attributes>(napi_enumerable))});
  Napi::FunctionReference *constructor = new Napi::FunctionReference();
  *constructor = Napi::Persistent(func);
@@ -217,6 +236,7 @@ LlamaContext::LlamaContext(const Napi::CallbackInfo &info)
  params.n_ctx = get_option<int32_t>(options, "n_ctx", 512);
  params.n_batch = get_option<int32_t>(options, "n_batch", 2048);
  params.n_ubatch = get_option<int32_t>(options, "n_ubatch", 512);
+ params.n_parallel = get_option<int32_t>(options, "n_parallel", 1); // Default to 1 for compatibility
  params.embedding = get_option<bool>(options, "embedding", false);
  if (params.embedding) {
  // For non-causal models, batch size must be equal to ubatch size
@@ -288,6 +308,9 @@ LlamaContext::LlamaContext(const Napi::CallbackInfo &info)
  }
  }
  }
+ // Initialize validity flag for async callback safety
+ _context_valid = std::make_shared<std::atomic<bool>>(true);
+
  // Use rn-llama context instead of direct session
  _rn_ctx = new llama_rn_context();
  if (!_rn_ctx->loadModel(params)) {
@@ -305,6 +328,11 @@ LlamaContext::LlamaContext(const Napi::CallbackInfo &info)
  }

  LlamaContext::~LlamaContext() {
+ // Invalidate the context to prevent use-after-free in async callbacks
+ if (_context_valid) {
+ _context_valid->store(false);
+ }
+
  // The DisposeWorker is responsible for cleanup of _rn_ctx
  // If _rn_ctx is still not null here, it means disposal was not properly initiated
  if (_rn_ctx) {
@@ -579,7 +607,7 @@ Napi::Value LlamaContext::GetFormattedChat(const Napi::CallbackInfo &info) {
  // grammar: string
  result.Set("grammar", chatParams.grammar);
  // grammar_lazy: boolean
- result.Set("grammea_lazy", chatParams.grammar_lazy);
+ result.Set("grammar_lazy", chatParams.grammar_lazy);
  // grammar_triggers: [{ value: string, token: number }]
  Napi::Array grammar_triggers = Napi::Array::New(env);
  for (size_t i = 0; i < chatParams.grammar_triggers.size(); i++) {
@@ -1135,6 +1163,11 @@ Napi::Value LlamaContext::Release(const Napi::CallbackInfo &info) {
  _wip->SetStop();
  }

+ // stop_processing_loop
+ if (_rn_ctx && _rn_ctx->slot_manager) {
+ _rn_ctx->slot_manager->stop_processing_loop();
+ }
+
  if (_rn_ctx == nullptr) {
  auto promise = Napi::Promise::Deferred(env);
  promise.Resolve(env.Undefined());
@@ -4,6 +4,10 @@
  #include "rn-llama/rn-llama.h"
  #include "rn-llama/rn-completion.h"
  #include "rn-llama/rn-tts.h"
+ #include "rn-llama/rn-slot.h"
+ #include "rn-llama/rn-slot-manager.h"
+ #include <atomic>
+ #include <memory>

  using namespace rnllama;

@@ -55,10 +59,22 @@ private:
  Napi::Value GetAudioCompletionGuideTokens(const Napi::CallbackInfo &info);
  Napi::Value DecodeAudioTokens(const Napi::CallbackInfo &info);

+ // Parallel decoding methods
+ Napi::Value EnableParallelMode(const Napi::CallbackInfo &info);
+ void DisableParallelMode(const Napi::CallbackInfo &info);
+ Napi::Value QueueCompletion(const Napi::CallbackInfo &info);
+ Napi::Value QueueEmbedding(const Napi::CallbackInfo &info);
+ Napi::Value QueueRerank(const Napi::CallbackInfo &info);
+ void CancelRequest(const Napi::CallbackInfo &info);
+
  std::string _info;
  Napi::Object _meta;
  LlamaCompletionWorker *_wip = nullptr;

  // Use rn-llama context instead of direct llama.cpp types
  llama_rn_context *_rn_ctx = nullptr;
+
+ // Validity flag for async callbacks to prevent use-after-free
+ // Shared pointer ensures callbacks can safely check if context is still alive
+ std::shared_ptr<std::atomic<bool>> _context_valid;
  };
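
Note on the `_context_valid` flag introduced above: async workers can capture the shared atomic by value and check it before touching context state in their completion callbacks. The following is a minimal, self-contained C++ sketch of that lifetime pattern; the class and member names are illustrative, not the addon's actual worker code, and it assumes the flag check and the destructor run on the same (JS) thread so the load cannot race with destruction.

#include <atomic>
#include <chrono>
#include <cstdio>
#include <functional>
#include <memory>
#include <thread>

// Hypothetical stand-in for LlamaContext; only the lifetime pattern is shown.
class Context {
public:
    Context() : valid_(std::make_shared<std::atomic<bool>>(true)) {}
    ~Context() { valid_->store(false); } // mirrors ~LlamaContext invalidating the flag

    void run_async(std::function<void()> on_done) {
        auto valid = valid_; // capture the shared flag by value, never a bare `this`
        std::thread([valid, on_done]() {
            std::this_thread::sleep_for(std::chrono::milliseconds(10)); // pretend work
            if (valid->load()) {
                on_done(); // only deliver the result while the context is alive
            }
        }).detach();
    }

private:
    std::shared_ptr<std::atomic<bool>> valid_;
};

int main() {
    {
        Context ctx;
        ctx.run_async([] { std::puts("completed while context alive"); });
        std::this_thread::sleep_for(std::chrono::milliseconds(50));
    } // destructor flips the flag; late callbacks become no-ops
    std::this_thread::sleep_for(std::chrono::milliseconds(50));
    return 0;
}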
package/src/common.hpp CHANGED
@@ -16,11 +16,12 @@ static bool is_nil(const Napi::Value &value) {
  return value.IsNull() || value.IsUndefined();
  }

- static std::string json_stringify(const Napi::Object &obj) {
- Napi::Env env = obj.Env();
+ // Overload for Napi::Value to handle both arrays and objects
+ static std::string json_stringify(const Napi::Value &value) {
+ Napi::Env env = value.Env();
  Napi::Object json = env.Global().Get("JSON").As<Napi::Object>();
  Napi::Function stringify = json.Get("stringify").As<Napi::Function>();
- return stringify.Call(json, {obj}).As<Napi::String>().ToString();
+ return stringify.Call(json, {value}).As<Napi::String>().ToString();
  }

  static void console_log(Napi::Env env, const std::string &message) {
@@ -1760,7 +1760,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP}));
  add_opt(common_arg(
  {"-t", "--threads"}, "N",
- string_format("number of threads to use during generation (default: %d)", params.cpuparams.n_threads),
+ string_format("number of CPU threads to use during generation (default: %d)", params.cpuparams.n_threads),
  [](common_params & params, int value) {
  params.cpuparams.n_threads = value;
  if (params.cpuparams.n_threads <= 0) {
@@ -577,6 +577,10 @@ extern "C" {
  GGML_UNARY_OP_EXP,
  GGML_UNARY_OP_GELU_ERF,
  GGML_UNARY_OP_XIELU,
+ GGML_UNARY_OP_FLOOR,
+ GGML_UNARY_OP_CEIL,
+ GGML_UNARY_OP_ROUND,
+ GGML_UNARY_OP_TRUNC,

  GGML_UNARY_OP_COUNT,
  };
@@ -1151,6 +1155,46 @@ extern "C" {
  struct ggml_context * ctx,
  struct ggml_tensor * a);

+ GGML_API struct ggml_tensor * ggml_floor(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a);
+
+ GGML_API struct ggml_tensor * ggml_floor_inplace(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a);
+
+ GGML_API struct ggml_tensor * ggml_ceil(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a);
+
+ GGML_API struct ggml_tensor * ggml_ceil_inplace(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a);
+
+ GGML_API struct ggml_tensor * ggml_round(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a);
+
+ GGML_API struct ggml_tensor * ggml_round_inplace(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a);
+
+ /**
+ * Truncates the fractional part of each element in the tensor (towards zero).
+ * For example: trunc(3.7) = 3.0, trunc(-2.9) = -2.0
+ * Similar to std::trunc in C/C++.
+ */
+
+ GGML_API struct ggml_tensor * ggml_trunc(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a);
+
+ GGML_API struct ggml_tensor * ggml_trunc_inplace(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a);
+
+
+
  // xIELU activation function
  // x = x * (c_a(alpha_n) + c_b(alpha_p, beta) * sigmoid(beta * x)) + eps * (x > 0)
  // where c_a = softplus and c_b(a, b) = softplus(a) + b are constraining functions
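
Note on the new rounding operators: they are plain element-wise unary ops, declared above alongside their `_inplace` variants. A minimal C++ sketch of how they could be exercised with the CPU backend follows; it assumes the usual ggml one-shot context helpers (`ggml_graph_compute_with_ctx`, `ggml_get_f32_1d`) are available via `ggml.h`/`ggml-cpu.h`, and the sizes and values are arbitrary.

#include "ggml.h"
#include "ggml-cpu.h"

#include <cstdio>
#include <cstring>

int main() {
    // one-shot context; the size is arbitrary for this toy graph
    struct ggml_init_params ip = { /*mem_size*/ 16*1024*1024, /*mem_buffer*/ NULL, /*no_alloc*/ false };
    struct ggml_context * ctx = ggml_init(ip);

    struct ggml_tensor * x = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4);
    const float vals[4] = { 3.7f, -2.9f, 0.5f, -0.5f };
    memcpy(x->data, vals, sizeof(vals));

    struct ggml_tensor * f = ggml_floor(ctx, x); // expected: 3, -3, 0, -1
    struct ggml_tensor * t = ggml_trunc(ctx, x); // expected: 3, -2, 0,  0

    struct ggml_cgraph * gf = ggml_new_graph(ctx);
    ggml_build_forward_expand(gf, f);
    ggml_build_forward_expand(gf, t);
    ggml_graph_compute_with_ctx(ctx, gf, /*n_threads*/ 1);

    for (int i = 0; i < 4; ++i) {
        printf("x=%5.2f floor=%5.2f trunc=%5.2f\n",
               vals[i], ggml_get_f32_1d(f, i), ggml_get_f32_1d(t, i));
    }

    ggml_free(ctx);
    return 0;
}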
@@ -68,7 +68,7 @@ struct ggml_compute_params {
  #endif // __VXE2__
  #endif // __s390x__ && __VEC__

- #if defined(__ARM_FEATURE_SVE)
+ #if defined(__ARM_FEATURE_SVE) && defined(__linux__)
  #include <sys/prctl.h>
  #endif

@@ -689,8 +689,13 @@ bool ggml_is_numa(void) {
  #endif

  static void ggml_init_arm_arch_features(void) {
- #if defined(__linux__) && defined(__aarch64__) && defined(__ARM_FEATURE_SVE)
+ #if defined(__aarch64__) && defined(__ARM_FEATURE_SVE)
+ #if defined(__linux__)
  ggml_arm_arch_features.sve_cnt = PR_SVE_VL_LEN_MASK & prctl(PR_SVE_GET_VL);
+ #else
+ // TODO: add support of SVE for non-linux systems
+ #error "TODO: SVE is not supported on this platform. To use SVE, sve_cnt needs to be initialized here."
+ #endif
  #endif
  }

@@ -2179,6 +2184,10 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
  case GGML_UNARY_OP_HARDSWISH:
  case GGML_UNARY_OP_HARDSIGMOID:
  case GGML_UNARY_OP_EXP:
+ case GGML_UNARY_OP_FLOOR:
+ case GGML_UNARY_OP_CEIL:
+ case GGML_UNARY_OP_ROUND:
+ case GGML_UNARY_OP_TRUNC:
  {
  n_tasks = 1;
  } break;
@@ -3558,13 +3567,17 @@ void ggml_cpu_init(void) {
  #ifdef GGML_USE_OPENMP
  //if (!getenv("OMP_WAIT_POLICY")) {
  // // set the wait policy to active, so that OpenMP threads don't sleep
- // putenv("OMP_WAIT_POLICY=active");
+ // setenv("OMP_WAIT_POLICY", "active", 0)
  //}

  if (!getenv("KMP_BLOCKTIME")) {
  // set the time to wait before sleeping a thread
  // this is less aggressive than setting the wait policy to active, but should achieve similar results in most cases
- putenv("KMP_BLOCKTIME=200"); // 200ms
+ #ifdef _WIN32
+ _putenv_s("KMP_BLOCKTIME", "200"); // 200ms
+ #else
+ setenv("KMP_BLOCKTIME", "200", 0); // 200ms
+ #endif
  }
  #endif
  }
@@ -8993,6 +8993,22 @@ void ggml_compute_forward_unary(
  {
  ggml_compute_forward_exp(params, dst);
  } break;
+ case GGML_UNARY_OP_FLOOR:
+ {
+ ggml_compute_forward_floor(params, dst);
+ } break;
+ case GGML_UNARY_OP_CEIL:
+ {
+ ggml_compute_forward_ceil(params, dst);
+ } break;
+ case GGML_UNARY_OP_ROUND:
+ {
+ ggml_compute_forward_round(params, dst);
+ } break;
+ case GGML_UNARY_OP_TRUNC:
+ {
+ ggml_compute_forward_trunc(params, dst);
+ } break;
  case GGML_UNARY_OP_XIELU:
  {
  ggml_compute_forward_xielu(params, dst);
@@ -73,6 +73,22 @@ static inline float op_log(float x) {
  return logf(x);
  }

+ static inline float op_floor(float x) {
+ return floorf(x);
+ }
+
+ static inline float op_ceil(float x) {
+ return ceilf(x);
+ }
+
+ static inline float op_round(float x) {
+ return roundf(x);
+ }
+
+ static inline float op_trunc(float x) {
+ return truncf(x);
+ }
+
  template <float (*op)(float), typename src0_t, typename dst_t>
  static inline void vec_unary_op(int64_t n, dst_t * y, const src0_t * x) {
  constexpr auto src0_to_f32 = type_conversion_table<src0_t>::to_f32;
@@ -274,6 +290,22 @@ void ggml_compute_forward_log(const ggml_compute_params * params, ggml_tensor *
  unary_op<op_log>(params, dst);
  }

+ void ggml_compute_forward_floor(const ggml_compute_params * params, ggml_tensor * dst) {
+ unary_op<op_floor>(params, dst);
+ }
+
+ void ggml_compute_forward_ceil(const ggml_compute_params * params, ggml_tensor * dst) {
+ unary_op<op_ceil>(params, dst);
+ }
+
+ void ggml_compute_forward_round(const ggml_compute_params * params, ggml_tensor * dst) {
+ unary_op<op_round>(params, dst);
+ }
+
+ void ggml_compute_forward_trunc(const ggml_compute_params * params, ggml_tensor * dst) {
+ unary_op<op_trunc>(params, dst);
+ }
+
  void ggml_compute_forward_xielu(const ggml_compute_params * params, ggml_tensor * dst) {
  const float alpha_n = ggml_get_op_params_f32(dst, 1);
  const float alpha_p = ggml_get_op_params_f32(dst, 2);
@@ -22,6 +22,10 @@ void ggml_compute_forward_sqrt(const struct ggml_compute_params * params, struct
  void ggml_compute_forward_sin(const struct ggml_compute_params * params, struct ggml_tensor * dst);
  void ggml_compute_forward_cos(const struct ggml_compute_params * params, struct ggml_tensor * dst);
  void ggml_compute_forward_log(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+ void ggml_compute_forward_floor(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+ void ggml_compute_forward_ceil(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+ void ggml_compute_forward_round(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+ void ggml_compute_forward_trunc(const struct ggml_compute_params * params, struct ggml_tensor * dst);
  void ggml_compute_forward_xielu(const struct ggml_compute_params * params, struct ggml_tensor * dst);

  #ifdef __cplusplus
@@ -463,9 +463,9 @@ ggml_float ggml_vec_cvar_f32(const int n, float * y, const float * x, const floa
  #endif
  for (; i < n; ++i) {
  float val = x[i] - mean;
+ y[i] = val;
  val *= val;
  sum += (ggml_float)val;
- y[i] = val;
  }
  return sum/n;
  }
@@ -5,6 +5,7 @@
  #include <map>

  static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
+ { LLM_ARCH_CLIP, "clip" }, // dummy, only used by llama-quantize
  { LLM_ARCH_LLAMA, "llama" },
  { LLM_ARCH_LLAMA4, "llama4" },
  { LLM_ARCH_DECI, "deci" },
@@ -275,6 +276,10 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
  };

  static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_NAMES = {
+ {
+ LLM_ARCH_CLIP,
+ {},
+ },
  {
  LLM_ARCH_LLAMA,
  {
@@ -9,6 +9,7 @@
  //

  enum llm_arch {
+ LLM_ARCH_CLIP,
  LLM_ARCH_LLAMA,
  LLM_ARCH_LLAMA4,
  LLM_ARCH_DECI,
@@ -261,12 +261,17 @@ void llm_graph_input_cross_embd::set_input(const llama_ubatch * ubatch) {
  }
  }

- static void print_mask(float * data, int64_t n_tokens, int64_t n_kv, int64_t n_swa, llama_swa_type swa_type) {
+ static void print_mask(const float * data, int64_t n_tokens, int64_t n_kv, int64_t n_swa, llama_swa_type swa_type) {
  LLAMA_LOG_DEBUG("%s: === Attention mask ===\n", __func__);
- const char * swa_type_str = (swa_type == LLAMA_SWA_TYPE_NONE) ? "LLAMA_SWA_TYPE_NONE" :
- (swa_type == LLAMA_SWA_TYPE_STANDARD) ? "LLAMA_SWA_TYPE_STANDARD" :
- (swa_type == LLAMA_SWA_TYPE_CHUNKED) ? "LLAMA_SWA_TYPE_CHUNKED" :
- (swa_type == LLAMA_SWA_TYPE_SYMMETRIC) ? "LLAMA_SWA_TYPE_SYMMETRIC" : "unknown";
+ const char * swa_type_str = "unknown";
+
+ switch (swa_type) {
+ case LLAMA_SWA_TYPE_NONE: swa_type_str = "LLAMA_SWA_TYPE_NONE"; break;
+ case LLAMA_SWA_TYPE_STANDARD: swa_type_str = "LLAMA_SWA_TYPE_STANDARD"; break;
+ case LLAMA_SWA_TYPE_CHUNKED: swa_type_str = "LLAMA_SWA_TYPE_CHUNKED"; break;
+ case LLAMA_SWA_TYPE_SYMMETRIC: swa_type_str = "LLAMA_SWA_TYPE_SYMMETRIC"; break;
+ };
+
  LLAMA_LOG_DEBUG("%s: n_swa : %d, n_kv: %d, swq_type: %s\n", __func__, (int)n_swa, (int)n_kv, swa_type_str);
  LLAMA_LOG_DEBUG("%s: '0' = can attend, '∞' = masked\n", __func__);
  LLAMA_LOG_DEBUG("%s: Rows = query tokens, Columns = key/value tokens\n\n", __func__);
@@ -295,50 +300,67 @@ void llm_graph_input_attn_no_cache::set_input(const llama_ubatch * ubatch) {
  const int64_t n_kv = ubatch->n_tokens;
  const int64_t n_tokens = ubatch->n_tokens;

- GGML_ASSERT(kq_mask);
- GGML_ASSERT(ggml_backend_buffer_is_host(kq_mask->buffer));
-
- float * data = (float *) kq_mask->data;
-
- // [TAG_NO_CACHE_ISWA]
- GGML_ASSERT(hparams.swa_type == LLAMA_SWA_TYPE_NONE && "TODO: implement");
+ const auto fill_mask = [&](float * data, int n_swa, llama_swa_type swa_type) {
+ for (int h = 0; h < 1; ++h) {
+ for (int i1 = 0; i1 < n_tokens; ++i1) {
+ const llama_seq_id s1 = ubatch->seq_id[i1][0];
+ const llama_pos p1 = ubatch->pos[i1];

- for (int h = 0; h < 1; ++h) {
- for (int i1 = 0; i1 < n_tokens; ++i1) {
- const llama_seq_id s1 = ubatch->seq_id[i1][0];
+ const uint64_t idst = h*(n_kv*n_tokens) + i1*n_kv;

- for (int i0 = 0; i0 < n_tokens; ++i0) {
- float f = -INFINITY;
-
- for (int s = 0; s < ubatch->n_seq_id[i0]; ++s) {
+ for (int i0 = 0; i0 < n_tokens; ++i0) {
  const llama_seq_id s0 = ubatch->seq_id[i0][0];
+ const llama_pos p0 = ubatch->pos[i0];

+ // mask different sequences
  if (s0 != s1) {
- continue; // skip different sequences
+ continue;
  }

- if (cparams.causal_attn && ubatch->pos[i0] > ubatch->pos[i1]) {
- continue; // skip future tokens for causal attention
+ // mask future tokens
+ if (cparams.causal_attn && p0 > p1) {
+ continue;
  }

- // TODO: this does not take into account that some layers are SWA and others are note (i.e. iSWA) [TAG_NO_CACHE_ISWA]
- //if (hparams.is_masked_swa(ubatch->pos[i0], ubatch->pos[i1])) {
- // continue; // skip masked tokens for SWA
- //}
-
- // TODO: reimplement this like in llama_kv_cache_unified
- if (hparams.use_alibi) {
- f = -std::abs(ubatch->pos[i0] - ubatch->pos[i1]);
- } else {
- f = 0.0f;
+ // apply SWA if any
+ if (llama_hparams::is_masked_swa(n_swa, swa_type, p0, p1)) {
+ continue;
  }
+
+ data[idst + i0] = hparams.use_alibi ? -std::abs(p0 - p1) : 0.0f;
  }
- data[h*(n_kv*n_tokens) + i1*n_kv + i0] = f;
  }
  }
+ };
+
+ {
+ GGML_ASSERT(self_kq_mask);
+ GGML_ASSERT(ggml_backend_buffer_is_host(self_kq_mask->buffer));
+
+ float * data = (float *) self_kq_mask->data;
+
+ std::fill(data, data + ggml_nelements(self_kq_mask), -INFINITY);
+
+ fill_mask(data, 0, LLAMA_SWA_TYPE_NONE);
+
+ if (debug) {
+ print_mask(data, n_tokens, n_kv, 0, LLAMA_SWA_TYPE_NONE);
+ }
  }
- if (debug) {
- print_mask(data, n_tokens, n_kv, hparams.n_swa, hparams.swa_type);
+
+ if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
+ GGML_ASSERT(self_kq_mask_swa);
+ GGML_ASSERT(ggml_backend_buffer_is_host(self_kq_mask_swa->buffer));
+
+ float * data = (float *) self_kq_mask_swa->data;
+
+ std::fill(data, data + ggml_nelements(self_kq_mask_swa), -INFINITY);
+
+ fill_mask(data, hparams.n_swa, hparams.swa_type);
+
+ if (debug) {
+ print_mask(data, n_tokens, n_kv, hparams.n_swa, hparams.swa_type);
+ }
  }
  }

@@ -1299,12 +1321,9 @@ ggml_tensor * llm_graph_context::build_attn_mha(
  k = ggml_permute(ctx0, k, 0, 2, 1, 3);
  v = ggml_permute(ctx0, v, 0, 2, 1, 3);

- const auto n_kv = k->ne[1];
-
  ggml_tensor * cur;

- // TODO: replace hardcoded padding with ggml-provided padding
- if (cparams.flash_attn && (n_kv % 256 == 0) && kq_b == nullptr) {
+ if (cparams.flash_attn && kq_b == nullptr) {
  GGML_ASSERT(kq_b == nullptr && "Flash attention does not support KQ bias yet");

  if (v_trans) {
@@ -1419,10 +1438,20 @@ llm_graph_input_attn_no_cache * llm_graph_context::build_attn_inp_no_cache() con
  auto inp = std::make_unique<llm_graph_input_attn_no_cache>(hparams, cparams);

  // note: there is no KV cache, so the number of KV values is equal to the number of tokens in the batch
- inp->kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_tokens, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD), 1, 1);
- ggml_set_input(inp->kq_mask);
+ inp->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_tokens, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD), 1, 1);
+ ggml_set_input(inp->self_kq_mask);
+
+ inp->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask;

- inp->kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->kq_mask, GGML_TYPE_F16) : inp->kq_mask;
+ if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
+ inp->self_kq_mask_swa = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_tokens, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD), 1, 1);
+ ggml_set_input(inp->self_kq_mask_swa);
+
+ inp->self_kq_mask_swa_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask_swa, GGML_TYPE_F16) : inp->self_kq_mask_swa;
+ } else {
+ inp->self_kq_mask_swa = nullptr;
+ inp->self_kq_mask_swa_cnv = nullptr;
+ }

  return (llm_graph_input_attn_no_cache *) res->add_input(std::move(inp));
  }
@@ -1447,7 +1476,9 @@ ggml_tensor * llm_graph_context::build_attn(
  ggml_build_forward_expand(gf, k_cur);
  ggml_build_forward_expand(gf, v_cur);

- const auto & kq_mask = inp->get_kq_mask();
+ const bool is_swa = hparams.is_swa(il);
+
+ const auto & kq_mask = is_swa ? inp->get_kq_mask_swa() : inp->get_kq_mask();

  // [TAG_NO_CACHE_PAD]
  // TODO: if ubatch.equal_seqs() == true, we can split the three tensors below into ubatch.n_seqs_unq streams
@@ -257,10 +257,14 @@ public:

  void set_input(const llama_ubatch * ubatch) override;

- ggml_tensor * get_kq_mask() const { return kq_mask_cnv; }
+ ggml_tensor * get_kq_mask() const { return self_kq_mask_cnv; }
+ ggml_tensor * get_kq_mask_swa() const { return self_kq_mask_swa_cnv; }

- ggml_tensor * kq_mask = nullptr; // F32 [n_tokens, n_batch, 1, 1]
- ggml_tensor * kq_mask_cnv = nullptr; // [n_tokens, n_batch, 1, 1]
+ // n_tokens == n_batch
+ ggml_tensor * self_kq_mask = nullptr; // F32 [n_tokens, n_batch/n_stream, 1, n_stream]
+ ggml_tensor * self_kq_mask_cnv = nullptr; // [n_tokens, n_batch/n_stream, 1, n_stream]
+ ggml_tensor * self_kq_mask_swa = nullptr; // F32 [n_tokens, n_batch/n_stream, 1, n_stream]
+ ggml_tensor * self_kq_mask_swa_cnv = nullptr; // [n_tokens, n_batch/n_stream, 1, n_stream]

  const llama_hparams hparams;
  const llama_cparams cparams;
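
Note on the SWA mask plumbing above: `self_kq_mask` and `self_kq_mask_swa` are filled by the shared `fill_mask` lambda earlier in this diff (different sequences, future tokens under causal attention, and out-of-window positions stay at -INFINITY). A standalone C++ sketch of that rule for a single sequence follows; the window condition `p1 - p0 >= n_swa` is an assumption standing in for `llama_hparams::is_masked_swa`, which the real code calls.

#include <cmath>
#include <cstdio>

// Assumption: a "standard" sliding window of size n_swa masks keys that fall
// outside the window, i.e. p1 - p0 >= n_swa (the real check lives in
// llama_hparams::is_masked_swa). Cross-sequence masking is omitted here.
static float mask_value(int p0, int p1, bool causal, int n_swa, bool use_alibi) {
    if (causal && p0 > p1)             return -INFINITY; // future token
    if (n_swa > 0 && p1 - p0 >= n_swa) return -INFINITY; // outside the window
    return use_alibi ? -std::abs(float(p0 - p1)) : 0.0f;
}

int main() {
    const int n = 6, n_swa = 3;
    for (int p1 = 0; p1 < n; ++p1) {       // rows    = query positions
        for (int p0 = 0; p0 < n; ++p0) {   // columns = key positions
            const float f = mask_value(p0, p1, /*causal*/ true, n_swa, /*use_alibi*/ false);
            std::printf("%4s", f == -INFINITY ? "inf" : "0");
        }
        std::printf("\n");
    }
    return 0;
}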