llama_cpp 0.15.1 → 0.15.2
- checksums.yaml +4 -4
- data/CHANGELOG.md +6 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/vendor/tmp/llama.cpp/Makefile +3 -3
- data/vendor/tmp/llama.cpp/ggml-backend.c +2 -3
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +15 -7
- data/vendor/tmp/llama.cpp/ggml-impl.h +7 -0
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +9 -3
- data/vendor/tmp/llama.cpp/ggml-metal.m +114 -125
- data/vendor/tmp/llama.cpp/ggml-metal.metal +86 -109
- data/vendor/tmp/llama.cpp/ggml-quants.c +2202 -28
- data/vendor/tmp/llama.cpp/ggml-rpc.cpp +1032 -0
- data/vendor/tmp/llama.cpp/ggml-rpc.h +24 -0
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +24 -143
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +4 -2
- data/vendor/tmp/llama.cpp/ggml.c +726 -646
- data/vendor/tmp/llama.cpp/ggml.h +28 -17
- data/vendor/tmp/llama.cpp/llama.cpp +478 -281
- data/vendor/tmp/llama.cpp/llama.h +3 -0
- data/vendor/tmp/llama.cpp/unicode-data.cpp +6969 -2169
- data/vendor/tmp/llama.cpp/unicode-data.h +15 -12
- data/vendor/tmp/llama.cpp/unicode.cpp +89 -111
- data/vendor/tmp/llama.cpp/unicode.h +44 -12
- metadata +4 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 30dd4c29b86098faf7c78de5fa8e57021b631bb5eb3d14c93f63f1d186383ab8
+  data.tar.gz: b011d891f1cd725f84821428a8db24004b52c9614e785f493f721f7abde71029
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 6c1628f93762747688f802db8593946e8581c869f63c610669b45759f644b3d19b061825b788e328b6b984977112837586ed398b6118a8f8e5f0c7f6fd0eb2dd
+  data.tar.gz: 2f8c3d9f1e6c0f6db7e0682995c8d34179d5405d32784bf00f04a3408cb5bf4c95557bfa1692026f8d3dc9e672d6b15dec5d33cbd76ddc1d94e5ec964a9d0409
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,9 @@
+## [[0.15.2](https://github.com/yoshoku/llama_cpp.rb/compare/v0.15.1...v0.15.2)] - 2024-05-18
+
+- Bump llama.cpp from b2839 to b2917.
+
+Implementation binding for rpc_servers in llama_model_params has been skipped.
+
 ## [[0.15.1](https://github.com/yoshoku/llama_cpp.rb/compare/v0.15.0...v0.15.1)] - 2024-05-11
 
 - Bump llama.cpp from b2781 to b2839.
data/lib/llama_cpp/version.rb
CHANGED
@@ -3,8 +3,8 @@
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
   # The version of llama_cpp.rb you install.
-  VERSION = '0.15.1'
+  VERSION = '0.15.2'
 
   # The version of llama.cpp bundled with llama_cpp.rb.
-  LLAMA_CPP_VERSION = 'b2839'
+  LLAMA_CPP_VERSION = 'b2917'
 end
data/vendor/tmp/llama.cpp/Makefile
CHANGED
@@ -562,10 +562,10 @@ endif # LLAMA_VULKAN
 ifdef LLAMA_HIPBLAS
 	ifeq ($(wildcard /opt/rocm),)
 		ROCM_PATH ?= /usr
-
+		AMDGPU_TARGETS ?= $(shell $(shell which amdgpu-arch))
 	else
 		ROCM_PATH ?= /opt/rocm
-
+		AMDGPU_TARGETS ?= $(shell $(ROCM_PATH)/llvm/bin/amdgpu-arch)
 	endif
 	HIPCC ?= $(CCACHE) $(ROCM_PATH)/bin/hipcc
 	LLAMA_CUDA_DMMV_X ?= 32
@@ -577,7 +577,7 @@ ifdef LLAMA_HIP_UMA
 endif # LLAMA_HIP_UMA
 	MK_LDFLAGS += -L$(ROCM_PATH)/lib -Wl,-rpath=$(ROCM_PATH)/lib
 	MK_LDFLAGS += -lhipblas -lamdhip64 -lrocblas
-	HIPFLAGS += $(addprefix --offload-arch=,$(
+	HIPFLAGS += $(addprefix --offload-arch=,$(AMDGPU_TARGETS))
 	HIPFLAGS += -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X)
 	HIPFLAGS += -DGGML_CUDA_MMV_Y=$(LLAMA_CUDA_MMV_Y)
 	HIPFLAGS += -DK_QUANTS_PER_ITERATION=$(LLAMA_CUDA_KQUANTS_ITER)
data/vendor/tmp/llama.cpp/ggml-backend.c
CHANGED
@@ -1182,9 +1182,9 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
 static char * fmt_size(size_t size) {
     static char buffer[128];
     if (size >= 1024*1024) {
-
+        snprintf(buffer, sizeof(buffer), "%zuM", size/1024/1024);
     } else {
-
+        snprintf(buffer, sizeof(buffer), "%zuK", size/1024);
     }
     return buffer;
 }
@@ -1895,7 +1895,6 @@ void ggml_backend_view_init(ggml_backend_buffer_t buffer, struct ggml_tensor * t
 
     tensor->buffer = buffer;
     tensor->data = (char *)tensor->view_src->data + tensor->view_offs;
-    tensor->backend = tensor->view_src->backend;
     ggml_backend_buffer_init_tensor(buffer, tensor);
 }
 
data/vendor/tmp/llama.cpp/ggml-cuda.cu
CHANGED
@@ -4,7 +4,6 @@
 
 #include "ggml-cuda/common.cuh"
 #include "ggml-cuda/acc.cuh"
-#include "ggml-cuda/alibi.cuh"
 #include "ggml-cuda/arange.cuh"
 #include "ggml-cuda/argsort.cuh"
 #include "ggml-cuda/binbcast.cuh"
@@ -2205,6 +2204,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
         case GGML_UNARY_OP_RELU:
             ggml_cuda_op_relu(ctx, dst);
             break;
+        case GGML_UNARY_OP_SIGMOID:
+            ggml_cuda_op_sigmoid(ctx, dst);
+            break;
         case GGML_UNARY_OP_HARDSIGMOID:
             ggml_cuda_op_hardsigmoid(ctx, dst);
             break;
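The hunk above wires the new GGML_UNARY_OP_SIGMOID case into the CUDA dispatch. As a rough illustration of what the operation computes, here is a plain-C sketch of an element-wise sigmoid pass; it is not the actual ggml_cuda_op_sigmoid kernel, which runs on the GPU over ggml tensors:

```c
#include <math.h>
#include <stddef.h>

// Plain-C sketch: y[i] = 1 / (1 + exp(-x[i])) over n elements.
// Illustration only; the real CUDA kernel operates on ggml tensor data.
static void sigmoid_f32(const float * x, float * y, size_t n) {
    for (size_t i = 0; i < n; ++i) {
        y[i] = 1.0f / (1.0f + expf(-x[i]));
    }
}
```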
@@ -2277,9 +2279,6 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
         case GGML_OP_ROPE:
             ggml_cuda_op_rope(ctx, dst);
             break;
-        case GGML_OP_ALIBI:
-            ggml_cuda_op_alibi(ctx, dst);
-            break;
         case GGML_OP_IM2COL:
             ggml_cuda_op_im2col(ctx, dst);
             break;
@@ -2559,7 +2558,7 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
     }
 
     // Disable CUDA graphs (from the next token) if the use-case is demanding too many consecutive graph updates.
-    if (cuda_graph_update_required) {
+    if (use_cuda_graph && cuda_graph_update_required) {
         cuda_ctx->cuda_graph->number_consecutive_updates++;
     } else {
         cuda_ctx->cuda_graph->number_consecutive_updates = 0;
@@ -2714,12 +2713,14 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
 }
 
 GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, const ggml_tensor * op) {
+    ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *) backend->context;
     switch (op->op) {
         case GGML_OP_UNARY:
             switch (ggml_get_unary_op(op)) {
                 case GGML_UNARY_OP_GELU:
                 case GGML_UNARY_OP_SILU:
                 case GGML_UNARY_OP_RELU:
+                case GGML_UNARY_OP_SIGMOID:
                 case GGML_UNARY_OP_HARDSIGMOID:
                 case GGML_UNARY_OP_HARDSWISH:
                 case GGML_UNARY_OP_GELU_QUICK:
@@ -2829,7 +2830,6 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
         case GGML_OP_DIAG_MASK_INF:
         case GGML_OP_SOFT_MAX:
         case GGML_OP_ROPE:
-        case GGML_OP_ALIBI:
         case GGML_OP_IM2COL:
         case GGML_OP_POOL_2D:
         case GGML_OP_SUM_ROWS:
@@ -2841,8 +2841,16 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
         case GGML_OP_ARANGE:
         case GGML_OP_TIMESTEP_EMBEDDING:
         case GGML_OP_LEAKY_RELU:
-        case GGML_OP_FLASH_ATTN_EXT:
             return true;
+        case GGML_OP_FLASH_ATTN_EXT:
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+            return op->src[0]->ne[0] == 64 || op->src[0]->ne[0] == 128;
+#else
+            if (op->src[0]->ne[0] == 64 || op->src[0]->ne[0] == 128) {
+                return true;
+            }
+            return ggml_cuda_info().devices[cuda_ctx->device].cc >= CC_VOLTA;
+#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
         default:
             return false;
     }
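The last hunk turns FLASH_ATTN_EXT support from a blanket `return true` into a head-size and architecture check. Restated as a standalone predicate (a sketch only; `hip_amd` and `cc_volta_or_newer` stand in for the build-flag and compute-capability checks in the actual code):

```c
#include <stdbool.h>
#include <stdint.h>

// head_size corresponds to op->src[0]->ne[0]; hip_amd stands in for a
// GGML_USE_HIPBLAS/__HIP_PLATFORM_AMD__ build, and cc_volta_or_newer for the
// ggml_cuda_info().devices[...].cc >= CC_VOLTA check.
static bool flash_attn_ext_supported(int64_t head_size, bool hip_amd, bool cc_volta_or_newer) {
    if (head_size == 64 || head_size == 128) {
        return true;                      // these head sizes are accepted on all supported GPUs
    }
    return !hip_amd && cc_volta_or_newer; // other head sizes need an NVIDIA GPU, Volta or newer
}
```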
data/vendor/tmp/llama.cpp/ggml-impl.h
CHANGED
@@ -120,9 +120,16 @@ extern "C" {
 #ifndef __F16C__
 #define __F16C__
 #endif
+#endif
+
+// __SSE3__ and __SSSE3__ are not defined in MSVC, but SSE3/SSSE3 are present when AVX/AVX2/AVX512 are available
+#if defined(_MSC_VER) && (defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__))
 #ifndef __SSE3__
 #define __SSE3__
 #endif
+#ifndef __SSSE3__
+#define __SSSE3__
+#endif
 #endif
 
 // 16-bit float
data/vendor/tmp/llama.cpp/ggml-kompute.cpp
CHANGED
@@ -1559,12 +1559,18 @@ static void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml
             case GGML_OP_SOFT_MAX:
                 {
                     float scale;
-
+                    float max_bias;
 
-
+                    memcpy(&scale,    (float *)dst->op_params + 0, sizeof(float));
+                    memcpy(&max_bias, (float *)dst->op_params + 1, sizeof(float));
+
+#pragma message("TODO: add ggml_vk_soft_max() F16 src1 support")
 #pragma message("ref: https://github.com/ggerganov/llama.cpp/pull/5021")
                     GGML_ASSERT(!src1 || src1t == GGML_TYPE_F32);
-
+
+#pragma message("TODO: add ALiBi support")
+#pragma message("ref: https://github.com/ggerganov/llama.cpp/pull/7192")
+                    GGML_ASSERT(max_bias == 0.0f);
 
                     ggml_vk_soft_max(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, ne01, ne02, ne03, scale);
                 } break;
data/vendor/tmp/llama.cpp/ggml-metal.m
CHANGED
@@ -40,6 +40,7 @@ enum ggml_metal_kernel_type {
     GGML_METAL_KERNEL_TYPE_CLAMP,
     GGML_METAL_KERNEL_TYPE_TANH,
     GGML_METAL_KERNEL_TYPE_RELU,
+    GGML_METAL_KERNEL_TYPE_SIGMOID,
     GGML_METAL_KERNEL_TYPE_GELU,
     GGML_METAL_KERNEL_TYPE_GELU_4,
     GGML_METAL_KERNEL_TYPE_GELU_QUICK,
@@ -169,7 +170,6 @@ enum ggml_metal_kernel_type {
     GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_XS_F32,
     GGML_METAL_KERNEL_TYPE_ROPE_F32,
     GGML_METAL_KERNEL_TYPE_ROPE_F16,
-    GGML_METAL_KERNEL_TYPE_ALIBI_F32,
     GGML_METAL_KERNEL_TYPE_IM2COL_F16,
     GGML_METAL_KERNEL_TYPE_IM2COL_F32,
     GGML_METAL_KERNEL_TYPE_UPSCALE_F32,
@@ -494,6 +494,7 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CLAMP, clamp, true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_TANH, tanh, true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_RELU, relu, true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SIGMOID, sigmoid, true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GELU, gelu, true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GELU_4, gelu_4, true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GELU_QUICK, gelu_quick, true);
@@ -623,7 +624,6 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_XS_F32, mul_mm_id_iq4_xs_f32, ctx->support_simdgroup_mm);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_F32, rope_f32, true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_F16, rope_f16, true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ALIBI_F32, alibi_f32, true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_IM2COL_F16, im2col_f16, true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_IM2COL_F32, im2col_f32, true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_UPSCALE_F32, upscale_f32, true);
@@ -633,14 +633,14 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC, argsort_f32_i32_asc, true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_DESC, argsort_f32_i32_desc, true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_LEAKY_RELU_F32, leaky_relu_f32, true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H64, flash_attn_ext_f16_h64,
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H80, flash_attn_ext_f16_h80,
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H96, flash_attn_ext_f16_h96,
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H112, flash_attn_ext_f16_h112,
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H128, flash_attn_ext_f16_h128,
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H256, flash_attn_ext_f16_h256,
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H128, flash_attn_ext_vec_f16_h128,
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H256, flash_attn_ext_vec_f16_h256,
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H64, flash_attn_ext_f16_h64, ctx->support_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H80, flash_attn_ext_f16_h80, ctx->support_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H96, flash_attn_ext_f16_h96, ctx->support_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H112, flash_attn_ext_f16_h112, ctx->support_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H128, flash_attn_ext_f16_h128, ctx->support_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H256, flash_attn_ext_f16_h256, ctx->support_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H128, flash_attn_ext_vec_f16_h128, ctx->support_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H256, flash_attn_ext_vec_f16_h256, ctx->support_simdgroup_reduction);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_F16, cpy_f32_f16, true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_F32, cpy_f32_f32, true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_Q8_0, cpy_f32_q8_0, true);
@@ -732,6 +732,7 @@ static bool ggml_metal_supports_op(const struct ggml_metal_context * ctx, const
             switch (ggml_get_unary_op(op)) {
                 case GGML_UNARY_OP_TANH:
                 case GGML_UNARY_OP_RELU:
+                case GGML_UNARY_OP_SIGMOID:
                 case GGML_UNARY_OP_GELU:
                 case GGML_UNARY_OP_GELU_QUICK:
                 case GGML_UNARY_OP_SILU:
@@ -759,7 +760,6 @@ static bool ggml_metal_supports_op(const struct ggml_metal_context * ctx, const
         case GGML_OP_GROUP_NORM:
             return ctx->support_simdgroup_reduction;
         case GGML_OP_NORM:
-        case GGML_OP_ALIBI:
         case GGML_OP_ROPE:
         case GGML_OP_IM2COL:
             return true;
@@ -772,8 +772,9 @@ static bool ggml_metal_supports_op(const struct ggml_metal_context * ctx, const
         case GGML_OP_TIMESTEP_EMBEDDING:
         case GGML_OP_ARGSORT:
         case GGML_OP_LEAKY_RELU:
-        case GGML_OP_FLASH_ATTN_EXT:
             return true;
+        case GGML_OP_FLASH_ATTN_EXT:
+            return ctx->support_simdgroup_mm; // TODO: over-restricted for vec-kernels
         case GGML_OP_MUL_MAT:
         case GGML_OP_MUL_MAT_ID:
             return ctx->support_simdgroup_reduction &&
@@ -1194,24 +1195,24 @@ static enum ggml_status ggml_metal_graph_compute(
                         [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
                     } break;
                 case GGML_OP_CLAMP:
-
-
+                    {
+                        id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CLAMP].pipeline;
 
-
-
-
-
+                        float min;
+                        float max;
+                        memcpy(&min, ((int32_t *) dst->op_params) + 0, sizeof(float));
+                        memcpy(&max, ((int32_t *) dst->op_params) + 1, sizeof(float));
 
-
-
-
-
-
+                        [encoder setComputePipelineState:pipeline];
+                        [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+                        [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
+                        [encoder setBytes:&min length:sizeof(min) atIndex:2];
+                        [encoder setBytes:&max length:sizeof(max) atIndex:3];
 
-
+                        const int64_t n = ggml_nelements(dst);
 
-
-
+                        [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
+                    } break;
                 case GGML_OP_UNARY:
                     switch (ggml_get_unary_op(gf->nodes[i])) {
                         // we are not taking into account the strides, so for now require contiguous tensors
@@ -1239,6 +1240,18 @@ static enum ggml_status ggml_metal_graph_compute(
 
                         const int64_t n = ggml_nelements(dst);
 
+                        [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
+                    } break;
+                case GGML_UNARY_OP_SIGMOID:
+                    {
+                        id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SIGMOID].pipeline;
+
+                        [encoder setComputePipelineState:pipeline];
+                        [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+                        [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
+
+                        const int64_t n = ggml_nelements(dst);
+
                         [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
                     } break;
                 case GGML_UNARY_OP_GELU:
@@ -1357,16 +1370,15 @@ static enum ggml_status ggml_metal_graph_compute(
             case GGML_OP_SOFT_MAX:
                 {
                     GGML_ASSERT(!src1 || src1->type == GGML_TYPE_F16 || src1->type == GGML_TYPE_F32);
-                    GGML_ASSERT(!src2 || src2->type == GGML_TYPE_F16 || src2->type == GGML_TYPE_F32);
 
                     int nth = 32; // SIMD width
 
                     id<MTLComputePipelineState> pipeline = nil;
 
-                    const bool use_f16 = (src1 && src1->type == GGML_TYPE_F16)
+                    const bool use_f16 = (src1 && src1->type == GGML_TYPE_F16);
 
                     if (ne00%4 == 0) {
-                        while (nth < ne00/4 && nth < 256) {
+                        while (nth < ne00/4 && nth*ne01*ne02*ne03 < 256) {
                             nth *= 2;
                         }
                         if (use_f16) {
@@ -1375,7 +1387,7 @@ static enum ggml_status ggml_metal_graph_compute(
                             pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SOFT_MAX_F32_4].pipeline;
                         }
                     } else {
-                        while (nth < ne00 && nth <
+                        while (nth < ne00 && nth*ne01*ne02*ne03 < 256) {
                             nth *= 2;
                         }
                         if (use_f16) {
@@ -1394,8 +1406,8 @@ static enum ggml_status ggml_metal_graph_compute(
                     const int64_t nrows_x = ggml_nrows(src0);
                     const int64_t nrows_y = src0->ne[1];
 
-                    const uint32_t
-                    const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float)
+                    const uint32_t n_head      = nrows_x/nrows_y;
+                    const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head));
 
                     const float m0 = powf(2.0f, -(max_bias       ) / n_head_log2);
                     const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
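The m0, m1 and n_head_log2 values computed here parameterize the ALiBi bias slopes that the soft_max and flash-attention paths now handle via max_bias (the standalone GGML_OP_ALIBI op is removed elsewhere in this diff). A worked sketch of how a per-head slope is typically derived from these values follows; only the m0/m1/n_head_log2 formulas are taken from the hunk above, the head-to-slope mapping is an assumption based on the usual ALiBi recipe:

```c
#include <math.h>

// m0, m1 and n_head_log2 are computed exactly as in the hunk above; the
// mapping from head index to slope is an assumption about how the kernels
// consume these parameters.
static float alibi_slope(float max_bias, unsigned n_head, unsigned head) {
    const unsigned n_head_log2 = 1u << (unsigned) floorf(log2f((float) n_head));

    const float m0 = powf(2.0f, -(max_bias       ) / n_head_log2);
    const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);

    return head < n_head_log2 ? powf(m0, (float) (head + 1))
                              : powf(m1, (float) (2*(head - n_head_log2) + 1));
}
```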
@@ -1407,20 +1419,15 @@ static enum ggml_status ggml_metal_graph_compute(
                     } else {
                         [encoder setBuffer:id_src0 offset:offs_src0 atIndex:1];
                     }
-
-
-
-
-
-                    [encoder
-                    [encoder setBytes:&
-                    [encoder setBytes:&
-                    [encoder setBytes:&
-                    [encoder setBytes:&scale length:sizeof(scale) atIndex:7];
-                    [encoder setBytes:&max_bias length:sizeof(max_bias) atIndex:8];
-                    [encoder setBytes:&m0 length:sizeof(m0) atIndex:9];
-                    [encoder setBytes:&m1 length:sizeof(m1) atIndex:10];
-                    [encoder setBytes:&n_head_log2 length:sizeof(n_head_log2) atIndex:11];
+                    [encoder setBuffer:id_dst offset:offs_dst atIndex:2];
+                    [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3];
+                    [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:4];
+                    [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:5];
+                    [encoder setBytes:&scale length:sizeof(scale) atIndex:6];
+                    [encoder setBytes:&max_bias length:sizeof(max_bias) atIndex:7];
+                    [encoder setBytes:&m0 length:sizeof(m0) atIndex:8];
+                    [encoder setBytes:&m1 length:sizeof(m1) atIndex:9];
+                    [encoder setBytes:&n_head_log2 length:sizeof(n_head_log2) atIndex:10];
                     [encoder setThreadgroupMemoryLength:32*sizeof(float) atIndex:0];
 
                     [encoder dispatchThreadgroups:MTLSizeMake(ne01*ne02*ne03, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
@@ -2225,49 +2232,6 @@ static enum ggml_status ggml_metal_graph_compute(
 
                     [encoder dispatchThreadgroups:MTLSizeMake(nrows, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
                 } break;
-            case GGML_OP_ALIBI:
-                {
-                    GGML_ASSERT((src0t == GGML_TYPE_F32));
-
-                    const int nth = MIN(1024, ne00);
-
-                    //const int n_past = ((int32_t *) dst->op_params)[0];
-                    const int n_head = ((int32_t *) dst->op_params)[1];
-
-                    float max_bias;
-                    memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float));
-
-                    const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));
-                    const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
-                    const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);
-
-                    id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ALIBI_F32].pipeline;
-
-                    [encoder setComputePipelineState:pipeline];
-                    [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
-                    [encoder setBuffer:id_dst offset:offs_dst atIndex:1];
-                    [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2];
-                    [encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:3];
-                    [encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:4];
-                    [encoder setBytes:&ne03 length:sizeof( int64_t) atIndex:5];
-                    [encoder setBytes:&nb00 length:sizeof(uint64_t) atIndex:6];
-                    [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:7];
-                    [encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:8];
-                    [encoder setBytes:&nb03 length:sizeof(uint64_t) atIndex:9];
-                    [encoder setBytes:&ne0 length:sizeof( int64_t) atIndex:10];
-                    [encoder setBytes:&ne1 length:sizeof( int64_t) atIndex:11];
-                    [encoder setBytes:&ne2 length:sizeof( int64_t) atIndex:12];
-                    [encoder setBytes:&ne3 length:sizeof( int64_t) atIndex:13];
-                    [encoder setBytes:&nb0 length:sizeof(uint64_t) atIndex:14];
-                    [encoder setBytes:&nb1 length:sizeof(uint64_t) atIndex:15];
-                    [encoder setBytes:&nb2 length:sizeof(uint64_t) atIndex:16];
-                    [encoder setBytes:&nb3 length:sizeof(uint64_t) atIndex:17];
-                    [encoder setBytes:&m0 length:sizeof( float) atIndex:18];
-                    [encoder setBytes:&m1 length:sizeof( float) atIndex:19];
-                    [encoder setBytes:&n_heads_log2_floor length:sizeof(int) atIndex:20];
-
-                    [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
-                } break;
             case GGML_OP_ROPE:
                 {
                     GGML_ASSERT(ne10 == ne02);
@@ -2389,7 +2353,10 @@ static enum ggml_status ggml_metal_graph_compute(
                 {
                     GGML_ASSERT(src0->type == GGML_TYPE_F32);
 
-                    const
+                    const float sf0 = (float)ne0/src0->ne[0];
+                    const float sf1 = (float)ne1/src0->ne[1];
+                    const float sf2 = (float)ne2/src0->ne[2];
+                    const float sf3 = (float)ne3/src0->ne[3];
 
                     const id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_UPSCALE_F32].pipeline;
 
@@ -2412,7 +2379,10 @@ static enum ggml_status ggml_metal_graph_compute(
                     [encoder setBytes:&nb1 length:sizeof(nb1) atIndex:15];
                     [encoder setBytes:&nb2 length:sizeof(nb2) atIndex:16];
                     [encoder setBytes:&nb3 length:sizeof(nb3) atIndex:17];
-                    [encoder setBytes:&
+                    [encoder setBytes:&sf0 length:sizeof(sf0) atIndex:18];
+                    [encoder setBytes:&sf1 length:sizeof(sf1) atIndex:19];
+                    [encoder setBytes:&sf2 length:sizeof(sf2) atIndex:20];
+                    [encoder setBytes:&sf3 length:sizeof(sf3) atIndex:21];
 
                     const int nth = MIN((int) pipeline.maxTotalThreadsPerThreadgroup, ne0);
 
@@ -2548,13 +2518,14 @@ static enum ggml_status ggml_metal_graph_compute(
                 } break;
             case GGML_OP_FLASH_ATTN_EXT:
                 {
-                    GGML_ASSERT(ne00 % 4
+                    GGML_ASSERT(ne00 % 4  == 0);
+                    GGML_ASSERT(ne11 % 32 == 0);
+
                     GGML_ASSERT(src0->type == GGML_TYPE_F32);
 
-
+                    GGML_ASSERT(ggml_are_same_shape (src1, src2));
 
-
-                    GGML_ASSERT(src3);
+                    struct ggml_tensor * src3 = gf->nodes[i]->src[3];
 
                     size_t offs_src3 = 0;
 
@@ -2564,8 +2535,13 @@ static enum ggml_status ggml_metal_graph_compute(
                     GGML_ASSERT(!src3 || src3->ne[1] >= GGML_PAD(src0->ne[1], 8) &&
                             "the Flash-Attention Metal kernel requires the mask to be padded to 8 and at least n_queries big");
 
+                    const uint64_t nb20 = src2 ? src2->nb[0] : 0; GGML_UNUSED(nb20);
+                    const uint64_t nb21 = src2 ? src2->nb[1] : 0;
+                    const uint64_t nb22 = src2 ? src2->nb[2] : 0;
+                    const uint64_t nb23 = src2 ? src2->nb[3] : 0;
+
                     const int64_t  ne30 = src3 ? src3->ne[0] : 0; GGML_UNUSED(ne30);
-
+                    //const int64_t ne31 = src3 ? src3->ne[1] : 0;
                     const int64_t  ne32 = src3 ? src3->ne[2] : 0; GGML_UNUSED(ne32);
                     const int64_t  ne33 = src3 ? src3->ne[3] : 0; GGML_UNUSED(ne33);
 
@@ -2577,7 +2553,16 @@ static enum ggml_status ggml_metal_graph_compute(
                     const enum ggml_type src2t = src2 ? src2->type : GGML_TYPE_COUNT; GGML_UNUSED(src2t);
 
                     float scale;
-
+                    float max_bias;
+
+                    memcpy(&scale,    ((int32_t *) dst->op_params) + 0, sizeof(scale));
+                    memcpy(&max_bias, ((int32_t *) dst->op_params) + 1, sizeof(max_bias));
+
+                    const uint32_t n_head      = src0->ne[2];
+                    const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head));
+
+                    const float m0 = powf(2.0f, -(max_bias       ) / n_head_log2);
+                    const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
 
                     id<MTLComputePipelineState> pipeline = nil;
 
@@ -2614,34 +2599,38 @@ static enum ggml_status ggml_metal_graph_compute(
                     }
 
                     [encoder setComputePipelineState:pipeline];
-                    [encoder setBuffer:id_src0
-                    [encoder setBuffer:id_src1
-                    [encoder setBuffer:id_src2
-
-
-
-
-
-                    [encoder
-                    [encoder setBytes:&
-                    [encoder setBytes:&
-                    [encoder setBytes:&
-                    [encoder setBytes:&
-                    [encoder setBytes:&
-                    [encoder setBytes:&
-                    [encoder setBytes:&
-                    [encoder setBytes:&
-                    [encoder setBytes:&
-                    [encoder setBytes:&nb11
-                    [encoder setBytes:&nb12
-                    [encoder setBytes:&nb13
-                    [encoder setBytes:&
-                    [encoder setBytes:&
-                    [encoder setBytes:&
-                    [encoder setBytes:&
-                    [encoder setBytes:&
-                    [encoder setBytes:&
-                    [encoder setBytes:&scale
+                    [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+                    [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
+                    [encoder setBuffer:id_src2 offset:offs_src2 atIndex:2];
+                    if (id_src3) {
+                        [encoder setBuffer:id_src3 offset:offs_src3 atIndex:3];
+                    } else {
+                        [encoder setBuffer:id_src0 offset:offs_src0 atIndex:3];
+                    }
+                    [encoder setBuffer:id_dst offset:offs_dst atIndex:4];
+                    [encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:5];
+                    [encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:6];
+                    [encoder setBytes:&ne03 length:sizeof( int64_t) atIndex:7];
+                    [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:8];
+                    [encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:9];
+                    [encoder setBytes:&nb03 length:sizeof(uint64_t) atIndex:10];
+                    [encoder setBytes:&ne11 length:sizeof( int64_t) atIndex:11];
+                    [encoder setBytes:&ne12 length:sizeof( int64_t) atIndex:12];
+                    [encoder setBytes:&ne13 length:sizeof( int64_t) atIndex:13];
+                    [encoder setBytes:&nb11 length:sizeof(uint64_t) atIndex:14];
+                    [encoder setBytes:&nb12 length:sizeof(uint64_t) atIndex:15];
+                    [encoder setBytes:&nb13 length:sizeof(uint64_t) atIndex:16];
+                    [encoder setBytes:&nb21 length:sizeof(uint64_t) atIndex:17];
+                    [encoder setBytes:&nb22 length:sizeof(uint64_t) atIndex:18];
+                    [encoder setBytes:&nb23 length:sizeof(uint64_t) atIndex:19];
+                    [encoder setBytes:&nb31 length:sizeof(uint64_t) atIndex:20];
+                    [encoder setBytes:&ne1 length:sizeof( int64_t) atIndex:21];
+                    [encoder setBytes:&ne2 length:sizeof( int64_t) atIndex:22];
+                    [encoder setBytes:&scale length:sizeof( float) atIndex:23];
+                    [encoder setBytes:&max_bias length:sizeof( float) atIndex:24];
+                    [encoder setBytes:&m0 length:sizeof(m0) atIndex:25];
+                    [encoder setBytes:&m1 length:sizeof(m1) atIndex:26];
+                    [encoder setBytes:&n_head_log2 length:sizeof(n_head_log2) atIndex:27];
 
                     if (!use_vec_kernel) {
                         // half8x8 kernel