@fugood/llama.node 0.3.9 → 0.3.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (106)
  1. package/bin/darwin/arm64/llama-node.node +0 -0
  2. package/bin/darwin/x64/llama-node.node +0 -0
  3. package/bin/linux/arm64/llama-node.node +0 -0
  4. package/bin/linux/x64/llama-node.node +0 -0
  5. package/bin/linux-cuda/arm64/llama-node.node +0 -0
  6. package/bin/linux-cuda/x64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  8. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  9. package/bin/win32/arm64/llama-node.node +0 -0
  10. package/bin/win32/arm64/node.lib +0 -0
  11. package/bin/win32/x64/llama-node.node +0 -0
  12. package/bin/win32/x64/node.lib +0 -0
  13. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  14. package/bin/win32-vulkan/arm64/node.lib +0 -0
  15. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  16. package/bin/win32-vulkan/x64/node.lib +0 -0
  17. package/lib/binding.js +2 -2
  18. package/lib/binding.ts +47 -8
  19. package/lib/index.js +21 -1
  20. package/lib/index.ts +31 -1
  21. package/package.json +12 -3
  22. package/src/LlamaCompletionWorker.cpp +33 -6
  23. package/src/LlamaCompletionWorker.h +3 -1
  24. package/src/LlamaContext.cpp +336 -28
  25. package/src/LlamaContext.h +2 -0
  26. package/src/common.hpp +19 -2
  27. package/src/llama.cpp/.github/workflows/build.yml +289 -107
  28. package/src/llama.cpp/.github/workflows/close-issue.yml +1 -1
  29. package/src/llama.cpp/.github/workflows/docker.yml +2 -1
  30. package/src/llama.cpp/.github/workflows/server.yml +25 -2
  31. package/src/llama.cpp/CMakeLists.txt +10 -19
  32. package/src/llama.cpp/cmake/build-info.cmake +1 -1
  33. package/src/llama.cpp/common/CMakeLists.txt +32 -0
  34. package/src/llama.cpp/common/arg.cpp +66 -16
  35. package/src/llama.cpp/common/chat-template.hpp +515 -0
  36. package/src/llama.cpp/common/chat.cpp +966 -0
  37. package/src/llama.cpp/common/chat.hpp +52 -0
  38. package/src/llama.cpp/common/common.cpp +159 -36
  39. package/src/llama.cpp/common/common.h +56 -14
  40. package/src/llama.cpp/common/json-schema-to-grammar.cpp +46 -66
  41. package/src/llama.cpp/common/json-schema-to-grammar.h +15 -1
  42. package/src/llama.cpp/common/llguidance.cpp +270 -0
  43. package/src/llama.cpp/common/log.cpp +1 -10
  44. package/src/llama.cpp/common/log.h +10 -0
  45. package/src/llama.cpp/common/minja.hpp +2868 -0
  46. package/src/llama.cpp/common/sampling.cpp +22 -1
  47. package/src/llama.cpp/common/sampling.h +3 -0
  48. package/src/llama.cpp/docs/build.md +54 -9
  49. package/src/llama.cpp/examples/export-lora/export-lora.cpp +12 -2
  50. package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +1 -1
  51. package/src/llama.cpp/examples/llava/CMakeLists.txt +7 -0
  52. package/src/llama.cpp/examples/llava/clip-quantize-cli.cpp +59 -0
  53. package/src/llama.cpp/examples/llava/clip.cpp +133 -14
  54. package/src/llama.cpp/examples/llava/clip.h +2 -0
  55. package/src/llama.cpp/examples/llava/llava.cpp +22 -8
  56. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +9 -1
  57. package/src/llama.cpp/examples/main/main.cpp +26 -25
  58. package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.cpp +136 -137
  59. package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.h +18 -4
  60. package/src/llama.cpp/examples/run/run.cpp +224 -69
  61. package/src/llama.cpp/examples/server/server.cpp +252 -81
  62. package/src/llama.cpp/examples/server/utils.hpp +73 -21
  63. package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +6 -4
  64. package/src/llama.cpp/examples/simple-cmake-pkg/CMakeLists.txt +11 -0
  65. package/src/llama.cpp/ggml/CMakeLists.txt +78 -1
  66. package/src/llama.cpp/ggml/include/ggml.h +1 -1
  67. package/src/llama.cpp/ggml/src/CMakeLists.txt +21 -4
  68. package/src/llama.cpp/ggml/src/ggml-alloc.c +1 -13
  69. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +91 -78
  70. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +7 -7
  71. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +2 -1
  72. package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +1 -1
  73. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +46 -0
  74. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +16 -1
  75. package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +1 -1
  76. package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +28 -8
  77. package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +5 -7
  78. package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +33 -23
  79. package/src/llama.cpp/ggml/src/ggml-sycl/softmax.hpp +1 -5
  80. package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +323 -121
  81. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +13 -3
  82. package/src/llama.cpp/ggml/src/ggml.c +23 -13
  83. package/src/llama.cpp/include/llama.h +14 -1
  84. package/src/llama.cpp/models/ggml-vocab-deepseek-r1-qwen.gguf.inp +112 -0
  85. package/src/llama.cpp/models/ggml-vocab-deepseek-r1-qwen.gguf.out +46 -0
  86. package/src/llama.cpp/src/CMakeLists.txt +1 -1
  87. package/src/llama.cpp/src/llama-arch.cpp +7 -2
  88. package/src/llama.cpp/src/llama-arch.h +3 -1
  89. package/src/llama.cpp/src/llama-chat.cpp +11 -2
  90. package/src/llama.cpp/src/llama-chat.h +1 -0
  91. package/src/llama.cpp/src/llama-grammar.cpp +86 -6
  92. package/src/llama.cpp/src/llama-grammar.h +22 -1
  93. package/src/llama.cpp/src/llama-mmap.cpp +1 -0
  94. package/src/llama.cpp/src/llama-model-loader.cpp +1 -1
  95. package/src/llama.cpp/src/llama-model.cpp +76 -6
  96. package/src/llama.cpp/src/llama-sampling.cpp +47 -4
  97. package/src/llama.cpp/src/llama-vocab.cpp +10 -4
  98. package/src/llama.cpp/src/llama.cpp +181 -123
  99. package/src/llama.cpp/tests/CMakeLists.txt +4 -0
  100. package/src/llama.cpp/tests/test-backend-ops.cpp +158 -57
  101. package/src/llama.cpp/tests/test-chat-template.cpp +154 -31
  102. package/src/llama.cpp/tests/test-chat.cpp +607 -0
  103. package/src/llama.cpp/tests/test-grammar-integration.cpp +2 -2
  104. package/src/llama.cpp/tests/test-grammar-llguidance.cpp +1140 -0
  105. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +1 -1
  106. package/src/llama.cpp/examples/main-cmake-pkg/CMakeLists.txt +0 -32
package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h
@@ -1,5 +1,6 @@
 #pragma once
 
+#define HIP_ENABLE_WARP_SYNC_BUILTINS 1
 #include <hip/hip_runtime.h>
 #include <hipblas/hipblas.h>
 #include <hip/hip_fp16.h>
@@ -8,6 +9,7 @@
 // for rocblas_initialize()
 #include "rocblas/rocblas.h"
 #endif // __HIP_PLATFORM_AMD__
+
 #define CUBLAS_COMPUTE_16F HIPBLAS_R_16F
 #define CUBLAS_COMPUTE_32F HIPBLAS_R_32F
 #define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F
@@ -19,6 +21,13 @@
 #define CUBLAS_TF32_TENSOR_OP_MATH 0
 #define CUDA_R_16F HIPBLAS_R_16F
 #define CUDA_R_32F HIPBLAS_R_32F
+#define CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED hipDeviceAttributeVirtualMemoryManagementSupported
+#define CU_MEM_ALLOC_GRANULARITY_RECOMMENDED hipMemAllocationGranularityRecommended
+#define CU_MEM_ALLOCATION_TYPE_PINNED hipMemAllocationTypePinned
+#define CU_MEM_LOCATION_TYPE_DEVICE hipMemLocationTypeDevice
+#define CU_MEM_ACCESS_FLAGS_PROT_READWRITE hipMemAccessFlagsProtReadWrite
+#define CU_CHECK(fn) {hipError_t err = fn; if(err != hipSuccess) { GGML_ABORT("HipVMM Failure: %s\n", hipGetErrorString(err)); }}
+#define __shfl_sync(mask, var, laneMask, width) __shfl(var, laneMask, width)
 #define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width)
 #define cublasComputeType_t hipblasDatatype_t //deprecated, new hipblasComputeType_t not in 5.6
 #define cublasCreate hipblasCreate
@@ -74,6 +83,21 @@
 #define cudaMemGetInfo hipMemGetInfo
 #define cudaOccupancyMaxPotentialBlockSize hipOccupancyMaxPotentialBlockSize
 #define cudaSetDevice hipSetDevice
+#define cuDeviceGet hipDeviceGet
+#define CUdevice hipDevice_t
+#define CUdeviceptr hipDeviceptr_t
+#define cuMemUnmap hipMemUnmap
+#define CUmemAccessDesc hipMemAccessDesc
+#define cuMemAddressFree hipMemAddressFree
+#define cuMemRelease hipMemRelease
+#define CUmemGenericAllocationHandle hipMemGenericAllocationHandle_t
+#define cuMemCreate hipMemCreate
+#define cuMemAddressReserve hipMemAddressReserve
+#define cuMemMap hipMemMap
+#define cuMemSetAccess hipMemSetAccess
+#define cuMemGetAllocationGranularity hipMemGetAllocationGranularity
+#define CUmemAllocationProp hipMemAllocationProp
+#define cuDeviceGetAttribute hipDeviceGetAttribute
 #define cudaStreamCreateWithFlags hipStreamCreateWithFlags
 #define cudaStreamDestroy hipStreamDestroy
 #define cudaStreamFireAndForget hipStreamFireAndForget
@@ -81,6 +105,28 @@
 #define cudaStreamPerThread hipStreamPerThread
 #define cudaStreamSynchronize hipStreamSynchronize
 #define cudaStreamWaitEvent(stream, event, flags) hipStreamWaitEvent(stream, event, flags)
+#define cudaGraphExec_t hipGraphExec_t
+#define cudaGraphNode_t hipGraphNode_t
+#define cudaKernelNodeParams hipKernelNodeParams
+#define cudaKernelNodeParams hipKernelNodeParams
+#define cudaGraphExecDestroy hipGraphExecDestroy
+#define cudaGraphLaunch hipGraphLaunch
+#define cudaErrorGraphExecUpdateFailure hipErrorGraphExecUpdateFailure
+#define cudaGraphExecUpdateResultInfo hipGraphExecUpdateResult
+#define cudaGraphNodeType hipGraphNodeType
+#define cudaGraphNodeTypeKernel hipGraphNodeTypeKernel
+#define cudaGraphInstantiate hipGraphInstantiate
+#define cudaStreamEndCapture hipStreamEndCapture
+#define cudaGraphDestroy hipGraphDestroy
+#define cudaGraphKernelNodeSetParams hipGraphKernelNodeSetParams
+#define cudaErrorInvalidDeviceFunction hipErrorInvalidDeviceFunction
+#define cudaGraphKernelNodeGetParams hipGraphKernelNodeGetParams
+#define cudaGraphNodeGetType hipGraphNodeGetType
+#define cudaGraphGetNodes hipGraphGetNodes
+#define cudaGraphExecUpdate hipGraphExecUpdate
+#define cudaStreamCaptureModeRelaxed hipStreamCaptureModeRelaxed
+#define cudaStreamBeginCapture hipStreamBeginCapture
+#define cudaGraph_t hipGraph_t
 #define cudaStream_t hipStream_t
 #define cudaSuccess hipSuccess
 #define __trap() do { abort(); __builtin_unreachable(); } while(0)
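
The block above extends the CUDA-to-HIP aliasing table with virtual memory management (cuMem*/CU_MEM_*) and CUDA graph (cudaGraph*) names, which lets the shared CUDA backend sources build unchanged under HIP. Below is a minimal sketch of that aliasing pattern, assuming a HIP toolchain; it is an illustrative toy program, not code from this package:

// Illustrative only: CUDA-style host code compiles for HIP once macros remap the
// names, which is the mechanism vendors/hip.h relies on.
#include <hip/hip_runtime.h>

#define cudaStream_t              hipStream_t
#define cudaStreamCreateWithFlags hipStreamCreateWithFlags
#define cudaStreamNonBlocking     hipStreamNonBlocking
#define cudaStreamSynchronize     hipStreamSynchronize
#define cudaStreamDestroy         hipStreamDestroy

int main() {
    cudaStream_t stream;                                        // expands to hipStream_t
    cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);  // expands to hipStreamCreateWithFlags(...)
    cudaStreamSynchronize(stream);                              // hipStreamSynchronize
    cudaStreamDestroy(stream);                                  // hipStreamDestroy
    return 0;
}
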
package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt
@@ -40,13 +40,20 @@ find_package(hip REQUIRED)
 find_package(hipblas REQUIRED)
 find_package(rocblas REQUIRED)
 
+if (${hip_VERSION} VERSION_LESS 5.5)
+    message(FATAL_ERROR "At least ROCM/HIP V5.5 is required")
+endif()
+
 message(STATUS "HIP and hipBLAS found")
 
+# Workaround old compilers
+set(CMAKE_HIP_FLAGS "${CMAKE_HIP_FLAGS} --gpu-max-threads-per-block=1024")
+
 file(GLOB GGML_HEADERS_ROCM "../ggml-cuda/*.cuh")
 list(APPEND GGML_HEADERS_ROCM "../../include/ggml-cuda.h")
 
 file(GLOB GGML_SOURCES_ROCM "../ggml-cuda/*.cu")
-file(GLOB SRCS "../ggml-cuda/template-instances/fattn-wmma*.cu")
+file(GLOB SRCS "../ggml-cuda/template-instances/fattn-mma*.cu")
 list(APPEND GGML_SOURCES_ROCM ${SRCS})
 file(GLOB SRCS "../ggml-cuda/template-instances/mmq*.cu")
 list(APPEND GGML_SOURCES_ROCM ${SRCS})
@@ -92,6 +99,14 @@ if (GGML_CUDA_NO_PEER_COPY)
    add_compile_definitions(GGML_CUDA_NO_PEER_COPY)
 endif()
 
+if (GGML_HIP_GRAPHS)
+    add_compile_definitions(GGML_HIP_GRAPHS)
+endif()
+
+if (GGML_HIP_NO_VMM)
+    add_compile_definitions(GGML_HIP_NO_VMM)
+endif()
+
 if (CXX_IS_HIPCC)
    set_source_files_properties(${GGML_SOURCES_ROCM} PROPERTIES LANGUAGE CXX)
    target_link_libraries(ggml-hip PRIVATE hip::device)
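
GGML_HIP_GRAPHS and GGML_HIP_NO_VMM are exposed as ordinary compile definitions, so the C++ sources can gate the HIP graph and VMM code paths with the preprocessor. The sketch below shows the generic pattern such a definition enables; it is illustrative and not the package's actual gating code:

// Illustrative only: a CMake-level add_compile_definitions(GGML_HIP_GRAPHS)
// becomes a plain preprocessor switch in the C++ sources.
#include <cstdio>

static void run_graph_capture_path() { std::puts("HIP graphs path");   }
static void run_plain_stream_path()  { std::puts("plain stream path"); }

int main() {
#ifdef GGML_HIP_GRAPHS
    run_graph_capture_path();  // built with -DGGML_HIP_GRAPHS (e.g. via the new CMake option)
#else
    run_plain_stream_path();   // default when the option is off
#endif
    return 0;
}
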
package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt
@@ -29,7 +29,7 @@ if (MUSAToolkit_FOUND)
     list(APPEND GGML_HEADERS_MUSA "../../include/ggml-cuda.h")
 
     file(GLOB GGML_SOURCES_MUSA "../ggml-cuda/*.cu")
-    file(GLOB SRCS "../ggml-cuda/template-instances/fattn-wmma*.cu")
+    file(GLOB SRCS "../ggml-cuda/template-instances/fattn-mma*.cu")
     list(APPEND GGML_SOURCES_MUSA ${SRCS})
     file(GLOB SRCS "../ggml-cuda/template-instances/mmq*.cu")
     list(APPEND GGML_SOURCES_MUSA ${SRCS})
package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp
@@ -181,7 +181,7 @@ struct ggml_backend_rpc_context {
 
 struct ggml_backend_rpc_buffer_context {
     std::shared_ptr<socket_t> sock;
-    std::unordered_map<ggml_backend_buffer_t, void *> base_cache;
+    void * base_ptr;
     uint64_t remote_ptr;
 };
 
@@ -423,16 +423,15 @@ static void ggml_backend_rpc_buffer_free_buffer(ggml_backend_buffer_t buffer) {
 
 static void * ggml_backend_rpc_buffer_get_base(ggml_backend_buffer_t buffer) {
     ggml_backend_rpc_buffer_context * ctx = (ggml_backend_rpc_buffer_context *)buffer->context;
-    if (ctx->base_cache.find(buffer) != ctx->base_cache.end()) {
-        return ctx->base_cache[buffer];
+    if (ctx->base_ptr != nullptr) {
+        return ctx->base_ptr;
     }
     rpc_msg_buffer_get_base_req request = {ctx->remote_ptr};
     rpc_msg_buffer_get_base_rsp response;
     bool status = send_rpc_cmd(ctx->sock, RPC_CMD_BUFFER_GET_BASE, &request, sizeof(request), &response, sizeof(response));
     GGML_ASSERT(status);
-    void * base_ptr = reinterpret_cast<void *>(response.base_ptr);
-    ctx->base_cache[buffer] = base_ptr;
-    return base_ptr;
+    ctx->base_ptr = reinterpret_cast<void *>(response.base_ptr);
+    return ctx->base_ptr;
 }
 
 static rpc_tensor serialize_tensor(const ggml_tensor * tensor) {
@@ -557,7 +556,7 @@ static ggml_backend_buffer_t ggml_backend_rpc_buffer_type_alloc_buffer(ggml_back
     if (response.remote_ptr != 0) {
         ggml_backend_buffer_t buffer = ggml_backend_buffer_init(buft,
             ggml_backend_rpc_buffer_interface,
-            new ggml_backend_rpc_buffer_context{sock, {}, response.remote_ptr},
+            new ggml_backend_rpc_buffer_context{sock, nullptr, response.remote_ptr},
             response.remote_size);
         return buffer;
     } else {
@@ -1046,7 +1045,28 @@ bool rpc_server::copy_tensor(const rpc_msg_copy_tensor_req & request, rpc_msg_co
         ggml_free(ctx);
         return false;
     }
-    GGML_PRINT_DEBUG("[%s] src->buffer: %p, dst->buffer: %p\n", __func__, (void*)src->buffer, (void*)dst->buffer);
+
+    uint64_t src_size   = (uint64_t) ggml_nbytes(src);
+    uint64_t dst_data   = (uint64_t) dst->data;
+    uint64_t dst_base   = (uint64_t) ggml_backend_buffer_get_base(dst->buffer);
+    uint64_t dst_buf_sz = (uint64_t) ggml_backend_buffer_get_size(dst->buffer);
+
+    if (dst_data + src_size > dst_base + dst_buf_sz) {
+        GGML_PRINT_DEBUG("[%s] out-of-bounds write in rpc_server::copy_tensor:\n"
+                         "    write range : [0x%" PRIx64 ", 0x%" PRIx64 "]\n"
+                         "    buffer base: [0x%" PRIx64 ", 0x%" PRIx64 "]\n",
+                         __func__,
+                         dst_data,
+                         dst_data + src_size,
+                         dst_base,
+                         dst_base + dst_buf_sz);
+        ggml_free(ctx);
+        return false;
+    }
+
+    GGML_PRINT_DEBUG("[%s] src->buffer: %p, dst->buffer: %p\n",
+                     __func__, (void*) src->buffer, (void*) dst->buffer);
+
     response.result = ggml_backend_buffer_copy_tensor(src, dst);
     ggml_free(ctx);
     return true;
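
The added check in rpc_server::copy_tensor rejects a copy whose destination write range would end past the destination buffer, instead of forwarding it to the backend. The standalone sketch below reproduces the same bounds arithmetic with a hypothetical helper, purely for illustration:

#include <cstdint>
#include <cstdio>

// True when writing `size` bytes at `dst_data` ends within the buffer that starts at
// `dst_base` and spans `dst_buf_sz` bytes; this is the condition the RPC server now enforces.
static bool write_fits(uint64_t dst_data, uint64_t size, uint64_t dst_base, uint64_t dst_buf_sz) {
    return dst_data + size <= dst_base + dst_buf_sz;
}

int main() {
    // 1 KiB buffer based at 0x1000 (ends at 0x1400).
    std::printf("%d\n", write_fits(0x1100, 256, 0x1000, 1024)); // 1: end 0x1200 <= 0x1400
    std::printf("%d\n", write_fits(0x1380, 256, 0x1000, 1024)); // 0: end 0x1480 >  0x1400
    return 0;
}
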
package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp
@@ -3878,10 +3878,6 @@ static void ggml_sycl_diag_mask_inf(ggml_backend_sycl_context & ctx, ggml_tensor
     ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_diag_mask_inf);
 }
 
-static void ggml_sycl_soft_max(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
-    ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_soft_max);
-}
-
 static void ggml_sycl_rope(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
     GGML_ASSERT(ggml_is_contiguous(dst->src[0])); // TODO: this restriction is temporary until non-cont support is implemented
     ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_rope);
@@ -4090,7 +4086,7 @@ bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct ggml_tens
             ggml_sycl_diag_mask_inf(ctx, dst);
             break;
         case GGML_OP_SOFT_MAX:
-            ggml_sycl_soft_max(ctx, dst);
+            ggml_sycl_op_soft_max(ctx, dst);
             break;
         case GGML_OP_ROPE:
             ggml_sycl_rope(ctx, dst);
@@ -4541,14 +4537,17 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
         case GGML_OP_VIEW:
         case GGML_OP_PERMUTE:
         case GGML_OP_TRANSPOSE:
-        case GGML_OP_NORM:
         case GGML_OP_ADD:
         case GGML_OP_ADD1:
         case GGML_OP_LOG:
         case GGML_OP_SUB:
         case GGML_OP_MUL:
         case GGML_OP_DIV:
+            return true;
+        case GGML_OP_NORM:
         case GGML_OP_RMS_NORM:
+        case GGML_OP_GROUP_NORM:
+            return ggml_is_contiguous(op->src[0]);
         case GGML_OP_SCALE:
         case GGML_OP_SQR:
         case GGML_OP_SQRT:
@@ -4580,7 +4579,6 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
         case GGML_OP_SUM_ROWS:
         case GGML_OP_ARGSORT:
         case GGML_OP_ACC:
-        case GGML_OP_GROUP_NORM:
         case GGML_OP_UPSCALE:
         case GGML_OP_PAD:
         case GGML_OP_LEAKY_RELU:
package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp
@@ -1,7 +1,7 @@
-#include "norm.hpp"
+#include "softmax.hpp"
 
-template <bool vals_smem, int ncols_template, int block_size_template>
-static void soft_max_f32(const float * x, const float * mask, float * dst, const int ncols_par,
+template <bool vals_smem, int ncols_template, int block_size_template, typename T>
+static void soft_max_f32(const float * x, const T * mask, float * dst, const int ncols_par,
                          const int nrows_y, const float scale, const float max_bias, const float m0,
                          const float m1, uint32_t n_head_log2, const sycl::nd_item<3> &item_ct1, float *buf) {
     const int ncols = ncols_template == 0 ? ncols_par : ncols_template;
@@ -29,7 +29,7 @@ static void soft_max_f32(const float * x, const float * mask, float * dst, const
         slope = sycl::pow(base, float(exp));
     }
 
-    float *vals = vals_smem ? buf + std::max(nwarps, WARP_SIZE) : dst + rowx * ncols;
+    float *vals = vals_smem ? buf + sycl::max(nwarps, WARP_SIZE) : dst + rowx * ncols;
     float max_val = -INFINITY;
 
     for (int col0 = 0; col0 < ncols; col0 += block_size) {
@@ -42,7 +42,7 @@ static void soft_max_f32(const float * x, const float * mask, float * dst, const
         const int ix = rowx*ncols + col;
         const int iy = rowy*ncols + col;
 
-        const float val = x[ix]*scale + (mask ? slope*mask[iy] : 0.0f);
+        const float val = x[ix]*scale + (mask ? slope*static_cast<float>(mask[iy]) : 0.0f);
 
         vals[col] = val;
         max_val = sycl::max(max_val, val);
@@ -65,7 +65,7 @@ static void soft_max_f32(const float * x, const float * mask, float * dst, const
             item_ct1.barrier(sycl::access::fence_space::local_space);
             max_val = buf[lane_id];
             for (size_t i = 1; i < nreduce; i += 1) {
-                max_val = std::max(max_val, buf[lane_id + i * WARP_SIZE]);
+                max_val = sycl::max(max_val, buf[lane_id + i * WARP_SIZE]);
             }
             max_val = warp_reduce_max(max_val, item_ct1);
         }
@@ -122,8 +122,8 @@ static void soft_max_f32(const float * x, const float * mask, float * dst, const
     }
 }
 
-template <bool vals_smem, int ncols_template, int block_size_template>
-static void soft_max_f32_submitter(const float * x, const float * mask, float * dst, const int ncols_par,
+template <bool vals_smem, int ncols_template, int block_size_template, typename T>
+static void soft_max_f32_submitter(const float * x, const T * mask, float * dst, const int ncols_par,
                                    const int nrows_y, const float scale, const float max_bias, const float m0,
                                    const float m1, uint32_t n_head_log2, sycl::range<3> block_nums, sycl::range<3> block_dims,
                                    const size_t n_local_scratch, queue_ptr stream) {
@@ -141,7 +141,8 @@ static void soft_max_f32_submitter(const float * x, const float * mask, float *
     });
 }
 
-static void soft_max_f32_sycl(const float * x, const float * mask,
+template<typename T>
+static void soft_max_f32_sycl(const float * x, const T * mask,
                               float * dst, const int ncols_x, const int nrows_x,
                               const int nrows_y, const float scale, const float max_bias,
                               queue_ptr stream, int device) {
@@ -223,22 +224,16 @@ static void soft_max_f32_sycl(const float * x, const float * mask,
     }
 }
 
-void ggml_sycl_op_soft_max(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
-                           const ggml_tensor *src1, ggml_tensor *dst,
-                           const float *src0_dd, const float *src1_dd,
-                           float *dst_dd,
-                           const queue_ptr &main_stream) {
+void ggml_sycl_op_soft_max(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
 
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
     GGML_ASSERT( dst->type == GGML_TYPE_F32);
 
-#pragma message("TODO: add ggml_sycl_op_soft_max() F16 src1 support")
-#pragma message("ref: https://github.com/ggerganov/llama.cpp/pull/5021")
-    GGML_ASSERT(!src1 || src1->type == GGML_TYPE_F32); // src1 contains mask and it is optional
+    GGML_ASSERT(!dst->src[1] || dst->src[1]->type == GGML_TYPE_F16 || dst->src[1]->type == GGML_TYPE_F32); // src1 contains mask and it is optional
 
-    const int64_t ne00 = src0->ne[0];
-    const int64_t nrows_x = ggml_nrows(src0);
-    const int64_t nrows_y = src0->ne[1];
+    const int64_t ne00 = dst->src[0]->ne[0];
+    const int64_t nrows_x = ggml_nrows(dst->src[0]);
+    const int64_t nrows_y = dst->src[0]->ne[1];
 
     float scale = 1.0f;
     float max_bias = 0.0f;
@@ -246,6 +241,21 @@ void ggml_sycl_op_soft_max(ggml_backend_sycl_context & ctx, const ggml_tensor *s
     memcpy(&scale, dst->op_params + 0, sizeof(float));
     memcpy(&max_bias, dst->op_params + 1, sizeof(float));
 
-    soft_max_f32_sycl(src0_dd, src1 ? src1_dd : nullptr, dst_dd, ne00,
-                      nrows_x, nrows_y, scale, max_bias, main_stream, ctx.device);
+    const float * src0_dd = static_cast<const float *>(dst->src[0]->data);
+    float * dst_dd = static_cast<float *>(dst->data);
+
+    ggml_sycl_set_device(ctx.device);
+    dpct::queue_ptr main_stream = ctx.stream();
+
+    if (dst->src[1] && dst->src[1]->type == GGML_TYPE_F16) {
+        const sycl::half * src1_dd = static_cast<sycl::half *>(dst->src[1]->data);
+        soft_max_f32_sycl<sycl::half>(src0_dd, src1_dd, dst_dd, ne00, nrows_x, nrows_y, scale, max_bias,
+                                      main_stream, ctx.device);
+    } else if (dst->src[1] && dst->src[1]->type == GGML_TYPE_F32) {
+        const float * src1_dd = static_cast<const float *>(dst->src[1]->data);
+        soft_max_f32_sycl<float>(src0_dd, src1_dd, dst_dd, ne00, nrows_x, nrows_y, scale, max_bias, main_stream, ctx.device);
+    } else {
+        /* mask unavailable */
+        soft_max_f32_sycl<float>(src0_dd, nullptr, dst_dd, ne00, nrows_x, nrows_y, scale, max_bias, main_stream, ctx.device);
+    }
 }
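
ggml_sycl_op_soft_max now reads its operands from dst->src[...] and selects the kernel instantiation from the mask dtype (F16, F32, or no mask). The sketch below shows only that dispatch shape in plain C++; the kernel body is elided, half_t stands in for sycl::half, and all names are hypothetical:

#include <cstdio>

// Sketch of the dispatch pattern only: the launcher is templated on the mask element
// type, and the op entry point picks the instantiation from the mask tensor's dtype.
using half_t = float;                      // stand-in for sycl::half in this sketch

enum class mask_dtype { none, f16, f32 };

template <typename T>
static void masked_scale_rows(const float * x, const T * mask, float * dst, int n) {
    for (int i = 0; i < n; ++i) {
        // the cast mirrors the patch: slope * static_cast<float>(mask[iy])
        dst[i] = x[i] + (mask ? static_cast<float>(mask[i]) : 0.0f);
    }
}

static void soft_max_dispatch(const float * x, const void * mask, mask_dtype mt, float * dst, int n) {
    if (mt == mask_dtype::f16) {
        masked_scale_rows<half_t>(x, static_cast<const half_t *>(mask), dst, n);
    } else if (mt == mask_dtype::f32) {
        masked_scale_rows<float>(x, static_cast<const float *>(mask), dst, n);
    } else {
        masked_scale_rows<float>(x, nullptr, dst, n);   // mask unavailable
    }
}

int main() {
    float x[3] = {0.f, 1.f, 2.f}, m[3] = {0.5f, 0.5f, 0.5f}, out[3];
    soft_max_dispatch(x, m, mask_dtype::f32, out, 3);
    std::printf("%.1f %.1f %.1f\n", out[0], out[1], out[2]);    // 0.5 1.5 2.5
    return 0;
}
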
package/src/llama.cpp/ggml/src/ggml-sycl/softmax.hpp
@@ -15,10 +15,6 @@
 
 #include "common.hpp"
 
-void ggml_sycl_op_soft_max(ggml_backend_sycl_context &ctx, const ggml_tensor *src0,
-                           const ggml_tensor *src1, ggml_tensor *dst,
-                           const float *src0_dd, const float *src1_dd,
-                           float *dst_dd,
-                           const queue_ptr &main_stream);
+void ggml_sycl_op_soft_max(ggml_backend_sycl_context &ctx, ggml_tensor *dst);
 
 #endif // GGML_SYCL_SOFTMAX_HPP