@fugood/llama.node 0.3.1 → 0.3.3
This diff shows the content of publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only and reflects the changes between the two versions.
- package/CMakeLists.txt +1 -8
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/package.json +4 -2
- package/src/DetokenizeWorker.cpp +1 -1
- package/src/EmbeddingWorker.cpp +2 -2
- package/src/LlamaCompletionWorker.cpp +10 -10
- package/src/LlamaCompletionWorker.h +2 -2
- package/src/LlamaContext.cpp +14 -17
- package/src/TokenizeWorker.cpp +1 -1
- package/src/common.hpp +5 -4
- package/src/llama.cpp/.github/workflows/build.yml +137 -29
- package/src/llama.cpp/.github/workflows/close-issue.yml +5 -0
- package/src/llama.cpp/.github/workflows/docker.yml +46 -34
- package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +7 -0
- package/src/llama.cpp/.github/workflows/nix-ci.yml +7 -0
- package/src/llama.cpp/.github/workflows/python-check-requirements.yml +2 -4
- package/src/llama.cpp/.github/workflows/python-type-check.yml +3 -1
- package/src/llama.cpp/.github/workflows/server.yml +7 -0
- package/src/llama.cpp/CMakeLists.txt +26 -11
- package/src/llama.cpp/cmake/arm64-apple-clang.cmake +16 -0
- package/src/llama.cpp/common/CMakeLists.txt +10 -10
- package/src/llama.cpp/common/arg.cpp +2041 -0
- package/src/llama.cpp/common/arg.h +77 -0
- package/src/llama.cpp/common/common.cpp +523 -1861
- package/src/llama.cpp/common/common.h +234 -106
- package/src/llama.cpp/common/console.cpp +3 -0
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +1 -1
- package/src/llama.cpp/common/log.cpp +401 -0
- package/src/llama.cpp/common/log.h +66 -698
- package/src/llama.cpp/common/ngram-cache.cpp +39 -36
- package/src/llama.cpp/common/ngram-cache.h +19 -19
- package/src/llama.cpp/common/sampling.cpp +356 -350
- package/src/llama.cpp/common/sampling.h +62 -139
- package/src/llama.cpp/common/stb_image.h +5990 -6398
- package/src/llama.cpp/docs/build.md +72 -17
- package/src/llama.cpp/examples/CMakeLists.txt +1 -2
- package/src/llama.cpp/examples/batched/batched.cpp +49 -65
- package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +42 -53
- package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +55 -52
- package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +22 -22
- package/src/llama.cpp/examples/cvector-generator/pca.hpp +3 -13
- package/src/llama.cpp/examples/embedding/embedding.cpp +147 -91
- package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +37 -37
- package/src/llama.cpp/examples/export-lora/export-lora.cpp +39 -38
- package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +14 -39
- package/src/llama.cpp/examples/{baby-llama → gen-docs}/CMakeLists.txt +2 -2
- package/src/llama.cpp/examples/gen-docs/gen-docs.cpp +83 -0
- package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +58 -39
- package/src/llama.cpp/examples/gritlm/gritlm.cpp +46 -39
- package/src/llama.cpp/examples/imatrix/imatrix.cpp +75 -69
- package/src/llama.cpp/examples/infill/infill.cpp +131 -192
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +276 -178
- package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
- package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +40 -36
- package/src/llama.cpp/examples/llava/CMakeLists.txt +7 -0
- package/src/llama.cpp/examples/llava/clip.cpp +686 -150
- package/src/llama.cpp/examples/llava/clip.h +11 -2
- package/src/llama.cpp/examples/llava/llava-cli.cpp +60 -71
- package/src/llama.cpp/examples/llava/llava.cpp +146 -26
- package/src/llama.cpp/examples/llava/llava.h +2 -3
- package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +323 -0
- package/src/llama.cpp/examples/llava/requirements.txt +1 -0
- package/src/llama.cpp/examples/lookahead/lookahead.cpp +55 -56
- package/src/llama.cpp/examples/lookup/lookup-create.cpp +15 -13
- package/src/llama.cpp/examples/lookup/lookup-merge.cpp +4 -4
- package/src/llama.cpp/examples/lookup/lookup-stats.cpp +34 -33
- package/src/llama.cpp/examples/lookup/lookup.cpp +60 -63
- package/src/llama.cpp/examples/main/main.cpp +216 -313
- package/src/llama.cpp/examples/parallel/parallel.cpp +58 -59
- package/src/llama.cpp/examples/passkey/passkey.cpp +53 -61
- package/src/llama.cpp/examples/perplexity/perplexity.cpp +277 -311
- package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/quantize/quantize.cpp +27 -9
- package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +12 -12
- package/src/llama.cpp/examples/retrieval/retrieval.cpp +57 -52
- package/src/llama.cpp/examples/rpc/rpc-server.cpp +27 -2
- package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +60 -46
- package/src/llama.cpp/examples/server/CMakeLists.txt +7 -18
- package/src/llama.cpp/examples/server/server.cpp +1347 -1531
- package/src/llama.cpp/examples/server/tests/requirements.txt +2 -1
- package/src/llama.cpp/examples/server/utils.hpp +396 -107
- package/src/llama.cpp/examples/simple/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/simple/simple.cpp +132 -106
- package/src/llama.cpp/examples/simple-chat/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +197 -0
- package/src/llama.cpp/examples/speculative/speculative.cpp +153 -124
- package/src/llama.cpp/examples/sycl/run-llama2.sh +10 -19
- package/src/llama.cpp/examples/sycl/win-run-llama2.bat +1 -1
- package/src/llama.cpp/examples/tokenize/tokenize.cpp +27 -29
- package/src/llama.cpp/ggml/CMakeLists.txt +29 -12
- package/src/llama.cpp/ggml/include/ggml-alloc.h +3 -3
- package/src/llama.cpp/ggml/include/ggml-amx.h +25 -0
- package/src/llama.cpp/ggml/include/ggml-backend.h +166 -68
- package/src/llama.cpp/ggml/include/ggml-blas.h +5 -3
- package/src/llama.cpp/ggml/include/ggml-cann.h +17 -19
- package/src/llama.cpp/ggml/include/ggml-cpp.h +38 -0
- package/src/llama.cpp/ggml/include/ggml-cpu.h +177 -0
- package/src/llama.cpp/ggml/include/ggml-cuda.h +17 -17
- package/src/llama.cpp/ggml/include/ggml-kompute.h +7 -3
- package/src/llama.cpp/ggml/include/ggml-metal.h +13 -12
- package/src/llama.cpp/ggml/include/ggml-opt.h +216 -0
- package/src/llama.cpp/ggml/include/ggml-rpc.h +9 -5
- package/src/llama.cpp/ggml/include/ggml-sycl.h +18 -11
- package/src/llama.cpp/ggml/include/ggml-vulkan.h +10 -8
- package/src/llama.cpp/ggml/include/ggml.h +272 -505
- package/src/llama.cpp/ggml/src/CMakeLists.txt +69 -1110
- package/src/llama.cpp/ggml/src/ggml-aarch64.c +52 -2116
- package/src/llama.cpp/ggml/src/ggml-aarch64.h +0 -20
- package/src/llama.cpp/ggml/src/ggml-alloc.c +29 -27
- package/src/llama.cpp/ggml/src/ggml-amx/CMakeLists.txt +107 -0
- package/src/llama.cpp/ggml/src/ggml-amx/common.h +94 -0
- package/src/llama.cpp/ggml/src/ggml-amx/ggml-amx.cpp +446 -0
- package/src/llama.cpp/ggml/src/ggml-amx/mmq.cpp +2510 -0
- package/src/llama.cpp/ggml/src/ggml-amx/mmq.h +17 -0
- package/src/llama.cpp/ggml/src/ggml-backend-impl.h +144 -81
- package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +195 -0
- package/src/llama.cpp/ggml/src/{ggml-backend.c → ggml-backend.cpp} +394 -635
- package/src/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +91 -0
- package/src/llama.cpp/ggml/src/{ggml-blas.cpp → ggml-blas/ggml-blas.cpp} +217 -70
- package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +46 -0
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +4 -27
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +32 -4
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +179 -41
- package/src/llama.cpp/ggml/src/ggml-cann/common.h +1 -0
- package/src/llama.cpp/ggml/src/{ggml-cann.cpp → ggml-cann/ggml-cann.cpp} +458 -353
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +2 -1
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +2 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +278 -0
- package/src/llama.cpp/ggml/src/ggml-common.h +20 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +261 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.c +3560 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +30 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +371 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +10822 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.h +63 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +13970 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +663 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +1885 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +155 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/cuda.h +14 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +178 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +134 -0
- package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +106 -0
- package/src/llama.cpp/ggml/src/ggml-impl.h +380 -584
- package/src/llama.cpp/ggml/src/ggml-kompute/CMakeLists.txt +162 -0
- package/src/llama.cpp/ggml/src/{ggml-kompute.cpp → ggml-kompute/ggml-kompute.cpp} +233 -87
- package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +108 -0
- package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +249 -0
- package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +100 -0
- package/src/llama.cpp/ggml/src/ggml-opt.cpp +867 -0
- package/src/llama.cpp/ggml/src/ggml-quants.c +369 -9994
- package/src/llama.cpp/ggml/src/ggml-quants.h +78 -110
- package/src/llama.cpp/ggml/src/ggml-rpc/CMakeLists.txt +11 -0
- package/src/llama.cpp/ggml/src/{ggml-rpc.cpp → ggml-rpc/ggml-rpc.cpp} +560 -335
- package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +81 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +6 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +51 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +310 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +1 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/conv.cpp +99 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/conv.hpp +21 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +57 -57
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +106 -106
- package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +4 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +18 -25
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +1011 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +76 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +101 -0
- package/src/llama.cpp/ggml/src/{ggml-sycl.cpp → ggml-sycl/ggml-sycl.cpp} +3350 -3980
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +125 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +23 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +70 -68
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +9 -6
- package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +56 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/outprod.hpp +11 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +8 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +71 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.hpp +21 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +4 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +138 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.hpp +10 -0
- package/src/llama.cpp/ggml/src/ggml-threading.cpp +12 -0
- package/src/llama.cpp/ggml/src/ggml-threading.h +12 -0
- package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +78 -0
- package/src/llama.cpp/ggml/src/{ggml-vulkan.cpp → ggml-vulkan/ggml-vulkan.cpp} +2034 -1718
- package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/CMakeLists.txt +2 -0
- package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/vulkan-shaders-gen.cpp +152 -185
- package/src/llama.cpp/ggml/src/ggml.c +2075 -16579
- package/src/llama.cpp/include/llama.h +296 -285
- package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.out +46 -0
- package/src/llama.cpp/pocs/vdot/q8dot.cpp +4 -3
- package/src/llama.cpp/pocs/vdot/vdot.cpp +8 -7
- package/src/llama.cpp/requirements/requirements-convert_legacy_llama.txt +1 -1
- package/src/llama.cpp/src/CMakeLists.txt +2 -1
- package/src/llama.cpp/src/llama-grammar.cpp +721 -122
- package/src/llama.cpp/src/llama-grammar.h +120 -15
- package/src/llama.cpp/src/llama-impl.h +156 -1
- package/src/llama.cpp/src/llama-sampling.cpp +2058 -346
- package/src/llama.cpp/src/llama-sampling.h +39 -47
- package/src/llama.cpp/src/llama-vocab.cpp +390 -127
- package/src/llama.cpp/src/llama-vocab.h +60 -20
- package/src/llama.cpp/src/llama.cpp +6215 -3263
- package/src/llama.cpp/src/unicode-data.cpp +6 -4
- package/src/llama.cpp/src/unicode-data.h +4 -4
- package/src/llama.cpp/src/unicode.cpp +15 -7
- package/src/llama.cpp/tests/CMakeLists.txt +4 -2
- package/src/llama.cpp/tests/test-arg-parser.cpp +131 -0
- package/src/llama.cpp/tests/test-backend-ops.cpp +1725 -297
- package/src/llama.cpp/tests/test-barrier.cpp +94 -0
- package/src/llama.cpp/tests/test-chat-template.cpp +9 -5
- package/src/llama.cpp/tests/test-grammar-integration.cpp +23 -38
- package/src/llama.cpp/tests/test-grammar-parser.cpp +6 -4
- package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +23 -8
- package/src/llama.cpp/tests/test-llama-grammar.cpp +9 -8
- package/src/llama.cpp/tests/test-log.cpp +39 -0
- package/src/llama.cpp/tests/test-opt.cpp +853 -142
- package/src/llama.cpp/tests/test-quantize-fns.cpp +28 -19
- package/src/llama.cpp/tests/test-quantize-perf.cpp +16 -14
- package/src/llama.cpp/tests/test-rope.cpp +2 -1
- package/src/llama.cpp/tests/test-sampling.cpp +226 -142
- package/src/llama.cpp/tests/test-tokenizer-0.cpp +56 -36
- package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +5 -5
- package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +5 -5
- package/patches/llama.patch +0 -22
- package/src/llama.cpp/.github/workflows/bench.yml +0 -310
- package/src/llama.cpp/common/grammar-parser.cpp +0 -536
- package/src/llama.cpp/common/grammar-parser.h +0 -29
- package/src/llama.cpp/common/train.cpp +0 -1513
- package/src/llama.cpp/common/train.h +0 -233
- package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +0 -1640
- package/src/llama.cpp/examples/benchmark/CMakeLists.txt +0 -6
- package/src/llama.cpp/examples/benchmark/benchmark-matmult.cpp +0 -275
- package/src/llama.cpp/ggml/src/llamafile/sgemm.cpp +0 -1027
- package/src/llama.cpp/tests/test-grad0.cpp +0 -1566
- /package/src/llama.cpp/ggml/{cmake → src/ggml-cpu/cmake}/FindSIMD.cmake +0 -0
- /package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.h +0 -0
package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp

@@ -8,7 +8,6 @@ static void norm_f32(const float* x, float* dst, const int ncols, const float ep
 
     const int nthreads = item_ct1.get_local_range(2);
     const int nwarps = nthreads / WARP_SIZE;
-    assert(nwarps % WARP_SIZE == 0);
    sycl::float2 mean_var = sycl::float2(0.f, 0.f);
 
     for (int col = tid; col < ncols; col += block_size) {
@@ -55,7 +54,6 @@ static void group_norm_f32(const float* x, float* dst, const int group_size, con
     int end = start + group_size;
     const int nthreads = item_ct1.get_local_range(2);
     const int nwarps = nthreads / WARP_SIZE;
-    assert(nwarps % WARP_SIZE == 0);
     start += item_ct1.get_local_id(2);
     int nreduce = nwarps / WARP_SIZE;
 
@@ -144,7 +142,6 @@ static void rms_norm_f32(const float* x, float* dst, const int ncols, const floa
     const int tid = item_ct1.get_local_id(2);
     const int nthreads = item_ct1.get_local_range(2);
     const int nwarps = nthreads / WARP_SIZE;
-    assert(nwarps % WARP_SIZE == 0);
     float tmp = 0.0f; // partial sum for thread in warp
 
     for (int col = tid; col < ncols; col += block_size) {
@@ -202,6 +199,7 @@ static void norm_f32_sycl(const float* x, float* dst, const int ncols,
     }
     else {
         const int work_group_size = ggml_sycl_info().max_work_group_sizes[device];
+        assert(work_group_size % (WARP_SIZE * WARP_SIZE) == 0);
         const sycl::range<3> block_dims(1, 1, work_group_size);
         /*
         DPCT1049:17: The work-group size passed to the SYCL kernel may exceed
@@ -225,9 +223,8 @@ static void norm_f32_sycl(const float* x, float* dst, const int ncols,
 }
 
 static void group_norm_f32_sycl(const float* x, float* dst,
-                                const int num_groups, const int group_size,
+                                const int num_groups, const float eps, const int group_size,
                                 const int ne_elements, queue_ptr stream, int device) {
-    static const float eps = 1e-6f;
     if (group_size < 1024) {
         const sycl::range<3> block_dims(1, 1, WARP_SIZE);
         stream->submit([&](sycl::handler& cgh) {
@@ -245,6 +242,7 @@ static void group_norm_f32_sycl(const float* x, float* dst,
     }
     else {
         const int work_group_size = ggml_sycl_info().max_work_group_sizes[device];
+        assert(work_group_size % (WARP_SIZE * WARP_SIZE) == 0);
         const sycl::range<3> block_dims(1, 1, work_group_size);
         /*
         DPCT1049:18: The work-group size passed to the SYCL kernel may exceed
@@ -291,6 +289,7 @@ static void rms_norm_f32_sycl(const float* x, float* dst, const int ncols,
     }
     else {
         const int work_group_size = ggml_sycl_info().max_work_group_sizes[device];
+        assert(work_group_size % (WARP_SIZE * WARP_SIZE) == 0);
         const sycl::range<3> block_dims(1, 1, work_group_size);
         /*
         DPCT1049:19: The work-group size passed to the SYCL kernel may exceed
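
The three removed asserts checked `nwarps % WARP_SIZE == 0` from inside the kernels, where a failure is unrecoverable; the three added asserts move the equivalent check to the launch sites, where `work_group_size` is actually chosen. A minimal sketch of why the two forms agree, assuming `WARP_SIZE` is 32 as in this backend and that the launched group size is a multiple of `WARP_SIZE` (illustrative only, not backend code):

    #include <cassert>

    constexpr int WARP_SIZE = 32;

    // With nwarps = work_group_size / WARP_SIZE, requiring
    // nwarps % WARP_SIZE == 0 is the same as requiring
    // work_group_size % (WARP_SIZE * WARP_SIZE) == 0.
    void launch_check(int work_group_size) {
        assert(work_group_size % WARP_SIZE == 0);  // assumed precondition
        const int nwarps = work_group_size / WARP_SIZE;
        assert((nwarps % WARP_SIZE == 0) ==
               (work_group_size % (WARP_SIZE * WARP_SIZE) == 0));
    }
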
@@ -343,8 +342,12 @@ void ggml_sycl_op_group_norm(ggml_backend_sycl_context& ctx, const ggml_tensor*
     GGML_ASSERT(dst->type == GGML_TYPE_F32);
 
     int num_groups = dst->op_params[0];
+
+    float eps;
+    memcpy(&eps, dst->op_params + 1, sizeof(float));
+
     int group_size = src0->ne[0] * src0->ne[1] * ((src0->ne[2] + num_groups - 1) / num_groups);
-    group_norm_f32_sycl(src0_dd, dst_dd, num_groups, group_size, src0->ne[0] * src0->ne[1] * src0->ne[2], main_stream, ctx.device);
+    group_norm_f32_sycl(src0_dd, dst_dd, num_groups, eps, group_size, src0->ne[0] * src0->ne[1] * src0->ne[2], main_stream, ctx.device);
 
     (void)src1;
     (void)dst;
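
With this hunk, group norm stops hard-coding `eps = 1e-6f` and instead reads the value stored on the graph node: `op_params[0]` holds `num_groups` and the next four bytes hold `eps`, which is why the code unpacks it with `memcpy` rather than a cast. A minimal sketch of the packing convention, assuming ggml's usual `int32_t op_params[]` slot layout (the `node` struct is a hypothetical stand-in for a ggml tensor):

    #include <cstdint>
    #include <cstring>

    struct node { int32_t op_params[16]; };  // hypothetical stand-in

    // Pack the parameters the way the hunk above unpacks them.
    void set_group_norm_params(node &n, int32_t num_groups, float eps) {
        n.op_params[0] = num_groups;
        std::memcpy(&n.op_params[1], &eps, sizeof(float));  // bit-copy the float
    }

    float get_group_norm_eps(const node &n) {
        float eps;
        std::memcpy(&eps, &n.op_params[1], sizeof(float));
        return eps;
    }
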
package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp

@@ -0,0 +1,56 @@
+#include <sycl/sycl.hpp>
+#include <oneapi/mkl.hpp>
+#include "outprod.hpp"
+
+
+void ggml_sycl_op_out_prod(ggml_backend_sycl_context& ctx, const ggml_tensor* src0,
+                           const ggml_tensor* src1, ggml_tensor* dst) {
+
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->type == GGML_TYPE_F32);
+    GGML_ASSERT(ggml_is_contiguous(src0));
+    GGML_ASSERT(ggml_is_contiguous(dst));
+
+    GGML_TENSOR_BINARY_OP_LOCALS
+
+    // Get SYCL queue
+    dpct::queue_ptr stream = ctx.stream();
+
+    // Dimension checks
+    GGML_ASSERT(ne01 == ne11); // Inner dimensions must match
+    GGML_ASSERT(ne0 == ne00); // Output rows match src0 rows
+    GGML_ASSERT(ne1 == ne10); // Output cols match src1 cols
+
+    // Get data pointers
+    const float* src0_d = (const float*)src0->data;
+    const float* src1_d = (const float*)src1->data;
+    float* dst_d = (float*)dst->data;
+
+    // GEMM parameters
+    const float alpha = 1.0f;
+    const float beta = 0.0f;
+
+    // Handle transposition of src1
+    const bool src1_T = ggml_is_transposed(src1);
+    const oneapi::mkl::transpose src1_op =
+        src1_T ? oneapi::mkl::transpose::nontrans : oneapi::mkl::transpose::trans;
+    const int64_t ldb = (src1_T ? nb10 : nb11) / sizeof(float);
+
+    try {
+        // Perform matrix multiplication using oneMKL GEMM
+        oneapi::mkl::blas::column_major::gemm(*stream,
+            oneapi::mkl::transpose::nontrans, src1_op,
+            ne0, ne1, ne01,
+            alpha,
+            src0_d, ne00,
+            src1_d, ldb,
+            beta,
+            dst_d, ne0);
+    }
+    catch (sycl::exception const& exc) {
+        std::cerr << exc.what() << std::endl;
+        GGML_ASSERT(false);
+    }
+}
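
The `gemm` call above implements GGML's out-prod semantics: with ggml's dim-0-fastest layout, `dst(i, j) = sum_k src0(i, k) * src1(j, k)`, i.e. `dst = src0 * src1^T` (hence the `trans` flag on `src1` in the common non-transposed case). A plain CPU reference of the same computation for contiguous 2-D f32 tensors, illustrative only:

    #include <cstdint>

    // dst is ne00 x ne10, src0 is ne00 x ne01, src1 is ne10 x ne11 with
    // ne11 == ne01; dim 0 is the fastest-varying index, as in ggml.
    void out_prod_ref(const float *src0, const float *src1, float *dst,
                      int64_t ne00, int64_t ne01, int64_t ne10) {
        for (int64_t j = 0; j < ne10; ++j) {
            for (int64_t i = 0; i < ne00; ++i) {
                float acc = 0.0f;
                for (int64_t k = 0; k < ne01; ++k) {
                    acc += src0[k * ne00 + i] * src1[k * ne10 + j];
                }
                dst[j * ne00 + i] = acc;
            }
        }
    }
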
package/src/llama.cpp/ggml/src/ggml-sycl/outprod.hpp

@@ -0,0 +1,11 @@
+#ifndef GGML_SYCL_OUTPROD_HPP
+#define GGML_SYCL_OUTPROD_HPP
+
+#include "common.hpp"
+
+void ggml_sycl_op_out_prod(ggml_backend_sycl_context& ctx, const ggml_tensor* src0,
+                           const ggml_tensor* src1, ggml_tensor* dst);
+
+
+#endif // GGML_SYCL_OUTPROD_HPP
+
package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp

@@ -25,6 +25,11 @@
 #define SYCL_RELU_BLOCK_SIZE 256
 #define SYCL_HARDSIGMOID_BLOCK_SIZE 256
 #define SYCL_HARDSWISH_BLOCK_SIZE 256
+#define SYCL_EXP_BLOCK_SIZE 256
+#define SYCL_NEG_BLOCK_SIZE 256
+#define SYCL_SIGMOID_BLOCK_SIZE 256
+#define SYCL_SQRT_BLOCK_SIZE 256
+#define SYCL_SIN_BLOCK_SIZE 256
 #define SYCL_SQR_BLOCK_SIZE 256
 #define SYCL_CPY_BLOCK_SIZE 32
 #define SYCL_SCALE_BLOCK_SIZE 256
@@ -41,6 +46,9 @@
 #define SYCL_ACC_BLOCK_SIZE 256
 #define SYCL_IM2COL_BLOCK_SIZE 256
 #define SYCL_POOL2D_BLOCK_SIZE 256
+#define SYCL_ARGMAX_BLOCK_SIZE 256
+#define SYCL_CONV_TRANPOSE_1D_BLOCK_SIZE 256
+#define SYCL_TIMESTEP_EMBEDDING_BLOCK_SIZE 256
 
 // dmmv = dequantize_mul_mat_vec
 #ifndef GGML_SYCL_DMMV_X
package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp

@@ -226,7 +226,7 @@ void ggml_sycl_op_rope(
     memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float));
     memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float));
 
-    const bool is_neox = mode & 2;
+    const bool is_neox = mode & GGML_ROPE_TYPE_NEOX;
 
     const int32_t * pos = (const int32_t *) src1_dd;
 
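
The replaced line swaps a magic constant for the named flag; `GGML_ROPE_TYPE_NEOX` is defined as 2 in ggml.h, so the test selects the same mode bit as before. A one-function sketch, with the define inlined here only for illustration:

    #include <cstdint>

    #define GGML_ROPE_TYPE_NEOX 2  // value as defined in ggml.h

    static bool rope_is_neox(int32_t mode) {
        return (mode & GGML_ROPE_TYPE_NEOX) != 0;  // formerly: mode & 2
    }
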
package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp

@@ -0,0 +1,71 @@
+//
+// MIT license
+// Copyright (C) 2024 Intel Corporation
+// SPDX-License-Identifier: MIT
+//
+
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+
+#include "tsembd.hpp"
+
+static void timestep_embedding_f32(
+        const float * timesteps, float * dst, const int nb1,
+        const int dim, const int max_period, const sycl::nd_item<3> &item_ct1) {
+    // item_ct1.get_group(1)(blockIDx.y): idx of timesteps->ne[0]
+    // item_ct1.get_group(2) (blockIDx.x): idx of ((dim + 1) / 2) / BLOCK_SIZE
+    int i = item_ct1.get_group(1);
+    int j = item_ct1.get_local_id(2) + item_ct1.get_group(2) * item_ct1.get_local_range(2);
+    float * embed_data = (float *)((char *)dst + i*nb1);
+
+    if (dim % 2 != 0 && j == ((dim + 1) / 2)) {
+        embed_data[dim] = 0.f;
+    }
+
+    int half = dim / 2;
+    if (j >= half) {
+        return;
+    }
+
+    float timestep = timesteps[i];
+    float freq = (float)sycl::native::exp(-(sycl::log((float)max_period)) * j / half);
+    float arg = timestep * freq;
+    embed_data[j] = sycl::cos(arg);
+    embed_data[j + half] = sycl::sin(arg);
+}
+
+static void timestep_embedding_f32_sycl(
+        const float * x, float * dst, const int ne00, const int nb1,
+        const int dim, const int max_period, const queue_ptr& stream) {
+    // As the kernel returns when thread.idx is larger than dim/2, the half_ceil does not need to pad
+    int half_ceil = dim / 2;
+    int num_blocks = (half_ceil + SYCL_TIMESTEP_EMBEDDING_BLOCK_SIZE - 1) / SYCL_TIMESTEP_EMBEDDING_BLOCK_SIZE;
+    sycl::range<3> block_dims(1, 1, SYCL_TIMESTEP_EMBEDDING_BLOCK_SIZE);
+    sycl::range<3> gridDim(1, ne00, num_blocks);
+    stream->parallel_for(
+        sycl::nd_range<3>(
+            gridDim * block_dims, block_dims),
+        [=](sycl::nd_item<3> item_ct1) {
+            timestep_embedding_f32(
+                x, dst, nb1, dim, max_period, item_ct1
+            );
+        });
+}
+
+void ggml_sycl_op_timestep_embedding(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
+                                     const ggml_tensor *src1, ggml_tensor * dst) {
+    const float * src0_d = (const float *)src0->data;
+    float * dst_d = (float *)dst->data;
+    dpct::queue_ptr stream = ctx.stream();
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->type == GGML_TYPE_F32);
+
+    const int dim = dst->op_params[0];
+    const int max_period = dst->op_params[1];
+
+    timestep_embedding_f32_sycl(src0_d, dst_d, src0->ne[0], dst->nb[1], dim, max_period, stream);
+}
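
The kernel computes the standard sinusoidal timestep embedding: for each `j < dim/2`, `freq = exp(-ln(max_period) * j / (dim/2))`, then `out[j] = cos(t*freq)` and `out[j + dim/2] = sin(t*freq)`, with one extra zero slot when `dim` is odd. A scalar CPU reference of the same math, illustrative only:

    #include <cmath>

    // One timestep; `embed` must hold dim floats (dim + 1 when dim is odd,
    // matching the kernel's zero-padded slot).
    void timestep_embedding_ref(float timestep, float *embed, int dim, int max_period) {
        const int half = dim / 2;
        for (int j = 0; j < half; ++j) {
            const float freq = std::exp(-std::log((float)max_period) * j / half);
            const float arg  = timestep * freq;
            embed[j]        = std::cos(arg);
            embed[j + half] = std::sin(arg);
        }
        if (dim % 2 != 0) {
            embed[dim] = 0.0f;
        }
    }
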
package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.hpp

@@ -0,0 +1,21 @@
+//
+// MIT license
+// Copyright (C) 2024 Intel Corporation
+// SPDX-License-Identifier: MIT
+//
+
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+
+#ifndef GGML_SYCL_TSEMBD_HPP
+#define GGML_SYCL_TSEMBD_HPP
+
+#include "common.hpp"
+
+void ggml_sycl_op_timestep_embedding(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
+                                     const ggml_tensor *src1, ggml_tensor * dst);
+
+#endif // GGML_SYCL_TSEMBD_HPP
package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp

@@ -968,8 +968,8 @@ vec_dot_iq3_xxs_q8_1(const void *__restrict__ vbq,
             grid1[0] ^ signs[0], signs[0], std::minus<>());
         const int grid_h = dpct::vectorized_binary<sycl::uchar4>(
             grid2[0] ^ signs[1], signs[1], std::minus<>());
-        sumi = dpct::dp4a(grid_l, *((int *)q8 + 0), sumi);
-        sumi = dpct::dp4a(grid_h, *((int *)q8 + 1), sumi);
+        sumi = dpct::dp4a(grid_l, *((const int *)q8 + 0), sumi);
+        sumi = dpct::dp4a(grid_h, *((const int *)q8 + 1), sumi);
         q8 += 8;
         aux32 >>= 7;
     }
@@ -1009,8 +1009,8 @@ vec_dot_iq3_s_q8_1(const void *__restrict__ vbq,
             grid1[0] ^ signs0, signs0, std::minus<>());
         const int grid_h = dpct::vectorized_binary<sycl::uchar4>(
             grid2[0] ^ signs1, signs1, std::minus<>());
-        sumi = dpct::dp4a(grid_l, *((int *)q8 + 0), sumi);
-        sumi = dpct::dp4a(grid_h, *((int *)q8 + 1), sumi);
+        sumi = dpct::dp4a(grid_l, *((const int *)q8 + 0), sumi);
+        sumi = dpct::dp4a(grid_h, *((const int *)q8 + 1), sumi);
         q8 += 8;
     }
     const float d =
package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp

@@ -0,0 +1,138 @@
+#include <sycl/sycl.hpp>
+#include "wkv6.hpp"
+
+constexpr int WKV_BLOCK_SIZE = 64; // Matching CUDA_WKV_BLOCK_SIZE
+
+// Helper function for the main kernel
+static void rwkv_wkv_f32_kernel(
+        const int B, const int T, const int C, const int H,
+        const float* k, const float* v, const float* r,
+        const float* tf, const float* td, const float* s,
+        float* dst, const sycl::nd_item<3>& item_ct1, float* shared_mem) {
+
+    const int tid = item_ct1.get_local_id(2);
+    const int bid = item_ct1.get_group(2);
+
+    const int head_size = WKV_BLOCK_SIZE;
+    const int batch_i = bid / H;
+    const int head_i = bid % H;
+    const int state_size = C * head_size;
+    const int n_seq_tokens = T / B;
+
+    // Set up shared memory pointers
+    float* _k = shared_mem;
+    float* _r = _k + head_size;
+    float* _tf = _r + head_size;
+    float* _td = _tf + head_size;
+
+    // Local state array
+    float state[WKV_BLOCK_SIZE];
+
+    // Load initial state
+    #pragma unroll
+    for (int i = 0; i < head_size; i++) {
+        state[i] = s[batch_i * state_size + head_i * head_size * head_size + i * head_size + tid];
+    }
+
+    // Sync threads before shared memory operations
+    item_ct1.barrier(sycl::access::fence_space::local_space);
+
+    // Load time-mixing parameters
+    _tf[tid] = tf[head_i * head_size + tid];
+    item_ct1.barrier(sycl::access::fence_space::local_space);
+
+    // Main sequence processing loop
+    for (int t = batch_i * n_seq_tokens * C + head_i * head_size + tid;
+         t < (batch_i + 1) * n_seq_tokens * C + head_i * head_size + tid;
+         t += C) {
+
+        item_ct1.barrier(sycl::access::fence_space::local_space);
+
+        // Load current timestep data to shared memory
+        _k[tid] = k[t];
+        _r[tid] = r[t];
+        _td[tid] = td[t];
+
+        item_ct1.barrier(sycl::access::fence_space::local_space);
+
+        const float _v = v[t];
+        float y = 0;
+
+        // Process in chunks of 4 for better vectorization
+        sycl::float4 k4, r4, tf4, td4, s4, kv4;
+        #pragma unroll
+        for (int j = 0; j < head_size; j += 4) {
+            // Load data in vec4 chunks
+            k4 = sycl::float4(_k[j], _k[j+1], _k[j+2], _k[j+3]);
+            r4 = sycl::float4(_r[j], _r[j+1], _r[j+2], _r[j+3]);
+            tf4 = sycl::float4(_tf[j], _tf[j+1], _tf[j+2], _tf[j+3]);
+            td4 = sycl::float4(_td[j], _td[j+1], _td[j+2], _td[j+3]);
+            s4 = sycl::float4(state[j], state[j+1], state[j+2], state[j+3]);
+
+            // Compute key-value product
+            sycl::float4 kv4 = k4 * _v;
+
+            // Accumulate weighted sum
+            y += sycl::dot(r4, tf4 * kv4 + s4);
+
+            // Update state
+            s4 = s4 * td4 + kv4;
+
+            // Store updated state
+            state[j] = s4.x();
+            state[j+1] = s4.y();
+            state[j+2] = s4.z();
+            state[j+3] = s4.w();
+        }
+
+        dst[t] = y;
+    }
+
+    // Save final state
+    #pragma unroll
+    for (int i = 0; i < head_size; i++) {
+        dst[T * C + batch_i * state_size + head_i * head_size * head_size + i * head_size + tid] = state[i];
+    }
+}
+
+void ggml_sycl_op_rwkv_wkv6(ggml_backend_sycl_context& ctx, const ggml_tensor* src0,
+                            const ggml_tensor* src1, ggml_tensor* dst) {
+
+    const float* k_d = (const float*)dst->src[0]->data;
+    const float* v_d = (const float*)dst->src[1]->data;
+    const float* r_d = (const float*)dst->src[2]->data;
+    const float* tf_d = (const float*)dst->src[3]->data;
+    const float* td_d = (const float*)dst->src[4]->data;
+    const float* s_d = (const float*)dst->src[5]->data;
+    float* dst_d = (float*)dst->data;
+
+    const int64_t B = dst->src[5]->ne[1];
+    const int64_t T = dst->src[0]->ne[3];
+    const int64_t C = dst->ne[0];
+    const int64_t H = dst->src[0]->ne[2];
+
+    GGML_ASSERT(dst->src[5]->type == GGML_TYPE_F32);
+    GGML_ASSERT(C % H == 0);
+    GGML_ASSERT(C / H == WKV_BLOCK_SIZE); // The current sycl kernel is designed for RWKV6, HEAD_SIZE == 64
+
+    dpct::queue_ptr stream = ctx.stream();
+
+    // Calculate execution configuration
+    const size_t shared_mem_size = WKV_BLOCK_SIZE * 4 * sizeof(float); // For k, r, tf, td
+    sycl::range<3> block_dims(1, 1, C / H);
+    sycl::range<3> grid_dims(1, 1, B * H);
+
+    // Submit kernel
+    stream->submit([&](sycl::handler& cgh) {
+        sycl::local_accessor<float, 1> shared_mem_acc(shared_mem_size, cgh);
+
+        cgh.parallel_for(
+            sycl::nd_range<3>(grid_dims * block_dims, block_dims),
+            [=](sycl::nd_item<3> item_ct1) {
+                rwkv_wkv_f32_kernel(
+                    B, T, C, H, k_d, v_d, r_d, tf_d, td_d, s_d, dst_d,
+                    item_ct1, shared_mem_acc.get_pointer()
+                );
+            });
+    });
+}
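
Stripped of the float4 packing and shared-memory staging, the per-channel recurrence the kernel evaluates is `y += r[j] * (tf[j]*k[j]*v + state[j])` followed by `state[j] = state[j]*td[j] + k[j]*v`. A scalar sketch of one timestep for one output channel, illustrative only (names mirror the kernel; `head_size` is 64 above):

    // One WKV6 timestep for a single output channel: `state` carries the
    // decayed key-value history across calls.
    void wkv6_step_ref(int head_size,
                       const float *k, float v, const float *r,
                       const float *tf, const float *td,
                       float *state, float *y_out) {
        float y = 0.0f;
        for (int j = 0; j < head_size; ++j) {
            const float kv = k[j] * v;            // key-value product
            y += r[j] * (tf[j] * kv + state[j]);  // receptance-weighted sum
            state[j] = state[j] * td[j] + kv;     // time-decayed state update
        }
        *y_out = y;
    }
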
package/src/llama.cpp/ggml/src/ggml-threading.cpp

@@ -0,0 +1,12 @@
+#include "ggml-threading.h"
+#include <mutex>
+
+std::mutex ggml_critical_section_mutex;
+
+void ggml_critical_section_start() {
+    ggml_critical_section_mutex.lock();
+}
+
+void ggml_critical_section_end(void) {
+    ggml_critical_section_mutex.unlock();
+}
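
These two functions back ggml's critical-section API with a plain `std::mutex`; callers bracket one-time initialization of shared state with them. A hedged usage sketch; the guarded setup below is hypothetical, not a call site from this diff:

    #include "ggml-threading.h"

    static bool initialized = false;  // hypothetical shared state

    void ensure_initialized(void) {
        ggml_critical_section_start();
        if (!initialized) {
            // ... one-time global setup ...
            initialized = true;
        }
        ggml_critical_section_end();
    }
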
package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt

@@ -0,0 +1,78 @@
+find_package(Vulkan COMPONENTS glslc REQUIRED)
+
+if (Vulkan_FOUND)
+    message(STATUS "Vulkan found")
+
+    add_library(ggml-vulkan
+                ggml-vulkan.cpp
+                ../../include/ggml-vulkan.h
+               )
+
+    target_link_libraries(ggml-vulkan PRIVATE ggml-base Vulkan::Vulkan)
+    target_include_directories(ggml-vulkan PRIVATE . .. ${CMAKE_CURRENT_BINARY_DIR})
+
+    # Workaround to the "can't dereference invalidated vector iterator" bug in clang-cl debug build
+    # Posssibly relevant: https://stackoverflow.com/questions/74748276/visual-studio-no-displays-the-correct-length-of-stdvector
+    if (MSVC AND CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
+        add_compile_definitions(_ITERATOR_DEBUG_LEVEL=0)
+    endif()
+
+    if (GGML_VULKAN_CHECK_RESULTS)
+        add_compile_definitions(GGML_VULKAN_CHECK_RESULTS)
+    endif()
+
+    if (GGML_VULKAN_DEBUG)
+        add_compile_definitions(GGML_VULKAN_DEBUG)
+    endif()
+
+    if (GGML_VULKAN_MEMORY_DEBUG)
+        add_compile_definitions(GGML_VULKAN_MEMORY_DEBUG)
+    endif()
+
+    if (GGML_VULKAN_SHADER_DEBUG_INFO)
+        add_compile_definitions(GGML_VULKAN_SHADER_DEBUG_INFO)
+    endif()
+
+    if (GGML_VULKAN_PERF)
+        add_compile_definitions(GGML_VULKAN_PERF)
+    endif()
+
+    if (GGML_VULKAN_VALIDATE)
+        add_compile_definitions(GGML_VULKAN_VALIDATE)
+    endif()
+
+    if (GGML_VULKAN_RUN_TESTS)
+        add_compile_definitions(GGML_VULKAN_RUN_TESTS)
+    endif()
+
+    add_subdirectory(vulkan-shaders)
+
+    set (_ggml_vk_genshaders_cmd vulkan-shaders-gen)
+    set (_ggml_vk_header     ${CMAKE_CURRENT_BINARY_DIR}/ggml-vulkan-shaders.hpp)
+    set (_ggml_vk_source     ${CMAKE_CURRENT_BINARY_DIR}/ggml-vulkan-shaders.cpp)
+    set (_ggml_vk_input_dir  ${CMAKE_CURRENT_SOURCE_DIR}/vulkan-shaders)
+    set (_ggml_vk_output_dir ${CMAKE_CURRENT_BINARY_DIR}/vulkan-shaders.spv)
+
+    file(GLOB _ggml_vk_shader_deps "${_ggml_vk_input_dir}/*.comp")
+
+    add_custom_command(
+        OUTPUT ${_ggml_vk_header}
+               ${_ggml_vk_source}
+
+        COMMAND ${_ggml_vk_genshaders_cmd}
+            --glslc      ${Vulkan_GLSLC_EXECUTABLE}
+            --input-dir  ${_ggml_vk_input_dir}
+            --output-dir ${_ggml_vk_output_dir}
+            --target-hpp ${_ggml_vk_header}
+            --target-cpp ${_ggml_vk_source}
+            --no-clean
+
+        DEPENDS ${_ggml_vk_shader_deps}
+        COMMENT "Generate vulkan shaders"
+    )
+
+    target_sources(ggml-vulkan PRIVATE ${_ggml_vk_source} ${_ggml_vk_header})
+
+else()
+    message(WARNING "Vulkan not found")
+endif()