npm - @fugood/llama.node - Versions diffs - 0.3.3 → 0.3.4 - Mend

@fugood/llama.node 0.3.3 → 0.3.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (225) hide show

package/CMakeLists.txt +5 -0
package/bin/darwin/arm64/llama-node.node +0 -0
package/bin/darwin/x64/llama-node.node +0 -0
package/bin/linux/arm64/llama-node.node +0 -0
package/bin/linux/x64/llama-node.node +0 -0
package/bin/linux-vulkan/arm64/llama-node.node +0 -0
package/bin/linux-vulkan/x64/llama-node.node +0 -0
package/bin/win32/arm64/llama-node.node +0 -0
package/bin/win32/arm64/node.lib +0 -0
package/bin/win32/x64/llama-node.node +0 -0
package/bin/win32/x64/node.lib +0 -0
package/bin/win32-vulkan/arm64/llama-node.node +0 -0
package/bin/win32-vulkan/arm64/node.lib +0 -0
package/bin/win32-vulkan/x64/llama-node.node +0 -0
package/bin/win32-vulkan/x64/node.lib +0 -0
package/lib/binding.ts +18 -1
package/package.json +1 -1
package/src/EmbeddingWorker.cpp +15 -5
package/src/EmbeddingWorker.h +2 -1
package/src/LlamaCompletionWorker.cpp +1 -1
package/src/LlamaContext.cpp +81 -18
package/src/LlamaContext.h +2 -0
package/src/llama.cpp/.github/workflows/build.yml +197 -159
package/src/llama.cpp/.github/workflows/docker.yml +5 -8
package/src/llama.cpp/.github/workflows/python-lint.yml +8 -1
package/src/llama.cpp/.github/workflows/server.yml +21 -14
package/src/llama.cpp/CMakeLists.txt +11 -6
package/src/llama.cpp/Sources/llama/llama.h +4 -0
package/src/llama.cpp/cmake/common.cmake +33 -0
package/src/llama.cpp/cmake/x64-windows-llvm.cmake +11 -0
package/src/llama.cpp/common/CMakeLists.txt +6 -2
package/src/llama.cpp/common/arg.cpp +426 -245
package/src/llama.cpp/common/common.cpp +143 -80
package/src/llama.cpp/common/common.h +81 -24
package/src/llama.cpp/common/sampling.cpp +53 -19
package/src/llama.cpp/common/sampling.h +22 -1
package/src/llama.cpp/common/speculative.cpp +274 -0
package/src/llama.cpp/common/speculative.h +28 -0
package/src/llama.cpp/docs/build.md +101 -148
package/src/llama.cpp/examples/CMakeLists.txt +32 -13
package/src/llama.cpp/examples/batched/CMakeLists.txt +1 -1
package/src/llama.cpp/examples/batched/batched.cpp +5 -4
package/src/llama.cpp/examples/batched-bench/CMakeLists.txt +1 -1
package/src/llama.cpp/examples/convert-llama2c-to-ggml/CMakeLists.txt +1 -1
package/src/llama.cpp/examples/cvector-generator/CMakeLists.txt +1 -1
package/src/llama.cpp/examples/deprecation-warning/deprecation-warning.cpp +1 -1
package/src/llama.cpp/examples/embedding/CMakeLists.txt +1 -1
package/src/llama.cpp/examples/eval-callback/CMakeLists.txt +3 -2
package/src/llama.cpp/examples/export-lora/CMakeLists.txt +1 -1
package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +1 -1
package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +4 -7
package/src/llama.cpp/examples/gen-docs/CMakeLists.txt +1 -1
package/src/llama.cpp/examples/gguf/CMakeLists.txt +1 -1
package/src/llama.cpp/examples/gguf-hash/CMakeLists.txt +8 -1
package/src/llama.cpp/examples/gguf-split/CMakeLists.txt +1 -1
package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +2 -2
package/src/llama.cpp/examples/gritlm/CMakeLists.txt +1 -1
package/src/llama.cpp/examples/gritlm/gritlm.cpp +1 -1
package/src/llama.cpp/examples/imatrix/CMakeLists.txt +1 -1
package/src/llama.cpp/examples/imatrix/imatrix.cpp +11 -2
package/src/llama.cpp/examples/infill/CMakeLists.txt +1 -1
package/src/llama.cpp/examples/infill/infill.cpp +1 -1
package/src/llama.cpp/examples/llama-bench/CMakeLists.txt +1 -1
package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +405 -316
package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
package/src/llama.cpp/examples/llava/CMakeLists.txt +10 -3
package/src/llama.cpp/examples/llava/clip.cpp +262 -66
package/src/llama.cpp/examples/llava/clip.h +8 -2
package/src/llama.cpp/examples/llava/llava-cli.cpp +1 -1
package/src/llama.cpp/examples/llava/llava.cpp +46 -19
package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +1 -1
package/src/llama.cpp/examples/llava/qwen2vl-cli.cpp +581 -0
package/src/llama.cpp/examples/lookahead/CMakeLists.txt +1 -1
package/src/llama.cpp/examples/lookahead/lookahead.cpp +1 -1
package/src/llama.cpp/examples/lookup/CMakeLists.txt +4 -4
package/src/llama.cpp/examples/lookup/lookup-stats.cpp +2 -1
package/src/llama.cpp/examples/lookup/lookup.cpp +2 -2
package/src/llama.cpp/examples/main/CMakeLists.txt +1 -1
package/src/llama.cpp/examples/main/main.cpp +9 -5
package/src/llama.cpp/examples/main-cmake-pkg/CMakeLists.txt +1 -1
package/src/llama.cpp/examples/parallel/CMakeLists.txt +1 -1
package/src/llama.cpp/examples/parallel/parallel.cpp +1 -1
package/src/llama.cpp/examples/passkey/CMakeLists.txt +1 -1
package/src/llama.cpp/examples/perplexity/CMakeLists.txt +1 -1
package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
package/src/llama.cpp/examples/quantize/quantize.cpp +0 -3
package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +1 -1
package/src/llama.cpp/examples/retrieval/CMakeLists.txt +1 -1
package/src/llama.cpp/examples/retrieval/retrieval.cpp +4 -4
package/src/llama.cpp/examples/run/CMakeLists.txt +5 -0
package/src/llama.cpp/examples/run/run.cpp +911 -0
package/src/llama.cpp/examples/save-load-state/CMakeLists.txt +1 -1
package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +4 -4
package/src/llama.cpp/examples/server/CMakeLists.txt +3 -7
package/src/llama.cpp/examples/server/server.cpp +1758 -886
package/src/llama.cpp/examples/server/tests/requirements.txt +2 -2
package/src/llama.cpp/examples/server/utils.hpp +94 -304
package/src/llama.cpp/examples/simple/CMakeLists.txt +1 -1
package/src/llama.cpp/examples/simple/simple.cpp +4 -0
package/src/llama.cpp/examples/simple-chat/CMakeLists.txt +1 -1
package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +3 -0
package/src/llama.cpp/examples/speculative/CMakeLists.txt +1 -1
package/src/llama.cpp/examples/speculative/speculative.cpp +16 -15
package/src/llama.cpp/examples/speculative-simple/CMakeLists.txt +5 -0
package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +265 -0
package/src/llama.cpp/examples/tokenize/CMakeLists.txt +1 -1
package/src/llama.cpp/examples/tokenize/tokenize.cpp +1 -1
package/src/llama.cpp/examples/tts/CMakeLists.txt +5 -0
package/src/llama.cpp/examples/tts/tts.cpp +932 -0
package/src/llama.cpp/ggml/CMakeLists.txt +46 -34
package/src/llama.cpp/ggml/include/ggml-backend.h +16 -0
package/src/llama.cpp/ggml/include/ggml-cpu.h +7 -49
package/src/llama.cpp/ggml/include/ggml-opencl.h +26 -0
package/src/llama.cpp/ggml/include/ggml.h +106 -24
package/src/llama.cpp/ggml/src/CMakeLists.txt +73 -24
package/src/llama.cpp/ggml/src/ggml-alloc.c +0 -1
package/src/llama.cpp/ggml/src/ggml-backend-impl.h +51 -11
package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +379 -22
package/src/llama.cpp/ggml/src/ggml-backend.cpp +4 -4
package/src/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +3 -7
package/src/llama.cpp/ggml/src/ggml-blas/ggml-blas.cpp +5 -2
package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +33 -3
package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +456 -111
package/src/llama.cpp/ggml/src/ggml-cann/common.h +6 -3
package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +95 -35
package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +2 -5
package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +22 -9
package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +24 -13
package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +23 -13
package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +11 -0
package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +10 -0
package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +10 -0
package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +17 -0
package/src/llama.cpp/ggml/src/ggml-common.h +42 -42
package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +288 -213
package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +220 -0
package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.h +8 -0
package/src/llama.cpp/ggml/src/{ggml-amx → ggml-cpu/amx}/common.h +19 -22
package/src/llama.cpp/ggml/src/{ggml-amx → ggml-cpu/amx}/mmq.cpp +93 -92
package/src/llama.cpp/ggml/src/{ggml-amx → ggml-cpu/amx}/mmq.h +2 -9
package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +323 -0
package/src/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-aarch64.c → ggml-cpu-aarch64.cpp} +892 -190
package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +2 -24
package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-hbm.cpp +55 -0
package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-hbm.h +8 -0
package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +15 -0
package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +38 -25
package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-traits.cpp +36 -0
package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-traits.h +38 -0
package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +552 -399
package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +101 -136
package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +2 -2
package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +7 -10
package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +8 -0
package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +4 -6
package/src/llama.cpp/ggml/src/ggml-impl.h +32 -11
package/src/llama.cpp/ggml/src/ggml-kompute/CMakeLists.txt +13 -9
package/src/llama.cpp/ggml/src/ggml-kompute/ggml-kompute.cpp +131 -64
package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +3 -6
package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +39 -0
package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +14 -7
package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +147 -0
package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +4004 -0
package/src/llama.cpp/ggml/src/ggml-opt.cpp +67 -80
package/src/llama.cpp/ggml/src/ggml-quants.c +0 -9
package/src/llama.cpp/ggml/src/ggml-rpc/CMakeLists.txt +3 -5
package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +5 -2
package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +13 -10
package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +2 -11
package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +1 -0
package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +2 -2
package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +1 -1
package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +5 -5
package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +32 -13
package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +80 -61
package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +4 -4
package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +159 -114
package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +3 -2
package/src/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +6 -6
package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +6 -20
package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +4 -3
package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +8 -8
package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +4 -3
package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +7 -7
package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +1 -0
package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +4 -1
package/src/llama.cpp/ggml/src/ggml-threading.h +4 -2
package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +21 -7
package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +1718 -399
package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +3 -1
package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +105 -31
package/src/llama.cpp/ggml/src/ggml.c +367 -207
package/src/llama.cpp/include/llama-cpp.h +25 -0
package/src/llama.cpp/include/llama.h +26 -19
package/src/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.inp +112 -0
package/src/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.out +46 -0
package/src/llama.cpp/pocs/CMakeLists.txt +3 -1
package/src/llama.cpp/pocs/vdot/CMakeLists.txt +2 -2
package/src/llama.cpp/src/CMakeLists.txt +2 -7
package/src/llama.cpp/src/llama-grammar.cpp +15 -15
package/src/llama.cpp/src/llama-grammar.h +2 -5
package/src/llama.cpp/src/llama-sampling.cpp +35 -90
package/src/llama.cpp/src/llama-vocab.cpp +6 -1
package/src/llama.cpp/src/llama.cpp +1748 -640
package/src/llama.cpp/src/unicode.cpp +62 -51
package/src/llama.cpp/src/unicode.h +9 -10
package/src/llama.cpp/tests/CMakeLists.txt +48 -37
package/src/llama.cpp/tests/test-arg-parser.cpp +2 -2
package/src/llama.cpp/tests/test-backend-ops.cpp +140 -21
package/src/llama.cpp/tests/test-chat-template.cpp +50 -4
package/src/llama.cpp/tests/test-gguf.cpp +1303 -0
package/src/llama.cpp/tests/test-grammar-integration.cpp +3 -6
package/src/llama.cpp/tests/test-llama-grammar.cpp +2 -4
package/src/llama.cpp/tests/test-quantize-fns.cpp +3 -3
package/src/llama.cpp/tests/test-rope.cpp +61 -20
package/src/llama.cpp/tests/test-sampling.cpp +2 -2
package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +0 -72
package/src/llama.cpp/.github/workflows/nix-ci.yml +0 -79
package/src/llama.cpp/.github/workflows/nix-flake-update.yml +0 -22
package/src/llama.cpp/.github/workflows/nix-publish-flake.yml +0 -36
package/src/llama.cpp/ggml/include/ggml-amx.h +0 -25
package/src/llama.cpp/ggml/src/ggml-aarch64.c +0 -129
package/src/llama.cpp/ggml/src/ggml-aarch64.h +0 -19
package/src/llama.cpp/ggml/src/ggml-amx/CMakeLists.txt +0 -107
package/src/llama.cpp/ggml/src/ggml-amx/ggml-amx.cpp +0 -446

package/src/llama.cpp/ggml/src/ggml.c CHANGED Viewed

@@ -8,7 +8,10 @@
 // FIXME: required here for quantization functions
 #include "ggml-quants.h"
-#include "ggml-aarch64.h"
+#ifdef GGML_USE_CPU_HBM
+#include <hbwmalloc.h>
+#endif
 #if defined(_MSC_VER) || defined(__MINGW32__)
 #include <malloc.h> // using malloc.h with MSC/MINGW
@@ -788,32 +791,23 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
         .to_float                 = (ggml_to_float_t) ggml_bf16_to_fp32_row,
         .from_float_ref           = (ggml_from_float_t) ggml_fp32_to_bf16_row_ref,
     },
-    [GGML_TYPE_Q4_0_4_4] = {
-        .type_name                = "q4_0_4x4",
-        .blck_size                = QK4_0,
-        .blck_size_interleave     = 4,
-        .type_size                = sizeof(block_q4_0),
-        .is_quantized             = true,
-        .to_float                 = NULL,
-        .from_float_ref           = NULL,
+    [31] = { // GGML_TYPE_Q4_0_4_4
+        .type_name                = "TYPE_Q4_0_4_4 REMOVED, use Q4_0 with runtime repacking",
+        .blck_size                = 0,
+        .type_size                = 0,
+        .is_quantized             = false,
     },
-    [GGML_TYPE_Q4_0_4_8] = {
-        .type_name                = "q4_0_4x8",
-        .blck_size                = QK4_0,
-        .blck_size_interleave     = 8,
-        .type_size                = sizeof(block_q4_0),
-        .is_quantized             = true,
-        .to_float                 = NULL,
-        .from_float_ref           = NULL,
+    [32] = { // GGML_TYPE_Q4_0_4_8
+        .type_name                = "TYPE_Q4_0_4_8 REMOVED, use Q4_0 with runtime repacking",
+        .blck_size                = 0,
+        .type_size                = 0,
+        .is_quantized             = false,
     },
-    [GGML_TYPE_Q4_0_8_8] = {
-        .type_name                = "q4_0_8x8",
-        .blck_size                = QK4_0,
-        .blck_size_interleave     = 8,
-        .type_size                = sizeof(block_q4_0),
-        .is_quantized             = true,
-        .to_float                 = NULL,
-        .from_float_ref           = NULL,
+    [33] = { // GGML_TYPE_Q4_0_8_8
+        .type_name                = "TYPE_Q4_0_8_8 REMOVED, use Q4_0 with runtime repacking",
+        .blck_size                = 0,
+        .type_size                = 0,
+        .is_quantized             = false,
     },
     [GGML_TYPE_TQ1_0] = {
         .type_name                = "tq1_0",
@@ -831,6 +825,24 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
         .to_float                 = (ggml_to_float_t) dequantize_row_tq2_0,
         .from_float_ref           = (ggml_from_float_t) quantize_row_tq2_0_ref,
     },
+    [36] = { // GGML_TYPE_IQ4_NL_4_4
+        .type_name                = "TYPE_IQ4_NL_4_4 REMOVED, use IQ4_NL with runtime repacking",
+        .blck_size                = 0,
+        .type_size                = 0,
+        .is_quantized             = false,
+    },
+    [37] = { // GGML_TYPE_IQ4_NL_4_8
+        .type_name                = "TYPE_IQ4_NL_4_8 REMOVED, use IQ4_NL with runtime repacking",
+        .blck_size                = 0,
+        .type_size                = 0,
+        .is_quantized             = false,
+    },
+    [38] = { // GGML_TYPE_IQ4_NL_8_8
+        .type_name                = "TYPE_IQ4_NL_8_8 REMOVED, use IQ4_NL with runtime repacking",
+        .blck_size                = 0,
+        .type_size                = 0,
+        .is_quantized             = false,
+    },
 };
 const struct ggml_type_traits * ggml_get_type_traits(enum ggml_type type) {
@@ -941,6 +953,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "POOL_2D_BACK",
     "UPSCALE",
     "PAD",
+    "PAD_REFLECT_1D",
     "ARANGE",
     "TIMESTEP_EMBEDDING",
     "ARGSORT",
@@ -974,7 +987,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "OPT_STEP_ADAMW",
 };
-static_assert(GGML_OP_COUNT == 81, "GGML_OP_COUNT != 81");
+static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82");
 static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "none",
@@ -1036,6 +1049,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "pool_2d_back(x)",
     "upscale(x)",
     "pad(x)",
+    "pad_reflect_1d(x)",
     "arange(start, stop, step)",
     "timestep_embedding(timesteps, dim, max_period)",
     "argsort(x)",
@@ -1069,7 +1083,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "adamw(x)",
 };
-static_assert(GGML_OP_COUNT == 81, "GGML_OP_COUNT != 81");
+static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82");
 static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
@@ -1259,9 +1273,6 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) {
         case GGML_FTYPE_MOSTLY_IQ4_XS:        wtype = GGML_TYPE_IQ4_XS;   break;
         case GGML_FTYPE_MOSTLY_IQ3_S:         wtype = GGML_TYPE_IQ3_S;    break;
         case GGML_FTYPE_MOSTLY_IQ2_S:         wtype = GGML_TYPE_IQ2_S;    break;
-        case GGML_FTYPE_MOSTLY_Q4_0_4_4:      wtype = GGML_TYPE_Q4_0_4_4; break;
-        case GGML_FTYPE_MOSTLY_Q4_0_4_8:      wtype = GGML_TYPE_Q4_0_4_8; break;
-        case GGML_FTYPE_MOSTLY_Q4_0_8_8:      wtype = GGML_TYPE_Q4_0_8_8; break;
         case GGML_FTYPE_UNKNOWN:              wtype = GGML_TYPE_COUNT; break;
         case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16: wtype = GGML_TYPE_COUNT; break;
     }
@@ -2255,6 +2266,7 @@ struct ggml_tensor * ggml_argmax(
         struct ggml_context * ctx,
         struct ggml_tensor  * a) {
     GGML_ASSERT(ggml_is_matrix(a));
+    GGML_ASSERT(a->ne[0] <= INT32_MAX);
     struct ggml_tensor * result = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, a->ne[1]);
@@ -3505,15 +3517,18 @@ static struct ggml_tensor * ggml_rope_impl(
         GGML_ASSERT(c->ne[0] >= n_dims / 2);
     }
+    int sections[4] = {0, 0, 0, 0};
     struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
-    int32_t params[11] = { /*n_past*/ 0, n_dims, mode, /*n_ctx*/ 0, n_ctx_orig };
+    int32_t params[15] = { /*n_past*/ 0, n_dims, mode, /*n_ctx*/ 0, n_ctx_orig };
     memcpy(params +  5, &freq_base,    sizeof(float));
     memcpy(params +  6, &freq_scale,   sizeof(float));
     memcpy(params +  7, &ext_factor,   sizeof(float));
     memcpy(params +  8, &attn_factor,  sizeof(float));
     memcpy(params +  9, &beta_fast,    sizeof(float));
     memcpy(params + 10, &beta_slow,    sizeof(float));
+    memcpy(params + 11, &sections,     sizeof(int)*4);
     ggml_set_op_params(result, params, sizeof(params));
     result->op     = GGML_OP_ROPE;
@@ -3535,6 +3550,53 @@ struct ggml_tensor * ggml_rope(
     );
 }
+struct ggml_tensor * ggml_rope_multi(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        struct ggml_tensor  * b,
+        struct ggml_tensor  * c,
+        int                   n_dims,
+        int                   sections[4],
+        int                   mode,
+        int                   n_ctx_orig,
+        float                 freq_base,
+        float                 freq_scale,
+        float                 ext_factor,
+        float                 attn_factor,
+        float                 beta_fast,
+        float                 beta_slow) {
+    // Multimodal Rotary Position Embedding
+    GGML_ASSERT((mode & 1) == 0 && "mode & 1 == 1 is no longer supported");
+    GGML_ASSERT(ggml_is_vector(b));
+    GGML_ASSERT(b->type == GGML_TYPE_I32);
+    GGML_ASSERT(a->ne[2] * 4 == b->ne[0]); // mrope expecting 4 position ids per token
+    if (c) {
+        GGML_ASSERT(c->type == GGML_TYPE_F32);
+        GGML_ASSERT(c->ne[0] >= n_dims / 2);
+    }
+    struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
+    int32_t params[11 + 4] = { /*n_past*/ 0, n_dims, mode, /*n_ctx*/ 0, n_ctx_orig };
+    memcpy(params +  5, &freq_base,    sizeof(float));
+    memcpy(params +  6, &freq_scale,   sizeof(float));
+    memcpy(params +  7, &ext_factor,   sizeof(float));
+    memcpy(params +  8, &attn_factor,  sizeof(float));
+    memcpy(params +  9, &beta_fast,    sizeof(float));
+    memcpy(params + 10, &beta_slow,    sizeof(float));
+    memcpy(&params[11], sections,      sizeof(int)*4);
+    ggml_set_op_params(result, params, sizeof(params));
+    result->op   = GGML_OP_ROPE;
+    result->src[0] = a;
+    result->src[1] = b;
+    result->src[2] = c;
+    return result;
+}
 struct ggml_tensor * ggml_rope_inplace(
         struct ggml_context * ctx,
         struct ggml_tensor  * a,
@@ -3698,13 +3760,84 @@ struct ggml_tensor * ggml_clamp(
     return result;
 }
-// ggml_conv_1d
 static int64_t ggml_calc_conv_output_size(int64_t ins, int64_t ks, int s, int p, int d) {
     return (ins + 2 * p - d * (ks - 1) - 1) / s + 1;
 }
-GGML_API struct ggml_tensor * ggml_conv_1d(
+// im2col: [N, IC, IH, IW] => [N, OH, OW, IC*KH*KW]
+// a: [OC，IC, KH, KW]
+// b: [N, IC, IH, IW]
+// result: [N, OH, OW, IC*KH*KW]
+struct ggml_tensor * ggml_im2col(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        struct ggml_tensor  * b,
+        int                   s0,
+        int                   s1,
+        int                   p0,
+        int                   p1,
+        int                   d0,
+        int                   d1,
+        bool                  is_2D,
+        enum ggml_type        dst_type) {
+    if (is_2D) {
+        GGML_ASSERT(a->ne[2] == b->ne[2]);
+    } else {
+        //GGML_ASSERT(b->ne[1] % a->ne[1] == 0);
+        GGML_ASSERT(b->ne[1] == a->ne[1]);
+        GGML_ASSERT(b->ne[3] == 1);
+    }
+    const int64_t OH = is_2D ? ggml_calc_conv_output_size(b->ne[1], a->ne[1], s1, p1, d1) : 0;
+    const int64_t OW =         ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0);
+    GGML_ASSERT((!is_2D || OH > 0) && "b too small compared to a");
+    GGML_ASSERT((OW > 0)           && "b too small compared to a");
+    const int64_t ne[4] = {
+        is_2D ? (a->ne[2] * a->ne[1] * a->ne[0]) : a->ne[1] * a->ne[0],
+        OW,
+        is_2D ? OH : b->ne[2],
+        is_2D ?      b->ne[3] : 1,
+    };
+    struct ggml_tensor * result = ggml_new_tensor(ctx, dst_type, 4, ne);
+    int32_t params[] = { s0, s1, p0, p1, d0, d1, (is_2D ? 1 : 0) };
+    ggml_set_op_params(result, params, sizeof(params));
+    result->op     = GGML_OP_IM2COL;
+    result->src[0] = a;
+    result->src[1] = b;
+    return result;
+}
+struct ggml_tensor * ggml_im2col_back(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        struct ggml_tensor  * b,
+        int64_t             * ne,
+        int                   s0,
+        int                   s1,
+        int                   p0,
+        int                   p1,
+        int                   d0,
+        int                   d1,
+        bool                  is_2D) {
+    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
+    int32_t params[] = { s0, s1, p0, p1, d0, d1, (is_2D ? 1 : 0) };
+    ggml_set_op_params(result, params, sizeof(params));
+    result->op     = GGML_OP_IM2COL_BACK;
+    result->src[0] = a;
+    result->src[1] = b;
+    return result;
+}
+// ggml_conv_1d
+struct ggml_tensor * ggml_conv_1d(
         struct ggml_context * ctx,
         struct ggml_tensor  * a,
         struct ggml_tensor  * b,
@@ -3734,137 +3867,75 @@ struct ggml_tensor* ggml_conv_1d_ph(
     return ggml_conv_1d(ctx, a, b, s, a->ne[0] / 2, d);
 }
-// ggml_conv_transpose_1d
-static int64_t ggml_calc_conv_transpose_1d_output_size(int64_t ins, int64_t ks, int s, int p, int d) {
-    return (ins - 1) * s - 2 * p + d * (ks - 1) + 1;
-}
+// ggml_conv_1d_dw
-GGML_API struct ggml_tensor * ggml_conv_transpose_1d(
+struct ggml_tensor * ggml_conv_1d_dw(
         struct ggml_context * ctx,
         struct ggml_tensor  * a,
         struct ggml_tensor  * b,
         int                   s0,
         int                   p0,
         int                   d0) {
-    GGML_ASSERT(ggml_is_matrix(b));
-    GGML_ASSERT(a->ne[2] == b->ne[1]);
-    GGML_ASSERT(a->ne[3] == 1);
+    struct ggml_tensor * new_a = ggml_reshape_4d(ctx, a, a->ne[0], 1, a->ne[1], a->ne[2]);
+    struct ggml_tensor * new_b = ggml_reshape_4d(ctx, b, b->ne[0], 1, b->ne[1], b->ne[2]);
-    GGML_ASSERT(p0 == 0);
-    GGML_ASSERT(d0 == 1);
+    struct ggml_tensor * im2col = ggml_im2col(ctx, new_a, new_b, s0, 0, p0, 0, d0, 0, false, GGML_TYPE_F16);
-    const int64_t ne[4] = {
-        ggml_calc_conv_transpose_1d_output_size(b->ne[0], a->ne[0], s0, 0 /*p0*/, 1 /*d0*/),
-        a->ne[1], b->ne[2], 1,
-    };
-    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
+    struct ggml_tensor * result = ggml_mul_mat(ctx, im2col, a);
-    int32_t params[] = { s0, p0, d0 };
-    ggml_set_op_params(result, params, sizeof(params));
-    result->op     = GGML_OP_CONV_TRANSPOSE_1D;
-    result->src[0] = a;
-    result->src[1] = b;
+    result = ggml_reshape_3d(ctx, result, b->ne[0], b->ne[1], 1);
     return result;
 }
-// ggml_conv_depthwise
+// ggml_conv_1d_dw_ph
-struct ggml_tensor * ggml_conv_depthwise_2d(
+struct ggml_tensor * ggml_conv_1d_dw_ph(
         struct ggml_context * ctx,
         struct ggml_tensor  * a,
         struct ggml_tensor  * b,
         int                   s0,
-        int                   s1,
-        int                   p0,
-        int                   p1,
-        int                   d0,
-        int                   d1) {
-    struct ggml_tensor * new_a = ggml_reshape_4d(ctx, a, a->ne[0], a->ne[1], 1, a->ne[2] * a->ne[3]);
-    struct ggml_tensor * im2col = ggml_im2col(ctx, new_a,
-                                        ggml_reshape_4d(ctx, b, b->ne[0], b->ne[1], 1, b->ne[2] * b->ne[3]),
-                                        s0, s1, p0, p1, d0, d1, true, GGML_TYPE_F16); // [N * IC, OH, OW, KH * KW]
-    struct ggml_tensor * new_b = ggml_reshape_4d(ctx, im2col, im2col->ne[0], im2col->ne[2] * im2col->ne[1], b->ne[2], b->ne[3]); // [N * IC, OH, OW, KH * KW] => [N, IC, OH * OW, KH * KW]
+        int                   d0) {
+    return ggml_conv_1d_dw(ctx, a, b, s0, a->ne[0] / 2, d0);
+}
-    new_a = ggml_reshape_4d(ctx, new_a, (new_a->ne[0] * new_a->ne[1]), new_a->ne[2],  new_a->ne[3], 1);                       // [OC，1, KH, KW] => [1, OC, 1, KH * KW]
-    struct ggml_tensor * result = ggml_mul_mat(ctx, new_a, new_b);
-    result = ggml_reshape_4d(ctx, result, im2col->ne[1], im2col->ne[2], b->ne[2], b->ne[3]); // [N, OC, OH, OW]
+// ggml_conv_transpose_1d
-    return result;
+static int64_t ggml_calc_conv_transpose_1d_output_size(int64_t ins, int64_t ks, int s, int p, int d) {
+    return (ins - 1) * s - 2 * p + d * (ks - 1) + 1;
 }
-// ggml_conv_2d
-// im2col: [N, IC, IH, IW] => [N, OH, OW, IC*KH*KW]
-// a: [OC，IC, KH, KW]
-// b: [N, IC, IH, IW]
-// result: [N, OH, OW, IC*KH*KW]
-struct ggml_tensor * ggml_im2col(
+GGML_API struct ggml_tensor * ggml_conv_transpose_1d(
         struct ggml_context * ctx,
         struct ggml_tensor  * a,
         struct ggml_tensor  * b,
         int                   s0,
-        int                   s1,
         int                   p0,
-        int                   p1,
-        int                   d0,
-        int                   d1,
-        bool                  is_2D,
-        enum ggml_type        dst_type) {
-    if(is_2D) {
-        GGML_ASSERT(a->ne[2] == b->ne[2]);
-    } else {
-        GGML_ASSERT(a->ne[1] == b->ne[1]);
-        GGML_ASSERT(b->ne[3] == 1);
-    }
-    const int64_t OH = is_2D ? ggml_calc_conv_output_size(b->ne[1], a->ne[1], s1, p1, d1) : 0;
-    const int64_t OW =         ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0);
+        int                   d0) {
+    GGML_ASSERT(ggml_is_matrix(b));
+    GGML_ASSERT(a->ne[2] == b->ne[1]);
+    GGML_ASSERT(a->ne[3] == 1);
-    GGML_ASSERT((!is_2D || OH > 0) && "b too small compared to a");
-    GGML_ASSERT((OW > 0)           && "b too small compared to a");
+    GGML_ASSERT(p0 == 0);
+    GGML_ASSERT(d0 == 1);
     const int64_t ne[4] = {
-        is_2D ? (a->ne[2] * a->ne[1] * a->ne[0]) : a->ne[1] * a->ne[0],
-        OW,
-        is_2D ? OH : b->ne[2],
-        is_2D ?      b->ne[3] : 1,
+        ggml_calc_conv_transpose_1d_output_size(b->ne[0], a->ne[0], s0, 0 /*p0*/, 1 /*d0*/),
+        a->ne[1], b->ne[2], 1,
     };
+    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
-    struct ggml_tensor * result = ggml_new_tensor(ctx, dst_type, 4, ne);
-    int32_t params[] = { s0, s1, p0, p1, d0, d1, (is_2D ? 1 : 0) };
+    int32_t params[] = { s0, p0, d0 };
     ggml_set_op_params(result, params, sizeof(params));
-    result->op     = GGML_OP_IM2COL;
+    result->op     = GGML_OP_CONV_TRANSPOSE_1D;
     result->src[0] = a;
     result->src[1] = b;
     return result;
 }
-struct ggml_tensor * ggml_im2col_back(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        struct ggml_tensor  * b,
-        int64_t             * ne,
-        int                   s0,
-        int                   s1,
-        int                   p0,
-        int                   p1,
-        int                   d0,
-        int                   d1,
-        bool                  is_2D) {
-    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
-    int32_t params[] = { s0, s1, p0, p1, d0, d1, (is_2D ? 1 : 0) };
-    ggml_set_op_params(result, params, sizeof(params));
-    result->op     = GGML_OP_IM2COL_BACK;
-    result->src[0] = a;
-    result->src[1] = b;
-    return result;
-}
+// ggml_conv_2d
 // a: [OC，IC, KH, KW]
 // b: [N, IC, IH, IW]
@@ -3911,6 +3982,31 @@ struct ggml_tensor * ggml_conv_2d_s1_ph(
     return ggml_conv_2d(ctx, a, b, 1, 1, a->ne[0] / 2, a->ne[1] / 2, 1, 1);
 }
+// ggml_conv_2d_dw
+struct ggml_tensor * ggml_conv_2d_dw(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        struct ggml_tensor  * b,
+        int                   s0,
+        int                   s1,
+        int                   p0,
+        int                   p1,
+        int                   d0,
+        int                   d1) {
+    struct ggml_tensor * new_a = ggml_reshape_4d(ctx, a, a->ne[0], a->ne[1], 1, a->ne[2] * a->ne[3]);
+    struct ggml_tensor * im2col = ggml_im2col(ctx, new_a,
+                                        ggml_reshape_4d(ctx, b, b->ne[0], b->ne[1], 1, b->ne[2] * b->ne[3]),
+                                        s0, s1, p0, p1, d0, d1, true, GGML_TYPE_F16); // [N * IC, OH, OW, KH * KW]
+    struct ggml_tensor * new_b = ggml_reshape_4d(ctx, im2col, im2col->ne[0], im2col->ne[2] * im2col->ne[1], b->ne[2], b->ne[3]); // [N * IC, OH, OW, KH * KW] => [N, IC, OH * OW, KH * KW]
+    new_a = ggml_reshape_4d(ctx, new_a, (new_a->ne[0] * new_a->ne[1]), new_a->ne[2],  new_a->ne[3], 1);                       // [OC，1, KH, KW] => [1, OC, 1, KH * KW]
+    struct ggml_tensor * result = ggml_mul_mat(ctx, new_a, new_b);
+    result = ggml_reshape_4d(ctx, result, im2col->ne[1], im2col->ne[2], b->ne[2], b->ne[3]); // [N, OC, OH, OW]
+    return result;
+}
 // ggml_conv_transpose_2d_p0
 static int64_t ggml_calc_conv_transpose_output_size(int64_t ins, int64_t ks, int s, int p) {
@@ -4087,6 +4183,37 @@ struct ggml_tensor * ggml_pad(
     return result;
 }
+// ggml_pad_reflect_1d
+struct ggml_tensor * ggml_pad_reflect_1d(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        int                   p0,
+        int                   p1) {
+    GGML_ASSERT(p0 >= 0);
+    GGML_ASSERT(p1 >= 0);
+    GGML_ASSERT(p0 < a->ne[0]); // padding length on each size must be less than the
+    GGML_ASSERT(p1 < a->ne[0]); // existing length of the dimension being padded
+    GGML_ASSERT(ggml_is_contiguous(a));
+    GGML_ASSERT(a->type == GGML_TYPE_F32);
+    struct ggml_tensor * result = ggml_new_tensor_4d(ctx, a->type,
+            a->ne[0] + p0 + p1,
+            a->ne[1],
+            a->ne[2],
+            a->ne[3]);
+    int32_t params[] = { p0, p1 };
+    ggml_set_op_params(result, params, sizeof(params));
+    result->op     = GGML_OP_PAD_REFLECT_1D;
+    result->src[0] = a;
+    return result;
+}
 // ggml_arange
 struct ggml_tensor * ggml_arange(
@@ -4138,6 +4265,7 @@ struct ggml_tensor * ggml_argsort(
         struct ggml_context  * ctx,
         struct ggml_tensor   * a,
         enum ggml_sort_order   order) {
+    GGML_ASSERT(a->ne[0] <= INT32_MAX);
     struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_I32, GGML_MAX_DIMS, a->ne);
     ggml_set_op_params_i32(result, 0, (int32_t) order);
@@ -5019,8 +5147,10 @@ static void ggml_hash_map_free(struct hash_map * map) {
 }
 // utility functions to change gradients
-// if a is in acc_table, modify gradients in-place and mark result as gradient accumulator
-// else if a is in zero_table, replace a
+// isrc is the index of tensor in cgraph->visited_has_set.keys
+// the corresponding gradient (accumulators) are also at position isrc
+// if tensor has a gradient accumulator, modify that accumulator in-place
+// else if there is no gradient for tensor, set the corresponding value
 // else, just add/subtract/etc. the gradients
 static void ggml_add_or_set(
@@ -5028,11 +5158,14 @@ static void ggml_add_or_set(
         struct ggml_cgraph  * cgraph,
         size_t                isrc,
         struct ggml_tensor  * tensor) {
+    struct ggml_tensor * src = cgraph->visited_hash_set.keys[isrc];
+    GGML_ASSERT(src);
     if (cgraph->grads[isrc]) {
-        cgraph->grads[isrc] = ggml_add_impl(ctx, cgraph->grads[isrc], tensor, cgraph->grad_accs[isrc]);
+        cgraph->grads[isrc] = ggml_add_impl(ctx, cgraph->grads[isrc], tensor, /*inplace =*/ cgraph->grad_accs[isrc]);
     } else {
         cgraph->grads[isrc] = tensor;
     }
+    ggml_format_name(cgraph->grads[isrc], "grad for %s", src->name);
     ggml_build_forward_expand(cgraph, cgraph->grads[isrc]);
 }
@@ -5040,18 +5173,20 @@ static void ggml_acc_or_set(
         struct ggml_context * ctx,
         struct ggml_cgraph  * cgraph,
         size_t                isrc,
-        struct ggml_tensor  * src,
         struct ggml_tensor  * tensor,
         const  size_t         nb1,
         const  size_t         nb2,
         const  size_t         nb3,
         const  size_t         offset) {
+    struct ggml_tensor * src = cgraph->visited_hash_set.keys[isrc];
+    GGML_ASSERT(src);
     if (cgraph->grads[isrc]) {
         cgraph->grads[isrc] = ggml_acc_impl(ctx, cgraph->grads[isrc], tensor, nb1, nb2, nb3, offset, cgraph->grad_accs[isrc]);
     } else {
         struct ggml_tensor * a_zero = ggml_scale(ctx, src, 0.0f); // FIXME this is going to produce NaN if a contains inf/NaN
         cgraph->grads[isrc] = ggml_acc_impl(ctx, a_zero, tensor, nb1, nb2, nb3, offset, false);
     }
+    ggml_format_name(cgraph->grads[isrc], "grad for %s", cgraph->visited_hash_set.keys[isrc]->name);
     ggml_build_forward_expand(cgraph, cgraph->grads[isrc]);
 }
@@ -5059,13 +5194,15 @@ static void ggml_add1_or_set(
         struct ggml_context * ctx,
         struct ggml_cgraph  * cgraph,
         size_t                isrc,
-        struct ggml_tensor  * src,
         struct ggml_tensor  * tensor) {
+    struct ggml_tensor * src = cgraph->visited_hash_set.keys[isrc];
+    GGML_ASSERT(src);
     if (cgraph->grads[isrc]) {
         cgraph->grads[isrc] = ggml_add1_impl(ctx, cgraph->grads[isrc], tensor, cgraph->grad_accs[isrc]);
     } else {
         cgraph->grads[isrc] = ggml_repeat(ctx, tensor, src);
     }
+    ggml_format_name(cgraph->grads[isrc], "grad for %s", src->name);
     ggml_build_forward_expand(cgraph, cgraph->grads[isrc]);
 }
@@ -5074,11 +5211,14 @@ static void ggml_sub_or_set(
         struct ggml_cgraph  * cgraph,
         size_t                isrc,
         struct ggml_tensor  * tensor) {
+    struct ggml_tensor * src = cgraph->visited_hash_set.keys[isrc];
+    GGML_ASSERT(src);
     if (cgraph->grads[isrc]) {
         cgraph->grads[isrc] = ggml_sub_impl(ctx, cgraph->grads[isrc], tensor, cgraph->grad_accs[isrc]);
     } else {
         cgraph->grads[isrc] = ggml_neg(ctx, tensor);
     }
+    ggml_format_name(cgraph->grads[isrc], "grad for %s", src->name);
     ggml_build_forward_expand(cgraph, cgraph->grads[isrc]);
 }
@@ -5095,12 +5235,12 @@ static void ggml_compute_backward(
     struct ggml_tensor * src1 = tensor->src[1];
     struct ggml_tensor * src2 = tensor->src[2];
     struct ggml_hash_set * hash_set = &cgraph->visited_hash_set;
-    const size_t isrc0 = ggml_hash_find(hash_set, src0);
-    const size_t isrc1 = ggml_hash_find(hash_set, src1);
-    const size_t isrc2 = ggml_hash_find(hash_set, src2);
-    const bool src0_needs_grads = isrc0 != GGML_HASHSET_FULL && ggml_bitset_get(hash_set->used, isrc0) && grads_needed[isrc0];
-    const bool src1_needs_grads = isrc1 != GGML_HASHSET_FULL && ggml_bitset_get(hash_set->used, isrc1) && grads_needed[isrc1];
-    const bool src2_needs_grads = isrc2 != GGML_HASHSET_FULL && ggml_bitset_get(hash_set->used, isrc2) && grads_needed[isrc2];
+    const size_t isrc0 = src0 ? ggml_hash_find(hash_set, src0) : (size_t) -1;
+    const size_t isrc1 = src1 ? ggml_hash_find(hash_set, src1) : (size_t) -1;
+    const size_t isrc2 = src2 ? ggml_hash_find(hash_set, src2) : (size_t) -1;
+    const bool src0_needs_grads = src0 && isrc0 != GGML_HASHSET_FULL && ggml_bitset_get(hash_set->used, isrc0) && grads_needed[isrc0];
+    const bool src1_needs_grads = src1 && isrc1 != GGML_HASHSET_FULL && ggml_bitset_get(hash_set->used, isrc1) && grads_needed[isrc1];
+    const bool src2_needs_grads = src2 && isrc2 != GGML_HASHSET_FULL && ggml_bitset_get(hash_set->used, isrc2) && grads_needed[isrc2];
     switch (tensor->op) {
         case GGML_OP_DUP: {
@@ -5200,7 +5340,7 @@ static void ggml_compute_backward(
         } break;
         case GGML_OP_SUM: {
             if (src0_needs_grads) {
-                ggml_add1_or_set(ctx, cgraph, isrc0, src0, grad);
+                ggml_add1_or_set(ctx, cgraph, isrc0, grad);
             }
         } break;
         case GGML_OP_SUM_ROWS: {
@@ -5210,7 +5350,7 @@ static void ggml_compute_backward(
         } break;
         case GGML_OP_MEAN: {
             if (src0_needs_grads) {
-                ggml_add1_or_set(ctx, cgraph, isrc0, src0, ggml_scale_impl(ctx, grad, 1.0f/src0->ne[0], false));
+                ggml_add1_or_set(ctx, cgraph, isrc0, ggml_scale_impl(ctx, grad, 1.0f/src0->ne[0], false));
             }
         } break;
         case GGML_OP_REPEAT: {
@@ -5363,7 +5503,7 @@ static void ggml_compute_backward(
                     nb3 = (nb3 / n0) * ng;
                 }
-                ggml_acc_or_set(ctx, cgraph, isrc0, src0, grad, nb1, nb2, nb3, offset);
+                ggml_acc_or_set(ctx, cgraph, isrc0, grad, nb1, nb2, nb3, offset);
             }
         } break;
         case GGML_OP_PERMUTE: {
@@ -5597,10 +5737,9 @@ void ggml_build_backward_expand(
     const int n_nodes_f = cgraph->n_nodes;
-    const size_t hash_size = ggml_hash_size(2*cgraph->size);
-    memset(cgraph->grads,     0, hash_size*sizeof(struct ggml_tensor *));
-    memset(cgraph->grad_accs, 0, hash_size*sizeof(struct ggml_tensor *));
-    bool * grads_needed = calloc(hash_size, sizeof(bool));
+    memset(cgraph->grads,     0, cgraph->visited_hash_set.size*sizeof(struct ggml_tensor *));
+    memset(cgraph->grad_accs, 0, cgraph->visited_hash_set.size*sizeof(struct ggml_tensor *));
+    bool * grads_needed = calloc(cgraph->visited_hash_set.size, sizeof(bool));
     {
         bool any_params = false;
@@ -5621,7 +5760,7 @@ void ggml_build_backward_expand(
             continue;
         }
-        bool node_needs_grad = node->flags & GGML_TENSOR_FLAG_PARAM;
+        bool node_needs_grad = (node->flags & GGML_TENSOR_FLAG_PARAM) || (node->flags & GGML_TENSOR_FLAG_LOSS);
         bool ignore_src[GGML_MAX_SRC] = {false};
         switch (node->op) {
             // gradients in node->src[0] for one reason or another have no effect on output gradients
@@ -5638,7 +5777,7 @@ void ggml_build_backward_expand(
             } break;
             // gradients in node->src[1] for one reason or another have no effect on output gradients
-            case GGML_OP_CPY:           // gradients in CPY target  are irrelevant
+            case GGML_OP_CPY:           // gradients in CPY target are irrelevant
             case GGML_OP_GET_ROWS:      // row indices not differentiable
             case GGML_OP_GET_ROWS_BACK: // same as for GET_ROWS
             case GGML_OP_ROPE:          // positions not differentiable
@@ -5665,9 +5804,12 @@ void ggml_build_backward_expand(
             node->op == GGML_OP_RESHAPE || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_TRANSPOSE);
         const size_t igrad = ggml_hash_find(&cgraph->visited_hash_set, node);
+        GGML_ASSERT(igrad != GGML_HASHSET_FULL);
+        GGML_ASSERT(ggml_bitset_get(cgraph->visited_hash_set.used, igrad));
         if ((accumulate && (node->flags & GGML_TENSOR_FLAG_PARAM)) || (node->flags & GGML_TENSOR_FLAG_LOSS)) {
-            cgraph->grads[igrad]     = ggml_dup_tensor(ctx_static, node);
-            cgraph->grad_accs[igrad] = cgraph->grads[igrad];
+            cgraph->grad_accs[igrad] = ggml_dup_tensor(ctx_static, node);
+            cgraph->grads[igrad]     = cgraph->grad_accs[igrad];
+            ggml_format_name(cgraph->grad_accs[igrad], "grad acc for %s", node->name);
         }
         grads_needed[igrad] = true;
     }
@@ -5761,15 +5903,15 @@ struct ggml_cgraph * ggml_new_graph(struct ggml_context * ctx) {
 struct ggml_cgraph ggml_graph_view(struct ggml_cgraph * cgraph0, int i0, int i1) {
     struct ggml_cgraph cgraph = {
-        /*.size         =*/ 0,
-        /*.n_nodes      =*/ i1 - i0,
-        /*.n_leafs      =*/ 0,
-        /*.nodes        =*/ cgraph0->nodes + i0,
-        /*.grads        =*/ cgraph0->grads ? cgraph0->grads + i0 : NULL,
-        /*.grad_accs    =*/ cgraph0->grad_accs ? cgraph0->grad_accs + i0 : NULL,
-        /*.leafs        =*/ NULL,
-        /*.hash_table   =*/ { 0, NULL, NULL },
-        /*.order        =*/ cgraph0->order,
+        /*.size             =*/ 0,
+        /*.n_nodes          =*/ i1 - i0,
+        /*.n_leafs          =*/ 0,
+        /*.nodes            =*/ cgraph0->nodes + i0,
+        /*.grads            =*/ NULL, // gradients would need visited_hash_set
+        /*.grad_accs        =*/ NULL,
+        /*.leafs            =*/ NULL,
+        /*.visited_hash_set =*/ { 0, NULL, NULL },
+        /*.order            =*/ cgraph0->order,
     };
     return cgraph;
@@ -5799,12 +5941,22 @@ void ggml_graph_cpy(struct ggml_cgraph * src, struct ggml_cgraph * dst) {
         }
     }
+    if (dst->grads) {
+        memset(dst->grads,     0, dst->visited_hash_set.size*sizeof(struct ggml_tensor *));
+        memset(dst->grad_accs, 0, dst->visited_hash_set.size*sizeof(struct ggml_tensor *));
+    }
     if (src->grads) {
         GGML_ASSERT(dst->grads     != NULL);
         GGML_ASSERT(dst->grad_accs != NULL);
         for (int i = 0; i < src->n_nodes; ++i) {
             const size_t igrad_src = ggml_hash_find(&src->visited_hash_set, src->nodes[i]);
             const size_t igrad_dst = ggml_hash_find(&dst->visited_hash_set, dst->nodes[i]);
+            GGML_ASSERT(igrad_src != GGML_HASHSET_FULL);
+            GGML_ASSERT(ggml_bitset_get(src->visited_hash_set.used, igrad_src));
+            GGML_ASSERT(igrad_dst != GGML_HASHSET_FULL);
+            GGML_ASSERT(ggml_bitset_get(dst->visited_hash_set.used, igrad_dst));
             dst->grads[igrad_dst]     = src->grads[igrad_src];
             dst->grad_accs[igrad_dst] = src->grad_accs[igrad_src];
         }
@@ -5839,12 +5991,8 @@ void ggml_graph_reset(struct ggml_cgraph * cgraph) {
         if (node->op == GGML_OP_OPT_STEP_ADAMW) {
             // clear momenta
-            if (node->src[2]->data) {
-                ggml_set_zero(node->src[2]);
-            }
-            if (node->src[3]->data) {
-                ggml_set_zero(node->src[3]);
-            }
+            ggml_set_zero(node->src[2]);
+            ggml_set_zero(node->src[3]);
         }
         // initial gradients of loss should be 1, 0 otherwise
@@ -5923,12 +6071,12 @@ struct ggml_tensor * ggml_graph_get_tensor(const struct ggml_cgraph * cgraph, co
 struct ggml_tensor * ggml_graph_get_grad(const struct ggml_cgraph * cgraph, const struct ggml_tensor * node) {
     const size_t igrad = ggml_hash_find(&cgraph->visited_hash_set, node);
-    return igrad != GGML_HASHSET_FULL && ggml_bitset_get(cgraph->visited_hash_set.used, igrad) ? cgraph->grads[igrad] : NULL;
+    return igrad != GGML_HASHSET_FULL && ggml_bitset_get(cgraph->visited_hash_set.used, igrad) && cgraph->grads ? cgraph->grads[igrad] : NULL;
 }
 struct ggml_tensor * ggml_graph_get_grad_acc(const struct ggml_cgraph * cgraph, const struct ggml_tensor * node) {
     const size_t igrad = ggml_hash_find(&cgraph->visited_hash_set, node);
-    return igrad != GGML_HASHSET_FULL && ggml_bitset_get(cgraph->visited_hash_set.used, igrad) ? cgraph->grad_accs[igrad] : NULL;
+    return igrad != GGML_HASHSET_FULL && ggml_bitset_get(cgraph->visited_hash_set.used, igrad) && cgraph->grad_accs ? cgraph->grad_accs[igrad] : NULL;
 }
 void ggml_graph_print(const struct ggml_cgraph * cgraph) {
@@ -6240,9 +6388,6 @@ size_t ggml_quantize_chunk(
         case GGML_TYPE_IQ1_M:   result = quantize_iq1_m  (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
         case GGML_TYPE_IQ4_NL:  result = quantize_iq4_nl (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
         case GGML_TYPE_IQ4_XS:  result = quantize_iq4_xs (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
-        case GGML_TYPE_Q4_0_4_4: result = quantize_q4_0_4x4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
-        case GGML_TYPE_Q4_0_4_8: result = quantize_q4_0_4x8(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
-        case GGML_TYPE_Q4_0_8_8: result = quantize_q4_0_8x8(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
         case GGML_TYPE_F16:
             {
                 size_t elemsize = sizeof(ggml_fp16_t);
@@ -6378,7 +6523,7 @@ struct gguf_context {
     void * data;
 };
-static size_t gguf_type_size(enum gguf_type type) {
+size_t gguf_type_size(enum gguf_type type) {
     GGML_ASSERT(0 <= type && type < GGUF_TYPE_COUNT);
     return GGUF_TYPE_SIZE[type];
 }
@@ -6506,13 +6651,7 @@ struct gguf_context * gguf_init_empty(void) {
     return ctx;
 }
-struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params) {
-    FILE * file = ggml_fopen(fname, "rb");
-    if (!file) {
-        fprintf(stderr, "%s: failed to open '%s': '%s'\n", __func__, fname, strerror(errno));
-        return NULL;
-    }
+struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_params params) {
     // offset from start of file
     size_t offset = 0;
@@ -6525,7 +6664,6 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
         for (uint32_t i = 0; i < sizeof(magic); i++) {
             if (magic[i] != GGUF_MAGIC[i]) {
                 fprintf(stderr, "%s: invalid magic characters '%c%c%c%c'\n", __func__, magic[0], magic[1], magic[2], magic[3]);
-                fclose(file);
                 return NULL;
             }
         }
@@ -6536,7 +6674,6 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
     struct gguf_context * ctx = calloc(1, sizeof(struct gguf_context));
     if (!ctx) {
         fprintf(stderr, "%s: failed to allocate memory for context\n", __func__);
-        fclose(file);
         return NULL;
     }
@@ -6554,7 +6691,6 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
         if (ctx->header.version == 1) {
             fprintf(stderr, "%s: GGUFv1 is no longer supported. please use a more up-to-date version\n", __func__);
-            fclose(file);
             gguf_free(ctx);
             return NULL;
         }
@@ -6567,7 +6703,6 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
         if (!ok) {
             fprintf(stderr, "%s: failed to read header\n", __func__);
-            fclose(file);
             gguf_free(ctx);
             return NULL;
         }
@@ -6577,12 +6712,13 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
     {
         const uint64_t n_kv = ctx->header.n_kv;
-        ctx->kv = calloc(n_kv, sizeof(struct gguf_kv));
-        if (!ctx->kv) {
-            fprintf(stderr, "%s: failed to allocate memory for kv pairs\n", __func__);
-            fclose(file);
-            gguf_free(ctx);
-            return NULL;
+        if (n_kv > 0) {
+            ctx->kv = calloc(n_kv, sizeof(struct gguf_kv));
+            if (!ctx->kv) {
+                fprintf(stderr, "%s: failed to allocate memory for kv pairs\n", __func__);
+                gguf_free(ctx);
+                return NULL;
+            }
         }
         for (uint64_t i = 0; i < n_kv; ++i) {
@@ -6629,7 +6765,6 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
                                     // prevent from integer overflow in the malloc below
                                     if (kv->value.arr.n >= SIZE_MAX/gguf_type_size(kv->value.arr.type)) {
                                         fprintf(stderr, "%s: array size is too large (%" PRIu64 ")\n", __func__, kv->value.arr.n);
-                                        fclose(file);
                                         gguf_free(ctx);
                                         return NULL;
                                     }
@@ -6637,7 +6772,6 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
                                     kv->value.arr.data = calloc(kv->value.arr.n, gguf_type_size(kv->value.arr.type));
                                     if (!kv->value.arr.data) {
                                         fprintf(stderr, "%s: failed to allocate memory for array\n", __func__);
-                                        fclose(file);
                                         gguf_free(ctx);
                                         return NULL;
                                     }
@@ -6649,7 +6783,6 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
                                     // prevent from integer overflow in the malloc below
                                     if (kv->value.arr.n >= SIZE_MAX/sizeof(struct gguf_str)) {
                                         fprintf(stderr, "%s: array size is too large (%" PRIu64 ")\n", __func__, kv->value.arr.n);
-                                        fclose(file);
                                         gguf_free(ctx);
                                         return NULL;
                                     }
@@ -6657,7 +6790,6 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
                                     kv->value.arr.data = calloc(kv->value.arr.n, sizeof(struct gguf_str));
                                     if (!kv->value.arr.data) {
                                         fprintf(stderr, "%s: failed to allocate memory for array\n", __func__);
-                                        fclose(file);
                                         gguf_free(ctx);
                                         return NULL;
                                     }
@@ -6688,7 +6820,6 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
         if (!ok) {
             fprintf(stderr, "%s: failed to read key-value pairs\n", __func__);
-            fclose(file);
             gguf_free(ctx);
             return NULL;
         }
@@ -6699,7 +6830,6 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
         ctx->infos = calloc(ctx->header.n_tensors, sizeof(struct gguf_tensor_info));
         if (!ctx->infos) {
             fprintf(stderr, "%s: failed to allocate memory for tensor infos\n", __func__);
-            fclose(file);
             gguf_free(ctx);
             return NULL;
         }
@@ -6735,7 +6865,6 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
             if (!ok) {
                 fprintf(stderr, "%s: failed to read tensor info\n", __func__);
-                fclose(file);
                 gguf_free(ctx);
                 return NULL;
             }
@@ -6774,10 +6903,17 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
                 (int64_t) info->ne[2] *
                 (int64_t) info->ne[3];
-            if (ggml_blck_size(info->type) == 0 || ne % ggml_blck_size(info->type) != 0) {
+            if (ggml_blck_size(info->type) == 0 ) {
+                // this tensor type support have been removed:
+                fprintf(stderr, "%s: tensor '%s' of type %d: %s\n",
+                        __func__, info->name.data, (int) info->type, ggml_type_name(info->type));
+                gguf_free(ctx);
+                return NULL;
+            }
+            if (ne % ggml_blck_size(info->type) != 0) {
                 fprintf(stderr, "%s: tensor '%s' of type %d (%s) number of elements (%" PRId64 ") is not a multiple of block size (%" PRId64 ")\n",
                         __func__, info->name.data, (int) info->type, ggml_type_name(info->type), ne, ggml_blck_size(info->type));
-                fclose(file);
                 gguf_free(ctx);
                 return NULL;
             }
@@ -6809,7 +6945,6 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
         *params.ctx = ggml_init(pdata);
         if (*params.ctx == NULL) {
             fprintf(stderr, "%s: failed to initialize context\n", __func__);
-            fclose(file);
             gguf_free(ctx);
             return NULL;
         }
@@ -6828,7 +6963,6 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
             if (!ok) {
                 fprintf(stderr, "%s: failed to read tensor data\n", __func__);
-                fclose(file);
                 ggml_free(ctx_data);
                 gguf_free(ctx);
                 return NULL;
@@ -6867,7 +7001,6 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
         if (!ok) {
             fprintf(stderr, "%s: failed to read the tensor data\n", __func__);
-            fclose(file);
             ggml_free(ctx_data);
             gguf_free(ctx);
             return NULL;
@@ -6876,11 +7009,21 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
         ggml_set_no_alloc(ctx_data, params.no_alloc);
     }
-    fclose(file);
     return ctx;
 }
+struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params) {
+    FILE * file = ggml_fopen(fname, "rb");
+    if (!file) {
+        fprintf(stderr, "%s: failed to open '%s': '%s'\n", __func__, fname, strerror(errno));
+        return NULL;
+    }
+    struct gguf_context * result = gguf_init_from_file_impl(file, params);
+    fclose(file);
+    return result;
+}
 void gguf_free(struct gguf_context * ctx) {
     if (ctx == NULL) {
         return;
@@ -7340,13 +7483,7 @@ void gguf_set_tensor_data(struct gguf_context * ctx, const char * name, const vo
 //    fwrite(val, sizeof(char), size, file);
 //}
-struct gguf_buf {
-    void * data;
-    size_t size;
-    size_t offset;
-};
-static struct gguf_buf gguf_buf_init(size_t size) {
+struct gguf_buf gguf_buf_init(size_t size) {
     struct gguf_buf buf = {
         /*buf.data   =*/ size == 0 ? NULL : GGML_CALLOC(1, size),
         /*buf.size   =*/ size,
@@ -7356,7 +7493,7 @@ static struct gguf_buf gguf_buf_init(size_t size) {
     return buf;
 }
-static void gguf_buf_free(struct gguf_buf buf) {
+void gguf_buf_free(struct gguf_buf buf) {
     if (buf.data) {
         GGML_FREE(buf.data);
     }
@@ -7394,7 +7531,7 @@ static void gguf_bwrite_el(struct gguf_buf * buf, const void * val, size_t el_si
     buf->offset += el_size;
 }
-static void gguf_write_to_buf(const struct gguf_context * ctx, struct gguf_buf * buf, bool only_meta) {
+void gguf_write_to_buf(const struct gguf_context * ctx, struct gguf_buf * buf, bool only_meta) {
     // write header
     gguf_bwrite_el(buf, &ctx->header.magic,     sizeof(ctx->header.magic));
     gguf_bwrite_el(buf, &ctx->header.version,   sizeof(ctx->header.version));
@@ -7549,3 +7686,26 @@ void ggml_log_set(ggml_log_callback log_callback, void * user_data) {
     g_logger_state.log_callback = log_callback ? log_callback : ggml_log_callback_default;
     g_logger_state.log_callback_user_data = user_data;
 }
+void ggml_threadpool_params_init(struct ggml_threadpool_params * p, int n_threads) {
+    p->n_threads  = n_threads;
+    p->prio       = 0;     // default priority (usually means normal or inherited)
+    p->poll       = 50;    // hybrid-polling enabled
+    p->strict_cpu = false; // no strict placement (all threads share same cpumask)
+    p->paused     = false; // threads are ready to go
+    memset(p->cpumask, 0, GGML_MAX_N_THREADS); // all-zero means use the default affinity (usually inherited)
+}
+struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads) {
+    struct ggml_threadpool_params p;
+    ggml_threadpool_params_init(&p, n_threads);
+    return p;
+}
+bool ggml_threadpool_params_match(const struct ggml_threadpool_params * p0, const struct ggml_threadpool_params * p1) {
+    if (p0->n_threads      != p1->n_threads  )    return false;
+    if (p0->prio           != p1->prio       )    return false;
+    if (p0->poll           != p1->poll       )    return false;
+    if (p0->strict_cpu     != p1->strict_cpu )    return false;
+    return memcmp(p0->cpumask, p1->cpumask, GGML_MAX_N_THREADS) == 0;
+}