RubyGems - llama_cpp - Versions diffs - 0.15.0 → 0.15.1 - Mend

llama_cpp 0.15.0 → 0.15.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (25)

checksums.yaml +4 -4
data/CHANGELOG.md +6 -0
data/ext/llama_cpp/llama_cpp.cpp +6 -0
data/lib/llama_cpp/version.rb +2 -2
data/sig/llama_cpp.rbs +6 -0
data/vendor/tmp/llama.cpp/Makefile +3 -4
data/vendor/tmp/llama.cpp/ggml-cuda.cu +289 -17
data/vendor/tmp/llama.cpp/ggml-impl.h +77 -0
data/vendor/tmp/llama.cpp/ggml-metal.m +23 -8
data/vendor/tmp/llama.cpp/ggml-metal.metal +1 -1
data/vendor/tmp/llama.cpp/ggml-opencl.cpp +1 -0
data/vendor/tmp/llama.cpp/ggml-quants.c +18 -0
data/vendor/tmp/llama.cpp/ggml-sycl.cpp +11 -9
data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +46843 -39205
data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +950 -267
data/vendor/tmp/llama.cpp/ggml.c +1090 -89
data/vendor/tmp/llama.cpp/ggml.h +15 -7
data/vendor/tmp/llama.cpp/llama.cpp +57 -17
data/vendor/tmp/llama.cpp/llama.h +7 -1
data/vendor/tmp/llama.cpp/sgemm.cpp +56 -21
data/vendor/tmp/llama.cpp/unicode-data.cpp +1187 -655
data/vendor/tmp/llama.cpp/unicode-data.h +2 -1
data/vendor/tmp/llama.cpp/unicode.cpp +254 -122
data/vendor/tmp/llama.cpp/unicode.h +4 -2
metadata +2 -2

data/vendor/tmp/llama.cpp/ggml-sycl.cpp (changed)

@@ -8330,24 +8330,26 @@ static void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict_
     const int blocks_per_row = ncols / qk;
     const int blocks_per_warp = vdr * WARP_SIZE / qi;
-// partial sum for each thread
+    const int qi_vdr = (qi / vdr); // N_threads processing 1 qk block
+    // partial sum for each thread
     float tmp = 0.0f;
     const block_q_t  * x = (const block_q_t  *) vx;
     const block_q8_1 * y = (const block_q8_1 *) vy;
-    for (int i = item_ct1.get_local_id(2) / (qi / vdr); i < blocks_per_row;
+    for (int i = item_ct1.get_local_id(2) / qi_vdr; i < blocks_per_row;
          i += blocks_per_warp) {
-        const int ibx = row*blocks_per_row + i; // x block index
+      const int ibx = row * blocks_per_row + i; // x block index
-        const int iby = i * (qk/QK8_1); // y block index that aligns with ibx
+      const int iby = i * (qk / QK8_1); // y block index that aligns with ibx
-        const int iqs =
-            vdr *
-            (item_ct1.get_local_id(2) %
-             (qi / vdr)); // x block quant index when casting the quants to int
+      const int iqs =
+          vdr *
+          (item_ct1.get_local_id(2) -
+           i * qi_vdr); // x block quant index when casting the quants to int
-        tmp += vec_dot_q_sycl(&x[ibx], &y[iby], iqs);
+      tmp += vec_dot_q_sycl(&x[ibx], &y[iby], iqs);
     }
     // sum up partial sums and write back result