llama_cpp 0.15.0 → 0.15.1
This diff shows the changes between two publicly available versions of a package, each of which has been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +6 -0
- data/ext/llama_cpp/llama_cpp.cpp +6 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +6 -0
- data/vendor/tmp/llama.cpp/Makefile +3 -4
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +289 -17
- data/vendor/tmp/llama.cpp/ggml-impl.h +77 -0
- data/vendor/tmp/llama.cpp/ggml-metal.m +23 -8
- data/vendor/tmp/llama.cpp/ggml-metal.metal +1 -1
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +1 -0
- data/vendor/tmp/llama.cpp/ggml-quants.c +18 -0
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +11 -9
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +46843 -39205
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +950 -267
- data/vendor/tmp/llama.cpp/ggml.c +1090 -89
- data/vendor/tmp/llama.cpp/ggml.h +15 -7
- data/vendor/tmp/llama.cpp/llama.cpp +57 -17
- data/vendor/tmp/llama.cpp/llama.h +7 -1
- data/vendor/tmp/llama.cpp/sgemm.cpp +56 -21
- data/vendor/tmp/llama.cpp/unicode-data.cpp +1187 -655
- data/vendor/tmp/llama.cpp/unicode-data.h +2 -1
- data/vendor/tmp/llama.cpp/unicode.cpp +254 -122
- data/vendor/tmp/llama.cpp/unicode.h +4 -2
- metadata +2 -2
@@ -8330,24 +8330,26 @@ static void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict_
|
|
8330
8330
|
const int blocks_per_row = ncols / qk;
|
8331
8331
|
const int blocks_per_warp = vdr * WARP_SIZE / qi;
|
8332
8332
|
|
8333
|
-
//
|
8333
|
+
const int qi_vdr = (qi / vdr); // N_threads processing 1 qk block
|
8334
|
+
|
8335
|
+
// partial sum for each thread
|
8334
8336
|
float tmp = 0.0f;
|
8335
8337
|
|
8336
8338
|
const block_q_t * x = (const block_q_t *) vx;
|
8337
8339
|
const block_q8_1 * y = (const block_q8_1 *) vy;
|
8338
8340
|
|
8339
|
-
for (int i = item_ct1.get_local_id(2) /
|
8341
|
+
for (int i = item_ct1.get_local_id(2) / qi_vdr; i < blocks_per_row;
|
8340
8342
|
i += blocks_per_warp) {
|
8341
|
-
|
8343
|
+
const int ibx = row * blocks_per_row + i; // x block index
|
8342
8344
|
|
8343
|
-
|
8345
|
+
const int iby = i * (qk / QK8_1); // y block index that aligns with ibx
|
8344
8346
|
|
8345
|
-
|
8346
|
-
|
8347
|
-
|
8348
|
-
|
8347
|
+
const int iqs =
|
8348
|
+
vdr *
|
8349
|
+
(item_ct1.get_local_id(2) -
|
8350
|
+
i * qi_vdr); // x block quant index when casting the quants to int
|
8349
8351
|
|
8350
|
-
|
8352
|
+
tmp += vec_dot_q_sycl(&x[ibx], &y[iby], iqs);
|
8351
8353
|
}
|
8352
8354
|
|
8353
8355
|
// sum up partial sums and write back result
|