llama_cpp 0.15.0 → 0.15.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +6 -0
- data/ext/llama_cpp/llama_cpp.cpp +6 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +6 -0
- data/vendor/tmp/llama.cpp/Makefile +3 -4
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +289 -17
- data/vendor/tmp/llama.cpp/ggml-impl.h +77 -0
- data/vendor/tmp/llama.cpp/ggml-metal.m +23 -8
- data/vendor/tmp/llama.cpp/ggml-metal.metal +1 -1
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +1 -0
- data/vendor/tmp/llama.cpp/ggml-quants.c +18 -0
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +11 -9
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +46843 -39205
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +950 -267
- data/vendor/tmp/llama.cpp/ggml.c +1090 -89
- data/vendor/tmp/llama.cpp/ggml.h +15 -7
- data/vendor/tmp/llama.cpp/llama.cpp +57 -17
- data/vendor/tmp/llama.cpp/llama.h +7 -1
- data/vendor/tmp/llama.cpp/sgemm.cpp +56 -21
- data/vendor/tmp/llama.cpp/unicode-data.cpp +1187 -655
- data/vendor/tmp/llama.cpp/unicode-data.h +2 -1
- data/vendor/tmp/llama.cpp/unicode.cpp +254 -122
- data/vendor/tmp/llama.cpp/unicode.h +4 -2
- metadata +2 -2
@@ -8330,24 +8330,26 @@ static void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict_
|
|
8330
8330
|
const int blocks_per_row = ncols / qk;
|
8331
8331
|
const int blocks_per_warp = vdr * WARP_SIZE / qi;
|
8332
8332
|
|
8333
|
-
//
|
8333
|
+
const int qi_vdr = (qi / vdr); // N_threads processing 1 qk block
|
8334
|
+
|
8335
|
+
// partial sum for each thread
|
8334
8336
|
float tmp = 0.0f;
|
8335
8337
|
|
8336
8338
|
const block_q_t * x = (const block_q_t *) vx;
|
8337
8339
|
const block_q8_1 * y = (const block_q8_1 *) vy;
|
8338
8340
|
|
8339
|
-
for (int i = item_ct1.get_local_id(2) /
|
8341
|
+
for (int i = item_ct1.get_local_id(2) / qi_vdr; i < blocks_per_row;
|
8340
8342
|
i += blocks_per_warp) {
|
8341
|
-
|
8343
|
+
const int ibx = row * blocks_per_row + i; // x block index
|
8342
8344
|
|
8343
|
-
|
8345
|
+
const int iby = i * (qk / QK8_1); // y block index that aligns with ibx
|
8344
8346
|
|
8345
|
-
|
8346
|
-
|
8347
|
-
|
8348
|
-
|
8347
|
+
const int iqs =
|
8348
|
+
vdr *
|
8349
|
+
(item_ct1.get_local_id(2) -
|
8350
|
+
i * qi_vdr); // x block quant index when casting the quants to int
|
8349
8351
|
|
8350
|
-
|
8352
|
+
tmp += vec_dot_q_sycl(&x[ibx], &y[iby], iqs);
|
8351
8353
|
}
|
8352
8354
|
|
8353
8355
|
// sum up partial sums and write back result
|