llama_cpp 0.15.4 → 0.16.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +16 -0
- data/ext/llama_cpp/extconf.rb +3 -2
- data/ext/llama_cpp/llama_cpp.cpp +17 -3
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +15 -1
- data/vendor/tmp/llama.cpp/Makefile +166 -82
- data/vendor/tmp/llama.cpp/ggml-alloc.c +82 -26
- data/vendor/tmp/llama.cpp/ggml-backend-impl.h +20 -8
- data/vendor/tmp/llama.cpp/ggml-backend.c +183 -69
- data/vendor/tmp/llama.cpp/ggml-backend.h +4 -4
- data/vendor/tmp/llama.cpp/ggml-blas.cpp +363 -0
- data/vendor/tmp/llama.cpp/ggml-blas.h +23 -0
- data/vendor/tmp/llama.cpp/ggml-common.h +6 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/acc.cu +47 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/arange.cu +34 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/argsort.cu +104 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/binbcast.cu +280 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/clamp.cu +34 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/concat.cu +196 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/convert.cu +686 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/cpy.cu +490 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/diagmask.cu +40 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/dmmv.cu +674 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/fattn-tile-f16.cu +319 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/fattn-tile-f32.cu +312 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/fattn.cu +345 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/getrows.cu +178 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/im2col.cu +104 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/mmq.cu +88 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/mmvq.cu +419 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/norm.cu +221 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/pad.cu +49 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/pool2d.cu +94 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/quantize.cu +112 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/rope.cu +271 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/scale.cu +31 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/softmax.cu +206 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/sumrows.cu +40 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb16.cu +10 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb32.cu +9 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb16.cu +10 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb32.cu +10 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb8.cu +8 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q2_k.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q3_k.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_k.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_k.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q6_k.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/tsembd.cu +47 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/unary.cu +286 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/upscale.cu +51 -0
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +103 -135
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +29 -13
- data/vendor/tmp/llama.cpp/ggml-metal.h +1 -1
- data/vendor/tmp/llama.cpp/ggml-metal.m +45 -33
- data/vendor/tmp/llama.cpp/ggml-metal.metal +83 -59
- data/vendor/tmp/llama.cpp/ggml-rpc.cpp +15 -14
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +26 -90
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +74522 -14913
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +631 -471
- data/vendor/tmp/llama.cpp/ggml.c +278 -603
- data/vendor/tmp/llama.cpp/ggml.h +9 -28
- data/vendor/tmp/llama.cpp/llama.cpp +345 -473
- data/vendor/tmp/llama.cpp/llama.h +21 -43
- metadata +134 -7
- data/vendor/tmp/llama.cpp/ggml-mpi.c +0 -216
- data/vendor/tmp/llama.cpp/ggml-mpi.h +0 -39
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +0 -2305
- data/vendor/tmp/llama.cpp/ggml-opencl.h +0 -36
|
@@ -8928,49 +8928,6 @@ static void rope_neox(
|
|
|
8928
8928
|
dst[i + n_dims/2] = x0*sin_theta + x1*cos_theta;
|
|
8929
8929
|
}
|
|
8930
8930
|
|
|
8931
|
-
static void rope_glm_f32(
|
|
8932
|
-
const float * x, float * dst, int ncols, const int32_t * pos, float freq_scale, int p_delta_rows, float freq_base,
|
|
8933
|
-
int n_ctx
|
|
8934
|
-
, const sycl::nd_item<3> &item_ct1) {
|
|
8935
|
-
const int col = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
|
|
8936
|
-
item_ct1.get_local_id(2);
|
|
8937
|
-
const int half_n_dims = ncols/4;
|
|
8938
|
-
|
|
8939
|
-
if (col >= half_n_dims) {
|
|
8940
|
-
return;
|
|
8941
|
-
}
|
|
8942
|
-
|
|
8943
|
-
const int row = item_ct1.get_local_range(1) * item_ct1.get_group(1) +
|
|
8944
|
-
item_ct1.get_local_id(1);
|
|
8945
|
-
const int i = row*ncols + col;
|
|
8946
|
-
const int i2 = row/p_delta_rows;
|
|
8947
|
-
|
|
8948
|
-
const float col_theta_scale = dpct::pow(freq_base, -2.0f * col / ncols);
|
|
8949
|
-
// FIXME: this is likely wrong
|
|
8950
|
-
const int p = pos != nullptr ? pos[i2] : 0;
|
|
8951
|
-
|
|
8952
|
-
const float theta = sycl::min(p, n_ctx - 2) * freq_scale * col_theta_scale;
|
|
8953
|
-
const float sin_theta = sycl::sin((float)theta);
|
|
8954
|
-
const float cos_theta = sycl::cos((float)theta);
|
|
8955
|
-
|
|
8956
|
-
const float x0 = x[i + 0];
|
|
8957
|
-
const float x1 = x[i + half_n_dims];
|
|
8958
|
-
|
|
8959
|
-
dst[i + 0] = x0*cos_theta - x1*sin_theta;
|
|
8960
|
-
dst[i + half_n_dims] = x0*sin_theta + x1*cos_theta;
|
|
8961
|
-
|
|
8962
|
-
const float block_theta =
|
|
8963
|
-
((float)sycl::max(p - n_ctx - 2, 0)) * col_theta_scale;
|
|
8964
|
-
const float sin_block_theta = sycl::sin((float)block_theta);
|
|
8965
|
-
const float cos_block_theta = sycl::cos((float)block_theta);
|
|
8966
|
-
|
|
8967
|
-
const float x2 = x[i + half_n_dims * 2];
|
|
8968
|
-
const float x3 = x[i + half_n_dims * 3];
|
|
8969
|
-
|
|
8970
|
-
dst[i + half_n_dims * 2] = x2*cos_block_theta - x3*sin_block_theta;
|
|
8971
|
-
dst[i + half_n_dims * 3] = x2*sin_block_theta + x3*cos_block_theta;
|
|
8972
|
-
}
|
|
8973
|
-
|
|
8974
8931
|
static void k_sum_rows_f32(const float * x, float * dst, const int ncols,
|
|
8975
8932
|
const sycl::nd_item<3> &item_ct1) {
|
|
8976
8933
|
const int row = item_ct1.get_group(1);
|
|
@@ -9151,6 +9108,7 @@ static void soft_max_f32(const float * x, const float * mask, float * dst, const
|
|
|
9151
9108
|
// find the sum of exps in the block
|
|
9152
9109
|
tmp = warp_reduce_sum(tmp, item_ct1);
|
|
9153
9110
|
if (block_size > WARP_SIZE) {
|
|
9111
|
+
item_ct1.barrier(sycl::access::fence_space::local_space);
|
|
9154
9112
|
if (warp_id == 0) {
|
|
9155
9113
|
buf[lane_id] = 0.f;
|
|
9156
9114
|
}
|
|
@@ -12520,22 +12478,6 @@ static void rope_neox_sycl(const T *x, T *dst, int ncols, int n_dims, int nrows,
|
|
|
12520
12478
|
}
|
|
12521
12479
|
}
|
|
12522
12480
|
|
|
12523
|
-
static void rope_glm_f32_sycl(const float *x, float *dst, int ncols, int nrows,
|
|
12524
|
-
const int32_t *pos, float freq_scale,
|
|
12525
|
-
int p_delta_rows, float freq_base, int n_ctx,
|
|
12526
|
-
dpct::queue_ptr stream) {
|
|
12527
|
-
GGML_ASSERT(ncols % 4 == 0);
|
|
12528
|
-
const sycl::range<3> block_dims(1, 1, SYCL_ROPE_BLOCK_SIZE / 4);
|
|
12529
|
-
const int num_blocks_x = (ncols + SYCL_ROPE_BLOCK_SIZE - 1) / SYCL_ROPE_BLOCK_SIZE;
|
|
12530
|
-
const sycl::range<3> block_nums(1, nrows, num_blocks_x);
|
|
12531
|
-
stream->parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
|
12532
|
-
[=](sycl::nd_item<3> item_ct1) {
|
|
12533
|
-
rope_glm_f32(x, dst, ncols, pos, freq_scale,
|
|
12534
|
-
p_delta_rows, freq_base, n_ctx,
|
|
12535
|
-
item_ct1);
|
|
12536
|
-
});
|
|
12537
|
-
}
|
|
12538
|
-
|
|
12539
12481
|
static void sum_rows_f32_sycl(const float *x, float *dst, const int ncols,
|
|
12540
12482
|
const int nrows, dpct::queue_ptr stream) {
|
|
12541
12483
|
const sycl::range<3> block_dims(1, 1, WARP_SIZE);
|
|
@@ -13147,10 +13089,12 @@ void *ggml_sycl_host_malloc(size_t size) try {
|
|
|
13147
13089
|
return nullptr;
|
|
13148
13090
|
}
|
|
13149
13091
|
|
|
13092
|
+
ggml_sycl_set_device(g_main_device);
|
|
13093
|
+
dpct::queue_ptr main_stream = g_syclStreams[g_main_device][0];
|
|
13094
|
+
|
|
13150
13095
|
void * ptr = nullptr;
|
|
13151
|
-
//allow to use dpct::get_in_order_queue() for host malloc
|
|
13152
13096
|
dpct::err0 err = CHECK_TRY_ERROR(
|
|
13153
|
-
ptr = (void *)sycl::malloc_host(size, dpct::get_in_order_queue()));
|
|
13097
|
+
ptr = (void *)sycl::malloc_host(size, *main_stream));
|
|
13154
13098
|
|
|
13155
13099
|
if (err != 0) {
|
|
13156
13100
|
// clear the error
|
|
@@ -13171,8 +13115,9 @@ catch (sycl::exception const &exc) {
|
|
|
13171
13115
|
}
|
|
13172
13116
|
|
|
13173
13117
|
void ggml_sycl_host_free(void *ptr) try {
|
|
13174
|
-
|
|
13175
|
-
|
|
13118
|
+
ggml_sycl_set_device(g_main_device);
|
|
13119
|
+
dpct::queue_ptr main_stream = g_syclStreams[g_main_device][0];
|
|
13120
|
+
SYCL_CHECK(CHECK_TRY_ERROR(sycl::free(ptr, *main_stream)));
|
|
13176
13121
|
}
|
|
13177
13122
|
catch (sycl::exception const &exc) {
|
|
13178
13123
|
std::cerr << exc.what() << "Exception caught at file:" << __FILE__
|
|
@@ -14066,8 +14011,8 @@ inline void ggml_sycl_op_rope(const ggml_tensor *src0, const ggml_tensor *src1,
|
|
|
14066
14011
|
//const int n_past = ((int32_t *) dst->op_params)[0];
|
|
14067
14012
|
const int n_dims = ((int32_t *) dst->op_params)[1];
|
|
14068
14013
|
const int mode = ((int32_t *) dst->op_params)[2];
|
|
14069
|
-
const int n_ctx = ((int32_t *) dst->op_params)[3];
|
|
14070
|
-
const int n_orig_ctx = ((int32_t *) dst->op_params)[4];
|
|
14014
|
+
//const int n_ctx = ((int32_t *) dst->op_params)[3];
|
|
14015
|
+
const int n_ctx_orig = ((int32_t *) dst->op_params)[4];
|
|
14071
14016
|
|
|
14072
14017
|
// RoPE alteration for extended context
|
|
14073
14018
|
float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
|
|
@@ -14087,7 +14032,9 @@ inline void ggml_sycl_op_rope(const ggml_tensor *src0, const ggml_tensor *src1,
|
|
|
14087
14032
|
}
|
|
14088
14033
|
|
|
14089
14034
|
const bool is_neox = mode & 2;
|
|
14090
|
-
|
|
14035
|
+
|
|
14036
|
+
#pragma message("TODO: update rope NORM mode to match NEOX mode")
|
|
14037
|
+
#pragma message(" https://github.com/ggerganov/llama.cpp/pull/7634")
|
|
14091
14038
|
|
|
14092
14039
|
if (is_neox) {
|
|
14093
14040
|
pos = (const int32_t *) src1_dd;
|
|
@@ -14100,13 +14047,10 @@ inline void ggml_sycl_op_rope(const ggml_tensor *src0, const ggml_tensor *src1,
|
|
|
14100
14047
|
}
|
|
14101
14048
|
|
|
14102
14049
|
rope_corr_dims corr_dims;
|
|
14103
|
-
ggml_rope_yarn_corr_dims(n_dims, n_orig_ctx, freq_base, beta_fast, beta_slow, corr_dims.v);
|
|
14050
|
+
ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims.v);
|
|
14104
14051
|
|
|
14105
14052
|
// compute
|
|
14106
|
-
if (is_glm) {
|
|
14107
|
-
GGML_ASSERT(false);
|
|
14108
|
-
rope_glm_f32_sycl(src0_dd, dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, n_ctx, main_stream);
|
|
14109
|
-
} else if (is_neox) {
|
|
14053
|
+
if (is_neox) {
|
|
14110
14054
|
if (src0->type == GGML_TYPE_F32) {
|
|
14111
14055
|
rope_neox_sycl(
|
|
14112
14056
|
(const float *)src0_dd, (float *)dst_dd, ne00, n_dims, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
|
|
@@ -16631,22 +16575,12 @@ GGML_CALL static size_t ggml_backend_sycl_buffer_type_get_alloc_size(ggml_backen
|
|
|
16631
16575
|
UNUSED(buft);
|
|
16632
16576
|
}
|
|
16633
16577
|
|
|
16634
|
-
GGML_CALL static bool ggml_backend_sycl_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
|
|
16635
|
-
if (!ggml_backend_is_sycl(backend)) {
|
|
16636
|
-
return false;
|
|
16637
|
-
}
|
|
16638
|
-
ggml_backend_sycl_buffer_type_context * buft_ctx = (ggml_backend_sycl_buffer_type_context *)buft->context;
|
|
16639
|
-
ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
|
|
16640
|
-
return buft_ctx->device == sycl_ctx->device;
|
|
16641
|
-
}
|
|
16642
|
-
|
|
16643
16578
|
static ggml_backend_buffer_type_i ggml_backend_sycl_buffer_type_interface = {
|
|
16644
16579
|
/* .get_name = */ ggml_backend_sycl_buffer_type_name,
|
|
16645
16580
|
/* .alloc_buffer = */ ggml_backend_sycl_buffer_type_alloc_buffer,
|
|
16646
16581
|
/* .get_alignment = */ ggml_backend_sycl_buffer_type_get_alignment,
|
|
16647
16582
|
/* .get_max_size = */ ggml_backend_sycl_buffer_type_get_max_size,
|
|
16648
16583
|
/* .get_alloc_size = */ ggml_backend_sycl_buffer_type_get_alloc_size,
|
|
16649
|
-
/* .supports_backend = */ ggml_backend_sycl_buffer_type_supports_backend,
|
|
16650
16584
|
/* .is_host = */ nullptr,
|
|
16651
16585
|
};
|
|
16652
16586
|
|
|
@@ -16998,12 +16932,6 @@ GGML_CALL static size_t ggml_backend_sycl_split_buffer_type_get_alloc_size(ggml_
|
|
|
16998
16932
|
return total_size;
|
|
16999
16933
|
}
|
|
17000
16934
|
|
|
17001
|
-
GGML_CALL static bool ggml_backend_sycl_split_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
|
|
17002
|
-
return ggml_backend_is_sycl(backend);
|
|
17003
|
-
|
|
17004
|
-
UNUSED(buft);
|
|
17005
|
-
}
|
|
17006
|
-
|
|
17007
16935
|
GGML_CALL static bool ggml_backend_sycl_split_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
|
|
17008
16936
|
return false;
|
|
17009
16937
|
|
|
@@ -17016,7 +16944,6 @@ static ggml_backend_buffer_type_i ggml_backend_sycl_split_buffer_type_interface
|
|
|
17016
16944
|
/* .get_alignment = */ ggml_backend_sycl_split_buffer_type_get_alignment,
|
|
17017
16945
|
/* .get_max_size = */ NULL, // defaults to SIZE_MAX
|
|
17018
16946
|
/* .get_alloc_size = */ ggml_backend_sycl_split_buffer_type_get_alloc_size,
|
|
17019
|
-
/* .supports_backend = */ ggml_backend_sycl_split_buffer_type_supports_backend,
|
|
17020
16947
|
/* .is_host = */ ggml_backend_sycl_split_buffer_type_is_host,
|
|
17021
16948
|
};
|
|
17022
16949
|
|
|
@@ -17102,7 +17029,6 @@ ggml_backend_buffer_type_t ggml_backend_sycl_host_buffer_type() {
|
|
|
17102
17029
|
/* .get_alignment = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
|
|
17103
17030
|
/* .get_max_size = */ NULL, // TODO: return device.maxBufferLength
|
|
17104
17031
|
/* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
|
|
17105
|
-
/* .supports_backend = */ ggml_backend_cpu_buffer_type()->iface.supports_backend,
|
|
17106
17032
|
/* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host,
|
|
17107
17033
|
},
|
|
17108
17034
|
/* .context = */ nullptr,
|
|
@@ -17246,7 +17172,7 @@ GGML_CALL static bool ggml_backend_sycl_supports_op(ggml_backend_t backend, cons
|
|
|
17246
17172
|
case GGML_UNARY_OP_HARDSWISH:
|
|
17247
17173
|
case GGML_UNARY_OP_GELU_QUICK:
|
|
17248
17174
|
case GGML_UNARY_OP_TANH:
|
|
17249
|
-
return true;
|
|
17175
|
+
return ggml_is_contiguous(op->src[0]);
|
|
17250
17176
|
default:
|
|
17251
17177
|
return false;
|
|
17252
17178
|
}
|
|
@@ -17367,6 +17293,14 @@ GGML_CALL static bool ggml_backend_sycl_offload_op(ggml_backend_t backend, const
|
|
|
17367
17293
|
GGML_UNUSED(backend);
|
|
17368
17294
|
}
|
|
17369
17295
|
|
|
17296
|
+
GGML_CALL static bool ggml_backend_sycl_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
|
|
17297
|
+
if (buft->iface.get_name != ggml_backend_sycl_buffer_type_name) {
|
|
17298
|
+
return false;
|
|
17299
|
+
}
|
|
17300
|
+
ggml_backend_sycl_buffer_type_context * buft_ctx = (ggml_backend_sycl_buffer_type_context *)buft->context;
|
|
17301
|
+
ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
|
|
17302
|
+
return buft_ctx->device == sycl_ctx->device;
|
|
17303
|
+
}
|
|
17370
17304
|
|
|
17371
17305
|
static ggml_backend_i ggml_backend_sycl_interface = {
|
|
17372
17306
|
/* .get_name = */ ggml_backend_sycl_name,
|
|
@@ -17378,9 +17312,11 @@ static ggml_backend_i ggml_backend_sycl_interface = {
|
|
|
17378
17312
|
/* .synchronize = */ ggml_backend_sycl_synchronize,
|
|
17379
17313
|
/* .graph_plan_create = */ NULL,
|
|
17380
17314
|
/* .graph_plan_free = */ NULL,
|
|
17315
|
+
/* .graph_plan_update = */ NULL,
|
|
17381
17316
|
/* .graph_plan_compute = */ NULL,
|
|
17382
17317
|
/* .graph_compute = */ ggml_backend_sycl_graph_compute,
|
|
17383
17318
|
/* .supports_op = */ ggml_backend_sycl_supports_op,
|
|
17319
|
+
/* .supports_buft = */ ggml_backend_sycl_supports_buft,
|
|
17384
17320
|
/* .offload_op = */ ggml_backend_sycl_offload_op,
|
|
17385
17321
|
/* .event_new = */ NULL,
|
|
17386
17322
|
/* .event_free = */ NULL,
|