llama_cpp 0.15.4 → 0.16.1
- checksums.yaml +4 -4
- data/CHANGELOG.md +16 -0
- data/ext/llama_cpp/extconf.rb +3 -2
- data/ext/llama_cpp/llama_cpp.cpp +17 -3
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +15 -1
- data/vendor/tmp/llama.cpp/Makefile +166 -82
- data/vendor/tmp/llama.cpp/ggml-alloc.c +82 -26
- data/vendor/tmp/llama.cpp/ggml-backend-impl.h +20 -8
- data/vendor/tmp/llama.cpp/ggml-backend.c +183 -69
- data/vendor/tmp/llama.cpp/ggml-backend.h +4 -4
- data/vendor/tmp/llama.cpp/ggml-blas.cpp +363 -0
- data/vendor/tmp/llama.cpp/ggml-blas.h +23 -0
- data/vendor/tmp/llama.cpp/ggml-common.h +6 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/acc.cu +47 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/arange.cu +34 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/argsort.cu +104 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/binbcast.cu +280 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/clamp.cu +34 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/concat.cu +196 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/convert.cu +686 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/cpy.cu +490 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/diagmask.cu +40 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/dmmv.cu +674 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/fattn-tile-f16.cu +319 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/fattn-tile-f32.cu +312 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/fattn.cu +345 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/getrows.cu +178 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/im2col.cu +104 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/mmq.cu +88 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/mmvq.cu +419 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/norm.cu +221 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/pad.cu +49 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/pool2d.cu +94 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/quantize.cu +112 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/rope.cu +271 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/scale.cu +31 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/softmax.cu +206 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/sumrows.cu +40 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb16.cu +10 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb32.cu +9 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb16.cu +10 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb32.cu +10 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb8.cu +8 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q2_k.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q3_k.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_k.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_k.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q6_k.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/tsembd.cu +47 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/unary.cu +286 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/upscale.cu +51 -0
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +103 -135
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +29 -13
- data/vendor/tmp/llama.cpp/ggml-metal.h +1 -1
- data/vendor/tmp/llama.cpp/ggml-metal.m +45 -33
- data/vendor/tmp/llama.cpp/ggml-metal.metal +83 -59
- data/vendor/tmp/llama.cpp/ggml-rpc.cpp +15 -14
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +26 -90
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +74522 -14913
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +631 -471
- data/vendor/tmp/llama.cpp/ggml.c +278 -603
- data/vendor/tmp/llama.cpp/ggml.h +9 -28
- data/vendor/tmp/llama.cpp/llama.cpp +345 -473
- data/vendor/tmp/llama.cpp/llama.h +21 -43
- metadata +134 -7
- data/vendor/tmp/llama.cpp/ggml-mpi.c +0 -216
- data/vendor/tmp/llama.cpp/ggml-mpi.h +0 -39
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +0 -2305
- data/vendor/tmp/llama.cpp/ggml-opencl.h +0 -36
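To pick up this release in a Bundler-managed project, the usual pin-and-update flow is enough; a minimal sketch (the pessimistic version constraint is illustrative, adjust to your own policy):

```ruby
# Gemfile — pin llama_cpp to the new release
gem 'llama_cpp', '~> 0.16.1'
```

followed by `bundle update llama_cpp`.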
data/vendor/tmp/llama.cpp/ggml-sycl.cpp

@@ -8928,49 +8928,6 @@ static void rope_neox(
         dst[i + n_dims/2] = x0*sin_theta + x1*cos_theta;
 }
 
-static void rope_glm_f32(
-    const float * x, float * dst, int ncols, const int32_t * pos, float freq_scale, int p_delta_rows, float freq_base,
-    int n_ctx
-, const sycl::nd_item<3> &item_ct1) {
-    const int col = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
-                    item_ct1.get_local_id(2);
-    const int half_n_dims = ncols/4;
-
-    if (col >= half_n_dims) {
-        return;
-    }
-
-    const int row = item_ct1.get_local_range(1) * item_ct1.get_group(1) +
-                    item_ct1.get_local_id(1);
-    const int i = row*ncols + col;
-    const int i2 = row/p_delta_rows;
-
-    const float col_theta_scale = dpct::pow(freq_base, -2.0f * col / ncols);
-    // FIXME: this is likely wrong
-    const int p = pos != nullptr ? pos[i2] : 0;
-
-    const float theta = sycl::min(p, n_ctx - 2) * freq_scale * col_theta_scale;
-    const float sin_theta = sycl::sin((float)theta);
-    const float cos_theta = sycl::cos((float)theta);
-
-    const float x0 = x[i + 0];
-    const float x1 = x[i + half_n_dims];
-
-    dst[i + 0] = x0*cos_theta - x1*sin_theta;
-    dst[i + half_n_dims] = x0*sin_theta + x1*cos_theta;
-
-    const float block_theta =
-        ((float)sycl::max(p - n_ctx - 2, 0)) * col_theta_scale;
-    const float sin_block_theta = sycl::sin((float)block_theta);
-    const float cos_block_theta = sycl::cos((float)block_theta);
-
-    const float x2 = x[i + half_n_dims * 2];
-    const float x3 = x[i + half_n_dims * 3];
-
-    dst[i + half_n_dims * 2] = x2*cos_block_theta - x3*sin_block_theta;
-    dst[i + half_n_dims * 3] = x2*sin_block_theta + x3*cos_block_theta;
-}
-
 static void k_sum_rows_f32(const float * x, float * dst, const int ncols,
                            const sycl::nd_item<3> &item_ct1) {
     const int row = item_ct1.get_group(1);

@@ -9151,6 +9108,7 @@ static void soft_max_f32(const float * x, const float * mask, float * dst, const
     // find the sum of exps in the block
     tmp = warp_reduce_sum(tmp, item_ct1);
     if (block_size > WARP_SIZE) {
+        item_ct1.barrier(sycl::access::fence_space::local_space);
         if (warp_id == 0) {
             buf[lane_id] = 0.f;
         }

@@ -12520,22 +12478,6 @@ static void rope_neox_sycl(const T *x, T *dst, int ncols, int n_dims, int nrows,
     }
 }
 
-static void rope_glm_f32_sycl(const float *x, float *dst, int ncols, int nrows,
-                              const int32_t *pos, float freq_scale,
-                              int p_delta_rows, float freq_base, int n_ctx,
-                              dpct::queue_ptr stream) {
-    GGML_ASSERT(ncols % 4 == 0);
-    const sycl::range<3> block_dims(1, 1, SYCL_ROPE_BLOCK_SIZE / 4);
-    const int num_blocks_x = (ncols + SYCL_ROPE_BLOCK_SIZE - 1) / SYCL_ROPE_BLOCK_SIZE;
-    const sycl::range<3> block_nums(1, nrows, num_blocks_x);
-    stream->parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims),
-                         [=](sycl::nd_item<3> item_ct1) {
-                             rope_glm_f32(x, dst, ncols, pos, freq_scale,
-                                          p_delta_rows, freq_base, n_ctx,
-                                          item_ct1);
-                         });
-}
-
 static void sum_rows_f32_sycl(const float *x, float *dst, const int ncols,
                               const int nrows, dpct::queue_ptr stream) {
     const sycl::range<3> block_dims(1, 1, WARP_SIZE);

@@ -13147,10 +13089,12 @@ void *ggml_sycl_host_malloc(size_t size) try {
         return nullptr;
     }
 
+    ggml_sycl_set_device(g_main_device);
+    dpct::queue_ptr main_stream = g_syclStreams[g_main_device][0];
+
     void * ptr = nullptr;
-    //allow to use dpct::get_in_order_queue() for host malloc
     dpct::err0 err = CHECK_TRY_ERROR(
-        ptr = (void *)sycl::malloc_host(size,
+        ptr = (void *)sycl::malloc_host(size, *main_stream));
 
     if (err != 0) {
         // clear the error

@@ -13171,8 +13115,9 @@ catch (sycl::exception const &exc) {
 }
 
 void ggml_sycl_host_free(void *ptr) try {
-
-
+    ggml_sycl_set_device(g_main_device);
+    dpct::queue_ptr main_stream = g_syclStreams[g_main_device][0];
+    SYCL_CHECK(CHECK_TRY_ERROR(sycl::free(ptr, *main_stream)));
 }
 catch (sycl::exception const &exc) {
     std::cerr << exc.what() << "Exception caught at file:" << __FILE__

@@ -14066,8 +14011,8 @@ inline void ggml_sycl_op_rope(const ggml_tensor *src0, const ggml_tensor *src1,
     //const int n_past = ((int32_t *) dst->op_params)[0];
     const int n_dims = ((int32_t *) dst->op_params)[1];
     const int mode = ((int32_t *) dst->op_params)[2];
-    const int n_ctx = ((int32_t *) dst->op_params)[3];
-    const int
+    //const int n_ctx = ((int32_t *) dst->op_params)[3];
+    const int n_ctx_orig = ((int32_t *) dst->op_params)[4];
 
     // RoPE alteration for extended context
     float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;

@@ -14087,7 +14032,9 @@ inline void ggml_sycl_op_rope(const ggml_tensor *src0, const ggml_tensor *src1,
     }
 
     const bool is_neox = mode & 2;
-
+
+#pragma message("TODO: update rope NORM mode to match NEOX mode")
+#pragma message(" https://github.com/ggerganov/llama.cpp/pull/7634")
 
     if (is_neox) {
         pos = (const int32_t *) src1_dd;

@@ -14100,13 +14047,10 @@ inline void ggml_sycl_op_rope(const ggml_tensor *src0, const ggml_tensor *src1,
     }
 
     rope_corr_dims corr_dims;
-    ggml_rope_yarn_corr_dims(n_dims,
+    ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims.v);
 
     // compute
-    if (
-        GGML_ASSERT(false);
-        rope_glm_f32_sycl(src0_dd, dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, n_ctx, main_stream);
-    } else if (is_neox) {
+    if (is_neox) {
         if (src0->type == GGML_TYPE_F32) {
             rope_neox_sycl(
                 (const float *)src0_dd, (float *)dst_dd, ne00, n_dims, nrows, pos, freq_scale, ne01, freq_base, ext_factor,

@@ -16631,22 +16575,12 @@ GGML_CALL static size_t ggml_backend_sycl_buffer_type_get_alloc_size(ggml_backen
     UNUSED(buft);
 }
 
-GGML_CALL static bool ggml_backend_sycl_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
-    if (!ggml_backend_is_sycl(backend)) {
-        return false;
-    }
-    ggml_backend_sycl_buffer_type_context * buft_ctx = (ggml_backend_sycl_buffer_type_context *)buft->context;
-    ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
-    return buft_ctx->device == sycl_ctx->device;
-}
-
 static ggml_backend_buffer_type_i ggml_backend_sycl_buffer_type_interface = {
     /* .get_name = */ ggml_backend_sycl_buffer_type_name,
     /* .alloc_buffer = */ ggml_backend_sycl_buffer_type_alloc_buffer,
     /* .get_alignment = */ ggml_backend_sycl_buffer_type_get_alignment,
     /* .get_max_size = */ ggml_backend_sycl_buffer_type_get_max_size,
     /* .get_alloc_size = */ ggml_backend_sycl_buffer_type_get_alloc_size,
-    /* .supports_backend = */ ggml_backend_sycl_buffer_type_supports_backend,
     /* .is_host = */ nullptr,
 };
 

@@ -16998,12 +16932,6 @@ GGML_CALL static size_t ggml_backend_sycl_split_buffer_type_get_alloc_size(ggml_
     return total_size;
 }
 
-GGML_CALL static bool ggml_backend_sycl_split_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
-    return ggml_backend_is_sycl(backend);
-
-    UNUSED(buft);
-}
-
 GGML_CALL static bool ggml_backend_sycl_split_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
     return false;
 

@@ -17016,7 +16944,6 @@ static ggml_backend_buffer_type_i ggml_backend_sycl_split_buffer_type_interface
     /* .get_alignment = */ ggml_backend_sycl_split_buffer_type_get_alignment,
     /* .get_max_size = */ NULL, // defaults to SIZE_MAX
     /* .get_alloc_size = */ ggml_backend_sycl_split_buffer_type_get_alloc_size,
-    /* .supports_backend = */ ggml_backend_sycl_split_buffer_type_supports_backend,
     /* .is_host = */ ggml_backend_sycl_split_buffer_type_is_host,
 };
 

@@ -17102,7 +17029,6 @@ ggml_backend_buffer_type_t ggml_backend_sycl_host_buffer_type() {
         /* .get_alignment = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
         /* .get_max_size = */ NULL, // TODO: return device.maxBufferLength
         /* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
-        /* .supports_backend = */ ggml_backend_cpu_buffer_type()->iface.supports_backend,
        /* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host,
     },
     /* .context = */ nullptr,

@@ -17246,7 +17172,7 @@ GGML_CALL static bool ggml_backend_sycl_supports_op(ggml_backend_t backend, cons
         case GGML_UNARY_OP_HARDSWISH:
         case GGML_UNARY_OP_GELU_QUICK:
         case GGML_UNARY_OP_TANH:
-            return
+            return ggml_is_contiguous(op->src[0]);
         default:
             return false;
     }

@@ -17367,6 +17293,14 @@ GGML_CALL static bool ggml_backend_sycl_offload_op(ggml_backend_t backend, const
     GGML_UNUSED(backend);
 }
 
+GGML_CALL static bool ggml_backend_sycl_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
+    if (buft->iface.get_name != ggml_backend_sycl_buffer_type_name) {
+        return false;
+    }
+    ggml_backend_sycl_buffer_type_context * buft_ctx = (ggml_backend_sycl_buffer_type_context *)buft->context;
+    ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
+    return buft_ctx->device == sycl_ctx->device;
+}
 
 static ggml_backend_i ggml_backend_sycl_interface = {
     /* .get_name = */ ggml_backend_sycl_name,

@@ -17378,9 +17312,11 @@ static ggml_backend_i ggml_backend_sycl_interface = {
     /* .synchronize = */ ggml_backend_sycl_synchronize,
     /* .graph_plan_create = */ NULL,
     /* .graph_plan_free = */ NULL,
+    /* .graph_plan_update = */ NULL,
     /* .graph_plan_compute = */ NULL,
     /* .graph_compute = */ ggml_backend_sycl_graph_compute,
     /* .supports_op = */ ggml_backend_sycl_supports_op,
+    /* .supports_buft = */ ggml_backend_sycl_supports_buft,
     /* .offload_op = */ ggml_backend_sycl_offload_op,
     /* .event_new = */ NULL,
     /* .event_free = */ NULL,
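After upgrading, a quick runtime check confirms which gem and vendored llama.cpp build actually loaded; a minimal sketch, assuming the constant names defined in data/lib/llama_cpp/version.rb are unchanged by this release:

```ruby
require 'llama_cpp'

# Version of the Ruby gem itself.
puts LLaMACpp::VERSION            # expected: "0.16.1"
# Build tag of the llama.cpp sources vendored under data/vendor/tmp/llama.cpp.
puts LLaMACpp::LLAMA_CPP_VERSION
```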