llama_cpp 0.16.0 → 0.16.2
- checksums.yaml +4 -4
- data/CHANGELOG.md +13 -0
- data/ext/llama_cpp/extconf.rb +3 -0
- data/ext/llama_cpp/llama_cpp.cpp +14 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +4 -0
- data/vendor/tmp/llama.cpp/Makefile +119 -54
- data/vendor/tmp/llama.cpp/ggml-alloc.c +78 -22
- data/vendor/tmp/llama.cpp/ggml-backend-impl.h +20 -8
- data/vendor/tmp/llama.cpp/ggml-backend.c +190 -65
- data/vendor/tmp/llama.cpp/ggml-backend.h +6 -3
- data/vendor/tmp/llama.cpp/ggml-blas.cpp +363 -0
- data/vendor/tmp/llama.cpp/ggml-blas.h +23 -0
- data/vendor/tmp/llama.cpp/ggml-common.h +6 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/argsort.cu +1 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/dmmv.cu +21 -9
- data/vendor/tmp/llama.cpp/ggml-cuda/fattn-tile-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/mmq.cu +15 -1491
- data/vendor/tmp/llama.cpp/ggml-cuda/mmvq.cu +77 -62
- data/vendor/tmp/llama.cpp/ggml-cuda/quantize.cu +77 -10
- data/vendor/tmp/llama.cpp/ggml-cuda/softmax.cu +1 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb32.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb32.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb8.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q2_k.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q3_k.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_k.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_k.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q6_k.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/unary.cu +48 -0
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +95 -129
- data/vendor/tmp/llama.cpp/ggml-impl.h +1 -1
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +8 -7
- data/vendor/tmp/llama.cpp/ggml-metal.m +17 -9
- data/vendor/tmp/llama.cpp/ggml-quants.c +982 -368
- data/vendor/tmp/llama.cpp/ggml-rpc.cpp +21 -15
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +2133 -13215
- data/vendor/tmp/llama.cpp/ggml-sycl.h +1 -10
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +28826 -25037
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +438 -493
- data/vendor/tmp/llama.cpp/ggml.c +158 -414
- data/vendor/tmp/llama.cpp/ggml.h +6 -0
- data/vendor/tmp/llama.cpp/llama.cpp +628 -279
- data/vendor/tmp/llama.cpp/llama.h +9 -1
- data/vendor/tmp/llama.cpp/sgemm.cpp +2 -0
- data/vendor/tmp/llama.cpp/unicode-data.cpp +851 -801
- data/vendor/tmp/llama.cpp/unicode.cpp +33 -19
- data/vendor/tmp/llama.cpp/unicode.h +1 -1
- metadata +15 -3
data/vendor/tmp/llama.cpp/ggml-cuda/unary.cu

@@ -92,6 +92,15 @@ static __global__ void sqr_f32(const float * x, float * dst, const int k) {
     dst[i] = x[i] * x[i];
 }
 
+static __global__ void sqrt_f32(const float * x, float * dst, const int k) {
+    const int i = blockDim.x*blockIdx.x + threadIdx.x;
+
+    if (i >= k) {
+        return;
+    }
+    dst[i] = sqrtf(x[i]);
+}
+
 static void gelu_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
     const int num_blocks = (k + CUDA_GELU_BLOCK_SIZE - 1) / CUDA_GELU_BLOCK_SIZE;
     gelu_f32<<<num_blocks, CUDA_GELU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
@@ -142,12 +151,19 @@ static void sqr_f32_cuda(const float * x, float * dst, const int k, cudaStream_t
     sqr_f32<<<num_blocks, CUDA_SQR_BLOCK_SIZE, 0, stream>>>(x, dst, k);
 }
 
+static void sqrt_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
+    const int num_blocks = (k + CUDA_SQRT_BLOCK_SIZE - 1) / CUDA_SQRT_BLOCK_SIZE;
+    sqrt_f32<<<num_blocks, CUDA_SQRT_BLOCK_SIZE, 0, stream>>>(x, dst, k);
+}
+
 void ggml_cuda_op_gelu(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     const ggml_tensor * src0 = dst->src[0];
     const float * src0_d = (const float *)src0->data;
     float * dst_d = (float *)dst->data;
     cudaStream_t stream = ctx.stream();
 
+    GGML_ASSERT(ggml_is_contiguous(src0));
+
     GGML_ASSERT(src0->type == GGML_TYPE_F32);
     GGML_ASSERT( dst->type == GGML_TYPE_F32);
 
@@ -160,6 +176,8 @@ void ggml_cuda_op_silu(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     float * dst_d = (float *)dst->data;
     cudaStream_t stream = ctx.stream();
 
+    GGML_ASSERT(ggml_is_contiguous(src0));
+
     GGML_ASSERT(src0->type == GGML_TYPE_F32);
     GGML_ASSERT( dst->type == GGML_TYPE_F32);
 
@@ -172,6 +190,8 @@ void ggml_cuda_op_gelu_quick(ggml_backend_cuda_context & ctx, ggml_tensor * dst)
     float * dst_d = (float *)dst->data;
     cudaStream_t stream = ctx.stream();
 
+    GGML_ASSERT(ggml_is_contiguous(src0));
+
     GGML_ASSERT(src0->type == GGML_TYPE_F32);
     GGML_ASSERT( dst->type == GGML_TYPE_F32);
 
@@ -184,6 +204,8 @@ void ggml_cuda_op_tanh(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     float * dst_d = (float *)dst->data;
     cudaStream_t stream = ctx.stream();
 
+    GGML_ASSERT(ggml_is_contiguous(src0));
+
     GGML_ASSERT(src0->type == GGML_TYPE_F32);
     GGML_ASSERT( dst->type == GGML_TYPE_F32);
 
@@ -196,6 +218,8 @@ void ggml_cuda_op_relu(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     float * dst_d = (float *)dst->data;
     cudaStream_t stream = ctx.stream();
 
+    GGML_ASSERT(ggml_is_contiguous(src0));
+
     GGML_ASSERT(src0->type == GGML_TYPE_F32);
     GGML_ASSERT( dst->type == GGML_TYPE_F32);
 
@@ -208,6 +232,8 @@ void ggml_cuda_op_sigmoid(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     float * dst_d = (float *)dst->data;
     cudaStream_t stream = ctx.stream();
 
+    GGML_ASSERT(ggml_is_contiguous(src0));
+
     GGML_ASSERT(src0->type == GGML_TYPE_F32);
     GGML_ASSERT( dst->type == GGML_TYPE_F32);
 
@@ -220,6 +246,8 @@ void ggml_cuda_op_hardsigmoid(ggml_backend_cuda_context & ctx, ggml_tensor * dst
     float * dst_d = (float *)dst->data;
     cudaStream_t stream = ctx.stream();
 
+    GGML_ASSERT(ggml_is_contiguous(src0));
+
     GGML_ASSERT(src0->type == GGML_TYPE_F32);
     GGML_ASSERT( dst->type == GGML_TYPE_F32);
 
@@ -232,6 +260,8 @@ void ggml_cuda_op_hardswish(ggml_backend_cuda_context & ctx, ggml_tensor * dst)
     float * dst_d = (float *)dst->data;
     cudaStream_t stream = ctx.stream();
 
+    GGML_ASSERT(ggml_is_contiguous(src0));
+
     GGML_ASSERT(src0->type == GGML_TYPE_F32);
     GGML_ASSERT( dst->type == GGML_TYPE_F32);
 
@@ -244,6 +274,8 @@ void ggml_cuda_op_leaky_relu(ggml_backend_cuda_context & ctx, ggml_tensor * dst)
     float * dst_d = (float *)dst->data;
     cudaStream_t stream = ctx.stream();
 
+    GGML_ASSERT(ggml_is_contiguous(src0));
+
     GGML_ASSERT(src0->type == GGML_TYPE_F32);
     GGML_ASSERT( dst->type == GGML_TYPE_F32);
 
@@ -259,8 +291,24 @@ void ggml_cuda_op_sqr(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     float * dst_d = (float *)dst->data;
     cudaStream_t stream = ctx.stream();
 
+    GGML_ASSERT(ggml_is_contiguous(src0));
+
     GGML_ASSERT(src0->type == GGML_TYPE_F32);
     GGML_ASSERT( dst->type == GGML_TYPE_F32);
 
     sqr_f32_cuda(src0_d, dst_d, ggml_nelements(src0), stream);
 }
+
+void ggml_cuda_op_sqrt(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0];
+    const float * src0_d = (const float *)src0->data;
+    float * dst_d = (float *)dst->data;
+    cudaStream_t stream = ctx.stream();
+
+    GGML_ASSERT(ggml_is_contiguous(src0));
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    sqrt_f32_cuda(src0_d, dst_d, ggml_nelements(src0), stream);
+}
data/vendor/tmp/llama.cpp/ggml-cuda.cu

@@ -188,13 +188,15 @@ static ggml_cuda_device_info ggml_cuda_init() {
         info.default_tensor_split[id] = total_vram;
         total_vram += prop.totalGlobalMem;
 
+        info.devices[id].nsm = prop.multiProcessorCount;
+        info.devices[id].smpb = prop.sharedMemPerBlock;
 #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+        info.devices[id].smpbo = prop.sharedMemPerBlock;
         info.devices[id].cc = 100*prop.major + 10*prop.minor + CC_OFFSET_AMD;
 #else
+        info.devices[id].smpbo = prop.sharedMemPerBlockOptin;
         info.devices[id].cc = 100*prop.major + 10*prop.minor;
 #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-        info.devices[id].smpb = prop.sharedMemPerBlock;
-        info.devices[id].nsm = prop.multiProcessorCount;
     }
 
     for (int id = 0; id < info.device_count; ++id) {
@@ -543,6 +545,10 @@ GGML_CALL static const char * ggml_backend_cuda_buffer_type_name(ggml_backend_bu
     return ctx->name.c_str();
 }
 
+static bool ggml_backend_buft_is_cuda(ggml_backend_buffer_type_t buft) {
+    return buft->iface.get_name == ggml_backend_cuda_buffer_type_name;
+}
+
 GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
     ggml_backend_cuda_buffer_type_context * buft_ctx = (ggml_backend_cuda_buffer_type_context *)buft->context;
 
@@ -585,24 +591,12 @@ GGML_CALL static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backen
     GGML_UNUSED(buft);
 }
 
-GGML_CALL static bool ggml_backend_cuda_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
-    if (!ggml_backend_is_cuda(backend)) {
-        return false;
-    }
-
-    ggml_backend_cuda_buffer_type_context * buft_ctx = (ggml_backend_cuda_buffer_type_context *)buft->context;
-    ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
-
-    return buft_ctx->device == cuda_ctx->device;
-}
-
 static ggml_backend_buffer_type_i ggml_backend_cuda_buffer_type_interface = {
     /* .get_name = */ ggml_backend_cuda_buffer_type_name,
     /* .alloc_buffer = */ ggml_backend_cuda_buffer_type_alloc_buffer,
     /* .get_alignment = */ ggml_backend_cuda_buffer_type_get_alignment,
     /* .get_max_size = */ NULL, // defaults to SIZE_MAX
     /* .get_alloc_size = */ ggml_backend_cuda_buffer_type_get_alloc_size,
-    /* .supports_backend = */ ggml_backend_cuda_buffer_type_supports_backend,
     /* .is_host = */ NULL,
 };
 
@@ -633,88 +627,22 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) {
 
 // cuda split buffer
 
-static int64_t get_row_rounding(
-    int64_t
-    int64_t max_compute_capability = INT_MIN;
+static int64_t get_row_rounding(const std::array<float, GGML_CUDA_MAX_DEVICES> & tensor_split) {
+    int64_t row_rounding = 0;
     for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
-        if (tensor_split[id]
-
-            min_compute_capability = ggml_cuda_info().devices[id].cc;
-        }
-        if (max_compute_capability < ggml_cuda_info().devices[id].cc) {
-            max_compute_capability = ggml_cuda_info().devices[id].cc;
-        }
+        if (tensor_split[id] >= (id + 1 < ggml_backend_cuda_get_device_count() ? tensor_split[id + 1] : 1.0f)) {
+            continue;
         }
-    }
 
-
-        case GGML_TYPE_Q4_0:
-        case GGML_TYPE_Q4_1:
-        case GGML_TYPE_Q5_0:
-        case GGML_TYPE_Q5_1:
-        case GGML_TYPE_Q8_0:
-            return max_compute_capability >= CC_RDNA2 ? 128 : 64;
-        case GGML_TYPE_F16:
-        case GGML_TYPE_F32:
-            return 1;
-        case GGML_TYPE_Q2_K:
-            return max_compute_capability >= CC_RDNA2 ? 128 : 32;
-        case GGML_TYPE_Q3_K:
-            return min_compute_capability < CC_RDNA2 ? 128 : 64;
-        case GGML_TYPE_Q4_K:
-        case GGML_TYPE_Q5_K:
-        case GGML_TYPE_Q6_K:
-        case GGML_TYPE_IQ2_XXS:
-        case GGML_TYPE_IQ2_XS:
-        case GGML_TYPE_IQ2_S:
-        case GGML_TYPE_IQ3_XXS:
-        case GGML_TYPE_IQ1_S:
-        case GGML_TYPE_IQ1_M:
-        case GGML_TYPE_IQ4_NL:
-        case GGML_TYPE_IQ4_XS:
-        case GGML_TYPE_IQ3_S:
-            return max_compute_capability >= CC_RDNA2 ? 128 : 64;
-        default:
-            GGML_ASSERT(false);
+        const int cc = ggml_cuda_info().devices[id].cc;
+        row_rounding = std::max(row_rounding, (int64_t)get_mmq_y_host(cc, get_mmq_x_max_host(cc)));
     }
-
-    switch(type) {
-        case GGML_TYPE_Q4_0:
-        case GGML_TYPE_Q4_1:
-            return max_compute_capability >= CC_VOLTA ? 128 : 64;
-        case GGML_TYPE_Q5_0:
-        case GGML_TYPE_Q5_1:
-        case GGML_TYPE_Q8_0:
-            return 64;
-        case GGML_TYPE_F16:
-        case GGML_TYPE_F32:
-            return 1;
-        case GGML_TYPE_Q2_K:
-        case GGML_TYPE_Q3_K:
-        case GGML_TYPE_Q4_K:
-        case GGML_TYPE_Q5_K:
-        case GGML_TYPE_IQ2_XXS:
-        case GGML_TYPE_IQ2_XS:
-        case GGML_TYPE_IQ2_S:
-        case GGML_TYPE_IQ3_XXS:
-        case GGML_TYPE_IQ1_S:
-        case GGML_TYPE_IQ1_M:
-        case GGML_TYPE_IQ4_NL:
-        case GGML_TYPE_IQ4_XS:
-        case GGML_TYPE_IQ3_S:
-            return max_compute_capability >= CC_VOLTA ? 128 : 64;
-        case GGML_TYPE_Q6_K:
-            return 64;
-        default:
-            GGML_ASSERT(false);
-    }
-#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+    return row_rounding;
 }
 
 static void get_row_split(int64_t * row_low, int64_t * row_high, const ggml_tensor * tensor, const std::array<float, GGML_CUDA_MAX_DEVICES> & tensor_split, int id) {
     const int64_t nrows = ggml_nrows(tensor);
-    const int64_t rounding = get_row_rounding(
+    const int64_t rounding = get_row_rounding(tensor_split);
 
     *row_low = id == 0 ? 0 : nrows*tensor_split[id];
     *row_low -= *row_low % rounding;
@@ -929,6 +857,10 @@ GGML_CALL static const char * ggml_backend_cuda_split_buffer_type_name(ggml_back
     GGML_UNUSED(buft);
 }
 
+static bool ggml_backend_buft_is_cuda_split(ggml_backend_buffer_type_t buft) {
+    return buft->iface.get_name == ggml_backend_cuda_split_buffer_type_name;
+}
+
 GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_split_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
     // since we don't know the exact split after rounding, we cannot allocate the device buffers at this point
     // instead, we allocate them for each tensor separately in init_tensor
@@ -972,12 +904,6 @@ GGML_CALL static size_t ggml_backend_cuda_split_buffer_type_get_alloc_size(ggml_
     return total_size;
 }
 
-GGML_CALL static bool ggml_backend_cuda_split_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
-    return ggml_backend_is_cuda(backend);
-
-    GGML_UNUSED(buft);
-}
-
 GGML_CALL static bool ggml_backend_cuda_split_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
     return false;
 
@@ -990,7 +916,6 @@ static ggml_backend_buffer_type_i ggml_backend_cuda_split_buffer_type_interface
     /* .get_alignment = */ ggml_backend_cuda_split_buffer_type_get_alignment,
     /* .get_max_size = */ NULL, // defaults to SIZE_MAX
     /* .get_alloc_size = */ ggml_backend_cuda_split_buffer_type_get_alloc_size,
-    /* .supports_backend = */ ggml_backend_cuda_split_buffer_type_supports_backend,
     /* .is_host = */ ggml_backend_cuda_split_buffer_type_is_host,
 };
 
@@ -1090,7 +1015,6 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() {
     /* .get_alignment = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
     /* .get_max_size = */ NULL, // defaults to SIZE_MAX
     /* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
-    /* .supports_backend = */ ggml_backend_cpu_buffer_type()->iface.supports_backend,
     /* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host,
     },
     /* .context = */ nullptr,
@@ -1413,10 +1337,30 @@ static void ggml_cuda_set_peer_access(const int n_tokens, int main_device) {
     GGML_UNUSED(main_device);
 }
 
+static cudaError_t ggml_cuda_Memcpy2DPeerAsync(
+    void * dst, int dstDevice, size_t dpitch, void * src, int srcDevice, size_t spitch, size_t width, size_t height, cudaStream_t stream) {
+
+#if !defined(GGML_USE_HIPBLAS)
+    // cudaMemcpy2DAsync may fail with copies between vmm pools of different devices
+    cudaMemcpy3DPeerParms p = {};
+    p.dstDevice = dstDevice;
+    p.dstPtr = make_cudaPitchedPtr(dst, dpitch, dpitch, height);
+    p.srcDevice = srcDevice;
+    p.srcPtr = make_cudaPitchedPtr(src, spitch, spitch, height);
+    p.extent = make_cudaExtent(width, height, 1);
+    return cudaMemcpy3DPeerAsync(&p, stream);
+#else
+    // HIP does not support cudaMemcpy3DPeerAsync or vmm pools
+    GGML_UNUSED(dstDevice);
+    GGML_UNUSED(srcDevice);
+    return cudaMemcpy2DAsync(dst, dpitch, src, spitch, width, height, cudaMemcpyDeviceToDevice, stream);
+#endif // !defined(GGML_USE_HIPBLAS)
+}
+
 static void ggml_cuda_op_mul_mat(
     ggml_backend_cuda_context & ctx,
     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, ggml_cuda_op_mul_mat_t op,
-
+    quantize_cuda_t quantize_src1) {
 
     const int64_t ne00 = src0->ne[0];
     const int64_t ne01 = src0->ne[1];
@@ -1473,7 +1417,9 @@ static void ggml_cuda_op_mul_mat(
     }
 
     struct dev_data {
-
+        int cc;
+
+        ggml_cuda_pool_alloc<char> src0_dd_alloc;
         ggml_cuda_pool_alloc<float> src1_ddf_alloc;
         ggml_cuda_pool_alloc<char> src1_ddq_alloc;
         ggml_cuda_pool_alloc<float> dst_dd_alloc;
@@ -1492,6 +1438,8 @@ static void ggml_cuda_op_mul_mat(
     int used_devices = 0;
 
     for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
+        dev[id].cc = ggml_cuda_info().devices[id].cc;
+
         // by default, use all rows
         dev[id].row_low = 0;
         dev[id].row_high = ne01;
@@ -1499,7 +1447,7 @@ static void ggml_cuda_op_mul_mat(
         // for multi GPU, get the row boundaries from tensor split
         // and round to mul_mat_q tile sizes
        if (split) {
-            const int64_t rounding = get_row_rounding(
+            const int64_t rounding = get_row_rounding(tensor_split);
 
             if (id != 0) {
                 dev[id].row_low = ne01*tensor_split[id];
@@ -1542,11 +1490,15 @@ static void ggml_cuda_op_mul_mat(
             dev[id].src1_ddf = dev[id].src1_ddf_alloc.alloc(ctx.pool(id), ggml_nelements(src1));
         }
 
-        if (
-
+        if (quantize_src1) {
+            size_t src_1_ddq_size = nrows1*src1_padded_col_size*q8_1_ts/q8_1_bs;
+            if (quantize_src1 == quantize_mmq_q8_1_cuda) {
+                src_1_ddq_size += get_mmq_x_max_host(dev[id].cc)*sizeof(block_q8_1_mmq);
+            }
+            dev[id].src1_ddq = dev[id].src1_ddq_alloc.alloc(ctx.pool(id), src_1_ddq_size);
 
             if (src1_on_device && src1_is_contiguous) {
-
+                quantize_src1(dev[id].src1_ddf, dev[id].src1_ddq, ne10, ne11, ne12*ne13, src1_padded_col_size, src0->type, stream);
                 CUDA_CHECK(cudaGetLastError());
             }
         }
@@ -1592,7 +1544,12 @@ static void ggml_cuda_op_mul_mat(
         const int64_t i03 = i0 / ne12;
         const int64_t i02 = i0 % ne12;
 
-
+        size_t src1_ddq_i_offset = i0*ne11 * src1_padded_col_size*q8_1_ts/q8_1_bs;
+        if (quantize_src1 == quantize_mmq_q8_1_cuda) {
+            src1_ddq_i_offset += src1_col_0 * sizeof(block_q8_1_mmq);
+        } else {
+            src1_ddq_i_offset += src1_col_0 * src1_padded_col_size*q8_1_ts/q8_1_bs;
+        }
 
         // for split tensors the data begins at i0 == i0_offset_low
         char * src0_dd_i = dev[id].src0_dd + (i0/i02_divisor) * (ne01*ne00*src0_ts)/src0_bs;
@@ -1609,10 +1566,17 @@ static void ggml_cuda_op_mul_mat(
             // copy src0, src1 to device if necessary
             if (src1_is_contiguous) {
                 if (id != ctx.device) {
-                    if (
+                    if (quantize_src1) {
                         char * src1_ddq_i_source = dev[ctx.device].src1_ddq + src1_ddq_i_offset;
-
-
+                        if (quantize_src1 == quantize_mmq_q8_1_cuda) {
+                            const size_t pitch = ne11*sizeof(block_q8_1_mmq);
+                            const size_t width = src1_ncols*sizeof(block_q8_1_mmq);
+                            const size_t height = src1_padded_col_size/(4*QK8_1);
+                            CUDA_CHECK(ggml_cuda_Memcpy2DPeerAsync(src1_ddq_i, id, pitch, src1_ddq_i_source, ctx.device, pitch, width, height, stream));
+                        } else {
+                            CUDA_CHECK(cudaMemcpyPeerAsync(
+                                src1_ddq_i, id, src1_ddq_i_source, ctx.device, src1_ncols*src1_padded_col_size*q8_1_ts/q8_1_bs, stream));
+                        }
                 } else {
                     float * src1_ddf_i_source = (float *) src1->data;
                     src1_ddf_i_source += (i0*ne11 + src1_col_0) * ne10;
@@ -1627,8 +1591,8 @@ static void ggml_cuda_op_mul_mat(
                 GGML_ASSERT(false);
             }
 
-            if (
-
+            if (quantize_src1 && !src1_is_contiguous) {
+                quantize_src1(src1_ddf_i, src1_ddq_i, ne10, src1_ncols, 1, src1_padded_col_size, src0->type, stream);
                 CUDA_CHECK(cudaGetLastError());
             }
 
@@ -1653,22 +1617,8 @@ static void ggml_cuda_op_mul_mat(
                 float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3);
                 GGML_ASSERT(dst->nb[1] == ne0*sizeof(float));
                 dhf_dst_i += src1_col_0*ne0 + dev[id].row_low;
-
-
-                cudaMemcpy3DPeerParms p = {};
-                p.dstDevice = ctx.device;
-                p.dstPtr = make_cudaPitchedPtr(dhf_dst_i, ne0*sizeof(float), row_diff, src1_ncols);
-                p.srcDevice = id;
-                p.srcPtr = make_cudaPitchedPtr(dst_dd_i, row_diff*sizeof(float), row_diff, src1_ncols);
-                p.extent = make_cudaExtent(row_diff*sizeof(float), src1_ncols, 1);
-                CUDA_CHECK(cudaMemcpy3DPeerAsync(&p, stream));
-#else
-                // HIP does not support cudaMemcpy3DPeerAsync or vmm pools
-                CUDA_CHECK(cudaMemcpy2DAsync(dhf_dst_i, ne0*sizeof(float),
-                                             dst_dd_i, row_diff*sizeof(float),
-                                             row_diff*sizeof(float), src1_ncols,
-                                             cudaMemcpyDeviceToDevice, stream));
-#endif
+                CUDA_CHECK(ggml_cuda_Memcpy2DPeerAsync(
+                    dhf_dst_i, ctx.device, ne0*sizeof(float), dst_dd_i, id, row_diff*sizeof(float), row_diff*sizeof(float), src1_ncols, stream));
             } else {
                 float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3);
                 GGML_ASSERT(dst->nb[1] == ne0*sizeof(float));
@@ -2007,13 +1957,13 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
         // KQ + KQV multi-batch
         ggml_cuda_mul_mat_batched_cublas(ctx, src0, src1, dst);
     } else if (use_dequantize_mul_mat_vec) {
-        ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec,
+        ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, nullptr);
     } else if (use_mul_mat_vec_q) {
-        ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_vec_q,
+        ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_vec_q, quantize_row_q8_1_cuda);
     } else if (use_mul_mat_q) {
-        ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_q,
+        ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_q, quantize_mmq_q8_1_cuda);
     } else {
-        ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_cublas,
+        ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_cublas, nullptr);
     }
 }
 
@@ -2780,7 +2730,7 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
         case GGML_UNARY_OP_HARDSWISH:
         case GGML_UNARY_OP_GELU_QUICK:
         case GGML_UNARY_OP_TANH:
-            return
+            return ggml_is_contiguous(op->src[0]);
         default:
            return false;
     }
@@ -2919,6 +2869,20 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
     GGML_UNUSED(backend);
 }
 
+GGML_CALL static bool ggml_backend_cuda_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
+    if (ggml_backend_buft_is_cuda_split(buft)) {
+        return true;
+    }
+
+    if (ggml_backend_buft_is_cuda(buft)) {
+        ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
+        ggml_backend_cuda_buffer_type_context * buft_ctx = (ggml_backend_cuda_buffer_type_context *)buft->context;
+        return buft_ctx->device == cuda_ctx->device;
+    }
+
+    return false;
+}
+
 GGML_CALL static bool ggml_backend_cuda_offload_op(ggml_backend_t backend, const ggml_tensor * op) {
     const int min_batch_size = 32;
 
@@ -2991,9 +2955,11 @@ static ggml_backend_i ggml_backend_cuda_interface = {
     /* .synchronize = */ ggml_backend_cuda_synchronize,
     /* .graph_plan_create = */ NULL,
     /* .graph_plan_free = */ NULL,
+    /* .graph_plan_update = */ NULL,
     /* .graph_plan_compute = */ NULL,
     /* .graph_compute = */ ggml_backend_cuda_graph_compute,
     /* .supports_op = */ ggml_backend_cuda_supports_op,
+    /* .supports_buft = */ ggml_backend_cuda_supports_buft,
     /* .offload_op = */ ggml_backend_cuda_offload_op,
     /* .event_new = */ ggml_backend_cuda_event_new,
     /* .event_free = */ ggml_backend_cuda_event_free,
data/vendor/tmp/llama.cpp/ggml-kompute.cpp

@@ -1340,7 +1340,7 @@ static bool ggml_vk_supports_op(const struct ggml_tensor * op) {
             case GGML_UNARY_OP_RELU:
             case GGML_UNARY_OP_GELU:
             case GGML_UNARY_OP_SILU:
-                return
+                return ggml_is_contiguous(op->src[0]);
             default:
                 ;
     }
@@ -1902,18 +1902,12 @@ static size_t ggml_backend_vk_buffer_type_get_max_size(ggml_backend_buffer_type_
     return ctx->max_alloc;
 }
 
-static bool ggml_backend_kompute_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
-    GGML_UNUSED(buft);
-    return ggml_backend_is_kompute(backend);
-}
-
 static ggml_backend_buffer_type_i ggml_backend_kompute_buffer_type_interface = {
     /* .get_name = */ ggml_backend_kompute_buffer_type_get_name,
     /* .alloc_buffer = */ ggml_backend_kompute_buffer_type_alloc_buffer,
     /* .get_alignment = */ ggml_backend_kompute_buffer_type_get_alignment,
     /* .get_max_size = */ ggml_backend_vk_buffer_type_get_max_size,
     /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
-    /* .supports_backend = */ ggml_backend_kompute_buffer_type_supports_backend,
     /* .is_host = */ NULL,
 };
 
@@ -1973,6 +1967,11 @@ static bool ggml_backend_kompute_supports_op(ggml_backend_t backend, const struc
     return ggml_vk_supports_op(op);
 }
 
+static bool ggml_backend_kompute_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
+    GGML_UNUSED(backend);
+    return buft->iface.get_name == ggml_backend_kompute_buffer_type_get_name;
+}
+
 static struct ggml_backend_i kompute_backend_i = {
     /* .get_name = */ ggml_backend_kompute_name,
     /* .free = */ ggml_backend_kompute_free,
@@ -1983,9 +1982,11 @@ static struct ggml_backend_i kompute_backend_i = {
     /* .synchronize = */ NULL,
     /* .graph_plan_create = */ NULL,
     /* .graph_plan_free = */ NULL,
+    /* .graph_plan_update = */ NULL,
     /* .graph_plan_compute = */ NULL,
     /* .graph_compute = */ ggml_backend_kompute_graph_compute,
     /* .supports_op = */ ggml_backend_kompute_supports_op,
+    /* .supports_buft = */ ggml_backend_kompute_supports_buft,
     /* .offload_op = */ NULL,
     /* .event_new = */ NULL,
     /* .event_free = */ NULL,
data/vendor/tmp/llama.cpp/ggml-metal.m

@@ -735,6 +735,12 @@ static id<MTLBuffer> ggml_metal_get_buffer(struct ggml_tensor * t, size_t * offs
 }
 
 static bool ggml_metal_supports_op(const struct ggml_metal_context * ctx, const struct ggml_tensor * op) {
+    for (size_t i = 0, n = 3; i < n; ++i) {
+        if (op->src[i] != NULL && op->src[i]->type == GGML_TYPE_BF16) {
+            return false;
+        }
+    }
+
     switch (op->op) {
         case GGML_OP_UNARY:
             switch (ggml_get_unary_op(op)) {
@@ -744,7 +750,7 @@ static bool ggml_metal_supports_op(const struct ggml_metal_context * ctx, const
                 case GGML_UNARY_OP_GELU:
                 case GGML_UNARY_OP_GELU_QUICK:
                 case GGML_UNARY_OP_SILU:
-                    return
+                    return ggml_is_contiguous(op->src[0]);
                 default:
                     return false;
             }
@@ -1862,9 +1868,10 @@ static enum ggml_status ggml_metal_graph_compute(
                         // ne21 = n_rows
                         const int dst_rows = ne20*ne21;
                         const int dst_rows_min = n_as;
+                        const int dst_rows_max = (ctx->device.maxThreadgroupMemoryLength - 32 - 8192)/4;
 
                         // max size of the rowids array in the kernel shared buffer
-                        GGML_ASSERT(dst_rows <=
+                        GGML_ASSERT(dst_rows <= dst_rows_max);
 
                         // for now the matrix-matrix multiplication kernel only works on A14+/M1+ SoCs
                         // AMD GPU and older A-chips will reuse matrix-vector multiplication kernel
@@ -3044,12 +3051,6 @@ GGML_CALL static size_t ggml_backend_metal_buffer_type_get_max_size(ggml_backend
     UNUSED(buft);
 }
 
-GGML_CALL static bool ggml_backend_metal_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
-    return ggml_backend_is_metal(backend) || ggml_backend_is_cpu(backend);
-
-    UNUSED(buft);
-}
-
 GGML_CALL static bool ggml_backend_metal_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
     return true;
 
@@ -3064,7 +3065,6 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) {
     /* .get_alignment = */ ggml_backend_metal_buffer_type_get_alignment,
     /* .get_max_size = */ ggml_backend_metal_buffer_type_get_max_size,
     /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
-    /* .supports_backend = */ ggml_backend_metal_buffer_type_supports_backend,
     /* .is_host = */ ggml_backend_metal_buffer_type_is_host,
     },
     /* .context = */ NULL,
@@ -3179,6 +3179,12 @@ GGML_CALL static bool ggml_backend_metal_supports_op(ggml_backend_t backend, con
     return ggml_metal_supports_op(metal_ctx, op);
 }
 
+GGML_CALL static bool ggml_backend_metal_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
+    return buft->iface.get_name == ggml_backend_metal_buffer_type_get_name;
+
+    UNUSED(backend);
+}
+
 static struct ggml_backend_i ggml_backend_metal_i = {
     /* .get_name = */ ggml_backend_metal_name,
     /* .free = */ ggml_backend_metal_free,
@@ -3189,9 +3195,11 @@ static struct ggml_backend_i ggml_backend_metal_i = {
     /* .synchronize = */ NULL,
     /* .graph_plan_create = */ NULL,
     /* .graph_plan_free = */ NULL,
+    /* .graph_plan_update = */ NULL,
     /* .graph_plan_compute = */ NULL,
     /* .graph_compute = */ ggml_backend_metal_graph_compute,
     /* .supports_op = */ ggml_backend_metal_supports_op,
+    /* .supports_buft = */ ggml_backend_metal_supports_buft,
     /* .offload_op = */ NULL,
     /* .event_new = */ NULL,
     /* .event_free = */ NULL,