llama_cpp 0.16.0 → 0.16.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +13 -0
- data/ext/llama_cpp/extconf.rb +3 -0
- data/ext/llama_cpp/llama_cpp.cpp +14 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +4 -0
- data/vendor/tmp/llama.cpp/Makefile +119 -54
- data/vendor/tmp/llama.cpp/ggml-alloc.c +78 -22
- data/vendor/tmp/llama.cpp/ggml-backend-impl.h +20 -8
- data/vendor/tmp/llama.cpp/ggml-backend.c +190 -65
- data/vendor/tmp/llama.cpp/ggml-backend.h +6 -3
- data/vendor/tmp/llama.cpp/ggml-blas.cpp +363 -0
- data/vendor/tmp/llama.cpp/ggml-blas.h +23 -0
- data/vendor/tmp/llama.cpp/ggml-common.h +6 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/argsort.cu +1 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/dmmv.cu +21 -9
- data/vendor/tmp/llama.cpp/ggml-cuda/fattn-tile-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/mmq.cu +15 -1491
- data/vendor/tmp/llama.cpp/ggml-cuda/mmvq.cu +77 -62
- data/vendor/tmp/llama.cpp/ggml-cuda/quantize.cu +77 -10
- data/vendor/tmp/llama.cpp/ggml-cuda/softmax.cu +1 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb32.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb32.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb8.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q2_k.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q3_k.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_k.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_k.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q6_k.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/unary.cu +48 -0
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +95 -129
- data/vendor/tmp/llama.cpp/ggml-impl.h +1 -1
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +8 -7
- data/vendor/tmp/llama.cpp/ggml-metal.m +17 -9
- data/vendor/tmp/llama.cpp/ggml-quants.c +982 -368
- data/vendor/tmp/llama.cpp/ggml-rpc.cpp +21 -15
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +2133 -13215
- data/vendor/tmp/llama.cpp/ggml-sycl.h +1 -10
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +28826 -25037
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +438 -493
- data/vendor/tmp/llama.cpp/ggml.c +158 -414
- data/vendor/tmp/llama.cpp/ggml.h +6 -0
- data/vendor/tmp/llama.cpp/llama.cpp +628 -279
- data/vendor/tmp/llama.cpp/llama.h +9 -1
- data/vendor/tmp/llama.cpp/sgemm.cpp +2 -0
- data/vendor/tmp/llama.cpp/unicode-data.cpp +851 -801
- data/vendor/tmp/llama.cpp/unicode.cpp +33 -19
- data/vendor/tmp/llama.cpp/unicode.h +1 -1
- metadata +15 -3
|
@@ -92,6 +92,15 @@ static __global__ void sqr_f32(const float * x, float * dst, const int k) {
|
|
|
92
92
|
dst[i] = x[i] * x[i];
|
|
93
93
|
}
|
|
94
94
|
|
|
95
|
+
static __global__ void sqrt_f32(const float * x, float * dst, const int k) {
|
|
96
|
+
const int i = blockDim.x*blockIdx.x + threadIdx.x;
|
|
97
|
+
|
|
98
|
+
if (i >= k) {
|
|
99
|
+
return;
|
|
100
|
+
}
|
|
101
|
+
dst[i] = sqrtf(x[i]);
|
|
102
|
+
}
|
|
103
|
+
|
|
95
104
|
static void gelu_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
|
|
96
105
|
const int num_blocks = (k + CUDA_GELU_BLOCK_SIZE - 1) / CUDA_GELU_BLOCK_SIZE;
|
|
97
106
|
gelu_f32<<<num_blocks, CUDA_GELU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
|
|
@@ -142,12 +151,19 @@ static void sqr_f32_cuda(const float * x, float * dst, const int k, cudaStream_t
|
|
|
142
151
|
sqr_f32<<<num_blocks, CUDA_SQR_BLOCK_SIZE, 0, stream>>>(x, dst, k);
|
|
143
152
|
}
|
|
144
153
|
|
|
154
|
+
static void sqrt_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
|
|
155
|
+
const int num_blocks = (k + CUDA_SQRT_BLOCK_SIZE - 1) / CUDA_SQRT_BLOCK_SIZE;
|
|
156
|
+
sqrt_f32<<<num_blocks, CUDA_SQRT_BLOCK_SIZE, 0, stream>>>(x, dst, k);
|
|
157
|
+
}
|
|
158
|
+
|
|
145
159
|
void ggml_cuda_op_gelu(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
|
146
160
|
const ggml_tensor * src0 = dst->src[0];
|
|
147
161
|
const float * src0_d = (const float *)src0->data;
|
|
148
162
|
float * dst_d = (float *)dst->data;
|
|
149
163
|
cudaStream_t stream = ctx.stream();
|
|
150
164
|
|
|
165
|
+
GGML_ASSERT(ggml_is_contiguous(src0));
|
|
166
|
+
|
|
151
167
|
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
|
152
168
|
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
|
153
169
|
|
|
@@ -160,6 +176,8 @@ void ggml_cuda_op_silu(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
|
|
160
176
|
float * dst_d = (float *)dst->data;
|
|
161
177
|
cudaStream_t stream = ctx.stream();
|
|
162
178
|
|
|
179
|
+
GGML_ASSERT(ggml_is_contiguous(src0));
|
|
180
|
+
|
|
163
181
|
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
|
164
182
|
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
|
165
183
|
|
|
@@ -172,6 +190,8 @@ void ggml_cuda_op_gelu_quick(ggml_backend_cuda_context & ctx, ggml_tensor * dst)
|
|
|
172
190
|
float * dst_d = (float *)dst->data;
|
|
173
191
|
cudaStream_t stream = ctx.stream();
|
|
174
192
|
|
|
193
|
+
GGML_ASSERT(ggml_is_contiguous(src0));
|
|
194
|
+
|
|
175
195
|
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
|
176
196
|
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
|
177
197
|
|
|
@@ -184,6 +204,8 @@ void ggml_cuda_op_tanh(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
|
|
184
204
|
float * dst_d = (float *)dst->data;
|
|
185
205
|
cudaStream_t stream = ctx.stream();
|
|
186
206
|
|
|
207
|
+
GGML_ASSERT(ggml_is_contiguous(src0));
|
|
208
|
+
|
|
187
209
|
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
|
188
210
|
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
|
189
211
|
|
|
@@ -196,6 +218,8 @@ void ggml_cuda_op_relu(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
|
|
196
218
|
float * dst_d = (float *)dst->data;
|
|
197
219
|
cudaStream_t stream = ctx.stream();
|
|
198
220
|
|
|
221
|
+
GGML_ASSERT(ggml_is_contiguous(src0));
|
|
222
|
+
|
|
199
223
|
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
|
200
224
|
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
|
201
225
|
|
|
@@ -208,6 +232,8 @@ void ggml_cuda_op_sigmoid(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
|
|
208
232
|
float * dst_d = (float *)dst->data;
|
|
209
233
|
cudaStream_t stream = ctx.stream();
|
|
210
234
|
|
|
235
|
+
GGML_ASSERT(ggml_is_contiguous(src0));
|
|
236
|
+
|
|
211
237
|
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
|
212
238
|
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
|
213
239
|
|
|
@@ -220,6 +246,8 @@ void ggml_cuda_op_hardsigmoid(ggml_backend_cuda_context & ctx, ggml_tensor * dst
|
|
|
220
246
|
float * dst_d = (float *)dst->data;
|
|
221
247
|
cudaStream_t stream = ctx.stream();
|
|
222
248
|
|
|
249
|
+
GGML_ASSERT(ggml_is_contiguous(src0));
|
|
250
|
+
|
|
223
251
|
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
|
224
252
|
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
|
225
253
|
|
|
@@ -232,6 +260,8 @@ void ggml_cuda_op_hardswish(ggml_backend_cuda_context & ctx, ggml_tensor * dst)
|
|
|
232
260
|
float * dst_d = (float *)dst->data;
|
|
233
261
|
cudaStream_t stream = ctx.stream();
|
|
234
262
|
|
|
263
|
+
GGML_ASSERT(ggml_is_contiguous(src0));
|
|
264
|
+
|
|
235
265
|
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
|
236
266
|
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
|
237
267
|
|
|
@@ -244,6 +274,8 @@ void ggml_cuda_op_leaky_relu(ggml_backend_cuda_context & ctx, ggml_tensor * dst)
|
|
|
244
274
|
float * dst_d = (float *)dst->data;
|
|
245
275
|
cudaStream_t stream = ctx.stream();
|
|
246
276
|
|
|
277
|
+
GGML_ASSERT(ggml_is_contiguous(src0));
|
|
278
|
+
|
|
247
279
|
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
|
248
280
|
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
|
249
281
|
|
|
@@ -259,8 +291,24 @@ void ggml_cuda_op_sqr(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
|
|
259
291
|
float * dst_d = (float *)dst->data;
|
|
260
292
|
cudaStream_t stream = ctx.stream();
|
|
261
293
|
|
|
294
|
+
GGML_ASSERT(ggml_is_contiguous(src0));
|
|
295
|
+
|
|
262
296
|
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
|
263
297
|
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
|
264
298
|
|
|
265
299
|
sqr_f32_cuda(src0_d, dst_d, ggml_nelements(src0), stream);
|
|
266
300
|
}
|
|
301
|
+
|
|
302
|
+
void ggml_cuda_op_sqrt(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
|
303
|
+
const ggml_tensor * src0 = dst->src[0];
|
|
304
|
+
const float * src0_d = (const float *)src0->data;
|
|
305
|
+
float * dst_d = (float *)dst->data;
|
|
306
|
+
cudaStream_t stream = ctx.stream();
|
|
307
|
+
|
|
308
|
+
GGML_ASSERT(ggml_is_contiguous(src0));
|
|
309
|
+
|
|
310
|
+
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
|
311
|
+
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
|
312
|
+
|
|
313
|
+
sqrt_f32_cuda(src0_d, dst_d, ggml_nelements(src0), stream);
|
|
314
|
+
}
|
|
@@ -188,13 +188,15 @@ static ggml_cuda_device_info ggml_cuda_init() {
|
|
|
188
188
|
info.default_tensor_split[id] = total_vram;
|
|
189
189
|
total_vram += prop.totalGlobalMem;
|
|
190
190
|
|
|
191
|
+
info.devices[id].nsm = prop.multiProcessorCount;
|
|
192
|
+
info.devices[id].smpb = prop.sharedMemPerBlock;
|
|
191
193
|
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
|
|
194
|
+
info.devices[id].smpbo = prop.sharedMemPerBlock;
|
|
192
195
|
info.devices[id].cc = 100*prop.major + 10*prop.minor + CC_OFFSET_AMD;
|
|
193
196
|
#else
|
|
197
|
+
info.devices[id].smpbo = prop.sharedMemPerBlockOptin;
|
|
194
198
|
info.devices[id].cc = 100*prop.major + 10*prop.minor;
|
|
195
199
|
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
|
|
196
|
-
info.devices[id].smpb = prop.sharedMemPerBlock;
|
|
197
|
-
info.devices[id].nsm = prop.multiProcessorCount;
|
|
198
200
|
}
|
|
199
201
|
|
|
200
202
|
for (int id = 0; id < info.device_count; ++id) {
|
|
@@ -543,6 +545,10 @@ GGML_CALL static const char * ggml_backend_cuda_buffer_type_name(ggml_backend_bu
|
|
|
543
545
|
return ctx->name.c_str();
|
|
544
546
|
}
|
|
545
547
|
|
|
548
|
+
static bool ggml_backend_buft_is_cuda(ggml_backend_buffer_type_t buft) {
|
|
549
|
+
return buft->iface.get_name == ggml_backend_cuda_buffer_type_name;
|
|
550
|
+
}
|
|
551
|
+
|
|
546
552
|
GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
|
547
553
|
ggml_backend_cuda_buffer_type_context * buft_ctx = (ggml_backend_cuda_buffer_type_context *)buft->context;
|
|
548
554
|
|
|
@@ -585,24 +591,12 @@ GGML_CALL static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backen
|
|
|
585
591
|
GGML_UNUSED(buft);
|
|
586
592
|
}
|
|
587
593
|
|
|
588
|
-
GGML_CALL static bool ggml_backend_cuda_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
|
|
589
|
-
if (!ggml_backend_is_cuda(backend)) {
|
|
590
|
-
return false;
|
|
591
|
-
}
|
|
592
|
-
|
|
593
|
-
ggml_backend_cuda_buffer_type_context * buft_ctx = (ggml_backend_cuda_buffer_type_context *)buft->context;
|
|
594
|
-
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
|
|
595
|
-
|
|
596
|
-
return buft_ctx->device == cuda_ctx->device;
|
|
597
|
-
}
|
|
598
|
-
|
|
599
594
|
static ggml_backend_buffer_type_i ggml_backend_cuda_buffer_type_interface = {
|
|
600
595
|
/* .get_name = */ ggml_backend_cuda_buffer_type_name,
|
|
601
596
|
/* .alloc_buffer = */ ggml_backend_cuda_buffer_type_alloc_buffer,
|
|
602
597
|
/* .get_alignment = */ ggml_backend_cuda_buffer_type_get_alignment,
|
|
603
598
|
/* .get_max_size = */ NULL, // defaults to SIZE_MAX
|
|
604
599
|
/* .get_alloc_size = */ ggml_backend_cuda_buffer_type_get_alloc_size,
|
|
605
|
-
/* .supports_backend = */ ggml_backend_cuda_buffer_type_supports_backend,
|
|
606
600
|
/* .is_host = */ NULL,
|
|
607
601
|
};
|
|
608
602
|
|
|
@@ -633,88 +627,22 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) {
|
|
|
633
627
|
|
|
634
628
|
// cuda split buffer
|
|
635
629
|
|
|
636
|
-
static int64_t get_row_rounding(ggml_type type, const std::array<float, GGML_CUDA_MAX_DEVICES> & tensor_split) {
|
|
637
|
-
int64_t min_compute_capability = INT_MAX;
|
|
638
|
-
int64_t max_compute_capability = INT_MIN;
|
|
630
|
+
static int64_t get_row_rounding(const std::array<float, GGML_CUDA_MAX_DEVICES> & tensor_split) {
|
|
631
|
+
int64_t row_rounding = 0;
|
|
639
632
|
for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
|
|
640
|
-
if (tensor_split[id] < (id + 1 < ggml_backend_cuda_get_device_count() ? tensor_split[id + 1] : 1.0f)) {
|
|
641
|
-
|
|
642
|
-
min_compute_capability = ggml_cuda_info().devices[id].cc;
|
|
643
|
-
}
|
|
644
|
-
if (max_compute_capability < ggml_cuda_info().devices[id].cc) {
|
|
645
|
-
max_compute_capability = ggml_cuda_info().devices[id].cc;
|
|
646
|
-
}
|
|
633
|
+
if (tensor_split[id] >= (id + 1 < ggml_backend_cuda_get_device_count() ? tensor_split[id + 1] : 1.0f)) {
|
|
634
|
+
continue;
|
|
647
635
|
}
|
|
648
|
-
}
|
|
649
636
|
|
|
650
|
-
|
|
651
|
-
|
|
652
|
-
case GGML_TYPE_Q4_0:
|
|
653
|
-
case GGML_TYPE_Q4_1:
|
|
654
|
-
case GGML_TYPE_Q5_0:
|
|
655
|
-
case GGML_TYPE_Q5_1:
|
|
656
|
-
case GGML_TYPE_Q8_0:
|
|
657
|
-
return max_compute_capability >= CC_RDNA2 ? 128 : 64;
|
|
658
|
-
case GGML_TYPE_F16:
|
|
659
|
-
case GGML_TYPE_F32:
|
|
660
|
-
return 1;
|
|
661
|
-
case GGML_TYPE_Q2_K:
|
|
662
|
-
return max_compute_capability >= CC_RDNA2 ? 128 : 32;
|
|
663
|
-
case GGML_TYPE_Q3_K:
|
|
664
|
-
return min_compute_capability < CC_RDNA2 ? 128 : 64;
|
|
665
|
-
case GGML_TYPE_Q4_K:
|
|
666
|
-
case GGML_TYPE_Q5_K:
|
|
667
|
-
case GGML_TYPE_Q6_K:
|
|
668
|
-
case GGML_TYPE_IQ2_XXS:
|
|
669
|
-
case GGML_TYPE_IQ2_XS:
|
|
670
|
-
case GGML_TYPE_IQ2_S:
|
|
671
|
-
case GGML_TYPE_IQ3_XXS:
|
|
672
|
-
case GGML_TYPE_IQ1_S:
|
|
673
|
-
case GGML_TYPE_IQ1_M:
|
|
674
|
-
case GGML_TYPE_IQ4_NL:
|
|
675
|
-
case GGML_TYPE_IQ4_XS:
|
|
676
|
-
case GGML_TYPE_IQ3_S:
|
|
677
|
-
return max_compute_capability >= CC_RDNA2 ? 128 : 64;
|
|
678
|
-
default:
|
|
679
|
-
GGML_ASSERT(false);
|
|
637
|
+
const int cc = ggml_cuda_info().devices[id].cc;
|
|
638
|
+
row_rounding = std::max(row_rounding, (int64_t)get_mmq_y_host(cc, get_mmq_x_max_host(cc)));
|
|
680
639
|
}
|
|
681
|
-
|
|
682
|
-
switch(type) {
|
|
683
|
-
case GGML_TYPE_Q4_0:
|
|
684
|
-
case GGML_TYPE_Q4_1:
|
|
685
|
-
return max_compute_capability >= CC_VOLTA ? 128 : 64;
|
|
686
|
-
case GGML_TYPE_Q5_0:
|
|
687
|
-
case GGML_TYPE_Q5_1:
|
|
688
|
-
case GGML_TYPE_Q8_0:
|
|
689
|
-
return 64;
|
|
690
|
-
case GGML_TYPE_F16:
|
|
691
|
-
case GGML_TYPE_F32:
|
|
692
|
-
return 1;
|
|
693
|
-
case GGML_TYPE_Q2_K:
|
|
694
|
-
case GGML_TYPE_Q3_K:
|
|
695
|
-
case GGML_TYPE_Q4_K:
|
|
696
|
-
case GGML_TYPE_Q5_K:
|
|
697
|
-
case GGML_TYPE_IQ2_XXS:
|
|
698
|
-
case GGML_TYPE_IQ2_XS:
|
|
699
|
-
case GGML_TYPE_IQ2_S:
|
|
700
|
-
case GGML_TYPE_IQ3_XXS:
|
|
701
|
-
case GGML_TYPE_IQ1_S:
|
|
702
|
-
case GGML_TYPE_IQ1_M:
|
|
703
|
-
case GGML_TYPE_IQ4_NL:
|
|
704
|
-
case GGML_TYPE_IQ4_XS:
|
|
705
|
-
case GGML_TYPE_IQ3_S:
|
|
706
|
-
return max_compute_capability >= CC_VOLTA ? 128 : 64;
|
|
707
|
-
case GGML_TYPE_Q6_K:
|
|
708
|
-
return 64;
|
|
709
|
-
default:
|
|
710
|
-
GGML_ASSERT(false);
|
|
711
|
-
}
|
|
712
|
-
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
|
|
640
|
+
return row_rounding;
|
|
713
641
|
}
|
|
714
642
|
|
|
715
643
|
static void get_row_split(int64_t * row_low, int64_t * row_high, const ggml_tensor * tensor, const std::array<float, GGML_CUDA_MAX_DEVICES> & tensor_split, int id) {
|
|
716
644
|
const int64_t nrows = ggml_nrows(tensor);
|
|
717
|
-
const int64_t rounding = get_row_rounding(tensor->type, tensor_split);
|
|
645
|
+
const int64_t rounding = get_row_rounding(tensor_split);
|
|
718
646
|
|
|
719
647
|
*row_low = id == 0 ? 0 : nrows*tensor_split[id];
|
|
720
648
|
*row_low -= *row_low % rounding;
|
|
@@ -929,6 +857,10 @@ GGML_CALL static const char * ggml_backend_cuda_split_buffer_type_name(ggml_back
|
|
|
929
857
|
GGML_UNUSED(buft);
|
|
930
858
|
}
|
|
931
859
|
|
|
860
|
+
static bool ggml_backend_buft_is_cuda_split(ggml_backend_buffer_type_t buft) {
|
|
861
|
+
return buft->iface.get_name == ggml_backend_cuda_split_buffer_type_name;
|
|
862
|
+
}
|
|
863
|
+
|
|
932
864
|
GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_split_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
|
933
865
|
// since we don't know the exact split after rounding, we cannot allocate the device buffers at this point
|
|
934
866
|
// instead, we allocate them for each tensor separately in init_tensor
|
|
@@ -972,12 +904,6 @@ GGML_CALL static size_t ggml_backend_cuda_split_buffer_type_get_alloc_size(ggml_
|
|
|
972
904
|
return total_size;
|
|
973
905
|
}
|
|
974
906
|
|
|
975
|
-
GGML_CALL static bool ggml_backend_cuda_split_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
|
|
976
|
-
return ggml_backend_is_cuda(backend);
|
|
977
|
-
|
|
978
|
-
GGML_UNUSED(buft);
|
|
979
|
-
}
|
|
980
|
-
|
|
981
907
|
GGML_CALL static bool ggml_backend_cuda_split_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
|
|
982
908
|
return false;
|
|
983
909
|
|
|
@@ -990,7 +916,6 @@ static ggml_backend_buffer_type_i ggml_backend_cuda_split_buffer_type_interface
|
|
|
990
916
|
/* .get_alignment = */ ggml_backend_cuda_split_buffer_type_get_alignment,
|
|
991
917
|
/* .get_max_size = */ NULL, // defaults to SIZE_MAX
|
|
992
918
|
/* .get_alloc_size = */ ggml_backend_cuda_split_buffer_type_get_alloc_size,
|
|
993
|
-
/* .supports_backend = */ ggml_backend_cuda_split_buffer_type_supports_backend,
|
|
994
919
|
/* .is_host = */ ggml_backend_cuda_split_buffer_type_is_host,
|
|
995
920
|
};
|
|
996
921
|
|
|
@@ -1090,7 +1015,6 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() {
|
|
|
1090
1015
|
/* .get_alignment = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
|
|
1091
1016
|
/* .get_max_size = */ NULL, // defaults to SIZE_MAX
|
|
1092
1017
|
/* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
|
|
1093
|
-
/* .supports_backend = */ ggml_backend_cpu_buffer_type()->iface.supports_backend,
|
|
1094
1018
|
/* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host,
|
|
1095
1019
|
},
|
|
1096
1020
|
/* .context = */ nullptr,
|
|
@@ -1413,10 +1337,30 @@ static void ggml_cuda_set_peer_access(const int n_tokens, int main_device) {
|
|
|
1413
1337
|
GGML_UNUSED(main_device);
|
|
1414
1338
|
}
|
|
1415
1339
|
|
|
1340
|
+
static cudaError_t ggml_cuda_Memcpy2DPeerAsync(
|
|
1341
|
+
void * dst, int dstDevice, size_t dpitch, void * src, int srcDevice, size_t spitch, size_t width, size_t height, cudaStream_t stream) {
|
|
1342
|
+
|
|
1343
|
+
#if !defined(GGML_USE_HIPBLAS)
|
|
1344
|
+
// cudaMemcpy2DAsync may fail with copies between vmm pools of different devices
|
|
1345
|
+
cudaMemcpy3DPeerParms p = {};
|
|
1346
|
+
p.dstDevice = dstDevice;
|
|
1347
|
+
p.dstPtr = make_cudaPitchedPtr(dst, dpitch, dpitch, height);
|
|
1348
|
+
p.srcDevice = srcDevice;
|
|
1349
|
+
p.srcPtr = make_cudaPitchedPtr(src, spitch, spitch, height);
|
|
1350
|
+
p.extent = make_cudaExtent(width, height, 1);
|
|
1351
|
+
return cudaMemcpy3DPeerAsync(&p, stream);
|
|
1352
|
+
#else
|
|
1353
|
+
// HIP does not support cudaMemcpy3DPeerAsync or vmm pools
|
|
1354
|
+
GGML_UNUSED(dstDevice);
|
|
1355
|
+
GGML_UNUSED(srcDevice);
|
|
1356
|
+
return cudaMemcpy2DAsync(dst, dpitch, src, spitch, width, height, cudaMemcpyDeviceToDevice, stream);
|
|
1357
|
+
#endif // !defined(GGML_USE_HIPBLAS)
|
|
1358
|
+
}
|
|
1359
|
+
|
|
1416
1360
|
static void ggml_cuda_op_mul_mat(
|
|
1417
1361
|
ggml_backend_cuda_context & ctx,
|
|
1418
1362
|
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, ggml_cuda_op_mul_mat_t op,
|
|
1419
|
-
|
|
1363
|
+
quantize_cuda_t quantize_src1) {
|
|
1420
1364
|
|
|
1421
1365
|
const int64_t ne00 = src0->ne[0];
|
|
1422
1366
|
const int64_t ne01 = src0->ne[1];
|
|
@@ -1473,7 +1417,9 @@ static void ggml_cuda_op_mul_mat(
|
|
|
1473
1417
|
}
|
|
1474
1418
|
|
|
1475
1419
|
struct dev_data {
|
|
1476
|
-
|
|
1420
|
+
int cc;
|
|
1421
|
+
|
|
1422
|
+
ggml_cuda_pool_alloc<char> src0_dd_alloc;
|
|
1477
1423
|
ggml_cuda_pool_alloc<float> src1_ddf_alloc;
|
|
1478
1424
|
ggml_cuda_pool_alloc<char> src1_ddq_alloc;
|
|
1479
1425
|
ggml_cuda_pool_alloc<float> dst_dd_alloc;
|
|
@@ -1492,6 +1438,8 @@ static void ggml_cuda_op_mul_mat(
|
|
|
1492
1438
|
int used_devices = 0;
|
|
1493
1439
|
|
|
1494
1440
|
for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
|
|
1441
|
+
dev[id].cc = ggml_cuda_info().devices[id].cc;
|
|
1442
|
+
|
|
1495
1443
|
// by default, use all rows
|
|
1496
1444
|
dev[id].row_low = 0;
|
|
1497
1445
|
dev[id].row_high = ne01;
|
|
@@ -1499,7 +1447,7 @@ static void ggml_cuda_op_mul_mat(
|
|
|
1499
1447
|
// for multi GPU, get the row boundaries from tensor split
|
|
1500
1448
|
// and round to mul_mat_q tile sizes
|
|
1501
1449
|
if (split) {
|
|
1502
|
-
const int64_t rounding = get_row_rounding(src0->type, tensor_split);
|
|
1450
|
+
const int64_t rounding = get_row_rounding(tensor_split);
|
|
1503
1451
|
|
|
1504
1452
|
if (id != 0) {
|
|
1505
1453
|
dev[id].row_low = ne01*tensor_split[id];
|
|
@@ -1542,11 +1490,15 @@ static void ggml_cuda_op_mul_mat(
|
|
|
1542
1490
|
dev[id].src1_ddf = dev[id].src1_ddf_alloc.alloc(ctx.pool(id), ggml_nelements(src1));
|
|
1543
1491
|
}
|
|
1544
1492
|
|
|
1545
|
-
if (
|
|
1546
|
-
|
|
1493
|
+
if (quantize_src1) {
|
|
1494
|
+
size_t src_1_ddq_size = nrows1*src1_padded_col_size*q8_1_ts/q8_1_bs;
|
|
1495
|
+
if (quantize_src1 == quantize_mmq_q8_1_cuda) {
|
|
1496
|
+
src_1_ddq_size += get_mmq_x_max_host(dev[id].cc)*sizeof(block_q8_1_mmq);
|
|
1497
|
+
}
|
|
1498
|
+
dev[id].src1_ddq = dev[id].src1_ddq_alloc.alloc(ctx.pool(id), src_1_ddq_size);
|
|
1547
1499
|
|
|
1548
1500
|
if (src1_on_device && src1_is_contiguous) {
|
|
1549
|
-
|
|
1501
|
+
quantize_src1(dev[id].src1_ddf, dev[id].src1_ddq, ne10, ne11, ne12*ne13, src1_padded_col_size, src0->type, stream);
|
|
1550
1502
|
CUDA_CHECK(cudaGetLastError());
|
|
1551
1503
|
}
|
|
1552
1504
|
}
|
|
@@ -1592,7 +1544,12 @@ static void ggml_cuda_op_mul_mat(
|
|
|
1592
1544
|
const int64_t i03 = i0 / ne12;
|
|
1593
1545
|
const int64_t i02 = i0 % ne12;
|
|
1594
1546
|
|
|
1595
|
-
|
|
1547
|
+
size_t src1_ddq_i_offset = i0*ne11 * src1_padded_col_size*q8_1_ts/q8_1_bs;
|
|
1548
|
+
if (quantize_src1 == quantize_mmq_q8_1_cuda) {
|
|
1549
|
+
src1_ddq_i_offset += src1_col_0 * sizeof(block_q8_1_mmq);
|
|
1550
|
+
} else {
|
|
1551
|
+
src1_ddq_i_offset += src1_col_0 * src1_padded_col_size*q8_1_ts/q8_1_bs;
|
|
1552
|
+
}
|
|
1596
1553
|
|
|
1597
1554
|
// for split tensors the data begins at i0 == i0_offset_low
|
|
1598
1555
|
char * src0_dd_i = dev[id].src0_dd + (i0/i02_divisor) * (ne01*ne00*src0_ts)/src0_bs;
|
|
@@ -1609,10 +1566,17 @@ static void ggml_cuda_op_mul_mat(
|
|
|
1609
1566
|
// copy src0, src1 to device if necessary
|
|
1610
1567
|
if (src1_is_contiguous) {
|
|
1611
1568
|
if (id != ctx.device) {
|
|
1612
|
-
if (
|
|
1569
|
+
if (quantize_src1) {
|
|
1613
1570
|
char * src1_ddq_i_source = dev[ctx.device].src1_ddq + src1_ddq_i_offset;
|
|
1614
|
-
|
|
1615
|
-
|
|
1571
|
+
if (quantize_src1 == quantize_mmq_q8_1_cuda) {
|
|
1572
|
+
const size_t pitch = ne11*sizeof(block_q8_1_mmq);
|
|
1573
|
+
const size_t width = src1_ncols*sizeof(block_q8_1_mmq);
|
|
1574
|
+
const size_t height = src1_padded_col_size/(4*QK8_1);
|
|
1575
|
+
CUDA_CHECK(ggml_cuda_Memcpy2DPeerAsync(src1_ddq_i, id, pitch, src1_ddq_i_source, ctx.device, pitch, width, height, stream));
|
|
1576
|
+
} else {
|
|
1577
|
+
CUDA_CHECK(cudaMemcpyPeerAsync(
|
|
1578
|
+
src1_ddq_i, id, src1_ddq_i_source, ctx.device, src1_ncols*src1_padded_col_size*q8_1_ts/q8_1_bs, stream));
|
|
1579
|
+
}
|
|
1616
1580
|
} else {
|
|
1617
1581
|
float * src1_ddf_i_source = (float *) src1->data;
|
|
1618
1582
|
src1_ddf_i_source += (i0*ne11 + src1_col_0) * ne10;
|
|
@@ -1627,8 +1591,8 @@ static void ggml_cuda_op_mul_mat(
|
|
|
1627
1591
|
GGML_ASSERT(false);
|
|
1628
1592
|
}
|
|
1629
1593
|
|
|
1630
|
-
if (
|
|
1631
|
-
|
|
1594
|
+
if (quantize_src1 && !src1_is_contiguous) {
|
|
1595
|
+
quantize_src1(src1_ddf_i, src1_ddq_i, ne10, src1_ncols, 1, src1_padded_col_size, src0->type, stream);
|
|
1632
1596
|
CUDA_CHECK(cudaGetLastError());
|
|
1633
1597
|
}
|
|
1634
1598
|
|
|
@@ -1653,22 +1617,8 @@ static void ggml_cuda_op_mul_mat(
|
|
|
1653
1617
|
float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3);
|
|
1654
1618
|
GGML_ASSERT(dst->nb[1] == ne0*sizeof(float));
|
|
1655
1619
|
dhf_dst_i += src1_col_0*ne0 + dev[id].row_low;
|
|
1656
|
-
|
|
1657
|
-
|
|
1658
|
-
cudaMemcpy3DPeerParms p = {};
|
|
1659
|
-
p.dstDevice = ctx.device;
|
|
1660
|
-
p.dstPtr = make_cudaPitchedPtr(dhf_dst_i, ne0*sizeof(float), row_diff, src1_ncols);
|
|
1661
|
-
p.srcDevice = id;
|
|
1662
|
-
p.srcPtr = make_cudaPitchedPtr(dst_dd_i, row_diff*sizeof(float), row_diff, src1_ncols);
|
|
1663
|
-
p.extent = make_cudaExtent(row_diff*sizeof(float), src1_ncols, 1);
|
|
1664
|
-
CUDA_CHECK(cudaMemcpy3DPeerAsync(&p, stream));
|
|
1665
|
-
#else
|
|
1666
|
-
// HIP does not support cudaMemcpy3DPeerAsync or vmm pools
|
|
1667
|
-
CUDA_CHECK(cudaMemcpy2DAsync(dhf_dst_i, ne0*sizeof(float),
|
|
1668
|
-
dst_dd_i, row_diff*sizeof(float),
|
|
1669
|
-
row_diff*sizeof(float), src1_ncols,
|
|
1670
|
-
cudaMemcpyDeviceToDevice, stream));
|
|
1671
|
-
#endif
|
|
1620
|
+
CUDA_CHECK(ggml_cuda_Memcpy2DPeerAsync(
|
|
1621
|
+
dhf_dst_i, ctx.device, ne0*sizeof(float), dst_dd_i, id, row_diff*sizeof(float), row_diff*sizeof(float), src1_ncols, stream));
|
|
1672
1622
|
} else {
|
|
1673
1623
|
float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3);
|
|
1674
1624
|
GGML_ASSERT(dst->nb[1] == ne0*sizeof(float));
|
|
@@ -2007,13 +1957,13 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
|
|
|
2007
1957
|
// KQ + KQV multi-batch
|
|
2008
1958
|
ggml_cuda_mul_mat_batched_cublas(ctx, src0, src1, dst);
|
|
2009
1959
|
} else if (use_dequantize_mul_mat_vec) {
|
|
2010
|
-
ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec,
|
|
1960
|
+
ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, nullptr);
|
|
2011
1961
|
} else if (use_mul_mat_vec_q) {
|
|
2012
|
-
ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_vec_q,
|
|
1962
|
+
ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_vec_q, quantize_row_q8_1_cuda);
|
|
2013
1963
|
} else if (use_mul_mat_q) {
|
|
2014
|
-
ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_q,
|
|
1964
|
+
ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_q, quantize_mmq_q8_1_cuda);
|
|
2015
1965
|
} else {
|
|
2016
|
-
ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_cublas,
|
|
1966
|
+
ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_cublas, nullptr);
|
|
2017
1967
|
}
|
|
2018
1968
|
}
|
|
2019
1969
|
|
|
@@ -2780,7 +2730,7 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
|
|
|
2780
2730
|
case GGML_UNARY_OP_HARDSWISH:
|
|
2781
2731
|
case GGML_UNARY_OP_GELU_QUICK:
|
|
2782
2732
|
case GGML_UNARY_OP_TANH:
|
|
2783
|
-
return
|
|
2733
|
+
return ggml_is_contiguous(op->src[0]);
|
|
2784
2734
|
default:
|
|
2785
2735
|
return false;
|
|
2786
2736
|
}
|
|
@@ -2919,6 +2869,20 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
|
|
|
2919
2869
|
GGML_UNUSED(backend);
|
|
2920
2870
|
}
|
|
2921
2871
|
|
|
2872
|
+
GGML_CALL static bool ggml_backend_cuda_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
|
|
2873
|
+
if (ggml_backend_buft_is_cuda_split(buft)) {
|
|
2874
|
+
return true;
|
|
2875
|
+
}
|
|
2876
|
+
|
|
2877
|
+
if (ggml_backend_buft_is_cuda(buft)) {
|
|
2878
|
+
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
|
|
2879
|
+
ggml_backend_cuda_buffer_type_context * buft_ctx = (ggml_backend_cuda_buffer_type_context *)buft->context;
|
|
2880
|
+
return buft_ctx->device == cuda_ctx->device;
|
|
2881
|
+
}
|
|
2882
|
+
|
|
2883
|
+
return false;
|
|
2884
|
+
}
|
|
2885
|
+
|
|
2922
2886
|
GGML_CALL static bool ggml_backend_cuda_offload_op(ggml_backend_t backend, const ggml_tensor * op) {
|
|
2923
2887
|
const int min_batch_size = 32;
|
|
2924
2888
|
|
|
@@ -2991,9 +2955,11 @@ static ggml_backend_i ggml_backend_cuda_interface = {
|
|
|
2991
2955
|
/* .synchronize = */ ggml_backend_cuda_synchronize,
|
|
2992
2956
|
/* .graph_plan_create = */ NULL,
|
|
2993
2957
|
/* .graph_plan_free = */ NULL,
|
|
2958
|
+
/* .graph_plan_update = */ NULL,
|
|
2994
2959
|
/* .graph_plan_compute = */ NULL,
|
|
2995
2960
|
/* .graph_compute = */ ggml_backend_cuda_graph_compute,
|
|
2996
2961
|
/* .supports_op = */ ggml_backend_cuda_supports_op,
|
|
2962
|
+
/* .supports_buft = */ ggml_backend_cuda_supports_buft,
|
|
2997
2963
|
/* .offload_op = */ ggml_backend_cuda_offload_op,
|
|
2998
2964
|
/* .event_new = */ ggml_backend_cuda_event_new,
|
|
2999
2965
|
/* .event_free = */ ggml_backend_cuda_event_free,
|
|
@@ -1340,7 +1340,7 @@ static bool ggml_vk_supports_op(const struct ggml_tensor * op) {
|
|
|
1340
1340
|
case GGML_UNARY_OP_RELU:
|
|
1341
1341
|
case GGML_UNARY_OP_GELU:
|
|
1342
1342
|
case GGML_UNARY_OP_SILU:
|
|
1343
|
-
return
|
|
1343
|
+
return ggml_is_contiguous(op->src[0]);
|
|
1344
1344
|
default:
|
|
1345
1345
|
;
|
|
1346
1346
|
}
|
|
@@ -1902,18 +1902,12 @@ static size_t ggml_backend_vk_buffer_type_get_max_size(ggml_backend_buffer_type_
|
|
|
1902
1902
|
return ctx->max_alloc;
|
|
1903
1903
|
}
|
|
1904
1904
|
|
|
1905
|
-
static bool ggml_backend_kompute_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
|
|
1906
|
-
GGML_UNUSED(buft);
|
|
1907
|
-
return ggml_backend_is_kompute(backend);
|
|
1908
|
-
}
|
|
1909
|
-
|
|
1910
1905
|
static ggml_backend_buffer_type_i ggml_backend_kompute_buffer_type_interface = {
|
|
1911
1906
|
/* .get_name = */ ggml_backend_kompute_buffer_type_get_name,
|
|
1912
1907
|
/* .alloc_buffer = */ ggml_backend_kompute_buffer_type_alloc_buffer,
|
|
1913
1908
|
/* .get_alignment = */ ggml_backend_kompute_buffer_type_get_alignment,
|
|
1914
1909
|
/* .get_max_size = */ ggml_backend_vk_buffer_type_get_max_size,
|
|
1915
1910
|
/* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
|
|
1916
|
-
/* .supports_backend = */ ggml_backend_kompute_buffer_type_supports_backend,
|
|
1917
1911
|
/* .is_host = */ NULL,
|
|
1918
1912
|
};
|
|
1919
1913
|
|
|
@@ -1973,6 +1967,11 @@ static bool ggml_backend_kompute_supports_op(ggml_backend_t backend, const struc
|
|
|
1973
1967
|
return ggml_vk_supports_op(op);
|
|
1974
1968
|
}
|
|
1975
1969
|
|
|
1970
|
+
static bool ggml_backend_kompute_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
|
|
1971
|
+
GGML_UNUSED(backend);
|
|
1972
|
+
return buft->iface.get_name == ggml_backend_kompute_buffer_type_get_name;
|
|
1973
|
+
}
|
|
1974
|
+
|
|
1976
1975
|
static struct ggml_backend_i kompute_backend_i = {
|
|
1977
1976
|
/* .get_name = */ ggml_backend_kompute_name,
|
|
1978
1977
|
/* .free = */ ggml_backend_kompute_free,
|
|
@@ -1983,9 +1982,11 @@ static struct ggml_backend_i kompute_backend_i = {
|
|
|
1983
1982
|
/* .synchronize = */ NULL,
|
|
1984
1983
|
/* .graph_plan_create = */ NULL,
|
|
1985
1984
|
/* .graph_plan_free = */ NULL,
|
|
1985
|
+
/* .graph_plan_update = */ NULL,
|
|
1986
1986
|
/* .graph_plan_compute = */ NULL,
|
|
1987
1987
|
/* .graph_compute = */ ggml_backend_kompute_graph_compute,
|
|
1988
1988
|
/* .supports_op = */ ggml_backend_kompute_supports_op,
|
|
1989
|
+
/* .supports_buft = */ ggml_backend_kompute_supports_buft,
|
|
1989
1990
|
/* .offload_op = */ NULL,
|
|
1990
1991
|
/* .event_new = */ NULL,
|
|
1991
1992
|
/* .event_free = */ NULL,
|
|
@@ -735,6 +735,12 @@ static id<MTLBuffer> ggml_metal_get_buffer(struct ggml_tensor * t, size_t * offs
|
|
|
735
735
|
}
|
|
736
736
|
|
|
737
737
|
static bool ggml_metal_supports_op(const struct ggml_metal_context * ctx, const struct ggml_tensor * op) {
|
|
738
|
+
for (size_t i = 0, n = 3; i < n; ++i) {
|
|
739
|
+
if (op->src[i] != NULL && op->src[i]->type == GGML_TYPE_BF16) {
|
|
740
|
+
return false;
|
|
741
|
+
}
|
|
742
|
+
}
|
|
743
|
+
|
|
738
744
|
switch (op->op) {
|
|
739
745
|
case GGML_OP_UNARY:
|
|
740
746
|
switch (ggml_get_unary_op(op)) {
|
|
@@ -744,7 +750,7 @@ static bool ggml_metal_supports_op(const struct ggml_metal_context * ctx, const
|
|
|
744
750
|
case GGML_UNARY_OP_GELU:
|
|
745
751
|
case GGML_UNARY_OP_GELU_QUICK:
|
|
746
752
|
case GGML_UNARY_OP_SILU:
|
|
747
|
-
return
|
|
753
|
+
return ggml_is_contiguous(op->src[0]);
|
|
748
754
|
default:
|
|
749
755
|
return false;
|
|
750
756
|
}
|
|
@@ -1862,9 +1868,10 @@ static enum ggml_status ggml_metal_graph_compute(
|
|
|
1862
1868
|
// ne21 = n_rows
|
|
1863
1869
|
const int dst_rows = ne20*ne21;
|
|
1864
1870
|
const int dst_rows_min = n_as;
|
|
1871
|
+
const int dst_rows_max = (ctx->device.maxThreadgroupMemoryLength - 32 - 8192)/4;
|
|
1865
1872
|
|
|
1866
1873
|
// max size of the rowids array in the kernel shared buffer
|
|
1867
|
-
GGML_ASSERT(dst_rows <=
|
|
1874
|
+
GGML_ASSERT(dst_rows <= dst_rows_max);
|
|
1868
1875
|
|
|
1869
1876
|
// for now the matrix-matrix multiplication kernel only works on A14+/M1+ SoCs
|
|
1870
1877
|
// AMD GPU and older A-chips will reuse matrix-vector multiplication kernel
|
|
@@ -3044,12 +3051,6 @@ GGML_CALL static size_t ggml_backend_metal_buffer_type_get_max_size(ggml_backend
|
|
|
3044
3051
|
UNUSED(buft);
|
|
3045
3052
|
}
|
|
3046
3053
|
|
|
3047
|
-
GGML_CALL static bool ggml_backend_metal_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
|
|
3048
|
-
return ggml_backend_is_metal(backend) || ggml_backend_is_cpu(backend);
|
|
3049
|
-
|
|
3050
|
-
UNUSED(buft);
|
|
3051
|
-
}
|
|
3052
|
-
|
|
3053
3054
|
GGML_CALL static bool ggml_backend_metal_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
|
|
3054
3055
|
return true;
|
|
3055
3056
|
|
|
@@ -3064,7 +3065,6 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) {
|
|
|
3064
3065
|
/* .get_alignment = */ ggml_backend_metal_buffer_type_get_alignment,
|
|
3065
3066
|
/* .get_max_size = */ ggml_backend_metal_buffer_type_get_max_size,
|
|
3066
3067
|
/* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
|
|
3067
|
-
/* .supports_backend = */ ggml_backend_metal_buffer_type_supports_backend,
|
|
3068
3068
|
/* .is_host = */ ggml_backend_metal_buffer_type_is_host,
|
|
3069
3069
|
},
|
|
3070
3070
|
/* .context = */ NULL,
|
|
@@ -3179,6 +3179,12 @@ GGML_CALL static bool ggml_backend_metal_supports_op(ggml_backend_t backend, con
|
|
|
3179
3179
|
return ggml_metal_supports_op(metal_ctx, op);
|
|
3180
3180
|
}
|
|
3181
3181
|
|
|
3182
|
+
GGML_CALL static bool ggml_backend_metal_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
|
|
3183
|
+
return buft->iface.get_name == ggml_backend_metal_buffer_type_get_name;
|
|
3184
|
+
|
|
3185
|
+
UNUSED(backend);
|
|
3186
|
+
}
|
|
3187
|
+
|
|
3182
3188
|
static struct ggml_backend_i ggml_backend_metal_i = {
|
|
3183
3189
|
/* .get_name = */ ggml_backend_metal_name,
|
|
3184
3190
|
/* .free = */ ggml_backend_metal_free,
|
|
@@ -3189,9 +3195,11 @@ static struct ggml_backend_i ggml_backend_metal_i = {
|
|
|
3189
3195
|
/* .synchronize = */ NULL,
|
|
3190
3196
|
/* .graph_plan_create = */ NULL,
|
|
3191
3197
|
/* .graph_plan_free = */ NULL,
|
|
3198
|
+
/* .graph_plan_update = */ NULL,
|
|
3192
3199
|
/* .graph_plan_compute = */ NULL,
|
|
3193
3200
|
/* .graph_compute = */ ggml_backend_metal_graph_compute,
|
|
3194
3201
|
/* .supports_op = */ ggml_backend_metal_supports_op,
|
|
3202
|
+
/* .supports_buft = */ ggml_backend_metal_supports_buft,
|
|
3195
3203
|
/* .offload_op = */ NULL,
|
|
3196
3204
|
/* .event_new = */ NULL,
|
|
3197
3205
|
/* .event_free = */ NULL,
|