llama_cpp 0.15.4 → 0.16.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +16 -0
- data/ext/llama_cpp/extconf.rb +3 -2
- data/ext/llama_cpp/llama_cpp.cpp +17 -3
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +15 -1
- data/vendor/tmp/llama.cpp/Makefile +166 -82
- data/vendor/tmp/llama.cpp/ggml-alloc.c +82 -26
- data/vendor/tmp/llama.cpp/ggml-backend-impl.h +20 -8
- data/vendor/tmp/llama.cpp/ggml-backend.c +183 -69
- data/vendor/tmp/llama.cpp/ggml-backend.h +4 -4
- data/vendor/tmp/llama.cpp/ggml-blas.cpp +363 -0
- data/vendor/tmp/llama.cpp/ggml-blas.h +23 -0
- data/vendor/tmp/llama.cpp/ggml-common.h +6 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/acc.cu +47 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/arange.cu +34 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/argsort.cu +104 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/binbcast.cu +280 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/clamp.cu +34 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/concat.cu +196 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/convert.cu +686 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/cpy.cu +490 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/diagmask.cu +40 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/dmmv.cu +674 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/fattn-tile-f16.cu +319 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/fattn-tile-f32.cu +312 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/fattn.cu +345 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/getrows.cu +178 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/im2col.cu +104 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/mmq.cu +88 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/mmvq.cu +419 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/norm.cu +221 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/pad.cu +49 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/pool2d.cu +94 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/quantize.cu +112 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/rope.cu +271 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/scale.cu +31 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/softmax.cu +206 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/sumrows.cu +40 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb16.cu +10 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb32.cu +9 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb16.cu +10 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb32.cu +10 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb8.cu +8 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q2_k.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q3_k.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_k.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_k.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q6_k.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/tsembd.cu +47 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/unary.cu +286 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/upscale.cu +51 -0
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +103 -135
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +29 -13
- data/vendor/tmp/llama.cpp/ggml-metal.h +1 -1
- data/vendor/tmp/llama.cpp/ggml-metal.m +45 -33
- data/vendor/tmp/llama.cpp/ggml-metal.metal +83 -59
- data/vendor/tmp/llama.cpp/ggml-rpc.cpp +15 -14
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +26 -90
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +74522 -14913
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +631 -471
- data/vendor/tmp/llama.cpp/ggml.c +278 -603
- data/vendor/tmp/llama.cpp/ggml.h +9 -28
- data/vendor/tmp/llama.cpp/llama.cpp +345 -473
- data/vendor/tmp/llama.cpp/llama.h +21 -43
- metadata +134 -7
- data/vendor/tmp/llama.cpp/ggml-mpi.c +0 -216
- data/vendor/tmp/llama.cpp/ggml-mpi.h +0 -39
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +0 -2305
- data/vendor/tmp/llama.cpp/ggml-opencl.h +0 -36
@@ -188,13 +188,15 @@ static ggml_cuda_device_info ggml_cuda_init() {
|
|
188
188
|
info.default_tensor_split[id] = total_vram;
|
189
189
|
total_vram += prop.totalGlobalMem;
|
190
190
|
|
191
|
+
info.devices[id].nsm = prop.multiProcessorCount;
|
192
|
+
info.devices[id].smpb = prop.sharedMemPerBlock;
|
191
193
|
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
|
194
|
+
info.devices[id].smpbo = prop.sharedMemPerBlock;
|
192
195
|
info.devices[id].cc = 100*prop.major + 10*prop.minor + CC_OFFSET_AMD;
|
193
196
|
#else
|
197
|
+
info.devices[id].smpbo = prop.sharedMemPerBlockOptin;
|
194
198
|
info.devices[id].cc = 100*prop.major + 10*prop.minor;
|
195
199
|
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
|
196
|
-
info.devices[id].smpb = prop.sharedMemPerBlock;
|
197
|
-
info.devices[id].nsm = prop.multiProcessorCount;
|
198
200
|
}
|
199
201
|
|
200
202
|
for (int id = 0; id < info.device_count; ++id) {
|
@@ -543,6 +545,10 @@ GGML_CALL static const char * ggml_backend_cuda_buffer_type_name(ggml_backend_bu
|
|
543
545
|
return ctx->name.c_str();
|
544
546
|
}
|
545
547
|
|
548
|
+
static bool ggml_backend_buft_is_cuda(ggml_backend_buffer_type_t buft) {
|
549
|
+
return buft->iface.get_name == ggml_backend_cuda_buffer_type_name;
|
550
|
+
}
|
551
|
+
|
546
552
|
GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
547
553
|
ggml_backend_cuda_buffer_type_context * buft_ctx = (ggml_backend_cuda_buffer_type_context *)buft->context;
|
548
554
|
|
@@ -585,24 +591,12 @@ GGML_CALL static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backen
|
|
585
591
|
GGML_UNUSED(buft);
|
586
592
|
}
|
587
593
|
|
588
|
-
GGML_CALL static bool ggml_backend_cuda_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
|
589
|
-
if (!ggml_backend_is_cuda(backend)) {
|
590
|
-
return false;
|
591
|
-
}
|
592
|
-
|
593
|
-
ggml_backend_cuda_buffer_type_context * buft_ctx = (ggml_backend_cuda_buffer_type_context *)buft->context;
|
594
|
-
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
|
595
|
-
|
596
|
-
return buft_ctx->device == cuda_ctx->device;
|
597
|
-
}
|
598
|
-
|
599
594
|
static ggml_backend_buffer_type_i ggml_backend_cuda_buffer_type_interface = {
|
600
595
|
/* .get_name = */ ggml_backend_cuda_buffer_type_name,
|
601
596
|
/* .alloc_buffer = */ ggml_backend_cuda_buffer_type_alloc_buffer,
|
602
597
|
/* .get_alignment = */ ggml_backend_cuda_buffer_type_get_alignment,
|
603
598
|
/* .get_max_size = */ NULL, // defaults to SIZE_MAX
|
604
599
|
/* .get_alloc_size = */ ggml_backend_cuda_buffer_type_get_alloc_size,
|
605
|
-
/* .supports_backend = */ ggml_backend_cuda_buffer_type_supports_backend,
|
606
600
|
/* .is_host = */ NULL,
|
607
601
|
};
|
608
602
|
|
@@ -633,88 +627,22 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) {
|
|
633
627
|
|
634
628
|
// cuda split buffer
|
635
629
|
|
636
|
-
static int64_t get_row_rounding(
|
637
|
-
int64_t
|
638
|
-
int64_t max_compute_capability = INT_MIN;
|
630
|
+
static int64_t get_row_rounding(const std::array<float, GGML_CUDA_MAX_DEVICES> & tensor_split) {
|
631
|
+
int64_t row_rounding = 0;
|
639
632
|
for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
|
640
|
-
if (tensor_split[id]
|
641
|
-
|
642
|
-
min_compute_capability = ggml_cuda_info().devices[id].cc;
|
643
|
-
}
|
644
|
-
if (max_compute_capability < ggml_cuda_info().devices[id].cc) {
|
645
|
-
max_compute_capability = ggml_cuda_info().devices[id].cc;
|
646
|
-
}
|
633
|
+
if (tensor_split[id] >= (id + 1 < ggml_backend_cuda_get_device_count() ? tensor_split[id + 1] : 1.0f)) {
|
634
|
+
continue;
|
647
635
|
}
|
648
|
-
}
|
649
636
|
|
650
|
-
|
651
|
-
|
652
|
-
case GGML_TYPE_Q4_0:
|
653
|
-
case GGML_TYPE_Q4_1:
|
654
|
-
case GGML_TYPE_Q5_0:
|
655
|
-
case GGML_TYPE_Q5_1:
|
656
|
-
case GGML_TYPE_Q8_0:
|
657
|
-
return max_compute_capability >= CC_RDNA2 ? 128 : 64;
|
658
|
-
case GGML_TYPE_F16:
|
659
|
-
case GGML_TYPE_F32:
|
660
|
-
return 1;
|
661
|
-
case GGML_TYPE_Q2_K:
|
662
|
-
return max_compute_capability >= CC_RDNA2 ? 128 : 32;
|
663
|
-
case GGML_TYPE_Q3_K:
|
664
|
-
return min_compute_capability < CC_RDNA2 ? 128 : 64;
|
665
|
-
case GGML_TYPE_Q4_K:
|
666
|
-
case GGML_TYPE_Q5_K:
|
667
|
-
case GGML_TYPE_Q6_K:
|
668
|
-
case GGML_TYPE_IQ2_XXS:
|
669
|
-
case GGML_TYPE_IQ2_XS:
|
670
|
-
case GGML_TYPE_IQ2_S:
|
671
|
-
case GGML_TYPE_IQ3_XXS:
|
672
|
-
case GGML_TYPE_IQ1_S:
|
673
|
-
case GGML_TYPE_IQ1_M:
|
674
|
-
case GGML_TYPE_IQ4_NL:
|
675
|
-
case GGML_TYPE_IQ4_XS:
|
676
|
-
case GGML_TYPE_IQ3_S:
|
677
|
-
return max_compute_capability >= CC_RDNA2 ? 128 : 64;
|
678
|
-
default:
|
679
|
-
GGML_ASSERT(false);
|
680
|
-
}
|
681
|
-
#else
|
682
|
-
switch(type) {
|
683
|
-
case GGML_TYPE_Q4_0:
|
684
|
-
case GGML_TYPE_Q4_1:
|
685
|
-
return max_compute_capability >= CC_VOLTA ? 128 : 64;
|
686
|
-
case GGML_TYPE_Q5_0:
|
687
|
-
case GGML_TYPE_Q5_1:
|
688
|
-
case GGML_TYPE_Q8_0:
|
689
|
-
return 64;
|
690
|
-
case GGML_TYPE_F16:
|
691
|
-
case GGML_TYPE_F32:
|
692
|
-
return 1;
|
693
|
-
case GGML_TYPE_Q2_K:
|
694
|
-
case GGML_TYPE_Q3_K:
|
695
|
-
case GGML_TYPE_Q4_K:
|
696
|
-
case GGML_TYPE_Q5_K:
|
697
|
-
case GGML_TYPE_IQ2_XXS:
|
698
|
-
case GGML_TYPE_IQ2_XS:
|
699
|
-
case GGML_TYPE_IQ2_S:
|
700
|
-
case GGML_TYPE_IQ3_XXS:
|
701
|
-
case GGML_TYPE_IQ1_S:
|
702
|
-
case GGML_TYPE_IQ1_M:
|
703
|
-
case GGML_TYPE_IQ4_NL:
|
704
|
-
case GGML_TYPE_IQ4_XS:
|
705
|
-
case GGML_TYPE_IQ3_S:
|
706
|
-
return max_compute_capability >= CC_VOLTA ? 128 : 64;
|
707
|
-
case GGML_TYPE_Q6_K:
|
708
|
-
return 64;
|
709
|
-
default:
|
710
|
-
GGML_ASSERT(false);
|
637
|
+
const int cc = ggml_cuda_info().devices[id].cc;
|
638
|
+
row_rounding = std::max(row_rounding, (int64_t)get_mmq_y_host(cc, get_mmq_x_max_host(cc)));
|
711
639
|
}
|
712
|
-
|
640
|
+
return row_rounding;
|
713
641
|
}
|
714
642
|
|
715
643
|
static void get_row_split(int64_t * row_low, int64_t * row_high, const ggml_tensor * tensor, const std::array<float, GGML_CUDA_MAX_DEVICES> & tensor_split, int id) {
|
716
644
|
const int64_t nrows = ggml_nrows(tensor);
|
717
|
-
const int64_t rounding = get_row_rounding(
|
645
|
+
const int64_t rounding = get_row_rounding(tensor_split);
|
718
646
|
|
719
647
|
*row_low = id == 0 ? 0 : nrows*tensor_split[id];
|
720
648
|
*row_low -= *row_low % rounding;
|
@@ -929,6 +857,10 @@ GGML_CALL static const char * ggml_backend_cuda_split_buffer_type_name(ggml_back
|
|
929
857
|
GGML_UNUSED(buft);
|
930
858
|
}
|
931
859
|
|
860
|
+
static bool ggml_backend_buft_is_cuda_split(ggml_backend_buffer_type_t buft) {
|
861
|
+
return buft->iface.get_name == ggml_backend_cuda_split_buffer_type_name;
|
862
|
+
}
|
863
|
+
|
932
864
|
GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_split_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
933
865
|
// since we don't know the exact split after rounding, we cannot allocate the device buffers at this point
|
934
866
|
// instead, we allocate them for each tensor separately in init_tensor
|
@@ -972,12 +904,6 @@ GGML_CALL static size_t ggml_backend_cuda_split_buffer_type_get_alloc_size(ggml_
|
|
972
904
|
return total_size;
|
973
905
|
}
|
974
906
|
|
975
|
-
GGML_CALL static bool ggml_backend_cuda_split_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
|
976
|
-
return ggml_backend_is_cuda(backend);
|
977
|
-
|
978
|
-
GGML_UNUSED(buft);
|
979
|
-
}
|
980
|
-
|
981
907
|
GGML_CALL static bool ggml_backend_cuda_split_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
|
982
908
|
return false;
|
983
909
|
|
@@ -990,7 +916,6 @@ static ggml_backend_buffer_type_i ggml_backend_cuda_split_buffer_type_interface
|
|
990
916
|
/* .get_alignment = */ ggml_backend_cuda_split_buffer_type_get_alignment,
|
991
917
|
/* .get_max_size = */ NULL, // defaults to SIZE_MAX
|
992
918
|
/* .get_alloc_size = */ ggml_backend_cuda_split_buffer_type_get_alloc_size,
|
993
|
-
/* .supports_backend = */ ggml_backend_cuda_split_buffer_type_supports_backend,
|
994
919
|
/* .is_host = */ ggml_backend_cuda_split_buffer_type_is_host,
|
995
920
|
};
|
996
921
|
|
@@ -1090,7 +1015,6 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() {
|
|
1090
1015
|
/* .get_alignment = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
|
1091
1016
|
/* .get_max_size = */ NULL, // defaults to SIZE_MAX
|
1092
1017
|
/* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
|
1093
|
-
/* .supports_backend = */ ggml_backend_cpu_buffer_type()->iface.supports_backend,
|
1094
1018
|
/* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host,
|
1095
1019
|
},
|
1096
1020
|
/* .context = */ nullptr,
|
@@ -1413,10 +1337,30 @@ static void ggml_cuda_set_peer_access(const int n_tokens, int main_device) {
|
|
1413
1337
|
GGML_UNUSED(main_device);
|
1414
1338
|
}
|
1415
1339
|
|
1340
|
+
static cudaError_t ggml_cuda_Memcpy2DPeerAsync(
|
1341
|
+
void * dst, int dstDevice, size_t dpitch, void * src, int srcDevice, size_t spitch, size_t width, size_t height, cudaStream_t stream) {
|
1342
|
+
|
1343
|
+
#if !defined(GGML_USE_HIPBLAS)
|
1344
|
+
// cudaMemcpy2DAsync may fail with copies between vmm pools of different devices
|
1345
|
+
cudaMemcpy3DPeerParms p = {};
|
1346
|
+
p.dstDevice = dstDevice;
|
1347
|
+
p.dstPtr = make_cudaPitchedPtr(dst, dpitch, dpitch, height);
|
1348
|
+
p.srcDevice = srcDevice;
|
1349
|
+
p.srcPtr = make_cudaPitchedPtr(src, spitch, spitch, height);
|
1350
|
+
p.extent = make_cudaExtent(width, height, 1);
|
1351
|
+
return cudaMemcpy3DPeerAsync(&p, stream);
|
1352
|
+
#else
|
1353
|
+
// HIP does not support cudaMemcpy3DPeerAsync or vmm pools
|
1354
|
+
GGML_UNUSED(dstDevice);
|
1355
|
+
GGML_UNUSED(srcDevice);
|
1356
|
+
return cudaMemcpy2DAsync(dst, dpitch, src, spitch, width, height, cudaMemcpyDeviceToDevice, stream);
|
1357
|
+
#endif // !defined(GGML_USE_HIPBLAS)
|
1358
|
+
}
|
1359
|
+
|
1416
1360
|
static void ggml_cuda_op_mul_mat(
|
1417
1361
|
ggml_backend_cuda_context & ctx,
|
1418
1362
|
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, ggml_cuda_op_mul_mat_t op,
|
1419
|
-
|
1363
|
+
quantize_cuda_t quantize_src1) {
|
1420
1364
|
|
1421
1365
|
const int64_t ne00 = src0->ne[0];
|
1422
1366
|
const int64_t ne01 = src0->ne[1];
|
@@ -1473,7 +1417,9 @@ static void ggml_cuda_op_mul_mat(
|
|
1473
1417
|
}
|
1474
1418
|
|
1475
1419
|
struct dev_data {
|
1476
|
-
|
1420
|
+
int cc;
|
1421
|
+
|
1422
|
+
ggml_cuda_pool_alloc<char> src0_dd_alloc;
|
1477
1423
|
ggml_cuda_pool_alloc<float> src1_ddf_alloc;
|
1478
1424
|
ggml_cuda_pool_alloc<char> src1_ddq_alloc;
|
1479
1425
|
ggml_cuda_pool_alloc<float> dst_dd_alloc;
|
@@ -1492,6 +1438,8 @@ static void ggml_cuda_op_mul_mat(
|
|
1492
1438
|
int used_devices = 0;
|
1493
1439
|
|
1494
1440
|
for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
|
1441
|
+
dev[id].cc = ggml_cuda_info().devices[id].cc;
|
1442
|
+
|
1495
1443
|
// by default, use all rows
|
1496
1444
|
dev[id].row_low = 0;
|
1497
1445
|
dev[id].row_high = ne01;
|
@@ -1499,7 +1447,7 @@ static void ggml_cuda_op_mul_mat(
|
|
1499
1447
|
// for multi GPU, get the row boundaries from tensor split
|
1500
1448
|
// and round to mul_mat_q tile sizes
|
1501
1449
|
if (split) {
|
1502
|
-
const int64_t rounding = get_row_rounding(
|
1450
|
+
const int64_t rounding = get_row_rounding(tensor_split);
|
1503
1451
|
|
1504
1452
|
if (id != 0) {
|
1505
1453
|
dev[id].row_low = ne01*tensor_split[id];
|
@@ -1542,11 +1490,15 @@ static void ggml_cuda_op_mul_mat(
|
|
1542
1490
|
dev[id].src1_ddf = dev[id].src1_ddf_alloc.alloc(ctx.pool(id), ggml_nelements(src1));
|
1543
1491
|
}
|
1544
1492
|
|
1545
|
-
if (
|
1546
|
-
|
1493
|
+
if (quantize_src1) {
|
1494
|
+
size_t src_1_ddq_size = nrows1*src1_padded_col_size*q8_1_ts/q8_1_bs;
|
1495
|
+
if (quantize_src1 == quantize_mmq_q8_1_cuda) {
|
1496
|
+
src_1_ddq_size += get_mmq_x_max_host(dev[id].cc)*sizeof(block_q8_1_mmq);
|
1497
|
+
}
|
1498
|
+
dev[id].src1_ddq = dev[id].src1_ddq_alloc.alloc(ctx.pool(id), src_1_ddq_size);
|
1547
1499
|
|
1548
1500
|
if (src1_on_device && src1_is_contiguous) {
|
1549
|
-
|
1501
|
+
quantize_src1(dev[id].src1_ddf, dev[id].src1_ddq, ne10, ne11, ne12*ne13, src1_padded_col_size, src0->type, stream);
|
1550
1502
|
CUDA_CHECK(cudaGetLastError());
|
1551
1503
|
}
|
1552
1504
|
}
|
@@ -1592,7 +1544,12 @@ static void ggml_cuda_op_mul_mat(
|
|
1592
1544
|
const int64_t i03 = i0 / ne12;
|
1593
1545
|
const int64_t i02 = i0 % ne12;
|
1594
1546
|
|
1595
|
-
|
1547
|
+
size_t src1_ddq_i_offset = i0*ne11 * src1_padded_col_size*q8_1_ts/q8_1_bs;
|
1548
|
+
if (quantize_src1 == quantize_mmq_q8_1_cuda) {
|
1549
|
+
src1_ddq_i_offset += src1_col_0 * sizeof(block_q8_1_mmq);
|
1550
|
+
} else {
|
1551
|
+
src1_ddq_i_offset += src1_col_0 * src1_padded_col_size*q8_1_ts/q8_1_bs;
|
1552
|
+
}
|
1596
1553
|
|
1597
1554
|
// for split tensors the data begins at i0 == i0_offset_low
|
1598
1555
|
char * src0_dd_i = dev[id].src0_dd + (i0/i02_divisor) * (ne01*ne00*src0_ts)/src0_bs;
|
@@ -1609,10 +1566,17 @@ static void ggml_cuda_op_mul_mat(
|
|
1609
1566
|
// copy src0, src1 to device if necessary
|
1610
1567
|
if (src1_is_contiguous) {
|
1611
1568
|
if (id != ctx.device) {
|
1612
|
-
if (
|
1569
|
+
if (quantize_src1) {
|
1613
1570
|
char * src1_ddq_i_source = dev[ctx.device].src1_ddq + src1_ddq_i_offset;
|
1614
|
-
|
1615
|
-
|
1571
|
+
if (quantize_src1 == quantize_mmq_q8_1_cuda) {
|
1572
|
+
const size_t pitch = ne11*sizeof(block_q8_1_mmq);
|
1573
|
+
const size_t width = src1_ncols*sizeof(block_q8_1_mmq);
|
1574
|
+
const size_t height = src1_padded_col_size/(4*QK8_1);
|
1575
|
+
CUDA_CHECK(ggml_cuda_Memcpy2DPeerAsync(src1_ddq_i, id, pitch, src1_ddq_i_source, ctx.device, pitch, width, height, stream));
|
1576
|
+
} else {
|
1577
|
+
CUDA_CHECK(cudaMemcpyPeerAsync(
|
1578
|
+
src1_ddq_i, id, src1_ddq_i_source, ctx.device, src1_ncols*src1_padded_col_size*q8_1_ts/q8_1_bs, stream));
|
1579
|
+
}
|
1616
1580
|
} else {
|
1617
1581
|
float * src1_ddf_i_source = (float *) src1->data;
|
1618
1582
|
src1_ddf_i_source += (i0*ne11 + src1_col_0) * ne10;
|
@@ -1627,8 +1591,8 @@ static void ggml_cuda_op_mul_mat(
|
|
1627
1591
|
GGML_ASSERT(false);
|
1628
1592
|
}
|
1629
1593
|
|
1630
|
-
if (
|
1631
|
-
|
1594
|
+
if (quantize_src1 && !src1_is_contiguous) {
|
1595
|
+
quantize_src1(src1_ddf_i, src1_ddq_i, ne10, src1_ncols, 1, src1_padded_col_size, src0->type, stream);
|
1632
1596
|
CUDA_CHECK(cudaGetLastError());
|
1633
1597
|
}
|
1634
1598
|
|
@@ -1653,22 +1617,8 @@ static void ggml_cuda_op_mul_mat(
|
|
1653
1617
|
float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3);
|
1654
1618
|
GGML_ASSERT(dst->nb[1] == ne0*sizeof(float));
|
1655
1619
|
dhf_dst_i += src1_col_0*ne0 + dev[id].row_low;
|
1656
|
-
|
1657
|
-
|
1658
|
-
cudaMemcpy3DPeerParms p = {};
|
1659
|
-
p.dstDevice = ctx.device;
|
1660
|
-
p.dstPtr = make_cudaPitchedPtr(dhf_dst_i, ne0*sizeof(float), row_diff, src1_ncols);
|
1661
|
-
p.srcDevice = id;
|
1662
|
-
p.srcPtr = make_cudaPitchedPtr(dst_dd_i, row_diff*sizeof(float), row_diff, src1_ncols);
|
1663
|
-
p.extent = make_cudaExtent(row_diff*sizeof(float), src1_ncols, 1);
|
1664
|
-
CUDA_CHECK(cudaMemcpy3DPeerAsync(&p, stream));
|
1665
|
-
#else
|
1666
|
-
// HIP does not support cudaMemcpy3DPeerAsync or vmm pools
|
1667
|
-
CUDA_CHECK(cudaMemcpy2DAsync(dhf_dst_i, ne0*sizeof(float),
|
1668
|
-
dst_dd_i, row_diff*sizeof(float),
|
1669
|
-
row_diff*sizeof(float), src1_ncols,
|
1670
|
-
cudaMemcpyDeviceToDevice, stream));
|
1671
|
-
#endif
|
1620
|
+
CUDA_CHECK(ggml_cuda_Memcpy2DPeerAsync(
|
1621
|
+
dhf_dst_i, ctx.device, ne0*sizeof(float), dst_dd_i, id, row_diff*sizeof(float), row_diff*sizeof(float), src1_ncols, stream));
|
1672
1622
|
} else {
|
1673
1623
|
float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3);
|
1674
1624
|
GGML_ASSERT(dst->nb[1] == ne0*sizeof(float));
|
@@ -2007,13 +1957,13 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
|
|
2007
1957
|
// KQ + KQV multi-batch
|
2008
1958
|
ggml_cuda_mul_mat_batched_cublas(ctx, src0, src1, dst);
|
2009
1959
|
} else if (use_dequantize_mul_mat_vec) {
|
2010
|
-
ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec,
|
1960
|
+
ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, nullptr);
|
2011
1961
|
} else if (use_mul_mat_vec_q) {
|
2012
|
-
ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_vec_q,
|
1962
|
+
ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_vec_q, quantize_row_q8_1_cuda);
|
2013
1963
|
} else if (use_mul_mat_q) {
|
2014
|
-
ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_q,
|
1964
|
+
ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_q, quantize_mmq_q8_1_cuda);
|
2015
1965
|
} else {
|
2016
|
-
ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_cublas,
|
1966
|
+
ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_cublas, nullptr);
|
2017
1967
|
}
|
2018
1968
|
}
|
2019
1969
|
|
@@ -2702,10 +2652,8 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
|
|
2702
2652
|
|
2703
2653
|
if (cuda_graph_update_required) {
|
2704
2654
|
// Extract nodes from graph
|
2705
|
-
|
2706
|
-
|
2707
|
-
CUDA_CHECK(cudaGraphGetNodes(cuda_ctx->cuda_graph->graph, nullptr, &cuda_ctx->cuda_graph->num_nodes));
|
2708
|
-
}
|
2655
|
+
// First call with null argument gets number of nodes in graph
|
2656
|
+
CUDA_CHECK(cudaGraphGetNodes(cuda_ctx->cuda_graph->graph, nullptr, &cuda_ctx->cuda_graph->num_nodes));
|
2709
2657
|
// Subsequent call with non-null argument gets nodes
|
2710
2658
|
cuda_ctx->cuda_graph->nodes.resize(cuda_ctx->cuda_graph->num_nodes);
|
2711
2659
|
cuda_ctx->cuda_graph->params.resize(cuda_ctx->cuda_graph->num_nodes);
|
@@ -2782,7 +2730,7 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
|
|
2782
2730
|
case GGML_UNARY_OP_HARDSWISH:
|
2783
2731
|
case GGML_UNARY_OP_GELU_QUICK:
|
2784
2732
|
case GGML_UNARY_OP_TANH:
|
2785
|
-
return
|
2733
|
+
return ggml_is_contiguous(op->src[0]);
|
2786
2734
|
default:
|
2787
2735
|
return false;
|
2788
2736
|
}
|
@@ -2905,10 +2853,14 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
|
|
2905
2853
|
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
|
2906
2854
|
return op->src[0]->ne[0] == 64 || op->src[0]->ne[0] == 128;
|
2907
2855
|
#else
|
2908
|
-
if (op->src[0]->ne[0] ==
|
2856
|
+
if (op->src[0]->ne[0] == 128) {
|
2909
2857
|
return true;
|
2910
2858
|
}
|
2911
|
-
|
2859
|
+
if (op->src[0]->ne[0] == 64 && op->src[1]->type == GGML_TYPE_F16) {
|
2860
|
+
return true;
|
2861
|
+
}
|
2862
|
+
return ggml_cuda_info().devices[cuda_ctx->device].cc >= CC_VOLTA &&
|
2863
|
+
op->src[1]->type == GGML_TYPE_F16 && op->src[2]->type == GGML_TYPE_F16;
|
2912
2864
|
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
|
2913
2865
|
default:
|
2914
2866
|
return false;
|
@@ -2917,6 +2869,20 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
|
|
2917
2869
|
GGML_UNUSED(backend);
|
2918
2870
|
}
|
2919
2871
|
|
2872
|
+
GGML_CALL static bool ggml_backend_cuda_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
|
2873
|
+
if (ggml_backend_buft_is_cuda_split(buft)) {
|
2874
|
+
return true;
|
2875
|
+
}
|
2876
|
+
|
2877
|
+
if (ggml_backend_buft_is_cuda(buft)) {
|
2878
|
+
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
|
2879
|
+
ggml_backend_cuda_buffer_type_context * buft_ctx = (ggml_backend_cuda_buffer_type_context *)buft->context;
|
2880
|
+
return buft_ctx->device == cuda_ctx->device;
|
2881
|
+
}
|
2882
|
+
|
2883
|
+
return false;
|
2884
|
+
}
|
2885
|
+
|
2920
2886
|
GGML_CALL static bool ggml_backend_cuda_offload_op(ggml_backend_t backend, const ggml_tensor * op) {
|
2921
2887
|
const int min_batch_size = 32;
|
2922
2888
|
|
@@ -2989,9 +2955,11 @@ static ggml_backend_i ggml_backend_cuda_interface = {
|
|
2989
2955
|
/* .synchronize = */ ggml_backend_cuda_synchronize,
|
2990
2956
|
/* .graph_plan_create = */ NULL,
|
2991
2957
|
/* .graph_plan_free = */ NULL,
|
2958
|
+
/* .graph_plan_update = */ NULL,
|
2992
2959
|
/* .graph_plan_compute = */ NULL,
|
2993
2960
|
/* .graph_compute = */ ggml_backend_cuda_graph_compute,
|
2994
2961
|
/* .supports_op = */ ggml_backend_cuda_supports_op,
|
2962
|
+
/* .supports_buft = */ ggml_backend_cuda_supports_buft,
|
2995
2963
|
/* .offload_op = */ ggml_backend_cuda_offload_op,
|
2996
2964
|
/* .event_new = */ ggml_backend_cuda_event_new,
|
2997
2965
|
/* .event_free = */ ggml_backend_cuda_event_free,
|
@@ -22,6 +22,7 @@
|
|
22
22
|
#include "shaderop_mul_mat_q4_1.h"
|
23
23
|
#include "shaderop_mul_mat_q6_k.h"
|
24
24
|
#include "shaderop_mul_mat_mat_f32.h"
|
25
|
+
#include "shaderop_getrows_f32.h"
|
25
26
|
#include "shaderop_getrows_f16.h"
|
26
27
|
#include "shaderop_getrows_q4_0.h"
|
27
28
|
#include "shaderop_getrows_q4_1.h"
|
@@ -1146,6 +1147,14 @@ static void ggml_vk_get_rows(
|
|
1146
1147
|
seq.record<kp::OpAlgoDispatch>(s_algo);
|
1147
1148
|
}
|
1148
1149
|
|
1150
|
+
template <typename... Args>
|
1151
|
+
static void ggml_vk_get_rows_f32(Args&&... args) {
|
1152
|
+
const static auto spirv = getSpirvShader(kp::shader_data::op_getrows_f32_comp_spv,
|
1153
|
+
kp::shader_data::op_getrows_f32_comp_spv_len);
|
1154
|
+
|
1155
|
+
ggml_vk_get_rows(spirv, "f32", sizeof(float), 0, std::forward<Args>(args)...);
|
1156
|
+
}
|
1157
|
+
|
1149
1158
|
template <typename... Args>
|
1150
1159
|
static void ggml_vk_get_rows_f16(Args&&... args) {
|
1151
1160
|
const static auto spirv = getSpirvShader(kp::shader_data::op_getrows_f16_comp_spv,
|
@@ -1183,7 +1192,7 @@ static void ggml_vk_rope(
|
|
1183
1192
|
const std::shared_ptr<kp::Tensor>& inB,
|
1184
1193
|
const std::shared_ptr<kp::Tensor>& out,
|
1185
1194
|
uint32_t inAOff, uint32_t inBOff, uint32_t outOff,
|
1186
|
-
ggml_type src0t, int32_t n_dims, int32_t mode, int32_t
|
1195
|
+
ggml_type src0t, int32_t n_dims, int32_t mode, int32_t n_ctx_orig,
|
1187
1196
|
float freq_base, float freq_scale, float ext_factor, float attn_factor, float beta_fast, float beta_slow,
|
1188
1197
|
int32_t ne01, int32_t ne02, int32_t ne03,
|
1189
1198
|
uint32_t nb00, uint32_t nb01, uint32_t nb02, uint32_t nb03,
|
@@ -1212,14 +1221,14 @@ static void ggml_vk_rope(
|
|
1212
1221
|
|
1213
1222
|
struct PushConstants {
|
1214
1223
|
uint32_t inAOff, inBOff, outOff;
|
1215
|
-
int32_t n_dims, mode,
|
1224
|
+
int32_t n_dims, mode, n_ctx_orig;
|
1216
1225
|
float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
|
1217
1226
|
uint32_t nb00, nb01, nb02, nb03;
|
1218
1227
|
int32_t ne0;
|
1219
1228
|
uint32_t nb0, nb1, nb2, nb3;
|
1220
1229
|
} pushConsts {
|
1221
1230
|
safe_divide(inAOff, type_size), safe_divide(inBOff, 4), safe_divide(outOff, type_size),
|
1222
|
-
n_dims, mode,
|
1231
|
+
n_dims, mode, n_ctx_orig,
|
1223
1232
|
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow,
|
1224
1233
|
nb00, nb01, nb02, nb03,
|
1225
1234
|
ne0,
|
@@ -1331,7 +1340,7 @@ static bool ggml_vk_supports_op(const struct ggml_tensor * op) {
|
|
1331
1340
|
case GGML_UNARY_OP_RELU:
|
1332
1341
|
case GGML_UNARY_OP_GELU:
|
1333
1342
|
case GGML_UNARY_OP_SILU:
|
1334
|
-
return
|
1343
|
+
return ggml_is_contiguous(op->src[0]);
|
1335
1344
|
default:
|
1336
1345
|
;
|
1337
1346
|
}
|
@@ -1371,6 +1380,7 @@ static bool ggml_vk_supports_op(const struct ggml_tensor * op) {
|
|
1371
1380
|
return op->ne[3] == 1;
|
1372
1381
|
case GGML_OP_GET_ROWS:
|
1373
1382
|
switch (op->src[0]->type) {
|
1383
|
+
case GGML_TYPE_F32:
|
1374
1384
|
case GGML_TYPE_F16:
|
1375
1385
|
case GGML_TYPE_Q4_0:
|
1376
1386
|
case GGML_TYPE_Q4_1:
|
@@ -1661,7 +1671,9 @@ static void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml
|
|
1661
1671
|
} break;
|
1662
1672
|
case GGML_OP_GET_ROWS:
|
1663
1673
|
{
|
1664
|
-
if (src0t ==
|
1674
|
+
if (src0t == GGML_TYPE_F32) {
|
1675
|
+
ggml_vk_get_rows_f32(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, nb01, nb1, ggml_nelements(src1));
|
1676
|
+
} else if (src0t == GGML_TYPE_F16) {
|
1665
1677
|
ggml_vk_get_rows_f16(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, nb01, nb1, ggml_nelements(src1));
|
1666
1678
|
} else if (src0t == GGML_TYPE_Q4_0) {
|
1667
1679
|
ggml_vk_get_rows_q4_0(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, nb01, nb1, ggml_nelements(src1));
|
@@ -1680,13 +1692,16 @@ static void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml
|
|
1680
1692
|
#pragma message(" https://github.com/ggerganov/llama.cpp/pull/7225")
|
1681
1693
|
GGML_ASSERT(dst->src[2] == nullptr && "phi3 frequency factors not implemented yet");
|
1682
1694
|
|
1695
|
+
#pragma message("TODO: update rope NORM mode to match NEOX mode")
|
1696
|
+
#pragma message(" https://github.com/ggerganov/llama.cpp/pull/7634")
|
1697
|
+
|
1683
1698
|
GGML_ASSERT(ne10 == ne02);
|
1684
1699
|
GGML_ASSERT(src0t == dstt);
|
1685
1700
|
// const int n_past = ((int32_t *) dst->op_params)[0];
|
1686
1701
|
const int n_dims = ((int32_t *) dst->op_params)[1];
|
1687
1702
|
const int mode = ((int32_t *) dst->op_params)[2];
|
1688
1703
|
// skip 3, n_ctx used in GLM RoPE, unimplemented in Vulkan
|
1689
|
-
const int
|
1704
|
+
const int n_ctx_orig = ((int32_t *) dst->op_params)[4];
|
1690
1705
|
|
1691
1706
|
float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
|
1692
1707
|
memcpy(&freq_base, (int32_t *) dst->op_params + 5, sizeof(float));
|
@@ -1696,7 +1711,7 @@ static void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml
|
|
1696
1711
|
memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float));
|
1697
1712
|
memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float));
|
1698
1713
|
ggml_vk_rope(
|
1699
|
-
seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, src0t, n_dims, mode,
|
1714
|
+
seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, src0t, n_dims, mode, n_ctx_orig,
|
1700
1715
|
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow,
|
1701
1716
|
ne01, ne02, ne03, nb00, nb01, nb02, nb03, ne0, nb0, nb1, nb2, nb3
|
1702
1717
|
);
|
@@ -1887,18 +1902,12 @@ static size_t ggml_backend_vk_buffer_type_get_max_size(ggml_backend_buffer_type_
|
|
1887
1902
|
return ctx->max_alloc;
|
1888
1903
|
}
|
1889
1904
|
|
1890
|
-
static bool ggml_backend_kompute_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
|
1891
|
-
GGML_UNUSED(buft);
|
1892
|
-
return ggml_backend_is_kompute(backend);
|
1893
|
-
}
|
1894
|
-
|
1895
1905
|
static ggml_backend_buffer_type_i ggml_backend_kompute_buffer_type_interface = {
|
1896
1906
|
/* .get_name = */ ggml_backend_kompute_buffer_type_get_name,
|
1897
1907
|
/* .alloc_buffer = */ ggml_backend_kompute_buffer_type_alloc_buffer,
|
1898
1908
|
/* .get_alignment = */ ggml_backend_kompute_buffer_type_get_alignment,
|
1899
1909
|
/* .get_max_size = */ ggml_backend_vk_buffer_type_get_max_size,
|
1900
1910
|
/* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
|
1901
|
-
/* .supports_backend = */ ggml_backend_kompute_buffer_type_supports_backend,
|
1902
1911
|
/* .is_host = */ NULL,
|
1903
1912
|
};
|
1904
1913
|
|
@@ -1958,6 +1967,11 @@ static bool ggml_backend_kompute_supports_op(ggml_backend_t backend, const struc
|
|
1958
1967
|
return ggml_vk_supports_op(op);
|
1959
1968
|
}
|
1960
1969
|
|
1970
|
+
static bool ggml_backend_kompute_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
|
1971
|
+
GGML_UNUSED(backend);
|
1972
|
+
return buft->iface.get_name == ggml_backend_kompute_buffer_type_get_name;
|
1973
|
+
}
|
1974
|
+
|
1961
1975
|
static struct ggml_backend_i kompute_backend_i = {
|
1962
1976
|
/* .get_name = */ ggml_backend_kompute_name,
|
1963
1977
|
/* .free = */ ggml_backend_kompute_free,
|
@@ -1968,9 +1982,11 @@ static struct ggml_backend_i kompute_backend_i = {
|
|
1968
1982
|
/* .synchronize = */ NULL,
|
1969
1983
|
/* .graph_plan_create = */ NULL,
|
1970
1984
|
/* .graph_plan_free = */ NULL,
|
1985
|
+
/* .graph_plan_update = */ NULL,
|
1971
1986
|
/* .graph_plan_compute = */ NULL,
|
1972
1987
|
/* .graph_compute = */ ggml_backend_kompute_graph_compute,
|
1973
1988
|
/* .supports_op = */ ggml_backend_kompute_supports_op,
|
1989
|
+
/* .supports_buft = */ ggml_backend_kompute_supports_buft,
|
1974
1990
|
/* .offload_op = */ NULL,
|
1975
1991
|
/* .event_new = */ NULL,
|
1976
1992
|
/* .event_free = */ NULL,
|
@@ -1,7 +1,7 @@
|
|
1
1
|
// An interface allowing to compute ggml_cgraph with Metal
|
2
2
|
//
|
3
3
|
// This is a fully functional interface that extends ggml with GPU support for Apple devices.
|
4
|
-
// A similar interface can be created for other GPU backends (e.g. Vulkan, CUDA,
|
4
|
+
// A similar interface can be created for other GPU backends (e.g. Vulkan, CUDA, etc.)
|
5
5
|
//
|
6
6
|
// How it works?
|
7
7
|
//
|