llama_cpp 0.16.1 → 0.16.2
- checksums.yaml +4 -4
- data/CHANGELOG.md +7 -0
- data/ext/llama_cpp/extconf.rb +1 -0
- data/ext/llama_cpp/llama_cpp.cpp +12 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +2 -0
- data/vendor/tmp/llama.cpp/Makefile +10 -2
- data/vendor/tmp/llama.cpp/ggml-backend.c +14 -3
- data/vendor/tmp/llama.cpp/ggml-backend.h +3 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/mmq.cu +10 -10
- data/vendor/tmp/llama.cpp/ggml-cuda/mmvq.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/unary.cu +28 -0
- data/vendor/tmp/llama.cpp/ggml-impl.h +1 -1
- data/vendor/tmp/llama.cpp/ggml-metal.m +6 -0
- data/vendor/tmp/llama.cpp/ggml-quants.c +982 -368
- data/vendor/tmp/llama.cpp/ggml-rpc.cpp +8 -3
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +2124 -13202
- data/vendor/tmp/llama.cpp/ggml-sycl.h +1 -10
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +27564 -23876
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +278 -366
- data/vendor/tmp/llama.cpp/ggml.c +67 -150
- data/vendor/tmp/llama.cpp/ggml.h +6 -0
- data/vendor/tmp/llama.cpp/llama.cpp +530 -237
- data/vendor/tmp/llama.cpp/llama.h +5 -1
- data/vendor/tmp/llama.cpp/sgemm.cpp +2 -0
- data/vendor/tmp/llama.cpp/unicode-data.cpp +851 -801
- data/vendor/tmp/llama.cpp/unicode.cpp +33 -19
- data/vendor/tmp/llama.cpp/unicode.h +1 -1
- metadata +2 -2
data/vendor/tmp/llama.cpp/ggml-vulkan.cpp

@@ -8,6 +8,7 @@
 
 #include <algorithm>
 #include <cmath>
+#include <iomanip>
 #include <iostream>
 #include <tuple>
 #include <vector>
@@ -57,6 +58,12 @@ static_assert(K_QUANTS_PER_ITERATION == 1 || K_QUANTS_PER_ITERATION == 2, "K_QUA
     } \
 } while (0)
 
+#ifdef GGML_VULKAN_DEBUG
+#define VK_LOG_DEBUG(msg) std::cerr << msg << std::endl
+#else
+#define VK_LOG_DEBUG(msg) ((void) 0)
+#endif // GGML_VULKAN_DEBUG
+
 struct ggml_backend_vk_context;
 
 struct vk_queue {
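The hunk above replaces the per-call-site `#ifdef GGML_VULKAN_DEBUG` blocks used throughout the file with a stream-style `VK_LOG_DEBUG` macro that compiles to a no-op when the define is absent. A minimal standalone sketch of that pattern, with illustrative names only (`DEMO_DEBUG` and `LOG_DEBUG` are not part of the diff):

```cpp
#include <iostream>

// Toggle at compile time, e.g. with -DDEMO_DEBUG
#ifdef DEMO_DEBUG
#define LOG_DEBUG(msg) std::cerr << msg << std::endl
#else
#define LOG_DEBUG(msg) ((void) 0)   // expands to a harmless no-op statement
#endif

int main() {
    int device_count = 2;
    // The whole << chain becomes the macro argument, so a call site stays a
    // single line instead of a three-line #ifdef/#endif block.
    LOG_DEBUG("found " << device_count << " devices");
    return 0;
}
```

Because the `<<` chain is swallowed whole by the macro argument, call sites cost nothing in release builds while keeping the logging one-liners shown throughout the rest of this diff.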
@@ -159,9 +166,7 @@ struct vk_device {
     std::vector<vk_pipeline_ref> pipelines;
 
     ~vk_device() {
-#ifdef GGML_VULKAN_DEBUG
-        std::cerr << "destroy device " << name << std::endl;
-#endif
+        VK_LOG_DEBUG("destroy device " << name);
         device.destroyCommandPool(compute_queue.pool);
         if (!single_queue) {
             device.destroyCommandPool(transfer_queue.pool);
@@ -196,9 +201,7 @@ struct vk_buffer_struct {
         if (size == 0) {
             return;
         }
-#ifdef GGML_VULKAN_DEBUG
-        std::cerr << "~vk_buffer_struct(" << buffer << ", " << size << ")" << std::endl;
-#endif
+        VK_LOG_DEBUG("~vk_buffer_struct(" << buffer << ", " << size << ")");
 
         device->device.freeMemory(device_memory);
         device->device.destroyBuffer(buffer);
@@ -355,6 +358,49 @@ struct ggml_vk_garbage_collector {
     std::vector<vk_context> contexts;
 };
 
+#if defined(GGML_VULKAN_MEMORY_DEBUG) || defined(GGML_VULKAN_DEBUG)
+#include <mutex>
+
+#define VK_LOG_MEMORY(msg) std::cerr << "ggml_vulkan memory: " << msg << std::endl
+
+static std::string format_size(size_t size) {
+    const size_t kib = 1024;
+    const size_t mib = kib * 1024;
+    const size_t gib = mib * 1024;
+
+    std::ostringstream oss;
+    oss << std::fixed << std::setprecision(2);
+
+    if (size >= gib) {
+        oss << static_cast<double>(size) / gib << " GiB";
+    } else if (size >= mib) {
+        oss << static_cast<double>(size) / mib << " MiB";
+    } else if (size >= kib) {
+        oss << static_cast<double>(size) / kib << " KiB";
+    } else {
+        oss << size << " B";
+    }
+
+    return oss.str();
+}
+
+static std::mutex log_mutex;
+
+class vk_memory_logger {
+public:
+    vk_memory_logger(): total_device(0), total_host(0) {}
+    void log_allocation(vk_buffer_ref buf_ref, size_t size);
+    void log_deallocation(vk_buffer_ref buf_ref);
+
+private:
+    std::map<vk::Buffer, size_t> allocations; // Track allocations
+    size_t total_device;
+    size_t total_host;
+};
+#else
+#define VK_LOG_MEMORY(msg) ((void) 0)
+#endif // GGML_VULKAN_MEMORY_DEBUG
+
 struct ggml_backend_vk_context {
     std::string name;
 
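`format_size` above renders byte counts with two decimals and binary prefixes for the memory-log lines. For reference, a self-contained sketch of the same logic; the `main` function and its sample values are illustrative only:

```cpp
#include <cstddef>
#include <iomanip>
#include <iostream>
#include <sstream>
#include <string>

// Mirrors the helper introduced in the diff: two-decimal binary units.
static std::string format_size(size_t size) {
    const size_t kib = 1024;
    const size_t mib = kib * 1024;
    const size_t gib = mib * 1024;

    std::ostringstream oss;
    oss << std::fixed << std::setprecision(2);
    if (size >= gib)      oss << static_cast<double>(size) / gib << " GiB";
    else if (size >= mib) oss << static_cast<double>(size) / mib << " MiB";
    else if (size >= kib) oss << static_cast<double>(size) / kib << " KiB";
    else                  oss << size << " B";
    return oss.str();
}

int main() {
    std::cout << format_size(512)       << "\n";  // prints: 512 B
    std::cout << format_size(1536)      << "\n";  // prints: 1.50 KiB
    std::cout << format_size(268435456) << "\n";  // prints: 256.00 MiB
    return 0;
}
```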
@@ -379,8 +425,45 @@ struct ggml_backend_vk_context {
     bool initialized;
 
     size_t idx;
+
+#ifdef GGML_VULKAN_MEMORY_DEBUG
+    vk_memory_logger memory_logger;
+#endif
 };
 
+#ifdef GGML_VULKAN_MEMORY_DEBUG
+void vk_memory_logger::log_allocation(vk_buffer_ref buf_ref, size_t size) {
+    std::lock_guard<std::mutex> guard(log_mutex);
+    vk_buffer buf = buf_ref.lock();
+    const bool device = bool(buf->memory_property_flags & vk::MemoryPropertyFlagBits::eDeviceLocal);
+    const std::string type = device ? "device" : "host";
+    allocations[buf->buffer] = size;
+    total_device += device ? size : 0;
+    total_host += device ? 0 : size;
+    VK_LOG_MEMORY("VULKAN" << buf->ctx->idx << ": +" << format_size(size) << " " << type << " at " << buf->buffer << ". Total device: " << format_size(total_device) << ", total host: " << format_size(total_host));
+}
+
+void vk_memory_logger::log_deallocation(vk_buffer_ref buf_ref) {
+    if (buf_ref.expired() || buf_ref.lock()->size == 0) {
+        return;
+    }
+
+    std::lock_guard<std::mutex> guard(log_mutex);
+    vk_buffer buf = buf_ref.lock();
+    const bool device = bool(buf->memory_property_flags & vk::MemoryPropertyFlagBits::eDeviceLocal);
+    std::string type = device ? "device" : "host";
+    auto it = allocations.find(buf->buffer);
+    total_device -= device ? it->second : 0;
+    total_host -= device ? 0 : it->second;
+    if (it != allocations.end()) {
+        VK_LOG_MEMORY("VULKAN" << buf->ctx->idx << ": -" << format_size(it->second) << " " << type << " at " << buf->buffer << ". Total device: " << format_size(total_device) << ", total host: " << format_size(total_host));
+        allocations.erase(it);
+    } else {
+        VK_LOG_MEMORY("ERROR VULKAN" << buf->ctx->idx << ": Attempted to deallocate unknown " << type << " memory at " << buf->buffer);
+    }
+}
+#endif // GGML_VULKAN_MEMORY_DEBUG
+
 struct vk_instance_t {
     vk::Instance instance;
 
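The logger definitions above guard a `std::map` of live buffers with a global `std::mutex` and take a weak reference (`vk_buffer_ref`), so logging never extends a buffer's lifetime. A reduced sketch of that bookkeeping pattern using plain `std::shared_ptr`/`std::weak_ptr`; all names here are illustrative, not the gem's API:

```cpp
#include <cstddef>
#include <iostream>
#include <map>
#include <memory>
#include <mutex>

struct buffer { size_t size = 0; };
using buffer_ref = std::weak_ptr<buffer>;

class memory_logger {
public:
    void log_allocation(buffer_ref ref, size_t size) {
        std::lock_guard<std::mutex> guard(mutex_);
        auto buf = ref.lock();                 // weak_ptr -> shared_ptr, may be empty
        if (!buf) return;
        allocations_[buf.get()] = size;
        total_ += size;
        std::cerr << "+" << size << " bytes, total " << total_ << "\n";
    }
    void log_deallocation(buffer_ref ref) {
        if (ref.expired()) return;             // buffer already gone, nothing to record
        std::lock_guard<std::mutex> guard(mutex_);
        auto buf = ref.lock();
        auto it = allocations_.find(buf.get());
        if (it == allocations_.end()) {
            std::cerr << "unknown buffer freed\n";
            return;
        }
        total_ -= it->second;
        std::cerr << "-" << it->second << " bytes, total " << total_ << "\n";
        allocations_.erase(it);
    }
private:
    std::mutex mutex_;
    std::map<const buffer *, size_t> allocations_;  // live buffer -> size
    size_t total_ = 0;
};

int main() {
    memory_logger logger;
    auto buf = std::make_shared<buffer>();
    logger.log_allocation(buf, 4096);
    logger.log_deallocation(buf);
    return 0;
}
```

One difference worth noting: this sketch looks up the map entry before adjusting its total, whereas the code in the diff subtracts `it->second` before checking whether `it` is valid.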
@@ -393,15 +476,11 @@ struct vk_instance_t {
 };
 
 static std::shared_ptr<vk_device> ggml_vk_get_device(size_t idx) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_get_device(" << idx << ")" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_get_device(" << idx << ")");
     static std::weak_ptr<vk_device> devices[GGML_VK_MAX_DEVICES];
 
     if (devices[idx].expired()) {
-#ifdef GGML_VULKAN_DEBUG
-        std::cerr << "Initializing new vk_device" << std::endl;
-#endif
+        VK_LOG_DEBUG("Initializing new vk_device");
         std::shared_ptr<vk_device> device = std::make_shared<vk_device>();
         device->initialized = false;
         devices[idx] = device;
@@ -428,9 +507,7 @@ static vk_instance_t vk_instance;
 GGML_CALL static void ggml_backend_vk_free(ggml_backend_t backend);
 
 static void ggml_vk_create_pipeline(ggml_backend_vk_context * ctx, vk_pipeline& pipeline, const std::string& name, size_t spv_size, const void* spv_data, const std::string& entrypoint, uint32_t parameter_count, uint32_t push_constant_size, std::array<uint32_t, 3> wg_denoms, std::vector<uint32_t>&& specialization_constants, uint32_t align) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_create_pipeline(" << name << ", " << entrypoint << ", " << parameter_count << ", " << push_constant_size << ", (" << wg_denoms[0] << "," << wg_denoms[1] << "," << wg_denoms[2] << "), specialization_constants, " << align << ")" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_create_pipeline(" << name << ", " << entrypoint << ", " << parameter_count << ", " << push_constant_size << ", (" << wg_denoms[0] << "," << wg_denoms[1] << "," << wg_denoms[2] << "), specialization_constants, " << align << ")");
     GGML_ASSERT(parameter_count > 0);
     GGML_ASSERT(wg_denoms[0] > 0 && wg_denoms[1] > 0 && wg_denoms[2] > 0); // NOLINT
 
@@ -531,9 +608,7 @@ static void ggml_vk_create_pipeline(ggml_backend_vk_context * ctx, vk_pipeline&
 }
 
 static void ggml_vk_destroy_pipeline(vk::Device& device, vk_pipeline& pipeline) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_pipeline_destroy_pipeline(" << pipeline->name << ")" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_pipeline_destroy_pipeline(" << pipeline->name << ")");
     for (auto& pool : pipeline->descriptor_pools) {
         device.destroyDescriptorPool(pool);
     }
@@ -551,9 +626,7 @@ static void ggml_vk_destroy_pipeline(vk::Device& device, vk_pipeline& pipeline)
 }
 
 static void ggml_pipeline_allocate_descriptor_sets(ggml_backend_vk_context * ctx, vk_pipeline& pipeline, uint32_t n) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_pipeline_allocate_descriptor_sets(" << pipeline->name << ", " << n << ")" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_pipeline_allocate_descriptor_sets(" << pipeline->name << ", " << n << ")");
     if (pipeline->descriptor_sets.size() >= pipeline->descriptor_set_idx + n) {
         // Enough descriptors are available
         return;
@@ -583,16 +656,12 @@ static void ggml_pipeline_allocate_descriptor_sets(ggml_backend_vk_context * ctx
 }
 
 static void ggml_pipeline_cleanup(vk_pipeline& pipeline) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_pipeline_cleanup(" << pipeline->name << ")" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_pipeline_cleanup(" << pipeline->name << ")");
     pipeline->descriptor_set_idx = 0;
 }
 
 static vk::CommandBuffer ggml_vk_create_cmd_buffer(ggml_backend_vk_context * ctx, vk_queue& q) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_create_cmd_buffer()" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_create_cmd_buffer()");
     if (q.cmd_buffers.size() > q.cmd_buffer_idx) {
         // Reuse command buffer
         return q.cmd_buffers[q.cmd_buffer_idx++];
@@ -612,9 +681,7 @@ static vk::CommandBuffer ggml_vk_create_cmd_buffer(ggml_backend_vk_context * ctx
 }
 
 static vk_submission ggml_vk_create_submission(ggml_backend_vk_context * ctx, vk_queue& q, std::vector<vk_semaphore> wait_semaphores, std::vector<vk_semaphore> signal_semaphores) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_create_submission()" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_create_submission()");
     vk_submission s;
     s.buffer = ggml_vk_create_cmd_buffer(ctx, q);
     s.wait_semaphores = std::move(wait_semaphores);
@@ -623,9 +690,7 @@ static vk_submission ggml_vk_create_submission(ggml_backend_vk_context * ctx, vk
 }
 
 static void ggml_vk_submit(vk_context * ctx, vk::Fence fence) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_submit(" << ctx->seqs.size() << ", " << fence << ")" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_submit(" << ctx->seqs.size() << ", " << fence << ")");
     if (ctx->seqs.empty()) {
         return;
     }
@@ -699,9 +764,7 @@ static void ggml_vk_submit(vk_context * ctx, vk::Fence fence) {
 }
 
 static uint32_t ggml_vk_find_queue_family_index(std::vector<vk::QueueFamilyProperties>& queue_family_props, const vk::QueueFlags& required, const vk::QueueFlags& avoid, int32_t compute_index, uint32_t min_num_queues) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_find_queue_family_index()" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_find_queue_family_index()");
     const uint32_t qfsize = queue_family_props.size();
 
     // Try with avoid preferences first
@@ -747,9 +810,7 @@ static uint32_t ggml_vk_find_queue_family_index(std::vector<vk::QueueFamilyPrope
 }
 
 static void ggml_vk_create_queue(ggml_backend_vk_context * ctx, vk_queue& q, uint32_t queue_family_index, uint32_t queue_index, vk::PipelineStageFlags&& stage_flags) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_create_queue()" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_create_queue()");
     q.queue_family_index = queue_family_index;
 
     vk::CommandPoolCreateInfo command_pool_create_info_compute(vk::CommandPoolCreateFlags(VK_COMMAND_POOL_CREATE_TRANSIENT_BIT), queue_family_index);
@@ -763,9 +824,7 @@ static void ggml_vk_create_queue(ggml_backend_vk_context * ctx, vk_queue& q, uin
 }
 
 static vk_context * ggml_vk_create_context(ggml_backend_vk_context * ctx, vk_queue& q) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_create_context()" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_create_context()");
     ctx->gc.contexts.emplace_back();
     vk_context * result = &ctx->gc.contexts[ctx->gc.contexts.size() - 1];
     memset((void *) result, 0, sizeof(vk_context));
@@ -775,9 +834,7 @@ static vk_context * ggml_vk_create_context(ggml_backend_vk_context * ctx, vk_que
 }
 
 static vk_semaphore * ggml_vk_create_binary_semaphore(ggml_backend_vk_context * ctx) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_create_timeline_semaphore()" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_create_timeline_semaphore()");
     vk::SemaphoreTypeCreateInfo tci{ vk::SemaphoreType::eBinary, 0 };
     vk::SemaphoreCreateInfo ci{};
     ci.setPNext(&tci);
@@ -787,9 +844,7 @@ static vk_semaphore * ggml_vk_create_binary_semaphore(ggml_backend_vk_context *
 }
 
 static vk_semaphore * ggml_vk_create_timeline_semaphore(ggml_backend_vk_context * ctx) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_create_timeline_semaphore()" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_create_timeline_semaphore()");
     if (ctx->semaphore_idx >= ctx->gc.tl_semaphores.size()) {
         vk::SemaphoreTypeCreateInfo tci{ vk::SemaphoreType::eTimeline, 0 };
         vk::SemaphoreCreateInfo ci{};
@@ -808,9 +863,7 @@ static vk::Event ggml_vk_create_event(ggml_backend_vk_context * ctx) {
 }
 
 static void ggml_vk_queue_cleanup(ggml_backend_vk_context * ctx, vk_queue& q) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_queue_cleanup()" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_queue_cleanup()");
     // Requires command buffers to be done
 
     ctx->device->device.resetCommandPool(q.pool);
@@ -830,9 +883,7 @@ static uint32_t find_properties(const vk::PhysicalDeviceMemoryProperties* mem_pr
 }
 
 static vk_buffer ggml_vk_create_buffer(ggml_backend_vk_context * ctx, size_t size, vk::MemoryPropertyFlags req_flags, vk::MemoryPropertyFlags fallback_flags = vk::MemoryPropertyFlags(0)) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_create_buffer(device " << ctx->idx << ", " << size << ", " << to_string(req_flags) << ", " << to_string(fallback_flags) << ")" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_create_buffer(device " << ctx->idx << ", " << size << ", " << to_string(req_flags) << ", " << to_string(fallback_flags) << ")");
     vk_buffer buf = std::make_shared<vk_buffer_struct>();
 
     if (size == 0) {
@@ -892,8 +943,8 @@ static vk_buffer ggml_vk_create_buffer(ggml_backend_vk_context * ctx, size_t siz
 
     buf->device = ctx->device;
 
-#ifdef
-
+#ifdef GGML_VULKAN_MEMORY_DEBUG
+    ctx->memory_logger.log_allocation(buf, size);
 #endif
 
     return buf;
@@ -928,6 +979,14 @@ static vk_buffer ggml_vk_create_buffer_device(ggml_backend_vk_context * ctx, siz
 }
 
 static void ggml_vk_destroy_buffer(vk_buffer& buf) {
+    if (buf == nullptr) {
+        return;
+    }
+
+#ifdef GGML_VULKAN_MEMORY_DEBUG
+    buf->ctx->memory_logger.log_deallocation(buf);
+#endif
+
     buf.reset();
 }
 
@@ -936,9 +995,7 @@ static vk_subbuffer ggml_vk_subbuffer(vk_buffer& buf) {
 }
 
 static void ggml_vk_sync_buffers(vk_context * ctx) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_sync_buffers()" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_sync_buffers()");
     const std::vector<vk::MemoryBarrier> mem_barriers{ { { vk::AccessFlagBits::eMemoryRead | vk::AccessFlagBits::eMemoryWrite }, { vk::AccessFlagBits::eMemoryRead | vk::AccessFlagBits::eMemoryWrite } } };
 
     ctx->s->buffer.pipelineBarrier(
@@ -952,9 +1009,7 @@ static void ggml_vk_sync_buffers(vk_context * ctx) {
 }
 
 static void ggml_vk_wait_events(vk_context * ctx, std::vector<vk::Event>&& events) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_wait_events()" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_wait_events()");
     if (events.empty()) {
         return;
     }
@@ -989,9 +1044,7 @@ static bool ggml_vk_build_shader(ggml_type type) {
 }
 
 static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_load_shaders(" << ctx->name << ")" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_load_shaders(" << ctx->name << ")");
 
     const std::shared_ptr<vk_device> device = ctx->device;
 
@@ -1042,12 +1095,12 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) {
     ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K] = std::make_shared<vk_matmul_pipeline_struct>();
 
     if (device->fp16) {
-        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->l, "matmul_f32_l",
-        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->m, "matmul_f32_m",
-        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->s, "matmul_f32_s",
-        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->a_l, "matmul_f32_aligned_l",
-        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->a_m, "matmul_f32_aligned_m",
-        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->a_s, "matmul_f32_aligned_s",
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->l, "matmul_f32_l", matmul_f32_f32_len, matmul_f32_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, 1);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->m, "matmul_f32_m", matmul_f32_f32_len, matmul_f32_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, 1);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->s, "matmul_f32_s", matmul_f32_f32_len, matmul_f32_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, 1);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->a_l, "matmul_f32_aligned_l", matmul_f32_f32_aligned_len, matmul_f32_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, l_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->a_m, "matmul_f32_aligned_m", matmul_f32_f32_aligned_len, matmul_f32_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, m_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->a_s, "matmul_f32_aligned_s", matmul_f32_f32_aligned_len, matmul_f32_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, s_align);
 
         ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_f16->l, "matmul_f32_f16_l", matmul_f32_f16_len, matmul_f32_f16_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, 1);
         ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_f16->m, "matmul_f32_f16_m", matmul_f32_f16_len, matmul_f32_f16_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, 1);
@@ -1140,12 +1193,12 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) {
         ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->a_m, "matmul_q6_k_f32_aligned_m", matmul_q6_k_f32_aligned_len, matmul_q6_k_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
         ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->a_s, "matmul_q6_k_f32_aligned_s", matmul_q6_k_f32_aligned_len, matmul_q6_k_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
 
-        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->l, "matmul_id_f32_l",
-        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->m, "matmul_id_f32_m",
-        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->s, "matmul_id_f32_s",
-        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->a_l, "matmul_id_f32_aligned_l",
-        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->a_m, "matmul_id_f32_aligned_m",
-        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->a_s, "matmul_id_f32_aligned_s",
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->l, "matmul_id_f32_l", matmul_id_f32_f32_len, matmul_id_f32_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_l, 1);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->m, "matmul_id_f32_m", matmul_id_f32_f32_len, matmul_id_f32_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_m, 1);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->s, "matmul_id_f32_s", matmul_id_f32_f32_len, matmul_id_f32_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_s, 1);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->a_l, "matmul_id_f32_aligned_l", matmul_id_f32_f32_aligned_len, matmul_id_f32_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_l, l_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->a_m, "matmul_id_f32_aligned_m", matmul_id_f32_f32_aligned_len, matmul_id_f32_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_m, m_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->a_s, "matmul_id_f32_aligned_s", matmul_id_f32_f32_aligned_len, matmul_id_f32_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_s, s_align);
 
         ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f16->l, "matmul_id_f16_l", matmul_id_f16_len, matmul_id_f16_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_l, 1);
         ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f16->m, "matmul_id_f16_m", matmul_id_f16_len, matmul_id_f16_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_m, 1);
@@ -1231,12 +1284,12 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) {
         ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K]->a_m, "matmul_id_q6_k_f32_aligned_m", matmul_id_q6_k_f32_aligned_len, matmul_id_q6_k_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
         ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K]->a_s, "matmul_id_q6_k_f32_aligned_s", matmul_id_q6_k_f32_aligned_len, matmul_id_q6_k_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
     } else {
-        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->l, "matmul_f32_l",
-        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->m, "matmul_f32_m",
-        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->s, "matmul_f32_s",
-        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->a_l, "matmul_f32_aligned_l",
-        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->a_m, "matmul_f32_aligned_m",
-        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->a_s, "matmul_f32_aligned_s",
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->l, "matmul_f32_l", matmul_f32_f32_fp32_len, matmul_f32_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, 1);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->m, "matmul_f32_m", matmul_f32_f32_fp32_len, matmul_f32_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, 1);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->s, "matmul_f32_s", matmul_f32_f32_fp32_len, matmul_f32_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, 1);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->a_l, "matmul_f32_aligned_l", matmul_f32_f32_aligned_fp32_len, matmul_f32_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, l_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->a_m, "matmul_f32_aligned_m", matmul_f32_f32_aligned_fp32_len, matmul_f32_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, m_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->a_s, "matmul_f32_aligned_s", matmul_f32_f32_aligned_fp32_len, matmul_f32_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, s_align);
 
         ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_f16->l, "matmul_f32_f16_l", matmul_f32_f16_fp32_len, matmul_f32_f16_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, 1);
         ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_f16->m, "matmul_f32_f16_m", matmul_f32_f16_fp32_len, matmul_f32_f16_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, 1);
@@ -1329,12 +1382,12 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) {
         ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->a_m, "matmul_q6_k_f32_aligned_m", matmul_q6_k_f32_aligned_fp32_len, matmul_q6_k_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
         ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->a_s, "matmul_q6_k_f32_aligned_s", matmul_q6_k_f32_aligned_fp32_len, matmul_q6_k_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
 
-        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->l, "matmul_id_f32_l",
-        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->m, "matmul_id_f32_m",
-        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->s, "matmul_id_f32_s",
-        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->a_l, "matmul_id_f32_aligned_l",
-        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->a_m, "matmul_id_f32_aligned_m",
-        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->a_s, "matmul_id_f32_aligned_s",
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->l, "matmul_id_f32_l", matmul_id_f32_f32_fp32_len, matmul_id_f32_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_l, 1);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->m, "matmul_id_f32_m", matmul_id_f32_f32_fp32_len, matmul_id_f32_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_m, 1);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->s, "matmul_id_f32_s", matmul_id_f32_f32_fp32_len, matmul_id_f32_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_s, 1);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->a_l, "matmul_id_f32_aligned_l", matmul_id_f32_f32_aligned_fp32_len, matmul_id_f32_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_l, l_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->a_m, "matmul_id_f32_aligned_m", matmul_id_f32_f32_aligned_fp32_len, matmul_id_f32_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_m, m_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->a_s, "matmul_id_f32_aligned_s", matmul_id_f32_f32_aligned_fp32_len, matmul_id_f32_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_s, s_align);
 
         ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f16->l, "matmul_id_f16_l", matmul_id_f16_fp32_len, matmul_id_f16_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_l, 1);
         ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f16->m, "matmul_id_f16_m", matmul_id_f16_fp32_len, matmul_id_f16_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_m, 1);
@@ -1429,11 +1482,11 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) {
     ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q5_0], "mul_mat_vec_q5_0_f32_f32", mul_mat_vec_q5_0_f32_f32_len, mul_mat_vec_q5_0_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
     ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q5_1], "mul_mat_vec_q5_1_f32_f32", mul_mat_vec_q5_1_f32_f32_len, mul_mat_vec_q5_1_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
     ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q8_0], "mul_mat_vec_q8_0_f32_f32", mul_mat_vec_q8_0_f32_f32_len, mul_mat_vec_q8_0_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
-    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q2_K], "
-    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q3_K], "
-    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q4_K], "
-    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q5_K], "
-    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q6_K], "
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q2_K], "mul_mat_vec_q2_k_f32_f32", mul_mat_vec_q2_k_f32_f32_len, mul_mat_vec_q2_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q3_K], "mul_mat_vec_q3_k_f32_f32", mul_mat_vec_q3_k_f32_f32_len, mul_mat_vec_q3_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q4_K], "mul_mat_vec_q4_k_f32_f32", mul_mat_vec_q4_k_f32_f32_len, mul_mat_vec_q4_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q5_K], "mul_mat_vec_q5_k_f32_f32", mul_mat_vec_q5_k_f32_f32_len, mul_mat_vec_q5_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q6_K], "mul_mat_vec_q6_k_f32_f32", mul_mat_vec_q6_k_f32_f32_len, mul_mat_vec_q6_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
 
     ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_F32 ], "mul_mat_vec_f32_f16_f32", mul_mat_vec_f32_f16_f32_len, mul_mat_vec_f32_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
     ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_F16 ], "mul_mat_vec_f16_f16_f32", mul_mat_vec_f16_f16_f32_len, mul_mat_vec_f16_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
@@ -1442,11 +1495,11 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) {
     ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q5_0], "mul_mat_vec_q5_0_f16_f32", mul_mat_vec_q5_0_f16_f32_len, mul_mat_vec_q5_0_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
     ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q5_1], "mul_mat_vec_q5_1_f16_f32", mul_mat_vec_q5_1_f16_f32_len, mul_mat_vec_q5_1_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
     ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q8_0], "mul_mat_vec_q8_0_f16_f32", mul_mat_vec_q8_0_f16_f32_len, mul_mat_vec_q8_0_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
-    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q2_K], "
-    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q3_K], "
-    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q4_K], "
-    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q5_K], "
-    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q6_K], "
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q2_K], "mul_mat_vec_q2_k_f16_f32", mul_mat_vec_q2_k_f16_f32_len, mul_mat_vec_q2_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q3_K], "mul_mat_vec_q3_k_f16_f32", mul_mat_vec_q3_k_f16_f32_len, mul_mat_vec_q3_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q4_K], "mul_mat_vec_q4_k_f16_f32", mul_mat_vec_q4_k_f16_f32_len, mul_mat_vec_q4_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q5_K], "mul_mat_vec_q5_k_f16_f32", mul_mat_vec_q5_k_f16_f32_len, mul_mat_vec_q5_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q6_K], "mul_mat_vec_q6_k_f16_f32", mul_mat_vec_q6_k_f16_f32_len, mul_mat_vec_q6_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
 
     ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_F32 ], "mul_mat_vec_id_f32_f32", mul_mat_vec_id_f32_f32_len, mul_mat_vec_id_f32_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
     ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_F16 ], "mul_mat_vec_id_f16_f32", mul_mat_vec_id_f16_f32_len, mul_mat_vec_id_f16_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
@@ -1455,11 +1508,11 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) {
    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q5_0], "mul_mat_vec_id_q5_0_f32", mul_mat_vec_id_q5_0_f32_len, mul_mat_vec_id_q5_0_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q5_1], "mul_mat_vec_id_q5_1_f32", mul_mat_vec_id_q5_1_f32_len, mul_mat_vec_id_q5_1_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q8_0], "mul_mat_vec_id_q8_0_f32", mul_mat_vec_id_q8_0_f32_len, mul_mat_vec_id_q8_0_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
-   ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q2_K], "
-   ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q3_K], "
-   ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q4_K], "
-   ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q5_K], "
-   ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q6_K], "
+   ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q2_K], "mul_mat_vec_id_q2_k_f32", mul_mat_vec_id_q2_k_f32_len, mul_mat_vec_id_q2_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
+   ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q3_K], "mul_mat_vec_id_q3_k_f32", mul_mat_vec_id_q3_k_f32_len, mul_mat_vec_id_q3_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
+   ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q4_K], "mul_mat_vec_id_q4_k_f32", mul_mat_vec_id_q4_k_f32_len, mul_mat_vec_id_q4_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
+   ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q5_K], "mul_mat_vec_id_q5_k_f32", mul_mat_vec_id_q5_k_f32_len, mul_mat_vec_id_q5_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
+   ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q6_K], "mul_mat_vec_id_q6_k_f32", mul_mat_vec_id_q6_k_f32_len, mul_mat_vec_id_q6_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
 
    // dequant shaders
    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_F32 ], "f32_to_f16", dequant_f32_len, dequant_f32_data, "main", 2, 5 * sizeof(uint32_t), {256 * 16, 1, 1}, {}, 1);
@@ -1468,11 +1521,11 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) {
    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_Q5_0], "dequant_q5_0", dequant_q5_0_len, dequant_q5_0_data, "main", 2, 5 * sizeof(uint32_t), {256 * 16, 1, 1}, {}, 1);
    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_Q5_1], "dequant_q5_1", dequant_q5_1_len, dequant_q5_1_data, "main", 2, 5 * sizeof(uint32_t), {256 * 16, 1, 1}, {}, 1);
    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_Q8_0], "dequant_q8_0", dequant_q8_0_len, dequant_q8_0_data, "main", 2, 5 * sizeof(uint32_t), {256 * 16, 1, 1}, {}, 1);
-   ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_Q2_K], "
-   ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_Q3_K], "
-   ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_Q4_K], "
-   ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_Q5_K], "
-   ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_Q6_K], "
+   ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_Q2_K], "dequant_q2_k", dequant_q2_k_len, dequant_q2_k_data, "main", 2, 5 * sizeof(uint32_t), {256 * 64, 1, 1}, {}, 1);
+   ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_Q3_K], "dequant_q3_k", dequant_q3_k_len, dequant_q3_k_data, "main", 2, 5 * sizeof(uint32_t), {256 * 64, 1, 1}, {}, 1);
+   ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_Q4_K], "dequant_q4_k", dequant_q4_k_len, dequant_q4_k_data, "main", 2, 5 * sizeof(uint32_t), {256 * 32, 1, 1}, {}, 1);
+   ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_Q5_K], "dequant_q5_k", dequant_q5_k_len, dequant_q5_k_data, "main", 2, 5 * sizeof(uint32_t), {256 * 64, 1, 1}, {}, 1);
+   ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_Q6_K], "dequant_q6_k", dequant_q6_k_len, dequant_q6_k_data, "main", 2, 5 * sizeof(uint32_t), {256 * 64, 1, 1}, {}, 1);
 
    // get_rows
    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[GGML_TYPE_F32 ], "get_rows_f32", get_rows_f32_len, get_rows_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), { 512, 1, 1}, {}, 1);
@@ -1538,9 +1591,7 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) {
 static void ggml_vk_print_gpu_info(size_t idx) {
     GGML_ASSERT(idx < vk_instance.device_indices.size());
     size_t dev_num = vk_instance.device_indices[idx];
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_print_gpu_info(" << dev_num << ")" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_print_gpu_info(" << dev_num << ")");
     GGML_ASSERT(vk_instance.initialized);
 
     std::vector<vk::PhysicalDevice> devices = vk_instance.instance.enumeratePhysicalDevices();
@@ -1617,9 +1668,7 @@ void ggml_vk_instance_init() {
     if (vk_instance_initialized) {
         return;
     }
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_instance_init()" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_instance_init()");
 
     vk::ApplicationInfo app_info{ "ggml-vulkan", 1, nullptr, 0, VK_API_VERSION };
 
@@ -1696,33 +1745,37 @@
 
         // Default to using all dedicated GPUs
         for (size_t i = 0; i < devices.size(); i++) {
-            vk::
-
-
+            vk::PhysicalDeviceProperties2 new_props;
+            vk::PhysicalDeviceDriverProperties new_driver;
+            vk::PhysicalDeviceIDProperties new_id;
+            new_props.pNext = &new_driver;
+            new_driver.pNext = &new_id;
+            devices[i].getProperties2(&new_props);
+
+            if (new_props.properties.deviceType == vk::PhysicalDeviceType::eDiscreteGpu) {
                 // Check if there are two physical devices corresponding to the same GPU
                 auto old_device = std::find_if(
                     vk_instance.device_indices.begin(),
                     vk_instance.device_indices.end(),
-                    [&devices, &
+                    [&devices, &new_id](const size_t k){
+                        vk::PhysicalDeviceProperties2 old_props;
+                        vk::PhysicalDeviceIDProperties old_id;
+                        old_props.pNext = &old_id;
+                        devices[k].getProperties2(&old_props);
+                        return std::equal(std::begin(old_id.deviceUUID), std::end(old_id.deviceUUID), std::begin(new_id.deviceUUID));
+                    }
                 );
                 if (old_device == vk_instance.device_indices.end()) {
                     vk_instance.device_indices.push_back(i);
                 } else {
                     // There can be two physical devices corresponding to the same GPU if there are 2 different drivers
                     // This can cause error when splitting layers aross the devices, need to keep only 1
-#ifdef GGML_VULKAN_DEBUG
-                    std::cerr << "Device " << i << " and device " << *old_device << " have the same device id" << std::endl;
-#endif
+                    VK_LOG_DEBUG("Device " << i << " and device " << *old_device << " have the same deviceUUID");
 
-                    vk::PhysicalDeviceProperties2
+                    vk::PhysicalDeviceProperties2 old_props;
                     vk::PhysicalDeviceDriverProperties old_driver;
-
-                    devices[*old_device].getProperties2(&
-
-                    vk::PhysicalDeviceProperties2 new_prop;
-                    vk::PhysicalDeviceDriverProperties new_driver;
-                    new_prop.pNext = &new_driver;
-                    devices[i].getProperties2(&new_prop);
+                    old_props.pNext = &old_driver;
+                    devices[*old_device].getProperties2(&old_props);
 
                     std::map<vk::DriverId, int> driver_priorities {};
                     int old_priority = std::numeric_limits<int>::max();
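The rewritten loop above reads `VkPhysicalDeviceIDProperties` through the `pNext` chain and treats two physical devices as duplicates when their `deviceUUID`s match, e.g. the same GPU exposed by two different drivers. A minimal Vulkan-Hpp sketch of that query pattern; it assumes a Vulkan 1.1 loader and at least one device, error handling omitted:

```cpp
#include <vulkan/vulkan.hpp>
#include <iomanip>
#include <iostream>

int main() {
    vk::ApplicationInfo app_info{ "uuid-demo", 1, nullptr, 0, VK_API_VERSION_1_1 };
    vk::InstanceCreateInfo create_info{ {}, &app_info };
    vk::Instance instance = vk::createInstance(create_info);

    for (vk::PhysicalDevice dev : instance.enumeratePhysicalDevices()) {
        // Chain the ID-properties struct onto the base properties via pNext,
        // the same way the loop in the diff chains driver and ID properties.
        vk::PhysicalDeviceProperties2 props;
        vk::PhysicalDeviceIDProperties id_props;
        props.pNext = &id_props;
        dev.getProperties2(&props);

        std::cout << props.properties.deviceName << " UUID: ";
        for (uint8_t byte : id_props.deviceUUID) {
            std::cout << std::hex << std::setw(2) << std::setfill('0') << int(byte);
        }
        std::cout << std::dec << "\n";
    }

    instance.destroy();
    return 0;
}
```

Comparing these UUIDs byte for byte, as the diff does with `std::equal`, is what lets the backend collapse duplicate entries before splitting layers across devices.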
@@ -1730,7 +1783,7 @@ void ggml_vk_instance_init() {
 
                     // Check https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkDriverId.html for the list of driver id
                     // Smaller number -> higher priority
-                    switch (
+                    switch (old_props.properties.vendorID) {
                         case VK_VENDOR_ID_AMD:
                             driver_priorities[vk::DriverId::eMesaRadv] = 1;
                             driver_priorities[vk::DriverId::eAmdOpenSource] = 2;
@@ -1760,16 +1813,11 @@
                     vk_instance.device_indices.erase(r, vk_instance.device_indices.end());
                     vk_instance.device_indices.push_back(i);
 
-#ifdef GGML_VULKAN_DEBUG
-                    std::cerr << "Prioritize device " << i << " driver " << new_driver.driverName << " over device " << *old_device << " driver " << old_driver.driverName << std::endl;
-#endif
+                    VK_LOG_DEBUG("Prioritize device " << i << " driver " << new_driver.driverName << " over device " << *old_device << " driver " << old_driver.driverName);
                 }
-#ifdef GGML_VULKAN_DEBUG
                 else {
-
-
+                    VK_LOG_DEBUG("Prioritize device " << *old_device << " driver " << old_driver.driverName << " over device " << i << " driver " << new_driver.driverName << std::endl);
                 }
-#endif
             }
         }
     }
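When two entries do share a UUID, the code above keeps whichever one's driver ranks higher in a per-vendor priority table, smaller number winning. A compact sketch of that selection rule; the table mirrors only the two AMD entries visible in the diff, and `prefer_driver` is an illustrative name rather than anything in ggml-vulkan.cpp:

```cpp
#include <limits>
#include <map>
#include <vulkan/vulkan.hpp>

// Returns true if `candidate` should replace `incumbent` for the same GPU.
static bool prefer_driver(vk::DriverId incumbent, vk::DriverId candidate) {
    // Smaller number -> higher priority, as in the diff's AMD case.
    std::map<vk::DriverId, int> priorities {
        { vk::DriverId::eMesaRadv,      1 },  // from the AMD branch in the diff
        { vk::DriverId::eAmdOpenSource, 2 },
    };

    const int worst = std::numeric_limits<int>::max();
    auto rank = [&](vk::DriverId id) {
        auto it = priorities.find(id);
        return it == priorities.end() ? worst : it->second;  // unknown drivers rank last
    };
    return rank(candidate) < rank(incumbent);
}

int main() {
    // RADV outranks AMDVLK under this table, so the RADV entry would be kept.
    return prefer_driver(vk::DriverId::eAmdOpenSource, vk::DriverId::eMesaRadv) ? 0 : 1;
}
```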
@@ -1792,9 +1840,7 @@
 static void ggml_vk_init(ggml_backend_vk_context * ctx, size_t idx) {
     GGML_ASSERT(idx < vk_instance.device_indices.size());
     size_t dev_num = vk_instance.device_indices[idx];
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_init(" << ctx->name << ", " << dev_num << ")" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_init(" << ctx->name << ", " << dev_num << ")");
     ggml_vk_instance_init();
 
     std::vector<vk::PhysicalDevice> devices = vk_instance.instance.enumeratePhysicalDevices();
@@ -1967,9 +2013,7 @@ static void ggml_vk_init(ggml_backend_vk_context * ctx, size_t idx) {
 }
 
 static vk_pipeline ggml_vk_get_to_fp16(ggml_backend_vk_context * ctx, ggml_type type) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_get_to_fp16()" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_get_to_fp16()");
     switch (type) {
         case GGML_TYPE_F32:
         case GGML_TYPE_Q4_0:
@@ -1991,9 +2035,7 @@ static vk_pipeline ggml_vk_get_to_fp16(ggml_backend_vk_context * ctx, ggml_type
 }
 
 static vk_matmul_pipeline ggml_vk_get_mul_mat_mat_pipeline(ggml_backend_vk_context * ctx, ggml_type src0_type, ggml_type src1_type) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_get_mul_mat_mat_pipeline()" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_get_mul_mat_mat_pipeline()");
     if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F32) {
         return ctx->device->pipeline_matmul_f32;
     }
@@ -2029,9 +2071,7 @@ static vk_matmul_pipeline ggml_vk_get_mul_mat_mat_pipeline(ggml_backend_vk_conte
 }
 
 static vk_pipeline ggml_vk_get_dequantize_mul_mat_vec(ggml_backend_vk_context * ctx, ggml_type a_type, ggml_type b_type) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_get_dequantize_mul_mat_vec()" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_get_dequantize_mul_mat_vec()");
     GGML_ASSERT(b_type == GGML_TYPE_F32 || b_type == GGML_TYPE_F16);
 
     switch (a_type) {
@@ -2056,9 +2096,7 @@ static vk_pipeline ggml_vk_get_dequantize_mul_mat_vec(ggml_backend_vk_context *
 }
 
 static vk_matmul_pipeline ggml_vk_get_mul_mat_mat_id_pipeline(ggml_backend_vk_context * ctx, ggml_type src0_type, ggml_type src1_type) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_get_mul_mat_mat_id_pipeline()" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_get_mul_mat_mat_id_pipeline()");
     if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F32) {
         return ctx->device->pipeline_matmul_id_f32;
     }
@@ -2091,9 +2129,7 @@ static vk_matmul_pipeline ggml_vk_get_mul_mat_mat_id_pipeline(ggml_backend_vk_co
 }
 
 static vk_pipeline ggml_vk_get_dequantize_mul_mat_vec_id(ggml_backend_vk_context * ctx, ggml_type a_type, ggml_type b_type) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_get_dequantize_mul_mat_vec()" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_get_dequantize_mul_mat_vec()");
     GGML_ASSERT(b_type == GGML_TYPE_F32);
 
     switch (a_type) {
@@ -2118,9 +2154,9 @@ static vk_pipeline ggml_vk_get_dequantize_mul_mat_vec_id(ggml_backend_vk_context
 }
 
 static vk_buffer ggml_vk_pool_malloc(ggml_backend_vk_context * ctx, size_t size) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_pool_malloc(" << size << ")" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_pool_malloc(" << size << ")");
+    VK_LOG_MEMORY("ggml_vk_pool_malloc");
+
     int best_i = -1;
     size_t best_size = std::numeric_limits<size_t>::max(); //smallest unused buffer that fits our needs
     int worst_i = -1;
@@ -2148,13 +2184,11 @@ static vk_buffer ggml_vk_pool_malloc(ggml_backend_vk_context * ctx, size_t size)
         ggml_vk_destroy_buffer(b);
     }
 
-    return
+    return ggml_vk_create_buffer_device(ctx, size);
 }
 
 static void ggml_vk_pool_free(ggml_backend_vk_context * ctx, vk_buffer& buffer) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_pool_free(" << buffer->size << ")" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_pool_free(" << buffer->size << ")");
     for (int i = 0; i < MAX_VK_BUFFERS; ++i) {
         vk_buffer& b = ctx->buffer_pool[i];
         if (b == nullptr) {
@@ -2175,6 +2209,8 @@ static vk_buffer ggml_vk_create_buffer_temp(ggml_backend_vk_context * ctx, size_
         }
     }
 
+    VK_LOG_MEMORY("ggml_vk_create_buffer_temp(" << size << ")");
+
     // Otherwise create new buffer
     vk_buffer buf = ggml_vk_pool_malloc(ctx, size);
     ctx->gc.temp_buffers.push_back(buf);
@@ -2183,9 +2219,7 @@ static vk_buffer ggml_vk_create_buffer_temp(ggml_backend_vk_context * ctx, size_
 }
 
 static void * ggml_vk_host_malloc(ggml_backend_vk_context * ctx, size_t size) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_host_malloc(" << size << ")" << std::endl;
-#endif
+    VK_LOG_MEMORY("ggml_vk_host_malloc(" << size << ")");
     vk_buffer buf = ggml_vk_create_buffer(ctx, size,
         vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached,
         vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
@@ -2207,9 +2241,7 @@ static void ggml_vk_host_free(ggml_backend_vk_context * ctx, void* ptr) {
     if (ptr == nullptr) {
         return;
     }
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_host_free(" << ptr << ")" << std::endl;
-#endif
+    VK_LOG_MEMORY("ggml_vk_host_free(" << ptr << ")");
     vk_buffer buf;
     size_t index;
     for (size_t i = 0; i < ctx->pinned_memory.size(); i++) {
@@ -2261,13 +2293,11 @@ static void ggml_vk_dispatch_pipeline(ggml_backend_vk_context * ctx, vk_context
     const uint32_t wg0 = CEIL_DIV(elements[0], pipeline->wg_denoms[0]);
     const uint32_t wg1 = CEIL_DIV(elements[1], pipeline->wg_denoms[1]);
     const uint32_t wg2 = CEIL_DIV(elements[2], pipeline->wg_denoms[2]);
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_dispatch_pipeline(" << pipeline->name << ", {";
+    VK_LOG_DEBUG("ggml_vk_dispatch_pipeline(" << pipeline->name << ", {";
     for (auto& buffer : buffers) {
         std::cerr << "(" << buffer.buffer << ", " << buffer.offset << ", " << buffer.size << "), ";
     }
-    std::cerr << "}, (" << wg0 << "," << wg1 << "," << wg2 << "))"
-#endif
+    std::cerr << "}, (" << wg0 << "," << wg1 << "," << wg2 << "))");
     std::vector<vk::DescriptorBufferInfo> descriptor_buffer_infos;
     std::vector<vk::WriteDescriptorSet> write_descriptor_sets;
     GGML_ASSERT(pipeline->descriptor_set_idx < pipeline->descriptor_sets.size());
@@ -2300,9 +2330,7 @@ static void ggml_vk_end_submission(vk_submission& s, std::vector<vk_semaphore> w
 }
 
 static void ggml_vk_ctx_end(vk_context * ctx) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_ctx_end(" << ctx << ", " << ctx->seqs.size() << ")" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_ctx_end(" << ctx << ", " << ctx->seqs.size() << ")");
     if (ctx->s == nullptr) {
         return;
     }
@@ -2312,9 +2340,7 @@ static void ggml_vk_ctx_end(vk_context * ctx) {
 }
 
 static void ggml_vk_ctx_begin(ggml_backend_vk_context * ctx, vk_context * subctx) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_ctx_begin(" << ctx << ")" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_ctx_begin(" << ctx << ")");
     if (subctx->s != nullptr) {
         ggml_vk_ctx_end(subctx);
     }
@@ -2324,9 +2350,7 @@ static void ggml_vk_ctx_begin(ggml_backend_vk_context * ctx, vk_context * subctx
 }
 
 static size_t ggml_vk_align_size(size_t width, size_t align) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_align_size(" << width << ", " << align << ")" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_align_size(" << width << ", " << align << ")");
     return CEIL_DIV(width, align) * align;
 }
 
@@ -2340,6 +2364,7 @@ static void deferred_memcpy(void * dst, const void * src, size_t size, std::vect
 
 static void ggml_vk_ensure_sync_staging_buffer(ggml_backend_vk_context * ctx, size_t size) {
     if (ctx->sync_staging == nullptr || ctx->sync_staging->size < size) {
+        VK_LOG_MEMORY("ggml_vk_ensure_sync_staging_buffer(" << size << ")");
         ggml_vk_destroy_buffer(ctx->sync_staging);
         ctx->sync_staging = ggml_vk_create_buffer_check(ctx, size,
             vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached,
@@ -2348,9 +2373,7 @@ static void ggml_vk_ensure_sync_staging_buffer(ggml_backend_vk_context * ctx, si
 }
 
 static void ggml_vk_buffer_write_nc_async(ggml_backend_vk_context * ctx, vk_context * subctx, vk_buffer& dst, size_t offset, const ggml_tensor * tensor, bool sync_staging = false) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_buffer_write_nc_async(" << tensor << ")" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_buffer_write_nc_async(" << tensor << ")");
     GGML_ASSERT(!ggml_is_contiguous(tensor));
     // Buffer is already mapped
     if(dst->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible) {
@@ -2455,9 +2478,7 @@ static void ggml_vk_buffer_write_nc_async(ggml_backend_vk_context * ctx, vk_cont
 }
 
 static void ggml_vk_buffer_write_2d_async(ggml_backend_vk_context * ctx, vk_context * subctx, vk_buffer& dst, size_t offset, const void * src, size_t spitch, size_t width, size_t height, bool sync_staging = false) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_buffer_write_2d_async(" << width << ", " << height << ")" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_buffer_write_2d_async(" << width << ", " << height << ")");
     // Make sure ctx owns the buffer
     GGML_ASSERT(dst->ctx == ctx);
 
@@ -2492,9 +2513,7 @@ static void ggml_vk_buffer_write_2d_async(ggml_backend_vk_context * ctx, vk_cont
         subctx->s->buffer.copyBuffer(buf->buffer, dst->buffer, slices);
         return;
     }
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "STAGING" << std::endl;
-#endif
+    VK_LOG_DEBUG("STAGING");
 
     // Staging buffer required
     vk_buffer staging = ctx->staging;
@@ -2529,16 +2548,12 @@ static void ggml_vk_buffer_write_2d_async(ggml_backend_vk_context * ctx, vk_cont
 }
 
 static void ggml_vk_buffer_write_async(ggml_backend_vk_context * ctx, vk_context * subctx, vk_buffer& dst, size_t offset, const void * src, size_t size, bool sync_staging = false) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_buffer_write_async(" << size << ")" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_buffer_write_async(" << size << ")");
     return ggml_vk_buffer_write_2d_async(ctx, subctx, dst, offset, src, size, size, 1, sync_staging);
 }
 
 static void ggml_vk_buffer_write_2d(ggml_backend_vk_context * ctx, vk_buffer& dst, size_t offset, const void * src, size_t spitch, size_t width, size_t height) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_buffer_write_2d(" << width << ", " << height << ")" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_buffer_write_2d(" << width << ", " << height << ")");
     // Buffer is already mapped
     if(dst->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible) {
         GGML_ASSERT(dst->memory_property_flags & vk::MemoryPropertyFlagBits::eHostCoherent);
@@ -2563,16 +2578,12 @@ static void ggml_vk_buffer_write_2d(ggml_backend_vk_context * ctx, vk_buffer& ds
 }
 
 static void ggml_vk_buffer_write(ggml_backend_vk_context * ctx, vk_buffer& dst, size_t offset, const void * src, size_t size) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_buffer_write(" << size << ")" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_buffer_write(" << size << ")");
     ggml_vk_buffer_write_2d(ctx, dst, offset, src, 0, size, 1);
 }
 
 static void ggml_vk_buffer_read_2d_async(ggml_backend_vk_context * ctx, vk_context * subctx, vk_buffer& src, size_t offset, void * dst, size_t spitch, size_t dpitch, size_t width, size_t height, bool sync_staging = false) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_buffer_read_2d_async(offset=" << offset << ", width=" << width << ", height=" << height << ")" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_buffer_read_2d_async(offset=" << offset << ", width=" << width << ", height=" << height << ")");
     GGML_ASSERT(width > 0);
     GGML_ASSERT(height > 0);
     GGML_ASSERT(src != nullptr);
@@ -2606,9 +2617,7 @@ static void ggml_vk_buffer_read_2d_async(ggml_backend_vk_context * ctx, vk_conte
 
         return;
     }
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "STAGING" << std::endl;
-#endif
+    VK_LOG_DEBUG("STAGING");
 
     // Fall back to staging buffer
     vk_buffer staging = ctx->staging;
@@ -2635,9 +2644,7 @@ static void ggml_vk_buffer_read_async(ggml_backend_vk_context * ctx, vk_context
 }
 
 static void ggml_vk_buffer_read(ggml_backend_vk_context * ctx, vk_buffer& src, size_t offset, void * dst, size_t size) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_buffer_read(" << offset << ", " << size << ")" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_buffer_read(" << offset << ", " << size << ")");
     if(src->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible) {
         GGML_ASSERT(src->memory_property_flags & vk::MemoryPropertyFlagBits::eHostCoherent);
 
@@ -2659,9 +2666,7 @@ static void ggml_vk_buffer_read(ggml_backend_vk_context * ctx, vk_buffer& src, s
 }
 
 static void ggml_vk_buffer_copy_async(vk_context * ctx, vk_buffer& dst, size_t dst_offset, vk_buffer& src, size_t src_offset, size_t size) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_buffer_copy_async(" << size << ")" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_buffer_copy_async(" << size << ")");
     // Make sure both buffers are on same ctx
     GGML_ASSERT(src->ctx == dst->ctx);
 
@@ -2672,9 +2677,7 @@ static void ggml_vk_buffer_copy_async(vk_context * ctx, vk_buffer& dst, size_t d
 
 static void ggml_vk_buffer_copy(vk_buffer& dst, size_t dst_offset, vk_buffer& src, size_t src_offset, size_t size) {
     if (src->ctx == dst->ctx) {
-#ifdef GGML_VULKAN_DEBUG
-        std::cerr << "ggml_vk_buffer_copy(SINGLE_DEVICE, " << size << ")" << std::endl;
-#endif
+        VK_LOG_DEBUG("ggml_vk_buffer_copy(SINGLE_DEVICE, " << size << ")");
         // Copy within the device
         ggml_backend_vk_context * ctx = src->ctx;
 
@@ -2686,9 +2689,7 @@ static void ggml_vk_buffer_copy(vk_buffer& dst, size_t dst_offset, vk_buffer& sr
         VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "vk_buffer_copy waitForFences");
         ctx->device->device.resetFences({ ctx->fence });
     } else {
-#ifdef GGML_VULKAN_DEBUG
-        std::cerr << "ggml_vk_buffer_copy(MULTI_DEVICE, " << size << ")" << std::endl;
-#endif
+        VK_LOG_DEBUG("ggml_vk_buffer_copy(MULTI_DEVICE, " << size << ")");
         // Copy device to device
         ggml_backend_vk_context * src_ctx = src->ctx;
         ggml_backend_vk_context * dst_ctx = dst->ctx;
@@ -2706,9 +2707,7 @@ static void ggml_vk_buffer_copy(vk_buffer& dst, size_t dst_offset, vk_buffer& sr
 }
 
 static void ggml_vk_buffer_memset(ggml_backend_vk_context * ctx, vk_buffer& dst, size_t offset, uint32_t c, size_t size) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_buffer_memset(" << offset << ", " << c << ", " << size << ")" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_buffer_memset(" << offset << ", " << c << ", " << size << ")");
     // Make sure ctx owns the buffer
     GGML_ASSERT(dst->ctx == ctx);
 
@@ -2723,9 +2722,7 @@ static void ggml_vk_buffer_memset(ggml_backend_vk_context * ctx, vk_buffer& dst,
 }
 
 static void ggml_vk_h2d_tensor_2d(ggml_backend_vk_context * ctx, vk_context * subctx, vk_buffer& dst, size_t offset, const ggml_tensor * src, uint64_t i3, uint64_t i2, uint64_t i1) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_h2d_tensor_2d(dst=" << dst << ", offset=" << offset << ", src=" << src << ", i3=" << i3 << ", i2=" << i2 << ", i1=" << i1 << ")" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_h2d_tensor_2d(dst=" << dst << ", offset=" << offset << ", src=" << src << ", i3=" << i3 << ", i2=" << i2 << ", i1=" << i1 << ")");
     const uint64_t ne0 = src->ne[0];
     const uint64_t ne1 = src->ne[1];
     const uint64_t nb0 = src->nb[0];
@@ -2753,9 +2750,7 @@ static void ggml_vk_h2d_tensor_2d(ggml_backend_vk_context * ctx, vk_context * su
 }
 
 static void ggml_vk_d2h_tensor_2d(ggml_backend_vk_context * ctx, vk_context * subctx, vk_buffer& src, size_t offset, const ggml_tensor * dst) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_d2h_tensor_2d()" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_d2h_tensor_2d()");
     const uint64_t ne0 = dst->ne[0];
     const uint64_t ne1 = dst->ne[1];
     const uint64_t ne2 = dst->ne[2];
@@ -2779,9 +2774,7 @@ static void ggml_vk_d2h_tensor_2d(ggml_backend_vk_context * ctx, vk_context * su
 }
 
 static uint32_t ggml_vk_guess_split_k(int m, int n, int k) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_guess_split_k(" << m << ", " << n << ", " << k << ")" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_guess_split_k(" << m << ", " << n << ", " << k << ")");
     // if (k > 128 && (m < 128 || n < 128) && m > 2 && n > 2) {
     //     return 4;
     // }
@@ -2813,9 +2806,7 @@ static vk_pipeline ggml_vk_guess_matmul_pipeline_intel(ggml_backend_vk_context *
 }
 
 static vk_pipeline ggml_vk_guess_matmul_pipeline(ggml_backend_vk_context * ctx, vk_matmul_pipeline& mmp, int m, int n, bool aligned) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_guess_matmul_pipeline(" << m << ", " << n << ", " << aligned << ")" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_guess_matmul_pipeline(" << m << ", " << n << ", " << aligned << ")");
     switch (ctx->device->vendor_id) {
     case VK_VENDOR_ID_AMD:
         return ggml_vk_guess_matmul_pipeline_amd(ctx, mmp, m, n, aligned);
@@ -2837,9 +2828,7 @@ static vk_pipeline ggml_vk_guess_matmul_pipeline(ggml_backend_vk_context * ctx,
 }
 
 static uint32_t ggml_vk_guess_matmul_pipeline_align(ggml_backend_vk_context * ctx, vk_matmul_pipeline& mmp, int m, int n) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_guess_matmul_pipeline_align(" << m << ", " << n << ")" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_guess_matmul_pipeline_align(" << m << ", " << n << ")");
     return ggml_vk_guess_matmul_pipeline(ctx, mmp, m, n, true)->align;
 }
 
@@ -2849,9 +2838,7 @@ static void ggml_vk_matmul(
         uint32_t m, uint32_t n, uint32_t k, uint32_t stride_a, uint32_t stride_b, uint32_t stride_d,
         uint32_t batch_stride_a, uint32_t batch_stride_b, uint32_t batch_stride_d,
         uint32_t split_k, uint32_t batch, uint32_t ne02, uint32_t ne12, uint32_t broadcast2, uint32_t broadcast3) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_matmul(a: (" << a.buffer->buffer << ", " << a.offset << ", " << a.size << "), b: (" << b.buffer->buffer << ", " << b.offset << ", " << b.size << "), d: (" << d.buffer->buffer << ", " << d.offset << ", " << d.size << "), split_k: (" << (split_k_buffer.buffer != nullptr ? split_k_buffer.buffer->buffer : VK_NULL_HANDLE) << ", " << split_k_buffer.offset << ", " << split_k_buffer.size << "), m: " << m << ", n: " << n << ", k: " << k << ", stride_a: " << stride_a << ", stride_b: " << stride_b << ", stride_d: " << stride_d << ", batch_stride_a: " << batch_stride_a << ", batch_stride_b: " << batch_stride_b << ", batch_stride_d: " << batch_stride_d << ", split_k: " << split_k << ", batch: " << batch << ", ne02: " << ne02 << ", ne12: " << ne12 << ", broadcast2: " << broadcast2 << ", broadcast3: " << broadcast3 << ")" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_matmul(a: (" << a.buffer->buffer << ", " << a.offset << ", " << a.size << "), b: (" << b.buffer->buffer << ", " << b.offset << ", " << b.size << "), d: (" << d.buffer->buffer << ", " << d.offset << ", " << d.size << "), split_k: (" << (split_k_buffer.buffer != nullptr ? split_k_buffer.buffer->buffer : VK_NULL_HANDLE) << ", " << split_k_buffer.offset << ", " << split_k_buffer.size << "), m: " << m << ", n: " << n << ", k: " << k << ", stride_a: " << stride_a << ", stride_b: " << stride_b << ", stride_d: " << stride_d << ", batch_stride_a: " << batch_stride_a << ", batch_stride_b: " << batch_stride_b << ", batch_stride_d: " << batch_stride_d << ", split_k: " << split_k << ", batch: " << batch << ", ne02: " << ne02 << ", ne12: " << ne12 << ", broadcast2: " << broadcast2 << ", broadcast3: " << broadcast3 << ")");
     ggml_vk_sync_buffers(subctx);
     if (split_k == 1) {
         const vk_mat_mat_push_constants pc = { m, n, k, stride_a, stride_b, stride_d, batch_stride_a, batch_stride_b, batch_stride_d, k, ne02, ne12, broadcast2, broadcast3 };
@@ -2875,12 +2862,10 @@ static void ggml_vk_matmul_id(
         uint32_t m, uint32_t n, uint32_t k, uint32_t stride_a, uint32_t stride_b, uint32_t stride_d,
         uint32_t batch_stride_a, uint32_t batch_stride_b, uint32_t batch_stride_d,
         uint32_t n_as, uint32_t nei0, uint32_t nei1, uint32_t nbi1, uint32_t ne11) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_matmul_id(a: (" << a.buffer->buffer << ", " << a.offset << ", " << a.size << "), b: (" << b.buffer->buffer << ", " << b.offset << ", " << b.size << "), d: (" << d.buffer->buffer << ", " << d.offset << ", " << d.size << "), ids: (" << ids.buffer->buffer << ", " << ids.offset << ", " << ids.size << "), " <<
+    VK_LOG_DEBUG("ggml_vk_matmul_id(a: (" << a.buffer->buffer << ", " << a.offset << ", " << a.size << "), b: (" << b.buffer->buffer << ", " << b.offset << ", " << b.size << "), d: (" << d.buffer->buffer << ", " << d.offset << ", " << d.size << "), ids: (" << ids.buffer->buffer << ", " << ids.offset << ", " << ids.size << "), " <<
         "m: " << m << ", n: " << n << ", k: " << k << ", stride_a: " << stride_a << ", stride_b: " << stride_b << ", stride_d: " << stride_d << ", " <<
         "batch_stride_a: " << batch_stride_a << ", batch_stride_b: " << batch_stride_b << ", batch_stride_d: " << batch_stride_d << ", " <<
-        "n_as: " << n_as << ", nei0: " << nei0 << ", nei1: " << nei1 << ", nbi1: " << nbi1 << ", ne11: " << ne11 << ")"
-#endif
+        "n_as: " << n_as << ", nei0: " << nei0 << ", nei1: " << nei1 << ", nbi1: " << nbi1 << ", ne11: " << ne11 << ")");
     ggml_vk_sync_buffers(subctx);
     const vk_mat_mat_id_push_constants pc = { m, n, k, stride_a, stride_b, stride_d, batch_stride_a, batch_stride_b, batch_stride_d,
         nei0, nei1, nbi1, ne11 };
@@ -2910,10 +2895,8 @@ static vk_pipeline ggml_vk_get_cpy_pipeline(ggml_backend_vk_context * ctx, ggml_
 }
 
 static void ggml_vk_cpy_to_contiguous(ggml_backend_vk_context * ctx, vk_context * subctx, vk_pipeline pipeline, const ggml_tensor * tensor, vk_subbuffer&& in, vk_subbuffer&& out) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_cpy_to_contiguous((" << tensor << ", type=" << tensor->type << ", ne0=" << tensor->ne[0] << ", ne1=" << tensor->ne[1] << ", ne2=" << tensor->ne[2] << ", ne3=" << tensor->ne[3] << ", nb0=" << tensor->nb[0] << ", nb1=" << tensor->nb[1] << ", nb2=" << tensor->nb[2] << ", nb3=" << tensor->nb[3] << "), ";
-    std::cerr << "buffer in size=" << in.buffer->size << ", buffer out size=" << out.buffer->size << ")" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_cpy_to_contiguous((" << tensor << ", type=" << tensor->type << ", ne0=" << tensor->ne[0] << ", ne1=" << tensor->ne[1] << ", ne2=" << tensor->ne[2] << ", ne3=" << tensor->ne[3] << ", nb0=" << tensor->nb[0] << ", nb1=" << tensor->nb[1] << ", nb2=" << tensor->nb[2] << ", nb3=" << tensor->nb[3] << "), ";
+    std::cerr << "buffer in size=" << in.buffer->size << ", buffer out size=" << out.buffer->size << ")");
     const int tensor_type_size = ggml_type_size(tensor->type);
 
     const uint32_t ne = ggml_nelements(tensor);
@@ -2930,11 +2913,9 @@ static void ggml_vk_cpy_to_contiguous(ggml_backend_vk_context * ctx, vk_context
 }
 
 static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_mul_mat_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
+    VK_LOG_DEBUG("ggml_vk_mul_mat_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
     std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
-    std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)"
-#endif
+    std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)");
     GGML_ASSERT(ggml_vk_dim01_contiguous(src0) || src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16); // NOLINT
     GGML_ASSERT(ggml_vk_dim01_contiguous(src1) || src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16); // NOLINT
 
@@ -3105,11 +3086,9 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context * su
 }
 
 static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_mul_mat_vec_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
+    VK_LOG_DEBUG("ggml_vk_mul_mat_vec_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
     std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
-    std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)"
-#endif
+    std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)");
     GGML_ASSERT(ggml_vk_dim01_contiguous(src0) || src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16); // NOLINT
     GGML_ASSERT(ggml_vk_dim01_contiguous(src1) || src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16); // NOLINT
 
@@ -3260,11 +3239,9 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context
 }
 
 static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_mul_mat_p021_f16_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
+    VK_LOG_DEBUG("ggml_vk_mul_mat_p021_f16_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
     std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
-    std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)"
-#endif
+    std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)");
     GGML_ASSERT(ggml_is_permuted(src0) && ggml_is_permuted(src1));
     GGML_ASSERT(src0->nb[0] <= src0->nb[1] && src0->nb[2] <= src0->nb[3]); // NOLINT
     GGML_ASSERT(src1->nb[0] <= src1->nb[1] && src1->nb[2] <= src1->nb[3]); // NOLINT
@@ -3333,11 +3310,9 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_c
 }
 
 static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_mul_mat_nc_f16_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
+    VK_LOG_DEBUG("ggml_vk_mul_mat_nc_f16_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
     std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
-    std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)"
-#endif
+    std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)");
     GGML_ASSERT(!ggml_is_transposed(src0));
     GGML_ASSERT(!ggml_is_transposed(src1));
     GGML_ASSERT(!ggml_is_permuted(src0));
@@ -3410,9 +3385,7 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con
 }
 
 static void ggml_vk_mul_mat(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_mul_mat(" << src0 << ", " << src1 << ", " << dst << ")" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_mul_mat(" << src0 << ", " << src1 << ", " << dst << ")");
     if (src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && dst->ne[1] == 1) {
         ggml_vk_mul_mat_vec_p021_f16_f32(ctx, subctx, src0, src1, dst);
     } else if (src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && dst->ne[1] == 1) {
@@ -3425,12 +3398,10 @@ static void ggml_vk_mul_mat(ggml_backend_vk_context * ctx, vk_context * subctx,
 }
 
 static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_mul_mat_id_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
+    VK_LOG_DEBUG("ggml_vk_mul_mat_id_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
     std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
     std::cerr << "), (" << ids << ", name=" << ids->name << ", type=" << ids->type << ", ne0=" << ids->ne[0] << ", ne1=" << ids->ne[1] << ", ne2=" << ids->ne[2] << ", ne3=" << ids->ne[3] << ", nb0=" << ids->nb[0] << ", nb1=" << ids->nb[1] << ", nb2=" << ids->nb[2] << ", nb3=" << ids->nb[3];
-    std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)"
-#endif
+    std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)");
     GGML_ASSERT(ggml_vk_dim01_contiguous(src1) || src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16); // NOLINT
     GGML_ASSERT(ids->type == GGML_TYPE_I32);
 
@@ -3616,12 +3587,10 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context *
 }
 
 static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_mul_mat_vec_id_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
+    VK_LOG_DEBUG("ggml_vk_mul_mat_vec_id_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
     std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
     std::cerr << "), (" << ids << ", name=" << ids->name << ", type=" << ids->type << ", ne0=" << ids->ne[0] << ", ne1=" << ids->ne[1] << ", ne2=" << ids->ne[2] << ", ne3=" << ids->ne[3] << ", nb0=" << ids->nb[0] << ", nb1=" << ids->nb[1] << ", nb2=" << ids->nb[2] << ", nb3=" << ids->nb[3];
-    std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)"
-#endif
+    std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)");
     GGML_ASSERT(ggml_vk_dim01_contiguous(src0) || src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16); // NOLINT
     GGML_ASSERT(ggml_vk_dim01_contiguous(src1) || src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16); // NOLINT
     GGML_ASSERT(ids->type == GGML_TYPE_I32);
@@ -3784,9 +3753,7 @@ static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_conte
 }
 
 static void ggml_vk_mul_mat_id(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_mul_mat_id(" << src0 << ", " << src1 << ", " << src2 << ", " << dst << ")" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_mul_mat_id(" << src0 << ", " << src1 << ", " << src2 << ", " << dst << ")");
     if (src2->ne[1] == 1 && (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type))) {
         ggml_vk_mul_mat_vec_id_q_f16(ctx, subctx, src0, src1, src2, dst);
     } else {
@@ -4020,16 +3987,14 @@ static bool ggml_vk_op_supports_incontiguous(ggml_op op) {
 
 template<typename PC>
 static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst, ggml_op op, const PC&& pc) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_op_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
+    VK_LOG_DEBUG("ggml_vk_op_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
     if (src1 != nullptr) {
         std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
     }
     if (src2 != nullptr) {
         std::cerr << "), (" << src2 << ", name=" << src2->name << ", type=" << src2->type << ", ne0=" << src2->ne[0] << ", ne1=" << src2->ne[1] << ", ne2=" << src2->ne[2] << ", ne3=" << src2->ne[3] << ", nb0=" << src2->nb[0] << ", nb1=" << src2->nb[1] << ", nb2=" << src2->nb[2] << ", nb3=" << src2->nb[3];
     }
-    std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "), " << ggml_op_name(op) << ")"
-#endif
+    std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "), " << ggml_op_name(op) << ")");
     GGML_ASSERT(op == GGML_OP_GET_ROWS || (!ggml_is_quantized(src0->type) && (src1 == nullptr || !ggml_is_quantized(src1->type)))); // NOLINT
     GGML_ASSERT(ggml_vk_op_supports_incontiguous(op) || ggml_vk_dim01_contiguous(src0)); // NOLINT
     GGML_ASSERT(dst->extra != nullptr);
@@ -4527,9 +4492,7 @@ static void ggml_vk_print_matrix_area(const void * data, ggml_type type, int ne0
 
 template <typename X_TYPE, typename Y_TYPE>
 static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t n, size_t k, size_t batch, size_t num_it, int split_k, int shader_size) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_test_matmul(" << m << ", " << n << ", " << k << ", " << batch << ", " << num_it << ", " << split_k << ", " << shader_size << ")" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_test_matmul(" << m << ", " << n << ", " << k << ", " << batch << ", " << num_it << ", " << split_k << ", " << shader_size << ")");
     const size_t x_ne = m * k * batch;
     const size_t y_ne = k * n * batch;
     const size_t d_ne = m * n * batch;
@@ -4943,9 +4906,7 @@ static void ggml_vk_test_h2d_nc(ggml_backend_vk_context * ctx, size_t ne0, size_
 }
 
 static void ggml_vk_test_transfer(ggml_backend_vk_context * ctx, size_t ne, bool pinned) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_test_transfer(" << ne << ")" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_test_transfer(" << ne << ")");
     // Check transfers are correct
     vk_buffer buffer = ggml_vk_create_buffer_check(ctx, sizeof(float) * ne, vk::MemoryPropertyFlagBits::eDeviceLocal);
 
@@ -5029,9 +4990,7 @@ static void ggml_vk_quantize_data(const float * from, void * to, size_t ne, ggml
 }
 
 static void ggml_vk_test_dequant(ggml_backend_vk_context * ctx, size_t ne, ggml_type quant) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_test_dequant(" << ne << ")" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_test_dequant(" << ne << ")");
     const size_t x_sz = sizeof(float) * ne;
     const size_t x_sz_f16 = sizeof(ggml_fp16_t) * ne;
     const size_t qx_sz = ne * ggml_type_size(quant)/ggml_blck_size(quant);
@@ -5108,9 +5067,7 @@ static void ggml_vk_test_dequant(ggml_backend_vk_context * ctx, size_t ne, ggml_
 }
 
 static void ggml_vk_test_dequant_matmul(ggml_backend_vk_context * ctx, size_t m, size_t n, size_t k, size_t batch, size_t num_it, size_t split_k, size_t shader_size, ggml_type quant) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_test_dequant_matmul(" << m << ", " << n << ", " << k << ", " << batch << ", " << num_it << ", " << split_k << ", " << ggml_type_name(quant) << ")" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_test_dequant_matmul(" << m << ", " << n << ", " << k << ", " << batch << ", " << num_it << ", " << split_k << ", " << ggml_type_name(quant) << ")");
     const size_t x_ne = m * k * batch;
     const size_t y_ne = k * n * batch;
     const size_t d_ne = m * n * batch;
@@ -5294,9 +5251,7 @@ static void ggml_vk_test_dequant_matmul(ggml_backend_vk_context * ctx, size_t m,
 #endif
 
 static ggml_tensor_extra_gpu * ggml_vk_tensor_create_extra(ggml_tensor * tensor) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_create_extra(" << tensor << " (" << tensor->name << ", " << ggml_op_name(tensor->op) << "))" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_create_extra(" << tensor << " (" << tensor->name << ", " << ggml_op_name(tensor->op) << "))");
     ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu;
     extra->reset();
     tensor->extra = extra;
@@ -5304,9 +5259,7 @@ static ggml_tensor_extra_gpu * ggml_vk_tensor_create_extra(ggml_tensor * tensor)
 }
 
 static void ggml_vk_preallocate_buffers_graph(ggml_backend_vk_context * ctx, ggml_tensor * node){
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_preallocate_buffers_graph(" << node << ")" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_preallocate_buffers_graph(" << node << ")");
     ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) node->extra;
 
     if (extra == nullptr) {
@@ -5341,7 +5294,7 @@ static void ggml_vk_preallocate_buffers_graph(ggml_backend_vk_context * ctx, ggm
 
     bool mmp = (use_src0 && use_src1 && src1_type == GGML_TYPE_F32) ? ggml_vk_get_mul_mat_mat_pipeline(ctx, src0_type, y_non_contig ? GGML_TYPE_F16 : src1->type) != nullptr : false;
 
-    const bool qx_needs_dequant = use_src0 && (mmp || x_non_contig);
+    const bool qx_needs_dequant = use_src0 && (!mmp || x_non_contig);
     const bool qy_needs_dequant = use_src1 && ((src1->type != GGML_TYPE_F16 && !y_f32_kernel) || y_non_contig);
 
     int split_k;
@@ -5419,9 +5372,6 @@ static void ggml_vk_preallocate_buffers_graph(ggml_backend_vk_context * ctx, ggm
 }
 
 static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_preallocate_buffers(x_size: " << ctx->prealloc_size_x << " y_size: " << ctx->prealloc_size_y << " split_k_size: " << ctx->prealloc_size_split_k << ")" << std::endl;
-#endif
 #if defined(GGML_VULKAN_RUN_TESTS)
     ctx->staging = ggml_vk_create_buffer_check(ctx, 100ul * 1024ul * 1024ul,
         vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached,
@@ -5560,6 +5510,7 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
 #endif
 
     if (ctx->prealloc_x == nullptr || (ctx->prealloc_size_x > 0 && ctx->prealloc_x->size < ctx->prealloc_size_x)) {
+        VK_LOG_MEMORY("ggml_vk_preallocate_buffers(x_size: " << ctx->prealloc_size_x << ")");
         // Resize buffer
         if (ctx->prealloc_x != nullptr) {
             ggml_vk_destroy_buffer(ctx->prealloc_x);
@@ -5567,6 +5518,7 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
         ctx->prealloc_x = ggml_vk_create_buffer_device(ctx, ctx->prealloc_size_x);
     }
     if (ctx->prealloc_y == nullptr || (ctx->prealloc_size_y > 0 && ctx->prealloc_y->size < ctx->prealloc_size_y)) {
+        VK_LOG_MEMORY("ggml_vk_preallocate_buffers(y_size: " << ctx->prealloc_size_y << ")");
         // Resize buffer
         if (ctx->prealloc_y != nullptr) {
             ggml_vk_destroy_buffer(ctx->prealloc_y);
@@ -5574,6 +5526,7 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
         ctx->prealloc_y = ggml_vk_create_buffer_device(ctx, ctx->prealloc_size_y);
     }
     if (ctx->prealloc_split_k == nullptr || (ctx->prealloc_size_split_k > 0 && ctx->prealloc_split_k->size < ctx->prealloc_size_split_k)) {
+        VK_LOG_MEMORY("ggml_vk_preallocate_buffers(split_k_size: " << ctx->prealloc_size_split_k << ")");
         // Resize buffer
         if (ctx->prealloc_split_k != nullptr) {
             ggml_vk_destroy_buffer(ctx->prealloc_split_k);
@@ -5581,6 +5534,7 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
         ctx->prealloc_split_k = ggml_vk_create_buffer_device(ctx, ctx->prealloc_size_split_k);
     }
     if (ctx->staging == nullptr || (ctx->staging_size > 0 && ctx->staging->size < ctx->staging_size)) {
+        VK_LOG_MEMORY("ggml_vk_preallocate_buffers(staging_size: " << ctx->staging_size << ")");
         // Resize buffer
         if (ctx->staging != nullptr) {
             ggml_vk_destroy_buffer(ctx->staging);
@@ -5598,9 +5552,7 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
         return;
     }
 
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_build_graph(" << node << ", " << ggml_op_name(node->op) << ")" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_build_graph(" << node << ", " << ggml_op_name(node->op) << ")");
     ctx->semaphore_idx = 0;
     ctx->staging_offset = 0;
 
@@ -5823,9 +5775,7 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_compute_
         return true;
     }
 
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_compute_forward(" << tensor << ", name=" << tensor->name << ", op=" << ggml_op_name(tensor->op) << ", type=" << tensor->type << ", ne0=" << tensor->ne[0] << ", ne1=" << tensor->ne[1] << ", ne2=" << tensor->ne[2] << ", ne3=" << tensor->ne[3] << ", nb0=" << tensor->nb[0] << ", nb1=" << tensor->nb[1] << ", nb2=" << tensor->nb[2] << ", nb3=" << tensor->nb[3] << ", view_src=" << tensor->view_src << ", view_offs=" << tensor->view_offs << ")" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_compute_forward(" << tensor << ", name=" << tensor->name << ", op=" << ggml_op_name(tensor->op) << ", type=" << tensor->type << ", ne0=" << tensor->ne[0] << ", ne1=" << tensor->ne[1] << ", ne2=" << tensor->ne[2] << ", ne3=" << tensor->ne[3] << ", nb0=" << tensor->nb[0] << ", nb1=" << tensor->nb[1] << ", nb2=" << tensor->nb[2] << ", nb3=" << tensor->nb[3] << ", view_src=" << tensor->view_src << ", view_offs=" << tensor->view_offs << ")");
 
 #ifdef GGML_VULKAN_CHECK_RESULTS
     ggml_vk_check_results_0(ctx, params, tensor);
@@ -5860,9 +5810,7 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_compute_
 
 // Clean up after graph processing is done
 static void ggml_vk_graph_cleanup(ggml_backend_vk_context * ctx) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_graph_cleanup()" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_graph_cleanup()");
     for (auto& buffer : ctx->gc.temp_buffers) {
         ggml_vk_pool_free(ctx, buffer);
     }
@@ -5906,9 +5854,7 @@ static void ggml_vk_graph_cleanup(ggml_backend_vk_context * ctx) {
 
 // Clean up on backend free
 static void ggml_vk_cleanup(ggml_backend_vk_context * ctx) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_cleanup(" << ctx->idx << ")" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_cleanup(" << ctx->idx << ")");
     ggml_vk_graph_cleanup(ctx);
 
     ggml_vk_destroy_buffer(ctx->prealloc_x);
@@ -6003,9 +5949,7 @@ GGML_CALL static bool ggml_backend_buffer_is_vk(ggml_backend_buffer_t buffer) {
 }
 
 GGML_CALL static void ggml_backend_vk_buffer_free_buffer(ggml_backend_buffer_t buffer) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_backend_vk_buffer_free_buffer()" << std::endl;
-#endif
+    VK_LOG_MEMORY("ggml_backend_vk_buffer_free_buffer()");
     ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;
     ggml_vk_destroy_buffer(ctx->dev_buffer);
     delete ctx;
@@ -6018,9 +5962,7 @@ GGML_CALL static void * ggml_backend_vk_buffer_get_base(ggml_backend_buffer_t bu
 }
 
 GGML_CALL static void ggml_backend_vk_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_backend_vk_buffer_init_tensor(" << buffer << " (" << buffer->context << "), " << tensor << ")" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_backend_vk_buffer_init_tensor(" << buffer << " (" << buffer->context << "), " << tensor << ")");
     ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;
 
     if (tensor->view_src != nullptr) {
@@ -6036,9 +5978,7 @@ GGML_CALL static void ggml_backend_vk_buffer_init_tensor(ggml_backend_buffer_t b
 }
 
 GGML_CALL static void ggml_backend_vk_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_backend_vk_buffer_set_tensor(" << buffer << ", " << tensor << ", " << data << ", " << offset << ", " << size << ")" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_backend_vk_buffer_set_tensor(" << buffer << ", " << tensor << ", " << data << ", " << offset << ", " << size << ")");
     ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;
 
     ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
@@ -6049,9 +5989,7 @@ GGML_CALL static void ggml_backend_vk_buffer_set_tensor(ggml_backend_buffer_t bu
 }
 
 GGML_CALL static void ggml_backend_vk_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_backend_vk_buffer_get_tensor(" << buffer << ", " << tensor << ", " << data << ", " << offset << ", " << size << ")" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_backend_vk_buffer_get_tensor(" << buffer << ", " << tensor << ", " << data << ", " << offset << ", " << size << ")");
     ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;
 
     ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
@@ -6109,9 +6047,7 @@ GGML_CALL static const char * ggml_backend_vk_buffer_type_name(ggml_backend_buff
 }
 
 GGML_CALL static ggml_backend_buffer_t ggml_backend_vk_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_backend_vk_buffer_type_alloc_buffer(" << size << ")" << std::endl;
-#endif
+    VK_LOG_MEMORY("ggml_backend_vk_buffer_type_alloc_buffer(" << size << ")");
     ggml_backend_vk_buffer_type_context * ctx = (ggml_backend_vk_buffer_type_context *) buft->context;
 
     vk_buffer dev_buffer = nullptr;
@@ -6154,9 +6090,7 @@ static ggml_backend_buffer_type_i ggml_backend_vk_buffer_type_interface = {
 GGML_CALL ggml_backend_buffer_type_t ggml_backend_vk_buffer_type(size_t dev_num) {
     ggml_vk_instance_init();
 
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_backend_vk_buffer_type(" << dev_num << ")" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_backend_vk_buffer_type(" << dev_num << ")");
 
     GGML_ASSERT(dev_num < vk_instance.device_indices.size());
 
@@ -6180,16 +6114,12 @@ GGML_CALL static const char * ggml_backend_vk_host_buffer_name(ggml_backend_buff
 }
 
 GGML_CALL static void ggml_backend_vk_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_backend_vk_host_buffer_free_buffer()" << std::endl;
-#endif
+    VK_LOG_MEMORY("ggml_backend_vk_host_buffer_free_buffer()");
     ggml_vk_host_free(&vk_instance.contexts[0], buffer->context);
 }
 
 GGML_CALL static ggml_backend_buffer_t ggml_backend_vk_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_backend_vk_host_buffer_type_alloc_buffer(" << size << ")" << std::endl;
-#endif
+    VK_LOG_MEMORY("ggml_backend_vk_host_buffer_type_alloc_buffer(" << size << ")");
     size += 32; // Behave like the CPU buffer type
     void * ptr = nullptr;
     try {
@@ -6246,9 +6176,7 @@ GGML_CALL static const char * ggml_backend_vk_name(ggml_backend_t backend) {
 
 GGML_CALL static void ggml_backend_vk_free(ggml_backend_t backend) {
     ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_backend_vk_free(" << ctx->name << ")" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_backend_vk_free(" << ctx->name << ")");
 
     size_t idx = ctx->idx;
 
@@ -6272,9 +6200,7 @@ GGML_CALL static ggml_backend_buffer_type_t ggml_backend_vk_get_default_buffer_t
 }
 
 GGML_CALL static void ggml_backend_vk_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_backend_vk_set_tensor_async(" << size << ")" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_backend_vk_set_tensor_async(" << size << ")");
     ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
     GGML_ASSERT((tensor->buffer->buft == ggml_backend_vk_buffer_type(ctx->idx) || tensor->buffer->buft == ggml_backend_vk_host_buffer_type()) && "unsupported buffer type");
 
@@ -6292,9 +6218,7 @@ GGML_CALL static void ggml_backend_vk_set_tensor_async(ggml_backend_t backend, g
 }
 
 GGML_CALL static void ggml_backend_vk_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_backend_vk_get_tensor_async(" << size << ")" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_backend_vk_get_tensor_async(" << size << ")");
     ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
     GGML_ASSERT((tensor->buffer->buft == ggml_backend_vk_buffer_type(ctx->idx) || tensor->buffer->buft == ggml_backend_vk_host_buffer_type()) && "unsupported buffer type");
 
@@ -6312,9 +6236,7 @@ GGML_CALL static void ggml_backend_vk_get_tensor_async(ggml_backend_t backend, c
 }
 
 GGML_CALL static bool ggml_backend_vk_cpy_tensor_async(ggml_backend_t backend, const ggml_tensor * src, ggml_tensor * dst) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_backend_vk_cpy_tensor_async()" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_backend_vk_cpy_tensor_async()");
     ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
     if ((dst->buffer->buft == ggml_backend_vk_buffer_type(ctx->idx) || dst->buffer->buft == ggml_backend_vk_host_buffer_type()) && ggml_backend_buffer_is_vk(src->buffer)) {
         ggml_tensor_extra_gpu * src_extra = (ggml_tensor_extra_gpu *) src->extra;
@@ -6337,9 +6259,7 @@ GGML_CALL static bool ggml_backend_vk_cpy_tensor_async(ggml_backend_t backend, c
 }
 
 GGML_CALL static void ggml_backend_vk_synchronize(ggml_backend_t backend) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_backend_vk_synchronize()" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_backend_vk_synchronize()");
     ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
     if(ctx->transfer_ctx == nullptr) {
         return;
@@ -6367,9 +6287,7 @@ static bool ggml_vk_is_empty(ggml_tensor * node) {
 }
 
 GGML_CALL static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_backend_vk_graph_compute(" << cgraph->n_nodes << " nodes)" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_backend_vk_graph_compute(" << cgraph->n_nodes << " nodes)");
     ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
 
     for (int i = 0; i < cgraph->n_nodes; i++) {
@@ -6582,9 +6500,7 @@ GGML_CALL ggml_backend_t ggml_backend_vk_init(size_t dev_num) {
     if (vk_instance.initialized[dev_num]) {
         return vk_instance.backends[dev_num];
     }
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_backend_vk_init(" << dev_num << ")" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_backend_vk_init(" << dev_num << ")");
 
     ggml_backend_vk_context * ctx = &vk_instance.contexts[dev_num];
     ggml_vk_init(ctx, dev_num);
@@ -6800,9 +6716,7 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
         return;
     }
 
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_check_results_0(" << tensor->name << ")" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_check_results_0(" << tensor->name << ")");
 
     ggml_tensor * src0 = tensor->src[0];
     ggml_tensor * src1 = tensor->src[1];
@@ -7108,9 +7022,7 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_compute_
         return;
     }
 
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_check_results_1(" << tensor->name << ")" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_check_results_1(" << tensor->name << ")");
 
     ggml_tensor * src0 = tensor->src[0];
     ggml_tensor * src1 = tensor->src[1];