llama_cpp 0.16.1 → 0.16.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -8,6 +8,7 @@
 
  #include <algorithm>
  #include <cmath>
+ #include <iomanip>
  #include <iostream>
  #include <tuple>
  #include <vector>
@@ -57,6 +58,12 @@ static_assert(K_QUANTS_PER_ITERATION == 1 || K_QUANTS_PER_ITERATION == 2, "K_QUA
  } \
  } while (0)
 
+ #ifdef GGML_VULKAN_DEBUG
+ #define VK_LOG_DEBUG(msg) std::cerr << msg << std::endl
+ #else
+ #define VK_LOG_DEBUG(msg) ((void) 0)
+ #endif // GGML_VULKAN_DEBUG
+
  struct ggml_backend_vk_context;
 
  struct vk_queue {
@@ -159,9 +166,7 @@ struct vk_device {
  std::vector<vk_pipeline_ref> pipelines;
 
  ~vk_device() {
- #ifdef GGML_VULKAN_DEBUG
- std::cerr << "destroy device " << name << std::endl;
- #endif
+ VK_LOG_DEBUG("destroy device " << name);
  device.destroyCommandPool(compute_queue.pool);
  if (!single_queue) {
  device.destroyCommandPool(transfer_queue.pool);
@@ -196,9 +201,7 @@ struct vk_buffer_struct {
  if (size == 0) {
  return;
  }
- #ifdef GGML_VULKAN_DEBUG
- std::cerr << "~vk_buffer_struct(" << buffer << ", " << size << ")" << std::endl;
- #endif
+ VK_LOG_DEBUG("~vk_buffer_struct(" << buffer << ", " << size << ")");
 
  device->device.freeMemory(device_memory);
  device->device.destroyBuffer(buffer);
@@ -355,6 +358,49 @@ struct ggml_vk_garbage_collector {
  std::vector<vk_context> contexts;
  };
 
+ #if defined(GGML_VULKAN_MEMORY_DEBUG) || defined(GGML_VULKAN_DEBUG)
+ #include <mutex>
+
+ #define VK_LOG_MEMORY(msg) std::cerr << "ggml_vulkan memory: " << msg << std::endl
+
+ static std::string format_size(size_t size) {
+ const size_t kib = 1024;
+ const size_t mib = kib * 1024;
+ const size_t gib = mib * 1024;
+
+ std::ostringstream oss;
+ oss << std::fixed << std::setprecision(2);
+
+ if (size >= gib) {
+ oss << static_cast<double>(size) / gib << " GiB";
+ } else if (size >= mib) {
+ oss << static_cast<double>(size) / mib << " MiB";
+ } else if (size >= kib) {
+ oss << static_cast<double>(size) / kib << " KiB";
+ } else {
+ oss << size << " B";
+ }
+
+ return oss.str();
+ }
+
+ static std::mutex log_mutex;
+
+ class vk_memory_logger {
+ public:
+ vk_memory_logger(): total_device(0), total_host(0) {}
+ void log_allocation(vk_buffer_ref buf_ref, size_t size);
+ void log_deallocation(vk_buffer_ref buf_ref);
+
+ private:
+ std::map<vk::Buffer, size_t> allocations; // Track allocations
+ size_t total_device;
+ size_t total_host;
+ };
+ #else
+ #define VK_LOG_MEMORY(msg) ((void) 0)
+ #endif // GGML_VULKAN_MEMORY_DEBUG
+
  struct ggml_backend_vk_context {
  std::string name;
 
@@ -379,8 +425,45 @@ struct ggml_backend_vk_context {
  bool initialized;
 
  size_t idx;
+
+ #ifdef GGML_VULKAN_MEMORY_DEBUG
+ vk_memory_logger memory_logger;
+ #endif
  };
 
+ #ifdef GGML_VULKAN_MEMORY_DEBUG
+ void vk_memory_logger::log_allocation(vk_buffer_ref buf_ref, size_t size) {
+ std::lock_guard<std::mutex> guard(log_mutex);
+ vk_buffer buf = buf_ref.lock();
+ const bool device = bool(buf->memory_property_flags & vk::MemoryPropertyFlagBits::eDeviceLocal);
+ const std::string type = device ? "device" : "host";
+ allocations[buf->buffer] = size;
+ total_device += device ? size : 0;
+ total_host += device ? 0 : size;
+ VK_LOG_MEMORY("VULKAN" << buf->ctx->idx << ": +" << format_size(size) << " " << type << " at " << buf->buffer << ". Total device: " << format_size(total_device) << ", total host: " << format_size(total_host));
+ }
+
+ void vk_memory_logger::log_deallocation(vk_buffer_ref buf_ref) {
+ if (buf_ref.expired() || buf_ref.lock()->size == 0) {
+ return;
+ }
+
+ std::lock_guard<std::mutex> guard(log_mutex);
+ vk_buffer buf = buf_ref.lock();
+ const bool device = bool(buf->memory_property_flags & vk::MemoryPropertyFlagBits::eDeviceLocal);
+ std::string type = device ? "device" : "host";
+ auto it = allocations.find(buf->buffer);
+ total_device -= device ? it->second : 0;
+ total_host -= device ? 0 : it->second;
+ if (it != allocations.end()) {
+ VK_LOG_MEMORY("VULKAN" << buf->ctx->idx << ": -" << format_size(it->second) << " " << type << " at " << buf->buffer << ". Total device: " << format_size(total_device) << ", total host: " << format_size(total_host));
+ allocations.erase(it);
+ } else {
+ VK_LOG_MEMORY("ERROR VULKAN" << buf->ctx->idx << ": Attempted to deallocate unknown " << type << " memory at " << buf->buffer);
+ }
+ }
+ #endif // GGML_VULKAN_MEMORY_DEBUG
+
  struct vk_instance_t {
  vk::Instance instance;
 
@@ -393,15 +476,11 @@ struct vk_instance_t {
  };
 
  static std::shared_ptr<vk_device> ggml_vk_get_device(size_t idx) {
- #ifdef GGML_VULKAN_DEBUG
- std::cerr << "ggml_vk_get_device(" << idx << ")" << std::endl;
- #endif
+ VK_LOG_DEBUG("ggml_vk_get_device(" << idx << ")");
  static std::weak_ptr<vk_device> devices[GGML_VK_MAX_DEVICES];
 
  if (devices[idx].expired()) {
- #ifdef GGML_VULKAN_DEBUG
- std::cerr << "Initializing new vk_device" << std::endl;
- #endif
+ VK_LOG_DEBUG("Initializing new vk_device");
  std::shared_ptr<vk_device> device = std::make_shared<vk_device>();
  device->initialized = false;
  devices[idx] = device;
@@ -428,9 +507,7 @@ static vk_instance_t vk_instance;
  GGML_CALL static void ggml_backend_vk_free(ggml_backend_t backend);
 
  static void ggml_vk_create_pipeline(ggml_backend_vk_context * ctx, vk_pipeline& pipeline, const std::string& name, size_t spv_size, const void* spv_data, const std::string& entrypoint, uint32_t parameter_count, uint32_t push_constant_size, std::array<uint32_t, 3> wg_denoms, std::vector<uint32_t>&& specialization_constants, uint32_t align) {
- #ifdef GGML_VULKAN_DEBUG
- std::cerr << "ggml_vk_create_pipeline(" << name << ", " << entrypoint << ", " << parameter_count << ", " << push_constant_size << ", (" << wg_denoms[0] << "," << wg_denoms[1] << "," << wg_denoms[2] << "), specialization_constants, " << align << ")" << std::endl;
- #endif
+ VK_LOG_DEBUG("ggml_vk_create_pipeline(" << name << ", " << entrypoint << ", " << parameter_count << ", " << push_constant_size << ", (" << wg_denoms[0] << "," << wg_denoms[1] << "," << wg_denoms[2] << "), specialization_constants, " << align << ")");
  GGML_ASSERT(parameter_count > 0);
  GGML_ASSERT(wg_denoms[0] > 0 && wg_denoms[1] > 0 && wg_denoms[2] > 0); // NOLINT
 
@@ -531,9 +608,7 @@ static void ggml_vk_create_pipeline(ggml_backend_vk_context * ctx, vk_pipeline&
  }
 
  static void ggml_vk_destroy_pipeline(vk::Device& device, vk_pipeline& pipeline) {
- #ifdef GGML_VULKAN_DEBUG
- std::cerr << "ggml_pipeline_destroy_pipeline(" << pipeline->name << ")" << std::endl;
- #endif
+ VK_LOG_DEBUG("ggml_pipeline_destroy_pipeline(" << pipeline->name << ")");
  for (auto& pool : pipeline->descriptor_pools) {
  device.destroyDescriptorPool(pool);
  }
@@ -551,9 +626,7 @@ static void ggml_vk_destroy_pipeline(vk::Device& device, vk_pipeline& pipeline)
  }
 
  static void ggml_pipeline_allocate_descriptor_sets(ggml_backend_vk_context * ctx, vk_pipeline& pipeline, uint32_t n) {
- #ifdef GGML_VULKAN_DEBUG
- std::cerr << "ggml_pipeline_allocate_descriptor_sets(" << pipeline->name << ", " << n << ")" << std::endl;
- #endif
+ VK_LOG_DEBUG("ggml_pipeline_allocate_descriptor_sets(" << pipeline->name << ", " << n << ")");
  if (pipeline->descriptor_sets.size() >= pipeline->descriptor_set_idx + n) {
  // Enough descriptors are available
  return;
@@ -583,16 +656,12 @@ static void ggml_pipeline_allocate_descriptor_sets(ggml_backend_vk_context * ctx
  }
 
  static void ggml_pipeline_cleanup(vk_pipeline& pipeline) {
- #ifdef GGML_VULKAN_DEBUG
- std::cerr << "ggml_pipeline_cleanup(" << pipeline->name << ")" << std::endl;
- #endif
+ VK_LOG_DEBUG("ggml_pipeline_cleanup(" << pipeline->name << ")");
  pipeline->descriptor_set_idx = 0;
  }
 
  static vk::CommandBuffer ggml_vk_create_cmd_buffer(ggml_backend_vk_context * ctx, vk_queue& q) {
- #ifdef GGML_VULKAN_DEBUG
- std::cerr << "ggml_vk_create_cmd_buffer()" << std::endl;
- #endif
+ VK_LOG_DEBUG("ggml_vk_create_cmd_buffer()");
  if (q.cmd_buffers.size() > q.cmd_buffer_idx) {
  // Reuse command buffer
  return q.cmd_buffers[q.cmd_buffer_idx++];
@@ -612,9 +681,7 @@ static vk::CommandBuffer ggml_vk_create_cmd_buffer(ggml_backend_vk_context * ctx
  }
 
  static vk_submission ggml_vk_create_submission(ggml_backend_vk_context * ctx, vk_queue& q, std::vector<vk_semaphore> wait_semaphores, std::vector<vk_semaphore> signal_semaphores) {
- #ifdef GGML_VULKAN_DEBUG
- std::cerr << "ggml_vk_create_submission()" << std::endl;
- #endif
+ VK_LOG_DEBUG("ggml_vk_create_submission()");
  vk_submission s;
  s.buffer = ggml_vk_create_cmd_buffer(ctx, q);
  s.wait_semaphores = std::move(wait_semaphores);
@@ -623,9 +690,7 @@ static vk_submission ggml_vk_create_submission(ggml_backend_vk_context * ctx, vk
  }
 
  static void ggml_vk_submit(vk_context * ctx, vk::Fence fence) {
- #ifdef GGML_VULKAN_DEBUG
- std::cerr << "ggml_vk_submit(" << ctx->seqs.size() << ", " << fence << ")" << std::endl;
- #endif
+ VK_LOG_DEBUG("ggml_vk_submit(" << ctx->seqs.size() << ", " << fence << ")");
  if (ctx->seqs.empty()) {
  return;
  }
@@ -699,9 +764,7 @@ static void ggml_vk_submit(vk_context * ctx, vk::Fence fence) {
  }
 
  static uint32_t ggml_vk_find_queue_family_index(std::vector<vk::QueueFamilyProperties>& queue_family_props, const vk::QueueFlags& required, const vk::QueueFlags& avoid, int32_t compute_index, uint32_t min_num_queues) {
- #ifdef GGML_VULKAN_DEBUG
- std::cerr << "ggml_vk_find_queue_family_index()" << std::endl;
- #endif
+ VK_LOG_DEBUG("ggml_vk_find_queue_family_index()");
  const uint32_t qfsize = queue_family_props.size();
 
  // Try with avoid preferences first
@@ -747,9 +810,7 @@ static uint32_t ggml_vk_find_queue_family_index(std::vector<vk::QueueFamilyPrope
  }
 
  static void ggml_vk_create_queue(ggml_backend_vk_context * ctx, vk_queue& q, uint32_t queue_family_index, uint32_t queue_index, vk::PipelineStageFlags&& stage_flags) {
- #ifdef GGML_VULKAN_DEBUG
- std::cerr << "ggml_vk_create_queue()" << std::endl;
- #endif
+ VK_LOG_DEBUG("ggml_vk_create_queue()");
  q.queue_family_index = queue_family_index;
 
  vk::CommandPoolCreateInfo command_pool_create_info_compute(vk::CommandPoolCreateFlags(VK_COMMAND_POOL_CREATE_TRANSIENT_BIT), queue_family_index);
@@ -763,9 +824,7 @@ static void ggml_vk_create_queue(ggml_backend_vk_context * ctx, vk_queue& q, uin
  }
 
  static vk_context * ggml_vk_create_context(ggml_backend_vk_context * ctx, vk_queue& q) {
- #ifdef GGML_VULKAN_DEBUG
- std::cerr << "ggml_vk_create_context()" << std::endl;
- #endif
+ VK_LOG_DEBUG("ggml_vk_create_context()");
  ctx->gc.contexts.emplace_back();
  vk_context * result = &ctx->gc.contexts[ctx->gc.contexts.size() - 1];
  memset((void *) result, 0, sizeof(vk_context));
@@ -775,9 +834,7 @@ static vk_context * ggml_vk_create_context(ggml_backend_vk_context * ctx, vk_que
  }
 
  static vk_semaphore * ggml_vk_create_binary_semaphore(ggml_backend_vk_context * ctx) {
- #ifdef GGML_VULKAN_DEBUG
- std::cerr << "ggml_vk_create_timeline_semaphore()" << std::endl;
- #endif
+ VK_LOG_DEBUG("ggml_vk_create_timeline_semaphore()");
  vk::SemaphoreTypeCreateInfo tci{ vk::SemaphoreType::eBinary, 0 };
  vk::SemaphoreCreateInfo ci{};
  ci.setPNext(&tci);
@@ -787,9 +844,7 @@ static vk_semaphore * ggml_vk_create_binary_semaphore(ggml_backend_vk_context *
  }
 
  static vk_semaphore * ggml_vk_create_timeline_semaphore(ggml_backend_vk_context * ctx) {
- #ifdef GGML_VULKAN_DEBUG
- std::cerr << "ggml_vk_create_timeline_semaphore()" << std::endl;
- #endif
+ VK_LOG_DEBUG("ggml_vk_create_timeline_semaphore()");
  if (ctx->semaphore_idx >= ctx->gc.tl_semaphores.size()) {
  vk::SemaphoreTypeCreateInfo tci{ vk::SemaphoreType::eTimeline, 0 };
  vk::SemaphoreCreateInfo ci{};
@@ -808,9 +863,7 @@ static vk::Event ggml_vk_create_event(ggml_backend_vk_context * ctx) {
  }
 
  static void ggml_vk_queue_cleanup(ggml_backend_vk_context * ctx, vk_queue& q) {
- #ifdef GGML_VULKAN_DEBUG
- std::cerr << "ggml_vk_queue_cleanup()" << std::endl;
- #endif
+ VK_LOG_DEBUG("ggml_vk_queue_cleanup()");
  // Requires command buffers to be done
 
  ctx->device->device.resetCommandPool(q.pool);
@@ -830,9 +883,7 @@ static uint32_t find_properties(const vk::PhysicalDeviceMemoryProperties* mem_pr
  }
 
  static vk_buffer ggml_vk_create_buffer(ggml_backend_vk_context * ctx, size_t size, vk::MemoryPropertyFlags req_flags, vk::MemoryPropertyFlags fallback_flags = vk::MemoryPropertyFlags(0)) {
- #ifdef GGML_VULKAN_DEBUG
- std::cerr << "ggml_vk_create_buffer(device " << ctx->idx << ", " << size << ", " << to_string(req_flags) << ", " << to_string(fallback_flags) << ")" << std::endl;
- #endif
+ VK_LOG_DEBUG("ggml_vk_create_buffer(device " << ctx->idx << ", " << size << ", " << to_string(req_flags) << ", " << to_string(fallback_flags) << ")");
  vk_buffer buf = std::make_shared<vk_buffer_struct>();
 
  if (size == 0) {
@@ -892,8 +943,8 @@ static vk_buffer ggml_vk_create_buffer(ggml_backend_vk_context * ctx, size_t siz
 
  buf->device = ctx->device;
 
- #ifdef GGML_VULKAN_DEBUG
- std::cerr << "Created buffer " << buf->buffer << std::endl;
+ #ifdef GGML_VULKAN_MEMORY_DEBUG
+ ctx->memory_logger.log_allocation(buf, size);
  #endif
 
  return buf;
@@ -928,6 +979,14 @@ static vk_buffer ggml_vk_create_buffer_device(ggml_backend_vk_context * ctx, siz
  }
 
  static void ggml_vk_destroy_buffer(vk_buffer& buf) {
+ if (buf == nullptr) {
+ return;
+ }
+
+ #ifdef GGML_VULKAN_MEMORY_DEBUG
+ buf->ctx->memory_logger.log_deallocation(buf);
+ #endif
+
  buf.reset();
  }
 
@@ -936,9 +995,7 @@ static vk_subbuffer ggml_vk_subbuffer(vk_buffer& buf) {
  }
 
  static void ggml_vk_sync_buffers(vk_context * ctx) {
- #ifdef GGML_VULKAN_DEBUG
- std::cerr << "ggml_vk_sync_buffers()" << std::endl;
- #endif
+ VK_LOG_DEBUG("ggml_vk_sync_buffers()");
  const std::vector<vk::MemoryBarrier> mem_barriers{ { { vk::AccessFlagBits::eMemoryRead | vk::AccessFlagBits::eMemoryWrite }, { vk::AccessFlagBits::eMemoryRead | vk::AccessFlagBits::eMemoryWrite } } };
 
  ctx->s->buffer.pipelineBarrier(
@@ -952,9 +1009,7 @@ static void ggml_vk_sync_buffers(vk_context * ctx) {
  }
 
  static void ggml_vk_wait_events(vk_context * ctx, std::vector<vk::Event>&& events) {
- #ifdef GGML_VULKAN_DEBUG
- std::cerr << "ggml_vk_wait_events()" << std::endl;
- #endif
+ VK_LOG_DEBUG("ggml_vk_wait_events()");
  if (events.empty()) {
  return;
  }
@@ -989,9 +1044,7 @@ static bool ggml_vk_build_shader(ggml_type type) {
  }
 
  static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) {
- #ifdef GGML_VULKAN_DEBUG
- std::cerr << "ggml_vk_load_shaders(" << ctx->name << ")" << std::endl;
- #endif
+ VK_LOG_DEBUG("ggml_vk_load_shaders(" << ctx->name << ")");
 
  const std::shared_ptr<vk_device> device = ctx->device;
 
@@ -1042,12 +1095,12 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) {
  ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K] = std::make_shared<vk_matmul_pipeline_struct>();
 
  if (device->fp16) {
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->l, "matmul_f32_l", matmul_f32_len, matmul_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, 1);
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->m, "matmul_f32_m", matmul_f32_len, matmul_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, 1);
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->s, "matmul_f32_s", matmul_f32_len, matmul_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, 1);
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->a_l, "matmul_f32_aligned_l", matmul_f32_aligned_len, matmul_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, l_align);
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->a_m, "matmul_f32_aligned_m", matmul_f32_aligned_len, matmul_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, m_align);
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->a_s, "matmul_f32_aligned_s", matmul_f32_aligned_len, matmul_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, s_align);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->l, "matmul_f32_l", matmul_f32_f32_len, matmul_f32_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, 1);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->m, "matmul_f32_m", matmul_f32_f32_len, matmul_f32_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, 1);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->s, "matmul_f32_s", matmul_f32_f32_len, matmul_f32_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, 1);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->a_l, "matmul_f32_aligned_l", matmul_f32_f32_aligned_len, matmul_f32_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, l_align);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->a_m, "matmul_f32_aligned_m", matmul_f32_f32_aligned_len, matmul_f32_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, m_align);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->a_s, "matmul_f32_aligned_s", matmul_f32_f32_aligned_len, matmul_f32_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, s_align);
 
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_f16->l, "matmul_f32_f16_l", matmul_f32_f16_len, matmul_f32_f16_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, 1);
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_f16->m, "matmul_f32_f16_m", matmul_f32_f16_len, matmul_f32_f16_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, 1);
@@ -1140,12 +1193,12 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) {
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->a_m, "matmul_q6_k_f32_aligned_m", matmul_q6_k_f32_aligned_len, matmul_q6_k_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->a_s, "matmul_q6_k_f32_aligned_s", matmul_q6_k_f32_aligned_len, matmul_q6_k_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
 
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->l, "matmul_id_f32_l", matmul_id_f32_len, matmul_id_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_l, 1);
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->m, "matmul_id_f32_m", matmul_id_f32_len, matmul_id_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_m, 1);
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->s, "matmul_id_f32_s", matmul_id_f32_len, matmul_id_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_s, 1);
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->a_l, "matmul_id_f32_aligned_l", matmul_id_f32_aligned_len, matmul_id_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_l, l_align);
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->a_m, "matmul_id_f32_aligned_m", matmul_id_f32_aligned_len, matmul_id_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_m, m_align);
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->a_s, "matmul_id_f32_aligned_s", matmul_id_f32_aligned_len, matmul_id_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_s, s_align);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->l, "matmul_id_f32_l", matmul_id_f32_f32_len, matmul_id_f32_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_l, 1);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->m, "matmul_id_f32_m", matmul_id_f32_f32_len, matmul_id_f32_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_m, 1);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->s, "matmul_id_f32_s", matmul_id_f32_f32_len, matmul_id_f32_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_s, 1);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->a_l, "matmul_id_f32_aligned_l", matmul_id_f32_f32_aligned_len, matmul_id_f32_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_l, l_align);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->a_m, "matmul_id_f32_aligned_m", matmul_id_f32_f32_aligned_len, matmul_id_f32_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_m, m_align);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->a_s, "matmul_id_f32_aligned_s", matmul_id_f32_f32_aligned_len, matmul_id_f32_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_s, s_align);
 
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f16->l, "matmul_id_f16_l", matmul_id_f16_len, matmul_id_f16_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_l, 1);
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f16->m, "matmul_id_f16_m", matmul_id_f16_len, matmul_id_f16_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_m, 1);
@@ -1231,12 +1284,12 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) {
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K]->a_m, "matmul_id_q6_k_f32_aligned_m", matmul_id_q6_k_f32_aligned_len, matmul_id_q6_k_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K]->a_s, "matmul_id_q6_k_f32_aligned_s", matmul_id_q6_k_f32_aligned_len, matmul_id_q6_k_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
  } else {
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->l, "matmul_f32_l", matmul_f32_fp32_len, matmul_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, 1);
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->m, "matmul_f32_m", matmul_f32_fp32_len, matmul_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, 1);
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->s, "matmul_f32_s", matmul_f32_fp32_len, matmul_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, 1);
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->a_l, "matmul_f32_aligned_l", matmul_f32_aligned_fp32_len, matmul_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, l_align);
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->a_m, "matmul_f32_aligned_m", matmul_f32_aligned_fp32_len, matmul_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, m_align);
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->a_s, "matmul_f32_aligned_s", matmul_f32_aligned_fp32_len, matmul_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, s_align);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->l, "matmul_f32_l", matmul_f32_f32_fp32_len, matmul_f32_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, 1);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->m, "matmul_f32_m", matmul_f32_f32_fp32_len, matmul_f32_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, 1);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->s, "matmul_f32_s", matmul_f32_f32_fp32_len, matmul_f32_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, 1);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->a_l, "matmul_f32_aligned_l", matmul_f32_f32_aligned_fp32_len, matmul_f32_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, l_align);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->a_m, "matmul_f32_aligned_m", matmul_f32_f32_aligned_fp32_len, matmul_f32_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, m_align);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->a_s, "matmul_f32_aligned_s", matmul_f32_f32_aligned_fp32_len, matmul_f32_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, s_align);
 
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_f16->l, "matmul_f32_f16_l", matmul_f32_f16_fp32_len, matmul_f32_f16_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, 1);
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_f16->m, "matmul_f32_f16_m", matmul_f32_f16_fp32_len, matmul_f32_f16_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, 1);
@@ -1329,12 +1382,12 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) {
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->a_m, "matmul_q6_k_f32_aligned_m", matmul_q6_k_f32_aligned_fp32_len, matmul_q6_k_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->a_s, "matmul_q6_k_f32_aligned_s", matmul_q6_k_f32_aligned_fp32_len, matmul_q6_k_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
 
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->l, "matmul_id_f32_l", matmul_id_f32_fp32_len, matmul_id_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_l, 1);
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->m, "matmul_id_f32_m", matmul_id_f32_fp32_len, matmul_id_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_m, 1);
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->s, "matmul_id_f32_s", matmul_id_f32_fp32_len, matmul_id_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_s, 1);
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->a_l, "matmul_id_f32_aligned_l", matmul_id_f32_aligned_fp32_len, matmul_id_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_l, l_align);
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->a_m, "matmul_id_f32_aligned_m", matmul_id_f32_aligned_fp32_len, matmul_id_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_m, m_align);
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->a_s, "matmul_id_f32_aligned_s", matmul_id_f32_aligned_fp32_len, matmul_id_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_s, s_align);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->l, "matmul_id_f32_l", matmul_id_f32_f32_fp32_len, matmul_id_f32_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_l, 1);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->m, "matmul_id_f32_m", matmul_id_f32_f32_fp32_len, matmul_id_f32_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_m, 1);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->s, "matmul_id_f32_s", matmul_id_f32_f32_fp32_len, matmul_id_f32_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_s, 1);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->a_l, "matmul_id_f32_aligned_l", matmul_id_f32_f32_aligned_fp32_len, matmul_id_f32_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_l, l_align);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->a_m, "matmul_id_f32_aligned_m", matmul_id_f32_f32_aligned_fp32_len, matmul_id_f32_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_m, m_align);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->a_s, "matmul_id_f32_aligned_s", matmul_id_f32_f32_aligned_fp32_len, matmul_id_f32_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_s, s_align);
 
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f16->l, "matmul_id_f16_l", matmul_id_f16_fp32_len, matmul_id_f16_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_l, 1);
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f16->m, "matmul_id_f16_m", matmul_id_f16_fp32_len, matmul_id_f16_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_m, 1);
@@ -1429,11 +1482,11 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) {
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q5_0], "mul_mat_vec_q5_0_f32_f32", mul_mat_vec_q5_0_f32_f32_len, mul_mat_vec_q5_0_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q5_1], "mul_mat_vec_q5_1_f32_f32", mul_mat_vec_q5_1_f32_f32_len, mul_mat_vec_q5_1_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q8_0], "mul_mat_vec_q8_0_f32_f32", mul_mat_vec_q8_0_f32_f32_len, mul_mat_vec_q8_0_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q2_K], "mul_mat_vec_q2_K_f32_f32", mul_mat_vec_q2_K_f32_f32_len, mul_mat_vec_q2_K_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q3_K], "mul_mat_vec_q3_K_f32_f32", mul_mat_vec_q3_K_f32_f32_len, mul_mat_vec_q3_K_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q4_K], "mul_mat_vec_q4_K_f32_f32", mul_mat_vec_q4_K_f32_f32_len, mul_mat_vec_q4_K_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q5_K], "mul_mat_vec_q5_K_f32_f32", mul_mat_vec_q5_K_f32_f32_len, mul_mat_vec_q5_K_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q6_K], "mul_mat_vec_q6_K_f32_f32", mul_mat_vec_q6_K_f32_f32_len, mul_mat_vec_q6_K_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q2_K], "mul_mat_vec_q2_k_f32_f32", mul_mat_vec_q2_k_f32_f32_len, mul_mat_vec_q2_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q3_K], "mul_mat_vec_q3_k_f32_f32", mul_mat_vec_q3_k_f32_f32_len, mul_mat_vec_q3_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q4_K], "mul_mat_vec_q4_k_f32_f32", mul_mat_vec_q4_k_f32_f32_len, mul_mat_vec_q4_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q5_K], "mul_mat_vec_q5_k_f32_f32", mul_mat_vec_q5_k_f32_f32_len, mul_mat_vec_q5_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q6_K], "mul_mat_vec_q6_k_f32_f32", mul_mat_vec_q6_k_f32_f32_len, mul_mat_vec_q6_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
 
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_F32 ], "mul_mat_vec_f32_f16_f32", mul_mat_vec_f32_f16_f32_len, mul_mat_vec_f32_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_F16 ], "mul_mat_vec_f16_f16_f32", mul_mat_vec_f16_f16_f32_len, mul_mat_vec_f16_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
@@ -1442,11 +1495,11 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) {
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q5_0], "mul_mat_vec_q5_0_f16_f32", mul_mat_vec_q5_0_f16_f32_len, mul_mat_vec_q5_0_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q5_1], "mul_mat_vec_q5_1_f16_f32", mul_mat_vec_q5_1_f16_f32_len, mul_mat_vec_q5_1_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q8_0], "mul_mat_vec_q8_0_f16_f32", mul_mat_vec_q8_0_f16_f32_len, mul_mat_vec_q8_0_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q2_K], "mul_mat_vec_q2_K_f16_f32", mul_mat_vec_q2_K_f16_f32_len, mul_mat_vec_q2_K_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q3_K], "mul_mat_vec_q3_K_f16_f32", mul_mat_vec_q3_K_f16_f32_len, mul_mat_vec_q3_K_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q4_K], "mul_mat_vec_q4_K_f16_f32", mul_mat_vec_q4_K_f16_f32_len, mul_mat_vec_q4_K_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q5_K], "mul_mat_vec_q5_K_f16_f32", mul_mat_vec_q5_K_f16_f32_len, mul_mat_vec_q5_K_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q6_K], "mul_mat_vec_q6_K_f16_f32", mul_mat_vec_q6_K_f16_f32_len, mul_mat_vec_q6_K_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q2_K], "mul_mat_vec_q2_k_f16_f32", mul_mat_vec_q2_k_f16_f32_len, mul_mat_vec_q2_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q3_K], "mul_mat_vec_q3_k_f16_f32", mul_mat_vec_q3_k_f16_f32_len, mul_mat_vec_q3_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q4_K], "mul_mat_vec_q4_k_f16_f32", mul_mat_vec_q4_k_f16_f32_len, mul_mat_vec_q4_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q5_K], "mul_mat_vec_q5_k_f16_f32", mul_mat_vec_q5_k_f16_f32_len, mul_mat_vec_q5_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q6_K], "mul_mat_vec_q6_k_f16_f32", mul_mat_vec_q6_k_f16_f32_len, mul_mat_vec_q6_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
 
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_F32 ], "mul_mat_vec_id_f32_f32", mul_mat_vec_id_f32_f32_len, mul_mat_vec_id_f32_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_F16 ], "mul_mat_vec_id_f16_f32", mul_mat_vec_id_f16_f32_len, mul_mat_vec_id_f16_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
@@ -1455,11 +1508,11 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) {
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q5_0], "mul_mat_vec_id_q5_0_f32", mul_mat_vec_id_q5_0_f32_len, mul_mat_vec_id_q5_0_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q5_1], "mul_mat_vec_id_q5_1_f32", mul_mat_vec_id_q5_1_f32_len, mul_mat_vec_id_q5_1_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q8_0], "mul_mat_vec_id_q8_0_f32", mul_mat_vec_id_q8_0_f32_len, mul_mat_vec_id_q8_0_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q2_K], "mul_mat_vec_id_q2_K_f32", mul_mat_vec_id_q2_K_f32_len, mul_mat_vec_id_q2_K_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q3_K], "mul_mat_vec_id_q3_K_f32", mul_mat_vec_id_q3_K_f32_len, mul_mat_vec_id_q3_K_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q4_K], "mul_mat_vec_id_q4_K_f32", mul_mat_vec_id_q4_K_f32_len, mul_mat_vec_id_q4_K_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q5_K], "mul_mat_vec_id_q5_K_f32", mul_mat_vec_id_q5_K_f32_len, mul_mat_vec_id_q5_K_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q6_K], "mul_mat_vec_id_q6_K_f32", mul_mat_vec_id_q6_K_f32_len, mul_mat_vec_id_q6_K_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q2_K], "mul_mat_vec_id_q2_k_f32", mul_mat_vec_id_q2_k_f32_len, mul_mat_vec_id_q2_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q3_K], "mul_mat_vec_id_q3_k_f32", mul_mat_vec_id_q3_k_f32_len, mul_mat_vec_id_q3_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q4_K], "mul_mat_vec_id_q4_k_f32", mul_mat_vec_id_q4_k_f32_len, mul_mat_vec_id_q4_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q5_K], "mul_mat_vec_id_q5_k_f32", mul_mat_vec_id_q5_k_f32_len, mul_mat_vec_id_q5_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q6_K], "mul_mat_vec_id_q6_k_f32", mul_mat_vec_id_q6_k_f32_len, mul_mat_vec_id_q6_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
 
  // dequant shaders
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_F32 ], "f32_to_f16", dequant_f32_len, dequant_f32_data, "main", 2, 5 * sizeof(uint32_t), {256 * 16, 1, 1}, {}, 1);
@@ -1468,11 +1521,11 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) {
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_Q5_0], "dequant_q5_0", dequant_q5_0_len, dequant_q5_0_data, "main", 2, 5 * sizeof(uint32_t), {256 * 16, 1, 1}, {}, 1);
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_Q5_1], "dequant_q5_1", dequant_q5_1_len, dequant_q5_1_data, "main", 2, 5 * sizeof(uint32_t), {256 * 16, 1, 1}, {}, 1);
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_Q8_0], "dequant_q8_0", dequant_q8_0_len, dequant_q8_0_data, "main", 2, 5 * sizeof(uint32_t), {256 * 16, 1, 1}, {}, 1);
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_Q2_K], "dequant_q2_K", dequant_q2_K_len, dequant_q2_K_data, "main", 2, 5 * sizeof(uint32_t), {256 * 64, 1, 1}, {}, 1);
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_Q3_K], "dequant_q3_K", dequant_q3_K_len, dequant_q3_K_data, "main", 2, 5 * sizeof(uint32_t), {256 * 64, 1, 1}, {}, 1);
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_Q4_K], "dequant_q4_K", dequant_q4_K_len, dequant_q4_K_data, "main", 2, 5 * sizeof(uint32_t), {256 * 32, 1, 1}, {}, 1);
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_Q5_K], "dequant_q5_K", dequant_q5_K_len, dequant_q5_K_data, "main", 2, 5 * sizeof(uint32_t), {256 * 64, 1, 1}, {}, 1);
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_Q6_K], "dequant_q6_K", dequant_q6_K_len, dequant_q6_K_data, "main", 2, 5 * sizeof(uint32_t), {256 * 64, 1, 1}, {}, 1);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_Q2_K], "dequant_q2_k", dequant_q2_k_len, dequant_q2_k_data, "main", 2, 5 * sizeof(uint32_t), {256 * 64, 1, 1}, {}, 1);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_Q3_K], "dequant_q3_k", dequant_q3_k_len, dequant_q3_k_data, "main", 2, 5 * sizeof(uint32_t), {256 * 64, 1, 1}, {}, 1);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_Q4_K], "dequant_q4_k", dequant_q4_k_len, dequant_q4_k_data, "main", 2, 5 * sizeof(uint32_t), {256 * 32, 1, 1}, {}, 1);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_Q5_K], "dequant_q5_k", dequant_q5_k_len, dequant_q5_k_data, "main", 2, 5 * sizeof(uint32_t), {256 * 64, 1, 1}, {}, 1);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_Q6_K], "dequant_q6_k", dequant_q6_k_len, dequant_q6_k_data, "main", 2, 5 * sizeof(uint32_t), {256 * 64, 1, 1}, {}, 1);
 
  // get_rows
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[GGML_TYPE_F32 ], "get_rows_f32", get_rows_f32_len, get_rows_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), { 512, 1, 1}, {}, 1);
@@ -1538,9 +1591,7 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) {
  static void ggml_vk_print_gpu_info(size_t idx) {
  GGML_ASSERT(idx < vk_instance.device_indices.size());
  size_t dev_num = vk_instance.device_indices[idx];
- #ifdef GGML_VULKAN_DEBUG
- std::cerr << "ggml_vk_print_gpu_info(" << dev_num << ")" << std::endl;
- #endif
+ VK_LOG_DEBUG("ggml_vk_print_gpu_info(" << dev_num << ")");
  GGML_ASSERT(vk_instance.initialized);
 
  std::vector<vk::PhysicalDevice> devices = vk_instance.instance.enumeratePhysicalDevices();
@@ -1617,9 +1668,7 @@ void ggml_vk_instance_init() {
  if (vk_instance_initialized) {
  return;
  }
- #ifdef GGML_VULKAN_DEBUG
- std::cerr << "ggml_vk_instance_init()" << std::endl;
- #endif
+ VK_LOG_DEBUG("ggml_vk_instance_init()");
 
  vk::ApplicationInfo app_info{ "ggml-vulkan", 1, nullptr, 0, VK_API_VERSION };
 
@@ -1696,33 +1745,37 @@ void ggml_vk_instance_init() {
 
  // Default to using all dedicated GPUs
  for (size_t i = 0; i < devices.size(); i++) {
- vk::PhysicalDeviceProperties props = devices[i].getProperties();
-
- if (props.deviceType == vk::PhysicalDeviceType::eDiscreteGpu) {
+ vk::PhysicalDeviceProperties2 new_props;
+ vk::PhysicalDeviceDriverProperties new_driver;
+ vk::PhysicalDeviceIDProperties new_id;
+ new_props.pNext = &new_driver;
+ new_driver.pNext = &new_id;
+ devices[i].getProperties2(&new_props);
+
+ if (new_props.properties.deviceType == vk::PhysicalDeviceType::eDiscreteGpu) {
  // Check if there are two physical devices corresponding to the same GPU
  auto old_device = std::find_if(
  vk_instance.device_indices.begin(),
  vk_instance.device_indices.end(),
- [&devices, &props](const size_t k){ return devices[k].getProperties().deviceID == props.deviceID; }
+ [&devices, &new_id](const size_t k){
+ vk::PhysicalDeviceProperties2 old_props;
+ vk::PhysicalDeviceIDProperties old_id;
+ old_props.pNext = &old_id;
+ devices[k].getProperties2(&old_props);
+ return std::equal(std::begin(old_id.deviceUUID), std::end(old_id.deviceUUID), std::begin(new_id.deviceUUID));
+ }
  );
  if (old_device == vk_instance.device_indices.end()) {
  vk_instance.device_indices.push_back(i);
  } else {
  // There can be two physical devices corresponding to the same GPU if there are 2 different drivers
  // This can cause error when splitting layers aross the devices, need to keep only 1
- #ifdef GGML_VULKAN_DEBUG
- std::cerr << "Device " << i << " and device " << *old_device << " have the same device id" << std::endl;
- #endif
+ VK_LOG_DEBUG("Device " << i << " and device " << *old_device << " have the same deviceUUID");
 
- vk::PhysicalDeviceProperties2 old_prop;
+ vk::PhysicalDeviceProperties2 old_props;
  vk::PhysicalDeviceDriverProperties old_driver;
- old_prop.pNext = &old_driver;
- devices[*old_device].getProperties2(&old_prop);
-
- vk::PhysicalDeviceProperties2 new_prop;
- vk::PhysicalDeviceDriverProperties new_driver;
- new_prop.pNext = &new_driver;
- devices[i].getProperties2(&new_prop);
+ old_props.pNext = &old_driver;
+ devices[*old_device].getProperties2(&old_props);
 
  std::map<vk::DriverId, int> driver_priorities {};
  int old_priority = std::numeric_limits<int>::max();
@@ -1730,7 +1783,7 @@ void ggml_vk_instance_init() {
 
  // Check https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkDriverId.html for the list of driver id
  // Smaller number -> higher priority
- switch (old_prop.properties.vendorID) {
+ switch (old_props.properties.vendorID) {
  case VK_VENDOR_ID_AMD:
  driver_priorities[vk::DriverId::eMesaRadv] = 1;
  driver_priorities[vk::DriverId::eAmdOpenSource] = 2;
@@ -1760,16 +1813,11 @@ void ggml_vk_instance_init() {
  vk_instance.device_indices.erase(r, vk_instance.device_indices.end());
  vk_instance.device_indices.push_back(i);
 
- #ifdef GGML_VULKAN_DEBUG
- std::cerr << "Prioritize device " << i << " driver " << new_driver.driverName << " over device " << *old_device << " driver " << old_driver.driverName << std::endl;
- #endif
+ VK_LOG_DEBUG("Prioritize device " << i << " driver " << new_driver.driverName << " over device " << *old_device << " driver " << old_driver.driverName);
  }
- #ifdef GGML_VULKAN_DEBUG
  else {
- std::cerr << "Prioritize device " << *old_device << " driver " << old_driver.driverName << " over device " << i << " driver " << new_driver.driverName << std::endl;
-
+ VK_LOG_DEBUG("Prioritize device " << *old_device << " driver " << old_driver.driverName << " over device " << i << " driver " << new_driver.driverName << std::endl);
  }
- #endif
  }
  }
  }
@@ -1792,9 +1840,7 @@ void ggml_vk_instance_init() {
  static void ggml_vk_init(ggml_backend_vk_context * ctx, size_t idx) {
  GGML_ASSERT(idx < vk_instance.device_indices.size());
  size_t dev_num = vk_instance.device_indices[idx];
- #ifdef GGML_VULKAN_DEBUG
- std::cerr << "ggml_vk_init(" << ctx->name << ", " << dev_num << ")" << std::endl;
- #endif
+ VK_LOG_DEBUG("ggml_vk_init(" << ctx->name << ", " << dev_num << ")");
  ggml_vk_instance_init();
 
  std::vector<vk::PhysicalDevice> devices = vk_instance.instance.enumeratePhysicalDevices();
@@ -1967,9 +2013,7 @@ static void ggml_vk_init(ggml_backend_vk_context * ctx, size_t idx) {
  }
 
  static vk_pipeline ggml_vk_get_to_fp16(ggml_backend_vk_context * ctx, ggml_type type) {
- #ifdef GGML_VULKAN_DEBUG
- std::cerr << "ggml_vk_get_to_fp16()" << std::endl;
- #endif
+ VK_LOG_DEBUG("ggml_vk_get_to_fp16()");
  switch (type) {
  case GGML_TYPE_F32:
  case GGML_TYPE_Q4_0:
@@ -1991,9 +2035,7 @@ static vk_pipeline ggml_vk_get_to_fp16(ggml_backend_vk_context * ctx, ggml_type
  }
 
  static vk_matmul_pipeline ggml_vk_get_mul_mat_mat_pipeline(ggml_backend_vk_context * ctx, ggml_type src0_type, ggml_type src1_type) {
- #ifdef GGML_VULKAN_DEBUG
- std::cerr << "ggml_vk_get_mul_mat_mat_pipeline()" << std::endl;
- #endif
+ VK_LOG_DEBUG("ggml_vk_get_mul_mat_mat_pipeline()");
  if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F32) {
  return ctx->device->pipeline_matmul_f32;
  }
@@ -2029,9 +2071,7 @@ static vk_matmul_pipeline ggml_vk_get_mul_mat_mat_pipeline(ggml_backend_vk_conte
  }
 
  static vk_pipeline ggml_vk_get_dequantize_mul_mat_vec(ggml_backend_vk_context * ctx, ggml_type a_type, ggml_type b_type) {
- #ifdef GGML_VULKAN_DEBUG
- std::cerr << "ggml_vk_get_dequantize_mul_mat_vec()" << std::endl;
- #endif
+ VK_LOG_DEBUG("ggml_vk_get_dequantize_mul_mat_vec()");
  GGML_ASSERT(b_type == GGML_TYPE_F32 || b_type == GGML_TYPE_F16);
 
  switch (a_type) {
@@ -2056,9 +2096,7 @@ static vk_pipeline ggml_vk_get_dequantize_mul_mat_vec(ggml_backend_vk_context *
  }
 
  static vk_matmul_pipeline ggml_vk_get_mul_mat_mat_id_pipeline(ggml_backend_vk_context * ctx, ggml_type src0_type, ggml_type src1_type) {
- #ifdef GGML_VULKAN_DEBUG
- std::cerr << "ggml_vk_get_mul_mat_mat_id_pipeline()" << std::endl;
- #endif
+ VK_LOG_DEBUG("ggml_vk_get_mul_mat_mat_id_pipeline()");
  if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F32) {
  return ctx->device->pipeline_matmul_id_f32;
  }
@@ -2091,9 +2129,7 @@ static vk_matmul_pipeline ggml_vk_get_mul_mat_mat_id_pipeline(ggml_backend_vk_co
  }
 
  static vk_pipeline ggml_vk_get_dequantize_mul_mat_vec_id(ggml_backend_vk_context * ctx, ggml_type a_type, ggml_type b_type) {
- #ifdef GGML_VULKAN_DEBUG
- std::cerr << "ggml_vk_get_dequantize_mul_mat_vec()" << std::endl;
- #endif
+ VK_LOG_DEBUG("ggml_vk_get_dequantize_mul_mat_vec()");
  GGML_ASSERT(b_type == GGML_TYPE_F32);
 
  switch (a_type) {
@@ -2118,9 +2154,9 @@ static vk_pipeline ggml_vk_get_dequantize_mul_mat_vec_id(ggml_backend_vk_context
  }
 
  static vk_buffer ggml_vk_pool_malloc(ggml_backend_vk_context * ctx, size_t size) {
- #ifdef GGML_VULKAN_DEBUG
- std::cerr << "ggml_vk_pool_malloc(" << size << ")" << std::endl;
- #endif
+ VK_LOG_DEBUG("ggml_vk_pool_malloc(" << size << ")");
+ VK_LOG_MEMORY("ggml_vk_pool_malloc");
+
  int best_i = -1;
  size_t best_size = std::numeric_limits<size_t>::max(); //smallest unused buffer that fits our needs
  int worst_i = -1;
@@ -2148,13 +2184,11 @@ static vk_buffer ggml_vk_pool_malloc(ggml_backend_vk_context * ctx, size_t size)
  ggml_vk_destroy_buffer(b);
  }
 
- return ggml_vk_create_buffer_check(ctx, size, vk::MemoryPropertyFlagBits::eDeviceLocal);
+ return ggml_vk_create_buffer_device(ctx, size);
  }
 
  static void ggml_vk_pool_free(ggml_backend_vk_context * ctx, vk_buffer& buffer) {
- #ifdef GGML_VULKAN_DEBUG
- std::cerr << "ggml_vk_pool_free(" << buffer->size << ")" << std::endl;
- #endif
+ VK_LOG_DEBUG("ggml_vk_pool_free(" << buffer->size << ")");
  for (int i = 0; i < MAX_VK_BUFFERS; ++i) {
  vk_buffer& b = ctx->buffer_pool[i];
  if (b == nullptr) {
@@ -2175,6 +2209,8 @@ static vk_buffer ggml_vk_create_buffer_temp(ggml_backend_vk_context * ctx, size_
  }
  }
 
+ VK_LOG_MEMORY("ggml_vk_create_buffer_temp(" << size << ")");
+
  // Otherwise create new buffer
  vk_buffer buf = ggml_vk_pool_malloc(ctx, size);
  ctx->gc.temp_buffers.push_back(buf);
@@ -2183,9 +2219,7 @@ static vk_buffer ggml_vk_create_buffer_temp(ggml_backend_vk_context * ctx, size_
2183
2219
  }
2184
2220
 
2185
2221
  static void * ggml_vk_host_malloc(ggml_backend_vk_context * ctx, size_t size) {
2186
- #ifdef GGML_VULKAN_DEBUG
2187
- std::cerr << "ggml_vk_host_malloc(" << size << ")" << std::endl;
2188
- #endif
2222
+ VK_LOG_MEMORY("ggml_vk_host_malloc(" << size << ")");
2189
2223
  vk_buffer buf = ggml_vk_create_buffer(ctx, size,
2190
2224
  vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached,
2191
2225
  vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
@@ -2207,9 +2241,7 @@ static void ggml_vk_host_free(ggml_backend_vk_context * ctx, void* ptr) {
2207
2241
  if (ptr == nullptr) {
2208
2242
  return;
2209
2243
  }
2210
- #ifdef GGML_VULKAN_DEBUG
2211
- std::cerr << "ggml_vk_host_free(" << ptr << ")" << std::endl;
2212
- #endif
2244
+ VK_LOG_MEMORY("ggml_vk_host_free(" << ptr << ")");
2213
2245
  vk_buffer buf;
2214
2246
  size_t index;
2215
2247
  for (size_t i = 0; i < ctx->pinned_memory.size(); i++) {
@@ -2261,13 +2293,11 @@ static void ggml_vk_dispatch_pipeline(ggml_backend_vk_context * ctx, vk_context
2261
2293
  const uint32_t wg0 = CEIL_DIV(elements[0], pipeline->wg_denoms[0]);
2262
2294
  const uint32_t wg1 = CEIL_DIV(elements[1], pipeline->wg_denoms[1]);
2263
2295
  const uint32_t wg2 = CEIL_DIV(elements[2], pipeline->wg_denoms[2]);
2264
- #ifdef GGML_VULKAN_DEBUG
2265
- std::cerr << "ggml_vk_dispatch_pipeline(" << pipeline->name << ", {";
2296
+ VK_LOG_DEBUG("ggml_vk_dispatch_pipeline(" << pipeline->name << ", {";
2266
2297
  for (auto& buffer : buffers) {
2267
2298
  std::cerr << "(" << buffer.buffer << ", " << buffer.offset << ", " << buffer.size << "), ";
2268
2299
  }
2269
- std::cerr << "}, (" << wg0 << "," << wg1 << "," << wg2 << "))" << std::endl;
2270
- #endif
2300
+ std::cerr << "}, (" << wg0 << "," << wg1 << "," << wg2 << "))");
2271
2301
  std::vector<vk::DescriptorBufferInfo> descriptor_buffer_infos;
2272
2302
  std::vector<vk::WriteDescriptorSet> write_descriptor_sets;
2273
2303
  GGML_ASSERT(pipeline->descriptor_set_idx < pipeline->descriptor_sets.size());
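Note on the converted ggml_vk_dispatch_pipeline logging above: the for loop and the follow-up std::cerr statements sit inside a single macro argument. Every comma in that body is inside a string literal or parentheses, so the preprocessor passes the whole block through as one argument, and when the macro compiles down to ((void) 0) the entire block disappears. A small self-contained illustration of the pattern, using a hypothetical LOG_DBG macro rather than the library's own:

    #include <iostream>
    #include <vector>

    // Hypothetical stand-in for a stream-style debug macro (illustrative only).
    #ifdef MY_DEBUG
    #define LOG_DBG(msg) std::cerr << msg << std::endl
    #else
    #define LOG_DBG(msg) ((void) 0)   // the whole argument, loop included, is discarded
    #endif

    int main() {
        std::vector<int> sizes = {16, 32, 64};
        // Everything between the parentheses is one macro argument: the commas
        // all sit inside string literals, so the preprocessor does not split it.
        LOG_DBG("dispatch {";
            for (int s : sizes) {
                std::cerr << "(" << s << "), ";
            }
            std::cerr << "}");
        return 0;
    }

With MY_DEBUG defined this expands to three ordinary statements ending in << std::endl; without it, the compiler never evaluates any of the streamed expressions.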
@@ -2300,9 +2330,7 @@ static void ggml_vk_end_submission(vk_submission& s, std::vector<vk_semaphore> w
2300
2330
  }
2301
2331
 
2302
2332
  static void ggml_vk_ctx_end(vk_context * ctx) {
2303
- #ifdef GGML_VULKAN_DEBUG
2304
- std::cerr << "ggml_vk_ctx_end(" << ctx << ", " << ctx->seqs.size() << ")" << std::endl;
2305
- #endif
2333
+ VK_LOG_DEBUG("ggml_vk_ctx_end(" << ctx << ", " << ctx->seqs.size() << ")");
2306
2334
  if (ctx->s == nullptr) {
2307
2335
  return;
2308
2336
  }
@@ -2312,9 +2340,7 @@ static void ggml_vk_ctx_end(vk_context * ctx) {
2312
2340
  }
2313
2341
 
2314
2342
  static void ggml_vk_ctx_begin(ggml_backend_vk_context * ctx, vk_context * subctx) {
2315
- #ifdef GGML_VULKAN_DEBUG
2316
- std::cerr << "ggml_vk_ctx_begin(" << ctx << ")" << std::endl;
2317
- #endif
2343
+ VK_LOG_DEBUG("ggml_vk_ctx_begin(" << ctx << ")");
2318
2344
  if (subctx->s != nullptr) {
2319
2345
  ggml_vk_ctx_end(subctx);
2320
2346
  }
@@ -2324,9 +2350,7 @@ static void ggml_vk_ctx_begin(ggml_backend_vk_context * ctx, vk_context * subctx
2324
2350
  }
2325
2351
 
2326
2352
  static size_t ggml_vk_align_size(size_t width, size_t align) {
2327
- #ifdef GGML_VULKAN_DEBUG
2328
- std::cerr << "ggml_vk_align_size(" << width << ", " << align << ")" << std::endl;
2329
- #endif
2353
+ VK_LOG_DEBUG("ggml_vk_align_size(" << width << ", " << align << ")");
2330
2354
  return CEIL_DIV(width, align) * align;
2331
2355
  }
2332
2356
 
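Note on ggml_vk_align_size above: it reduces to rounding a byte count up to the next multiple of the alignment, CEIL_DIV(width, align) * align. A tiny standalone check of that arithmetic (align_up is an illustrative name, not part of the backend):

    #include <cassert>
    #include <cstddef>

    // Round `width` up to the next multiple of `align`
    // (same arithmetic as CEIL_DIV(width, align) * align).
    static size_t align_up(size_t width, size_t align) {
        return ((width + align - 1) / align) * align;
    }

    int main() {
        assert(align_up(100, 32) == 128);
        assert(align_up(128, 32) == 128);
        assert(align_up(1, 256)  == 256);
        return 0;
    }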
@@ -2340,6 +2364,7 @@ static void deferred_memcpy(void * dst, const void * src, size_t size, std::vect
2340
2364
 
2341
2365
  static void ggml_vk_ensure_sync_staging_buffer(ggml_backend_vk_context * ctx, size_t size) {
2342
2366
  if (ctx->sync_staging == nullptr || ctx->sync_staging->size < size) {
2367
+ VK_LOG_MEMORY("ggml_vk_ensure_sync_staging_buffer(" << size << ")");
2343
2368
  ggml_vk_destroy_buffer(ctx->sync_staging);
2344
2369
  ctx->sync_staging = ggml_vk_create_buffer_check(ctx, size,
2345
2370
  vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached,
@@ -2348,9 +2373,7 @@ static void ggml_vk_ensure_sync_staging_buffer(ggml_backend_vk_context * ctx, si
2348
2373
  }
2349
2374
 
2350
2375
  static void ggml_vk_buffer_write_nc_async(ggml_backend_vk_context * ctx, vk_context * subctx, vk_buffer& dst, size_t offset, const ggml_tensor * tensor, bool sync_staging = false) {
2351
- #ifdef GGML_VULKAN_DEBUG
2352
- std::cerr << "ggml_vk_buffer_write_nc_async(" << tensor << ")" << std::endl;
2353
- #endif
2376
+ VK_LOG_DEBUG("ggml_vk_buffer_write_nc_async(" << tensor << ")");
2354
2377
  GGML_ASSERT(!ggml_is_contiguous(tensor));
2355
2378
  // Buffer is already mapped
2356
2379
  if(dst->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible) {
@@ -2455,9 +2478,7 @@ static void ggml_vk_buffer_write_nc_async(ggml_backend_vk_context * ctx, vk_cont
2455
2478
  }
2456
2479
 
2457
2480
  static void ggml_vk_buffer_write_2d_async(ggml_backend_vk_context * ctx, vk_context * subctx, vk_buffer& dst, size_t offset, const void * src, size_t spitch, size_t width, size_t height, bool sync_staging = false) {
2458
- #ifdef GGML_VULKAN_DEBUG
2459
- std::cerr << "ggml_vk_buffer_write_2d_async(" << width << ", " << height << ")" << std::endl;
2460
- #endif
2481
+ VK_LOG_DEBUG("ggml_vk_buffer_write_2d_async(" << width << ", " << height << ")");
2461
2482
  // Make sure ctx owns the buffer
2462
2483
  GGML_ASSERT(dst->ctx == ctx);
2463
2484
 
@@ -2492,9 +2513,7 @@ static void ggml_vk_buffer_write_2d_async(ggml_backend_vk_context * ctx, vk_cont
2492
2513
  subctx->s->buffer.copyBuffer(buf->buffer, dst->buffer, slices);
2493
2514
  return;
2494
2515
  }
2495
- #ifdef GGML_VULKAN_DEBUG
2496
- std::cerr << "STAGING" << std::endl;
2497
- #endif
2516
+ VK_LOG_DEBUG("STAGING");
2498
2517
 
2499
2518
  // Staging buffer required
2500
2519
  vk_buffer staging = ctx->staging;
@@ -2529,16 +2548,12 @@ static void ggml_vk_buffer_write_2d_async(ggml_backend_vk_context * ctx, vk_cont
2529
2548
  }
2530
2549
 
2531
2550
  static void ggml_vk_buffer_write_async(ggml_backend_vk_context * ctx, vk_context * subctx, vk_buffer& dst, size_t offset, const void * src, size_t size, bool sync_staging = false) {
2532
- #ifdef GGML_VULKAN_DEBUG
2533
- std::cerr << "ggml_vk_buffer_write_async(" << size << ")" << std::endl;
2534
- #endif
2551
+ VK_LOG_DEBUG("ggml_vk_buffer_write_async(" << size << ")");
2535
2552
  return ggml_vk_buffer_write_2d_async(ctx, subctx, dst, offset, src, size, size, 1, sync_staging);
2536
2553
  }
2537
2554
 
2538
2555
  static void ggml_vk_buffer_write_2d(ggml_backend_vk_context * ctx, vk_buffer& dst, size_t offset, const void * src, size_t spitch, size_t width, size_t height) {
2539
- #ifdef GGML_VULKAN_DEBUG
2540
- std::cerr << "ggml_vk_buffer_write_2d(" << width << ", " << height << ")" << std::endl;
2541
- #endif
2556
+ VK_LOG_DEBUG("ggml_vk_buffer_write_2d(" << width << ", " << height << ")");
2542
2557
  // Buffer is already mapped
2543
2558
  if(dst->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible) {
2544
2559
  GGML_ASSERT(dst->memory_property_flags & vk::MemoryPropertyFlagBits::eHostCoherent);
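Note on the write paths in these hunks: they all branch the same way. If the destination memory is host-visible and coherent, the data is copied straight through the persistent mapping; otherwise it goes through a staging buffer and a recorded GPU copy (the "STAGING" debug path). A rough sketch of that decision with placeholder types, not the real Vulkan objects:

    #include <cstddef>
    #include <cstdint>
    #include <cstring>

    // Placeholder buffer descriptor (illustrative only; the real code uses vk_buffer).
    struct DeviceBuffer {
        bool     host_visible_coherent; // mappable and coherent -> direct memcpy is safe
        uint8_t *mapped;                // persistent mapping when host-visible
    };

    // Hypothetical stub standing in for the staging-buffer + copyBuffer path.
    void staged_copy(DeviceBuffer &dst, size_t offset, const void *src, size_t size) {
        (void) dst; (void) offset; (void) src; (void) size;
    }

    void buffer_write(DeviceBuffer &dst, size_t offset, const void *src, size_t size) {
        if (dst.host_visible_coherent) {
            // Fast path: the buffer is already mapped, write straight into it.
            std::memcpy(dst.mapped + offset, src, size);
        } else {
            // Slow path: fill a host-visible staging buffer, then record a GPU copy.
            staged_copy(dst, offset, src, size);
        }
    }

    int main() {
        uint8_t mapped[256] = {};
        DeviceBuffer dst{true, mapped};
        const char msg[] = "hello";
        buffer_write(dst, 0, msg, sizeof(msg)); // takes the direct-memcpy path
        return mapped[0] == 'h' ? 0 : 1;
    }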
@@ -2563,16 +2578,12 @@ static void ggml_vk_buffer_write_2d(ggml_backend_vk_context * ctx, vk_buffer& ds
2563
2578
  }
2564
2579
 
2565
2580
  static void ggml_vk_buffer_write(ggml_backend_vk_context * ctx, vk_buffer& dst, size_t offset, const void * src, size_t size) {
2566
- #ifdef GGML_VULKAN_DEBUG
2567
- std::cerr << "ggml_vk_buffer_write(" << size << ")" << std::endl;
2568
- #endif
2581
+ VK_LOG_DEBUG("ggml_vk_buffer_write(" << size << ")");
2569
2582
  ggml_vk_buffer_write_2d(ctx, dst, offset, src, 0, size, 1);
2570
2583
  }
2571
2584
 
2572
2585
  static void ggml_vk_buffer_read_2d_async(ggml_backend_vk_context * ctx, vk_context * subctx, vk_buffer& src, size_t offset, void * dst, size_t spitch, size_t dpitch, size_t width, size_t height, bool sync_staging = false) {
2573
- #ifdef GGML_VULKAN_DEBUG
2574
- std::cerr << "ggml_vk_buffer_read_2d_async(offset=" << offset << ", width=" << width << ", height=" << height << ")" << std::endl;
2575
- #endif
2586
+ VK_LOG_DEBUG("ggml_vk_buffer_read_2d_async(offset=" << offset << ", width=" << width << ", height=" << height << ")");
2576
2587
  GGML_ASSERT(width > 0);
2577
2588
  GGML_ASSERT(height > 0);
2578
2589
  GGML_ASSERT(src != nullptr);
@@ -2606,9 +2617,7 @@ static void ggml_vk_buffer_read_2d_async(ggml_backend_vk_context * ctx, vk_conte
2606
2617
 
2607
2618
  return;
2608
2619
  }
2609
- #ifdef GGML_VULKAN_DEBUG
2610
- std::cerr << "STAGING" << std::endl;
2611
- #endif
2620
+ VK_LOG_DEBUG("STAGING");
2612
2621
 
2613
2622
  // Fall back to staging buffer
2614
2623
  vk_buffer staging = ctx->staging;
@@ -2635,9 +2644,7 @@ static void ggml_vk_buffer_read_async(ggml_backend_vk_context * ctx, vk_context
2635
2644
  }
2636
2645
 
2637
2646
  static void ggml_vk_buffer_read(ggml_backend_vk_context * ctx, vk_buffer& src, size_t offset, void * dst, size_t size) {
2638
- #ifdef GGML_VULKAN_DEBUG
2639
- std::cerr << "ggml_vk_buffer_read(" << offset << ", " << size << ")" << std::endl;
2640
- #endif
2647
+ VK_LOG_DEBUG("ggml_vk_buffer_read(" << offset << ", " << size << ")");
2641
2648
  if(src->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible) {
2642
2649
  GGML_ASSERT(src->memory_property_flags & vk::MemoryPropertyFlagBits::eHostCoherent);
2643
2650
 
@@ -2659,9 +2666,7 @@ static void ggml_vk_buffer_read(ggml_backend_vk_context * ctx, vk_buffer& src, s
2659
2666
  }
2660
2667
 
2661
2668
  static void ggml_vk_buffer_copy_async(vk_context * ctx, vk_buffer& dst, size_t dst_offset, vk_buffer& src, size_t src_offset, size_t size) {
2662
- #ifdef GGML_VULKAN_DEBUG
2663
- std::cerr << "ggml_vk_buffer_copy_async(" << size << ")" << std::endl;
2664
- #endif
2669
+ VK_LOG_DEBUG("ggml_vk_buffer_copy_async(" << size << ")");
2665
2670
  // Make sure both buffers are on same ctx
2666
2671
  GGML_ASSERT(src->ctx == dst->ctx);
2667
2672
 
@@ -2672,9 +2677,7 @@ static void ggml_vk_buffer_copy_async(vk_context * ctx, vk_buffer& dst, size_t d
2672
2677
 
2673
2678
  static void ggml_vk_buffer_copy(vk_buffer& dst, size_t dst_offset, vk_buffer& src, size_t src_offset, size_t size) {
2674
2679
  if (src->ctx == dst->ctx) {
2675
- #ifdef GGML_VULKAN_DEBUG
2676
- std::cerr << "ggml_vk_buffer_copy(SINGLE_DEVICE, " << size << ")" << std::endl;
2677
- #endif
2680
+ VK_LOG_DEBUG("ggml_vk_buffer_copy(SINGLE_DEVICE, " << size << ")");
2678
2681
  // Copy within the device
2679
2682
  ggml_backend_vk_context * ctx = src->ctx;
2680
2683
 
@@ -2686,9 +2689,7 @@ static void ggml_vk_buffer_copy(vk_buffer& dst, size_t dst_offset, vk_buffer& sr
2686
2689
  VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "vk_buffer_copy waitForFences");
2687
2690
  ctx->device->device.resetFences({ ctx->fence });
2688
2691
  } else {
2689
- #ifdef GGML_VULKAN_DEBUG
2690
- std::cerr << "ggml_vk_buffer_copy(MULTI_DEVICE, " << size << ")" << std::endl;
2691
- #endif
2692
+ VK_LOG_DEBUG("ggml_vk_buffer_copy(MULTI_DEVICE, " << size << ")");
2692
2693
  // Copy device to device
2693
2694
  ggml_backend_vk_context * src_ctx = src->ctx;
2694
2695
  ggml_backend_vk_context * dst_ctx = dst->ctx;
@@ -2706,9 +2707,7 @@ static void ggml_vk_buffer_copy(vk_buffer& dst, size_t dst_offset, vk_buffer& sr
2706
2707
  }
2707
2708
 
2708
2709
  static void ggml_vk_buffer_memset(ggml_backend_vk_context * ctx, vk_buffer& dst, size_t offset, uint32_t c, size_t size) {
2709
- #ifdef GGML_VULKAN_DEBUG
2710
- std::cerr << "ggml_vk_buffer_memset(" << offset << ", " << c << ", " << size << ")" << std::endl;
2711
- #endif
2710
+ VK_LOG_DEBUG("ggml_vk_buffer_memset(" << offset << ", " << c << ", " << size << ")");
2712
2711
  // Make sure ctx owns the buffer
2713
2712
  GGML_ASSERT(dst->ctx == ctx);
2714
2713
 
@@ -2723,9 +2722,7 @@ static void ggml_vk_buffer_memset(ggml_backend_vk_context * ctx, vk_buffer& dst,
2723
2722
  }
2724
2723
 
2725
2724
  static void ggml_vk_h2d_tensor_2d(ggml_backend_vk_context * ctx, vk_context * subctx, vk_buffer& dst, size_t offset, const ggml_tensor * src, uint64_t i3, uint64_t i2, uint64_t i1) {
2726
- #ifdef GGML_VULKAN_DEBUG
2727
- std::cerr << "ggml_vk_h2d_tensor_2d(dst=" << dst << ", offset=" << offset << ", src=" << src << ", i3=" << i3 << ", i2=" << i2 << ", i1=" << i1 << ")" << std::endl;
2728
- #endif
2725
+ VK_LOG_DEBUG("ggml_vk_h2d_tensor_2d(dst=" << dst << ", offset=" << offset << ", src=" << src << ", i3=" << i3 << ", i2=" << i2 << ", i1=" << i1 << ")");
2729
2726
  const uint64_t ne0 = src->ne[0];
2730
2727
  const uint64_t ne1 = src->ne[1];
2731
2728
  const uint64_t nb0 = src->nb[0];
@@ -2753,9 +2750,7 @@ static void ggml_vk_h2d_tensor_2d(ggml_backend_vk_context * ctx, vk_context * su
2753
2750
  }
2754
2751
 
2755
2752
  static void ggml_vk_d2h_tensor_2d(ggml_backend_vk_context * ctx, vk_context * subctx, vk_buffer& src, size_t offset, const ggml_tensor * dst) {
2756
- #ifdef GGML_VULKAN_DEBUG
2757
- std::cerr << "ggml_vk_d2h_tensor_2d()" << std::endl;
2758
- #endif
2753
+ VK_LOG_DEBUG("ggml_vk_d2h_tensor_2d()");
2759
2754
  const uint64_t ne0 = dst->ne[0];
2760
2755
  const uint64_t ne1 = dst->ne[1];
2761
2756
  const uint64_t ne2 = dst->ne[2];
@@ -2779,9 +2774,7 @@ static void ggml_vk_d2h_tensor_2d(ggml_backend_vk_context * ctx, vk_context * su
2779
2774
  }
2780
2775
 
2781
2776
  static uint32_t ggml_vk_guess_split_k(int m, int n, int k) {
2782
- #ifdef GGML_VULKAN_DEBUG
2783
- std::cerr << "ggml_vk_guess_split_k(" << m << ", " << n << ", " << k << ")" << std::endl;
2784
- #endif
2777
+ VK_LOG_DEBUG("ggml_vk_guess_split_k(" << m << ", " << n << ", " << k << ")");
2785
2778
  // if (k > 128 && (m < 128 || n < 128) && m > 2 && n > 2) {
2786
2779
  // return 4;
2787
2780
  // }
@@ -2813,9 +2806,7 @@ static vk_pipeline ggml_vk_guess_matmul_pipeline_intel(ggml_backend_vk_context *
2813
2806
  }
2814
2807
 
2815
2808
  static vk_pipeline ggml_vk_guess_matmul_pipeline(ggml_backend_vk_context * ctx, vk_matmul_pipeline& mmp, int m, int n, bool aligned) {
2816
- #ifdef GGML_VULKAN_DEBUG
2817
- std::cerr << "ggml_vk_guess_matmul_pipeline(" << m << ", " << n << ", " << aligned << ")" << std::endl;
2818
- #endif
2809
+ VK_LOG_DEBUG("ggml_vk_guess_matmul_pipeline(" << m << ", " << n << ", " << aligned << ")");
2819
2810
  switch (ctx->device->vendor_id) {
2820
2811
  case VK_VENDOR_ID_AMD:
2821
2812
  return ggml_vk_guess_matmul_pipeline_amd(ctx, mmp, m, n, aligned);
@@ -2837,9 +2828,7 @@ static vk_pipeline ggml_vk_guess_matmul_pipeline(ggml_backend_vk_context * ctx,
2837
2828
  }
2838
2829
 
2839
2830
  static uint32_t ggml_vk_guess_matmul_pipeline_align(ggml_backend_vk_context * ctx, vk_matmul_pipeline& mmp, int m, int n) {
2840
- #ifdef GGML_VULKAN_DEBUG
2841
- std::cerr << "ggml_vk_guess_matmul_pipeline_align(" << m << ", " << n << ")" << std::endl;
2842
- #endif
2831
+ VK_LOG_DEBUG("ggml_vk_guess_matmul_pipeline_align(" << m << ", " << n << ")");
2843
2832
  return ggml_vk_guess_matmul_pipeline(ctx, mmp, m, n, true)->align;
2844
2833
  }
2845
2834
 
@@ -2849,9 +2838,7 @@ static void ggml_vk_matmul(
2849
2838
  uint32_t m, uint32_t n, uint32_t k, uint32_t stride_a, uint32_t stride_b, uint32_t stride_d,
2850
2839
  uint32_t batch_stride_a, uint32_t batch_stride_b, uint32_t batch_stride_d,
2851
2840
  uint32_t split_k, uint32_t batch, uint32_t ne02, uint32_t ne12, uint32_t broadcast2, uint32_t broadcast3) {
2852
- #ifdef GGML_VULKAN_DEBUG
2853
- std::cerr << "ggml_vk_matmul(a: (" << a.buffer->buffer << ", " << a.offset << ", " << a.size << "), b: (" << b.buffer->buffer << ", " << b.offset << ", " << b.size << "), d: (" << d.buffer->buffer << ", " << d.offset << ", " << d.size << "), split_k: (" << (split_k_buffer.buffer != nullptr ? split_k_buffer.buffer->buffer : VK_NULL_HANDLE) << ", " << split_k_buffer.offset << ", " << split_k_buffer.size << "), m: " << m << ", n: " << n << ", k: " << k << ", stride_a: " << stride_a << ", stride_b: " << stride_b << ", stride_d: " << stride_d << ", batch_stride_a: " << batch_stride_a << ", batch_stride_b: " << batch_stride_b << ", batch_stride_d: " << batch_stride_d << ", split_k: " << split_k << ", batch: " << batch << ", ne02: " << ne02 << ", ne12: " << ne12 << ", broadcast2: " << broadcast2 << ", broadcast3: " << broadcast3 << ")" << std::endl;
2854
- #endif
2841
+ VK_LOG_DEBUG("ggml_vk_matmul(a: (" << a.buffer->buffer << ", " << a.offset << ", " << a.size << "), b: (" << b.buffer->buffer << ", " << b.offset << ", " << b.size << "), d: (" << d.buffer->buffer << ", " << d.offset << ", " << d.size << "), split_k: (" << (split_k_buffer.buffer != nullptr ? split_k_buffer.buffer->buffer : VK_NULL_HANDLE) << ", " << split_k_buffer.offset << ", " << split_k_buffer.size << "), m: " << m << ", n: " << n << ", k: " << k << ", stride_a: " << stride_a << ", stride_b: " << stride_b << ", stride_d: " << stride_d << ", batch_stride_a: " << batch_stride_a << ", batch_stride_b: " << batch_stride_b << ", batch_stride_d: " << batch_stride_d << ", split_k: " << split_k << ", batch: " << batch << ", ne02: " << ne02 << ", ne12: " << ne12 << ", broadcast2: " << broadcast2 << ", broadcast3: " << broadcast3 << ")");
2855
2842
  ggml_vk_sync_buffers(subctx);
2856
2843
  if (split_k == 1) {
2857
2844
  const vk_mat_mat_push_constants pc = { m, n, k, stride_a, stride_b, stride_d, batch_stride_a, batch_stride_b, batch_stride_d, k, ne02, ne12, broadcast2, broadcast3 };
@@ -2875,12 +2862,10 @@ static void ggml_vk_matmul_id(
2875
2862
  uint32_t m, uint32_t n, uint32_t k, uint32_t stride_a, uint32_t stride_b, uint32_t stride_d,
2876
2863
  uint32_t batch_stride_a, uint32_t batch_stride_b, uint32_t batch_stride_d,
2877
2864
  uint32_t n_as, uint32_t nei0, uint32_t nei1, uint32_t nbi1, uint32_t ne11) {
2878
- #ifdef GGML_VULKAN_DEBUG
2879
- std::cerr << "ggml_vk_matmul_id(a: (" << a.buffer->buffer << ", " << a.offset << ", " << a.size << "), b: (" << b.buffer->buffer << ", " << b.offset << ", " << b.size << "), d: (" << d.buffer->buffer << ", " << d.offset << ", " << d.size << "), ids: (" << ids.buffer->buffer << ", " << ids.offset << ", " << ids.size << "), " <<
2865
+ VK_LOG_DEBUG("ggml_vk_matmul_id(a: (" << a.buffer->buffer << ", " << a.offset << ", " << a.size << "), b: (" << b.buffer->buffer << ", " << b.offset << ", " << b.size << "), d: (" << d.buffer->buffer << ", " << d.offset << ", " << d.size << "), ids: (" << ids.buffer->buffer << ", " << ids.offset << ", " << ids.size << "), " <<
2880
2866
  "m: " << m << ", n: " << n << ", k: " << k << ", stride_a: " << stride_a << ", stride_b: " << stride_b << ", stride_d: " << stride_d << ", " <<
2881
2867
  "batch_stride_a: " << batch_stride_a << ", batch_stride_b: " << batch_stride_b << ", batch_stride_d: " << batch_stride_d << ", " <<
2882
- "n_as: " << n_as << ", nei0: " << nei0 << ", nei1: " << nei1 << ", nbi1: " << nbi1 << ", ne11: " << ne11 << ")" << std::endl;
2883
- #endif
2868
+ "n_as: " << n_as << ", nei0: " << nei0 << ", nei1: " << nei1 << ", nbi1: " << nbi1 << ", ne11: " << ne11 << ")");
2884
2869
  ggml_vk_sync_buffers(subctx);
2885
2870
  const vk_mat_mat_id_push_constants pc = { m, n, k, stride_a, stride_b, stride_d, batch_stride_a, batch_stride_b, batch_stride_d,
2886
2871
  nei0, nei1, nbi1, ne11 };
@@ -2910,10 +2895,8 @@ static vk_pipeline ggml_vk_get_cpy_pipeline(ggml_backend_vk_context * ctx, ggml_
2910
2895
  }
2911
2896
 
2912
2897
  static void ggml_vk_cpy_to_contiguous(ggml_backend_vk_context * ctx, vk_context * subctx, vk_pipeline pipeline, const ggml_tensor * tensor, vk_subbuffer&& in, vk_subbuffer&& out) {
2913
- #ifdef GGML_VULKAN_DEBUG
2914
- std::cerr << "ggml_vk_cpy_to_contiguous((" << tensor << ", type=" << tensor->type << ", ne0=" << tensor->ne[0] << ", ne1=" << tensor->ne[1] << ", ne2=" << tensor->ne[2] << ", ne3=" << tensor->ne[3] << ", nb0=" << tensor->nb[0] << ", nb1=" << tensor->nb[1] << ", nb2=" << tensor->nb[2] << ", nb3=" << tensor->nb[3] << "), ";
2915
- std::cerr << "buffer in size=" << in.buffer->size << ", buffer out size=" << out.buffer->size << ")" << std::endl;
2916
- #endif
2898
+ VK_LOG_DEBUG("ggml_vk_cpy_to_contiguous((" << tensor << ", type=" << tensor->type << ", ne0=" << tensor->ne[0] << ", ne1=" << tensor->ne[1] << ", ne2=" << tensor->ne[2] << ", ne3=" << tensor->ne[3] << ", nb0=" << tensor->nb[0] << ", nb1=" << tensor->nb[1] << ", nb2=" << tensor->nb[2] << ", nb3=" << tensor->nb[3] << "), ";
2899
+ std::cerr << "buffer in size=" << in.buffer->size << ", buffer out size=" << out.buffer->size << ")");
2917
2900
  const int tensor_type_size = ggml_type_size(tensor->type);
2918
2901
 
2919
2902
  const uint32_t ne = ggml_nelements(tensor);
@@ -2930,11 +2913,9 @@ static void ggml_vk_cpy_to_contiguous(ggml_backend_vk_context * ctx, vk_context
2930
2913
  }
2931
2914
 
2932
2915
  static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
2933
- #ifdef GGML_VULKAN_DEBUG
2934
- std::cerr << "ggml_vk_mul_mat_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
2916
+ VK_LOG_DEBUG("ggml_vk_mul_mat_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
2935
2917
  std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
2936
- std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)" << std::endl;
2937
- #endif
2918
+ std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)");
2938
2919
  GGML_ASSERT(ggml_vk_dim01_contiguous(src0) || src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16); // NOLINT
2939
2920
  GGML_ASSERT(ggml_vk_dim01_contiguous(src1) || src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16); // NOLINT
2940
2921
 
@@ -3105,11 +3086,9 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context * su
3105
3086
  }
3106
3087
 
3107
3088
  static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
3108
- #ifdef GGML_VULKAN_DEBUG
3109
- std::cerr << "ggml_vk_mul_mat_vec_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
3089
+ VK_LOG_DEBUG("ggml_vk_mul_mat_vec_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
3110
3090
  std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
3111
- std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)" << std::endl;
3112
- #endif
3091
+ std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)");
3113
3092
  GGML_ASSERT(ggml_vk_dim01_contiguous(src0) || src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16); // NOLINT
3114
3093
  GGML_ASSERT(ggml_vk_dim01_contiguous(src1) || src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16); // NOLINT
3115
3094
 
@@ -3260,11 +3239,9 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context
3260
3239
  }
3261
3240
 
3262
3241
  static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
3263
- #ifdef GGML_VULKAN_DEBUG
3264
- std::cerr << "ggml_vk_mul_mat_p021_f16_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
3242
+ VK_LOG_DEBUG("ggml_vk_mul_mat_p021_f16_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
3265
3243
  std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
3266
- std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)" << std::endl;
3267
- #endif
3244
+ std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)");
3268
3245
  GGML_ASSERT(ggml_is_permuted(src0) && ggml_is_permuted(src1));
3269
3246
  GGML_ASSERT(src0->nb[0] <= src0->nb[1] && src0->nb[2] <= src0->nb[3]); // NOLINT
3270
3247
  GGML_ASSERT(src1->nb[0] <= src1->nb[1] && src1->nb[2] <= src1->nb[3]); // NOLINT
@@ -3333,11 +3310,9 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_c
3333
3310
  }
3334
3311
 
3335
3312
  static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
3336
- #ifdef GGML_VULKAN_DEBUG
3337
- std::cerr << "ggml_vk_mul_mat_nc_f16_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
3313
+ VK_LOG_DEBUG("ggml_vk_mul_mat_nc_f16_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
3338
3314
  std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
3339
- std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)" << std::endl;
3340
- #endif
3315
+ std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)");
3341
3316
  GGML_ASSERT(!ggml_is_transposed(src0));
3342
3317
  GGML_ASSERT(!ggml_is_transposed(src1));
3343
3318
  GGML_ASSERT(!ggml_is_permuted(src0));
@@ -3410,9 +3385,7 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con
3410
3385
  }
3411
3386
 
3412
3387
  static void ggml_vk_mul_mat(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
3413
- #ifdef GGML_VULKAN_DEBUG
3414
- std::cerr << "ggml_vk_mul_mat(" << src0 << ", " << src1 << ", " << dst << ")" << std::endl;
3415
- #endif
3388
+ VK_LOG_DEBUG("ggml_vk_mul_mat(" << src0 << ", " << src1 << ", " << dst << ")");
3416
3389
  if (src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && dst->ne[1] == 1) {
3417
3390
  ggml_vk_mul_mat_vec_p021_f16_f32(ctx, subctx, src0, src1, dst);
3418
3391
  } else if (src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && dst->ne[1] == 1) {
@@ -3425,12 +3398,10 @@ static void ggml_vk_mul_mat(ggml_backend_vk_context * ctx, vk_context * subctx,
3425
3398
  }
3426
3399
 
3427
3400
  static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst) {
3428
- #ifdef GGML_VULKAN_DEBUG
3429
- std::cerr << "ggml_vk_mul_mat_id_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
3401
+ VK_LOG_DEBUG("ggml_vk_mul_mat_id_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
3430
3402
  std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
3431
3403
  std::cerr << "), (" << ids << ", name=" << ids->name << ", type=" << ids->type << ", ne0=" << ids->ne[0] << ", ne1=" << ids->ne[1] << ", ne2=" << ids->ne[2] << ", ne3=" << ids->ne[3] << ", nb0=" << ids->nb[0] << ", nb1=" << ids->nb[1] << ", nb2=" << ids->nb[2] << ", nb3=" << ids->nb[3];
3432
- std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)" << std::endl;
3433
- #endif
3404
+ std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)");
3434
3405
  GGML_ASSERT(ggml_vk_dim01_contiguous(src1) || src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16); // NOLINT
3435
3406
  GGML_ASSERT(ids->type == GGML_TYPE_I32);
3436
3407
 
@@ -3616,12 +3587,10 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context *
3616
3587
  }
3617
3588
 
3618
3589
  static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst) {
3619
- #ifdef GGML_VULKAN_DEBUG
3620
- std::cerr << "ggml_vk_mul_mat_vec_id_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
3590
+ VK_LOG_DEBUG("ggml_vk_mul_mat_vec_id_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
3621
3591
  std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
3622
3592
  std::cerr << "), (" << ids << ", name=" << ids->name << ", type=" << ids->type << ", ne0=" << ids->ne[0] << ", ne1=" << ids->ne[1] << ", ne2=" << ids->ne[2] << ", ne3=" << ids->ne[3] << ", nb0=" << ids->nb[0] << ", nb1=" << ids->nb[1] << ", nb2=" << ids->nb[2] << ", nb3=" << ids->nb[3];
3623
- std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)" << std::endl;
3624
- #endif
3593
+ std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)");
3625
3594
  GGML_ASSERT(ggml_vk_dim01_contiguous(src0) || src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16); // NOLINT
3626
3595
  GGML_ASSERT(ggml_vk_dim01_contiguous(src1) || src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16); // NOLINT
3627
3596
  GGML_ASSERT(ids->type == GGML_TYPE_I32);
@@ -3784,9 +3753,7 @@ static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_conte
3784
3753
  }
3785
3754
 
3786
3755
  static void ggml_vk_mul_mat_id(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst) {
3787
- #ifdef GGML_VULKAN_DEBUG
3788
- std::cerr << "ggml_vk_mul_mat_id(" << src0 << ", " << src1 << ", " << src2 << ", " << dst << ")" << std::endl;
3789
- #endif
3756
+ VK_LOG_DEBUG("ggml_vk_mul_mat_id(" << src0 << ", " << src1 << ", " << src2 << ", " << dst << ")");
3790
3757
  if (src2->ne[1] == 1 && (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type))) {
3791
3758
  ggml_vk_mul_mat_vec_id_q_f16(ctx, subctx, src0, src1, src2, dst);
3792
3759
  } else {
@@ -4020,16 +3987,14 @@ static bool ggml_vk_op_supports_incontiguous(ggml_op op) {
4020
3987
 
4021
3988
  template<typename PC>
4022
3989
  static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst, ggml_op op, const PC&& pc) {
4023
- #ifdef GGML_VULKAN_DEBUG
4024
- std::cerr << "ggml_vk_op_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
3990
+ VK_LOG_DEBUG("ggml_vk_op_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
4025
3991
  if (src1 != nullptr) {
4026
3992
  std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
4027
3993
  }
4028
3994
  if (src2 != nullptr) {
4029
3995
  std::cerr << "), (" << src2 << ", name=" << src2->name << ", type=" << src2->type << ", ne0=" << src2->ne[0] << ", ne1=" << src2->ne[1] << ", ne2=" << src2->ne[2] << ", ne3=" << src2->ne[3] << ", nb0=" << src2->nb[0] << ", nb1=" << src2->nb[1] << ", nb2=" << src2->nb[2] << ", nb3=" << src2->nb[3];
4030
3996
  }
4031
- std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "), " << ggml_op_name(op) << ")" << std::endl;
4032
- #endif
3997
+ std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "), " << ggml_op_name(op) << ")");
4033
3998
  GGML_ASSERT(op == GGML_OP_GET_ROWS || (!ggml_is_quantized(src0->type) && (src1 == nullptr || !ggml_is_quantized(src1->type)))); // NOLINT
4034
3999
  GGML_ASSERT(ggml_vk_op_supports_incontiguous(op) || ggml_vk_dim01_contiguous(src0)); // NOLINT
4035
4000
  GGML_ASSERT(dst->extra != nullptr);
@@ -4527,9 +4492,7 @@ static void ggml_vk_print_matrix_area(const void * data, ggml_type type, int ne0
4527
4492
 
4528
4493
  template <typename X_TYPE, typename Y_TYPE>
4529
4494
  static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t n, size_t k, size_t batch, size_t num_it, int split_k, int shader_size) {
4530
- #ifdef GGML_VULKAN_DEBUG
4531
- std::cerr << "ggml_vk_test_matmul(" << m << ", " << n << ", " << k << ", " << batch << ", " << num_it << ", " << split_k << ", " << shader_size << ")" << std::endl;
4532
- #endif
4495
+ VK_LOG_DEBUG("ggml_vk_test_matmul(" << m << ", " << n << ", " << k << ", " << batch << ", " << num_it << ", " << split_k << ", " << shader_size << ")");
4533
4496
  const size_t x_ne = m * k * batch;
4534
4497
  const size_t y_ne = k * n * batch;
4535
4498
  const size_t d_ne = m * n * batch;
@@ -4943,9 +4906,7 @@ static void ggml_vk_test_h2d_nc(ggml_backend_vk_context * ctx, size_t ne0, size_
4943
4906
  }
4944
4907
 
4945
4908
  static void ggml_vk_test_transfer(ggml_backend_vk_context * ctx, size_t ne, bool pinned) {
4946
- #ifdef GGML_VULKAN_DEBUG
4947
- std::cerr << "ggml_vk_test_transfer(" << ne << ")" << std::endl;
4948
- #endif
4909
+ VK_LOG_DEBUG("ggml_vk_test_transfer(" << ne << ")");
4949
4910
  // Check transfers are correct
4950
4911
  vk_buffer buffer = ggml_vk_create_buffer_check(ctx, sizeof(float) * ne, vk::MemoryPropertyFlagBits::eDeviceLocal);
4951
4912
 
@@ -5029,9 +4990,7 @@ static void ggml_vk_quantize_data(const float * from, void * to, size_t ne, ggml
5029
4990
  }
5030
4991
 
5031
4992
  static void ggml_vk_test_dequant(ggml_backend_vk_context * ctx, size_t ne, ggml_type quant) {
5032
- #ifdef GGML_VULKAN_DEBUG
5033
- std::cerr << "ggml_vk_test_dequant(" << ne << ")" << std::endl;
5034
- #endif
4993
+ VK_LOG_DEBUG("ggml_vk_test_dequant(" << ne << ")");
5035
4994
  const size_t x_sz = sizeof(float) * ne;
5036
4995
  const size_t x_sz_f16 = sizeof(ggml_fp16_t) * ne;
5037
4996
  const size_t qx_sz = ne * ggml_type_size(quant)/ggml_blck_size(quant);
@@ -5108,9 +5067,7 @@ static void ggml_vk_test_dequant(ggml_backend_vk_context * ctx, size_t ne, ggml_
5108
5067
  }
5109
5068
 
5110
5069
  static void ggml_vk_test_dequant_matmul(ggml_backend_vk_context * ctx, size_t m, size_t n, size_t k, size_t batch, size_t num_it, size_t split_k, size_t shader_size, ggml_type quant) {
5111
- #ifdef GGML_VULKAN_DEBUG
5112
- std::cerr << "ggml_vk_test_dequant_matmul(" << m << ", " << n << ", " << k << ", " << batch << ", " << num_it << ", " << split_k << ", " << ggml_type_name(quant) << ")" << std::endl;
5113
- #endif
5070
+ VK_LOG_DEBUG("ggml_vk_test_dequant_matmul(" << m << ", " << n << ", " << k << ", " << batch << ", " << num_it << ", " << split_k << ", " << ggml_type_name(quant) << ")");
5114
5071
  const size_t x_ne = m * k * batch;
5115
5072
  const size_t y_ne = k * n * batch;
5116
5073
  const size_t d_ne = m * n * batch;
@@ -5294,9 +5251,7 @@ static void ggml_vk_test_dequant_matmul(ggml_backend_vk_context * ctx, size_t m,
5294
5251
  #endif
5295
5252
 
5296
5253
  static ggml_tensor_extra_gpu * ggml_vk_tensor_create_extra(ggml_tensor * tensor) {
5297
- #ifdef GGML_VULKAN_DEBUG
5298
- std::cerr << "ggml_vk_create_extra(" << tensor << " (" << tensor->name << ", " << ggml_op_name(tensor->op) << "))" << std::endl;
5299
- #endif
5254
+ VK_LOG_DEBUG("ggml_vk_create_extra(" << tensor << " (" << tensor->name << ", " << ggml_op_name(tensor->op) << "))");
5300
5255
  ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu;
5301
5256
  extra->reset();
5302
5257
  tensor->extra = extra;
@@ -5304,9 +5259,7 @@ static ggml_tensor_extra_gpu * ggml_vk_tensor_create_extra(ggml_tensor * tensor)
5304
5259
  }
5305
5260
 
5306
5261
  static void ggml_vk_preallocate_buffers_graph(ggml_backend_vk_context * ctx, ggml_tensor * node){
5307
- #ifdef GGML_VULKAN_DEBUG
5308
- std::cerr << "ggml_vk_preallocate_buffers_graph(" << node << ")" << std::endl;
5309
- #endif
5262
+ VK_LOG_DEBUG("ggml_vk_preallocate_buffers_graph(" << node << ")");
5310
5263
  ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) node->extra;
5311
5264
 
5312
5265
  if (extra == nullptr) {
@@ -5341,7 +5294,7 @@ static void ggml_vk_preallocate_buffers_graph(ggml_backend_vk_context * ctx, ggm
5341
5294
 
5342
5295
  bool mmp = (use_src0 && use_src1 && src1_type == GGML_TYPE_F32) ? ggml_vk_get_mul_mat_mat_pipeline(ctx, src0_type, y_non_contig ? GGML_TYPE_F16 : src1->type) != nullptr : false;
5343
5296
 
5344
- const bool qx_needs_dequant = use_src0 && (mmp || x_non_contig);
5297
+ const bool qx_needs_dequant = use_src0 && (!mmp || x_non_contig);
5345
5298
  const bool qy_needs_dequant = use_src1 && ((src1->type != GGML_TYPE_F16 && !y_f32_kernel) || y_non_contig);
5346
5299
 
5347
5300
  int split_k;
@@ -5419,9 +5372,6 @@ static void ggml_vk_preallocate_buffers_graph(ggml_backend_vk_context * ctx, ggm
5419
5372
  }
5420
5373
 
5421
5374
  static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
5422
- #ifdef GGML_VULKAN_DEBUG
5423
- std::cerr << "ggml_vk_preallocate_buffers(x_size: " << ctx->prealloc_size_x << " y_size: " << ctx->prealloc_size_y << " split_k_size: " << ctx->prealloc_size_split_k << ")" << std::endl;
5424
- #endif
5425
5375
  #if defined(GGML_VULKAN_RUN_TESTS)
5426
5376
  ctx->staging = ggml_vk_create_buffer_check(ctx, 100ul * 1024ul * 1024ul,
5427
5377
  vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached,
@@ -5560,6 +5510,7 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
5560
5510
  #endif
5561
5511
 
5562
5512
  if (ctx->prealloc_x == nullptr || (ctx->prealloc_size_x > 0 && ctx->prealloc_x->size < ctx->prealloc_size_x)) {
5513
+ VK_LOG_MEMORY("ggml_vk_preallocate_buffers(x_size: " << ctx->prealloc_size_x << ")");
5563
5514
  // Resize buffer
5564
5515
  if (ctx->prealloc_x != nullptr) {
5565
5516
  ggml_vk_destroy_buffer(ctx->prealloc_x);
@@ -5567,6 +5518,7 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
5567
5518
  ctx->prealloc_x = ggml_vk_create_buffer_device(ctx, ctx->prealloc_size_x);
5568
5519
  }
5569
5520
  if (ctx->prealloc_y == nullptr || (ctx->prealloc_size_y > 0 && ctx->prealloc_y->size < ctx->prealloc_size_y)) {
5521
+ VK_LOG_MEMORY("ggml_vk_preallocate_buffers(y_size: " << ctx->prealloc_size_y << ")");
5570
5522
  // Resize buffer
5571
5523
  if (ctx->prealloc_y != nullptr) {
5572
5524
  ggml_vk_destroy_buffer(ctx->prealloc_y);
@@ -5574,6 +5526,7 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
5574
5526
  ctx->prealloc_y = ggml_vk_create_buffer_device(ctx, ctx->prealloc_size_y);
5575
5527
  }
5576
5528
  if (ctx->prealloc_split_k == nullptr || (ctx->prealloc_size_split_k > 0 && ctx->prealloc_split_k->size < ctx->prealloc_size_split_k)) {
5529
+ VK_LOG_MEMORY("ggml_vk_preallocate_buffers(split_k_size: " << ctx->prealloc_size_split_k << ")");
5577
5530
  // Resize buffer
5578
5531
  if (ctx->prealloc_split_k != nullptr) {
5579
5532
  ggml_vk_destroy_buffer(ctx->prealloc_split_k);
@@ -5581,6 +5534,7 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
5581
5534
  ctx->prealloc_split_k = ggml_vk_create_buffer_device(ctx, ctx->prealloc_size_split_k);
5582
5535
  }
5583
5536
  if (ctx->staging == nullptr || (ctx->staging_size > 0 && ctx->staging->size < ctx->staging_size)) {
5537
+ VK_LOG_MEMORY("ggml_vk_preallocate_buffers(staging_size: " << ctx->staging_size << ")");
5584
5538
  // Resize buffer
5585
5539
  if (ctx->staging != nullptr) {
5586
5540
  ggml_vk_destroy_buffer(ctx->staging);
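Note on the preallocation hunks above: they all apply the same grow-only rule. A buffer is recreated only when it is missing or smaller than the requested size, and the new VK_LOG_MEMORY line fires only on that reallocation path. A condensed sketch with placeholder names instead of the real ggml_vk_* helpers:

    #include <cstddef>
    #include <iostream>
    #include <memory>

    // Placeholder for a device allocation (illustrative only).
    struct DeviceBuffer { size_t size; };
    using DeviceBufferPtr = std::shared_ptr<DeviceBuffer>;

    // Grow-only resize: reallocate only when the current buffer cannot hold `needed`.
    void ensure_capacity(DeviceBufferPtr &buf, size_t needed, const char *name) {
        if (buf == nullptr || (needed > 0 && buf->size < needed)) {
            std::cerr << "memory: resizing " << name << " to " << needed << " bytes\n";
            buf.reset();                                                  // destroy the old allocation first
            buf = std::make_shared<DeviceBuffer>(DeviceBuffer{needed});   // then create the larger one
        }
    }

    int main() {
        DeviceBufferPtr prealloc_x;
        ensure_capacity(prealloc_x, 1 << 20, "prealloc_x"); // allocates
        ensure_capacity(prealloc_x, 1 << 16, "prealloc_x"); // no-op, existing buffer is big enough
        ensure_capacity(prealloc_x, 1 << 22, "prealloc_x"); // grows
        return 0;
    }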
@@ -5598,9 +5552,7 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
5598
5552
  return;
5599
5553
  }
5600
5554
 
5601
- #ifdef GGML_VULKAN_DEBUG
5602
- std::cerr << "ggml_vk_build_graph(" << node << ", " << ggml_op_name(node->op) << ")" << std::endl;
5603
- #endif
5555
+ VK_LOG_DEBUG("ggml_vk_build_graph(" << node << ", " << ggml_op_name(node->op) << ")");
5604
5556
  ctx->semaphore_idx = 0;
5605
5557
  ctx->staging_offset = 0;
5606
5558
 
@@ -5823,9 +5775,7 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_compute_
5823
5775
  return true;
5824
5776
  }
5825
5777
 
5826
- #ifdef GGML_VULKAN_DEBUG
5827
- std::cerr << "ggml_vk_compute_forward(" << tensor << ", name=" << tensor->name << ", op=" << ggml_op_name(tensor->op) << ", type=" << tensor->type << ", ne0=" << tensor->ne[0] << ", ne1=" << tensor->ne[1] << ", ne2=" << tensor->ne[2] << ", ne3=" << tensor->ne[3] << ", nb0=" << tensor->nb[0] << ", nb1=" << tensor->nb[1] << ", nb2=" << tensor->nb[2] << ", nb3=" << tensor->nb[3] << ", view_src=" << tensor->view_src << ", view_offs=" << tensor->view_offs << ")" << std::endl;
5828
- #endif
5778
+ VK_LOG_DEBUG("ggml_vk_compute_forward(" << tensor << ", name=" << tensor->name << ", op=" << ggml_op_name(tensor->op) << ", type=" << tensor->type << ", ne0=" << tensor->ne[0] << ", ne1=" << tensor->ne[1] << ", ne2=" << tensor->ne[2] << ", ne3=" << tensor->ne[3] << ", nb0=" << tensor->nb[0] << ", nb1=" << tensor->nb[1] << ", nb2=" << tensor->nb[2] << ", nb3=" << tensor->nb[3] << ", view_src=" << tensor->view_src << ", view_offs=" << tensor->view_offs << ")");
5829
5779
 
5830
5780
  #ifdef GGML_VULKAN_CHECK_RESULTS
5831
5781
  ggml_vk_check_results_0(ctx, params, tensor);
@@ -5860,9 +5810,7 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_compute_
5860
5810
 
5861
5811
  // Clean up after graph processing is done
5862
5812
  static void ggml_vk_graph_cleanup(ggml_backend_vk_context * ctx) {
5863
- #ifdef GGML_VULKAN_DEBUG
5864
- std::cerr << "ggml_vk_graph_cleanup()" << std::endl;
5865
- #endif
5813
+ VK_LOG_DEBUG("ggml_vk_graph_cleanup()");
5866
5814
  for (auto& buffer : ctx->gc.temp_buffers) {
5867
5815
  ggml_vk_pool_free(ctx, buffer);
5868
5816
  }
@@ -5906,9 +5854,7 @@ static void ggml_vk_graph_cleanup(ggml_backend_vk_context * ctx) {
5906
5854
 
5907
5855
  // Clean up on backend free
5908
5856
  static void ggml_vk_cleanup(ggml_backend_vk_context * ctx) {
5909
- #ifdef GGML_VULKAN_DEBUG
5910
- std::cerr << "ggml_vk_cleanup(" << ctx->idx << ")" << std::endl;
5911
- #endif
5857
+ VK_LOG_DEBUG("ggml_vk_cleanup(" << ctx->idx << ")");
5912
5858
  ggml_vk_graph_cleanup(ctx);
5913
5859
 
5914
5860
  ggml_vk_destroy_buffer(ctx->prealloc_x);
@@ -6003,9 +5949,7 @@ GGML_CALL static bool ggml_backend_buffer_is_vk(ggml_backend_buffer_t buffer) {
6003
5949
  }
6004
5950
 
6005
5951
  GGML_CALL static void ggml_backend_vk_buffer_free_buffer(ggml_backend_buffer_t buffer) {
6006
- #ifdef GGML_VULKAN_DEBUG
6007
- std::cerr << "ggml_backend_vk_buffer_free_buffer()" << std::endl;
6008
- #endif
5952
+ VK_LOG_MEMORY("ggml_backend_vk_buffer_free_buffer()");
6009
5953
  ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;
6010
5954
  ggml_vk_destroy_buffer(ctx->dev_buffer);
6011
5955
  delete ctx;
@@ -6018,9 +5962,7 @@ GGML_CALL static void * ggml_backend_vk_buffer_get_base(ggml_backend_buffer_t bu
6018
5962
  }
6019
5963
 
6020
5964
  GGML_CALL static void ggml_backend_vk_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
6021
- #ifdef GGML_VULKAN_DEBUG
6022
- std::cerr << "ggml_backend_vk_buffer_init_tensor(" << buffer << " (" << buffer->context << "), " << tensor << ")" << std::endl;
6023
- #endif
5965
+ VK_LOG_DEBUG("ggml_backend_vk_buffer_init_tensor(" << buffer << " (" << buffer->context << "), " << tensor << ")");
6024
5966
  ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;
6025
5967
 
6026
5968
  if (tensor->view_src != nullptr) {
@@ -6036,9 +5978,7 @@ GGML_CALL static void ggml_backend_vk_buffer_init_tensor(ggml_backend_buffer_t b
6036
5978
  }
6037
5979
 
6038
5980
  GGML_CALL static void ggml_backend_vk_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
6039
- #ifdef GGML_VULKAN_DEBUG
6040
- std::cerr << "ggml_backend_vk_buffer_set_tensor(" << buffer << ", " << tensor << ", " << data << ", " << offset << ", " << size << ")" << std::endl;
6041
- #endif
5981
+ VK_LOG_DEBUG("ggml_backend_vk_buffer_set_tensor(" << buffer << ", " << tensor << ", " << data << ", " << offset << ", " << size << ")");
6042
5982
  ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;
6043
5983
 
6044
5984
  ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
@@ -6049,9 +5989,7 @@ GGML_CALL static void ggml_backend_vk_buffer_set_tensor(ggml_backend_buffer_t bu
6049
5989
  }
6050
5990
 
6051
5991
  GGML_CALL static void ggml_backend_vk_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
6052
- #ifdef GGML_VULKAN_DEBUG
6053
- std::cerr << "ggml_backend_vk_buffer_get_tensor(" << buffer << ", " << tensor << ", " << data << ", " << offset << ", " << size << ")" << std::endl;
6054
- #endif
5992
+ VK_LOG_DEBUG("ggml_backend_vk_buffer_get_tensor(" << buffer << ", " << tensor << ", " << data << ", " << offset << ", " << size << ")");
6055
5993
  ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;
6056
5994
 
6057
5995
  ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
@@ -6109,9 +6047,7 @@ GGML_CALL static const char * ggml_backend_vk_buffer_type_name(ggml_backend_buff
6109
6047
  }
6110
6048
 
6111
6049
  GGML_CALL static ggml_backend_buffer_t ggml_backend_vk_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
6112
- #ifdef GGML_VULKAN_DEBUG
6113
- std::cerr << "ggml_backend_vk_buffer_type_alloc_buffer(" << size << ")" << std::endl;
6114
- #endif
6050
+ VK_LOG_MEMORY("ggml_backend_vk_buffer_type_alloc_buffer(" << size << ")");
6115
6051
  ggml_backend_vk_buffer_type_context * ctx = (ggml_backend_vk_buffer_type_context *) buft->context;
6116
6052
 
6117
6053
  vk_buffer dev_buffer = nullptr;
@@ -6154,9 +6090,7 @@ static ggml_backend_buffer_type_i ggml_backend_vk_buffer_type_interface = {
6154
6090
  GGML_CALL ggml_backend_buffer_type_t ggml_backend_vk_buffer_type(size_t dev_num) {
6155
6091
  ggml_vk_instance_init();
6156
6092
 
6157
- #ifdef GGML_VULKAN_DEBUG
6158
- std::cerr << "ggml_backend_vk_buffer_type(" << dev_num << ")" << std::endl;
6159
- #endif
6093
+ VK_LOG_DEBUG("ggml_backend_vk_buffer_type(" << dev_num << ")");
6160
6094
 
6161
6095
  GGML_ASSERT(dev_num < vk_instance.device_indices.size());
6162
6096
 
@@ -6180,16 +6114,12 @@ GGML_CALL static const char * ggml_backend_vk_host_buffer_name(ggml_backend_buff
6180
6114
  }
6181
6115
 
6182
6116
  GGML_CALL static void ggml_backend_vk_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
6183
- #ifdef GGML_VULKAN_DEBUG
6184
- std::cerr << "ggml_backend_vk_host_buffer_free_buffer()" << std::endl;
6185
- #endif
6117
+ VK_LOG_MEMORY("ggml_backend_vk_host_buffer_free_buffer()");
6186
6118
  ggml_vk_host_free(&vk_instance.contexts[0], buffer->context);
6187
6119
  }
6188
6120
 
6189
6121
  GGML_CALL static ggml_backend_buffer_t ggml_backend_vk_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
6190
- #ifdef GGML_VULKAN_DEBUG
6191
- std::cerr << "ggml_backend_vk_host_buffer_type_alloc_buffer(" << size << ")" << std::endl;
6192
- #endif
6122
+ VK_LOG_MEMORY("ggml_backend_vk_host_buffer_type_alloc_buffer(" << size << ")");
6193
6123
  size += 32; // Behave like the CPU buffer type
6194
6124
  void * ptr = nullptr;
6195
6125
  try {
@@ -6246,9 +6176,7 @@ GGML_CALL static const char * ggml_backend_vk_name(ggml_backend_t backend) {
6246
6176
 
6247
6177
  GGML_CALL static void ggml_backend_vk_free(ggml_backend_t backend) {
6248
6178
  ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
6249
- #ifdef GGML_VULKAN_DEBUG
6250
- std::cerr << "ggml_backend_vk_free(" << ctx->name << ")" << std::endl;
6251
- #endif
6179
+ VK_LOG_DEBUG("ggml_backend_vk_free(" << ctx->name << ")");
6252
6180
 
6253
6181
  size_t idx = ctx->idx;
6254
6182
 
@@ -6272,9 +6200,7 @@ GGML_CALL static ggml_backend_buffer_type_t ggml_backend_vk_get_default_buffer_t
6272
6200
  }
6273
6201
 
6274
6202
  GGML_CALL static void ggml_backend_vk_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
6275
- #ifdef GGML_VULKAN_DEBUG
6276
- std::cerr << "ggml_backend_vk_set_tensor_async(" << size << ")" << std::endl;
6277
- #endif
6203
+ VK_LOG_DEBUG("ggml_backend_vk_set_tensor_async(" << size << ")");
6278
6204
  ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
6279
6205
  GGML_ASSERT((tensor->buffer->buft == ggml_backend_vk_buffer_type(ctx->idx) || tensor->buffer->buft == ggml_backend_vk_host_buffer_type()) && "unsupported buffer type");
6280
6206
 
@@ -6292,9 +6218,7 @@ GGML_CALL static void ggml_backend_vk_set_tensor_async(ggml_backend_t backend, g
6292
6218
  }
6293
6219
 
6294
6220
  GGML_CALL static void ggml_backend_vk_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
6295
- #ifdef GGML_VULKAN_DEBUG
6296
- std::cerr << "ggml_backend_vk_get_tensor_async(" << size << ")" << std::endl;
6297
- #endif
6221
+ VK_LOG_DEBUG("ggml_backend_vk_get_tensor_async(" << size << ")");
6298
6222
  ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
6299
6223
  GGML_ASSERT((tensor->buffer->buft == ggml_backend_vk_buffer_type(ctx->idx) || tensor->buffer->buft == ggml_backend_vk_host_buffer_type()) && "unsupported buffer type");
6300
6224
 
@@ -6312,9 +6236,7 @@ GGML_CALL static void ggml_backend_vk_get_tensor_async(ggml_backend_t backend, c
6312
6236
  }
6313
6237
 
6314
6238
  GGML_CALL static bool ggml_backend_vk_cpy_tensor_async(ggml_backend_t backend, const ggml_tensor * src, ggml_tensor * dst) {
6315
- #ifdef GGML_VULKAN_DEBUG
6316
- std::cerr << "ggml_backend_vk_cpy_tensor_async()" << std::endl;
6317
- #endif
6239
+ VK_LOG_DEBUG("ggml_backend_vk_cpy_tensor_async()");
6318
6240
  ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
6319
6241
  if ((dst->buffer->buft == ggml_backend_vk_buffer_type(ctx->idx) || dst->buffer->buft == ggml_backend_vk_host_buffer_type()) && ggml_backend_buffer_is_vk(src->buffer)) {
6320
6242
  ggml_tensor_extra_gpu * src_extra = (ggml_tensor_extra_gpu *) src->extra;
@@ -6337,9 +6259,7 @@ GGML_CALL static bool ggml_backend_vk_cpy_tensor_async(ggml_backend_t backend, c
6337
6259
  }
6338
6260
 
6339
6261
  GGML_CALL static void ggml_backend_vk_synchronize(ggml_backend_t backend) {
6340
- #ifdef GGML_VULKAN_DEBUG
6341
- std::cerr << "ggml_backend_vk_synchronize()" << std::endl;
6342
- #endif
6262
+ VK_LOG_DEBUG("ggml_backend_vk_synchronize()");
6343
6263
  ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
6344
6264
  if(ctx->transfer_ctx == nullptr) {
6345
6265
  return;
@@ -6367,9 +6287,7 @@ static bool ggml_vk_is_empty(ggml_tensor * node) {
6367
6287
  }
6368
6288
 
6369
6289
  GGML_CALL static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
6370
- #ifdef GGML_VULKAN_DEBUG
6371
- std::cerr << "ggml_backend_vk_graph_compute(" << cgraph->n_nodes << " nodes)" << std::endl;
6372
- #endif
6290
+ VK_LOG_DEBUG("ggml_backend_vk_graph_compute(" << cgraph->n_nodes << " nodes)");
6373
6291
  ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
6374
6292
 
6375
6293
  for (int i = 0; i < cgraph->n_nodes; i++) {
@@ -6582,9 +6500,7 @@ GGML_CALL ggml_backend_t ggml_backend_vk_init(size_t dev_num) {
6582
6500
  if (vk_instance.initialized[dev_num]) {
6583
6501
  return vk_instance.backends[dev_num];
6584
6502
  }
6585
- #ifdef GGML_VULKAN_DEBUG
6586
- std::cerr << "ggml_backend_vk_init(" << dev_num << ")" << std::endl;
6587
- #endif
6503
+ VK_LOG_DEBUG("ggml_backend_vk_init(" << dev_num << ")");
6588
6504
 
6589
6505
  ggml_backend_vk_context * ctx = &vk_instance.contexts[dev_num];
6590
6506
  ggml_vk_init(ctx, dev_num);
@@ -6800,9 +6716,7 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
6800
6716
  return;
6801
6717
  }
6802
6718
 
6803
- #ifdef GGML_VULKAN_DEBUG
6804
- std::cerr << "ggml_vk_check_results_0(" << tensor->name << ")" << std::endl;
6805
- #endif
6719
+ VK_LOG_DEBUG("ggml_vk_check_results_0(" << tensor->name << ")");
6806
6720
 
6807
6721
  ggml_tensor * src0 = tensor->src[0];
6808
6722
  ggml_tensor * src1 = tensor->src[1];
@@ -7108,9 +7022,7 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_compute_
7108
7022
  return;
7109
7023
  }
7110
7024
 
7111
- #ifdef GGML_VULKAN_DEBUG
7112
- std::cerr << "ggml_vk_check_results_1(" << tensor->name << ")" << std::endl;
7113
- #endif
7025
+ VK_LOG_DEBUG("ggml_vk_check_results_1(" << tensor->name << ")");
7114
7026
 
7115
7027
  ggml_tensor * src0 = tensor->src[0];
7116
7028
  ggml_tensor * src1 = tensor->src[1];