llama_cpp 0.16.1 → 0.16.2

@@ -8,6 +8,7 @@

  #include <algorithm>
  #include <cmath>
+ #include <iomanip>
  #include <iostream>
  #include <tuple>
  #include <vector>
@@ -57,6 +58,12 @@ static_assert(K_QUANTS_PER_ITERATION == 1 || K_QUANTS_PER_ITERATION == 2, "K_QUA
  } \
  } while (0)

+ #ifdef GGML_VULKAN_DEBUG
+ #define VK_LOG_DEBUG(msg) std::cerr << msg << std::endl
+ #else
+ #define VK_LOG_DEBUG(msg) ((void) 0)
+ #endif // GGML_VULKAN_DEBUG
+
  struct ggml_backend_vk_context;

  struct vk_queue {
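Note on the VK_LOG_DEBUG macro added above: it replaces the per-call-site `#ifdef GGML_VULKAN_DEBUG` / `std::cerr` blocks that the later hunks delete. The argument is pasted into a stream expression, so call sites can chain `<<` inside the invocation, and the whole statement collapses to `((void) 0)` when the flag is off. A minimal standalone sketch of the same pattern (illustrative only, not the library's source; EXAMPLE_DEBUG is a made-up flag):

    #include <iostream>

    // Compile-time switch; define EXAMPLE_DEBUG to enable the log output.
    #ifdef EXAMPLE_DEBUG
    #define VK_LOG_DEBUG(msg) std::cerr << msg << std::endl
    #else
    #define VK_LOG_DEBUG(msg) ((void) 0)   // expands to a no-op statement
    #endif

    int main() {
        size_t idx = 3;
        // The entire stream expression is substituted for `msg`.
        VK_LOG_DEBUG("ggml_vk_get_device(" << idx << ")");
        return 0;
    }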
@@ -159,9 +166,7 @@ struct vk_device {
  std::vector<vk_pipeline_ref> pipelines;

  ~vk_device() {
- #ifdef GGML_VULKAN_DEBUG
- std::cerr << "destroy device " << name << std::endl;
- #endif
+ VK_LOG_DEBUG("destroy device " << name);
  device.destroyCommandPool(compute_queue.pool);
  if (!single_queue) {
  device.destroyCommandPool(transfer_queue.pool);
@@ -196,9 +201,7 @@ struct vk_buffer_struct {
  if (size == 0) {
  return;
  }
- #ifdef GGML_VULKAN_DEBUG
- std::cerr << "~vk_buffer_struct(" << buffer << ", " << size << ")" << std::endl;
- #endif
+ VK_LOG_DEBUG("~vk_buffer_struct(" << buffer << ", " << size << ")");

  device->device.freeMemory(device_memory);
  device->device.destroyBuffer(buffer);
@@ -355,6 +358,49 @@ struct ggml_vk_garbage_collector {
  std::vector<vk_context> contexts;
  };

+ #if defined(GGML_VULKAN_MEMORY_DEBUG) || defined(GGML_VULKAN_DEBUG)
+ #include <mutex>
+
+ #define VK_LOG_MEMORY(msg) std::cerr << "ggml_vulkan memory: " << msg << std::endl
+
+ static std::string format_size(size_t size) {
+ const size_t kib = 1024;
+ const size_t mib = kib * 1024;
+ const size_t gib = mib * 1024;
+
+ std::ostringstream oss;
+ oss << std::fixed << std::setprecision(2);
+
+ if (size >= gib) {
+ oss << static_cast<double>(size) / gib << " GiB";
+ } else if (size >= mib) {
+ oss << static_cast<double>(size) / mib << " MiB";
+ } else if (size >= kib) {
+ oss << static_cast<double>(size) / kib << " KiB";
+ } else {
+ oss << size << " B";
+ }
+
+ return oss.str();
+ }
+
+ static std::mutex log_mutex;
+
+ class vk_memory_logger {
+ public:
+ vk_memory_logger(): total_device(0), total_host(0) {}
+ void log_allocation(vk_buffer_ref buf_ref, size_t size);
+ void log_deallocation(vk_buffer_ref buf_ref);
+
+ private:
+ std::map<vk::Buffer, size_t> allocations; // Track allocations
+ size_t total_device;
+ size_t total_host;
+ };
+ #else
+ #define VK_LOG_MEMORY(msg) ((void) 0)
+ #endif // GGML_VULKAN_MEMORY_DEBUG
+
  struct ggml_backend_vk_context {
  std::string name;

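For reference, the format_size helper added above picks the largest binary unit that fits and prints two decimals, so 1536 bytes renders as "1.50 KiB" and three gibibytes of allocations as "3.00 GiB" in the VK_LOG_MEMORY output. A self-contained sketch of the same formatting logic with example outputs (an illustrative copy, not the library translation unit):

    #include <iomanip>
    #include <iostream>
    #include <sstream>
    #include <string>

    // Choose the largest unit that fits and print two decimal places.
    static std::string format_size(size_t size) {
        const size_t kib = 1024;
        const size_t mib = kib * 1024;
        const size_t gib = mib * 1024;

        std::ostringstream oss;
        oss << std::fixed << std::setprecision(2);
        if (size >= gib)      { oss << static_cast<double>(size) / gib << " GiB"; }
        else if (size >= mib) { oss << static_cast<double>(size) / mib << " MiB"; }
        else if (size >= kib) { oss << static_cast<double>(size) / kib << " KiB"; }
        else                  { oss << size << " B"; }
        return oss.str();
    }

    int main() {
        std::cout << format_size(512) << "\n";                         // 512 B
        std::cout << format_size(1536) << "\n";                        // 1.50 KiB
        std::cout << format_size(static_cast<size_t>(3) << 30) << "\n"; // 3.00 GiB
        return 0;
    }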
@@ -379,8 +425,45 @@ struct ggml_backend_vk_context {
  bool initialized;

  size_t idx;
+
+ #ifdef GGML_VULKAN_MEMORY_DEBUG
+ vk_memory_logger memory_logger;
+ #endif
  };

+ #ifdef GGML_VULKAN_MEMORY_DEBUG
+ void vk_memory_logger::log_allocation(vk_buffer_ref buf_ref, size_t size) {
+ std::lock_guard<std::mutex> guard(log_mutex);
+ vk_buffer buf = buf_ref.lock();
+ const bool device = bool(buf->memory_property_flags & vk::MemoryPropertyFlagBits::eDeviceLocal);
+ const std::string type = device ? "device" : "host";
+ allocations[buf->buffer] = size;
+ total_device += device ? size : 0;
+ total_host += device ? 0 : size;
+ VK_LOG_MEMORY("VULKAN" << buf->ctx->idx << ": +" << format_size(size) << " " << type << " at " << buf->buffer << ". Total device: " << format_size(total_device) << ", total host: " << format_size(total_host));
+ }
+
+ void vk_memory_logger::log_deallocation(vk_buffer_ref buf_ref) {
+ if (buf_ref.expired() || buf_ref.lock()->size == 0) {
+ return;
+ }
+
+ std::lock_guard<std::mutex> guard(log_mutex);
+ vk_buffer buf = buf_ref.lock();
+ const bool device = bool(buf->memory_property_flags & vk::MemoryPropertyFlagBits::eDeviceLocal);
+ std::string type = device ? "device" : "host";
+ auto it = allocations.find(buf->buffer);
+ total_device -= device ? it->second : 0;
+ total_host -= device ? 0 : it->second;
+ if (it != allocations.end()) {
+ VK_LOG_MEMORY("VULKAN" << buf->ctx->idx << ": -" << format_size(it->second) << " " << type << " at " << buf->buffer << ". Total device: " << format_size(total_device) << ", total host: " << format_size(total_host));
+ allocations.erase(it);
+ } else {
+ VK_LOG_MEMORY("ERROR VULKAN" << buf->ctx->idx << ": Attempted to deallocate unknown " << type << " memory at " << buf->buffer);
+ }
+ }
+ #endif // GGML_VULKAN_MEMORY_DEBUG
+
  struct vk_instance_t {
  vk::Instance instance;

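The logger holds weak references to the buffers (vk_buffer_ref appears to be a std::weak_ptr to the buffer struct, with vk_buffer the owning std::shared_ptr), which is why log_deallocation guards with expired() before calling lock(). A small sketch of that guard pattern using plain standard-library types (the struct and function names below are illustrative, not taken from ggml-vulkan.cpp):

    #include <iostream>
    #include <memory>

    struct buffer { size_t size; };
    using buffer_ref = std::weak_ptr<buffer>;

    // Log only if the referenced buffer is still alive and non-empty,
    // mirroring the expired()/lock() guard used by log_deallocation.
    static void log_deallocation(buffer_ref ref) {
        if (ref.expired() || ref.lock()->size == 0) {
            return;
        }
        std::shared_ptr<buffer> buf = ref.lock();
        std::cout << "freeing " << buf->size << " bytes\n";
    }

    int main() {
        auto buf = std::make_shared<buffer>(buffer{4096});
        log_deallocation(buf);   // prints: freeing 4096 bytes
        buf.reset();
        log_deallocation(buf);   // expired -> no output
        return 0;
    }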
@@ -393,15 +476,11 @@ struct vk_instance_t {
  };

  static std::shared_ptr<vk_device> ggml_vk_get_device(size_t idx) {
- #ifdef GGML_VULKAN_DEBUG
- std::cerr << "ggml_vk_get_device(" << idx << ")" << std::endl;
- #endif
+ VK_LOG_DEBUG("ggml_vk_get_device(" << idx << ")");
  static std::weak_ptr<vk_device> devices[GGML_VK_MAX_DEVICES];

  if (devices[idx].expired()) {
- #ifdef GGML_VULKAN_DEBUG
- std::cerr << "Initializing new vk_device" << std::endl;
- #endif
+ VK_LOG_DEBUG("Initializing new vk_device");
  std::shared_ptr<vk_device> device = std::make_shared<vk_device>();
  device->initialized = false;
  devices[idx] = device;
@@ -428,9 +507,7 @@ static vk_instance_t vk_instance;
  GGML_CALL static void ggml_backend_vk_free(ggml_backend_t backend);

  static void ggml_vk_create_pipeline(ggml_backend_vk_context * ctx, vk_pipeline& pipeline, const std::string& name, size_t spv_size, const void* spv_data, const std::string& entrypoint, uint32_t parameter_count, uint32_t push_constant_size, std::array<uint32_t, 3> wg_denoms, std::vector<uint32_t>&& specialization_constants, uint32_t align) {
- #ifdef GGML_VULKAN_DEBUG
- std::cerr << "ggml_vk_create_pipeline(" << name << ", " << entrypoint << ", " << parameter_count << ", " << push_constant_size << ", (" << wg_denoms[0] << "," << wg_denoms[1] << "," << wg_denoms[2] << "), specialization_constants, " << align << ")" << std::endl;
- #endif
+ VK_LOG_DEBUG("ggml_vk_create_pipeline(" << name << ", " << entrypoint << ", " << parameter_count << ", " << push_constant_size << ", (" << wg_denoms[0] << "," << wg_denoms[1] << "," << wg_denoms[2] << "), specialization_constants, " << align << ")");
  GGML_ASSERT(parameter_count > 0);
  GGML_ASSERT(wg_denoms[0] > 0 && wg_denoms[1] > 0 && wg_denoms[2] > 0); // NOLINT

@@ -531,9 +608,7 @@ static void ggml_vk_create_pipeline(ggml_backend_vk_context * ctx, vk_pipeline&
  }

  static void ggml_vk_destroy_pipeline(vk::Device& device, vk_pipeline& pipeline) {
- #ifdef GGML_VULKAN_DEBUG
- std::cerr << "ggml_pipeline_destroy_pipeline(" << pipeline->name << ")" << std::endl;
- #endif
+ VK_LOG_DEBUG("ggml_pipeline_destroy_pipeline(" << pipeline->name << ")");
  for (auto& pool : pipeline->descriptor_pools) {
  device.destroyDescriptorPool(pool);
  }
@@ -551,9 +626,7 @@
  }

  static void ggml_pipeline_allocate_descriptor_sets(ggml_backend_vk_context * ctx, vk_pipeline& pipeline, uint32_t n) {
- #ifdef GGML_VULKAN_DEBUG
- std::cerr << "ggml_pipeline_allocate_descriptor_sets(" << pipeline->name << ", " << n << ")" << std::endl;
- #endif
+ VK_LOG_DEBUG("ggml_pipeline_allocate_descriptor_sets(" << pipeline->name << ", " << n << ")");
  if (pipeline->descriptor_sets.size() >= pipeline->descriptor_set_idx + n) {
  // Enough descriptors are available
  return;
@@ -583,16 +656,12 @@ static void ggml_pipeline_allocate_descriptor_sets(ggml_backend_vk_context * ctx
  }

  static void ggml_pipeline_cleanup(vk_pipeline& pipeline) {
- #ifdef GGML_VULKAN_DEBUG
- std::cerr << "ggml_pipeline_cleanup(" << pipeline->name << ")" << std::endl;
- #endif
+ VK_LOG_DEBUG("ggml_pipeline_cleanup(" << pipeline->name << ")");
  pipeline->descriptor_set_idx = 0;
  }

  static vk::CommandBuffer ggml_vk_create_cmd_buffer(ggml_backend_vk_context * ctx, vk_queue& q) {
- #ifdef GGML_VULKAN_DEBUG
- std::cerr << "ggml_vk_create_cmd_buffer()" << std::endl;
- #endif
+ VK_LOG_DEBUG("ggml_vk_create_cmd_buffer()");
  if (q.cmd_buffers.size() > q.cmd_buffer_idx) {
  // Reuse command buffer
  return q.cmd_buffers[q.cmd_buffer_idx++];
@@ -612,9 +681,7 @@ static vk::CommandBuffer ggml_vk_create_cmd_buffer(ggml_backend_vk_context * ctx
  }

  static vk_submission ggml_vk_create_submission(ggml_backend_vk_context * ctx, vk_queue& q, std::vector<vk_semaphore> wait_semaphores, std::vector<vk_semaphore> signal_semaphores) {
- #ifdef GGML_VULKAN_DEBUG
- std::cerr << "ggml_vk_create_submission()" << std::endl;
- #endif
+ VK_LOG_DEBUG("ggml_vk_create_submission()");
  vk_submission s;
  s.buffer = ggml_vk_create_cmd_buffer(ctx, q);
  s.wait_semaphores = std::move(wait_semaphores);
@@ -623,9 +690,7 @@ static vk_submission ggml_vk_create_submission(ggml_backend_vk_context * ctx, vk
  }

  static void ggml_vk_submit(vk_context * ctx, vk::Fence fence) {
- #ifdef GGML_VULKAN_DEBUG
- std::cerr << "ggml_vk_submit(" << ctx->seqs.size() << ", " << fence << ")" << std::endl;
- #endif
+ VK_LOG_DEBUG("ggml_vk_submit(" << ctx->seqs.size() << ", " << fence << ")");
  if (ctx->seqs.empty()) {
  return;
  }
@@ -699,9 +764,7 @@ static void ggml_vk_submit(vk_context * ctx, vk::Fence fence) {
  }

  static uint32_t ggml_vk_find_queue_family_index(std::vector<vk::QueueFamilyProperties>& queue_family_props, const vk::QueueFlags& required, const vk::QueueFlags& avoid, int32_t compute_index, uint32_t min_num_queues) {
- #ifdef GGML_VULKAN_DEBUG
- std::cerr << "ggml_vk_find_queue_family_index()" << std::endl;
- #endif
+ VK_LOG_DEBUG("ggml_vk_find_queue_family_index()");
  const uint32_t qfsize = queue_family_props.size();

  // Try with avoid preferences first
@@ -747,9 +810,7 @@ static uint32_t ggml_vk_find_queue_family_index(std::vector<vk::QueueFamilyPrope
  }

  static void ggml_vk_create_queue(ggml_backend_vk_context * ctx, vk_queue& q, uint32_t queue_family_index, uint32_t queue_index, vk::PipelineStageFlags&& stage_flags) {
- #ifdef GGML_VULKAN_DEBUG
- std::cerr << "ggml_vk_create_queue()" << std::endl;
- #endif
+ VK_LOG_DEBUG("ggml_vk_create_queue()");
  q.queue_family_index = queue_family_index;

  vk::CommandPoolCreateInfo command_pool_create_info_compute(vk::CommandPoolCreateFlags(VK_COMMAND_POOL_CREATE_TRANSIENT_BIT), queue_family_index);
@@ -763,9 +824,7 @@ static void ggml_vk_create_queue(ggml_backend_vk_context * ctx, vk_queue& q, uin
  }

  static vk_context * ggml_vk_create_context(ggml_backend_vk_context * ctx, vk_queue& q) {
- #ifdef GGML_VULKAN_DEBUG
- std::cerr << "ggml_vk_create_context()" << std::endl;
- #endif
+ VK_LOG_DEBUG("ggml_vk_create_context()");
  ctx->gc.contexts.emplace_back();
  vk_context * result = &ctx->gc.contexts[ctx->gc.contexts.size() - 1];
  memset((void *) result, 0, sizeof(vk_context));
@@ -775,9 +834,7 @@ static vk_context * ggml_vk_create_context(ggml_backend_vk_context * ctx, vk_que
  }

  static vk_semaphore * ggml_vk_create_binary_semaphore(ggml_backend_vk_context * ctx) {
- #ifdef GGML_VULKAN_DEBUG
- std::cerr << "ggml_vk_create_timeline_semaphore()" << std::endl;
- #endif
+ VK_LOG_DEBUG("ggml_vk_create_timeline_semaphore()");
  vk::SemaphoreTypeCreateInfo tci{ vk::SemaphoreType::eBinary, 0 };
  vk::SemaphoreCreateInfo ci{};
  ci.setPNext(&tci);
@@ -787,9 +844,7 @@ static vk_semaphore * ggml_vk_create_binary_semaphore(ggml_backend_vk_context *
  }

  static vk_semaphore * ggml_vk_create_timeline_semaphore(ggml_backend_vk_context * ctx) {
- #ifdef GGML_VULKAN_DEBUG
- std::cerr << "ggml_vk_create_timeline_semaphore()" << std::endl;
- #endif
+ VK_LOG_DEBUG("ggml_vk_create_timeline_semaphore()");
  if (ctx->semaphore_idx >= ctx->gc.tl_semaphores.size()) {
  vk::SemaphoreTypeCreateInfo tci{ vk::SemaphoreType::eTimeline, 0 };
  vk::SemaphoreCreateInfo ci{};
@@ -808,9 +863,7 @@ static vk::Event ggml_vk_create_event(ggml_backend_vk_context * ctx) {
  }

  static void ggml_vk_queue_cleanup(ggml_backend_vk_context * ctx, vk_queue& q) {
- #ifdef GGML_VULKAN_DEBUG
- std::cerr << "ggml_vk_queue_cleanup()" << std::endl;
- #endif
+ VK_LOG_DEBUG("ggml_vk_queue_cleanup()");
  // Requires command buffers to be done

  ctx->device->device.resetCommandPool(q.pool);
@@ -830,9 +883,7 @@ static uint32_t find_properties(const vk::PhysicalDeviceMemoryProperties* mem_pr
  }

  static vk_buffer ggml_vk_create_buffer(ggml_backend_vk_context * ctx, size_t size, vk::MemoryPropertyFlags req_flags, vk::MemoryPropertyFlags fallback_flags = vk::MemoryPropertyFlags(0)) {
- #ifdef GGML_VULKAN_DEBUG
- std::cerr << "ggml_vk_create_buffer(device " << ctx->idx << ", " << size << ", " << to_string(req_flags) << ", " << to_string(fallback_flags) << ")" << std::endl;
- #endif
+ VK_LOG_DEBUG("ggml_vk_create_buffer(device " << ctx->idx << ", " << size << ", " << to_string(req_flags) << ", " << to_string(fallback_flags) << ")");
  vk_buffer buf = std::make_shared<vk_buffer_struct>();

  if (size == 0) {
@@ -892,8 +943,8 @@ static vk_buffer ggml_vk_create_buffer(ggml_backend_vk_context * ctx, size_t siz

  buf->device = ctx->device;

- #ifdef GGML_VULKAN_DEBUG
- std::cerr << "Created buffer " << buf->buffer << std::endl;
+ #ifdef GGML_VULKAN_MEMORY_DEBUG
+ ctx->memory_logger.log_allocation(buf, size);
  #endif

  return buf;
@@ -928,6 +979,14 @@ static vk_buffer ggml_vk_create_buffer_device(ggml_backend_vk_context * ctx, siz
  }

  static void ggml_vk_destroy_buffer(vk_buffer& buf) {
+ if (buf == nullptr) {
+ return;
+ }
+
+ #ifdef GGML_VULKAN_MEMORY_DEBUG
+ buf->ctx->memory_logger.log_deallocation(buf);
+ #endif
+
  buf.reset();
  }

@@ -936,9 +995,7 @@ static vk_subbuffer ggml_vk_subbuffer(vk_buffer& buf) {
  }

  static void ggml_vk_sync_buffers(vk_context * ctx) {
- #ifdef GGML_VULKAN_DEBUG
- std::cerr << "ggml_vk_sync_buffers()" << std::endl;
- #endif
+ VK_LOG_DEBUG("ggml_vk_sync_buffers()");
  const std::vector<vk::MemoryBarrier> mem_barriers{ { { vk::AccessFlagBits::eMemoryRead | vk::AccessFlagBits::eMemoryWrite }, { vk::AccessFlagBits::eMemoryRead | vk::AccessFlagBits::eMemoryWrite } } };

  ctx->s->buffer.pipelineBarrier(
@@ -952,9 +1009,7 @@ static void ggml_vk_sync_buffers(vk_context * ctx) {
  }

  static void ggml_vk_wait_events(vk_context * ctx, std::vector<vk::Event>&& events) {
- #ifdef GGML_VULKAN_DEBUG
- std::cerr << "ggml_vk_wait_events()" << std::endl;
- #endif
+ VK_LOG_DEBUG("ggml_vk_wait_events()");
  if (events.empty()) {
  return;
  }
@@ -989,9 +1044,7 @@ static bool ggml_vk_build_shader(ggml_type type) {
  }

  static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) {
- #ifdef GGML_VULKAN_DEBUG
- std::cerr << "ggml_vk_load_shaders(" << ctx->name << ")" << std::endl;
- #endif
+ VK_LOG_DEBUG("ggml_vk_load_shaders(" << ctx->name << ")");

  const std::shared_ptr<vk_device> device = ctx->device;

@@ -1042,12 +1095,12 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) {
  ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K] = std::make_shared<vk_matmul_pipeline_struct>();

  if (device->fp16) {
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->l, "matmul_f32_l", matmul_f32_len, matmul_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, 1);
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->m, "matmul_f32_m", matmul_f32_len, matmul_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, 1);
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->s, "matmul_f32_s", matmul_f32_len, matmul_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, 1);
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->a_l, "matmul_f32_aligned_l", matmul_f32_aligned_len, matmul_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, l_align);
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->a_m, "matmul_f32_aligned_m", matmul_f32_aligned_len, matmul_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, m_align);
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->a_s, "matmul_f32_aligned_s", matmul_f32_aligned_len, matmul_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, s_align);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->l, "matmul_f32_l", matmul_f32_f32_len, matmul_f32_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, 1);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->m, "matmul_f32_m", matmul_f32_f32_len, matmul_f32_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, 1);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->s, "matmul_f32_s", matmul_f32_f32_len, matmul_f32_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, 1);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->a_l, "matmul_f32_aligned_l", matmul_f32_f32_aligned_len, matmul_f32_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, l_align);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->a_m, "matmul_f32_aligned_m", matmul_f32_f32_aligned_len, matmul_f32_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, m_align);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->a_s, "matmul_f32_aligned_s", matmul_f32_f32_aligned_len, matmul_f32_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, s_align);

  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_f16->l, "matmul_f32_f16_l", matmul_f32_f16_len, matmul_f32_f16_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, 1);
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_f16->m, "matmul_f32_f16_m", matmul_f32_f16_len, matmul_f32_f16_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, 1);
@@ -1140,12 +1193,12 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) {
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->a_m, "matmul_q6_k_f32_aligned_m", matmul_q6_k_f32_aligned_len, matmul_q6_k_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->a_s, "matmul_q6_k_f32_aligned_s", matmul_q6_k_f32_aligned_len, matmul_q6_k_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align);

- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->l, "matmul_id_f32_l", matmul_id_f32_len, matmul_id_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_l, 1);
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->m, "matmul_id_f32_m", matmul_id_f32_len, matmul_id_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_m, 1);
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->s, "matmul_id_f32_s", matmul_id_f32_len, matmul_id_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_s, 1);
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->a_l, "matmul_id_f32_aligned_l", matmul_id_f32_aligned_len, matmul_id_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_l, l_align);
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->a_m, "matmul_id_f32_aligned_m", matmul_id_f32_aligned_len, matmul_id_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_m, m_align);
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->a_s, "matmul_id_f32_aligned_s", matmul_id_f32_aligned_len, matmul_id_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_s, s_align);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->l, "matmul_id_f32_l", matmul_id_f32_f32_len, matmul_id_f32_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_l, 1);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->m, "matmul_id_f32_m", matmul_id_f32_f32_len, matmul_id_f32_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_m, 1);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->s, "matmul_id_f32_s", matmul_id_f32_f32_len, matmul_id_f32_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_s, 1);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->a_l, "matmul_id_f32_aligned_l", matmul_id_f32_f32_aligned_len, matmul_id_f32_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_l, l_align);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->a_m, "matmul_id_f32_aligned_m", matmul_id_f32_f32_aligned_len, matmul_id_f32_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_m, m_align);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->a_s, "matmul_id_f32_aligned_s", matmul_id_f32_f32_aligned_len, matmul_id_f32_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_s, s_align);

  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f16->l, "matmul_id_f16_l", matmul_id_f16_len, matmul_id_f16_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_l, 1);
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f16->m, "matmul_id_f16_m", matmul_id_f16_len, matmul_id_f16_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_m, 1);
@@ -1231,12 +1284,12 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) {
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K]->a_m, "matmul_id_q6_k_f32_aligned_m", matmul_id_q6_k_f32_aligned_len, matmul_id_q6_k_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K]->a_s, "matmul_id_q6_k_f32_aligned_s", matmul_id_q6_k_f32_aligned_len, matmul_id_q6_k_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
  } else {
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->l, "matmul_f32_l", matmul_f32_fp32_len, matmul_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, 1);
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->m, "matmul_f32_m", matmul_f32_fp32_len, matmul_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, 1);
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->s, "matmul_f32_s", matmul_f32_fp32_len, matmul_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, 1);
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->a_l, "matmul_f32_aligned_l", matmul_f32_aligned_fp32_len, matmul_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, l_align);
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->a_m, "matmul_f32_aligned_m", matmul_f32_aligned_fp32_len, matmul_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, m_align);
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->a_s, "matmul_f32_aligned_s", matmul_f32_aligned_fp32_len, matmul_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, s_align);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->l, "matmul_f32_l", matmul_f32_f32_fp32_len, matmul_f32_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, 1);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->m, "matmul_f32_m", matmul_f32_f32_fp32_len, matmul_f32_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, 1);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->s, "matmul_f32_s", matmul_f32_f32_fp32_len, matmul_f32_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, 1);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->a_l, "matmul_f32_aligned_l", matmul_f32_f32_aligned_fp32_len, matmul_f32_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, l_align);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->a_m, "matmul_f32_aligned_m", matmul_f32_f32_aligned_fp32_len, matmul_f32_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, m_align);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->a_s, "matmul_f32_aligned_s", matmul_f32_f32_aligned_fp32_len, matmul_f32_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, s_align);

  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_f16->l, "matmul_f32_f16_l", matmul_f32_f16_fp32_len, matmul_f32_f16_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, 1);
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_f16->m, "matmul_f32_f16_m", matmul_f32_f16_fp32_len, matmul_f32_f16_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, 1);
@@ -1329,12 +1382,12 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) {
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->a_m, "matmul_q6_k_f32_aligned_m", matmul_q6_k_f32_aligned_fp32_len, matmul_q6_k_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->a_s, "matmul_q6_k_f32_aligned_s", matmul_q6_k_f32_aligned_fp32_len, matmul_q6_k_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align);

- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->l, "matmul_id_f32_l", matmul_id_f32_fp32_len, matmul_id_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_l, 1);
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->m, "matmul_id_f32_m", matmul_id_f32_fp32_len, matmul_id_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_m, 1);
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->s, "matmul_id_f32_s", matmul_id_f32_fp32_len, matmul_id_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_s, 1);
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->a_l, "matmul_id_f32_aligned_l", matmul_id_f32_aligned_fp32_len, matmul_id_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_l, l_align);
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->a_m, "matmul_id_f32_aligned_m", matmul_id_f32_aligned_fp32_len, matmul_id_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_m, m_align);
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->a_s, "matmul_id_f32_aligned_s", matmul_id_f32_aligned_fp32_len, matmul_id_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_s, s_align);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->l, "matmul_id_f32_l", matmul_id_f32_f32_fp32_len, matmul_id_f32_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_l, 1);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->m, "matmul_id_f32_m", matmul_id_f32_f32_fp32_len, matmul_id_f32_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_m, 1);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->s, "matmul_id_f32_s", matmul_id_f32_f32_fp32_len, matmul_id_f32_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_s, 1);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->a_l, "matmul_id_f32_aligned_l", matmul_id_f32_f32_aligned_fp32_len, matmul_id_f32_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_l, l_align);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->a_m, "matmul_id_f32_aligned_m", matmul_id_f32_f32_aligned_fp32_len, matmul_id_f32_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_m, m_align);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->a_s, "matmul_id_f32_aligned_s", matmul_id_f32_f32_aligned_fp32_len, matmul_id_f32_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_s, s_align);

  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f16->l, "matmul_id_f16_l", matmul_id_f16_fp32_len, matmul_id_f16_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_l, 1);
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f16->m, "matmul_id_f16_m", matmul_id_f16_fp32_len, matmul_id_f16_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_m, 1);
@@ -1429,11 +1482,11 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) {
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q5_0], "mul_mat_vec_q5_0_f32_f32", mul_mat_vec_q5_0_f32_f32_len, mul_mat_vec_q5_0_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q5_1], "mul_mat_vec_q5_1_f32_f32", mul_mat_vec_q5_1_f32_f32_len, mul_mat_vec_q5_1_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q8_0], "mul_mat_vec_q8_0_f32_f32", mul_mat_vec_q8_0_f32_f32_len, mul_mat_vec_q8_0_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q2_K], "mul_mat_vec_q2_K_f32_f32", mul_mat_vec_q2_K_f32_f32_len, mul_mat_vec_q2_K_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q3_K], "mul_mat_vec_q3_K_f32_f32", mul_mat_vec_q3_K_f32_f32_len, mul_mat_vec_q3_K_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q4_K], "mul_mat_vec_q4_K_f32_f32", mul_mat_vec_q4_K_f32_f32_len, mul_mat_vec_q4_K_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q5_K], "mul_mat_vec_q5_K_f32_f32", mul_mat_vec_q5_K_f32_f32_len, mul_mat_vec_q5_K_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q6_K], "mul_mat_vec_q6_K_f32_f32", mul_mat_vec_q6_K_f32_f32_len, mul_mat_vec_q6_K_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q2_K], "mul_mat_vec_q2_k_f32_f32", mul_mat_vec_q2_k_f32_f32_len, mul_mat_vec_q2_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q3_K], "mul_mat_vec_q3_k_f32_f32", mul_mat_vec_q3_k_f32_f32_len, mul_mat_vec_q3_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q4_K], "mul_mat_vec_q4_k_f32_f32", mul_mat_vec_q4_k_f32_f32_len, mul_mat_vec_q4_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q5_K], "mul_mat_vec_q5_k_f32_f32", mul_mat_vec_q5_k_f32_f32_len, mul_mat_vec_q5_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q6_K], "mul_mat_vec_q6_k_f32_f32", mul_mat_vec_q6_k_f32_f32_len, mul_mat_vec_q6_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);

  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_F32 ], "mul_mat_vec_f32_f16_f32", mul_mat_vec_f32_f16_f32_len, mul_mat_vec_f32_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_F16 ], "mul_mat_vec_f16_f16_f32", mul_mat_vec_f16_f16_f32_len, mul_mat_vec_f16_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
@@ -1442,11 +1495,11 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) {
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q5_0], "mul_mat_vec_q5_0_f16_f32", mul_mat_vec_q5_0_f16_f32_len, mul_mat_vec_q5_0_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q5_1], "mul_mat_vec_q5_1_f16_f32", mul_mat_vec_q5_1_f16_f32_len, mul_mat_vec_q5_1_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q8_0], "mul_mat_vec_q8_0_f16_f32", mul_mat_vec_q8_0_f16_f32_len, mul_mat_vec_q8_0_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q2_K], "mul_mat_vec_q2_K_f16_f32", mul_mat_vec_q2_K_f16_f32_len, mul_mat_vec_q2_K_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q3_K], "mul_mat_vec_q3_K_f16_f32", mul_mat_vec_q3_K_f16_f32_len, mul_mat_vec_q3_K_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q4_K], "mul_mat_vec_q4_K_f16_f32", mul_mat_vec_q4_K_f16_f32_len, mul_mat_vec_q4_K_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q5_K], "mul_mat_vec_q5_K_f16_f32", mul_mat_vec_q5_K_f16_f32_len, mul_mat_vec_q5_K_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q6_K], "mul_mat_vec_q6_K_f16_f32", mul_mat_vec_q6_K_f16_f32_len, mul_mat_vec_q6_K_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q2_K], "mul_mat_vec_q2_k_f16_f32", mul_mat_vec_q2_k_f16_f32_len, mul_mat_vec_q2_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q3_K], "mul_mat_vec_q3_k_f16_f32", mul_mat_vec_q3_k_f16_f32_len, mul_mat_vec_q3_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q4_K], "mul_mat_vec_q4_k_f16_f32", mul_mat_vec_q4_k_f16_f32_len, mul_mat_vec_q4_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q5_K], "mul_mat_vec_q5_k_f16_f32", mul_mat_vec_q5_k_f16_f32_len, mul_mat_vec_q5_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q6_K], "mul_mat_vec_q6_k_f16_f32", mul_mat_vec_q6_k_f16_f32_len, mul_mat_vec_q6_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);

  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_F32 ], "mul_mat_vec_id_f32_f32", mul_mat_vec_id_f32_f32_len, mul_mat_vec_id_f32_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_F16 ], "mul_mat_vec_id_f16_f32", mul_mat_vec_id_f16_f32_len, mul_mat_vec_id_f16_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
@@ -1455,11 +1508,11 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) {
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q5_0], "mul_mat_vec_id_q5_0_f32", mul_mat_vec_id_q5_0_f32_len, mul_mat_vec_id_q5_0_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q5_1], "mul_mat_vec_id_q5_1_f32", mul_mat_vec_id_q5_1_f32_len, mul_mat_vec_id_q5_1_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q8_0], "mul_mat_vec_id_q8_0_f32", mul_mat_vec_id_q8_0_f32_len, mul_mat_vec_id_q8_0_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q2_K], "mul_mat_vec_id_q2_K_f32", mul_mat_vec_id_q2_K_f32_len, mul_mat_vec_id_q2_K_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q3_K], "mul_mat_vec_id_q3_K_f32", mul_mat_vec_id_q3_K_f32_len, mul_mat_vec_id_q3_K_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q4_K], "mul_mat_vec_id_q4_K_f32", mul_mat_vec_id_q4_K_f32_len, mul_mat_vec_id_q4_K_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q5_K], "mul_mat_vec_id_q5_K_f32", mul_mat_vec_id_q5_K_f32_len, mul_mat_vec_id_q5_K_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q6_K], "mul_mat_vec_id_q6_K_f32", mul_mat_vec_id_q6_K_f32_len, mul_mat_vec_id_q6_K_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q2_K], "mul_mat_vec_id_q2_k_f32", mul_mat_vec_id_q2_k_f32_len, mul_mat_vec_id_q2_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q3_K], "mul_mat_vec_id_q3_k_f32", mul_mat_vec_id_q3_k_f32_len, mul_mat_vec_id_q3_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q4_K], "mul_mat_vec_id_q4_k_f32", mul_mat_vec_id_q4_k_f32_len, mul_mat_vec_id_q4_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q5_K], "mul_mat_vec_id_q5_k_f32", mul_mat_vec_id_q5_k_f32_len, mul_mat_vec_id_q5_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q6_K], "mul_mat_vec_id_q6_k_f32", mul_mat_vec_id_q6_k_f32_len, mul_mat_vec_id_q6_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);

  // dequant shaders
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_F32 ], "f32_to_f16", dequant_f32_len, dequant_f32_data, "main", 2, 5 * sizeof(uint32_t), {256 * 16, 1, 1}, {}, 1);
@@ -1468,11 +1521,11 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) {
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_Q5_0], "dequant_q5_0", dequant_q5_0_len, dequant_q5_0_data, "main", 2, 5 * sizeof(uint32_t), {256 * 16, 1, 1}, {}, 1);
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_Q5_1], "dequant_q5_1", dequant_q5_1_len, dequant_q5_1_data, "main", 2, 5 * sizeof(uint32_t), {256 * 16, 1, 1}, {}, 1);
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_Q8_0], "dequant_q8_0", dequant_q8_0_len, dequant_q8_0_data, "main", 2, 5 * sizeof(uint32_t), {256 * 16, 1, 1}, {}, 1);
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_Q2_K], "dequant_q2_K", dequant_q2_K_len, dequant_q2_K_data, "main", 2, 5 * sizeof(uint32_t), {256 * 64, 1, 1}, {}, 1);
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_Q3_K], "dequant_q3_K", dequant_q3_K_len, dequant_q3_K_data, "main", 2, 5 * sizeof(uint32_t), {256 * 64, 1, 1}, {}, 1);
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_Q4_K], "dequant_q4_K", dequant_q4_K_len, dequant_q4_K_data, "main", 2, 5 * sizeof(uint32_t), {256 * 32, 1, 1}, {}, 1);
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_Q5_K], "dequant_q5_K", dequant_q5_K_len, dequant_q5_K_data, "main", 2, 5 * sizeof(uint32_t), {256 * 64, 1, 1}, {}, 1);
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_Q6_K], "dequant_q6_K", dequant_q6_K_len, dequant_q6_K_data, "main", 2, 5 * sizeof(uint32_t), {256 * 64, 1, 1}, {}, 1);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_Q2_K], "dequant_q2_k", dequant_q2_k_len, dequant_q2_k_data, "main", 2, 5 * sizeof(uint32_t), {256 * 64, 1, 1}, {}, 1);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_Q3_K], "dequant_q3_k", dequant_q3_k_len, dequant_q3_k_data, "main", 2, 5 * sizeof(uint32_t), {256 * 64, 1, 1}, {}, 1);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_Q4_K], "dequant_q4_k", dequant_q4_k_len, dequant_q4_k_data, "main", 2, 5 * sizeof(uint32_t), {256 * 32, 1, 1}, {}, 1);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_Q5_K], "dequant_q5_k", dequant_q5_k_len, dequant_q5_k_data, "main", 2, 5 * sizeof(uint32_t), {256 * 64, 1, 1}, {}, 1);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_Q6_K], "dequant_q6_k", dequant_q6_k_len, dequant_q6_k_data, "main", 2, 5 * sizeof(uint32_t), {256 * 64, 1, 1}, {}, 1);

  // get_rows
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[GGML_TYPE_F32 ], "get_rows_f32", get_rows_f32_len, get_rows_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), { 512, 1, 1}, {}, 1);
@@ -1538,9 +1591,7 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) {
  static void ggml_vk_print_gpu_info(size_t idx) {
  GGML_ASSERT(idx < vk_instance.device_indices.size());
  size_t dev_num = vk_instance.device_indices[idx];
- #ifdef GGML_VULKAN_DEBUG
- std::cerr << "ggml_vk_print_gpu_info(" << dev_num << ")" << std::endl;
- #endif
+ VK_LOG_DEBUG("ggml_vk_print_gpu_info(" << dev_num << ")");
  GGML_ASSERT(vk_instance.initialized);

  std::vector<vk::PhysicalDevice> devices = vk_instance.instance.enumeratePhysicalDevices();
@@ -1617,9 +1668,7 @@ void ggml_vk_instance_init() {
  if (vk_instance_initialized) {
  return;
  }
- #ifdef GGML_VULKAN_DEBUG
- std::cerr << "ggml_vk_instance_init()" << std::endl;
- #endif
+ VK_LOG_DEBUG("ggml_vk_instance_init()");

  vk::ApplicationInfo app_info{ "ggml-vulkan", 1, nullptr, 0, VK_API_VERSION };

@@ -1696,33 +1745,37 @@ void ggml_vk_instance_init() {

  // Default to using all dedicated GPUs
  for (size_t i = 0; i < devices.size(); i++) {
- vk::PhysicalDeviceProperties props = devices[i].getProperties();
-
- if (props.deviceType == vk::PhysicalDeviceType::eDiscreteGpu) {
+ vk::PhysicalDeviceProperties2 new_props;
+ vk::PhysicalDeviceDriverProperties new_driver;
+ vk::PhysicalDeviceIDProperties new_id;
+ new_props.pNext = &new_driver;
+ new_driver.pNext = &new_id;
+ devices[i].getProperties2(&new_props);
+
+ if (new_props.properties.deviceType == vk::PhysicalDeviceType::eDiscreteGpu) {
  // Check if there are two physical devices corresponding to the same GPU
  auto old_device = std::find_if(
  vk_instance.device_indices.begin(),
  vk_instance.device_indices.end(),
- [&devices, &props](const size_t k){ return devices[k].getProperties().deviceID == props.deviceID; }
+ [&devices, &new_id](const size_t k){
+ vk::PhysicalDeviceProperties2 old_props;
+ vk::PhysicalDeviceIDProperties old_id;
+ old_props.pNext = &old_id;
+ devices[k].getProperties2(&old_props);
+ return std::equal(std::begin(old_id.deviceUUID), std::end(old_id.deviceUUID), std::begin(new_id.deviceUUID));
+ }
  );
  if (old_device == vk_instance.device_indices.end()) {
  vk_instance.device_indices.push_back(i);
  } else {
  // There can be two physical devices corresponding to the same GPU if there are 2 different drivers
  // This can cause error when splitting layers aross the devices, need to keep only 1
- #ifdef GGML_VULKAN_DEBUG
- std::cerr << "Device " << i << " and device " << *old_device << " have the same device id" << std::endl;
- #endif
+ VK_LOG_DEBUG("Device " << i << " and device " << *old_device << " have the same deviceUUID");

- vk::PhysicalDeviceProperties2 old_prop;
+ vk::PhysicalDeviceProperties2 old_props;
  vk::PhysicalDeviceDriverProperties old_driver;
- old_prop.pNext = &old_driver;
- devices[*old_device].getProperties2(&old_prop);
-
- vk::PhysicalDeviceProperties2 new_prop;
- vk::PhysicalDeviceDriverProperties new_driver;
- new_prop.pNext = &new_driver;
- devices[i].getProperties2(&new_prop);
+ old_props.pNext = &old_driver;
+ devices[*old_device].getProperties2(&old_props);

  std::map<vk::DriverId, int> driver_priorities {};
  int old_priority = std::numeric_limits<int>::max();
@@ -1730,7 +1783,7 @@ void ggml_vk_instance_init() {

  // Check https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkDriverId.html for the list of driver id
  // Smaller number -> higher priority
- switch (old_prop.properties.vendorID) {
+ switch (old_props.properties.vendorID) {
  case VK_VENDOR_ID_AMD:
  driver_priorities[vk::DriverId::eMesaRadv] = 1;
  driver_priorities[vk::DriverId::eAmdOpenSource] = 2;
@@ -1760,16 +1813,11 @@ void ggml_vk_instance_init() {
  vk_instance.device_indices.erase(r, vk_instance.device_indices.end());
  vk_instance.device_indices.push_back(i);

- #ifdef GGML_VULKAN_DEBUG
- std::cerr << "Prioritize device " << i << " driver " << new_driver.driverName << " over device " << *old_device << " driver " << old_driver.driverName << std::endl;
- #endif
+ VK_LOG_DEBUG("Prioritize device " << i << " driver " << new_driver.driverName << " over device " << *old_device << " driver " << old_driver.driverName);
  }
- #ifdef GGML_VULKAN_DEBUG
  else {
- std::cerr << "Prioritize device " << *old_device << " driver " << old_driver.driverName << " over device " << i << " driver " << new_driver.driverName << std::endl;
-
+ VK_LOG_DEBUG("Prioritize device " << *old_device << " driver " << old_driver.driverName << " over device " << i << " driver " << new_driver.driverName << std::endl);
  }
- #endif
  }
  }
  }
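The deduplication logic above now identifies "the same GPU seen through two drivers" by comparing VkPhysicalDeviceIDProperties::deviceUUID, queried through getProperties2 with a pNext chain, rather than the plain deviceID returned by getProperties. Roughly, the query pattern looks like the sketch below (Vulkan-Hpp types; same_gpu is a hypothetical helper for illustration, not a function in the diff):

    #include <algorithm>
    #include <vulkan/vulkan.hpp>

    // Returns true when two physical devices report the same deviceUUID,
    // i.e. the same GPU exposed by two different drivers. Mirrors the
    // comparison added to ggml_vk_instance_init(); illustrative only.
    static bool same_gpu(vk::PhysicalDevice a, vk::PhysicalDevice b) {
        vk::PhysicalDeviceProperties2 props_a, props_b;
        vk::PhysicalDeviceIDProperties id_a, id_b;
        props_a.pNext = &id_a;   // chain the ID struct behind the base struct
        props_b.pNext = &id_b;
        a.getProperties2(&props_a);
        b.getProperties2(&props_b);
        return std::equal(std::begin(id_a.deviceUUID), std::end(id_a.deviceUUID),
                          std::begin(id_b.deviceUUID));
    }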
@@ -1792,9 +1840,7 @@ void ggml_vk_instance_init() {
  static void ggml_vk_init(ggml_backend_vk_context * ctx, size_t idx) {
  GGML_ASSERT(idx < vk_instance.device_indices.size());
  size_t dev_num = vk_instance.device_indices[idx];
- #ifdef GGML_VULKAN_DEBUG
- std::cerr << "ggml_vk_init(" << ctx->name << ", " << dev_num << ")" << std::endl;
- #endif
+ VK_LOG_DEBUG("ggml_vk_init(" << ctx->name << ", " << dev_num << ")");
  ggml_vk_instance_init();

  std::vector<vk::PhysicalDevice> devices = vk_instance.instance.enumeratePhysicalDevices();
@@ -1967,9 +2013,7 @@ static void ggml_vk_init(ggml_backend_vk_context * ctx, size_t idx) {
  }

  static vk_pipeline ggml_vk_get_to_fp16(ggml_backend_vk_context * ctx, ggml_type type) {
- #ifdef GGML_VULKAN_DEBUG
- std::cerr << "ggml_vk_get_to_fp16()" << std::endl;
- #endif
+ VK_LOG_DEBUG("ggml_vk_get_to_fp16()");
  switch (type) {
  case GGML_TYPE_F32:
  case GGML_TYPE_Q4_0:
@@ -1991,9 +2035,7 @@ static vk_pipeline ggml_vk_get_to_fp16(ggml_backend_vk_context * ctx, ggml_type
  }

  static vk_matmul_pipeline ggml_vk_get_mul_mat_mat_pipeline(ggml_backend_vk_context * ctx, ggml_type src0_type, ggml_type src1_type) {
- #ifdef GGML_VULKAN_DEBUG
- std::cerr << "ggml_vk_get_mul_mat_mat_pipeline()" << std::endl;
- #endif
+ VK_LOG_DEBUG("ggml_vk_get_mul_mat_mat_pipeline()");
  if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F32) {
  return ctx->device->pipeline_matmul_f32;
  }
@@ -2029,9 +2071,7 @@ static vk_matmul_pipeline ggml_vk_get_mul_mat_mat_pipeline(ggml_backend_vk_conte
  }

  static vk_pipeline ggml_vk_get_dequantize_mul_mat_vec(ggml_backend_vk_context * ctx, ggml_type a_type, ggml_type b_type) {
- #ifdef GGML_VULKAN_DEBUG
- std::cerr << "ggml_vk_get_dequantize_mul_mat_vec()" << std::endl;
- #endif
+ VK_LOG_DEBUG("ggml_vk_get_dequantize_mul_mat_vec()");
  GGML_ASSERT(b_type == GGML_TYPE_F32 || b_type == GGML_TYPE_F16);

  switch (a_type) {
@@ -2056,9 +2096,7 @@ static vk_pipeline ggml_vk_get_dequantize_mul_mat_vec(ggml_backend_vk_context *
  }

  static vk_matmul_pipeline ggml_vk_get_mul_mat_mat_id_pipeline(ggml_backend_vk_context * ctx, ggml_type src0_type, ggml_type src1_type) {
- #ifdef GGML_VULKAN_DEBUG
- std::cerr << "ggml_vk_get_mul_mat_mat_id_pipeline()" << std::endl;
- #endif
+ VK_LOG_DEBUG("ggml_vk_get_mul_mat_mat_id_pipeline()");
  if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F32) {
  return ctx->device->pipeline_matmul_id_f32;
  }
@@ -2091,9 +2129,7 @@ static vk_matmul_pipeline ggml_vk_get_mul_mat_mat_id_pipeline(ggml_backend_vk_co
  }

  static vk_pipeline ggml_vk_get_dequantize_mul_mat_vec_id(ggml_backend_vk_context * ctx, ggml_type a_type, ggml_type b_type) {
- #ifdef GGML_VULKAN_DEBUG
- std::cerr << "ggml_vk_get_dequantize_mul_mat_vec()" << std::endl;
- #endif
+ VK_LOG_DEBUG("ggml_vk_get_dequantize_mul_mat_vec()");
  GGML_ASSERT(b_type == GGML_TYPE_F32);

  switch (a_type) {
@@ -2118,9 +2154,9 @@ static vk_pipeline ggml_vk_get_dequantize_mul_mat_vec_id(ggml_backend_vk_context
  }

  static vk_buffer ggml_vk_pool_malloc(ggml_backend_vk_context * ctx, size_t size) {
- #ifdef GGML_VULKAN_DEBUG
- std::cerr << "ggml_vk_pool_malloc(" << size << ")" << std::endl;
- #endif
+ VK_LOG_DEBUG("ggml_vk_pool_malloc(" << size << ")");
+ VK_LOG_MEMORY("ggml_vk_pool_malloc");
+
  int best_i = -1;
  size_t best_size = std::numeric_limits<size_t>::max(); //smallest unused buffer that fits our needs
  int worst_i = -1;
@@ -2148,13 +2184,11 @@ static vk_buffer ggml_vk_pool_malloc(ggml_backend_vk_context * ctx, size_t size)
  ggml_vk_destroy_buffer(b);
  }

- return ggml_vk_create_buffer_check(ctx, size, vk::MemoryPropertyFlagBits::eDeviceLocal);
+ return ggml_vk_create_buffer_device(ctx, size);
  }

  static void ggml_vk_pool_free(ggml_backend_vk_context * ctx, vk_buffer& buffer) {
- #ifdef GGML_VULKAN_DEBUG
- std::cerr << "ggml_vk_pool_free(" << buffer->size << ")" << std::endl;
- #endif
+ VK_LOG_DEBUG("ggml_vk_pool_free(" << buffer->size << ")");
  for (int i = 0; i < MAX_VK_BUFFERS; ++i) {
  vk_buffer& b = ctx->buffer_pool[i];
  if (b == nullptr) {
@@ -2175,6 +2209,8 @@ static vk_buffer ggml_vk_create_buffer_temp(ggml_backend_vk_context * ctx, size_
  }
  }

+ VK_LOG_MEMORY("ggml_vk_create_buffer_temp(" << size << ")");
+
  // Otherwise create new buffer
  vk_buffer buf = ggml_vk_pool_malloc(ctx, size);
  ctx->gc.temp_buffers.push_back(buf);
@@ -2183,9 +2219,7 @@
  }

  static void * ggml_vk_host_malloc(ggml_backend_vk_context * ctx, size_t size) {
- #ifdef GGML_VULKAN_DEBUG
- std::cerr << "ggml_vk_host_malloc(" << size << ")" << std::endl;
- #endif
+ VK_LOG_MEMORY("ggml_vk_host_malloc(" << size << ")");
  vk_buffer buf = ggml_vk_create_buffer(ctx, size,
  vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached,
  vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
@@ -2207,9 +2241,7 @@ static void ggml_vk_host_free(ggml_backend_vk_context * ctx, void* ptr) {
  if (ptr == nullptr) {
  return;
  }
- #ifdef GGML_VULKAN_DEBUG
- std::cerr << "ggml_vk_host_free(" << ptr << ")" << std::endl;
- #endif
+ VK_LOG_MEMORY("ggml_vk_host_free(" << ptr << ")");
  vk_buffer buf;
  size_t index;
  for (size_t i = 0; i < ctx->pinned_memory.size(); i++) {
@@ -2261,13 +2293,11 @@ static void ggml_vk_dispatch_pipeline(ggml_backend_vk_context * ctx, vk_context
  const uint32_t wg0 = CEIL_DIV(elements[0], pipeline->wg_denoms[0]);
  const uint32_t wg1 = CEIL_DIV(elements[1], pipeline->wg_denoms[1]);
  const uint32_t wg2 = CEIL_DIV(elements[2], pipeline->wg_denoms[2]);
- #ifdef GGML_VULKAN_DEBUG
- std::cerr << "ggml_vk_dispatch_pipeline(" << pipeline->name << ", {";
+ VK_LOG_DEBUG("ggml_vk_dispatch_pipeline(" << pipeline->name << ", {";
  for (auto& buffer : buffers) {
  std::cerr << "(" << buffer.buffer << ", " << buffer.offset << ", " << buffer.size << "), ";
  }
- std::cerr << "}, (" << wg0 << "," << wg1 << "," << wg2 << "))" << std::endl;
- #endif
+ std::cerr << "}, (" << wg0 << "," << wg1 << "," << wg2 << "))");
  std::vector<vk::DescriptorBufferInfo> descriptor_buffer_infos;
  std::vector<vk::WriteDescriptorSet> write_descriptor_sets;
  GGML_ASSERT(pipeline->descriptor_set_idx < pipeline->descriptor_sets.size());
@@ -2300,9 +2330,7 @@ static void ggml_vk_end_submission(vk_submission& s, std::vector<vk_semaphore> w
  }

  static void ggml_vk_ctx_end(vk_context * ctx) {
- #ifdef GGML_VULKAN_DEBUG
- std::cerr << "ggml_vk_ctx_end(" << ctx << ", " << ctx->seqs.size() << ")" << std::endl;
- #endif
+ VK_LOG_DEBUG("ggml_vk_ctx_end(" << ctx << ", " << ctx->seqs.size() << ")");
  if (ctx->s == nullptr) {
  return;
  }
@@ -2312,9 +2340,7 @@ static void ggml_vk_ctx_end(vk_context * ctx) {
2312
2340
  }
2313
2341
 
2314
2342
  static void ggml_vk_ctx_begin(ggml_backend_vk_context * ctx, vk_context * subctx) {
2315
- #ifdef GGML_VULKAN_DEBUG
2316
- std::cerr << "ggml_vk_ctx_begin(" << ctx << ")" << std::endl;
2317
- #endif
2343
+ VK_LOG_DEBUG("ggml_vk_ctx_begin(" << ctx << ")");
2318
2344
  if (subctx->s != nullptr) {
2319
2345
  ggml_vk_ctx_end(subctx);
2320
2346
  }
@@ -2324,9 +2350,7 @@ static void ggml_vk_ctx_begin(ggml_backend_vk_context * ctx, vk_context * subctx
2324
2350
  }
2325
2351
 
2326
2352
  static size_t ggml_vk_align_size(size_t width, size_t align) {
2327
- #ifdef GGML_VULKAN_DEBUG
2328
- std::cerr << "ggml_vk_align_size(" << width << ", " << align << ")" << std::endl;
2329
- #endif
2353
+ VK_LOG_DEBUG("ggml_vk_align_size(" << width << ", " << align << ")");
2330
2354
  return CEIL_DIV(width, align) * align;
2331
2355
  }
2332
2356
 
@@ -2340,6 +2364,7 @@ static void deferred_memcpy(void * dst, const void * src, size_t size, std::vect
2340
2364
 
2341
2365
  static void ggml_vk_ensure_sync_staging_buffer(ggml_backend_vk_context * ctx, size_t size) {
2342
2366
  if (ctx->sync_staging == nullptr || ctx->sync_staging->size < size) {
2367
+ VK_LOG_MEMORY("ggml_vk_ensure_sync_staging_buffer(" << size << ")");
2343
2368
  ggml_vk_destroy_buffer(ctx->sync_staging);
2344
2369
  ctx->sync_staging = ggml_vk_create_buffer_check(ctx, size,
2345
2370
  vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached,
@@ -2348,9 +2373,7 @@ static void ggml_vk_ensure_sync_staging_buffer(ggml_backend_vk_context * ctx, si
2348
2373
  }
2349
2374
 
2350
2375
  static void ggml_vk_buffer_write_nc_async(ggml_backend_vk_context * ctx, vk_context * subctx, vk_buffer& dst, size_t offset, const ggml_tensor * tensor, bool sync_staging = false) {
2351
- #ifdef GGML_VULKAN_DEBUG
2352
- std::cerr << "ggml_vk_buffer_write_nc_async(" << tensor << ")" << std::endl;
2353
- #endif
2376
+ VK_LOG_DEBUG("ggml_vk_buffer_write_nc_async(" << tensor << ")");
2354
2377
  GGML_ASSERT(!ggml_is_contiguous(tensor));
2355
2378
  // Buffer is already mapped
2356
2379
  if(dst->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible) {
@@ -2455,9 +2478,7 @@ static void ggml_vk_buffer_write_nc_async(ggml_backend_vk_context * ctx, vk_cont
2455
2478
  }
2456
2479
 
2457
2480
  static void ggml_vk_buffer_write_2d_async(ggml_backend_vk_context * ctx, vk_context * subctx, vk_buffer& dst, size_t offset, const void * src, size_t spitch, size_t width, size_t height, bool sync_staging = false) {
2458
- #ifdef GGML_VULKAN_DEBUG
2459
- std::cerr << "ggml_vk_buffer_write_2d_async(" << width << ", " << height << ")" << std::endl;
2460
- #endif
2481
+ VK_LOG_DEBUG("ggml_vk_buffer_write_2d_async(" << width << ", " << height << ")");
2461
2482
  // Make sure ctx owns the buffer
2462
2483
  GGML_ASSERT(dst->ctx == ctx);
2463
2484
 
@@ -2492,9 +2513,7 @@ static void ggml_vk_buffer_write_2d_async(ggml_backend_vk_context * ctx, vk_cont
2492
2513
  subctx->s->buffer.copyBuffer(buf->buffer, dst->buffer, slices);
2493
2514
  return;
2494
2515
  }
2495
- #ifdef GGML_VULKAN_DEBUG
2496
- std::cerr << "STAGING" << std::endl;
2497
- #endif
2516
+ VK_LOG_DEBUG("STAGING");
2498
2517
 
2499
2518
  // Staging buffer required
2500
2519
  vk_buffer staging = ctx->staging;
@@ -2529,16 +2548,12 @@ static void ggml_vk_buffer_write_2d_async(ggml_backend_vk_context * ctx, vk_cont
2529
2548
  }
2530
2549
 
2531
2550
  static void ggml_vk_buffer_write_async(ggml_backend_vk_context * ctx, vk_context * subctx, vk_buffer& dst, size_t offset, const void * src, size_t size, bool sync_staging = false) {
2532
- #ifdef GGML_VULKAN_DEBUG
2533
- std::cerr << "ggml_vk_buffer_write_async(" << size << ")" << std::endl;
2534
- #endif
2551
+ VK_LOG_DEBUG("ggml_vk_buffer_write_async(" << size << ")");
2535
2552
  return ggml_vk_buffer_write_2d_async(ctx, subctx, dst, offset, src, size, size, 1, sync_staging);
2536
2553
  }
2537
2554
 
2538
2555
  static void ggml_vk_buffer_write_2d(ggml_backend_vk_context * ctx, vk_buffer& dst, size_t offset, const void * src, size_t spitch, size_t width, size_t height) {
2539
- #ifdef GGML_VULKAN_DEBUG
2540
- std::cerr << "ggml_vk_buffer_write_2d(" << width << ", " << height << ")" << std::endl;
2541
- #endif
2556
+ VK_LOG_DEBUG("ggml_vk_buffer_write_2d(" << width << ", " << height << ")");
2542
2557
  // Buffer is already mapped
2543
2558
  if(dst->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible) {
2544
2559
  GGML_ASSERT(dst->memory_property_flags & vk::MemoryPropertyFlagBits::eHostCoherent);
@@ -2563,16 +2578,12 @@ static void ggml_vk_buffer_write_2d(ggml_backend_vk_context * ctx, vk_buffer& ds
2563
2578
  }
2564
2579
 
2565
2580
  static void ggml_vk_buffer_write(ggml_backend_vk_context * ctx, vk_buffer& dst, size_t offset, const void * src, size_t size) {
2566
- #ifdef GGML_VULKAN_DEBUG
2567
- std::cerr << "ggml_vk_buffer_write(" << size << ")" << std::endl;
2568
- #endif
2581
+ VK_LOG_DEBUG("ggml_vk_buffer_write(" << size << ")");
2569
2582
  ggml_vk_buffer_write_2d(ctx, dst, offset, src, 0, size, 1);
2570
2583
  }
2571
2584
 
2572
2585
  static void ggml_vk_buffer_read_2d_async(ggml_backend_vk_context * ctx, vk_context * subctx, vk_buffer& src, size_t offset, void * dst, size_t spitch, size_t dpitch, size_t width, size_t height, bool sync_staging = false) {
2573
- #ifdef GGML_VULKAN_DEBUG
2574
- std::cerr << "ggml_vk_buffer_read_2d_async(offset=" << offset << ", width=" << width << ", height=" << height << ")" << std::endl;
2575
- #endif
2586
+ VK_LOG_DEBUG("ggml_vk_buffer_read_2d_async(offset=" << offset << ", width=" << width << ", height=" << height << ")");
2576
2587
  GGML_ASSERT(width > 0);
2577
2588
  GGML_ASSERT(height > 0);
2578
2589
  GGML_ASSERT(src != nullptr);
@@ -2606,9 +2617,7 @@ static void ggml_vk_buffer_read_2d_async(ggml_backend_vk_context * ctx, vk_conte
2606
2617
 
2607
2618
  return;
2608
2619
  }
2609
- #ifdef GGML_VULKAN_DEBUG
2610
- std::cerr << "STAGING" << std::endl;
2611
- #endif
2620
+ VK_LOG_DEBUG("STAGING");
2612
2621
 
2613
2622
  // Fall back to staging buffer
2614
2623
  vk_buffer staging = ctx->staging;
@@ -2635,9 +2644,7 @@ static void ggml_vk_buffer_read_async(ggml_backend_vk_context * ctx, vk_context
2635
2644
  }
2636
2645
 
2637
2646
  static void ggml_vk_buffer_read(ggml_backend_vk_context * ctx, vk_buffer& src, size_t offset, void * dst, size_t size) {
2638
- #ifdef GGML_VULKAN_DEBUG
2639
- std::cerr << "ggml_vk_buffer_read(" << offset << ", " << size << ")" << std::endl;
2640
- #endif
2647
+ VK_LOG_DEBUG("ggml_vk_buffer_read(" << offset << ", " << size << ")");
2641
2648
  if(src->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible) {
2642
2649
  GGML_ASSERT(src->memory_property_flags & vk::MemoryPropertyFlagBits::eHostCoherent);
2643
2650
 
@@ -2659,9 +2666,7 @@ static void ggml_vk_buffer_read(ggml_backend_vk_context * ctx, vk_buffer& src, s
2659
2666
  }
2660
2667
 
2661
2668
  static void ggml_vk_buffer_copy_async(vk_context * ctx, vk_buffer& dst, size_t dst_offset, vk_buffer& src, size_t src_offset, size_t size) {
2662
- #ifdef GGML_VULKAN_DEBUG
2663
- std::cerr << "ggml_vk_buffer_copy_async(" << size << ")" << std::endl;
2664
- #endif
2669
+ VK_LOG_DEBUG("ggml_vk_buffer_copy_async(" << size << ")");
2665
2670
  // Make sure both buffers are on same ctx
2666
2671
  GGML_ASSERT(src->ctx == dst->ctx);
2667
2672
 
@@ -2672,9 +2677,7 @@ static void ggml_vk_buffer_copy_async(vk_context * ctx, vk_buffer& dst, size_t d
2672
2677
 
2673
2678
  static void ggml_vk_buffer_copy(vk_buffer& dst, size_t dst_offset, vk_buffer& src, size_t src_offset, size_t size) {
2674
2679
  if (src->ctx == dst->ctx) {
2675
- #ifdef GGML_VULKAN_DEBUG
2676
- std::cerr << "ggml_vk_buffer_copy(SINGLE_DEVICE, " << size << ")" << std::endl;
2677
- #endif
2680
+ VK_LOG_DEBUG("ggml_vk_buffer_copy(SINGLE_DEVICE, " << size << ")");
2678
2681
  // Copy within the device
2679
2682
  ggml_backend_vk_context * ctx = src->ctx;
2680
2683
 
@@ -2686,9 +2689,7 @@ static void ggml_vk_buffer_copy(vk_buffer& dst, size_t dst_offset, vk_buffer& sr
2686
2689
  VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "vk_buffer_copy waitForFences");
2687
2690
  ctx->device->device.resetFences({ ctx->fence });
2688
2691
  } else {
2689
- #ifdef GGML_VULKAN_DEBUG
2690
- std::cerr << "ggml_vk_buffer_copy(MULTI_DEVICE, " << size << ")" << std::endl;
2691
- #endif
2692
+ VK_LOG_DEBUG("ggml_vk_buffer_copy(MULTI_DEVICE, " << size << ")");
2692
2693
  // Copy device to device
2693
2694
  ggml_backend_vk_context * src_ctx = src->ctx;
2694
2695
  ggml_backend_vk_context * dst_ctx = dst->ctx;
@@ -2706,9 +2707,7 @@ static void ggml_vk_buffer_copy(vk_buffer& dst, size_t dst_offset, vk_buffer& sr
2706
2707
  }
2707
2708
 
2708
2709
  static void ggml_vk_buffer_memset(ggml_backend_vk_context * ctx, vk_buffer& dst, size_t offset, uint32_t c, size_t size) {
2709
- #ifdef GGML_VULKAN_DEBUG
2710
- std::cerr << "ggml_vk_buffer_memset(" << offset << ", " << c << ", " << size << ")" << std::endl;
2711
- #endif
2710
+ VK_LOG_DEBUG("ggml_vk_buffer_memset(" << offset << ", " << c << ", " << size << ")");
2712
2711
  // Make sure ctx owns the buffer
2713
2712
  GGML_ASSERT(dst->ctx == ctx);
2714
2713
 
@@ -2723,9 +2722,7 @@ static void ggml_vk_buffer_memset(ggml_backend_vk_context * ctx, vk_buffer& dst,
2723
2722
  }
2724
2723
 
2725
2724
  static void ggml_vk_h2d_tensor_2d(ggml_backend_vk_context * ctx, vk_context * subctx, vk_buffer& dst, size_t offset, const ggml_tensor * src, uint64_t i3, uint64_t i2, uint64_t i1) {
2726
- #ifdef GGML_VULKAN_DEBUG
2727
- std::cerr << "ggml_vk_h2d_tensor_2d(dst=" << dst << ", offset=" << offset << ", src=" << src << ", i3=" << i3 << ", i2=" << i2 << ", i1=" << i1 << ")" << std::endl;
2728
- #endif
2725
+ VK_LOG_DEBUG("ggml_vk_h2d_tensor_2d(dst=" << dst << ", offset=" << offset << ", src=" << src << ", i3=" << i3 << ", i2=" << i2 << ", i1=" << i1 << ")");
2729
2726
  const uint64_t ne0 = src->ne[0];
2730
2727
  const uint64_t ne1 = src->ne[1];
2731
2728
  const uint64_t nb0 = src->nb[0];
@@ -2753,9 +2750,7 @@ static void ggml_vk_h2d_tensor_2d(ggml_backend_vk_context * ctx, vk_context * su
2753
2750
  }
2754
2751
 
2755
2752
  static void ggml_vk_d2h_tensor_2d(ggml_backend_vk_context * ctx, vk_context * subctx, vk_buffer& src, size_t offset, const ggml_tensor * dst) {
2756
- #ifdef GGML_VULKAN_DEBUG
2757
- std::cerr << "ggml_vk_d2h_tensor_2d()" << std::endl;
2758
- #endif
2753
+ VK_LOG_DEBUG("ggml_vk_d2h_tensor_2d()");
2759
2754
  const uint64_t ne0 = dst->ne[0];
2760
2755
  const uint64_t ne1 = dst->ne[1];
2761
2756
  const uint64_t ne2 = dst->ne[2];
@@ -2779,9 +2774,7 @@ static void ggml_vk_d2h_tensor_2d(ggml_backend_vk_context * ctx, vk_context * su
2779
2774
  }
2780
2775
 
2781
2776
  static uint32_t ggml_vk_guess_split_k(int m, int n, int k) {
2782
- #ifdef GGML_VULKAN_DEBUG
2783
- std::cerr << "ggml_vk_guess_split_k(" << m << ", " << n << ", " << k << ")" << std::endl;
2784
- #endif
2777
+ VK_LOG_DEBUG("ggml_vk_guess_split_k(" << m << ", " << n << ", " << k << ")");
2785
2778
  // if (k > 128 && (m < 128 || n < 128) && m > 2 && n > 2) {
2786
2779
  // return 4;
2787
2780
  // }
@@ -2813,9 +2806,7 @@ static vk_pipeline ggml_vk_guess_matmul_pipeline_intel(ggml_backend_vk_context *
2813
2806
  }
2814
2807
 
2815
2808
  static vk_pipeline ggml_vk_guess_matmul_pipeline(ggml_backend_vk_context * ctx, vk_matmul_pipeline& mmp, int m, int n, bool aligned) {
2816
- #ifdef GGML_VULKAN_DEBUG
2817
- std::cerr << "ggml_vk_guess_matmul_pipeline(" << m << ", " << n << ", " << aligned << ")" << std::endl;
2818
- #endif
2809
+ VK_LOG_DEBUG("ggml_vk_guess_matmul_pipeline(" << m << ", " << n << ", " << aligned << ")");
2819
2810
  switch (ctx->device->vendor_id) {
2820
2811
  case VK_VENDOR_ID_AMD:
2821
2812
  return ggml_vk_guess_matmul_pipeline_amd(ctx, mmp, m, n, aligned);
@@ -2837,9 +2828,7 @@ static vk_pipeline ggml_vk_guess_matmul_pipeline(ggml_backend_vk_context * ctx,
2837
2828
  }
2838
2829
 
2839
2830
  static uint32_t ggml_vk_guess_matmul_pipeline_align(ggml_backend_vk_context * ctx, vk_matmul_pipeline& mmp, int m, int n) {
2840
- #ifdef GGML_VULKAN_DEBUG
2841
- std::cerr << "ggml_vk_guess_matmul_pipeline_align(" << m << ", " << n << ")" << std::endl;
2842
- #endif
2831
+ VK_LOG_DEBUG("ggml_vk_guess_matmul_pipeline_align(" << m << ", " << n << ")");
2843
2832
  return ggml_vk_guess_matmul_pipeline(ctx, mmp, m, n, true)->align;
2844
2833
  }
2845
2834
 
@@ -2849,9 +2838,7 @@ static void ggml_vk_matmul(
2849
2838
  uint32_t m, uint32_t n, uint32_t k, uint32_t stride_a, uint32_t stride_b, uint32_t stride_d,
2850
2839
  uint32_t batch_stride_a, uint32_t batch_stride_b, uint32_t batch_stride_d,
2851
2840
  uint32_t split_k, uint32_t batch, uint32_t ne02, uint32_t ne12, uint32_t broadcast2, uint32_t broadcast3) {
2852
- #ifdef GGML_VULKAN_DEBUG
2853
- std::cerr << "ggml_vk_matmul(a: (" << a.buffer->buffer << ", " << a.offset << ", " << a.size << "), b: (" << b.buffer->buffer << ", " << b.offset << ", " << b.size << "), d: (" << d.buffer->buffer << ", " << d.offset << ", " << d.size << "), split_k: (" << (split_k_buffer.buffer != nullptr ? split_k_buffer.buffer->buffer : VK_NULL_HANDLE) << ", " << split_k_buffer.offset << ", " << split_k_buffer.size << "), m: " << m << ", n: " << n << ", k: " << k << ", stride_a: " << stride_a << ", stride_b: " << stride_b << ", stride_d: " << stride_d << ", batch_stride_a: " << batch_stride_a << ", batch_stride_b: " << batch_stride_b << ", batch_stride_d: " << batch_stride_d << ", split_k: " << split_k << ", batch: " << batch << ", ne02: " << ne02 << ", ne12: " << ne12 << ", broadcast2: " << broadcast2 << ", broadcast3: " << broadcast3 << ")" << std::endl;
2854
- #endif
2841
+ VK_LOG_DEBUG("ggml_vk_matmul(a: (" << a.buffer->buffer << ", " << a.offset << ", " << a.size << "), b: (" << b.buffer->buffer << ", " << b.offset << ", " << b.size << "), d: (" << d.buffer->buffer << ", " << d.offset << ", " << d.size << "), split_k: (" << (split_k_buffer.buffer != nullptr ? split_k_buffer.buffer->buffer : VK_NULL_HANDLE) << ", " << split_k_buffer.offset << ", " << split_k_buffer.size << "), m: " << m << ", n: " << n << ", k: " << k << ", stride_a: " << stride_a << ", stride_b: " << stride_b << ", stride_d: " << stride_d << ", batch_stride_a: " << batch_stride_a << ", batch_stride_b: " << batch_stride_b << ", batch_stride_d: " << batch_stride_d << ", split_k: " << split_k << ", batch: " << batch << ", ne02: " << ne02 << ", ne12: " << ne12 << ", broadcast2: " << broadcast2 << ", broadcast3: " << broadcast3 << ")");
2855
2842
  ggml_vk_sync_buffers(subctx);
2856
2843
  if (split_k == 1) {
2857
2844
  const vk_mat_mat_push_constants pc = { m, n, k, stride_a, stride_b, stride_d, batch_stride_a, batch_stride_b, batch_stride_d, k, ne02, ne12, broadcast2, broadcast3 };
@@ -2875,12 +2862,10 @@ static void ggml_vk_matmul_id(
2875
2862
  uint32_t m, uint32_t n, uint32_t k, uint32_t stride_a, uint32_t stride_b, uint32_t stride_d,
2876
2863
  uint32_t batch_stride_a, uint32_t batch_stride_b, uint32_t batch_stride_d,
2877
2864
  uint32_t n_as, uint32_t nei0, uint32_t nei1, uint32_t nbi1, uint32_t ne11) {
2878
- #ifdef GGML_VULKAN_DEBUG
2879
- std::cerr << "ggml_vk_matmul_id(a: (" << a.buffer->buffer << ", " << a.offset << ", " << a.size << "), b: (" << b.buffer->buffer << ", " << b.offset << ", " << b.size << "), d: (" << d.buffer->buffer << ", " << d.offset << ", " << d.size << "), ids: (" << ids.buffer->buffer << ", " << ids.offset << ", " << ids.size << "), " <<
2865
+ VK_LOG_DEBUG("ggml_vk_matmul_id(a: (" << a.buffer->buffer << ", " << a.offset << ", " << a.size << "), b: (" << b.buffer->buffer << ", " << b.offset << ", " << b.size << "), d: (" << d.buffer->buffer << ", " << d.offset << ", " << d.size << "), ids: (" << ids.buffer->buffer << ", " << ids.offset << ", " << ids.size << "), " <<
2880
2866
  "m: " << m << ", n: " << n << ", k: " << k << ", stride_a: " << stride_a << ", stride_b: " << stride_b << ", stride_d: " << stride_d << ", " <<
2881
2867
  "batch_stride_a: " << batch_stride_a << ", batch_stride_b: " << batch_stride_b << ", batch_stride_d: " << batch_stride_d << ", " <<
2882
- "n_as: " << n_as << ", nei0: " << nei0 << ", nei1: " << nei1 << ", nbi1: " << nbi1 << ", ne11: " << ne11 << ")" << std::endl;
2883
- #endif
2868
+ "n_as: " << n_as << ", nei0: " << nei0 << ", nei1: " << nei1 << ", nbi1: " << nbi1 << ", ne11: " << ne11 << ")");
2884
2869
  ggml_vk_sync_buffers(subctx);
2885
2870
  const vk_mat_mat_id_push_constants pc = { m, n, k, stride_a, stride_b, stride_d, batch_stride_a, batch_stride_b, batch_stride_d,
2886
2871
  nei0, nei1, nbi1, ne11 };
@@ -2910,10 +2895,8 @@ static vk_pipeline ggml_vk_get_cpy_pipeline(ggml_backend_vk_context * ctx, ggml_
2910
2895
  }
2911
2896
 
2912
2897
  static void ggml_vk_cpy_to_contiguous(ggml_backend_vk_context * ctx, vk_context * subctx, vk_pipeline pipeline, const ggml_tensor * tensor, vk_subbuffer&& in, vk_subbuffer&& out) {
2913
- #ifdef GGML_VULKAN_DEBUG
2914
- std::cerr << "ggml_vk_cpy_to_contiguous((" << tensor << ", type=" << tensor->type << ", ne0=" << tensor->ne[0] << ", ne1=" << tensor->ne[1] << ", ne2=" << tensor->ne[2] << ", ne3=" << tensor->ne[3] << ", nb0=" << tensor->nb[0] << ", nb1=" << tensor->nb[1] << ", nb2=" << tensor->nb[2] << ", nb3=" << tensor->nb[3] << "), ";
2915
- std::cerr << "buffer in size=" << in.buffer->size << ", buffer out size=" << out.buffer->size << ")" << std::endl;
2916
- #endif
2898
+ VK_LOG_DEBUG("ggml_vk_cpy_to_contiguous((" << tensor << ", type=" << tensor->type << ", ne0=" << tensor->ne[0] << ", ne1=" << tensor->ne[1] << ", ne2=" << tensor->ne[2] << ", ne3=" << tensor->ne[3] << ", nb0=" << tensor->nb[0] << ", nb1=" << tensor->nb[1] << ", nb2=" << tensor->nb[2] << ", nb3=" << tensor->nb[3] << "), ";
2899
+ std::cerr << "buffer in size=" << in.buffer->size << ", buffer out size=" << out.buffer->size << ")");
2917
2900
  const int tensor_type_size = ggml_type_size(tensor->type);
2918
2901
 
2919
2902
  const uint32_t ne = ggml_nelements(tensor);
@@ -2930,11 +2913,9 @@ static void ggml_vk_cpy_to_contiguous(ggml_backend_vk_context * ctx, vk_context
2930
2913
  }
2931
2914
 
2932
2915
  static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
2933
- #ifdef GGML_VULKAN_DEBUG
2934
- std::cerr << "ggml_vk_mul_mat_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
2916
+ VK_LOG_DEBUG("ggml_vk_mul_mat_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
2935
2917
  std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
2936
- std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)" << std::endl;
2937
- #endif
2918
+ std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)");
2938
2919
  GGML_ASSERT(ggml_vk_dim01_contiguous(src0) || src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16); // NOLINT
2939
2920
  GGML_ASSERT(ggml_vk_dim01_contiguous(src1) || src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16); // NOLINT
2940
2921
 
@@ -3105,11 +3086,9 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context * su
3105
3086
  }
3106
3087
 
3107
3088
  static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
3108
- #ifdef GGML_VULKAN_DEBUG
3109
- std::cerr << "ggml_vk_mul_mat_vec_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
3089
+ VK_LOG_DEBUG("ggml_vk_mul_mat_vec_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
3110
3090
  std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
3111
- std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)" << std::endl;
3112
- #endif
3091
+ std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)");
3113
3092
  GGML_ASSERT(ggml_vk_dim01_contiguous(src0) || src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16); // NOLINT
3114
3093
  GGML_ASSERT(ggml_vk_dim01_contiguous(src1) || src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16); // NOLINT
3115
3094
 
@@ -3260,11 +3239,9 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context
3260
3239
  }
3261
3240
 
3262
3241
  static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
3263
- #ifdef GGML_VULKAN_DEBUG
3264
- std::cerr << "ggml_vk_mul_mat_p021_f16_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
3242
+ VK_LOG_DEBUG("ggml_vk_mul_mat_p021_f16_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
3265
3243
  std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
3266
- std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)" << std::endl;
3267
- #endif
3244
+ std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)");
3268
3245
  GGML_ASSERT(ggml_is_permuted(src0) && ggml_is_permuted(src1));
3269
3246
  GGML_ASSERT(src0->nb[0] <= src0->nb[1] && src0->nb[2] <= src0->nb[3]); // NOLINT
3270
3247
  GGML_ASSERT(src1->nb[0] <= src1->nb[1] && src1->nb[2] <= src1->nb[3]); // NOLINT
@@ -3333,11 +3310,9 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_c
3333
3310
  }
3334
3311
 
3335
3312
  static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
3336
- #ifdef GGML_VULKAN_DEBUG
3337
- std::cerr << "ggml_vk_mul_mat_nc_f16_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
3313
+ VK_LOG_DEBUG("ggml_vk_mul_mat_nc_f16_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
3338
3314
  std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
3339
- std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)" << std::endl;
3340
- #endif
3315
+ std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)");
3341
3316
  GGML_ASSERT(!ggml_is_transposed(src0));
3342
3317
  GGML_ASSERT(!ggml_is_transposed(src1));
3343
3318
  GGML_ASSERT(!ggml_is_permuted(src0));
@@ -3410,9 +3385,7 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con
3410
3385
  }
3411
3386
 
3412
3387
  static void ggml_vk_mul_mat(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
3413
- #ifdef GGML_VULKAN_DEBUG
3414
- std::cerr << "ggml_vk_mul_mat(" << src0 << ", " << src1 << ", " << dst << ")" << std::endl;
3415
- #endif
3388
+ VK_LOG_DEBUG("ggml_vk_mul_mat(" << src0 << ", " << src1 << ", " << dst << ")");
3416
3389
  if (src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && dst->ne[1] == 1) {
3417
3390
  ggml_vk_mul_mat_vec_p021_f16_f32(ctx, subctx, src0, src1, dst);
3418
3391
  } else if (src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && dst->ne[1] == 1) {
@@ -3425,12 +3398,10 @@ static void ggml_vk_mul_mat(ggml_backend_vk_context * ctx, vk_context * subctx,
3425
3398
  }
3426
3399
 
3427
3400
  static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst) {
3428
- #ifdef GGML_VULKAN_DEBUG
3429
- std::cerr << "ggml_vk_mul_mat_id_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
3401
+ VK_LOG_DEBUG("ggml_vk_mul_mat_id_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
3430
3402
  std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
3431
3403
  std::cerr << "), (" << ids << ", name=" << ids->name << ", type=" << ids->type << ", ne0=" << ids->ne[0] << ", ne1=" << ids->ne[1] << ", ne2=" << ids->ne[2] << ", ne3=" << ids->ne[3] << ", nb0=" << ids->nb[0] << ", nb1=" << ids->nb[1] << ", nb2=" << ids->nb[2] << ", nb3=" << ids->nb[3];
3432
- std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)" << std::endl;
3433
- #endif
3404
+ std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)");
3434
3405
  GGML_ASSERT(ggml_vk_dim01_contiguous(src1) || src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16); // NOLINT
3435
3406
  GGML_ASSERT(ids->type == GGML_TYPE_I32);
3436
3407
 
@@ -3616,12 +3587,10 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context *
3616
3587
  }
3617
3588
 
3618
3589
  static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst) {
3619
- #ifdef GGML_VULKAN_DEBUG
3620
- std::cerr << "ggml_vk_mul_mat_vec_id_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
3590
+ VK_LOG_DEBUG("ggml_vk_mul_mat_vec_id_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
3621
3591
  std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
3622
3592
  std::cerr << "), (" << ids << ", name=" << ids->name << ", type=" << ids->type << ", ne0=" << ids->ne[0] << ", ne1=" << ids->ne[1] << ", ne2=" << ids->ne[2] << ", ne3=" << ids->ne[3] << ", nb0=" << ids->nb[0] << ", nb1=" << ids->nb[1] << ", nb2=" << ids->nb[2] << ", nb3=" << ids->nb[3];
3623
- std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)" << std::endl;
3624
- #endif
3593
+ std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)");
3625
3594
  GGML_ASSERT(ggml_vk_dim01_contiguous(src0) || src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16); // NOLINT
3626
3595
  GGML_ASSERT(ggml_vk_dim01_contiguous(src1) || src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16); // NOLINT
3627
3596
  GGML_ASSERT(ids->type == GGML_TYPE_I32);
@@ -3784,9 +3753,7 @@ static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_conte
3784
3753
  }
3785
3754
 
3786
3755
  static void ggml_vk_mul_mat_id(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst) {
3787
- #ifdef GGML_VULKAN_DEBUG
3788
- std::cerr << "ggml_vk_mul_mat_id(" << src0 << ", " << src1 << ", " << src2 << ", " << dst << ")" << std::endl;
3789
- #endif
3756
+ VK_LOG_DEBUG("ggml_vk_mul_mat_id(" << src0 << ", " << src1 << ", " << src2 << ", " << dst << ")");
3790
3757
  if (src2->ne[1] == 1 && (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type))) {
3791
3758
  ggml_vk_mul_mat_vec_id_q_f16(ctx, subctx, src0, src1, src2, dst);
3792
3759
  } else {
@@ -4020,16 +3987,14 @@ static bool ggml_vk_op_supports_incontiguous(ggml_op op) {
4020
3987
 
4021
3988
  template<typename PC>
4022
3989
  static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst, ggml_op op, const PC&& pc) {
4023
- #ifdef GGML_VULKAN_DEBUG
4024
- std::cerr << "ggml_vk_op_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
3990
+ VK_LOG_DEBUG("ggml_vk_op_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
4025
3991
  if (src1 != nullptr) {
4026
3992
  std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
4027
3993
  }
4028
3994
  if (src2 != nullptr) {
4029
3995
  std::cerr << "), (" << src2 << ", name=" << src2->name << ", type=" << src2->type << ", ne0=" << src2->ne[0] << ", ne1=" << src2->ne[1] << ", ne2=" << src2->ne[2] << ", ne3=" << src2->ne[3] << ", nb0=" << src2->nb[0] << ", nb1=" << src2->nb[1] << ", nb2=" << src2->nb[2] << ", nb3=" << src2->nb[3];
4030
3996
  }
4031
- std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "), " << ggml_op_name(op) << ")" << std::endl;
4032
- #endif
3997
+ std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "), " << ggml_op_name(op) << ")");
4033
3998
  GGML_ASSERT(op == GGML_OP_GET_ROWS || (!ggml_is_quantized(src0->type) && (src1 == nullptr || !ggml_is_quantized(src1->type)))); // NOLINT
4034
3999
  GGML_ASSERT(ggml_vk_op_supports_incontiguous(op) || ggml_vk_dim01_contiguous(src0)); // NOLINT
4035
4000
  GGML_ASSERT(dst->extra != nullptr);
@@ -4527,9 +4492,7 @@ static void ggml_vk_print_matrix_area(const void * data, ggml_type type, int ne0
4527
4492
 
4528
4493
  template <typename X_TYPE, typename Y_TYPE>
4529
4494
  static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t n, size_t k, size_t batch, size_t num_it, int split_k, int shader_size) {
4530
- #ifdef GGML_VULKAN_DEBUG
4531
- std::cerr << "ggml_vk_test_matmul(" << m << ", " << n << ", " << k << ", " << batch << ", " << num_it << ", " << split_k << ", " << shader_size << ")" << std::endl;
4532
- #endif
4495
+ VK_LOG_DEBUG("ggml_vk_test_matmul(" << m << ", " << n << ", " << k << ", " << batch << ", " << num_it << ", " << split_k << ", " << shader_size << ")");
4533
4496
  const size_t x_ne = m * k * batch;
4534
4497
  const size_t y_ne = k * n * batch;
4535
4498
  const size_t d_ne = m * n * batch;
@@ -4943,9 +4906,7 @@ static void ggml_vk_test_h2d_nc(ggml_backend_vk_context * ctx, size_t ne0, size_
4943
4906
  }
4944
4907
 
4945
4908
  static void ggml_vk_test_transfer(ggml_backend_vk_context * ctx, size_t ne, bool pinned) {
4946
- #ifdef GGML_VULKAN_DEBUG
4947
- std::cerr << "ggml_vk_test_transfer(" << ne << ")" << std::endl;
4948
- #endif
4909
+ VK_LOG_DEBUG("ggml_vk_test_transfer(" << ne << ")");
4949
4910
  // Check transfers are correct
4950
4911
  vk_buffer buffer = ggml_vk_create_buffer_check(ctx, sizeof(float) * ne, vk::MemoryPropertyFlagBits::eDeviceLocal);
4951
4912
 
@@ -5029,9 +4990,7 @@ static void ggml_vk_quantize_data(const float * from, void * to, size_t ne, ggml
5029
4990
  }
5030
4991
 
5031
4992
  static void ggml_vk_test_dequant(ggml_backend_vk_context * ctx, size_t ne, ggml_type quant) {
5032
- #ifdef GGML_VULKAN_DEBUG
5033
- std::cerr << "ggml_vk_test_dequant(" << ne << ")" << std::endl;
5034
- #endif
4993
+ VK_LOG_DEBUG("ggml_vk_test_dequant(" << ne << ")");
5035
4994
  const size_t x_sz = sizeof(float) * ne;
5036
4995
  const size_t x_sz_f16 = sizeof(ggml_fp16_t) * ne;
5037
4996
  const size_t qx_sz = ne * ggml_type_size(quant)/ggml_blck_size(quant);
@@ -5108,9 +5067,7 @@ static void ggml_vk_test_dequant(ggml_backend_vk_context * ctx, size_t ne, ggml_
5108
5067
  }
5109
5068
 
5110
5069
  static void ggml_vk_test_dequant_matmul(ggml_backend_vk_context * ctx, size_t m, size_t n, size_t k, size_t batch, size_t num_it, size_t split_k, size_t shader_size, ggml_type quant) {
5111
- #ifdef GGML_VULKAN_DEBUG
5112
- std::cerr << "ggml_vk_test_dequant_matmul(" << m << ", " << n << ", " << k << ", " << batch << ", " << num_it << ", " << split_k << ", " << ggml_type_name(quant) << ")" << std::endl;
5113
- #endif
5070
+ VK_LOG_DEBUG("ggml_vk_test_dequant_matmul(" << m << ", " << n << ", " << k << ", " << batch << ", " << num_it << ", " << split_k << ", " << ggml_type_name(quant) << ")");
5114
5071
  const size_t x_ne = m * k * batch;
5115
5072
  const size_t y_ne = k * n * batch;
5116
5073
  const size_t d_ne = m * n * batch;
@@ -5294,9 +5251,7 @@ static void ggml_vk_test_dequant_matmul(ggml_backend_vk_context * ctx, size_t m,
5294
5251
  #endif
5295
5252
 
5296
5253
  static ggml_tensor_extra_gpu * ggml_vk_tensor_create_extra(ggml_tensor * tensor) {
5297
- #ifdef GGML_VULKAN_DEBUG
5298
- std::cerr << "ggml_vk_create_extra(" << tensor << " (" << tensor->name << ", " << ggml_op_name(tensor->op) << "))" << std::endl;
5299
- #endif
5254
+ VK_LOG_DEBUG("ggml_vk_create_extra(" << tensor << " (" << tensor->name << ", " << ggml_op_name(tensor->op) << "))");
5300
5255
  ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu;
5301
5256
  extra->reset();
5302
5257
  tensor->extra = extra;
@@ -5304,9 +5259,7 @@ static ggml_tensor_extra_gpu * ggml_vk_tensor_create_extra(ggml_tensor * tensor)
5304
5259
  }
5305
5260
 
5306
5261
  static void ggml_vk_preallocate_buffers_graph(ggml_backend_vk_context * ctx, ggml_tensor * node){
5307
- #ifdef GGML_VULKAN_DEBUG
5308
- std::cerr << "ggml_vk_preallocate_buffers_graph(" << node << ")" << std::endl;
5309
- #endif
5262
+ VK_LOG_DEBUG("ggml_vk_preallocate_buffers_graph(" << node << ")");
5310
5263
  ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) node->extra;
5311
5264
 
5312
5265
  if (extra == nullptr) {
@@ -5341,7 +5294,7 @@ static void ggml_vk_preallocate_buffers_graph(ggml_backend_vk_context * ctx, ggm
5341
5294
 
5342
5295
  bool mmp = (use_src0 && use_src1 && src1_type == GGML_TYPE_F32) ? ggml_vk_get_mul_mat_mat_pipeline(ctx, src0_type, y_non_contig ? GGML_TYPE_F16 : src1->type) != nullptr : false;
5343
5296
 
5344
- const bool qx_needs_dequant = use_src0 && (mmp || x_non_contig);
5297
+ const bool qx_needs_dequant = use_src0 && (!mmp || x_non_contig);
5345
5298
  const bool qy_needs_dequant = use_src1 && ((src1->type != GGML_TYPE_F16 && !y_f32_kernel) || y_non_contig);
5346
5299
 
5347
5300
  int split_k;
@@ -5419,9 +5372,6 @@ static void ggml_vk_preallocate_buffers_graph(ggml_backend_vk_context * ctx, ggm
5419
5372
  }
5420
5373
 
5421
5374
  static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
5422
- #ifdef GGML_VULKAN_DEBUG
5423
- std::cerr << "ggml_vk_preallocate_buffers(x_size: " << ctx->prealloc_size_x << " y_size: " << ctx->prealloc_size_y << " split_k_size: " << ctx->prealloc_size_split_k << ")" << std::endl;
5424
- #endif
5425
5375
  #if defined(GGML_VULKAN_RUN_TESTS)
5426
5376
  ctx->staging = ggml_vk_create_buffer_check(ctx, 100ul * 1024ul * 1024ul,
5427
5377
  vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached,
@@ -5560,6 +5510,7 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
5560
5510
  #endif
5561
5511
 
5562
5512
  if (ctx->prealloc_x == nullptr || (ctx->prealloc_size_x > 0 && ctx->prealloc_x->size < ctx->prealloc_size_x)) {
5513
+ VK_LOG_MEMORY("ggml_vk_preallocate_buffers(x_size: " << ctx->prealloc_size_x << ")");
5563
5514
  // Resize buffer
5564
5515
  if (ctx->prealloc_x != nullptr) {
5565
5516
  ggml_vk_destroy_buffer(ctx->prealloc_x);
@@ -5567,6 +5518,7 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
5567
5518
  ctx->prealloc_x = ggml_vk_create_buffer_device(ctx, ctx->prealloc_size_x);
5568
5519
  }
5569
5520
  if (ctx->prealloc_y == nullptr || (ctx->prealloc_size_y > 0 && ctx->prealloc_y->size < ctx->prealloc_size_y)) {
5521
+ VK_LOG_MEMORY("ggml_vk_preallocate_buffers(y_size: " << ctx->prealloc_size_y << ")");
5570
5522
  // Resize buffer
5571
5523
  if (ctx->prealloc_y != nullptr) {
5572
5524
  ggml_vk_destroy_buffer(ctx->prealloc_y);
@@ -5574,6 +5526,7 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
5574
5526
  ctx->prealloc_y = ggml_vk_create_buffer_device(ctx, ctx->prealloc_size_y);
5575
5527
  }
5576
5528
  if (ctx->prealloc_split_k == nullptr || (ctx->prealloc_size_split_k > 0 && ctx->prealloc_split_k->size < ctx->prealloc_size_split_k)) {
5529
+ VK_LOG_MEMORY("ggml_vk_preallocate_buffers(split_k_size: " << ctx->prealloc_size_split_k << ")");
5577
5530
  // Resize buffer
5578
5531
  if (ctx->prealloc_split_k != nullptr) {
5579
5532
  ggml_vk_destroy_buffer(ctx->prealloc_split_k);
@@ -5581,6 +5534,7 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
5581
5534
  ctx->prealloc_split_k = ggml_vk_create_buffer_device(ctx, ctx->prealloc_size_split_k);
5582
5535
  }
5583
5536
  if (ctx->staging == nullptr || (ctx->staging_size > 0 && ctx->staging->size < ctx->staging_size)) {
5537
+ VK_LOG_MEMORY("ggml_vk_preallocate_buffers(staging_size: " << ctx->staging_size << ")");
5584
5538
  // Resize buffer
5585
5539
  if (ctx->staging != nullptr) {
5586
5540
  ggml_vk_destroy_buffer(ctx->staging);
@@ -5598,9 +5552,7 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
5598
5552
  return;
5599
5553
  }
5600
5554
 
5601
- #ifdef GGML_VULKAN_DEBUG
5602
- std::cerr << "ggml_vk_build_graph(" << node << ", " << ggml_op_name(node->op) << ")" << std::endl;
5603
- #endif
5555
+ VK_LOG_DEBUG("ggml_vk_build_graph(" << node << ", " << ggml_op_name(node->op) << ")");
5604
5556
  ctx->semaphore_idx = 0;
5605
5557
  ctx->staging_offset = 0;
5606
5558
 
@@ -5823,9 +5775,7 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_compute_
5823
5775
  return true;
5824
5776
  }
5825
5777
 
5826
- #ifdef GGML_VULKAN_DEBUG
5827
- std::cerr << "ggml_vk_compute_forward(" << tensor << ", name=" << tensor->name << ", op=" << ggml_op_name(tensor->op) << ", type=" << tensor->type << ", ne0=" << tensor->ne[0] << ", ne1=" << tensor->ne[1] << ", ne2=" << tensor->ne[2] << ", ne3=" << tensor->ne[3] << ", nb0=" << tensor->nb[0] << ", nb1=" << tensor->nb[1] << ", nb2=" << tensor->nb[2] << ", nb3=" << tensor->nb[3] << ", view_src=" << tensor->view_src << ", view_offs=" << tensor->view_offs << ")" << std::endl;
5828
- #endif
5778
+ VK_LOG_DEBUG("ggml_vk_compute_forward(" << tensor << ", name=" << tensor->name << ", op=" << ggml_op_name(tensor->op) << ", type=" << tensor->type << ", ne0=" << tensor->ne[0] << ", ne1=" << tensor->ne[1] << ", ne2=" << tensor->ne[2] << ", ne3=" << tensor->ne[3] << ", nb0=" << tensor->nb[0] << ", nb1=" << tensor->nb[1] << ", nb2=" << tensor->nb[2] << ", nb3=" << tensor->nb[3] << ", view_src=" << tensor->view_src << ", view_offs=" << tensor->view_offs << ")");
5829
5779
 
5830
5780
  #ifdef GGML_VULKAN_CHECK_RESULTS
5831
5781
  ggml_vk_check_results_0(ctx, params, tensor);
@@ -5860,9 +5810,7 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_compute_
5860
5810
 
5861
5811
  // Clean up after graph processing is done
5862
5812
  static void ggml_vk_graph_cleanup(ggml_backend_vk_context * ctx) {
5863
- #ifdef GGML_VULKAN_DEBUG
5864
- std::cerr << "ggml_vk_graph_cleanup()" << std::endl;
5865
- #endif
5813
+ VK_LOG_DEBUG("ggml_vk_graph_cleanup()");
5866
5814
  for (auto& buffer : ctx->gc.temp_buffers) {
5867
5815
  ggml_vk_pool_free(ctx, buffer);
5868
5816
  }
@@ -5906,9 +5854,7 @@ static void ggml_vk_graph_cleanup(ggml_backend_vk_context * ctx) {
5906
5854
 
5907
5855
  // Clean up on backend free
5908
5856
  static void ggml_vk_cleanup(ggml_backend_vk_context * ctx) {
5909
- #ifdef GGML_VULKAN_DEBUG
5910
- std::cerr << "ggml_vk_cleanup(" << ctx->idx << ")" << std::endl;
5911
- #endif
5857
+ VK_LOG_DEBUG("ggml_vk_cleanup(" << ctx->idx << ")");
5912
5858
  ggml_vk_graph_cleanup(ctx);
5913
5859
 
5914
5860
  ggml_vk_destroy_buffer(ctx->prealloc_x);
@@ -6003,9 +5949,7 @@ GGML_CALL static bool ggml_backend_buffer_is_vk(ggml_backend_buffer_t buffer) {
6003
5949
  }
6004
5950
 
6005
5951
  GGML_CALL static void ggml_backend_vk_buffer_free_buffer(ggml_backend_buffer_t buffer) {
6006
- #ifdef GGML_VULKAN_DEBUG
6007
- std::cerr << "ggml_backend_vk_buffer_free_buffer()" << std::endl;
6008
- #endif
5952
+ VK_LOG_MEMORY("ggml_backend_vk_buffer_free_buffer()");
6009
5953
  ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;
6010
5954
  ggml_vk_destroy_buffer(ctx->dev_buffer);
6011
5955
  delete ctx;
@@ -6018,9 +5962,7 @@ GGML_CALL static void * ggml_backend_vk_buffer_get_base(ggml_backend_buffer_t bu
6018
5962
  }
6019
5963
 
6020
5964
  GGML_CALL static void ggml_backend_vk_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
6021
- #ifdef GGML_VULKAN_DEBUG
6022
- std::cerr << "ggml_backend_vk_buffer_init_tensor(" << buffer << " (" << buffer->context << "), " << tensor << ")" << std::endl;
6023
- #endif
5965
+ VK_LOG_DEBUG("ggml_backend_vk_buffer_init_tensor(" << buffer << " (" << buffer->context << "), " << tensor << ")");
6024
5966
  ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;
6025
5967
 
6026
5968
  if (tensor->view_src != nullptr) {
@@ -6036,9 +5978,7 @@ GGML_CALL static void ggml_backend_vk_buffer_init_tensor(ggml_backend_buffer_t b
6036
5978
  }
6037
5979
 
6038
5980
  GGML_CALL static void ggml_backend_vk_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
6039
- #ifdef GGML_VULKAN_DEBUG
6040
- std::cerr << "ggml_backend_vk_buffer_set_tensor(" << buffer << ", " << tensor << ", " << data << ", " << offset << ", " << size << ")" << std::endl;
6041
- #endif
5981
+ VK_LOG_DEBUG("ggml_backend_vk_buffer_set_tensor(" << buffer << ", " << tensor << ", " << data << ", " << offset << ", " << size << ")");
6042
5982
  ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;
6043
5983
 
6044
5984
  ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
@@ -6049,9 +5989,7 @@ GGML_CALL static void ggml_backend_vk_buffer_set_tensor(ggml_backend_buffer_t bu
6049
5989
  }
6050
5990
 
6051
5991
  GGML_CALL static void ggml_backend_vk_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
6052
- #ifdef GGML_VULKAN_DEBUG
6053
- std::cerr << "ggml_backend_vk_buffer_get_tensor(" << buffer << ", " << tensor << ", " << data << ", " << offset << ", " << size << ")" << std::endl;
6054
- #endif
5992
+ VK_LOG_DEBUG("ggml_backend_vk_buffer_get_tensor(" << buffer << ", " << tensor << ", " << data << ", " << offset << ", " << size << ")");
6055
5993
  ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;
6056
5994
 
6057
5995
  ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
@@ -6109,9 +6047,7 @@ GGML_CALL static const char * ggml_backend_vk_buffer_type_name(ggml_backend_buff
6109
6047
  }
6110
6048
 
6111
6049
  GGML_CALL static ggml_backend_buffer_t ggml_backend_vk_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
6112
- #ifdef GGML_VULKAN_DEBUG
6113
- std::cerr << "ggml_backend_vk_buffer_type_alloc_buffer(" << size << ")" << std::endl;
6114
- #endif
6050
+ VK_LOG_MEMORY("ggml_backend_vk_buffer_type_alloc_buffer(" << size << ")");
6115
6051
  ggml_backend_vk_buffer_type_context * ctx = (ggml_backend_vk_buffer_type_context *) buft->context;
6116
6052
 
6117
6053
  vk_buffer dev_buffer = nullptr;
@@ -6154,9 +6090,7 @@ static ggml_backend_buffer_type_i ggml_backend_vk_buffer_type_interface = {
6154
6090
  GGML_CALL ggml_backend_buffer_type_t ggml_backend_vk_buffer_type(size_t dev_num) {
6155
6091
  ggml_vk_instance_init();
6156
6092
 
6157
- #ifdef GGML_VULKAN_DEBUG
6158
- std::cerr << "ggml_backend_vk_buffer_type(" << dev_num << ")" << std::endl;
6159
- #endif
6093
+ VK_LOG_DEBUG("ggml_backend_vk_buffer_type(" << dev_num << ")");
6160
6094
 
6161
6095
  GGML_ASSERT(dev_num < vk_instance.device_indices.size());
6162
6096
 
@@ -6180,16 +6114,12 @@ GGML_CALL static const char * ggml_backend_vk_host_buffer_name(ggml_backend_buff
6180
6114
  }
6181
6115
 
6182
6116
  GGML_CALL static void ggml_backend_vk_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
6183
- #ifdef GGML_VULKAN_DEBUG
6184
- std::cerr << "ggml_backend_vk_host_buffer_free_buffer()" << std::endl;
6185
- #endif
6117
+ VK_LOG_MEMORY("ggml_backend_vk_host_buffer_free_buffer()");
6186
6118
  ggml_vk_host_free(&vk_instance.contexts[0], buffer->context);
6187
6119
  }
6188
6120
 
6189
6121
  GGML_CALL static ggml_backend_buffer_t ggml_backend_vk_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
6190
- #ifdef GGML_VULKAN_DEBUG
6191
- std::cerr << "ggml_backend_vk_host_buffer_type_alloc_buffer(" << size << ")" << std::endl;
6192
- #endif
6122
+ VK_LOG_MEMORY("ggml_backend_vk_host_buffer_type_alloc_buffer(" << size << ")");
6193
6123
  size += 32; // Behave like the CPU buffer type
6194
6124
  void * ptr = nullptr;
6195
6125
  try {
@@ -6246,9 +6176,7 @@ GGML_CALL static const char * ggml_backend_vk_name(ggml_backend_t backend) {
6246
6176
 
6247
6177
  GGML_CALL static void ggml_backend_vk_free(ggml_backend_t backend) {
6248
6178
  ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
6249
- #ifdef GGML_VULKAN_DEBUG
6250
- std::cerr << "ggml_backend_vk_free(" << ctx->name << ")" << std::endl;
6251
- #endif
6179
+ VK_LOG_DEBUG("ggml_backend_vk_free(" << ctx->name << ")");
6252
6180
 
6253
6181
  size_t idx = ctx->idx;
6254
6182
 
@@ -6272,9 +6200,7 @@ GGML_CALL static ggml_backend_buffer_type_t ggml_backend_vk_get_default_buffer_t
  }
 
  GGML_CALL static void ggml_backend_vk_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
- #ifdef GGML_VULKAN_DEBUG
- std::cerr << "ggml_backend_vk_set_tensor_async(" << size << ")" << std::endl;
- #endif
+ VK_LOG_DEBUG("ggml_backend_vk_set_tensor_async(" << size << ")");
  ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
  GGML_ASSERT((tensor->buffer->buft == ggml_backend_vk_buffer_type(ctx->idx) || tensor->buffer->buft == ggml_backend_vk_host_buffer_type()) && "unsupported buffer type");
 
@@ -6292,9 +6218,7 @@ GGML_CALL static void ggml_backend_vk_set_tensor_async(ggml_backend_t backend, g
  }
 
  GGML_CALL static void ggml_backend_vk_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
- #ifdef GGML_VULKAN_DEBUG
- std::cerr << "ggml_backend_vk_get_tensor_async(" << size << ")" << std::endl;
- #endif
+ VK_LOG_DEBUG("ggml_backend_vk_get_tensor_async(" << size << ")");
  ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
  GGML_ASSERT((tensor->buffer->buft == ggml_backend_vk_buffer_type(ctx->idx) || tensor->buffer->buft == ggml_backend_vk_host_buffer_type()) && "unsupported buffer type");
 
@@ -6312,9 +6236,7 @@ GGML_CALL static void ggml_backend_vk_get_tensor_async(ggml_backend_t backend, c
  }
 
  GGML_CALL static bool ggml_backend_vk_cpy_tensor_async(ggml_backend_t backend, const ggml_tensor * src, ggml_tensor * dst) {
- #ifdef GGML_VULKAN_DEBUG
- std::cerr << "ggml_backend_vk_cpy_tensor_async()" << std::endl;
- #endif
+ VK_LOG_DEBUG("ggml_backend_vk_cpy_tensor_async()");
  ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
  if ((dst->buffer->buft == ggml_backend_vk_buffer_type(ctx->idx) || dst->buffer->buft == ggml_backend_vk_host_buffer_type()) && ggml_backend_buffer_is_vk(src->buffer)) {
  ggml_tensor_extra_gpu * src_extra = (ggml_tensor_extra_gpu *) src->extra;
@@ -6337,9 +6259,7 @@ GGML_CALL static bool ggml_backend_vk_cpy_tensor_async(ggml_backend_t backend, c
  }
 
  GGML_CALL static void ggml_backend_vk_synchronize(ggml_backend_t backend) {
- #ifdef GGML_VULKAN_DEBUG
- std::cerr << "ggml_backend_vk_synchronize()" << std::endl;
- #endif
+ VK_LOG_DEBUG("ggml_backend_vk_synchronize()");
  ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
  if(ctx->transfer_ctx == nullptr) {
  return;
@@ -6367,9 +6287,7 @@ static bool ggml_vk_is_empty(ggml_tensor * node) {
  }
 
  GGML_CALL static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
- #ifdef GGML_VULKAN_DEBUG
- std::cerr << "ggml_backend_vk_graph_compute(" << cgraph->n_nodes << " nodes)" << std::endl;
- #endif
+ VK_LOG_DEBUG("ggml_backend_vk_graph_compute(" << cgraph->n_nodes << " nodes)");
  ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
 
  for (int i = 0; i < cgraph->n_nodes; i++) {
@@ -6582,9 +6500,7 @@ GGML_CALL ggml_backend_t ggml_backend_vk_init(size_t dev_num) {
  if (vk_instance.initialized[dev_num]) {
  return vk_instance.backends[dev_num];
  }
- #ifdef GGML_VULKAN_DEBUG
- std::cerr << "ggml_backend_vk_init(" << dev_num << ")" << std::endl;
- #endif
+ VK_LOG_DEBUG("ggml_backend_vk_init(" << dev_num << ")");
 
  ggml_backend_vk_context * ctx = &vk_instance.contexts[dev_num];
  ggml_vk_init(ctx, dev_num);
@@ -6800,9 +6716,7 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
  return;
  }
 
- #ifdef GGML_VULKAN_DEBUG
- std::cerr << "ggml_vk_check_results_0(" << tensor->name << ")" << std::endl;
- #endif
+ VK_LOG_DEBUG("ggml_vk_check_results_0(" << tensor->name << ")");
 
  ggml_tensor * src0 = tensor->src[0];
  ggml_tensor * src1 = tensor->src[1];
@@ -7108,9 +7022,7 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_compute_
  return;
  }
 
- #ifdef GGML_VULKAN_DEBUG
- std::cerr << "ggml_vk_check_results_1(" << tensor->name << ")" << std::endl;
- #endif
+ VK_LOG_DEBUG("ggml_vk_check_results_1(" << tensor->name << ")");
 
  ggml_tensor * src0 = tensor->src[0];
  ggml_tensor * src1 = tensor->src[1];