llama_cpp 0.16.1 → 0.16.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +7 -0
- data/ext/llama_cpp/extconf.rb +1 -0
- data/ext/llama_cpp/llama_cpp.cpp +12 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +2 -0
- data/vendor/tmp/llama.cpp/Makefile +10 -2
- data/vendor/tmp/llama.cpp/ggml-backend.c +14 -3
- data/vendor/tmp/llama.cpp/ggml-backend.h +3 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/mmq.cu +10 -10
- data/vendor/tmp/llama.cpp/ggml-cuda/mmvq.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/unary.cu +28 -0
- data/vendor/tmp/llama.cpp/ggml-impl.h +1 -1
- data/vendor/tmp/llama.cpp/ggml-metal.m +6 -0
- data/vendor/tmp/llama.cpp/ggml-quants.c +982 -368
- data/vendor/tmp/llama.cpp/ggml-rpc.cpp +8 -3
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +2124 -13202
- data/vendor/tmp/llama.cpp/ggml-sycl.h +1 -10
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +27564 -23876
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +278 -366
- data/vendor/tmp/llama.cpp/ggml.c +67 -150
- data/vendor/tmp/llama.cpp/ggml.h +6 -0
- data/vendor/tmp/llama.cpp/llama.cpp +530 -237
- data/vendor/tmp/llama.cpp/llama.h +5 -1
- data/vendor/tmp/llama.cpp/sgemm.cpp +2 -0
- data/vendor/tmp/llama.cpp/unicode-data.cpp +851 -801
- data/vendor/tmp/llama.cpp/unicode.cpp +33 -19
- data/vendor/tmp/llama.cpp/unicode.h +1 -1
- metadata +2 -2
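Most of the churn shown below in `ggml-vulkan.cpp` replaces per-call `#ifdef GGML_VULKAN_DEBUG` blocks with `VK_LOG_DEBUG`/`VK_LOG_MEMORY` macros and adds a `vk_memory_logger`. A minimal standalone sketch of that logging-macro pattern (not code from the package; the `main` function and sample values are illustrative only):

```cpp
#include <iostream>

// Compile with -DGGML_VULKAN_DEBUG to get the verbose std::cerr trace;
// otherwise every VK_LOG_DEBUG call expands to a no-op.
#ifdef GGML_VULKAN_DEBUG
#define VK_LOG_DEBUG(msg) std::cerr << msg << std::endl
#else
#define VK_LOG_DEBUG(msg) ((void) 0)
#endif // GGML_VULKAN_DEBUG

int main() {
    const char * name = "example-device";      // illustrative value only
    VK_LOG_DEBUG("destroy device " << name);   // stream-style argument, as used throughout the diff
    return 0;
}
```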
data/vendor/tmp/llama.cpp/ggml-vulkan.cpp

@@ -8,6 +8,7 @@
 
 #include <algorithm>
 #include <cmath>
+#include <iomanip>
 #include <iostream>
 #include <tuple>
 #include <vector>
@@ -57,6 +58,12 @@ static_assert(K_QUANTS_PER_ITERATION == 1 || K_QUANTS_PER_ITERATION == 2, "K_QUA
     } \
 } while (0)
 
+#ifdef GGML_VULKAN_DEBUG
+#define VK_LOG_DEBUG(msg) std::cerr << msg << std::endl
+#else
+#define VK_LOG_DEBUG(msg) ((void) 0)
+#endif // GGML_VULKAN_DEBUG
+
 struct ggml_backend_vk_context;
 
 struct vk_queue {
@@ -159,9 +166,7 @@ struct vk_device {
     std::vector<vk_pipeline_ref> pipelines;
 
     ~vk_device() {
-#ifdef GGML_VULKAN_DEBUG
-        std::cerr << "destroy device " << name << std::endl;
-#endif
+        VK_LOG_DEBUG("destroy device " << name);
         device.destroyCommandPool(compute_queue.pool);
         if (!single_queue) {
             device.destroyCommandPool(transfer_queue.pool);
@@ -196,9 +201,7 @@ struct vk_buffer_struct {
         if (size == 0) {
             return;
         }
-#ifdef GGML_VULKAN_DEBUG
-        std::cerr << "~vk_buffer_struct(" << buffer << ", " << size << ")" << std::endl;
-#endif
+        VK_LOG_DEBUG("~vk_buffer_struct(" << buffer << ", " << size << ")");
 
         device->device.freeMemory(device_memory);
         device->device.destroyBuffer(buffer);
@@ -355,6 +358,49 @@ struct ggml_vk_garbage_collector {
     std::vector<vk_context> contexts;
 };
 
+#if defined(GGML_VULKAN_MEMORY_DEBUG) || defined(GGML_VULKAN_DEBUG)
+#include <mutex>
+
+#define VK_LOG_MEMORY(msg) std::cerr << "ggml_vulkan memory: " << msg << std::endl
+
+static std::string format_size(size_t size) {
+    const size_t kib = 1024;
+    const size_t mib = kib * 1024;
+    const size_t gib = mib * 1024;
+
+    std::ostringstream oss;
+    oss << std::fixed << std::setprecision(2);
+
+    if (size >= gib) {
+        oss << static_cast<double>(size) / gib << " GiB";
+    } else if (size >= mib) {
+        oss << static_cast<double>(size) / mib << " MiB";
+    } else if (size >= kib) {
+        oss << static_cast<double>(size) / kib << " KiB";
+    } else {
+        oss << size << " B";
+    }
+
+    return oss.str();
+}
+
+static std::mutex log_mutex;
+
+class vk_memory_logger {
+public:
+    vk_memory_logger(): total_device(0), total_host(0) {}
+    void log_allocation(vk_buffer_ref buf_ref, size_t size);
+    void log_deallocation(vk_buffer_ref buf_ref);
+
+private:
+    std::map<vk::Buffer, size_t> allocations; // Track allocations
+    size_t total_device;
+    size_t total_host;
+};
+#else
+#define VK_LOG_MEMORY(msg) ((void) 0)
+#endif // GGML_VULKAN_MEMORY_DEBUG
+
 struct ggml_backend_vk_context {
     std::string name;
 
@@ -379,8 +425,45 @@ struct ggml_backend_vk_context {
     bool initialized;
 
     size_t idx;
+
+#ifdef GGML_VULKAN_MEMORY_DEBUG
+    vk_memory_logger memory_logger;
+#endif
 };
 
+#ifdef GGML_VULKAN_MEMORY_DEBUG
+void vk_memory_logger::log_allocation(vk_buffer_ref buf_ref, size_t size) {
+    std::lock_guard<std::mutex> guard(log_mutex);
+    vk_buffer buf = buf_ref.lock();
+    const bool device = bool(buf->memory_property_flags & vk::MemoryPropertyFlagBits::eDeviceLocal);
+    const std::string type = device ? "device" : "host";
+    allocations[buf->buffer] = size;
+    total_device += device ? size : 0;
+    total_host += device ? 0 : size;
+    VK_LOG_MEMORY("VULKAN" << buf->ctx->idx << ": +" << format_size(size) << " " << type << " at " << buf->buffer << ". Total device: " << format_size(total_device) << ", total host: " << format_size(total_host));
+}
+
+void vk_memory_logger::log_deallocation(vk_buffer_ref buf_ref) {
+    if (buf_ref.expired() || buf_ref.lock()->size == 0) {
+        return;
+    }
+
+    std::lock_guard<std::mutex> guard(log_mutex);
+    vk_buffer buf = buf_ref.lock();
+    const bool device = bool(buf->memory_property_flags & vk::MemoryPropertyFlagBits::eDeviceLocal);
+    std::string type = device ? "device" : "host";
+    auto it = allocations.find(buf->buffer);
+    total_device -= device ? it->second : 0;
+    total_host -= device ? 0 : it->second;
+    if (it != allocations.end()) {
+        VK_LOG_MEMORY("VULKAN" << buf->ctx->idx << ": -" << format_size(it->second) << " " << type << " at " << buf->buffer << ". Total device: " << format_size(total_device) << ", total host: " << format_size(total_host));
+        allocations.erase(it);
+    } else {
+        VK_LOG_MEMORY("ERROR VULKAN" << buf->ctx->idx << ": Attempted to deallocate unknown " << type << " memory at " << buf->buffer);
+    }
+}
+#endif // GGML_VULKAN_MEMORY_DEBUG
+
 struct vk_instance_t {
     vk::Instance instance;
 
@@ -393,15 +476,11 @@ struct vk_instance_t {
 };
 
 static std::shared_ptr<vk_device> ggml_vk_get_device(size_t idx) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_get_device(" << idx << ")" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_get_device(" << idx << ")");
     static std::weak_ptr<vk_device> devices[GGML_VK_MAX_DEVICES];
 
     if (devices[idx].expired()) {
-#ifdef GGML_VULKAN_DEBUG
-        std::cerr << "Initializing new vk_device" << std::endl;
-#endif
+        VK_LOG_DEBUG("Initializing new vk_device");
         std::shared_ptr<vk_device> device = std::make_shared<vk_device>();
         device->initialized = false;
         devices[idx] = device;
@@ -428,9 +507,7 @@ static vk_instance_t vk_instance;
 GGML_CALL static void ggml_backend_vk_free(ggml_backend_t backend);
 
 static void ggml_vk_create_pipeline(ggml_backend_vk_context * ctx, vk_pipeline& pipeline, const std::string& name, size_t spv_size, const void* spv_data, const std::string& entrypoint, uint32_t parameter_count, uint32_t push_constant_size, std::array<uint32_t, 3> wg_denoms, std::vector<uint32_t>&& specialization_constants, uint32_t align) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_create_pipeline(" << name << ", " << entrypoint << ", " << parameter_count << ", " << push_constant_size << ", (" << wg_denoms[0] << "," << wg_denoms[1] << "," << wg_denoms[2] << "), specialization_constants, " << align << ")" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_create_pipeline(" << name << ", " << entrypoint << ", " << parameter_count << ", " << push_constant_size << ", (" << wg_denoms[0] << "," << wg_denoms[1] << "," << wg_denoms[2] << "), specialization_constants, " << align << ")");
     GGML_ASSERT(parameter_count > 0);
     GGML_ASSERT(wg_denoms[0] > 0 && wg_denoms[1] > 0 && wg_denoms[2] > 0); // NOLINT
 
@@ -531,9 +608,7 @@ static void ggml_vk_create_pipeline(ggml_backend_vk_context * ctx, vk_pipeline&
 }
 
 static void ggml_vk_destroy_pipeline(vk::Device& device, vk_pipeline& pipeline) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_pipeline_destroy_pipeline(" << pipeline->name << ")" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_pipeline_destroy_pipeline(" << pipeline->name << ")");
     for (auto& pool : pipeline->descriptor_pools) {
         device.destroyDescriptorPool(pool);
     }
@@ -551,9 +626,7 @@ static void ggml_vk_destroy_pipeline(vk::Device& device, vk_pipeline& pipeline)
 }
 
 static void ggml_pipeline_allocate_descriptor_sets(ggml_backend_vk_context * ctx, vk_pipeline& pipeline, uint32_t n) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_pipeline_allocate_descriptor_sets(" << pipeline->name << ", " << n << ")" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_pipeline_allocate_descriptor_sets(" << pipeline->name << ", " << n << ")");
     if (pipeline->descriptor_sets.size() >= pipeline->descriptor_set_idx + n) {
         // Enough descriptors are available
         return;
@@ -583,16 +656,12 @@ static void ggml_pipeline_allocate_descriptor_sets(ggml_backend_vk_context * ctx
 }
 
 static void ggml_pipeline_cleanup(vk_pipeline& pipeline) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_pipeline_cleanup(" << pipeline->name << ")" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_pipeline_cleanup(" << pipeline->name << ")");
     pipeline->descriptor_set_idx = 0;
 }
 
 static vk::CommandBuffer ggml_vk_create_cmd_buffer(ggml_backend_vk_context * ctx, vk_queue& q) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_create_cmd_buffer()" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_create_cmd_buffer()");
     if (q.cmd_buffers.size() > q.cmd_buffer_idx) {
         // Reuse command buffer
         return q.cmd_buffers[q.cmd_buffer_idx++];
@@ -612,9 +681,7 @@ static vk::CommandBuffer ggml_vk_create_cmd_buffer(ggml_backend_vk_context * ctx
 }
 
 static vk_submission ggml_vk_create_submission(ggml_backend_vk_context * ctx, vk_queue& q, std::vector<vk_semaphore> wait_semaphores, std::vector<vk_semaphore> signal_semaphores) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_create_submission()" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_create_submission()");
     vk_submission s;
     s.buffer = ggml_vk_create_cmd_buffer(ctx, q);
     s.wait_semaphores = std::move(wait_semaphores);
@@ -623,9 +690,7 @@ static vk_submission ggml_vk_create_submission(ggml_backend_vk_context * ctx, vk
 }
 
 static void ggml_vk_submit(vk_context * ctx, vk::Fence fence) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_submit(" << ctx->seqs.size() << ", " << fence << ")" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_submit(" << ctx->seqs.size() << ", " << fence << ")");
     if (ctx->seqs.empty()) {
         return;
     }
@@ -699,9 +764,7 @@ static void ggml_vk_submit(vk_context * ctx, vk::Fence fence) {
 }
 
 static uint32_t ggml_vk_find_queue_family_index(std::vector<vk::QueueFamilyProperties>& queue_family_props, const vk::QueueFlags& required, const vk::QueueFlags& avoid, int32_t compute_index, uint32_t min_num_queues) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_find_queue_family_index()" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_find_queue_family_index()");
     const uint32_t qfsize = queue_family_props.size();
 
     // Try with avoid preferences first
@@ -747,9 +810,7 @@ static uint32_t ggml_vk_find_queue_family_index(std::vector<vk::QueueFamilyPrope
 }
 
 static void ggml_vk_create_queue(ggml_backend_vk_context * ctx, vk_queue& q, uint32_t queue_family_index, uint32_t queue_index, vk::PipelineStageFlags&& stage_flags) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_create_queue()" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_create_queue()");
     q.queue_family_index = queue_family_index;
 
     vk::CommandPoolCreateInfo command_pool_create_info_compute(vk::CommandPoolCreateFlags(VK_COMMAND_POOL_CREATE_TRANSIENT_BIT), queue_family_index);
@@ -763,9 +824,7 @@ static void ggml_vk_create_queue(ggml_backend_vk_context * ctx, vk_queue& q, uin
 }
 
 static vk_context * ggml_vk_create_context(ggml_backend_vk_context * ctx, vk_queue& q) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_create_context()" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_create_context()");
     ctx->gc.contexts.emplace_back();
     vk_context * result = &ctx->gc.contexts[ctx->gc.contexts.size() - 1];
     memset((void *) result, 0, sizeof(vk_context));
@@ -775,9 +834,7 @@ static vk_context * ggml_vk_create_context(ggml_backend_vk_context * ctx, vk_que
 }
 
 static vk_semaphore * ggml_vk_create_binary_semaphore(ggml_backend_vk_context * ctx) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_create_timeline_semaphore()" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_create_timeline_semaphore()");
     vk::SemaphoreTypeCreateInfo tci{ vk::SemaphoreType::eBinary, 0 };
     vk::SemaphoreCreateInfo ci{};
     ci.setPNext(&tci);
@@ -787,9 +844,7 @@ static vk_semaphore * ggml_vk_create_binary_semaphore(ggml_backend_vk_context *
 }
 
 static vk_semaphore * ggml_vk_create_timeline_semaphore(ggml_backend_vk_context * ctx) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_create_timeline_semaphore()" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_create_timeline_semaphore()");
     if (ctx->semaphore_idx >= ctx->gc.tl_semaphores.size()) {
         vk::SemaphoreTypeCreateInfo tci{ vk::SemaphoreType::eTimeline, 0 };
         vk::SemaphoreCreateInfo ci{};
@@ -808,9 +863,7 @@ static vk::Event ggml_vk_create_event(ggml_backend_vk_context * ctx) {
 }
 
 static void ggml_vk_queue_cleanup(ggml_backend_vk_context * ctx, vk_queue& q) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_queue_cleanup()" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_queue_cleanup()");
     // Requires command buffers to be done
 
     ctx->device->device.resetCommandPool(q.pool);
@@ -830,9 +883,7 @@ static uint32_t find_properties(const vk::PhysicalDeviceMemoryProperties* mem_pr
 }
 
 static vk_buffer ggml_vk_create_buffer(ggml_backend_vk_context * ctx, size_t size, vk::MemoryPropertyFlags req_flags, vk::MemoryPropertyFlags fallback_flags = vk::MemoryPropertyFlags(0)) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_create_buffer(device " << ctx->idx << ", " << size << ", " << to_string(req_flags) << ", " << to_string(fallback_flags) << ")" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_create_buffer(device " << ctx->idx << ", " << size << ", " << to_string(req_flags) << ", " << to_string(fallback_flags) << ")");
     vk_buffer buf = std::make_shared<vk_buffer_struct>();
 
     if (size == 0) {
@@ -892,8 +943,8 @@ static vk_buffer ggml_vk_create_buffer(ggml_backend_vk_context * ctx, size_t siz
 
     buf->device = ctx->device;
 
-#ifdef
-
+#ifdef GGML_VULKAN_MEMORY_DEBUG
+    ctx->memory_logger.log_allocation(buf, size);
 #endif
 
     return buf;
@@ -928,6 +979,14 @@ static vk_buffer ggml_vk_create_buffer_device(ggml_backend_vk_context * ctx, siz
 }
 
 static void ggml_vk_destroy_buffer(vk_buffer& buf) {
+    if (buf == nullptr) {
+        return;
+    }
+
+#ifdef GGML_VULKAN_MEMORY_DEBUG
+    buf->ctx->memory_logger.log_deallocation(buf);
+#endif
+
     buf.reset();
 }
 
@@ -936,9 +995,7 @@ static vk_subbuffer ggml_vk_subbuffer(vk_buffer& buf) {
 }
 
 static void ggml_vk_sync_buffers(vk_context * ctx) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_sync_buffers()" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_sync_buffers()");
     const std::vector<vk::MemoryBarrier> mem_barriers{ { { vk::AccessFlagBits::eMemoryRead | vk::AccessFlagBits::eMemoryWrite }, { vk::AccessFlagBits::eMemoryRead | vk::AccessFlagBits::eMemoryWrite } } };
 
     ctx->s->buffer.pipelineBarrier(
@@ -952,9 +1009,7 @@ static void ggml_vk_sync_buffers(vk_context * ctx) {
 }
 
 static void ggml_vk_wait_events(vk_context * ctx, std::vector<vk::Event>&& events) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_wait_events()" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_wait_events()");
     if (events.empty()) {
         return;
     }
@@ -989,9 +1044,7 @@ static bool ggml_vk_build_shader(ggml_type type) {
 }
 
 static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_load_shaders(" << ctx->name << ")" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_load_shaders(" << ctx->name << ")");
 
     const std::shared_ptr<vk_device> device = ctx->device;
 
@@ -1042,12 +1095,12 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) {
     ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K] = std::make_shared<vk_matmul_pipeline_struct>();
 
     if (device->fp16) {
-        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->l, "matmul_f32_l",
-        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->m, "matmul_f32_m",
-        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->s, "matmul_f32_s",
-        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->a_l, "matmul_f32_aligned_l",
-        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->a_m, "matmul_f32_aligned_m",
-        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->a_s, "matmul_f32_aligned_s",
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->l, "matmul_f32_l", matmul_f32_f32_len, matmul_f32_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, 1);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->m, "matmul_f32_m", matmul_f32_f32_len, matmul_f32_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, 1);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->s, "matmul_f32_s", matmul_f32_f32_len, matmul_f32_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, 1);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->a_l, "matmul_f32_aligned_l", matmul_f32_f32_aligned_len, matmul_f32_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, l_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->a_m, "matmul_f32_aligned_m", matmul_f32_f32_aligned_len, matmul_f32_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, m_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->a_s, "matmul_f32_aligned_s", matmul_f32_f32_aligned_len, matmul_f32_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, s_align);
 
         ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_f16->l, "matmul_f32_f16_l", matmul_f32_f16_len, matmul_f32_f16_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, 1);
         ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_f16->m, "matmul_f32_f16_m", matmul_f32_f16_len, matmul_f32_f16_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, 1);
@@ -1140,12 +1193,12 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) {
         ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->a_m, "matmul_q6_k_f32_aligned_m", matmul_q6_k_f32_aligned_len, matmul_q6_k_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
         ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->a_s, "matmul_q6_k_f32_aligned_s", matmul_q6_k_f32_aligned_len, matmul_q6_k_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
 
-        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->l, "matmul_id_f32_l",
-        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->m, "matmul_id_f32_m",
-        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->s, "matmul_id_f32_s",
-        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->a_l, "matmul_id_f32_aligned_l",
-        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->a_m, "matmul_id_f32_aligned_m",
-        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->a_s, "matmul_id_f32_aligned_s",
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->l, "matmul_id_f32_l", matmul_id_f32_f32_len, matmul_id_f32_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_l, 1);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->m, "matmul_id_f32_m", matmul_id_f32_f32_len, matmul_id_f32_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_m, 1);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->s, "matmul_id_f32_s", matmul_id_f32_f32_len, matmul_id_f32_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_s, 1);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->a_l, "matmul_id_f32_aligned_l", matmul_id_f32_f32_aligned_len, matmul_id_f32_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_l, l_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->a_m, "matmul_id_f32_aligned_m", matmul_id_f32_f32_aligned_len, matmul_id_f32_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_m, m_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->a_s, "matmul_id_f32_aligned_s", matmul_id_f32_f32_aligned_len, matmul_id_f32_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_s, s_align);
 
         ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f16->l, "matmul_id_f16_l", matmul_id_f16_len, matmul_id_f16_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_l, 1);
         ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f16->m, "matmul_id_f16_m", matmul_id_f16_len, matmul_id_f16_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_m, 1);
@@ -1231,12 +1284,12 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) {
         ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K]->a_m, "matmul_id_q6_k_f32_aligned_m", matmul_id_q6_k_f32_aligned_len, matmul_id_q6_k_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
         ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K]->a_s, "matmul_id_q6_k_f32_aligned_s", matmul_id_q6_k_f32_aligned_len, matmul_id_q6_k_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
     } else {
-        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->l, "matmul_f32_l",
-        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->m, "matmul_f32_m",
-        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->s, "matmul_f32_s",
-        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->a_l, "matmul_f32_aligned_l",
-        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->a_m, "matmul_f32_aligned_m",
-        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->a_s, "matmul_f32_aligned_s",
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->l, "matmul_f32_l", matmul_f32_f32_fp32_len, matmul_f32_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, 1);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->m, "matmul_f32_m", matmul_f32_f32_fp32_len, matmul_f32_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, 1);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->s, "matmul_f32_s", matmul_f32_f32_fp32_len, matmul_f32_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, 1);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->a_l, "matmul_f32_aligned_l", matmul_f32_f32_aligned_fp32_len, matmul_f32_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, l_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->a_m, "matmul_f32_aligned_m", matmul_f32_f32_aligned_fp32_len, matmul_f32_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, m_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->a_s, "matmul_f32_aligned_s", matmul_f32_f32_aligned_fp32_len, matmul_f32_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, s_align);
 
         ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_f16->l, "matmul_f32_f16_l", matmul_f32_f16_fp32_len, matmul_f32_f16_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, 1);
         ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_f16->m, "matmul_f32_f16_m", matmul_f32_f16_fp32_len, matmul_f32_f16_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, 1);
@@ -1329,12 +1382,12 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) {
         ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->a_m, "matmul_q6_k_f32_aligned_m", matmul_q6_k_f32_aligned_fp32_len, matmul_q6_k_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
         ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->a_s, "matmul_q6_k_f32_aligned_s", matmul_q6_k_f32_aligned_fp32_len, matmul_q6_k_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
 
-        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->l, "matmul_id_f32_l",
-        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->m, "matmul_id_f32_m",
-        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->s, "matmul_id_f32_s",
-        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->a_l, "matmul_id_f32_aligned_l",
-        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->a_m, "matmul_id_f32_aligned_m",
-        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->a_s, "matmul_id_f32_aligned_s",
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->l, "matmul_id_f32_l", matmul_id_f32_f32_fp32_len, matmul_id_f32_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_l, 1);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->m, "matmul_id_f32_m", matmul_id_f32_f32_fp32_len, matmul_id_f32_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_m, 1);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->s, "matmul_id_f32_s", matmul_id_f32_f32_fp32_len, matmul_id_f32_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_s, 1);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->a_l, "matmul_id_f32_aligned_l", matmul_id_f32_f32_aligned_fp32_len, matmul_id_f32_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_l, l_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->a_m, "matmul_id_f32_aligned_m", matmul_id_f32_f32_aligned_fp32_len, matmul_id_f32_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_m, m_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->a_s, "matmul_id_f32_aligned_s", matmul_id_f32_f32_aligned_fp32_len, matmul_id_f32_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_s, s_align);
 
         ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f16->l, "matmul_id_f16_l", matmul_id_f16_fp32_len, matmul_id_f16_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_l, 1);
         ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f16->m, "matmul_id_f16_m", matmul_id_f16_fp32_len, matmul_id_f16_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_m, 1);
@@ -1429,11 +1482,11 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) {
     ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q5_0], "mul_mat_vec_q5_0_f32_f32", mul_mat_vec_q5_0_f32_f32_len, mul_mat_vec_q5_0_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
     ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q5_1], "mul_mat_vec_q5_1_f32_f32", mul_mat_vec_q5_1_f32_f32_len, mul_mat_vec_q5_1_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
     ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q8_0], "mul_mat_vec_q8_0_f32_f32", mul_mat_vec_q8_0_f32_f32_len, mul_mat_vec_q8_0_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
-    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q2_K], "
-    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q3_K], "
-    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q4_K], "
-    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q5_K], "
-    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q6_K], "
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q2_K], "mul_mat_vec_q2_k_f32_f32", mul_mat_vec_q2_k_f32_f32_len, mul_mat_vec_q2_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q3_K], "mul_mat_vec_q3_k_f32_f32", mul_mat_vec_q3_k_f32_f32_len, mul_mat_vec_q3_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q4_K], "mul_mat_vec_q4_k_f32_f32", mul_mat_vec_q4_k_f32_f32_len, mul_mat_vec_q4_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q5_K], "mul_mat_vec_q5_k_f32_f32", mul_mat_vec_q5_k_f32_f32_len, mul_mat_vec_q5_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q6_K], "mul_mat_vec_q6_k_f32_f32", mul_mat_vec_q6_k_f32_f32_len, mul_mat_vec_q6_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
 
     ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_F32 ], "mul_mat_vec_f32_f16_f32", mul_mat_vec_f32_f16_f32_len, mul_mat_vec_f32_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
     ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_F16 ], "mul_mat_vec_f16_f16_f32", mul_mat_vec_f16_f16_f32_len, mul_mat_vec_f16_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
@@ -1442,11 +1495,11 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) {
     ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q5_0], "mul_mat_vec_q5_0_f16_f32", mul_mat_vec_q5_0_f16_f32_len, mul_mat_vec_q5_0_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
     ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q5_1], "mul_mat_vec_q5_1_f16_f32", mul_mat_vec_q5_1_f16_f32_len, mul_mat_vec_q5_1_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
     ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q8_0], "mul_mat_vec_q8_0_f16_f32", mul_mat_vec_q8_0_f16_f32_len, mul_mat_vec_q8_0_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
-    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q2_K], "
-    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q3_K], "
-    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q4_K], "
-    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q5_K], "
-    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q6_K], "
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q2_K], "mul_mat_vec_q2_k_f16_f32", mul_mat_vec_q2_k_f16_f32_len, mul_mat_vec_q2_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q3_K], "mul_mat_vec_q3_k_f16_f32", mul_mat_vec_q3_k_f16_f32_len, mul_mat_vec_q3_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q4_K], "mul_mat_vec_q4_k_f16_f32", mul_mat_vec_q4_k_f16_f32_len, mul_mat_vec_q4_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q5_K], "mul_mat_vec_q5_k_f16_f32", mul_mat_vec_q5_k_f16_f32_len, mul_mat_vec_q5_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q6_K], "mul_mat_vec_q6_k_f16_f32", mul_mat_vec_q6_k_f16_f32_len, mul_mat_vec_q6_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
 
     ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_F32 ], "mul_mat_vec_id_f32_f32", mul_mat_vec_id_f32_f32_len, mul_mat_vec_id_f32_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
     ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_F16 ], "mul_mat_vec_id_f16_f32", mul_mat_vec_id_f16_f32_len, mul_mat_vec_id_f16_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
@@ -1455,11 +1508,11 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) {
     ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q5_0], "mul_mat_vec_id_q5_0_f32", mul_mat_vec_id_q5_0_f32_len, mul_mat_vec_id_q5_0_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
     ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q5_1], "mul_mat_vec_id_q5_1_f32", mul_mat_vec_id_q5_1_f32_len, mul_mat_vec_id_q5_1_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
     ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q8_0], "mul_mat_vec_id_q8_0_f32", mul_mat_vec_id_q8_0_f32_len, mul_mat_vec_id_q8_0_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
-    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q2_K], "
-    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q3_K], "
-    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q4_K], "
-    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q5_K], "
-    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q6_K], "
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q2_K], "mul_mat_vec_id_q2_k_f32", mul_mat_vec_id_q2_k_f32_len, mul_mat_vec_id_q2_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q3_K], "mul_mat_vec_id_q3_k_f32", mul_mat_vec_id_q3_k_f32_len, mul_mat_vec_id_q3_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q4_K], "mul_mat_vec_id_q4_k_f32", mul_mat_vec_id_q4_k_f32_len, mul_mat_vec_id_q4_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q5_K], "mul_mat_vec_id_q5_k_f32", mul_mat_vec_id_q5_k_f32_len, mul_mat_vec_id_q5_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q6_K], "mul_mat_vec_id_q6_k_f32", mul_mat_vec_id_q6_k_f32_len, mul_mat_vec_id_q6_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
 
     // dequant shaders
     ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_F32 ], "f32_to_f16", dequant_f32_len, dequant_f32_data, "main", 2, 5 * sizeof(uint32_t), {256 * 16, 1, 1}, {}, 1);
@@ -1468,11 +1521,11 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) {
     ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_Q5_0], "dequant_q5_0", dequant_q5_0_len, dequant_q5_0_data, "main", 2, 5 * sizeof(uint32_t), {256 * 16, 1, 1}, {}, 1);
     ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_Q5_1], "dequant_q5_1", dequant_q5_1_len, dequant_q5_1_data, "main", 2, 5 * sizeof(uint32_t), {256 * 16, 1, 1}, {}, 1);
     ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_Q8_0], "dequant_q8_0", dequant_q8_0_len, dequant_q8_0_data, "main", 2, 5 * sizeof(uint32_t), {256 * 16, 1, 1}, {}, 1);
-    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_Q2_K], "
-    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_Q3_K], "
-    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_Q4_K], "
-    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_Q5_K], "
-    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_Q6_K], "
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_Q2_K], "dequant_q2_k", dequant_q2_k_len, dequant_q2_k_data, "main", 2, 5 * sizeof(uint32_t), {256 * 64, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_Q3_K], "dequant_q3_k", dequant_q3_k_len, dequant_q3_k_data, "main", 2, 5 * sizeof(uint32_t), {256 * 64, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_Q4_K], "dequant_q4_k", dequant_q4_k_len, dequant_q4_k_data, "main", 2, 5 * sizeof(uint32_t), {256 * 32, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_Q5_K], "dequant_q5_k", dequant_q5_k_len, dequant_q5_k_data, "main", 2, 5 * sizeof(uint32_t), {256 * 64, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_Q6_K], "dequant_q6_k", dequant_q6_k_len, dequant_q6_k_data, "main", 2, 5 * sizeof(uint32_t), {256 * 64, 1, 1}, {}, 1);
 
     // get_rows
     ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[GGML_TYPE_F32 ], "get_rows_f32", get_rows_f32_len, get_rows_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), { 512, 1, 1}, {}, 1);
@@ -1538,9 +1591,7 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) {
 static void ggml_vk_print_gpu_info(size_t idx) {
     GGML_ASSERT(idx < vk_instance.device_indices.size());
     size_t dev_num = vk_instance.device_indices[idx];
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_print_gpu_info(" << dev_num << ")" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_print_gpu_info(" << dev_num << ")");
     GGML_ASSERT(vk_instance.initialized);
 
     std::vector<vk::PhysicalDevice> devices = vk_instance.instance.enumeratePhysicalDevices();
@@ -1617,9 +1668,7 @@ void ggml_vk_instance_init() {
     if (vk_instance_initialized) {
         return;
     }
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_instance_init()" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_instance_init()");
 
     vk::ApplicationInfo app_info{ "ggml-vulkan", 1, nullptr, 0, VK_API_VERSION };
 
@@ -1696,33 +1745,37 @@ void ggml_vk_instance_init() {
 
         // Default to using all dedicated GPUs
         for (size_t i = 0; i < devices.size(); i++) {
-            vk::
-
-
+            vk::PhysicalDeviceProperties2 new_props;
+            vk::PhysicalDeviceDriverProperties new_driver;
+            vk::PhysicalDeviceIDProperties new_id;
+            new_props.pNext = &new_driver;
+            new_driver.pNext = &new_id;
+            devices[i].getProperties2(&new_props);
+
+            if (new_props.properties.deviceType == vk::PhysicalDeviceType::eDiscreteGpu) {
                 // Check if there are two physical devices corresponding to the same GPU
                 auto old_device = std::find_if(
                     vk_instance.device_indices.begin(),
                     vk_instance.device_indices.end(),
-                    [&devices, &
+                    [&devices, &new_id](const size_t k){
+                        vk::PhysicalDeviceProperties2 old_props;
+                        vk::PhysicalDeviceIDProperties old_id;
+                        old_props.pNext = &old_id;
+                        devices[k].getProperties2(&old_props);
+                        return std::equal(std::begin(old_id.deviceUUID), std::end(old_id.deviceUUID), std::begin(new_id.deviceUUID));
+                    }
                 );
                 if (old_device == vk_instance.device_indices.end()) {
                     vk_instance.device_indices.push_back(i);
                 } else {
                     // There can be two physical devices corresponding to the same GPU if there are 2 different drivers
                     // This can cause error when splitting layers aross the devices, need to keep only 1
-#ifdef GGML_VULKAN_DEBUG
-                    std::cerr << "Device " << i << " and device " << *old_device << " have the same device id" << std::endl;
-#endif
+                    VK_LOG_DEBUG("Device " << i << " and device " << *old_device << " have the same deviceUUID");
 
-                    vk::PhysicalDeviceProperties2
+                    vk::PhysicalDeviceProperties2 old_props;
                     vk::PhysicalDeviceDriverProperties old_driver;
-
-                    devices[*old_device].getProperties2(&
-
-                    vk::PhysicalDeviceProperties2 new_prop;
-                    vk::PhysicalDeviceDriverProperties new_driver;
-                    new_prop.pNext = &new_driver;
-                    devices[i].getProperties2(&new_prop);
+                    old_props.pNext = &old_driver;
+                    devices[*old_device].getProperties2(&old_props);
 
                     std::map<vk::DriverId, int> driver_priorities {};
                     int old_priority = std::numeric_limits<int>::max();
@@ -1730,7 +1783,7 @@ void ggml_vk_instance_init() {
 
                     // Check https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkDriverId.html for the list of driver id
                     // Smaller number -> higher priority
-                    switch (
+                    switch (old_props.properties.vendorID) {
                         case VK_VENDOR_ID_AMD:
                             driver_priorities[vk::DriverId::eMesaRadv] = 1;
                             driver_priorities[vk::DriverId::eAmdOpenSource] = 2;
@@ -1760,16 +1813,11 @@ void ggml_vk_instance_init() {
                     vk_instance.device_indices.erase(r, vk_instance.device_indices.end());
                     vk_instance.device_indices.push_back(i);
 
-#ifdef GGML_VULKAN_DEBUG
-                    std::cerr << "Prioritize device " << i << " driver " << new_driver.driverName << " over device " << *old_device << " driver " << old_driver.driverName << std::endl;
-#endif
+                    VK_LOG_DEBUG("Prioritize device " << i << " driver " << new_driver.driverName << " over device " << *old_device << " driver " << old_driver.driverName);
                 }
-#ifdef GGML_VULKAN_DEBUG
                 else {
-
-
+                    VK_LOG_DEBUG("Prioritize device " << *old_device << " driver " << old_driver.driverName << " over device " << i << " driver " << new_driver.driverName << std::endl);
                 }
-#endif
             }
         }
     }
@@ -1792,9 +1840,7 @@ void ggml_vk_instance_init() {
 static void ggml_vk_init(ggml_backend_vk_context * ctx, size_t idx) {
     GGML_ASSERT(idx < vk_instance.device_indices.size());
     size_t dev_num = vk_instance.device_indices[idx];
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_init(" << ctx->name << ", " << dev_num << ")" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_init(" << ctx->name << ", " << dev_num << ")");
     ggml_vk_instance_init();
 
     std::vector<vk::PhysicalDevice> devices = vk_instance.instance.enumeratePhysicalDevices();
@@ -1967,9 +2013,7 @@ static void ggml_vk_init(ggml_backend_vk_context * ctx, size_t idx) {
 }
 
 static vk_pipeline ggml_vk_get_to_fp16(ggml_backend_vk_context * ctx, ggml_type type) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_get_to_fp16()" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_get_to_fp16()");
     switch (type) {
         case GGML_TYPE_F32:
         case GGML_TYPE_Q4_0:
@@ -1991,9 +2035,7 @@ static vk_pipeline ggml_vk_get_to_fp16(ggml_backend_vk_context * ctx, ggml_type
 }
 
 static vk_matmul_pipeline ggml_vk_get_mul_mat_mat_pipeline(ggml_backend_vk_context * ctx, ggml_type src0_type, ggml_type src1_type) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_get_mul_mat_mat_pipeline()" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_get_mul_mat_mat_pipeline()");
     if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F32) {
         return ctx->device->pipeline_matmul_f32;
     }
@@ -2029,9 +2071,7 @@ static vk_matmul_pipeline ggml_vk_get_mul_mat_mat_pipeline(ggml_backend_vk_conte
 }
 
 static vk_pipeline ggml_vk_get_dequantize_mul_mat_vec(ggml_backend_vk_context * ctx, ggml_type a_type, ggml_type b_type) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_get_dequantize_mul_mat_vec()" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_get_dequantize_mul_mat_vec()");
     GGML_ASSERT(b_type == GGML_TYPE_F32 || b_type == GGML_TYPE_F16);
 
     switch (a_type) {
@@ -2056,9 +2096,7 @@ static vk_pipeline ggml_vk_get_dequantize_mul_mat_vec(ggml_backend_vk_context *
 }
 
 static vk_matmul_pipeline ggml_vk_get_mul_mat_mat_id_pipeline(ggml_backend_vk_context * ctx, ggml_type src0_type, ggml_type src1_type) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_get_mul_mat_mat_id_pipeline()" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_get_mul_mat_mat_id_pipeline()");
     if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F32) {
         return ctx->device->pipeline_matmul_id_f32;
     }
@@ -2091,9 +2129,7 @@ static vk_matmul_pipeline ggml_vk_get_mul_mat_mat_id_pipeline(ggml_backend_vk_co
 }
 
 static vk_pipeline ggml_vk_get_dequantize_mul_mat_vec_id(ggml_backend_vk_context * ctx, ggml_type a_type, ggml_type b_type) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_get_dequantize_mul_mat_vec()" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_get_dequantize_mul_mat_vec()");
     GGML_ASSERT(b_type == GGML_TYPE_F32);
 
     switch (a_type) {
@@ -2118,9 +2154,9 @@ static vk_pipeline ggml_vk_get_dequantize_mul_mat_vec_id(ggml_backend_vk_context
 }
 
 static vk_buffer ggml_vk_pool_malloc(ggml_backend_vk_context * ctx, size_t size) {
-
-
-
+    VK_LOG_DEBUG("ggml_vk_pool_malloc(" << size << ")");
+    VK_LOG_MEMORY("ggml_vk_pool_malloc");
+
     int best_i = -1;
     size_t best_size = std::numeric_limits<size_t>::max(); //smallest unused buffer that fits our needs
     int worst_i = -1;
@@ -2148,13 +2184,11 @@ static vk_buffer ggml_vk_pool_malloc(ggml_backend_vk_context * ctx, size_t size)
         ggml_vk_destroy_buffer(b);
     }
 
-    return
+    return ggml_vk_create_buffer_device(ctx, size);
 }
 
 static void ggml_vk_pool_free(ggml_backend_vk_context * ctx, vk_buffer& buffer) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_pool_free(" << buffer->size << ")" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_pool_free(" << buffer->size << ")");
     for (int i = 0; i < MAX_VK_BUFFERS; ++i) {
         vk_buffer& b = ctx->buffer_pool[i];
         if (b == nullptr) {
@@ -2175,6 +2209,8 @@ static vk_buffer ggml_vk_create_buffer_temp(ggml_backend_vk_context * ctx, size_
         }
     }
 
+    VK_LOG_MEMORY("ggml_vk_create_buffer_temp(" << size << ")");
+
     // Otherwise create new buffer
     vk_buffer buf = ggml_vk_pool_malloc(ctx, size);
     ctx->gc.temp_buffers.push_back(buf);
@@ -2183,9 +2219,7 @@ static vk_buffer ggml_vk_create_buffer_temp(ggml_backend_vk_context * ctx, size_
 }
 
 static void * ggml_vk_host_malloc(ggml_backend_vk_context * ctx, size_t size) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_host_malloc(" << size << ")" << std::endl;
-#endif
+    VK_LOG_MEMORY("ggml_vk_host_malloc(" << size << ")");
     vk_buffer buf = ggml_vk_create_buffer(ctx, size,
         vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached,
         vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
@@ -2207,9 +2241,7 @@ static void ggml_vk_host_free(ggml_backend_vk_context * ctx, void* ptr) {
     if (ptr == nullptr) {
         return;
     }
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_host_free(" << ptr << ")" << std::endl;
-#endif
+    VK_LOG_MEMORY("ggml_vk_host_free(" << ptr << ")");
     vk_buffer buf;
     size_t index;
     for (size_t i = 0; i < ctx->pinned_memory.size(); i++) {
@@ -2261,13 +2293,11 @@ static void ggml_vk_dispatch_pipeline(ggml_backend_vk_context * ctx, vk_context
     const uint32_t wg0 = CEIL_DIV(elements[0], pipeline->wg_denoms[0]);
     const uint32_t wg1 = CEIL_DIV(elements[1], pipeline->wg_denoms[1]);
     const uint32_t wg2 = CEIL_DIV(elements[2], pipeline->wg_denoms[2]);
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_dispatch_pipeline(" << pipeline->name << ", {";
+    VK_LOG_DEBUG("ggml_vk_dispatch_pipeline(" << pipeline->name << ", {";
     for (auto& buffer : buffers) {
         std::cerr << "(" << buffer.buffer << ", " << buffer.offset << ", " << buffer.size << "), ";
     }
-    std::cerr << "}, (" << wg0 << "," << wg1 << "," << wg2 << "))"
-#endif
+    std::cerr << "}, (" << wg0 << "," << wg1 << "," << wg2 << "))");
     std::vector<vk::DescriptorBufferInfo> descriptor_buffer_infos;
     std::vector<vk::WriteDescriptorSet> write_descriptor_sets;
     GGML_ASSERT(pipeline->descriptor_set_idx < pipeline->descriptor_sets.size());
@@ -2300,9 +2330,7 @@ static void ggml_vk_end_submission(vk_submission& s, std::vector<vk_semaphore> w
 }
 
 static void ggml_vk_ctx_end(vk_context * ctx) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_ctx_end(" << ctx << ", " << ctx->seqs.size() << ")" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_ctx_end(" << ctx << ", " << ctx->seqs.size() << ")");
     if (ctx->s == nullptr) {
         return;
     }
@@ -2312,9 +2340,7 @@ static void ggml_vk_ctx_end(vk_context * ctx) {
 }
 
 static void ggml_vk_ctx_begin(ggml_backend_vk_context * ctx, vk_context * subctx) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_ctx_begin(" << ctx << ")" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_ctx_begin(" << ctx << ")");
     if (subctx->s != nullptr) {
         ggml_vk_ctx_end(subctx);
     }
@@ -2324,9 +2350,7 @@ static void ggml_vk_ctx_begin(ggml_backend_vk_context * ctx, vk_context * subctx
 }

 static size_t ggml_vk_align_size(size_t width, size_t align) {
-#ifdef GGML_VULKAN_DEBUG
-std::cerr << "ggml_vk_align_size(" << width << ", " << align << ")" << std::endl;
-#endif
+VK_LOG_DEBUG("ggml_vk_align_size(" << width << ", " << align << ")");
 return CEIL_DIV(width, align) * align;
 }

@@ -2340,6 +2364,7 @@ static void deferred_memcpy(void * dst, const void * src, size_t size, std::vect

 static void ggml_vk_ensure_sync_staging_buffer(ggml_backend_vk_context * ctx, size_t size) {
 if (ctx->sync_staging == nullptr || ctx->sync_staging->size < size) {
+VK_LOG_MEMORY("ggml_vk_ensure_sync_staging_buffer(" << size << ")");
 ggml_vk_destroy_buffer(ctx->sync_staging);
 ctx->sync_staging = ggml_vk_create_buffer_check(ctx, size,
 vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached,
@@ -2348,9 +2373,7 @@ static void ggml_vk_ensure_sync_staging_buffer(ggml_backend_vk_context * ctx, si
 }

 static void ggml_vk_buffer_write_nc_async(ggml_backend_vk_context * ctx, vk_context * subctx, vk_buffer& dst, size_t offset, const ggml_tensor * tensor, bool sync_staging = false) {
-#ifdef GGML_VULKAN_DEBUG
-std::cerr << "ggml_vk_buffer_write_nc_async(" << tensor << ")" << std::endl;
-#endif
+VK_LOG_DEBUG("ggml_vk_buffer_write_nc_async(" << tensor << ")");
 GGML_ASSERT(!ggml_is_contiguous(tensor));
 // Buffer is already mapped
 if(dst->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible) {
@@ -2455,9 +2478,7 @@ static void ggml_vk_buffer_write_nc_async(ggml_backend_vk_context * ctx, vk_cont
 }

 static void ggml_vk_buffer_write_2d_async(ggml_backend_vk_context * ctx, vk_context * subctx, vk_buffer& dst, size_t offset, const void * src, size_t spitch, size_t width, size_t height, bool sync_staging = false) {
-#ifdef GGML_VULKAN_DEBUG
-std::cerr << "ggml_vk_buffer_write_2d_async(" << width << ", " << height << ")" << std::endl;
-#endif
+VK_LOG_DEBUG("ggml_vk_buffer_write_2d_async(" << width << ", " << height << ")");
 // Make sure ctx owns the buffer
 GGML_ASSERT(dst->ctx == ctx);

@@ -2492,9 +2513,7 @@ static void ggml_vk_buffer_write_2d_async(ggml_backend_vk_context * ctx, vk_cont
 subctx->s->buffer.copyBuffer(buf->buffer, dst->buffer, slices);
 return;
 }
-#ifdef GGML_VULKAN_DEBUG
-std::cerr << "STAGING" << std::endl;
-#endif
+VK_LOG_DEBUG("STAGING");

 // Staging buffer required
 vk_buffer staging = ctx->staging;
@@ -2529,16 +2548,12 @@ static void ggml_vk_buffer_write_2d_async(ggml_backend_vk_context * ctx, vk_cont
 }

 static void ggml_vk_buffer_write_async(ggml_backend_vk_context * ctx, vk_context * subctx, vk_buffer& dst, size_t offset, const void * src, size_t size, bool sync_staging = false) {
-#ifdef GGML_VULKAN_DEBUG
-std::cerr << "ggml_vk_buffer_write_async(" << size << ")" << std::endl;
-#endif
+VK_LOG_DEBUG("ggml_vk_buffer_write_async(" << size << ")");
 return ggml_vk_buffer_write_2d_async(ctx, subctx, dst, offset, src, size, size, 1, sync_staging);
 }

 static void ggml_vk_buffer_write_2d(ggml_backend_vk_context * ctx, vk_buffer& dst, size_t offset, const void * src, size_t spitch, size_t width, size_t height) {
-#ifdef GGML_VULKAN_DEBUG
-std::cerr << "ggml_vk_buffer_write_2d(" << width << ", " << height << ")" << std::endl;
-#endif
+VK_LOG_DEBUG("ggml_vk_buffer_write_2d(" << width << ", " << height << ")");
 // Buffer is already mapped
 if(dst->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible) {
 GGML_ASSERT(dst->memory_property_flags & vk::MemoryPropertyFlagBits::eHostCoherent);
@@ -2563,16 +2578,12 @@ static void ggml_vk_buffer_write_2d(ggml_backend_vk_context * ctx, vk_buffer& ds
 }

 static void ggml_vk_buffer_write(ggml_backend_vk_context * ctx, vk_buffer& dst, size_t offset, const void * src, size_t size) {
-#ifdef GGML_VULKAN_DEBUG
-std::cerr << "ggml_vk_buffer_write(" << size << ")" << std::endl;
-#endif
+VK_LOG_DEBUG("ggml_vk_buffer_write(" << size << ")");
 ggml_vk_buffer_write_2d(ctx, dst, offset, src, 0, size, 1);
 }

 static void ggml_vk_buffer_read_2d_async(ggml_backend_vk_context * ctx, vk_context * subctx, vk_buffer& src, size_t offset, void * dst, size_t spitch, size_t dpitch, size_t width, size_t height, bool sync_staging = false) {
-#ifdef GGML_VULKAN_DEBUG
-std::cerr << "ggml_vk_buffer_read_2d_async(offset=" << offset << ", width=" << width << ", height=" << height << ")" << std::endl;
-#endif
+VK_LOG_DEBUG("ggml_vk_buffer_read_2d_async(offset=" << offset << ", width=" << width << ", height=" << height << ")");
 GGML_ASSERT(width > 0);
 GGML_ASSERT(height > 0);
 GGML_ASSERT(src != nullptr);
@@ -2606,9 +2617,7 @@ static void ggml_vk_buffer_read_2d_async(ggml_backend_vk_context * ctx, vk_conte

 return;
 }
-#ifdef GGML_VULKAN_DEBUG
-std::cerr << "STAGING" << std::endl;
-#endif
+VK_LOG_DEBUG("STAGING");

 // Fall back to staging buffer
 vk_buffer staging = ctx->staging;
@@ -2635,9 +2644,7 @@ static void ggml_vk_buffer_read_async(ggml_backend_vk_context * ctx, vk_context
 }

 static void ggml_vk_buffer_read(ggml_backend_vk_context * ctx, vk_buffer& src, size_t offset, void * dst, size_t size) {
-#ifdef GGML_VULKAN_DEBUG
-std::cerr << "ggml_vk_buffer_read(" << offset << ", " << size << ")" << std::endl;
-#endif
+VK_LOG_DEBUG("ggml_vk_buffer_read(" << offset << ", " << size << ")");
 if(src->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible) {
 GGML_ASSERT(src->memory_property_flags & vk::MemoryPropertyFlagBits::eHostCoherent);

@@ -2659,9 +2666,7 @@ static void ggml_vk_buffer_read(ggml_backend_vk_context * ctx, vk_buffer& src, s
 }

 static void ggml_vk_buffer_copy_async(vk_context * ctx, vk_buffer& dst, size_t dst_offset, vk_buffer& src, size_t src_offset, size_t size) {
-#ifdef GGML_VULKAN_DEBUG
-std::cerr << "ggml_vk_buffer_copy_async(" << size << ")" << std::endl;
-#endif
+VK_LOG_DEBUG("ggml_vk_buffer_copy_async(" << size << ")");
 // Make sure both buffers are on same ctx
 GGML_ASSERT(src->ctx == dst->ctx);

@@ -2672,9 +2677,7 @@ static void ggml_vk_buffer_copy_async(vk_context * ctx, vk_buffer& dst, size_t d

 static void ggml_vk_buffer_copy(vk_buffer& dst, size_t dst_offset, vk_buffer& src, size_t src_offset, size_t size) {
 if (src->ctx == dst->ctx) {
-#ifdef GGML_VULKAN_DEBUG
-std::cerr << "ggml_vk_buffer_copy(SINGLE_DEVICE, " << size << ")" << std::endl;
-#endif
+VK_LOG_DEBUG("ggml_vk_buffer_copy(SINGLE_DEVICE, " << size << ")");
 // Copy within the device
 ggml_backend_vk_context * ctx = src->ctx;

@@ -2686,9 +2689,7 @@ static void ggml_vk_buffer_copy(vk_buffer& dst, size_t dst_offset, vk_buffer& sr
 VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "vk_buffer_copy waitForFences");
 ctx->device->device.resetFences({ ctx->fence });
 } else {
-#ifdef GGML_VULKAN_DEBUG
-std::cerr << "ggml_vk_buffer_copy(MULTI_DEVICE, " << size << ")" << std::endl;
-#endif
+VK_LOG_DEBUG("ggml_vk_buffer_copy(MULTI_DEVICE, " << size << ")");
 // Copy device to device
 ggml_backend_vk_context * src_ctx = src->ctx;
 ggml_backend_vk_context * dst_ctx = dst->ctx;
@@ -2706,9 +2707,7 @@ static void ggml_vk_buffer_copy(vk_buffer& dst, size_t dst_offset, vk_buffer& sr
 }

 static void ggml_vk_buffer_memset(ggml_backend_vk_context * ctx, vk_buffer& dst, size_t offset, uint32_t c, size_t size) {
-#ifdef GGML_VULKAN_DEBUG
-std::cerr << "ggml_vk_buffer_memset(" << offset << ", " << c << ", " << size << ")" << std::endl;
-#endif
+VK_LOG_DEBUG("ggml_vk_buffer_memset(" << offset << ", " << c << ", " << size << ")");
 // Make sure ctx owns the buffer
 GGML_ASSERT(dst->ctx == ctx);

@@ -2723,9 +2722,7 @@ static void ggml_vk_buffer_memset(ggml_backend_vk_context * ctx, vk_buffer& dst,
 }

 static void ggml_vk_h2d_tensor_2d(ggml_backend_vk_context * ctx, vk_context * subctx, vk_buffer& dst, size_t offset, const ggml_tensor * src, uint64_t i3, uint64_t i2, uint64_t i1) {
-#ifdef GGML_VULKAN_DEBUG
-std::cerr << "ggml_vk_h2d_tensor_2d(dst=" << dst << ", offset=" << offset << ", src=" << src << ", i3=" << i3 << ", i2=" << i2 << ", i1=" << i1 << ")" << std::endl;
-#endif
+VK_LOG_DEBUG("ggml_vk_h2d_tensor_2d(dst=" << dst << ", offset=" << offset << ", src=" << src << ", i3=" << i3 << ", i2=" << i2 << ", i1=" << i1 << ")");
 const uint64_t ne0 = src->ne[0];
 const uint64_t ne1 = src->ne[1];
 const uint64_t nb0 = src->nb[0];
@@ -2753,9 +2750,7 @@ static void ggml_vk_h2d_tensor_2d(ggml_backend_vk_context * ctx, vk_context * su
 }

 static void ggml_vk_d2h_tensor_2d(ggml_backend_vk_context * ctx, vk_context * subctx, vk_buffer& src, size_t offset, const ggml_tensor * dst) {
-#ifdef GGML_VULKAN_DEBUG
-std::cerr << "ggml_vk_d2h_tensor_2d()" << std::endl;
-#endif
+VK_LOG_DEBUG("ggml_vk_d2h_tensor_2d()");
 const uint64_t ne0 = dst->ne[0];
 const uint64_t ne1 = dst->ne[1];
 const uint64_t ne2 = dst->ne[2];
@@ -2779,9 +2774,7 @@ static void ggml_vk_d2h_tensor_2d(ggml_backend_vk_context * ctx, vk_context * su
 }

 static uint32_t ggml_vk_guess_split_k(int m, int n, int k) {
-#ifdef GGML_VULKAN_DEBUG
-std::cerr << "ggml_vk_guess_split_k(" << m << ", " << n << ", " << k << ")" << std::endl;
-#endif
+VK_LOG_DEBUG("ggml_vk_guess_split_k(" << m << ", " << n << ", " << k << ")");
 // if (k > 128 && (m < 128 || n < 128) && m > 2 && n > 2) {
 // return 4;
 // }
@@ -2813,9 +2806,7 @@ static vk_pipeline ggml_vk_guess_matmul_pipeline_intel(ggml_backend_vk_context *
 }

 static vk_pipeline ggml_vk_guess_matmul_pipeline(ggml_backend_vk_context * ctx, vk_matmul_pipeline& mmp, int m, int n, bool aligned) {
-#ifdef GGML_VULKAN_DEBUG
-std::cerr << "ggml_vk_guess_matmul_pipeline(" << m << ", " << n << ", " << aligned << ")" << std::endl;
-#endif
+VK_LOG_DEBUG("ggml_vk_guess_matmul_pipeline(" << m << ", " << n << ", " << aligned << ")");
 switch (ctx->device->vendor_id) {
 case VK_VENDOR_ID_AMD:
 return ggml_vk_guess_matmul_pipeline_amd(ctx, mmp, m, n, aligned);
@@ -2837,9 +2828,7 @@ static vk_pipeline ggml_vk_guess_matmul_pipeline(ggml_backend_vk_context * ctx,
 }

 static uint32_t ggml_vk_guess_matmul_pipeline_align(ggml_backend_vk_context * ctx, vk_matmul_pipeline& mmp, int m, int n) {
-#ifdef GGML_VULKAN_DEBUG
-std::cerr << "ggml_vk_guess_matmul_pipeline_align(" << m << ", " << n << ")" << std::endl;
-#endif
+VK_LOG_DEBUG("ggml_vk_guess_matmul_pipeline_align(" << m << ", " << n << ")");
 return ggml_vk_guess_matmul_pipeline(ctx, mmp, m, n, true)->align;
 }

@@ -2849,9 +2838,7 @@ static void ggml_vk_matmul(
 uint32_t m, uint32_t n, uint32_t k, uint32_t stride_a, uint32_t stride_b, uint32_t stride_d,
 uint32_t batch_stride_a, uint32_t batch_stride_b, uint32_t batch_stride_d,
 uint32_t split_k, uint32_t batch, uint32_t ne02, uint32_t ne12, uint32_t broadcast2, uint32_t broadcast3) {
-#ifdef GGML_VULKAN_DEBUG
-std::cerr << "ggml_vk_matmul(a: (" << a.buffer->buffer << ", " << a.offset << ", " << a.size << "), b: (" << b.buffer->buffer << ", " << b.offset << ", " << b.size << "), d: (" << d.buffer->buffer << ", " << d.offset << ", " << d.size << "), split_k: (" << (split_k_buffer.buffer != nullptr ? split_k_buffer.buffer->buffer : VK_NULL_HANDLE) << ", " << split_k_buffer.offset << ", " << split_k_buffer.size << "), m: " << m << ", n: " << n << ", k: " << k << ", stride_a: " << stride_a << ", stride_b: " << stride_b << ", stride_d: " << stride_d << ", batch_stride_a: " << batch_stride_a << ", batch_stride_b: " << batch_stride_b << ", batch_stride_d: " << batch_stride_d << ", split_k: " << split_k << ", batch: " << batch << ", ne02: " << ne02 << ", ne12: " << ne12 << ", broadcast2: " << broadcast2 << ", broadcast3: " << broadcast3 << ")" << std::endl;
-#endif
+VK_LOG_DEBUG("ggml_vk_matmul(a: (" << a.buffer->buffer << ", " << a.offset << ", " << a.size << "), b: (" << b.buffer->buffer << ", " << b.offset << ", " << b.size << "), d: (" << d.buffer->buffer << ", " << d.offset << ", " << d.size << "), split_k: (" << (split_k_buffer.buffer != nullptr ? split_k_buffer.buffer->buffer : VK_NULL_HANDLE) << ", " << split_k_buffer.offset << ", " << split_k_buffer.size << "), m: " << m << ", n: " << n << ", k: " << k << ", stride_a: " << stride_a << ", stride_b: " << stride_b << ", stride_d: " << stride_d << ", batch_stride_a: " << batch_stride_a << ", batch_stride_b: " << batch_stride_b << ", batch_stride_d: " << batch_stride_d << ", split_k: " << split_k << ", batch: " << batch << ", ne02: " << ne02 << ", ne12: " << ne12 << ", broadcast2: " << broadcast2 << ", broadcast3: " << broadcast3 << ")");
 ggml_vk_sync_buffers(subctx);
 if (split_k == 1) {
 const vk_mat_mat_push_constants pc = { m, n, k, stride_a, stride_b, stride_d, batch_stride_a, batch_stride_b, batch_stride_d, k, ne02, ne12, broadcast2, broadcast3 };
@@ -2875,12 +2862,10 @@ static void ggml_vk_matmul_id(
 uint32_t m, uint32_t n, uint32_t k, uint32_t stride_a, uint32_t stride_b, uint32_t stride_d,
 uint32_t batch_stride_a, uint32_t batch_stride_b, uint32_t batch_stride_d,
 uint32_t n_as, uint32_t nei0, uint32_t nei1, uint32_t nbi1, uint32_t ne11) {
-#ifdef GGML_VULKAN_DEBUG
-std::cerr << "ggml_vk_matmul_id(a: (" << a.buffer->buffer << ", " << a.offset << ", " << a.size << "), b: (" << b.buffer->buffer << ", " << b.offset << ", " << b.size << "), d: (" << d.buffer->buffer << ", " << d.offset << ", " << d.size << "), ids: (" << ids.buffer->buffer << ", " << ids.offset << ", " << ids.size << "), " <<
+VK_LOG_DEBUG("ggml_vk_matmul_id(a: (" << a.buffer->buffer << ", " << a.offset << ", " << a.size << "), b: (" << b.buffer->buffer << ", " << b.offset << ", " << b.size << "), d: (" << d.buffer->buffer << ", " << d.offset << ", " << d.size << "), ids: (" << ids.buffer->buffer << ", " << ids.offset << ", " << ids.size << "), " <<
 "m: " << m << ", n: " << n << ", k: " << k << ", stride_a: " << stride_a << ", stride_b: " << stride_b << ", stride_d: " << stride_d << ", " <<
 "batch_stride_a: " << batch_stride_a << ", batch_stride_b: " << batch_stride_b << ", batch_stride_d: " << batch_stride_d << ", " <<
-"n_as: " << n_as << ", nei0: " << nei0 << ", nei1: " << nei1 << ", nbi1: " << nbi1 << ", ne11: " << ne11 << ")" << std::endl;
-#endif
+"n_as: " << n_as << ", nei0: " << nei0 << ", nei1: " << nei1 << ", nbi1: " << nbi1 << ", ne11: " << ne11 << ")");
 ggml_vk_sync_buffers(subctx);
 const vk_mat_mat_id_push_constants pc = { m, n, k, stride_a, stride_b, stride_d, batch_stride_a, batch_stride_b, batch_stride_d,
 nei0, nei1, nbi1, ne11 };
@@ -2910,10 +2895,8 @@ static vk_pipeline ggml_vk_get_cpy_pipeline(ggml_backend_vk_context * ctx, ggml_
 }

 static void ggml_vk_cpy_to_contiguous(ggml_backend_vk_context * ctx, vk_context * subctx, vk_pipeline pipeline, const ggml_tensor * tensor, vk_subbuffer&& in, vk_subbuffer&& out) {
-#ifdef GGML_VULKAN_DEBUG
-std::cerr << "ggml_vk_cpy_to_contiguous((" << tensor << ", type=" << tensor->type << ", ne0=" << tensor->ne[0] << ", ne1=" << tensor->ne[1] << ", ne2=" << tensor->ne[2] << ", ne3=" << tensor->ne[3] << ", nb0=" << tensor->nb[0] << ", nb1=" << tensor->nb[1] << ", nb2=" << tensor->nb[2] << ", nb3=" << tensor->nb[3] << "), ";
-std::cerr << "buffer in size=" << in.buffer->size << ", buffer out size=" << out.buffer->size << ")" << std::endl;
-#endif
+VK_LOG_DEBUG("ggml_vk_cpy_to_contiguous((" << tensor << ", type=" << tensor->type << ", ne0=" << tensor->ne[0] << ", ne1=" << tensor->ne[1] << ", ne2=" << tensor->ne[2] << ", ne3=" << tensor->ne[3] << ", nb0=" << tensor->nb[0] << ", nb1=" << tensor->nb[1] << ", nb2=" << tensor->nb[2] << ", nb3=" << tensor->nb[3] << "), ";
+std::cerr << "buffer in size=" << in.buffer->size << ", buffer out size=" << out.buffer->size << ")");
 const int tensor_type_size = ggml_type_size(tensor->type);

 const uint32_t ne = ggml_nelements(tensor);
@@ -2930,11 +2913,9 @@ static void ggml_vk_cpy_to_contiguous(ggml_backend_vk_context * ctx, vk_context
 }

 static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-#ifdef GGML_VULKAN_DEBUG
-std::cerr << "ggml_vk_mul_mat_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
+VK_LOG_DEBUG("ggml_vk_mul_mat_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
 std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
-std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)" << std::endl;
-#endif
+std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)");
 GGML_ASSERT(ggml_vk_dim01_contiguous(src0) || src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16); // NOLINT
 GGML_ASSERT(ggml_vk_dim01_contiguous(src1) || src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16); // NOLINT

@@ -3105,11 +3086,9 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context * su
 }

 static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-#ifdef GGML_VULKAN_DEBUG
-std::cerr << "ggml_vk_mul_mat_vec_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
+VK_LOG_DEBUG("ggml_vk_mul_mat_vec_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
 std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
-std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)" << std::endl;
-#endif
+std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)");
 GGML_ASSERT(ggml_vk_dim01_contiguous(src0) || src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16); // NOLINT
 GGML_ASSERT(ggml_vk_dim01_contiguous(src1) || src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16); // NOLINT

@@ -3260,11 +3239,9 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context
 }

 static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-#ifdef GGML_VULKAN_DEBUG
-std::cerr << "ggml_vk_mul_mat_p021_f16_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
+VK_LOG_DEBUG("ggml_vk_mul_mat_p021_f16_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
 std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
-std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)" << std::endl;
-#endif
+std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)");
 GGML_ASSERT(ggml_is_permuted(src0) && ggml_is_permuted(src1));
 GGML_ASSERT(src0->nb[0] <= src0->nb[1] && src0->nb[2] <= src0->nb[3]); // NOLINT
 GGML_ASSERT(src1->nb[0] <= src1->nb[1] && src1->nb[2] <= src1->nb[3]); // NOLINT
@@ -3333,11 +3310,9 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_c
 }

 static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-#ifdef GGML_VULKAN_DEBUG
-std::cerr << "ggml_vk_mul_mat_nc_f16_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
+VK_LOG_DEBUG("ggml_vk_mul_mat_nc_f16_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
 std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
-std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)" << std::endl;
-#endif
+std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)");
 GGML_ASSERT(!ggml_is_transposed(src0));
 GGML_ASSERT(!ggml_is_transposed(src1));
 GGML_ASSERT(!ggml_is_permuted(src0));
@@ -3410,9 +3385,7 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con
 }

 static void ggml_vk_mul_mat(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-#ifdef GGML_VULKAN_DEBUG
-std::cerr << "ggml_vk_mul_mat(" << src0 << ", " << src1 << ", " << dst << ")" << std::endl;
-#endif
+VK_LOG_DEBUG("ggml_vk_mul_mat(" << src0 << ", " << src1 << ", " << dst << ")");
 if (src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && dst->ne[1] == 1) {
 ggml_vk_mul_mat_vec_p021_f16_f32(ctx, subctx, src0, src1, dst);
 } else if (src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && dst->ne[1] == 1) {
@@ -3425,12 +3398,10 @@ static void ggml_vk_mul_mat(ggml_backend_vk_context * ctx, vk_context * subctx,
 }

 static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst) {
-#ifdef GGML_VULKAN_DEBUG
-std::cerr << "ggml_vk_mul_mat_id_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
+VK_LOG_DEBUG("ggml_vk_mul_mat_id_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
 std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
 std::cerr << "), (" << ids << ", name=" << ids->name << ", type=" << ids->type << ", ne0=" << ids->ne[0] << ", ne1=" << ids->ne[1] << ", ne2=" << ids->ne[2] << ", ne3=" << ids->ne[3] << ", nb0=" << ids->nb[0] << ", nb1=" << ids->nb[1] << ", nb2=" << ids->nb[2] << ", nb3=" << ids->nb[3];
-std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)" << std::endl;
-#endif
+std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)");
 GGML_ASSERT(ggml_vk_dim01_contiguous(src1) || src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16); // NOLINT
 GGML_ASSERT(ids->type == GGML_TYPE_I32);

@@ -3616,12 +3587,10 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context *
 }

 static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst) {
-#ifdef GGML_VULKAN_DEBUG
-std::cerr << "ggml_vk_mul_mat_vec_id_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
+VK_LOG_DEBUG("ggml_vk_mul_mat_vec_id_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
 std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
 std::cerr << "), (" << ids << ", name=" << ids->name << ", type=" << ids->type << ", ne0=" << ids->ne[0] << ", ne1=" << ids->ne[1] << ", ne2=" << ids->ne[2] << ", ne3=" << ids->ne[3] << ", nb0=" << ids->nb[0] << ", nb1=" << ids->nb[1] << ", nb2=" << ids->nb[2] << ", nb3=" << ids->nb[3];
-std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)" << std::endl;
-#endif
+std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)");
 GGML_ASSERT(ggml_vk_dim01_contiguous(src0) || src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16); // NOLINT
 GGML_ASSERT(ggml_vk_dim01_contiguous(src1) || src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16); // NOLINT
 GGML_ASSERT(ids->type == GGML_TYPE_I32);
@@ -3784,9 +3753,7 @@ static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_conte
 }

 static void ggml_vk_mul_mat_id(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst) {
-#ifdef GGML_VULKAN_DEBUG
-std::cerr << "ggml_vk_mul_mat_id(" << src0 << ", " << src1 << ", " << src2 << ", " << dst << ")" << std::endl;
-#endif
+VK_LOG_DEBUG("ggml_vk_mul_mat_id(" << src0 << ", " << src1 << ", " << src2 << ", " << dst << ")");
 if (src2->ne[1] == 1 && (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type))) {
 ggml_vk_mul_mat_vec_id_q_f16(ctx, subctx, src0, src1, src2, dst);
 } else {
@@ -4020,16 +3987,14 @@ static bool ggml_vk_op_supports_incontiguous(ggml_op op) {

 template<typename PC>
 static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst, ggml_op op, const PC&& pc) {
-#ifdef GGML_VULKAN_DEBUG
-std::cerr << "ggml_vk_op_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
+VK_LOG_DEBUG("ggml_vk_op_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
 if (src1 != nullptr) {
 std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
 }
 if (src2 != nullptr) {
 std::cerr << "), (" << src2 << ", name=" << src2->name << ", type=" << src2->type << ", ne0=" << src2->ne[0] << ", ne1=" << src2->ne[1] << ", ne2=" << src2->ne[2] << ", ne3=" << src2->ne[3] << ", nb0=" << src2->nb[0] << ", nb1=" << src2->nb[1] << ", nb2=" << src2->nb[2] << ", nb3=" << src2->nb[3];
 }
-std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "), " << ggml_op_name(op) << ")" << std::endl;
-#endif
+std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "), " << ggml_op_name(op) << ")");
 GGML_ASSERT(op == GGML_OP_GET_ROWS || (!ggml_is_quantized(src0->type) && (src1 == nullptr || !ggml_is_quantized(src1->type)))); // NOLINT
 GGML_ASSERT(ggml_vk_op_supports_incontiguous(op) || ggml_vk_dim01_contiguous(src0)); // NOLINT
 GGML_ASSERT(dst->extra != nullptr);
@@ -4527,9 +4492,7 @@ static void ggml_vk_print_matrix_area(const void * data, ggml_type type, int ne0

 template <typename X_TYPE, typename Y_TYPE>
 static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t n, size_t k, size_t batch, size_t num_it, int split_k, int shader_size) {
-#ifdef GGML_VULKAN_DEBUG
-std::cerr << "ggml_vk_test_matmul(" << m << ", " << n << ", " << k << ", " << batch << ", " << num_it << ", " << split_k << ", " << shader_size << ")" << std::endl;
-#endif
+VK_LOG_DEBUG("ggml_vk_test_matmul(" << m << ", " << n << ", " << k << ", " << batch << ", " << num_it << ", " << split_k << ", " << shader_size << ")");
 const size_t x_ne = m * k * batch;
 const size_t y_ne = k * n * batch;
 const size_t d_ne = m * n * batch;
@@ -4943,9 +4906,7 @@ static void ggml_vk_test_h2d_nc(ggml_backend_vk_context * ctx, size_t ne0, size_
 }

 static void ggml_vk_test_transfer(ggml_backend_vk_context * ctx, size_t ne, bool pinned) {
-#ifdef GGML_VULKAN_DEBUG
-std::cerr << "ggml_vk_test_transfer(" << ne << ")" << std::endl;
-#endif
+VK_LOG_DEBUG("ggml_vk_test_transfer(" << ne << ")");
 // Check transfers are correct
 vk_buffer buffer = ggml_vk_create_buffer_check(ctx, sizeof(float) * ne, vk::MemoryPropertyFlagBits::eDeviceLocal);

@@ -5029,9 +4990,7 @@ static void ggml_vk_quantize_data(const float * from, void * to, size_t ne, ggml
 }

 static void ggml_vk_test_dequant(ggml_backend_vk_context * ctx, size_t ne, ggml_type quant) {
-#ifdef GGML_VULKAN_DEBUG
-std::cerr << "ggml_vk_test_dequant(" << ne << ")" << std::endl;
-#endif
+VK_LOG_DEBUG("ggml_vk_test_dequant(" << ne << ")");
 const size_t x_sz = sizeof(float) * ne;
 const size_t x_sz_f16 = sizeof(ggml_fp16_t) * ne;
 const size_t qx_sz = ne * ggml_type_size(quant)/ggml_blck_size(quant);
@@ -5108,9 +5067,7 @@ static void ggml_vk_test_dequant(ggml_backend_vk_context * ctx, size_t ne, ggml_
 }

 static void ggml_vk_test_dequant_matmul(ggml_backend_vk_context * ctx, size_t m, size_t n, size_t k, size_t batch, size_t num_it, size_t split_k, size_t shader_size, ggml_type quant) {
-#ifdef GGML_VULKAN_DEBUG
-std::cerr << "ggml_vk_test_dequant_matmul(" << m << ", " << n << ", " << k << ", " << batch << ", " << num_it << ", " << split_k << ", " << ggml_type_name(quant) << ")" << std::endl;
-#endif
+VK_LOG_DEBUG("ggml_vk_test_dequant_matmul(" << m << ", " << n << ", " << k << ", " << batch << ", " << num_it << ", " << split_k << ", " << ggml_type_name(quant) << ")");
 const size_t x_ne = m * k * batch;
 const size_t y_ne = k * n * batch;
 const size_t d_ne = m * n * batch;
@@ -5294,9 +5251,7 @@ static void ggml_vk_test_dequant_matmul(ggml_backend_vk_context * ctx, size_t m,
 #endif

 static ggml_tensor_extra_gpu * ggml_vk_tensor_create_extra(ggml_tensor * tensor) {
-#ifdef GGML_VULKAN_DEBUG
-std::cerr << "ggml_vk_create_extra(" << tensor << " (" << tensor->name << ", " << ggml_op_name(tensor->op) << "))" << std::endl;
-#endif
+VK_LOG_DEBUG("ggml_vk_create_extra(" << tensor << " (" << tensor->name << ", " << ggml_op_name(tensor->op) << "))");
 ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu;
 extra->reset();
 tensor->extra = extra;
@@ -5304,9 +5259,7 @@ static ggml_tensor_extra_gpu * ggml_vk_tensor_create_extra(ggml_tensor * tensor)
 }

 static void ggml_vk_preallocate_buffers_graph(ggml_backend_vk_context * ctx, ggml_tensor * node){
-#ifdef GGML_VULKAN_DEBUG
-std::cerr << "ggml_vk_preallocate_buffers_graph(" << node << ")" << std::endl;
-#endif
+VK_LOG_DEBUG("ggml_vk_preallocate_buffers_graph(" << node << ")");
 ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) node->extra;

 if (extra == nullptr) {
@@ -5341,7 +5294,7 @@ static void ggml_vk_preallocate_buffers_graph(ggml_backend_vk_context * ctx, ggm

 bool mmp = (use_src0 && use_src1 && src1_type == GGML_TYPE_F32) ? ggml_vk_get_mul_mat_mat_pipeline(ctx, src0_type, y_non_contig ? GGML_TYPE_F16 : src1->type) != nullptr : false;

-const bool qx_needs_dequant = use_src0 && (mmp || x_non_contig);
+const bool qx_needs_dequant = use_src0 && (!mmp || x_non_contig);
 const bool qy_needs_dequant = use_src1 && ((src1->type != GGML_TYPE_F16 && !y_f32_kernel) || y_non_contig);

 int split_k;
@@ -5419,9 +5372,6 @@ static void ggml_vk_preallocate_buffers_graph(ggml_backend_vk_context * ctx, ggm
 }

 static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
-#ifdef GGML_VULKAN_DEBUG
-std::cerr << "ggml_vk_preallocate_buffers(x_size: " << ctx->prealloc_size_x << " y_size: " << ctx->prealloc_size_y << " split_k_size: " << ctx->prealloc_size_split_k << ")" << std::endl;
-#endif
 #if defined(GGML_VULKAN_RUN_TESTS)
 ctx->staging = ggml_vk_create_buffer_check(ctx, 100ul * 1024ul * 1024ul,
 vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached,
@@ -5560,6 +5510,7 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
 #endif

 if (ctx->prealloc_x == nullptr || (ctx->prealloc_size_x > 0 && ctx->prealloc_x->size < ctx->prealloc_size_x)) {
+VK_LOG_MEMORY("ggml_vk_preallocate_buffers(x_size: " << ctx->prealloc_size_x << ")");
 // Resize buffer
 if (ctx->prealloc_x != nullptr) {
 ggml_vk_destroy_buffer(ctx->prealloc_x);
@@ -5567,6 +5518,7 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
 ctx->prealloc_x = ggml_vk_create_buffer_device(ctx, ctx->prealloc_size_x);
 }
 if (ctx->prealloc_y == nullptr || (ctx->prealloc_size_y > 0 && ctx->prealloc_y->size < ctx->prealloc_size_y)) {
+VK_LOG_MEMORY("ggml_vk_preallocate_buffers(y_size: " << ctx->prealloc_size_y << ")");
 // Resize buffer
 if (ctx->prealloc_y != nullptr) {
 ggml_vk_destroy_buffer(ctx->prealloc_y);
@@ -5574,6 +5526,7 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
 ctx->prealloc_y = ggml_vk_create_buffer_device(ctx, ctx->prealloc_size_y);
 }
 if (ctx->prealloc_split_k == nullptr || (ctx->prealloc_size_split_k > 0 && ctx->prealloc_split_k->size < ctx->prealloc_size_split_k)) {
+VK_LOG_MEMORY("ggml_vk_preallocate_buffers(split_k_size: " << ctx->prealloc_size_split_k << ")");
 // Resize buffer
 if (ctx->prealloc_split_k != nullptr) {
 ggml_vk_destroy_buffer(ctx->prealloc_split_k);
@@ -5581,6 +5534,7 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
 ctx->prealloc_split_k = ggml_vk_create_buffer_device(ctx, ctx->prealloc_size_split_k);
 }
 if (ctx->staging == nullptr || (ctx->staging_size > 0 && ctx->staging->size < ctx->staging_size)) {
+VK_LOG_MEMORY("ggml_vk_preallocate_buffers(staging_size: " << ctx->staging_size << ")");
 // Resize buffer
 if (ctx->staging != nullptr) {
 ggml_vk_destroy_buffer(ctx->staging);
@@ -5598,9 +5552,7 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
 return;
 }

-#ifdef GGML_VULKAN_DEBUG
-std::cerr << "ggml_vk_build_graph(" << node << ", " << ggml_op_name(node->op) << ")" << std::endl;
-#endif
+VK_LOG_DEBUG("ggml_vk_build_graph(" << node << ", " << ggml_op_name(node->op) << ")");
 ctx->semaphore_idx = 0;
 ctx->staging_offset = 0;

@@ -5823,9 +5775,7 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_compute_
 return true;
 }

-#ifdef GGML_VULKAN_DEBUG
-std::cerr << "ggml_vk_compute_forward(" << tensor << ", name=" << tensor->name << ", op=" << ggml_op_name(tensor->op) << ", type=" << tensor->type << ", ne0=" << tensor->ne[0] << ", ne1=" << tensor->ne[1] << ", ne2=" << tensor->ne[2] << ", ne3=" << tensor->ne[3] << ", nb0=" << tensor->nb[0] << ", nb1=" << tensor->nb[1] << ", nb2=" << tensor->nb[2] << ", nb3=" << tensor->nb[3] << ", view_src=" << tensor->view_src << ", view_offs=" << tensor->view_offs << ")" << std::endl;
-#endif
+VK_LOG_DEBUG("ggml_vk_compute_forward(" << tensor << ", name=" << tensor->name << ", op=" << ggml_op_name(tensor->op) << ", type=" << tensor->type << ", ne0=" << tensor->ne[0] << ", ne1=" << tensor->ne[1] << ", ne2=" << tensor->ne[2] << ", ne3=" << tensor->ne[3] << ", nb0=" << tensor->nb[0] << ", nb1=" << tensor->nb[1] << ", nb2=" << tensor->nb[2] << ", nb3=" << tensor->nb[3] << ", view_src=" << tensor->view_src << ", view_offs=" << tensor->view_offs << ")");

 #ifdef GGML_VULKAN_CHECK_RESULTS
 ggml_vk_check_results_0(ctx, params, tensor);
@@ -5860,9 +5810,7 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_compute_

 // Clean up after graph processing is done
 static void ggml_vk_graph_cleanup(ggml_backend_vk_context * ctx) {
-#ifdef GGML_VULKAN_DEBUG
-std::cerr << "ggml_vk_graph_cleanup()" << std::endl;
-#endif
+VK_LOG_DEBUG("ggml_vk_graph_cleanup()");
 for (auto& buffer : ctx->gc.temp_buffers) {
 ggml_vk_pool_free(ctx, buffer);
 }
@@ -5906,9 +5854,7 @@ static void ggml_vk_graph_cleanup(ggml_backend_vk_context * ctx) {

 // Clean up on backend free
 static void ggml_vk_cleanup(ggml_backend_vk_context * ctx) {
-#ifdef GGML_VULKAN_DEBUG
-std::cerr << "ggml_vk_cleanup(" << ctx->idx << ")" << std::endl;
-#endif
+VK_LOG_DEBUG("ggml_vk_cleanup(" << ctx->idx << ")");
 ggml_vk_graph_cleanup(ctx);

 ggml_vk_destroy_buffer(ctx->prealloc_x);
@@ -6003,9 +5949,7 @@ GGML_CALL static bool ggml_backend_buffer_is_vk(ggml_backend_buffer_t buffer) {
 }

 GGML_CALL static void ggml_backend_vk_buffer_free_buffer(ggml_backend_buffer_t buffer) {
-#ifdef GGML_VULKAN_DEBUG
-std::cerr << "ggml_backend_vk_buffer_free_buffer()" << std::endl;
-#endif
+VK_LOG_MEMORY("ggml_backend_vk_buffer_free_buffer()");
 ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;
 ggml_vk_destroy_buffer(ctx->dev_buffer);
 delete ctx;
@@ -6018,9 +5962,7 @@ GGML_CALL static void * ggml_backend_vk_buffer_get_base(ggml_backend_buffer_t bu
 }

 GGML_CALL static void ggml_backend_vk_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
-#ifdef GGML_VULKAN_DEBUG
-std::cerr << "ggml_backend_vk_buffer_init_tensor(" << buffer << " (" << buffer->context << "), " << tensor << ")" << std::endl;
-#endif
+VK_LOG_DEBUG("ggml_backend_vk_buffer_init_tensor(" << buffer << " (" << buffer->context << "), " << tensor << ")");
 ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;

 if (tensor->view_src != nullptr) {
@@ -6036,9 +5978,7 @@ GGML_CALL static void ggml_backend_vk_buffer_init_tensor(ggml_backend_buffer_t b
 }

 GGML_CALL static void ggml_backend_vk_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
-#ifdef GGML_VULKAN_DEBUG
-std::cerr << "ggml_backend_vk_buffer_set_tensor(" << buffer << ", " << tensor << ", " << data << ", " << offset << ", " << size << ")" << std::endl;
-#endif
+VK_LOG_DEBUG("ggml_backend_vk_buffer_set_tensor(" << buffer << ", " << tensor << ", " << data << ", " << offset << ", " << size << ")");
 ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;

 ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
@@ -6049,9 +5989,7 @@ GGML_CALL static void ggml_backend_vk_buffer_set_tensor(ggml_backend_buffer_t bu
 }

 GGML_CALL static void ggml_backend_vk_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
-#ifdef GGML_VULKAN_DEBUG
-std::cerr << "ggml_backend_vk_buffer_get_tensor(" << buffer << ", " << tensor << ", " << data << ", " << offset << ", " << size << ")" << std::endl;
-#endif
+VK_LOG_DEBUG("ggml_backend_vk_buffer_get_tensor(" << buffer << ", " << tensor << ", " << data << ", " << offset << ", " << size << ")");
 ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;

 ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
@@ -6109,9 +6047,7 @@ GGML_CALL static const char * ggml_backend_vk_buffer_type_name(ggml_backend_buff
 }

 GGML_CALL static ggml_backend_buffer_t ggml_backend_vk_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
-#ifdef GGML_VULKAN_DEBUG
-std::cerr << "ggml_backend_vk_buffer_type_alloc_buffer(" << size << ")" << std::endl;
-#endif
+VK_LOG_MEMORY("ggml_backend_vk_buffer_type_alloc_buffer(" << size << ")");
 ggml_backend_vk_buffer_type_context * ctx = (ggml_backend_vk_buffer_type_context *) buft->context;

 vk_buffer dev_buffer = nullptr;
@@ -6154,9 +6090,7 @@ static ggml_backend_buffer_type_i ggml_backend_vk_buffer_type_interface = {
 GGML_CALL ggml_backend_buffer_type_t ggml_backend_vk_buffer_type(size_t dev_num) {
 ggml_vk_instance_init();

-#ifdef GGML_VULKAN_DEBUG
-std::cerr << "ggml_backend_vk_buffer_type(" << dev_num << ")" << std::endl;
-#endif
+VK_LOG_DEBUG("ggml_backend_vk_buffer_type(" << dev_num << ")");

 GGML_ASSERT(dev_num < vk_instance.device_indices.size());

@@ -6180,16 +6114,12 @@ GGML_CALL static const char * ggml_backend_vk_host_buffer_name(ggml_backend_buff
 }

 GGML_CALL static void ggml_backend_vk_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
-#ifdef GGML_VULKAN_DEBUG
-std::cerr << "ggml_backend_vk_host_buffer_free_buffer()" << std::endl;
-#endif
+VK_LOG_MEMORY("ggml_backend_vk_host_buffer_free_buffer()");
 ggml_vk_host_free(&vk_instance.contexts[0], buffer->context);
 }

 GGML_CALL static ggml_backend_buffer_t ggml_backend_vk_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
-#ifdef GGML_VULKAN_DEBUG
-std::cerr << "ggml_backend_vk_host_buffer_type_alloc_buffer(" << size << ")" << std::endl;
-#endif
+VK_LOG_MEMORY("ggml_backend_vk_host_buffer_type_alloc_buffer(" << size << ")");
 size += 32; // Behave like the CPU buffer type
 void * ptr = nullptr;
 try {
@@ -6246,9 +6176,7 @@ GGML_CALL static const char * ggml_backend_vk_name(ggml_backend_t backend) {

 GGML_CALL static void ggml_backend_vk_free(ggml_backend_t backend) {
 ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
-#ifdef GGML_VULKAN_DEBUG
-std::cerr << "ggml_backend_vk_free(" << ctx->name << ")" << std::endl;
-#endif
+VK_LOG_DEBUG("ggml_backend_vk_free(" << ctx->name << ")");

 size_t idx = ctx->idx;

@@ -6272,9 +6200,7 @@ GGML_CALL static ggml_backend_buffer_type_t ggml_backend_vk_get_default_buffer_t
 }

 GGML_CALL static void ggml_backend_vk_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
-#ifdef GGML_VULKAN_DEBUG
-std::cerr << "ggml_backend_vk_set_tensor_async(" << size << ")" << std::endl;
-#endif
+VK_LOG_DEBUG("ggml_backend_vk_set_tensor_async(" << size << ")");
 ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
 GGML_ASSERT((tensor->buffer->buft == ggml_backend_vk_buffer_type(ctx->idx) || tensor->buffer->buft == ggml_backend_vk_host_buffer_type()) && "unsupported buffer type");

@@ -6292,9 +6218,7 @@ GGML_CALL static void ggml_backend_vk_set_tensor_async(ggml_backend_t backend, g
 }

 GGML_CALL static void ggml_backend_vk_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
-#ifdef GGML_VULKAN_DEBUG
-std::cerr << "ggml_backend_vk_get_tensor_async(" << size << ")" << std::endl;
-#endif
+VK_LOG_DEBUG("ggml_backend_vk_get_tensor_async(" << size << ")");
 ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
 GGML_ASSERT((tensor->buffer->buft == ggml_backend_vk_buffer_type(ctx->idx) || tensor->buffer->buft == ggml_backend_vk_host_buffer_type()) && "unsupported buffer type");

@@ -6312,9 +6236,7 @@ GGML_CALL static void ggml_backend_vk_get_tensor_async(ggml_backend_t backend, c
 }

 GGML_CALL static bool ggml_backend_vk_cpy_tensor_async(ggml_backend_t backend, const ggml_tensor * src, ggml_tensor * dst) {
-#ifdef GGML_VULKAN_DEBUG
-std::cerr << "ggml_backend_vk_cpy_tensor_async()" << std::endl;
-#endif
+VK_LOG_DEBUG("ggml_backend_vk_cpy_tensor_async()");
 ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
 if ((dst->buffer->buft == ggml_backend_vk_buffer_type(ctx->idx) || dst->buffer->buft == ggml_backend_vk_host_buffer_type()) && ggml_backend_buffer_is_vk(src->buffer)) {
 ggml_tensor_extra_gpu * src_extra = (ggml_tensor_extra_gpu *) src->extra;
@@ -6337,9 +6259,7 @@ GGML_CALL static bool ggml_backend_vk_cpy_tensor_async(ggml_backend_t backend, c
 }

 GGML_CALL static void ggml_backend_vk_synchronize(ggml_backend_t backend) {
-#ifdef GGML_VULKAN_DEBUG
-std::cerr << "ggml_backend_vk_synchronize()" << std::endl;
-#endif
+VK_LOG_DEBUG("ggml_backend_vk_synchronize()");
 ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
 if(ctx->transfer_ctx == nullptr) {
 return;
@@ -6367,9 +6287,7 @@ static bool ggml_vk_is_empty(ggml_tensor * node) {
 }

 GGML_CALL static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
-#ifdef GGML_VULKAN_DEBUG
-std::cerr << "ggml_backend_vk_graph_compute(" << cgraph->n_nodes << " nodes)" << std::endl;
-#endif
+VK_LOG_DEBUG("ggml_backend_vk_graph_compute(" << cgraph->n_nodes << " nodes)");
 ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;

 for (int i = 0; i < cgraph->n_nodes; i++) {
@@ -6582,9 +6500,7 @@ GGML_CALL ggml_backend_t ggml_backend_vk_init(size_t dev_num) {
 if (vk_instance.initialized[dev_num]) {
 return vk_instance.backends[dev_num];
 }
-#ifdef GGML_VULKAN_DEBUG
-std::cerr << "ggml_backend_vk_init(" << dev_num << ")" << std::endl;
-#endif
+VK_LOG_DEBUG("ggml_backend_vk_init(" << dev_num << ")");

 ggml_backend_vk_context * ctx = &vk_instance.contexts[dev_num];
 ggml_vk_init(ctx, dev_num);
@@ -6800,9 +6716,7 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
 return;
 }

-#ifdef GGML_VULKAN_DEBUG
-std::cerr << "ggml_vk_check_results_0(" << tensor->name << ")" << std::endl;
-#endif
+VK_LOG_DEBUG("ggml_vk_check_results_0(" << tensor->name << ")");

 ggml_tensor * src0 = tensor->src[0];
 ggml_tensor * src1 = tensor->src[1];
@@ -7108,9 +7022,7 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_compute_
 return;
 }

-#ifdef GGML_VULKAN_DEBUG
-std::cerr << "ggml_vk_check_results_1(" << tensor->name << ")" << std::endl;
-#endif
+VK_LOG_DEBUG("ggml_vk_check_results_1(" << tensor->name << ")");

 ggml_tensor * src0 = tensor->src[0];
|