llama_cpp 0.14.2 → 0.14.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +6 -0
- data/ext/llama_cpp/llama_cpp.cpp +60 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +2 -0
- data/vendor/tmp/llama.cpp/Makefile +20 -3
- data/vendor/tmp/llama.cpp/ggml-alloc.c +7 -3
- data/vendor/tmp/llama.cpp/ggml-backend-impl.h +5 -0
- data/vendor/tmp/llama.cpp/ggml-backend.c +154 -124
- data/vendor/tmp/llama.cpp/ggml-backend.h +4 -4
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +8741 -8691
- data/vendor/tmp/llama.cpp/ggml-cuda.h +6 -15
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +1 -0
- data/vendor/tmp/llama.cpp/ggml-metal.m +34 -11
- data/vendor/tmp/llama.cpp/ggml-metal.metal +260 -28
- data/vendor/tmp/llama.cpp/ggml-quants.c +25 -13
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +237 -78
- data/vendor/tmp/llama.cpp/ggml-sycl.h +6 -1
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +7 -0
- data/vendor/tmp/llama.cpp/ggml.c +98 -16
- data/vendor/tmp/llama.cpp/llama.cpp +382 -42
- data/vendor/tmp/llama.cpp/llama.h +19 -4
- metadata +3 -3
@@ -16,6 +16,7 @@
|
|
16
16
|
#include <cinttypes>
|
17
17
|
#include <cstddef>
|
18
18
|
#include <cstdint>
|
19
|
+
#include <cstdlib>
|
19
20
|
#include <float.h>
|
20
21
|
#include <limits>
|
21
22
|
#include <stdint.h>
|
@@ -24,10 +25,9 @@
|
|
24
25
|
#include <cmath>
|
25
26
|
#include <iostream>
|
26
27
|
#include <fstream>
|
27
|
-
|
28
28
|
#include <stdio.h>
|
29
29
|
#include <stdlib.h>
|
30
|
-
|
30
|
+
#include <regex>
|
31
31
|
|
32
32
|
#include <sycl/sycl.hpp>
|
33
33
|
#include <sycl/half_type.hpp>
|
@@ -82,6 +82,30 @@ Following definition copied from DPCT head files, which are used by ggml-sycl.cp
|
|
82
82
|
#define __dpct_noinline__ __attribute__((noinline))
|
83
83
|
#endif
|
84
84
|
|
85
|
+
|
86
|
+
// Returns a short lowercase label for the SYCL device type of `Device`:
// "cpu", "gpu", "host", "acc" (accelerator), or "unknown" for anything else.
std::string get_device_type_name(const sycl::device &Device) {
    const char *label = "unknown";
    switch (Device.get_info<sycl::info::device::device_type>()) {
        case sycl::info::device_type::cpu:
            label = "cpu";
            break;
        case sycl::info::device_type::gpu:
            label = "gpu";
            break;
        case sycl::info::device_type::host:
            label = "host";
            break;
        case sycl::info::device_type::accelerator:
            label = "acc";
            break;
        default:
            break;
    }
    return label;
}
|
101
|
+
|
102
|
+
// Builds the "<backend>:<type>" key for a device by streaming the device's
// sycl::backend value followed by the label from get_device_type_name().
std::string get_device_backend_and_type(const sycl::device &device) {
    std::stringstream key;
    key << device.get_backend() << ":" << get_device_type_name(device);
    return key.str();
}
|
108
|
+
|
85
109
|
namespace dpct
|
86
110
|
{
|
87
111
|
typedef sycl::queue *queue_ptr;
|
@@ -942,17 +966,67 @@ namespace dpct
|
|
942
966
|
|
943
967
|
private:
|
944
968
|
mutable std::recursive_mutex m_mutex;
|
969
|
+
// Ordering predicate for std::sort: device1 goes before device2 when it
// reports strictly more max compute units (i.e. descending order).
static bool compare_dev(sycl::device &device1, sycl::device &device2)
{
    dpct::device_info first_info;
    dpct::device_info second_info;
    dpct::get_device_info(first_info, device1);
    dpct::get_device_info(second_info, device2);

    const auto first_units  = first_info.get_max_compute_units();
    const auto second_units = second_info.get_max_compute_units();
    return first_units > second_units;
}
|
977
|
+
// Maps a "<backend>:<type>" key to its sort priority (lower sorts first).
// The position of a key in the table below is its priority. An unrecognised
// key is reported and then trips GGML_ASSERT(false).
static int convert_backend_index(std::string & backend) {
    static const char *const known_backends[] = {
        "ext_oneapi_level_zero:gpu",
        "opencl:gpu",
        "ext_oneapi_cuda:gpu",
        "ext_oneapi_hip:gpu",
        "opencl:cpu",
        "opencl:acc",
    };

    int priority = 0;
    for (const char *candidate : known_backends) {
        if (backend == candidate) {
            return priority;
        }
        ++priority;
    }

    printf("convert_backend_index: can't handle backend=%s\n", backend.c_str());
    GGML_ASSERT(false);
}
|
987
|
+
static bool compare_backend(std::string &backend1, std::string &backend2) {
|
988
|
+
return convert_backend_index(backend1) < convert_backend_index(backend2);
|
989
|
+
}
|
945
990
|
dev_mgr()
|
946
991
|
{
|
947
992
|
sycl::device default_device =
|
948
993
|
sycl::device(sycl::default_selector_v);
|
949
994
|
_devs.push_back(std::make_shared<device_ext>(default_device));
|
950
995
|
|
951
|
-
std::vector<sycl::device> sycl_all_devs
|
952
|
-
sycl::device::get_devices(sycl::info::device_type::all);
|
996
|
+
std::vector<sycl::device> sycl_all_devs;
|
953
997
|
// Collect other devices except for the default device.
|
954
998
|
if (default_device.is_cpu())
|
955
999
|
_cpu_device = 0;
|
1000
|
+
|
1001
|
+
auto Platforms = sycl::platform::get_platforms();
|
1002
|
+
// Keep track of the number of devices per backend
|
1003
|
+
std::map<sycl::backend, size_t> DeviceNums;
|
1004
|
+
std::map<std::string, std::vector<sycl::device>> backend_devices;
|
1005
|
+
|
1006
|
+
while (!Platforms.empty()) {
|
1007
|
+
auto Platform = Platforms.back();
|
1008
|
+
Platforms.pop_back();
|
1009
|
+
auto devices = Platform.get_devices();
|
1010
|
+
std::string backend_type = get_device_backend_and_type(devices[0]);
|
1011
|
+
for (const auto &device : devices) {
|
1012
|
+
backend_devices[backend_type].push_back(device);
|
1013
|
+
}
|
1014
|
+
}
|
1015
|
+
|
1016
|
+
std::vector<std::string> keys;
|
1017
|
+
for(auto it = backend_devices.begin(); it != backend_devices.end(); ++it) {
|
1018
|
+
keys.push_back(it->first);
|
1019
|
+
}
|
1020
|
+
std::sort(keys.begin(), keys.end(), compare_backend);
|
1021
|
+
|
1022
|
+
for (auto &key : keys) {
|
1023
|
+
std::vector<sycl::device> devs = backend_devices[key];
|
1024
|
+
std::sort(devs.begin(), devs.end(), compare_dev);
|
1025
|
+
for (const auto &dev : devs) {
|
1026
|
+
sycl_all_devs.push_back(dev);
|
1027
|
+
}
|
1028
|
+
}
|
1029
|
+
|
956
1030
|
for (auto &dev : sycl_all_devs)
|
957
1031
|
{
|
958
1032
|
if (dev == default_device)
|
@@ -3202,6 +3276,11 @@ static int g_work_group_size = 0;
|
|
3202
3276
|
#define GGML_SYCL_MMV_Y 1
|
3203
3277
|
#endif
|
3204
3278
|
|
3279
|
+
// GPU selection mode of the SYCL backend.
enum ggml_sycl_backend_gpu_mode {
    SYCL_UNSET_GPU_MODE  = -1,  // no mode has been selected yet
    SYCL_SINGLE_GPU_MODE =  0,  // one explicitly chosen GPU
    SYCL_MUL_GPU_MODE    =  1   // multiple GPUs
};
|
3205
3284
|
|
3206
3285
|
static_assert(sizeof(sycl::half) == sizeof(ggml_fp16_t), "wrong fp16 size");
|
3207
3286
|
|
@@ -3401,12 +3480,31 @@ class sycl_gpu_mgr {
|
|
3401
3480
|
int work_group_size = 0;
|
3402
3481
|
std::string gpus_list = "";
|
3403
3482
|
|
3483
|
+
/*
|
3484
|
+
Use all GPUs with same top max compute units
|
3485
|
+
*/
|
3404
3486
|
sycl_gpu_mgr() {
|
3405
3487
|
detect_sycl_gpu_list_with_max_cu();
|
3406
3488
|
get_allow_gpus();
|
3407
3489
|
create_context_with_gpus();
|
3408
3490
|
}
|
3409
3491
|
|
3492
|
+
/*
|
3493
|
+
Only use the assigned GPU
|
3494
|
+
*/
|
3495
|
+
sycl_gpu_mgr(int main_gpu_id) {
|
3496
|
+
sycl::device device = dpct::dev_mgr::instance().get_device(main_gpu_id);
|
3497
|
+
dpct::device_info prop;
|
3498
|
+
dpct::get_device_info(prop, device);
|
3499
|
+
gpus.push_back(main_gpu_id);
|
3500
|
+
devices.push_back(device);
|
3501
|
+
work_group_size = prop.get_max_work_group_size();
|
3502
|
+
max_compute_units = prop.get_max_compute_units();
|
3503
|
+
|
3504
|
+
get_allow_gpus();
|
3505
|
+
create_context_with_gpus();
|
3506
|
+
}
|
3507
|
+
|
3410
3508
|
void create_context_with_gpus() {
|
3411
3509
|
sycl::context ctx = sycl::context(devices);
|
3412
3510
|
assert(gpus.size() > 0);
|
@@ -3422,7 +3520,7 @@ class sycl_gpu_mgr {
|
|
3422
3520
|
gpus_list += std::to_string(gpus[i]);
|
3423
3521
|
gpus_list += ",";
|
3424
3522
|
}
|
3425
|
-
if (gpus_list.length() >
|
3523
|
+
if (gpus_list.length() > 1) {
|
3426
3524
|
gpus_list.pop_back();
|
3427
3525
|
}
|
3428
3526
|
}
|
@@ -3471,8 +3569,8 @@ class sycl_gpu_mgr {
|
|
3471
3569
|
if (gpus[i] == id)
|
3472
3570
|
return i;
|
3473
3571
|
}
|
3474
|
-
|
3475
|
-
|
3572
|
+
printf("miss to get device index by id=%d\n", id);
|
3573
|
+
GGML_ASSERT(false);
|
3476
3574
|
}
|
3477
3575
|
|
3478
3576
|
int get_next_index(int id) {
|
@@ -3481,8 +3579,7 @@ class sycl_gpu_mgr {
|
|
3481
3579
|
if (gpus[i] == id)
|
3482
3580
|
return i;
|
3483
3581
|
}
|
3484
|
-
|
3485
|
-
return -1;
|
3582
|
+
GGML_ASSERT(false);
|
3486
3583
|
}
|
3487
3584
|
|
3488
3585
|
bool is_ext_oneapi_device(const sycl::device &dev) {
|
@@ -3500,11 +3597,14 @@ static int g_device_count = -1;
|
|
3500
3597
|
static int g_all_sycl_device_count = -1;
|
3501
3598
|
static int g_main_device = -1;
|
3502
3599
|
static int g_main_device_id = -1;
|
3600
|
+
static bool g_ggml_backend_sycl_buffer_type_initialized = false;
|
3503
3601
|
|
3504
3602
|
static std::array<float, GGML_SYCL_MAX_DEVICES> g_default_tensor_split = {};
|
3505
3603
|
|
3506
3604
|
static float g_tensor_split[GGML_SYCL_MAX_DEVICES] = {0};
|
3507
3605
|
|
3606
|
+
static ggml_sycl_backend_gpu_mode g_ggml_sycl_backend_gpu_mode = SYCL_UNSET_GPU_MODE;
|
3607
|
+
|
3508
3608
|
struct sycl_device_capabilities {
|
3509
3609
|
int cc; // compute capability
|
3510
3610
|
bool vmm; // virtual memory support
|
@@ -13008,17 +13108,20 @@ bool ggml_sycl_loaded(void) {
|
|
13008
13108
|
return g_sycl_loaded;
|
13009
13109
|
}
|
13010
13110
|
|
13011
|
-
void print_device_detail(int id) {
|
13111
|
+
void print_device_detail(int id, sycl::device &device, std::string device_type) {
|
13112
|
+
|
13012
13113
|
dpct::device_info prop;
|
13013
13114
|
SYCL_CHECK(CHECK_TRY_ERROR(
|
13014
|
-
dpct::get_device_info(prop,
|
13015
|
-
|
13115
|
+
dpct::get_device_info(prop, device)));
|
13116
|
+
|
13016
13117
|
std::string version;
|
13017
13118
|
version += std::to_string(prop.get_major_version());
|
13018
13119
|
version += ".";
|
13019
13120
|
version += std::to_string(prop.get_minor_version());
|
13020
13121
|
|
13021
|
-
|
13122
|
+
device_type = std::regex_replace(device_type, std::regex("ext_oneapi_"), "");
|
13123
|
+
|
13124
|
+
fprintf(stderr, "|%2d|%18s|%45s|%10s|%11d|%8d|%7d|%15lu|\n", id, device_type.c_str(),
|
13022
13125
|
prop.get_name(), version.c_str(), prop.get_max_compute_units(),
|
13023
13126
|
prop.get_max_work_group_size(), prop.get_max_sub_group_size(),
|
13024
13127
|
prop.get_global_mem_size());
|
@@ -13026,19 +13129,35 @@ void print_device_detail(int id) {
|
|
13026
13129
|
|
13027
13130
|
void ggml_backend_sycl_print_sycl_devices() {
|
13028
13131
|
int device_count = dpct::dev_mgr::instance().device_count();
|
13132
|
+
std::map<std::string, size_t> DeviceNums;
|
13029
13133
|
fprintf(stderr, "found %d SYCL devices:\n", device_count);
|
13030
|
-
fprintf(stderr, "|
|
13031
|
-
fprintf(stderr, "
|
13134
|
+
fprintf(stderr, "| | | |Compute |Max compute|Max work|Max sub| |\n");
|
13135
|
+
fprintf(stderr, "|ID| Device Type| Name|capability|units |group |group |Global mem size|\n");
|
13136
|
+
fprintf(stderr, "|--|------------------|---------------------------------------------|----------|-----------|--------|-------|---------------|\n");
|
13032
13137
|
for (int id = 0; id < device_count; ++id) {
|
13033
|
-
|
13138
|
+
sycl::device device = dpct::dev_mgr::instance().get_device(id);
|
13139
|
+
sycl::backend backend = device.get_backend();
|
13140
|
+
std::string backend_type = get_device_backend_and_type(device);
|
13141
|
+
int type_id=DeviceNums[backend_type]++;
|
13142
|
+
std::stringstream device_type;
|
13143
|
+
device_type << "[" << backend_type << ":" << std::to_string(type_id) << "]";
|
13144
|
+
print_device_detail(id, device, device_type.str());
|
13034
13145
|
}
|
13035
13146
|
}
|
13036
13147
|
|
13037
13148
|
void print_gpu_device_list() {
|
13038
|
-
|
13039
|
-
|
13040
|
-
|
13041
|
-
|
13149
|
+
GGML_ASSERT(g_sycl_gpu_mgr);
|
13150
|
+
|
13151
|
+
char* hint=NULL;
|
13152
|
+
if (g_ggml_sycl_backend_gpu_mode == SYCL_SINGLE_GPU_MODE) {
|
13153
|
+
hint = "use %d SYCL GPUs: [%s] with Max compute units:%d\n";
|
13154
|
+
} else {
|
13155
|
+
hint = "detect %d SYCL GPUs: [%s] with top Max compute units:%d\n";
|
13156
|
+
}
|
13157
|
+
fprintf(stderr, hint,
|
13158
|
+
g_sycl_gpu_mgr->get_gpu_count(),
|
13159
|
+
g_sycl_gpu_mgr->gpus_list.c_str(),
|
13160
|
+
g_sycl_gpu_mgr->max_compute_units);
|
13042
13161
|
}
|
13043
13162
|
|
13044
13163
|
int get_sycl_env(const char *env_name, int default_val) {
|
@@ -13074,6 +13193,15 @@ void ggml_init_sycl() try {
|
|
13074
13193
|
#else
|
13075
13194
|
fprintf(stderr, "%s: GGML_SYCL_F16: no\n", __func__);
|
13076
13195
|
#endif
|
13196
|
+
|
13197
|
+
/* DO NOT REMOVE: kept for a future XMX optimization.
|
13198
|
+
#if defined(SYCL_USE_XMX)
|
13199
|
+
fprintf(stderr, "%s: SYCL_USE_XMX: yes\n", __func__);
|
13200
|
+
#else
|
13201
|
+
fprintf(stderr, "%s: SYCL_USE_XMX: no\n", __func__);
|
13202
|
+
#endif
|
13203
|
+
*/
|
13204
|
+
|
13077
13205
|
if (CHECK_TRY_ERROR(g_all_sycl_device_count =
|
13078
13206
|
dpct::dev_mgr::instance().device_count()) != 0) {
|
13079
13207
|
initialized = true;
|
@@ -13082,68 +13210,65 @@ void ggml_init_sycl() try {
|
|
13082
13210
|
}
|
13083
13211
|
GGML_ASSERT(g_all_sycl_device_count <= GGML_SYCL_MAX_DEVICES);
|
13084
13212
|
ggml_backend_sycl_print_sycl_devices();
|
13213
|
+
initialized = true;
|
13214
|
+
g_sycl_loaded = true;
|
13215
|
+
}
|
13216
|
+
}
|
13217
|
+
catch (sycl::exception const &exc) {
|
13218
|
+
std::cerr << exc.what() << "Exception caught at file:" << __FILE__
|
13219
|
+
<< ", line:" << __LINE__ << std::endl;
|
13220
|
+
std::exit(1);
|
13221
|
+
}
|
13085
13222
|
|
13086
|
-
|
13087
|
-
|
13088
|
-
|
13089
|
-
g_work_group_size = g_sycl_gpu_mgr->work_group_size;
|
13090
|
-
|
13091
|
-
print_gpu_device_list();
|
13223
|
+
void ggml_init_by_gpus(int device_count) try {
|
13224
|
+
g_device_count = device_count;
|
13225
|
+
g_work_group_size = g_sycl_gpu_mgr->work_group_size;
|
13092
13226
|
|
13093
|
-
|
13227
|
+
int64_t total_vram = 0;
|
13094
13228
|
|
13095
|
-
|
13096
|
-
#if defined(SYCL_USE_XMX)
|
13097
|
-
fprintf(stderr, "%s: SYCL_USE_XMX: yes\n", __func__);
|
13098
|
-
#else
|
13099
|
-
fprintf(stderr, "%s: SYCL_USE_XMX: no\n", __func__);
|
13100
|
-
#endif
|
13101
|
-
*/
|
13102
|
-
for (int id = 0; id < GGML_SYCL_MAX_DEVICES; ++id) {
|
13103
|
-
g_device_caps[id].vmm = 0;
|
13104
|
-
g_device_caps[id].device_id = -1;
|
13105
|
-
g_device_caps[id].cc = 0;
|
13106
|
-
g_tensor_split[id] = 0;
|
13107
|
-
g_default_tensor_split[id] = 0;
|
13108
|
-
}
|
13229
|
+
print_gpu_device_list();
|
13109
13230
|
|
13110
|
-
|
13111
|
-
|
13112
|
-
|
13231
|
+
for (int id = 0; id < GGML_SYCL_MAX_DEVICES; ++id) {
|
13232
|
+
g_device_caps[id].vmm = 0;
|
13233
|
+
g_device_caps[id].device_id = -1;
|
13234
|
+
g_device_caps[id].cc = 0;
|
13235
|
+
g_tensor_split[id] = 0;
|
13236
|
+
g_default_tensor_split[id] = 0;
|
13237
|
+
}
|
13113
13238
|
|
13114
|
-
|
13115
|
-
|
13116
|
-
|
13239
|
+
for (int i = 0; i < g_device_count; ++i) {
|
13240
|
+
int device_id = g_sycl_gpu_mgr->gpus[i];
|
13241
|
+
g_device_caps[i].vmm = 0;
|
13117
13242
|
|
13118
|
-
|
13119
|
-
|
13243
|
+
dpct::device_info prop;
|
13244
|
+
SYCL_CHECK(CHECK_TRY_ERROR(dpct::get_device_info(
|
13245
|
+
prop, dpct::dev_mgr::instance().get_device(device_id))));
|
13120
13246
|
|
13121
|
-
|
13122
|
-
|
13123
|
-
}
|
13247
|
+
g_default_tensor_split[i] = total_vram;
|
13248
|
+
total_vram += prop.get_global_mem_size();
|
13124
13249
|
|
13125
|
-
|
13126
|
-
|
13127
|
-
|
13250
|
+
g_device_caps[i].cc =
|
13251
|
+
100 * prop.get_major_version() + 10 * prop.get_minor_version();
|
13252
|
+
}
|
13128
13253
|
|
13129
|
-
|
13130
|
-
|
13254
|
+
for (int i = 0; i < g_device_count; ++i) {
|
13255
|
+
g_default_tensor_split[i] /= total_vram;
|
13256
|
+
}
|
13131
13257
|
|
13132
|
-
|
13133
|
-
|
13134
|
-
SYCL_CHECK(CHECK_TRY_ERROR(
|
13135
|
-
g_syclStreams[i][is] =
|
13136
|
-
dpct::get_current_device().create_queue(
|
13137
|
-
g_sycl_gpu_mgr->get_co_ctx(), dpct::get_current_device())));
|
13138
|
-
}
|
13258
|
+
for (int i = 0; i < g_device_count; ++i) {
|
13259
|
+
SYCL_CHECK(ggml_sycl_set_device(i));
|
13139
13260
|
|
13140
|
-
|
13141
|
-
|
13142
|
-
SYCL_CHECK(CHECK_TRY_ERROR(
|
13261
|
+
// create sycl streams
|
13262
|
+
for (int is = 0; is < MAX_STREAMS; ++is) {
|
13263
|
+
SYCL_CHECK(CHECK_TRY_ERROR(
|
13264
|
+
g_syclStreams[i][is] =
|
13265
|
+
dpct::get_current_device().create_queue(
|
13266
|
+
g_sycl_gpu_mgr->get_co_ctx(), dpct::get_current_device())));
|
13143
13267
|
}
|
13144
13268
|
|
13145
|
-
|
13146
|
-
|
13269
|
+
const dpct::queue_ptr stream = g_syclStreams[i][0];
|
13270
|
+
// create sycl handle
|
13271
|
+
SYCL_CHECK(CHECK_TRY_ERROR(g_sycl_handles[i] = stream));
|
13147
13272
|
}
|
13148
13273
|
}
|
13149
13274
|
catch (sycl::exception const &exc) {
|
@@ -16551,22 +16676,24 @@ static ggml_backend_buffer_type_i ggml_backend_sycl_buffer_type_interface = {
|
|
16551
16676
|
/* .is_host = */ nullptr,
|
16552
16677
|
};
|
16553
16678
|
|
16554
|
-
ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int
|
16679
|
+
// Returns the buffer type for the GPU at position `device_index` in the
// active GPU list (valid range: 0 .. g_device_count-1).
//
// The table of buffer types is built lazily whenever
// g_ggml_backend_sycl_buffer_type_initialized is false, one entry per active GPU.
// An out-of-range index (negative or >= g_device_count) is reported and asserts.
ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device_index) {
    if (device_index < 0 || device_index >= g_device_count) {
        printf("ggml_backend_sycl_buffer_type error: device_index:%d is out of range [0, %d], miss to call ggml_backend_sycl_set_single_device()\n",
            device_index, g_device_count-1);
        // Assert on both bounds: checking only the upper bound lets a negative
        // index through and indexes the static array out of bounds below.
        GGML_ASSERT(device_index >= 0 && device_index < g_device_count);
    }
    static struct ggml_backend_buffer_type ggml_backend_sycl_buffer_types[GGML_SYCL_MAX_DEVICES];

    if (!g_ggml_backend_sycl_buffer_type_initialized) {
        // NOTE(review): if the flag is cleared after a previous initialization,
        // the contexts allocated earlier are overwritten here without being
        // freed - confirm whether that leak is acceptable.
        for (int i = 0; i < g_device_count; i++) {
            ggml_backend_sycl_buffer_types[i] = {
                /* .iface    = */ ggml_backend_sycl_buffer_type_interface,
                /* .context  = */ new ggml_backend_sycl_buffer_type_context{i, GGML_SYCL_NAME + std::to_string(g_sycl_gpu_mgr->gpus[i])},
            };
        }
        g_ggml_backend_sycl_buffer_type_initialized = true;
    }
    return &ggml_backend_sycl_buffer_types[device_index];
}
|
16571
16698
|
|
16572
16699
|
// sycl split buffer type
|
@@ -17265,6 +17392,7 @@ static ggml_backend_i ggml_backend_sycl_interface = {
|
|
17265
17392
|
/* .graph_plan_compute = */ NULL,
|
17266
17393
|
/* .graph_compute = */ ggml_backend_sycl_graph_compute,
|
17267
17394
|
/* .supports_op = */ ggml_backend_sycl_supports_op,
|
17395
|
+
/* .offload_op = */ NULL,
|
17268
17396
|
/* .event_new = */ NULL,
|
17269
17397
|
/* .event_free = */ NULL,
|
17270
17398
|
/* .event_record = */ NULL,
|
@@ -17319,11 +17447,42 @@ GGML_API GGML_CALL int ggml_backend_sycl_get_device_index(int device_id) {
|
|
17319
17447
|
return g_sycl_gpu_mgr->get_index(device_id);
|
17320
17448
|
}
|
17321
17449
|
|
17450
|
+
// Translates a position in the active GPU list (`device_index`) into the
// SYCL device id stored at that position in g_sycl_gpu_mgr->gpus.
// Asserts that the GPU manager exists and that the index is in range,
// instead of reading past the list.
GGML_API GGML_CALL int ggml_backend_sycl_get_device_id(int device_index) {
    GGML_ASSERT(g_sycl_gpu_mgr);
    GGML_ASSERT(device_index >= 0 &&
                device_index < static_cast<int>(g_sycl_gpu_mgr->gpus.size()));
    return g_sycl_gpu_mgr->gpus[device_index];
}
|
17453
|
+
|
17454
|
+
// Switches the SYCL backend to single-GPU mode using device id `main_gpu_id`.
// Replaces the global GPU manager with one holding only that device,
// re-runs ggml_init_by_gpus(), and clears the buffer-type "initialized" flag
// so the buffer type table is rebuilt on next use.
// `main_gpu_id` must be in [0, g_all_sycl_device_count).
GGML_API GGML_CALL void ggml_backend_sycl_set_single_device_mode(int main_gpu_id) {
    // Reject negative ids as well as ids past the detected device count.
    GGML_ASSERT(main_gpu_id >= 0 && main_gpu_id < g_all_sycl_device_count);
    fprintf(stderr, "ggml_backend_sycl_set_single_device: use single device: [%d]\n", main_gpu_id);
    if (g_sycl_gpu_mgr) {
        delete g_sycl_gpu_mgr;
    }
    g_sycl_gpu_mgr = new sycl_gpu_mgr(main_gpu_id);
    g_ggml_sycl_backend_gpu_mode = SYCL_SINGLE_GPU_MODE;
    ggml_init_by_gpus(g_sycl_gpu_mgr->get_gpu_count());
    g_ggml_backend_sycl_buffer_type_initialized = false;
}
|
17465
|
+
|
17466
|
+
// Switches the SYCL backend to multi-GPU mode. Does nothing when that mode
// is already active; otherwise rebuilds the global GPU manager with its
// default constructor, re-runs ggml_init_by_gpus(), and clears the
// buffer-type "initialized" flag.
GGML_API GGML_CALL void ggml_backend_sycl_set_mul_device_mode() {
    const bool already_multi = (g_ggml_sycl_backend_gpu_mode == SYCL_MUL_GPU_MODE);
    if (!already_multi) {
        fprintf(stderr, "ggml_backend_sycl_set_mul_device_mode: true\n");

        // Deleting a null pointer is a no-op, so no explicit null check is needed.
        delete g_sycl_gpu_mgr;
        g_sycl_gpu_mgr = new sycl_gpu_mgr();

        g_ggml_sycl_backend_gpu_mode = SYCL_MUL_GPU_MODE;
        ggml_init_by_gpus(g_sycl_gpu_mgr->get_gpu_count());
        g_ggml_backend_sycl_buffer_type_initialized = false;
    }
}
|
17481
|
+
|
17322
17482
|
extern "C" int ggml_backend_sycl_reg_devices();
|
17323
17483
|
|
17324
17484
|
int ggml_backend_sycl_reg_devices() {
|
17325
|
-
|
17326
|
-
g_device_count = g_sycl_gpu_mgr->get_gpu_count();
|
17485
|
+
ggml_backend_sycl_set_mul_device_mode();
|
17327
17486
|
assert(g_device_count>0);
|
17328
17487
|
for (int i = 0; i < g_device_count; i++) {
|
17329
17488
|
int id = g_sycl_gpu_mgr->gpus[i];
|
@@ -13,7 +13,7 @@
|
|
13
13
|
extern "C" {
|
14
14
|
#endif
|
15
15
|
|
16
|
-
#define GGML_SYCL_MAX_DEVICES
|
16
|
+
#define GGML_SYCL_MAX_DEVICES 48
|
17
17
|
#define GGML_SYCL_NAME "SYCL"
|
18
18
|
|
19
19
|
GGML_API void ggml_init_sycl(void);
|
@@ -29,6 +29,11 @@ GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_sycl_split_buffer_typ
|
|
29
29
|
GGML_API GGML_CALL void ggml_backend_sycl_get_device_memory(int device, size_t *free, size_t *total);
|
30
30
|
GGML_API GGML_CALL int ggml_backend_sycl_get_device_index(int device_id);
|
31
31
|
|
32
|
+
// TODO: these are temporary
|
33
|
+
// ref: https://github.com/ggerganov/llama.cpp/pull/6022#issuecomment-1992615670
|
34
|
+
GGML_API GGML_CALL int ggml_backend_sycl_get_device_id(int device_index);
|
35
|
+
GGML_API GGML_CALL void ggml_backend_sycl_set_single_device_mode(int main_gpu_id);
|
36
|
+
// Takes no arguments. Declared with (void) because this header sits inside an
// extern "C" block, and in C an empty parameter list is not a prototype.
GGML_API GGML_CALL void ggml_backend_sycl_set_mul_device_mode(void);
|
32
37
|
#ifdef __cplusplus
|
33
38
|
}
|
34
39
|
#endif
|
@@ -710,6 +710,12 @@ static uint32_t ggml_vk_find_queue_family_index(std::vector<vk::QueueFamilyPrope
|
|
710
710
|
}
|
711
711
|
}
|
712
712
|
|
713
|
+
// All commands that are allowed on a queue that supports transfer operations are also allowed on a queue that supports either graphics or compute operations.
|
714
|
+
// Thus, if the capabilities of a queue family include VK_QUEUE_GRAPHICS_BIT or VK_QUEUE_COMPUTE_BIT, then reporting the VK_QUEUE_TRANSFER_BIT capability separately for that queue family is optional.
|
715
|
+
if (compute_index >= 0) {
|
716
|
+
return compute_index;
|
717
|
+
}
|
718
|
+
|
713
719
|
std::cerr << "ggml_vulkan: No suitable queue family index found." << std::endl;
|
714
720
|
|
715
721
|
for(auto &q_family : queue_family_props) {
|
@@ -5693,6 +5699,7 @@ static ggml_backend_i ggml_backend_vk_interface = {
|
|
5693
5699
|
/* .graph_plan_compute = */ NULL,
|
5694
5700
|
/* .graph_compute = */ ggml_backend_vk_graph_compute,
|
5695
5701
|
/* .supports_op = */ ggml_backend_vk_supports_op,
|
5702
|
+
/* .offload_op = */ NULL,
|
5696
5703
|
/* .event_new = */ NULL,
|
5697
5704
|
/* .event_free = */ NULL,
|
5698
5705
|
/* .event_record = */ NULL,
|