llama_cpp 0.14.2 → 0.14.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -16,6 +16,7 @@
 #include <cinttypes>
 #include <cstddef>
 #include <cstdint>
+#include <cstdlib>
 #include <float.h>
 #include <limits>
 #include <stdint.h>
@@ -24,10 +25,9 @@
 #include <cmath>
 #include <iostream>
 #include <fstream>
-
 #include <stdio.h>
 #include <stdlib.h>
-
+#include <regex>
 
 #include <sycl/sycl.hpp>
 #include <sycl/half_type.hpp>
@@ -82,6 +82,30 @@ Following definition copied from DPCT head files, which are used by ggml-sycl.cp
 #define __dpct_noinline__ __attribute__((noinline))
 #endif
 
+
+std::string get_device_type_name(const sycl::device &Device) {
+    auto DeviceType = Device.get_info<sycl::info::device::device_type>();
+    switch (DeviceType) {
+    case sycl::info::device_type::cpu:
+        return "cpu";
+    case sycl::info::device_type::gpu:
+        return "gpu";
+    case sycl::info::device_type::host:
+        return "host";
+    case sycl::info::device_type::accelerator:
+        return "acc";
+    default:
+        return "unknown";
+    }
+}
+
+std::string get_device_backend_and_type(const sycl::device &device) {
+    std::stringstream device_type;
+    sycl::backend backend = device.get_backend();
+    device_type << backend << ":" << get_device_type_name(device);
+    return device_type.str();
+}
+
 namespace dpct
 {
     typedef sycl::queue *queue_ptr;
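For context: these two helpers build the backend:device-type labels (for example "ext_oneapi_level_zero:gpu" or "opencl:cpu") that later hunks use for sorting and printing. A minimal sketch of the same labelling outside ggml, assuming a DPC++ toolchain (streaming sycl::backend into an ostream, which the helper itself relies on, is DPC++ behavior):

// label_devices.cpp -- illustrative only; build with e.g. `icpx -fsycl label_devices.cpp`
#include <sycl/sycl.hpp>
#include <iostream>

int main() {
    for (const auto &dev : sycl::device::get_devices()) {
        // Same composition as get_device_backend_and_type() above.
        std::cout << dev.get_backend() << ":"
                  << (dev.is_gpu() ? "gpu" : dev.is_cpu() ? "cpu" : "other")
                  << "  " << dev.get_info<sycl::info::device::name>() << "\n";
    }
    return 0;
}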
@@ -942,17 +966,67 @@ namespace dpct
 
     private:
         mutable std::recursive_mutex m_mutex;
+        static bool compare_dev(sycl::device &device1, sycl::device &device2)
+        {
+            dpct::device_info prop1;
+            dpct::get_device_info(prop1, device1);
+            dpct::device_info prop2;
+            dpct::get_device_info(prop2, device2);
+            return prop1.get_max_compute_units() > prop2.get_max_compute_units();
+        }
+        static int convert_backend_index(std::string & backend) {
+            if (backend == "ext_oneapi_level_zero:gpu") return 0;
+            if (backend == "opencl:gpu") return 1;
+            if (backend == "ext_oneapi_cuda:gpu") return 2;
+            if (backend == "ext_oneapi_hip:gpu") return 3;
+            if (backend == "opencl:cpu") return 4;
+            if (backend == "opencl:acc") return 5;
+            printf("convert_backend_index: can't handle backend=%s\n", backend.c_str());
+            GGML_ASSERT(false);
+        }
+        static bool compare_backend(std::string &backend1, std::string &backend2) {
+            return convert_backend_index(backend1) < convert_backend_index(backend2);
+        }
        dev_mgr()
        {
            sycl::device default_device =
                sycl::device(sycl::default_selector_v);
            _devs.push_back(std::make_shared<device_ext>(default_device));
 
-           std::vector<sycl::device> sycl_all_devs =
-               sycl::device::get_devices(sycl::info::device_type::all);
+           std::vector<sycl::device> sycl_all_devs;
            // Collect other devices except for the default device.
            if (default_device.is_cpu())
                _cpu_device = 0;
+
+           auto Platforms = sycl::platform::get_platforms();
+           // Keep track of the number of devices per backend
+           std::map<sycl::backend, size_t> DeviceNums;
+           std::map<std::string, std::vector<sycl::device>> backend_devices;
+
+           while (!Platforms.empty()) {
+               auto Platform = Platforms.back();
+               Platforms.pop_back();
+               auto devices = Platform.get_devices();
+               std::string backend_type = get_device_backend_and_type(devices[0]);
+               for (const auto &device : devices) {
+                   backend_devices[backend_type].push_back(device);
+               }
+           }
+
+           std::vector<std::string> keys;
+           for (auto it = backend_devices.begin(); it != backend_devices.end(); ++it) {
+               keys.push_back(it->first);
+           }
+           std::sort(keys.begin(), keys.end(), compare_backend);
+
+           for (auto &key : keys) {
+               std::vector<sycl::device> devs = backend_devices[key];
+               std::sort(devs.begin(), devs.end(), compare_dev);
+               for (const auto &dev : devs) {
+                   sycl_all_devs.push_back(dev);
+               }
+           }
+
            for (auto &dev : sycl_all_devs)
            {
                if (dev == default_device)
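The net effect is that dev_mgr now enumerates devices in a deterministic order: grouped by backend in the fixed priority of convert_backend_index() (Level Zero GPUs first, OpenCL accelerators last), and within each backend sorted by descending max compute units. Below is a backend-free sketch of the same two-level ordering; it folds the two sorts above into one comparator and uses hypothetical device data, and unlike convert_backend_index() it ranks unknown backends last instead of asserting:

#include <algorithm>
#include <iostream>
#include <map>
#include <string>
#include <vector>

// Same priority table as convert_backend_index() in the diff.
static int backend_rank(const std::string &b) {
    static const std::map<std::string, int> rank = {
        {"ext_oneapi_level_zero:gpu", 0}, {"opencl:gpu", 1},
        {"ext_oneapi_cuda:gpu", 2},       {"ext_oneapi_hip:gpu", 3},
        {"opencl:cpu", 4},                {"opencl:acc", 5},
    };
    auto it = rank.find(b);
    return it == rank.end() ? 99 : it->second; // unknown -> last (the diff asserts instead)
}

struct Dev { std::string backend; int max_cu; };

int main() {
    // Hypothetical devices standing in for what the SYCL runtime reports.
    std::vector<Dev> devs = {{"opencl:cpu", 32},
                             {"ext_oneapi_level_zero:gpu", 96},
                             {"opencl:gpu", 512},
                             {"ext_oneapi_level_zero:gpu", 512}};
    std::sort(devs.begin(), devs.end(), [](const Dev &a, const Dev &b) {
        int ra = backend_rank(a.backend), rb = backend_rank(b.backend);
        if (ra != rb) return ra < rb;                    // backend priority first
        if (a.backend != b.backend) return a.backend < b.backend;
        return a.max_cu > b.max_cu;                      // then descending CUs, like compare_dev()
    });
    for (const auto &d : devs)
        std::cout << d.backend << " cu=" << d.max_cu << "\n";
    // Prints the Level Zero GPUs (512 then 96), then opencl:gpu, then opencl:cpu.
    return 0;
}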
@@ -3202,6 +3276,11 @@ static int g_work_group_size = 0;
 #define GGML_SYCL_MMV_Y 1
 #endif
 
+enum ggml_sycl_backend_gpu_mode {
+    SYCL_UNSET_GPU_MODE = -1,
+    SYCL_SINGLE_GPU_MODE = 0,
+    SYCL_MUL_GPU_MODE
+};
 
 static_assert(sizeof(sycl::half) == sizeof(ggml_fp16_t), "wrong fp16 size");
 
@@ -3401,12 +3480,31 @@ class sycl_gpu_mgr {
     int work_group_size = 0;
     std::string gpus_list = "";
 
+    /*
+    Use all GPUs with same top max compute units
+    */
     sycl_gpu_mgr() {
         detect_sycl_gpu_list_with_max_cu();
         get_allow_gpus();
         create_context_with_gpus();
     }
 
+    /*
+    Only use the assigned GPU
+    */
+    sycl_gpu_mgr(int main_gpu_id) {
+        sycl::device device = dpct::dev_mgr::instance().get_device(main_gpu_id);
+        dpct::device_info prop;
+        dpct::get_device_info(prop, device);
+        gpus.push_back(main_gpu_id);
+        devices.push_back(device);
+        work_group_size = prop.get_max_work_group_size();
+        max_compute_units = prop.get_max_compute_units();
+
+        get_allow_gpus();
+        create_context_with_gpus();
+    }
+
     void create_context_with_gpus() {
         sycl::context ctx = sycl::context(devices);
         assert(gpus.size() > 0);
@@ -3422,7 +3520,7 @@ class sycl_gpu_mgr {
             gpus_list += std::to_string(gpus[i]);
             gpus_list += ",";
         }
-        if (gpus_list.length() > 2) {
+        if (gpus_list.length() > 1) {
            gpus_list.pop_back();
        }
    }
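A note on the guard change above: with a single GPU the joined list is "0," (length 2), so the old length() > 2 test skipped the pop_back and left a trailing comma; length() > 1 strips the comma whenever at least one id was appended. A two-line illustration:

std::string gpus_list = "0,";                      // single-GPU case
if (gpus_list.length() > 1) gpus_list.pop_back();  // now yields "0", not "0,"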
@@ -3471,8 +3569,8 @@ class sycl_gpu_mgr {
             if (gpus[i] == id)
                 return i;
         }
-        assert(false);
-        return -1;
+        printf("miss to get device index by id=%d\n", id);
+        GGML_ASSERT(false);
     }
 
     int get_next_index(int id) {
@@ -3481,8 +3579,7 @@ class sycl_gpu_mgr {
             if (gpus[i] == id)
                 return i;
         }
-        assert(false);
-        return -1;
+        GGML_ASSERT(false);
     }
 
     bool is_ext_oneapi_device(const sycl::device &dev) {
@@ -3500,11 +3597,14 @@ static int g_device_count = -1;
 static int g_all_sycl_device_count = -1;
 static int g_main_device = -1;
 static int g_main_device_id = -1;
+static bool g_ggml_backend_sycl_buffer_type_initialized = false;
 
 static std::array<float, GGML_SYCL_MAX_DEVICES> g_default_tensor_split = {};
 
 static float g_tensor_split[GGML_SYCL_MAX_DEVICES] = {0};
 
+static ggml_sycl_backend_gpu_mode g_ggml_sycl_backend_gpu_mode = SYCL_UNSET_GPU_MODE;
+
 struct sycl_device_capabilities {
     int cc;   // compute capability
     bool vmm; // virtual memory support
@@ -13008,17 +13108,20 @@ bool ggml_sycl_loaded(void) {
     return g_sycl_loaded;
 }
 
-void print_device_detail(int id) {
+void print_device_detail(int id, sycl::device &device, std::string device_type) {
+
     dpct::device_info prop;
     SYCL_CHECK(CHECK_TRY_ERROR(
-        dpct::get_device_info(prop, dpct::dev_mgr::instance().get_device(id))));
-    sycl::device cur_device = dpct::dev_mgr::instance().get_device(id);
+        dpct::get_device_info(prop, device)));
+
     std::string version;
     version += std::to_string(prop.get_major_version());
     version += ".";
     version += std::to_string(prop.get_minor_version());
 
-    fprintf(stderr, "|%2d|%45s|%18s|%17d|%14d|%13d|%15lu|\n", id,
+    device_type = std::regex_replace(device_type, std::regex("ext_oneapi_"), "");
+
+    fprintf(stderr, "|%2d|%18s|%45s|%10s|%11d|%8d|%7d|%15lu|\n", id, device_type.c_str(),
            prop.get_name(), version.c_str(), prop.get_max_compute_units(),
            prop.get_max_work_group_size(), prop.get_max_sub_group_size(),
            prop.get_global_mem_size());
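The reworked printer receives the device plus a precomputed "[backend:type:ordinal]" label and strips the verbose "ext_oneapi_" prefix, so a Level Zero GPU shows as [level_zero:gpu:0] in the table. A minimal sketch of just that regex step, with a hypothetical label value:

#include <iostream>
#include <regex>   // matches the new <regex> include at the top of the file
#include <string>

int main() {
    std::string device_type = "[ext_oneapi_level_zero:gpu:0]";
    device_type = std::regex_replace(device_type, std::regex("ext_oneapi_"), "");
    std::cout << device_type << "\n";  // prints [level_zero:gpu:0]
    return 0;
}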
@@ -13026,19 +13129,35 @@ void print_device_detail(int id) {
 
 void ggml_backend_sycl_print_sycl_devices() {
     int device_count = dpct::dev_mgr::instance().device_count();
+    std::map<std::string, size_t> DeviceNums;
     fprintf(stderr, "found %d SYCL devices:\n", device_count);
-    fprintf(stderr, "|ID| Name |compute capability|Max compute units|Max work group|Max sub group|Global mem size|\n");
-    fprintf(stderr, "|--|---------------------------------------------|------------------|-----------------|--------------|-------------|---------------|\n");
+    fprintf(stderr, "|  |                  |                                             |Compute   |Max compute|Max work|Max sub|               |\n");
+    fprintf(stderr, "|ID|       Device Type|                                         Name|capability|units      |group   |group  |Global mem size|\n");
+    fprintf(stderr, "|--|------------------|---------------------------------------------|----------|-----------|--------|-------|---------------|\n");
     for (int id = 0; id < device_count; ++id) {
-        print_device_detail(id);
+        sycl::device device = dpct::dev_mgr::instance().get_device(id);
+        sycl::backend backend = device.get_backend();
+        std::string backend_type = get_device_backend_and_type(device);
+        int type_id = DeviceNums[backend_type]++;
+        std::stringstream device_type;
+        device_type << "[" << backend_type << ":" << std::to_string(type_id) << "]";
+        print_device_detail(id, device, device_type.str());
     }
 }
 
 void print_gpu_device_list() {
-    fprintf(stderr, "detect %d SYCL GPUs: [%s] with Max compute units:%d\n",
-            g_sycl_gpu_mgr->get_gpu_count(),
-            g_sycl_gpu_mgr->gpus_list.c_str(),
-            g_sycl_gpu_mgr->max_compute_units);
+    GGML_ASSERT(g_sycl_gpu_mgr);
+
+    char* hint = NULL;
+    if (g_ggml_sycl_backend_gpu_mode == SYCL_SINGLE_GPU_MODE) {
+        hint = "use %d SYCL GPUs: [%s] with Max compute units:%d\n";
+    } else {
+        hint = "detect %d SYCL GPUs: [%s] with top Max compute units:%d\n";
+    }
+    fprintf(stderr, hint,
+            g_sycl_gpu_mgr->get_gpu_count(),
+            g_sycl_gpu_mgr->gpus_list.c_str(),
+            g_sycl_gpu_mgr->max_compute_units);
 }
 
 int get_sycl_env(const char *env_name, int default_val) {
@@ -13074,6 +13193,15 @@ void ggml_init_sycl() try {
 #else
         fprintf(stderr, "%s: GGML_SYCL_F16: no\n", __func__);
 #endif
+
+        /* NOT REMOVE, keep it for next optimize for XMX.
+        #if defined(SYCL_USE_XMX)
+        fprintf(stderr, "%s: SYCL_USE_XMX: yes\n", __func__);
+        #else
+        fprintf(stderr, "%s: SYCL_USE_XMX: no\n", __func__);
+        #endif
+        */
+
         if (CHECK_TRY_ERROR(g_all_sycl_device_count =
                                 dpct::dev_mgr::instance().device_count()) != 0) {
             initialized = true;
@@ -13082,68 +13210,65 @@ void ggml_init_sycl() try {
         }
         GGML_ASSERT(g_all_sycl_device_count <= GGML_SYCL_MAX_DEVICES);
         ggml_backend_sycl_print_sycl_devices();
+        initialized = true;
+        g_sycl_loaded = true;
+    }
+}
+catch (sycl::exception const &exc) {
+    std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+              << ", line:" << __LINE__ << std::endl;
+    std::exit(1);
+}
 
-        if (!g_sycl_gpu_mgr) g_sycl_gpu_mgr = new sycl_gpu_mgr();
-
-        g_device_count = g_sycl_gpu_mgr->get_gpu_count();
-        g_work_group_size = g_sycl_gpu_mgr->work_group_size;
-
-        print_gpu_device_list();
+void ggml_init_by_gpus(int device_count) try {
+    g_device_count = device_count;
+    g_work_group_size = g_sycl_gpu_mgr->work_group_size;
 
-        int64_t total_vram = 0;
+    int64_t total_vram = 0;
 
-        /* NOT REMOVE, keep it for next optimize for XMX.
-        #if defined(SYCL_USE_XMX)
-        fprintf(stderr, "%s: SYCL_USE_XMX: yes\n", __func__);
-        #else
-        fprintf(stderr, "%s: SYCL_USE_XMX: no\n", __func__);
-        #endif
-        */
-        for (int id = 0; id < GGML_SYCL_MAX_DEVICES; ++id) {
-            g_device_caps[id].vmm = 0;
-            g_device_caps[id].device_id = -1;
-            g_device_caps[id].cc = 0;
-            g_tensor_split[id] = 0;
-            g_default_tensor_split[id] = 0;
-        }
+    print_gpu_device_list();
 
-        for (int i = 0; i < g_device_count; ++i) {
-            int device_id = g_sycl_gpu_mgr->gpus[i];
-            g_device_caps[i].vmm = 0;
+    for (int id = 0; id < GGML_SYCL_MAX_DEVICES; ++id) {
+        g_device_caps[id].vmm = 0;
+        g_device_caps[id].device_id = -1;
+        g_device_caps[id].cc = 0;
+        g_tensor_split[id] = 0;
+        g_default_tensor_split[id] = 0;
+    }
 
-            dpct::device_info prop;
-            SYCL_CHECK(CHECK_TRY_ERROR(dpct::get_device_info(
-                prop, dpct::dev_mgr::instance().get_device(device_id))));
+    for (int i = 0; i < g_device_count; ++i) {
+        int device_id = g_sycl_gpu_mgr->gpus[i];
+        g_device_caps[i].vmm = 0;
 
-            g_default_tensor_split[i] = total_vram;
-            total_vram += prop.get_global_mem_size();
+        dpct::device_info prop;
+        SYCL_CHECK(CHECK_TRY_ERROR(dpct::get_device_info(
+            prop, dpct::dev_mgr::instance().get_device(device_id))));
 
-            g_device_caps[i].cc =
-                100 * prop.get_major_version() + 10 * prop.get_minor_version();
-        }
+        g_default_tensor_split[i] = total_vram;
+        total_vram += prop.get_global_mem_size();
 
-        for (int i = 0; i < g_device_count; ++i) {
-            g_default_tensor_split[i] /= total_vram;
-        }
+        g_device_caps[i].cc =
+            100 * prop.get_major_version() + 10 * prop.get_minor_version();
+    }
 
-        for (int i = 0; i < g_device_count; ++i) {
-            SYCL_CHECK(ggml_sycl_set_device(i));
+    for (int i = 0; i < g_device_count; ++i) {
+        g_default_tensor_split[i] /= total_vram;
+    }
 
-            // create sycl streams
-            for (int is = 0; is < MAX_STREAMS; ++is) {
-                SYCL_CHECK(CHECK_TRY_ERROR(
-                    g_syclStreams[i][is] =
-                        dpct::get_current_device().create_queue(
-                            g_sycl_gpu_mgr->get_co_ctx(), dpct::get_current_device())));
-            }
+    for (int i = 0; i < g_device_count; ++i) {
+        SYCL_CHECK(ggml_sycl_set_device(i));
 
-            const dpct::queue_ptr stream = g_syclStreams[i][0];
-            // create sycl handle
-            SYCL_CHECK(CHECK_TRY_ERROR(g_sycl_handles[i] = stream));
+        // create sycl streams
+        for (int is = 0; is < MAX_STREAMS; ++is) {
+            SYCL_CHECK(CHECK_TRY_ERROR(
+                g_syclStreams[i][is] =
+                    dpct::get_current_device().create_queue(
+                        g_sycl_gpu_mgr->get_co_ctx(), dpct::get_current_device())));
        }
 
-        initialized = true;
-        g_sycl_loaded = true;
+        const dpct::queue_ptr stream = g_syclStreams[i][0];
+        // create sycl handle
+        SYCL_CHECK(CHECK_TRY_ERROR(g_sycl_handles[i] = stream));
     }
 }
 catch (sycl::exception const &exc) {
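This hunk splits initialization in two: ggml_init_sycl() now only probes and prints the device table, while the per-device setup (device caps, tensor splits, streams, handles) moves into the new ggml_init_by_gpus(), which the mode-setting entry points call after rebuilding g_sycl_gpu_mgr. A hedged sketch of the resulting call order, mirroring what ggml_backend_sycl_reg_devices() does in a later hunk; the include path is an assumption, since the vendored header location is not shown in this diff:

#include "ggml-sycl.h"  // assumed path for the header modified below

int main() {
    ggml_init_sycl();                         // probe and print all SYCL devices
    ggml_backend_sycl_set_mul_device_mode();  // rebuild g_sycl_gpu_mgr, then run
                                              // ggml_init_by_gpus() internally
    // or pin to one device id taken from the printed table:
    // ggml_backend_sycl_set_single_device_mode(0);
    return 0;
}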
@@ -16551,22 +16676,24 @@ static ggml_backend_buffer_type_i ggml_backend_sycl_buffer_type_interface = {
     /* .is_host = */ nullptr,
 };
 
-ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device) {
+ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device_index) {
+    if (device_index >= g_device_count or device_index < 0) {
+        printf("ggml_backend_sycl_buffer_type error: device_index:%d is out of range [0, %d], miss to call ggml_backend_sycl_set_single_device()\n",
+               device_index, g_device_count - 1);
+        GGML_ASSERT(device_index < g_device_count);
+    }
     static struct ggml_backend_buffer_type ggml_backend_sycl_buffer_types[GGML_SYCL_MAX_DEVICES];
 
-    static bool ggml_backend_sycl_buffer_type_initialized = false;
-
-    if (!ggml_backend_sycl_buffer_type_initialized) {
+    if (!g_ggml_backend_sycl_buffer_type_initialized) {
         for (int i = 0; i < g_device_count; i++) {
             ggml_backend_sycl_buffer_types[i] = {
                 /* .iface = */ ggml_backend_sycl_buffer_type_interface,
                 /* .context = */ new ggml_backend_sycl_buffer_type_context{i, GGML_SYCL_NAME + std::to_string(g_sycl_gpu_mgr->gpus[i])},
             };
         }
-        ggml_backend_sycl_buffer_type_initialized = true;
+        g_ggml_backend_sycl_buffer_type_initialized = true;
     }
-
-    return &ggml_backend_sycl_buffer_types[device];
+    return &ggml_backend_sycl_buffer_types[device_index];
 }
 
 // sycl split buffer type
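Note the indexing change: ggml_backend_sycl_buffer_type() now takes a device index (a position in g_sycl_gpu_mgr->gpus) rather than a raw device id, and range-checks it; the init flag also became the global g_ggml_backend_sycl_buffer_type_initialized so the mode switches in a later hunk can reset the cache. A hypothetical helper sketching the id-to-index translation callers are expected to perform (assumes the ggml SYCL headers are included):

// Sketch only: resolve a buffer type from a raw SYCL device id by first
// translating it with the existing index lookup declared in the header.
static ggml_backend_buffer_type_t buffer_type_for_device_id(int device_id) {
    int device_index = ggml_backend_sycl_get_device_index(device_id);  // id -> index
    return ggml_backend_sycl_buffer_type(device_index);
}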
@@ -17265,6 +17392,7 @@ static ggml_backend_i ggml_backend_sycl_interface = {
     /* .graph_plan_compute = */ NULL,
     /* .graph_compute = */ ggml_backend_sycl_graph_compute,
     /* .supports_op = */ ggml_backend_sycl_supports_op,
+    /* .offload_op = */ NULL,
     /* .event_new = */ NULL,
     /* .event_free = */ NULL,
     /* .event_record = */ NULL,
@@ -17319,11 +17447,42 @@ GGML_API GGML_CALL int ggml_backend_sycl_get_device_index(int device_id) {
     return g_sycl_gpu_mgr->get_index(device_id);
 }
 
+GGML_API GGML_CALL int ggml_backend_sycl_get_device_id(int device_index) {
+    return g_sycl_gpu_mgr->gpus[device_index];
+}
+
+GGML_API GGML_CALL void ggml_backend_sycl_set_single_device_mode(int main_gpu_id) {
+    GGML_ASSERT(main_gpu_id < g_all_sycl_device_count);
+    fprintf(stderr, "ggml_backend_sycl_set_single_device: use single device: [%d]\n", main_gpu_id);
+    if (g_sycl_gpu_mgr) {
+        delete g_sycl_gpu_mgr;
+    }
+    g_sycl_gpu_mgr = new sycl_gpu_mgr(main_gpu_id);
+    g_ggml_sycl_backend_gpu_mode = SYCL_SINGLE_GPU_MODE;
+    ggml_init_by_gpus(g_sycl_gpu_mgr->get_gpu_count());
+    g_ggml_backend_sycl_buffer_type_initialized = false;
+}
+
+GGML_API GGML_CALL void ggml_backend_sycl_set_mul_device_mode() {
+    if (g_ggml_sycl_backend_gpu_mode == SYCL_MUL_GPU_MODE) {
+        return;
+    }
+
+    fprintf(stderr, "ggml_backend_sycl_set_mul_device_mode: true\n");
+
+    if (g_sycl_gpu_mgr) {
+        delete g_sycl_gpu_mgr;
+    }
+    g_sycl_gpu_mgr = new sycl_gpu_mgr();
+    g_ggml_sycl_backend_gpu_mode = SYCL_MUL_GPU_MODE;
+    ggml_init_by_gpus(g_sycl_gpu_mgr->get_gpu_count());
+    g_ggml_backend_sycl_buffer_type_initialized = false;
+}
+
 extern "C" int ggml_backend_sycl_reg_devices();
 
 int ggml_backend_sycl_reg_devices() {
-    if (!g_sycl_gpu_mgr) g_sycl_gpu_mgr = new sycl_gpu_mgr();
-    g_device_count = g_sycl_gpu_mgr->get_gpu_count();
+    ggml_backend_sycl_set_mul_device_mode();
     assert(g_device_count>0);
     for (int i = 0; i < g_device_count; i++) {
         int id = g_sycl_gpu_mgr->gpus[i];
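These new entry points let the integration switch between one pinned GPU and all top-compute-unit GPUs at runtime, deleting and rebuilding g_sycl_gpu_mgr each time, and ggml_backend_sycl_reg_devices() now routes through ggml_backend_sycl_set_mul_device_mode() instead of lazily constructing the manager. A hedged sketch of the new accessor pair (gpu_count is a stand-in for the detected GPU count, not an API from this diff; assumes <cassert> and the ggml SYCL header):

// After a mode switch, index -> id (ggml_backend_sycl_get_device_id) and
// id -> index (ggml_backend_sycl_get_device_index) are inverses on valid inputs.
for (int index = 0; index < gpu_count; ++index) {
    int id = ggml_backend_sycl_get_device_id(index);
    assert(ggml_backend_sycl_get_device_index(id) == index);
}

The remaining hunks apply to the public SYCL header, which raises GGML_SYCL_MAX_DEVICES from 16 to 48 and declares these entry points, and to the Vulkan backend.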
@@ -13,7 +13,7 @@
 extern "C" {
 #endif
 
-#define GGML_SYCL_MAX_DEVICES 16
+#define GGML_SYCL_MAX_DEVICES 48
 #define GGML_SYCL_NAME "SYCL"
 
 GGML_API void ggml_init_sycl(void);
@@ -29,6 +29,11 @@ GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_sycl_split_buffer_typ
 GGML_API GGML_CALL void ggml_backend_sycl_get_device_memory(int device, size_t *free, size_t *total);
 GGML_API GGML_CALL int ggml_backend_sycl_get_device_index(int device_id);
 
+// TODO: these are temporary
+// ref: https://github.com/ggerganov/llama.cpp/pull/6022#issuecomment-1992615670
+GGML_API GGML_CALL int ggml_backend_sycl_get_device_id(int device_index);
+GGML_API GGML_CALL void ggml_backend_sycl_set_single_device_mode(int main_gpu_id);
+GGML_API GGML_CALL void ggml_backend_sycl_set_mul_device_mode();
 #ifdef __cplusplus
 }
 #endif
@@ -710,6 +710,12 @@ static uint32_t ggml_vk_find_queue_family_index(std::vector<vk::QueueFamilyPrope
         }
     }
 
+    // All commands that are allowed on a queue that supports transfer operations are also allowed on a queue that supports either graphics or compute operations.
+    // Thus, if the capabilities of a queue family include VK_QUEUE_GRAPHICS_BIT or VK_QUEUE_COMPUTE_BIT, then reporting the VK_QUEUE_TRANSFER_BIT capability separately for that queue family is optional.
+    if (compute_index >= 0) {
+        return compute_index;
+    }
+
     std::cerr << "ggml_vulkan: No suitable queue family index found." << std::endl;
 
     for(auto &q_family : queue_family_props) {
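The Vulkan change makes ggml_vk_find_queue_family_index() fall back to the compute queue family instead of failing when no family advertises VK_QUEUE_TRANSFER_BIT: per the spec text quoted in the comment, graphics and compute queues support transfer implicitly. A hedged sketch of that selection rule in isolation (not the function from the diff, which also weighs other criteria):

#include <vulkan/vulkan.hpp>
#include <vector>

static int find_transfer_family(const std::vector<vk::QueueFamilyProperties> &props) {
    // Prefer a dedicated transfer-only family when one exists.
    for (size_t i = 0; i < props.size(); i++) {
        if ((props[i].queueFlags & vk::QueueFlagBits::eTransfer) &&
            !(props[i].queueFlags & (vk::QueueFlagBits::eCompute | vk::QueueFlagBits::eGraphics))) {
            return (int) i;
        }
    }
    // Otherwise any compute or graphics family is implicitly transfer-capable.
    for (size_t i = 0; i < props.size(); i++) {
        if (props[i].queueFlags & (vk::QueueFlagBits::eCompute | vk::QueueFlagBits::eGraphics)) {
            return (int) i;
        }
    }
    return -1;  // no usable family found
}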
@@ -5693,6 +5699,7 @@ static ggml_backend_i ggml_backend_vk_interface = {
     /* .graph_plan_compute = */ NULL,
     /* .graph_compute = */ ggml_backend_vk_graph_compute,
     /* .supports_op = */ ggml_backend_vk_supports_op,
+    /* .offload_op = */ NULL,
     /* .event_new = */ NULL,
     /* .event_free = */ NULL,
     /* .event_record = */ NULL,