llama_cpp 0.14.2 → 0.14.3

@@ -16,6 +16,7 @@
 #include <cinttypes>
 #include <cstddef>
 #include <cstdint>
+#include <cstdlib>
 #include <float.h>
 #include <limits>
 #include <stdint.h>
@@ -24,10 +25,9 @@
 #include <cmath>
 #include <iostream>
 #include <fstream>
-
 #include <stdio.h>
 #include <stdlib.h>
-
+#include <regex>
 
 #include <sycl/sycl.hpp>
 #include <sycl/half_type.hpp>
@@ -82,6 +82,30 @@ Following definition copied from DPCT head files, which are used by ggml-sycl.cp
 #define __dpct_noinline__ __attribute__((noinline))
 #endif
 
+
+std::string get_device_type_name(const sycl::device &Device) {
+    auto DeviceType = Device.get_info<sycl::info::device::device_type>();
+    switch (DeviceType) {
+    case sycl::info::device_type::cpu:
+        return "cpu";
+    case sycl::info::device_type::gpu:
+        return "gpu";
+    case sycl::info::device_type::host:
+        return "host";
+    case sycl::info::device_type::accelerator:
+        return "acc";
+    default:
+        return "unknown";
+    }
+}
+
+std::string get_device_backend_and_type(const sycl::device &device) {
+    std::stringstream device_type;
+    sycl::backend backend = device.get_backend();
+    device_type << backend << ":" << get_device_type_name(device);
+    return device_type.str();
+}
+
 namespace dpct
 {
 typedef sycl::queue *queue_ptr;
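The two helpers added above give every SYCL device a stable "backend:type" label. A minimal usage sketch, assuming the helpers are in scope and a DPC++ runtime (whose ostream operator for sycl::backend prints names such as "ext_oneapi_level_zero"):

    // list every device's label, e.g. "ext_oneapi_level_zero:gpu" or "opencl:cpu"
    #include <sycl/sycl.hpp>
    #include <iostream>
    int main() {
        for (const auto &dev : sycl::device::get_devices()) {
            std::cout << get_device_backend_and_type(dev) << "\n";
        }
    }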
@@ -942,17 +966,67 @@ namespace dpct
 
     private:
         mutable std::recursive_mutex m_mutex;
+        static bool compare_dev(sycl::device &device1, sycl::device &device2)
+        {
+            dpct::device_info prop1;
+            dpct::get_device_info(prop1, device1);
+            dpct::device_info prop2;
+            dpct::get_device_info(prop2, device2);
+            return prop1.get_max_compute_units() > prop2.get_max_compute_units();
+        }
+        static int convert_backend_index(std::string & backend) {
+            if (backend == "ext_oneapi_level_zero:gpu") return 0;
+            if (backend == "opencl:gpu") return 1;
+            if (backend == "ext_oneapi_cuda:gpu") return 2;
+            if (backend == "ext_oneapi_hip:gpu") return 3;
+            if (backend == "opencl:cpu") return 4;
+            if (backend == "opencl:acc") return 5;
+            printf("convert_backend_index: can't handle backend=%s\n", backend.c_str());
+            GGML_ASSERT(false);
+        }
+        static bool compare_backend(std::string &backend1, std::string &backend2) {
+            return convert_backend_index(backend1) < convert_backend_index(backend2);
+        }
         dev_mgr()
         {
            sycl::device default_device =
                sycl::device(sycl::default_selector_v);
            _devs.push_back(std::make_shared<device_ext>(default_device));
 
-           std::vector<sycl::device> sycl_all_devs =
-               sycl::device::get_devices(sycl::info::device_type::all);
+           std::vector<sycl::device> sycl_all_devs;
            // Collect other devices except for the default device.
            if (default_device.is_cpu())
                _cpu_device = 0;
+
+           auto Platforms = sycl::platform::get_platforms();
+           // Keep track of the number of devices per backend
+           std::map<sycl::backend, size_t> DeviceNums;
+           std::map<std::string, std::vector<sycl::device>> backend_devices;
+
+           while (!Platforms.empty()) {
+               auto Platform = Platforms.back();
+               Platforms.pop_back();
+               auto devices = Platform.get_devices();
+               std::string backend_type = get_device_backend_and_type(devices[0]);
+               for (const auto &device : devices) {
+                   backend_devices[backend_type].push_back(device);
+               }
+           }
+
+           std::vector<std::string> keys;
+           for(auto it = backend_devices.begin(); it != backend_devices.end(); ++it) {
+               keys.push_back(it->first);
+           }
+           std::sort(keys.begin(), keys.end(), compare_backend);
+
+           for (auto &key : keys) {
+               std::vector<sycl::device> devs = backend_devices[key];
+               std::sort(devs.begin(), devs.end(), compare_dev);
+               for (const auto &dev : devs) {
+                   sycl_all_devs.push_back(dev);
+               }
+           }
+
            for (auto &dev : sycl_all_devs)
            {
                if (dev == default_device)
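The constructor now enumerates devices in a deterministic order: group by "backend:type" label, order the groups by convert_backend_index (Level Zero GPUs first, OpenCL accelerators last), and sort within a group by max compute units, descending. A standalone sketch of the group ordering, reusing the same priority table; the input labels here are hypothetical:

    #include <algorithm>
    #include <cstdio>
    #include <map>
    #include <string>
    #include <vector>
    int main() {
        std::map<std::string, int> priority = {
            {"ext_oneapi_level_zero:gpu", 0}, {"opencl:gpu", 1},
            {"ext_oneapi_cuda:gpu", 2},       {"ext_oneapi_hip:gpu", 3},
            {"opencl:cpu", 4},                {"opencl:acc", 5}};
        std::vector<std::string> keys = {"opencl:cpu", "opencl:gpu", "ext_oneapi_level_zero:gpu"};
        std::sort(keys.begin(), keys.end(), [&](const std::string &a, const std::string &b) {
            return priority[a] < priority[b];
        });
        for (const auto &k : keys) std::printf("%s\n", k.c_str());
        // prints: ext_oneapi_level_zero:gpu, opencl:gpu, opencl:cpu
    }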
@@ -3202,6 +3276,11 @@ static int g_work_group_size = 0;
 #define GGML_SYCL_MMV_Y 1
 #endif
 
+enum ggml_sycl_backend_gpu_mode {
+    SYCL_UNSET_GPU_MODE = -1,
+    SYCL_SINGLE_GPU_MODE = 0,
+    SYCL_MUL_GPU_MODE
+};
 
 static_assert(sizeof(sycl::half) == sizeof(ggml_fp16_t), "wrong fp16 size");
 
@@ -3401,12 +3480,31 @@ class sycl_gpu_mgr {
     int work_group_size = 0;
     std::string gpus_list = "";
 
+    /*
+    Use all GPUs with same top max compute units
+    */
     sycl_gpu_mgr() {
         detect_sycl_gpu_list_with_max_cu();
         get_allow_gpus();
         create_context_with_gpus();
     }
 
+    /*
+    Only use the assigned GPU
+    */
+    sycl_gpu_mgr(int main_gpu_id) {
+        sycl::device device = dpct::dev_mgr::instance().get_device(main_gpu_id);
+        dpct::device_info prop;
+        dpct::get_device_info(prop, device);
+        gpus.push_back(main_gpu_id);
+        devices.push_back(device);
+        work_group_size = prop.get_max_work_group_size();
+        max_compute_units = prop.get_max_compute_units();
+
+        get_allow_gpus();
+        create_context_with_gpus();
+    }
+
     void create_context_with_gpus() {
         sycl::context ctx = sycl::context(devices);
         assert(gpus.size() > 0);
@@ -3422,7 +3520,7 @@ class sycl_gpu_mgr {
             gpus_list += std::to_string(gpus[i]);
             gpus_list += ",";
         }
-        if (gpus_list.length() > 2) {
+        if (gpus_list.length() > 1) {
            gpus_list.pop_back();
        }
    }
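This one-character change fixes the single-GPU case: with one GPU the list is "0," (length 2), so the old `length() > 2` guard never removed the trailing comma. A minimal sketch of the boundary condition:

    #include <cstdio>
    #include <string>
    int main() {
        std::string gpus_list = "0,";
        if (gpus_list.length() > 1) gpus_list.pop_back();
        std::printf("%s\n", gpus_list.c_str()); // prints "0"
    }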
@@ -3471,8 +3569,8 @@ class sycl_gpu_mgr {
             if (gpus[i] == id)
                 return i;
         }
-        assert(false);
-        return -1;
+        printf("miss to get device index by id=%d\n", id);
+        GGML_ASSERT(false);
     }
 
     int get_next_index(int id) {
@@ -3481,8 +3579,7 @@
             if (gpus[i] == id)
                 return i;
         }
-        assert(false);
-        return -1;
+        GGML_ASSERT(false);
     }
 
     bool is_ext_oneapi_device(const sycl::device &dev) {
@@ -3500,11 +3597,14 @@ static int g_device_count = -1;
 static int g_all_sycl_device_count = -1;
 static int g_main_device = -1;
 static int g_main_device_id = -1;
+static bool g_ggml_backend_sycl_buffer_type_initialized = false;
 
 static std::array<float, GGML_SYCL_MAX_DEVICES> g_default_tensor_split = {};
 
 static float g_tensor_split[GGML_SYCL_MAX_DEVICES] = {0};
 
+static ggml_sycl_backend_gpu_mode g_ggml_sycl_backend_gpu_mode = SYCL_UNSET_GPU_MODE;
+
 struct sycl_device_capabilities {
     int cc;  // compute capability
     bool vmm; // virtual memory support
@@ -13008,17 +13108,20 @@ bool ggml_sycl_loaded(void) {
     return g_sycl_loaded;
 }
 
-void print_device_detail(int id) {
+void print_device_detail(int id, sycl::device &device, std::string device_type) {
+
     dpct::device_info prop;
     SYCL_CHECK(CHECK_TRY_ERROR(
-        dpct::get_device_info(prop, dpct::dev_mgr::instance().get_device(id))));
-    sycl::device cur_device = dpct::dev_mgr::instance().get_device(id);
+        dpct::get_device_info(prop, device)));
+
     std::string version;
     version += std::to_string(prop.get_major_version());
     version += ".";
     version += std::to_string(prop.get_minor_version());
 
-    fprintf(stderr, "|%2d|%45s|%18s|%17d|%14d|%13d|%15lu|\n", id,
+    device_type = std::regex_replace(device_type, std::regex("ext_oneapi_"), "");
+
+    fprintf(stderr, "|%2d|%18s|%45s|%10s|%11d|%8d|%7d|%15lu|\n", id, device_type.c_str(),
             prop.get_name(), version.c_str(), prop.get_max_compute_units(),
             prop.get_max_work_group_size(), prop.get_max_sub_group_size(),
             prop.get_global_mem_size());
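The regex call strips the verbose "ext_oneapi_" prefix from the printed device-type label (hence the new <regex> include at the top of the file). A standalone sketch of that one call, with a sample label:

    #include <cstdio>
    #include <regex>
    #include <string>
    int main() {
        std::string device_type = "[ext_oneapi_level_zero:gpu:0]";
        device_type = std::regex_replace(device_type, std::regex("ext_oneapi_"), "");
        std::printf("%s\n", device_type.c_str()); // prints "[level_zero:gpu:0]"
    }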
@@ -13026,19 +13129,35 @@ void print_device_detail(int id) {
 
 void ggml_backend_sycl_print_sycl_devices() {
     int device_count = dpct::dev_mgr::instance().device_count();
+    std::map<std::string, size_t> DeviceNums;
     fprintf(stderr, "found %d SYCL devices:\n", device_count);
-    fprintf(stderr, "|ID|                     Name                    |compute capability|Max compute units|Max work group|Max sub group|Global mem size|\n");
-    fprintf(stderr, "|--|---------------------------------------------|------------------|-----------------|--------------|-------------|---------------|\n");
+    fprintf(stderr, "|  |                  |                                             |Compute   |Max compute|Max work|Max sub|               |\n");
+    fprintf(stderr, "|ID|       Device Type|                                         Name|capability|units      |group   |group  |Global mem size|\n");
+    fprintf(stderr, "|--|------------------|---------------------------------------------|----------|-----------|--------|-------|---------------|\n");
    for (int id = 0; id < device_count; ++id) {
-        print_device_detail(id);
+        sycl::device device = dpct::dev_mgr::instance().get_device(id);
+        sycl::backend backend = device.get_backend();
+        std::string backend_type = get_device_backend_and_type(device);
+        int type_id=DeviceNums[backend_type]++;
+        std::stringstream device_type;
+        device_type << "[" <<  backend_type << ":" << std::to_string(type_id) << "]";
+        print_device_detail(id, device, device_type.str());
    }
 }
 
 void print_gpu_device_list() {
-    fprintf(stderr, "detect %d SYCL GPUs: [%s] with Max compute units:%d\n",
-            g_sycl_gpu_mgr->get_gpu_count(),
-            g_sycl_gpu_mgr->gpus_list.c_str(),
-            g_sycl_gpu_mgr->max_compute_units);
+    GGML_ASSERT(g_sycl_gpu_mgr);
+
+    char* hint=NULL;
+    if (g_ggml_sycl_backend_gpu_mode == SYCL_SINGLE_GPU_MODE) {
+        hint = "use %d SYCL GPUs: [%s] with Max compute units:%d\n";
+    } else {
+        hint = "detect %d SYCL GPUs: [%s] with top Max compute units:%d\n";
+    }
+    fprintf(stderr, hint,
+            g_sycl_gpu_mgr->get_gpu_count(),
+            g_sycl_gpu_mgr->gpus_list.c_str(),
+            g_sycl_gpu_mgr->max_compute_units);
 }
 
 int get_sycl_env(const char *env_name, int default_val) {
  int get_sycl_env(const char *env_name, int default_val) {
@@ -13074,6 +13193,15 @@ void ggml_init_sycl() try {
13074
13193
  #else
13075
13194
  fprintf(stderr, "%s: GGML_SYCL_F16: no\n", __func__);
13076
13195
  #endif
13196
+
13197
+ /* NOT REMOVE, keep it for next optimize for XMX.
13198
+ #if defined(SYCL_USE_XMX)
13199
+ fprintf(stderr, "%s: SYCL_USE_XMX: yes\n", __func__);
13200
+ #else
13201
+ fprintf(stderr, "%s: SYCL_USE_XMX: no\n", __func__);
13202
+ #endif
13203
+ */
13204
+
13077
13205
  if (CHECK_TRY_ERROR(g_all_sycl_device_count =
13078
13206
  dpct::dev_mgr::instance().device_count()) != 0) {
13079
13207
  initialized = true;
@@ -13082,68 +13210,65 @@ void ggml_init_sycl() try {
         }
         GGML_ASSERT(g_all_sycl_device_count <= GGML_SYCL_MAX_DEVICES);
         ggml_backend_sycl_print_sycl_devices();
+        initialized = true;
+        g_sycl_loaded = true;
+    }
+}
+catch (sycl::exception const &exc) {
+    std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+              << ", line:" << __LINE__ << std::endl;
+    std::exit(1);
+}
 
-        if (!g_sycl_gpu_mgr) g_sycl_gpu_mgr = new sycl_gpu_mgr();
-
-        g_device_count = g_sycl_gpu_mgr->get_gpu_count();
-        g_work_group_size = g_sycl_gpu_mgr->work_group_size;
-
-        print_gpu_device_list();
+void ggml_init_by_gpus(int device_count) try {
+    g_device_count = device_count;
+    g_work_group_size = g_sycl_gpu_mgr->work_group_size;
 
-        int64_t total_vram = 0;
+    int64_t total_vram = 0;
 
-        /* NOT REMOVE, keep it for next optimize for XMX.
-        #if defined(SYCL_USE_XMX)
-        fprintf(stderr, "%s: SYCL_USE_XMX: yes\n", __func__);
-        #else
-        fprintf(stderr, "%s: SYCL_USE_XMX: no\n", __func__);
-        #endif
-        */
-        for (int id = 0; id < GGML_SYCL_MAX_DEVICES; ++id) {
-            g_device_caps[id].vmm = 0;
-            g_device_caps[id].device_id = -1;
-            g_device_caps[id].cc = 0;
-            g_tensor_split[id] = 0;
-            g_default_tensor_split[id] = 0;
-        }
+    print_gpu_device_list();
 
-        for (int i = 0; i < g_device_count; ++i) {
-            int device_id = g_sycl_gpu_mgr->gpus[i];
-            g_device_caps[i].vmm = 0;
+    for (int id = 0; id < GGML_SYCL_MAX_DEVICES; ++id) {
+        g_device_caps[id].vmm = 0;
+        g_device_caps[id].device_id = -1;
+        g_device_caps[id].cc = 0;
+        g_tensor_split[id] = 0;
+        g_default_tensor_split[id] = 0;
+    }
 
-            dpct::device_info prop;
-            SYCL_CHECK(CHECK_TRY_ERROR(dpct::get_device_info(
-                prop, dpct::dev_mgr::instance().get_device(device_id))));
+    for (int i = 0; i < g_device_count; ++i) {
+        int device_id = g_sycl_gpu_mgr->gpus[i];
+        g_device_caps[i].vmm = 0;
 
-            g_default_tensor_split[i] = total_vram;
-            total_vram += prop.get_global_mem_size();
+        dpct::device_info prop;
+        SYCL_CHECK(CHECK_TRY_ERROR(dpct::get_device_info(
+            prop, dpct::dev_mgr::instance().get_device(device_id))));
 
-            g_device_caps[i].cc =
-                100 * prop.get_major_version() + 10 * prop.get_minor_version();
-        }
+        g_default_tensor_split[i] = total_vram;
+        total_vram += prop.get_global_mem_size();
 
-        for (int i = 0; i < g_device_count; ++i) {
-            g_default_tensor_split[i] /= total_vram;
-        }
+        g_device_caps[i].cc =
+            100 * prop.get_major_version() + 10 * prop.get_minor_version();
+    }
 
-        for (int i = 0; i < g_device_count; ++i) {
-            SYCL_CHECK(ggml_sycl_set_device(i));
+    for (int i = 0; i < g_device_count; ++i) {
+        g_default_tensor_split[i] /= total_vram;
+    }
 
-            // create sycl streams
-            for (int is = 0; is < MAX_STREAMS; ++is) {
-                SYCL_CHECK(CHECK_TRY_ERROR(
-                    g_syclStreams[i][is] =
-                        dpct::get_current_device().create_queue(
-                            g_sycl_gpu_mgr->get_co_ctx(), dpct::get_current_device())));
-            }
+    for (int i = 0; i < g_device_count; ++i) {
+        SYCL_CHECK(ggml_sycl_set_device(i));
 
-            const dpct::queue_ptr stream = g_syclStreams[i][0];
-            // create sycl handle
-            SYCL_CHECK(CHECK_TRY_ERROR(g_sycl_handles[i] = stream));
+        // create sycl streams
+        for (int is = 0; is < MAX_STREAMS; ++is) {
+            SYCL_CHECK(CHECK_TRY_ERROR(
+                g_syclStreams[i][is] =
+                    dpct::get_current_device().create_queue(
+                        g_sycl_gpu_mgr->get_co_ctx(), dpct::get_current_device())));
        }
 
-        initialized = true;
-        g_sycl_loaded = true;
+        const dpct::queue_ptr stream = g_syclStreams[i][0];
+        // create sycl handle
+        SYCL_CHECK(CHECK_TRY_ERROR(g_sycl_handles[i] = stream));
    }
 }
 catch (sycl::exception const &exc) {
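The per-device setup moves from ggml_init_sycl into the new ggml_init_by_gpus, so it can rerun whenever the GPU mode changes. Its default tensor split assigns each device its cumulative VRAM offset divided by total VRAM. A worked sketch of that arithmetic, with hypothetical memory sizes of 16 GiB and 8 GiB:

    #include <cstdio>
    int main() {
        double vram[2] = {16.0, 8.0};
        double split[2];
        double total_vram = 0;
        for (int i = 0; i < 2; ++i) { split[i] = total_vram; total_vram += vram[i]; }
        for (int i = 0; i < 2; ++i) { split[i] /= total_vram; }
        std::printf("%.3f %.3f\n", split[0], split[1]); // prints "0.000 0.667"
    }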
@@ -16551,22 +16676,24 @@ static ggml_backend_buffer_type_i ggml_backend_sycl_buffer_type_interface = {
     /* .is_host = */ nullptr,
 };
 
-ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device) {
+ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device_index) {
+    if (device_index>=g_device_count or device_index<0) {
+        printf("ggml_backend_sycl_buffer_type error: device_index:%d is out of range [0, %d], miss to call ggml_backend_sycl_set_single_device()\n",
+            device_index, g_device_count-1);
+        GGML_ASSERT(device_index<g_device_count);
+    }
     static struct ggml_backend_buffer_type ggml_backend_sycl_buffer_types[GGML_SYCL_MAX_DEVICES];
 
-    static bool ggml_backend_sycl_buffer_type_initialized = false;
-
-    if (!ggml_backend_sycl_buffer_type_initialized) {
+    if (!g_ggml_backend_sycl_buffer_type_initialized) {
         for (int i = 0; i < g_device_count; i++) {
             ggml_backend_sycl_buffer_types[i] = {
                 /* .iface   = */ ggml_backend_sycl_buffer_type_interface,
                 /* .context = */ new ggml_backend_sycl_buffer_type_context{i, GGML_SYCL_NAME + std::to_string(g_sycl_gpu_mgr->gpus[i])},
             };
         }
-        ggml_backend_sycl_buffer_type_initialized = true;
+        g_ggml_backend_sycl_buffer_type_initialized = true;
     }
-
-    return &ggml_backend_sycl_buffer_types[device];
+    return &ggml_backend_sycl_buffer_types[device_index];
 }
 
 // sycl split buffer type
@@ -17265,6 +17392,7 @@ static ggml_backend_i ggml_backend_sycl_interface = {
     /* .graph_plan_compute = */ NULL,
     /* .graph_compute = */ ggml_backend_sycl_graph_compute,
     /* .supports_op = */ ggml_backend_sycl_supports_op,
+    /* .offload_op = */ NULL,
     /* .event_new = */ NULL,
     /* .event_free = */ NULL,
     /* .event_record = */ NULL,
@@ -17319,11 +17447,42 @@ GGML_API GGML_CALL int ggml_backend_sycl_get_device_index(int device_id) {
     return g_sycl_gpu_mgr->get_index(device_id);
 }
 
+GGML_API GGML_CALL int ggml_backend_sycl_get_device_id(int device_index) {
+    return g_sycl_gpu_mgr->gpus[device_index];
+}
+
+GGML_API GGML_CALL void ggml_backend_sycl_set_single_device_mode(int main_gpu_id) {
+    GGML_ASSERT(main_gpu_id<g_all_sycl_device_count);
+    fprintf(stderr, "ggml_backend_sycl_set_single_device: use single device: [%d]\n", main_gpu_id);
+    if (g_sycl_gpu_mgr) {
+        delete g_sycl_gpu_mgr;
+    }
+    g_sycl_gpu_mgr = new sycl_gpu_mgr(main_gpu_id);
+    g_ggml_sycl_backend_gpu_mode = SYCL_SINGLE_GPU_MODE;
+    ggml_init_by_gpus(g_sycl_gpu_mgr->get_gpu_count());
+    g_ggml_backend_sycl_buffer_type_initialized = false;
+}
+
+GGML_API GGML_CALL void ggml_backend_sycl_set_mul_device_mode() {
+    if (g_ggml_sycl_backend_gpu_mode == SYCL_MUL_GPU_MODE) {
+        return;
+    }
+
+    fprintf(stderr, "ggml_backend_sycl_set_mul_device_mode: true\n");
+
+    if (g_sycl_gpu_mgr) {
+        delete g_sycl_gpu_mgr;
+    }
+    g_sycl_gpu_mgr = new sycl_gpu_mgr();
+    g_ggml_sycl_backend_gpu_mode = SYCL_MUL_GPU_MODE;
+    ggml_init_by_gpus(g_sycl_gpu_mgr->get_gpu_count());
+    g_ggml_backend_sycl_buffer_type_initialized = false;
+}
+
 extern "C" int ggml_backend_sycl_reg_devices();
 
 int ggml_backend_sycl_reg_devices() {
-    if (!g_sycl_gpu_mgr) g_sycl_gpu_mgr = new sycl_gpu_mgr();
-    g_device_count = g_sycl_gpu_mgr->get_gpu_count();
+    ggml_backend_sycl_set_mul_device_mode();
     assert(g_device_count>0);
     for (int i = 0; i < g_device_count; i++) {
         int id = g_sycl_gpu_mgr->gpus[i];
@@ -13,7 +13,7 @@
 extern "C" {
 #endif
 
-#define GGML_SYCL_MAX_DEVICES 16
+#define GGML_SYCL_MAX_DEVICES 48
 #define GGML_SYCL_NAME "SYCL"
 
 GGML_API void ggml_init_sycl(void);
@@ -29,6 +29,11 @@ GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_sycl_split_buffer_typ
 GGML_API GGML_CALL void ggml_backend_sycl_get_device_memory(int device, size_t *free, size_t *total);
 GGML_API GGML_CALL int ggml_backend_sycl_get_device_index(int device_id);
 
+// TODO: these are temporary
+// ref: https://github.com/ggerganov/llama.cpp/pull/6022#issuecomment-1992615670
+GGML_API GGML_CALL int ggml_backend_sycl_get_device_id(int device_index);
+GGML_API GGML_CALL void ggml_backend_sycl_set_single_device_mode(int main_gpu_id);
+GGML_API GGML_CALL void ggml_backend_sycl_set_mul_device_mode();
 #ifdef __cplusplus
 }
 #endif
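A hedged usage sketch of the new device-mode API declared above: one plausible call sequence is to pick a mode before requesting any SYCL buffer types. `select_sycl_devices` and `main_gpu_id` are hypothetical caller-side names:

    #include "ggml-sycl.h"
    void select_sycl_devices(bool single_gpu, int main_gpu_id) {
        if (single_gpu) {
            ggml_backend_sycl_set_single_device_mode(main_gpu_id);
        } else {
            ggml_backend_sycl_set_mul_device_mode();
        }
        // either call rebuilds g_sycl_gpu_mgr and clears
        // g_ggml_backend_sycl_buffer_type_initialized, so the buffer types
        // are re-created on the next ggml_backend_sycl_buffer_type() call
    }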
@@ -710,6 +710,12 @@ static uint32_t ggml_vk_find_queue_family_index(std::vector<vk::QueueFamilyPrope
         }
     }
 
+    // All commands that are allowed on a queue that supports transfer operations are also allowed on a queue that supports either graphics or compute operations.
+    // Thus, if the capabilities of a queue family include VK_QUEUE_GRAPHICS_BIT or VK_QUEUE_COMPUTE_BIT, then reporting the VK_QUEUE_TRANSFER_BIT capability separately for that queue family is optional.
+    if (compute_index >= 0) {
+        return compute_index;
+    }
+
     std::cerr << "ggml_vulkan: No suitable queue family index found." << std::endl;
 
     for(auto &q_family : queue_family_props) {
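The new early return relies on the Vulkan guarantee quoted in the comments: a family advertising graphics or compute implicitly supports transfer, so it need not set VK_QUEUE_TRANSFER_BIT. A minimal standalone sketch of that fallback rule, assuming <vulkan/vulkan.hpp>:

    #include <vulkan/vulkan.hpp>
    #include <vector>
    static int find_transfer_family(const std::vector<vk::QueueFamilyProperties> &props) {
        // first pass: a family that reports the transfer bit explicitly
        for (size_t i = 0; i < props.size(); i++) {
            if (props[i].queueFlags & vk::QueueFlagBits::eTransfer) return (int)i;
        }
        // fallback: graphics/compute families support transfer implicitly
        for (size_t i = 0; i < props.size(); i++) {
            if (props[i].queueFlags & (vk::QueueFlagBits::eGraphics | vk::QueueFlagBits::eCompute)) return (int)i;
        }
        return -1;
    }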
@@ -5693,6 +5699,7 @@ static ggml_backend_i ggml_backend_vk_interface = {
     /* .graph_plan_compute = */ NULL,
     /* .graph_compute = */ ggml_backend_vk_graph_compute,
     /* .supports_op = */ ggml_backend_vk_supports_op,
+    /* .offload_op = */ NULL,
     /* .event_new = */ NULL,
     /* .event_free = */ NULL,
     /* .event_record = */ NULL,