llama_cpp 0.15.2 → 0.15.4

This diff shows the changes between publicly released versions of the package as published to one of the supported registries, and is provided for informational purposes only.
@@ -43,19 +43,59 @@
  #include <mutex>
  #include <stdint.h>
  #include <stdio.h>
+ #include <stdarg.h>
+ #include <stdlib.h>
  #include <string>
  #include <vector>

  static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");

+ static void ggml_cuda_default_log_callback(enum ggml_log_level level, const char * msg, void * user_data) {
+     GGML_UNUSED(level);
+     GGML_UNUSED(user_data);
+     fprintf(stderr, "%s", msg);
+ }
+
+ ggml_log_callback ggml_cuda_log_callback = ggml_cuda_default_log_callback;
+ void * ggml_cuda_log_user_data = NULL;
+
+ GGML_API void ggml_backend_cuda_log_set_callback(ggml_log_callback log_callback, void * user_data) {
+     ggml_cuda_log_callback = log_callback;
+     ggml_cuda_log_user_data = user_data;
+ }
+
+ #define GGML_CUDA_LOG_INFO(...) ggml_cuda_log(GGML_LOG_LEVEL_INFO, __VA_ARGS__)
+ #define GGML_CUDA_LOG_WARN(...) ggml_cuda_log(GGML_LOG_LEVEL_WARN, __VA_ARGS__)
+ #define GGML_CUDA_LOG_ERROR(...) ggml_cuda_log(GGML_LOG_LEVEL_ERROR, __VA_ARGS__)
+
+ GGML_ATTRIBUTE_FORMAT(2, 3)
+ static void ggml_cuda_log(enum ggml_log_level level, const char * format, ...) {
+     if (ggml_cuda_log_callback != NULL) {
+         va_list args;
+         va_start(args, format);
+         char buffer[128];
+         int len = vsnprintf(buffer, 128, format, args);
+         if (len < 128) {
+             ggml_cuda_log_callback(level, buffer, ggml_cuda_log_user_data);
+         } else {
+             std::vector<char> buffer2(len + 1); // vsnprintf adds a null terminator
+             va_end(args);
+             va_start(args, format);
+             vsnprintf(&buffer2[0], buffer2.size(), format, args);
+             ggml_cuda_log_callback(level, buffer2.data(), ggml_cuda_log_user_data);
+         }
+         va_end(args);
+     }
+ }
+
  [[noreturn]]
  void ggml_cuda_error(const char * stmt, const char * func, const char * file, int line, const char * msg) {
      int id = -1; // in case cudaGetDevice fails
      cudaGetDevice(&id);

-     fprintf(stderr, "CUDA error: %s\n", msg);
-     fprintf(stderr, " current device: %d, in function %s at %s:%d\n", id, func, file, line);
-     fprintf(stderr, " %s\n", stmt);
+     GGML_CUDA_LOG_ERROR("CUDA error: %s\n", msg);
+     GGML_CUDA_LOG_ERROR(" current device: %d, in function %s at %s:%d\n", id, func, file, line);
+     GGML_CUDA_LOG_ERROR(" %s\n", stmt);
      // abort with GGML_ASSERT to get a stack trace
      GGML_ASSERT(!"CUDA error");
  }
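The hunk above introduces a pluggable logging hook for the CUDA backend: a default callback that prints to stderr, a registration function, and GGML_CUDA_LOG_* macros that route through it. As a rough usage sketch (not part of this diff), an application could redirect these messages to its own sink via the new ggml_backend_cuda_log_set_callback(); the callback signature mirrors ggml_cuda_default_log_callback above, while my_log_sink and the FILE* user_data below are illustrative names, not llama.cpp API.

    // Hedged sketch: route ggml CUDA backend log output into an application sink.
    #include <stdio.h>
    #include "ggml-cuda.h" // declares ggml_backend_cuda_log_set_callback (see the header hunk further down)

    static void my_log_sink(enum ggml_log_level level, const char * msg, void * user_data) {
        FILE * out = (FILE *) user_data;                      // illustrative: log destination passed as user_data
        fprintf(out, "[ggml-cuda:%d] %s", (int) level, msg);  // backend messages already carry their own newline
    }

    int main(void) {
        ggml_backend_cuda_log_set_callback(my_log_sink, stderr);
        ggml_backend_t backend = ggml_backend_cuda_init(0);   // init messages now flow through my_log_sink
        if (backend != NULL) {
            ggml_backend_free(backend);
        }
        return 0;
    }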
@@ -79,6 +119,20 @@ int ggml_cuda_get_device() {
      return id;
  }

+ static cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device) {
+     ggml_cuda_set_device(device);
+ #if defined(GGML_USE_HIPBLAS) && defined(GGML_HIP_UMA)
+     auto res = hipMallocManaged(ptr, size);
+     if (res == hipSuccess) {
+         // if error we "need" to know why...
+         CUDA_CHECK(hipMemAdvise(*ptr, size, hipMemAdviseSetCoarseGrain, device));
+     }
+     return res;
+ #else
+     return cudaMalloc(ptr, size);
+ #endif
+ }
+
  static ggml_cuda_device_info ggml_cuda_init() {
  #ifdef __HIP_PLATFORM_AMD__
      // Workaround for a rocBLAS bug when using multiple graphics cards:
@@ -91,7 +145,7 @@ static ggml_cuda_device_info ggml_cuda_init() {

      cudaError_t err = cudaGetDeviceCount(&info.device_count);
      if (err != cudaSuccess) {
-         fprintf(stderr, "%s: failed to initialize " GGML_CUDA_NAME ": %s\n", __func__, cudaGetErrorString(err));
+         GGML_CUDA_LOG_ERROR("%s: failed to initialize " GGML_CUDA_NAME ": %s\n", __func__, cudaGetErrorString(err));
          return info;
      }

@@ -99,16 +153,16 @@

      int64_t total_vram = 0;
  #if defined(GGML_CUDA_FORCE_MMQ)
-     fprintf(stderr, "%s: GGML_CUDA_FORCE_MMQ: yes\n", __func__);
+     GGML_CUDA_LOG_INFO("%s: GGML_CUDA_FORCE_MMQ: yes\n", __func__);
  #else
-     fprintf(stderr, "%s: GGML_CUDA_FORCE_MMQ: no\n", __func__);
+     GGML_CUDA_LOG_INFO("%s: GGML_CUDA_FORCE_MMQ: no\n", __func__);
  #endif
  #if defined(CUDA_USE_TENSOR_CORES)
-     fprintf(stderr, "%s: CUDA_USE_TENSOR_CORES: yes\n", __func__);
+     GGML_CUDA_LOG_INFO("%s: CUDA_USE_TENSOR_CORES: yes\n", __func__);
  #else
-     fprintf(stderr, "%s: CUDA_USE_TENSOR_CORES: no\n", __func__);
+     GGML_CUDA_LOG_INFO("%s: CUDA_USE_TENSOR_CORES: no\n", __func__);
  #endif
-     fprintf(stderr, "%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, info.device_count);
+     GGML_CUDA_LOG_INFO("%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, info.device_count);
      for (int id = 0; id < info.device_count; ++id) {
          int device_vmm = 0;

@@ -129,7 +183,7 @@ static ggml_cuda_device_info ggml_cuda_init() {

          cudaDeviceProp prop;
          CUDA_CHECK(cudaGetDeviceProperties(&prop, id));
-         fprintf(stderr, " Device %d: %s, compute capability %d.%d, VMM: %s\n", id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no");
+         GGML_CUDA_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s\n", id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no");

          info.default_tensor_split[id] = total_vram;
          total_vram += prop.totalGlobalMem;
@@ -231,12 +285,12 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
          size_t look_ahead_size = (size_t) (1.05 * size);
          look_ahead_size = 256 * ((look_ahead_size + 255)/256);
          ggml_cuda_set_device(device);
-         CUDA_CHECK(cudaMalloc((void **) &ptr, look_ahead_size));
+         CUDA_CHECK(ggml_cuda_device_malloc(&ptr, look_ahead_size, device));
          *actual_size = look_ahead_size;
          pool_size += look_ahead_size;
  #ifdef DEBUG_CUDA_MALLOC
-         fprintf(stderr, "%s[%d]: %d buffers, max_size = %u MB, pool_size = %u MB, requested %u MB\n", __func__, device, nnz,
-                 (uint32_t)(max_size/1024/1024), (uint32_t)(pool_size/1024/1024), (uint32_t)(size/1024/1024));
+         GGML_CUDA_LOG_INFO("%s[%d]: %d buffers, max_size = %u MB, pool_size = %u MB, requested %u MB\n", __func__, device, nnz,
+                            (uint32_t)(max_size / 1024 / 1024), (uint32_t)(pool_size / 1024 / 1024), (uint32_t)(size / 1024 / 1024));
  #endif
          return ptr;
      }
@@ -250,7 +304,7 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
                  return;
              }
          }
-         fprintf(stderr, "WARNING: cuda buffer pool full, increase MAX_CUDA_BUFFERS\n");
+         GGML_CUDA_LOG_WARN("Cuda buffer pool full, increase MAX_CUDA_BUFFERS\n");
          ggml_cuda_set_device(device);
          CUDA_CHECK(cudaFree(ptr));
          pool_size -= size;
@@ -497,9 +551,11 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffe
      size = std::max(size, (size_t)1); // cudaMalloc returns null for size 0

      void * dev_ptr;
-     cudaError_t err = cudaMalloc(&dev_ptr, size);
+     cudaError_t err = ggml_cuda_device_malloc(&dev_ptr, size, buft_ctx->device);
      if (err != cudaSuccess) {
-         fprintf(stderr, "%s: allocating %.2f MiB on device %d: cudaMalloc failed: %s\n", __func__, size/1024.0/1024.0, buft_ctx->device, cudaGetErrorString(err));
+         // clear the error
+         cudaGetLastError();
+         GGML_CUDA_LOG_ERROR("%s: allocating %.2f MiB on device %d: cudaMalloc failed: %s\n", __func__, size / 1024.0 / 1024.0, buft_ctx->device, cudaGetErrorString(err));
          return nullptr;
      }

@@ -756,7 +812,7 @@ GGML_CALL static void ggml_backend_cuda_split_buffer_init_tensor(ggml_backend_bu
          // currently, init_tensor cannot fail, it needs to be fixed in ggml-backend first
          ggml_cuda_set_device(id);
          char * buf;
-         CUDA_CHECK(cudaMalloc(&buf, size));
+         CUDA_CHECK(ggml_cuda_device_malloc((void**)&buf, size, id));

          // set padding to 0 to avoid possible NaN values
          if (size > original_size) {
@@ -1002,8 +1058,8 @@ static void * ggml_cuda_host_malloc(size_t size) {
      if (err != cudaSuccess) {
          // clear the error
          cudaGetLastError();
-         fprintf(stderr, "%s: warning: failed to allocate %.2f MiB of pinned memory: %s\n", __func__,
-                 size/1024.0/1024.0, cudaGetErrorString(err));
+         GGML_CUDA_LOG_WARN("%s: failed to allocate %.2f MiB of pinned memory: %s\n", __func__,
+                            size / 1024.0 / 1024.0, cudaGetErrorString(err));
          return nullptr;
      }

@@ -1814,7 +1870,7 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co
          }
      }
  #else
-     if (r2 == 1 && r3 == 1 && src0->nb[2]*src0->ne[2] == src0->nb[3] && src1->nb[2]*src1->ne[2] == src1->nb[3]) {
+     if (r2 == 1 && r3 == 1 && ggml_is_contiguous_2(src0) && ggml_is_contiguous_2(src1)) {
          // there is no broadcast and src0, src1 are contiguous across dims 2, 3
          // use cublasGemmStridedBatchedEx
          CUBLAS_CHECK(
@@ -2246,7 +2302,7 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
              break;
          case GGML_OP_MUL_MAT:
              if (dst->src[0]->ne[3] != dst->src[1]->ne[3]) {
-                 fprintf(stderr, "%s: cannot compute %s: src0->ne[3] = %" PRId64 ", src1->ne[3] = %" PRId64 " - fallback to CPU\n", __func__, dst->name, dst->src[0]->ne[3], dst->src[1]->ne[3]);
+                 GGML_CUDA_LOG_ERROR("%s: cannot compute %s: src0->ne[3] = %" PRId64 ", src1->ne[3] = %" PRId64 " - fallback to CPU\n", __func__, dst->name, dst->src[0]->ne[3], dst->src[1]->ne[3]);
                  return false;
              } else {
                  ggml_cuda_mul_mat(ctx, dst->src[0], dst->src[1], dst);
@@ -2300,7 +2356,7 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg

      cudaError_t err = cudaGetLastError();
      if (err != cudaSuccess) {
-         fprintf(stderr, "%s: %s failed\n", __func__, ggml_op_desc(dst));
+         GGML_CUDA_LOG_ERROR("%s: %s failed\n", __func__, ggml_op_desc(dst));
          CUDA_CHECK(err);
      }

@@ -2468,15 +2524,15 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t

      bool use_cuda_graph = true;
      bool cuda_graph_update_required = false;
-     // pointer to CUDA cpy kernel, which is required to identify
+     // vector of pointers to CUDA cpy kernels, which are required to identify
      // kernel parameters which need updated in the graph for each token
-     void * ggml_cuda_cpy_fn_ptr = nullptr;
+     std::vector<void *> ggml_cuda_cpy_fn_ptrs;

      if (cuda_ctx->cuda_graph->graph == nullptr) {
          if (ggml_cuda_info().devices[cuda_ctx->device].cc < CC_AMPERE) {
              cuda_ctx->cuda_graph->disable_due_to_gpu_arch = true;
  #ifndef NDEBUG
-             fprintf(stderr, "%s: disabling CUDA graphs due to GPU architecture\n", __func__);
+             GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to GPU architecture\n", __func__);
  #endif
          }
      }
@@ -2523,14 +2579,14 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
              if (node->src[0] && ggml_backend_buffer_is_cuda_split(node->src[0]->buffer)) {
                  use_cuda_graph = false; // Split buffers are not supported by CUDA graph capture
  #ifndef NDEBUG
-                 fprintf(stderr, "%s: disabling CUDA graphs due to split buffer\n", __func__);
+                 GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to split buffer\n", __func__);
  #endif
              }

              if (node->op == GGML_OP_MUL_MAT_ID) {
                  use_cuda_graph = false; // This node type is not supported by CUDA graph capture
  #ifndef NDEBUG
-                 fprintf(stderr, "%s: disabling CUDA graphs due to mul_mat_id\n", __func__);
+                 GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to mul_mat_id\n", __func__);
  #endif
              }

@@ -2539,16 +2595,17 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
                  // Changes in batch size or context size can cause changes to the grid size of some kernels.
                  use_cuda_graph = false;
  #ifndef NDEBUG
-                 fprintf(stderr, "%s: disabling CUDA graphs due to batch size > 1 [%s] [%ld %ld %ld %ld]\n", __func__, node->name, node->ne[0], node->ne[1], node->ne[2], node->ne[3]);
+                 GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to batch size > 1 [%s] [%ld %ld %ld %ld]\n", __func__, node->name, node->ne[0], node->ne[1], node->ne[2], node->ne[3]);
  #endif
              }

              if (node->op == GGML_OP_CPY) {
                  // store the copy op parameter which changes with each token.
                  cuda_ctx->cuda_graph->updated_kernel_arg.push_back((char **) &(node->src[1]->data));
-                 if (ggml_cuda_cpy_fn_ptr == nullptr) {
-                     // store a pointer to the copy op CUDA kernel to identify it later
-                     ggml_cuda_cpy_fn_ptr = ggml_cuda_cpy_fn(node->src[0], node->src[1]);
+                 // store a pointer to each copy op CUDA kernel to identify it later
+                 void * ptr = ggml_cuda_cpy_fn(node->src[0], node->src[1]);
+                 if (std::find(ggml_cuda_cpy_fn_ptrs.begin(), ggml_cuda_cpy_fn_ptrs.end(), ptr) == ggml_cuda_cpy_fn_ptrs.end()) {
+                     ggml_cuda_cpy_fn_ptrs.push_back(ptr);
                  }
              }

@@ -2567,7 +2624,7 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
          if (cuda_ctx->cuda_graph->number_consecutive_updates >= 4) {
              cuda_ctx->cuda_graph->disable_due_to_too_many_updates = true;
  #ifndef NDEBUG
-             fprintf(stderr, "%s: disabling CUDA graphs due to too many consecutive updates\n", __func__);
+             GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to too many consecutive updates\n", __func__);
  #endif
          }
      }
@@ -2605,7 +2662,7 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t

              bool ok = ggml_cuda_compute_forward(*cuda_ctx, node);
              if (!ok) {
-                 fprintf(stderr, "%s: error: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op));
+                 GGML_CUDA_LOG_ERROR("%s: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op));
              }
              GGML_ASSERT(ok);
          }
@@ -2624,7 +2681,7 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
              use_cuda_graph = false;
              cuda_ctx->cuda_graph->disable_due_to_failed_graph_capture = true;
  #ifndef NDEBUG
-             fprintf(stderr, "%s: disabling CUDA graphs due to failed graph capture\n", __func__);
+             GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to failed graph capture\n", __func__);
  #endif
          } else {
              graph_evaluated_or_captured = true; // CUDA graph has been captured
@@ -2678,7 +2735,7 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
          if (!cuda_graph_update_required) { // on update steps, the live parameters will already be captured
              int k = 0;
              for (size_t i = 0; i < cuda_ctx->cuda_graph->num_nodes; i++) {
-                 if (cuda_ctx->cuda_graph->params[i].func == ggml_cuda_cpy_fn_ptr) {
+                 if(count(ggml_cuda_cpy_fn_ptrs.begin(), ggml_cuda_cpy_fn_ptrs.end(), cuda_ctx->cuda_graph->params[i].func) > 0) {
                      char ** updated_kernel_arg_ptr = cuda_ctx->cuda_graph->updated_kernel_arg.at(k++);
                      cuda_ctx->cuda_graph->params[i].kernelParams[1] = updated_kernel_arg_ptr;
                      CUDA_CHECK(cudaGraphKernelNodeSetParams(cuda_ctx->cuda_graph->nodes[i], &cuda_ctx->cuda_graph->params[i]));
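Taken together, the @@ -2539 hunk earlier collects each distinct copy-kernel pointer returned by ggml_cuda_cpy_fn() into ggml_cuda_cpy_fn_ptrs (deduplicating with std::find), and the hunk above then tests each captured graph node's function pointer against that vector with count() when patching kernel arguments. A minimal, self-contained sketch of that collect-then-match idiom follows; the pointer values and variable names are illustrative only, and only the std::vector / std::find / std::count usage mirrors the diff.

    // Hedged sketch: keep a small set of unique function pointers in a vector,
    // then answer membership queries against it (the set stays tiny, so linear
    // search is adequate here).
    #include <algorithm>
    #include <cstdio>
    #include <vector>

    int main() {
        void * a = (void *) 0x1000;            // stand-ins for copy-kernel addresses
        void * b = (void *) 0x2000;
        void * observed[] = { a, b, a, a, b }; // pointers seen while walking the graph

        std::vector<void *> cpy_fn_ptrs;       // unique pointers only
        for (void * p : observed) {
            if (std::find(cpy_fn_ptrs.begin(), cpy_fn_ptrs.end(), p) == cpy_fn_ptrs.end()) {
                cpy_fn_ptrs.push_back(p);
            }
        }

        // later: is a given captured node one of the copy kernels?
        bool is_cpy = std::count(cpy_fn_ptrs.begin(), cpy_fn_ptrs.end(), b) > 0;
        std::printf("%zu unique pointers; b is a copy kernel: %d\n", cpy_fn_ptrs.size(), (int) is_cpy);
        return 0;
    }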
@@ -2691,7 +2748,7 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
          cudaError_t stat = cudaGraphExecUpdate(cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, &result_info);
          if (stat == cudaErrorGraphExecUpdateFailure) {
  #ifndef NDEBUG
-             fprintf(stderr, "%s: CUDA graph update failed\n", __func__);
+             GGML_CUDA_LOG_ERROR("%s: CUDA graph update failed\n", __func__);
  #endif
              // The pre-existing graph exec cannot be updated due to violated constraints
              // so instead clear error and re-instantiate
@@ -2829,7 +2886,9 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
          case GGML_OP_CONT:
          case GGML_OP_DIAG_MASK_INF:
          case GGML_OP_SOFT_MAX:
+             return true;
          case GGML_OP_ROPE:
+             return ggml_is_contiguous(op->src[0]);
          case GGML_OP_IM2COL:
          case GGML_OP_POOL_2D:
          case GGML_OP_SUM_ROWS:
@@ -2948,13 +3007,13 @@ static ggml_guid_t ggml_backend_cuda_guid() {

  GGML_CALL ggml_backend_t ggml_backend_cuda_init(int device) {
      if (device < 0 || device >= ggml_backend_cuda_get_device_count()) {
-         fprintf(stderr, "%s: error: invalid device %d\n", __func__, device);
+         GGML_CUDA_LOG_ERROR("%s: invalid device %d\n", __func__, device);
          return nullptr;
      }

      ggml_backend_cuda_context * ctx = new ggml_backend_cuda_context(device);
      if (ctx == nullptr) {
-         fprintf(stderr, "%s: error: failed to allocate context\n", __func__);
+         GGML_CUDA_LOG_ERROR("%s: failed to allocate context\n", __func__);
          return nullptr;
      }

@@ -2998,8 +3057,8 @@ GGML_CALL bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size
          // clear the error
          cudaGetLastError();

-         fprintf(stderr, "%s: warning: failed to register %.2f MiB of pinned memory: %s\n", __func__,
-                 size/1024.0/1024.0, cudaGetErrorString(err));
+         GGML_CUDA_LOG_WARN("%s: failed to register %.2f MiB of pinned memory: %s\n", __func__,
+                            size / 1024.0 / 1024.0, cudaGetErrorString(err));
          return false;
      }
      return true;
@@ -38,6 +38,7 @@ GGML_API GGML_CALL void ggml_backend_cuda_get_device_memory(int device, size_t *
  GGML_API GGML_CALL bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size);
  GGML_API GGML_CALL void ggml_backend_cuda_unregister_host_buffer(void * buffer);

+ GGML_API void ggml_backend_cuda_log_set_callback(ggml_log_callback log_callback, void * user_data);
  #ifdef __cplusplus
  }
  #endif
@@ -17,6 +17,18 @@
  #define MIN(a, b) ((a) < (b) ? (a) : (b))
  #define MAX(a, b) ((a) > (b) ? (a) : (b))

+ #if defined(_WIN32)
+
+ #define m512bh(p) p
+ #define m512i(p) p
+
+ #else
+
+ #define m512bh(p) (__m512bh)(p)
+ #define m512i(p) (__m512i)(p)
+
+ #endif
+
  /**
   * Converts brain16 to float32.
   *
@@ -132,6 +144,10 @@ extern "C" {
  #endif
  #endif

+ #if defined(__ARM_FEATURE_SVE)
+ #include <arm_sve.h>
+ #endif
+
  // 16-bit float
  // on Arm, we use __fp16
  // on x86, we use uint16_t
@@ -443,6 +459,34 @@ static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
  #include <riscv_vector.h>
  #endif

+ #if defined(__loongarch64)
+ #if defined(__loongarch_asx)
+ #include <lasxintrin.h>
+ #endif
+ #if defined(__loongarch_sx)
+ #include <lsxintrin.h>
+ #endif
+ #endif
+
+ #if defined(__loongarch_asx)
+
+ typedef union {
+     int32_t i;
+     float f;
+ } ft_union;
+
+ /* float type data load instructions */
+ static __m128 __lsx_vreplfr2vr_s(float val) {
+     ft_union fi_tmpval = {.f = val};
+     return (__m128)__lsx_vreplgr2vr_w(fi_tmpval.i);
+ }
+
+ static __m256 __lasx_xvreplfr2vr_s(float val) {
+     ft_union fi_tmpval = {.f = val};
+     return (__m256)__lasx_xvreplgr2vr_w(fi_tmpval.i);
+ }
+ #endif
+
  #ifdef __F16C__

  #ifdef _MSC_VER
@@ -1597,7 +1597,6 @@ static void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml
              {
                  GGML_ASSERT(ne00 == ne10);

-                 // TODO: assert that dim2 and dim3 are contiguous
                  GGML_ASSERT(ne12 % ne02 == 0);
                  GGML_ASSERT(ne13 % ne03 == 0);

@@ -1677,6 +1676,10 @@
              } break;
              case GGML_OP_ROPE:
              {
+ #pragma message("TODO: implement phi3 frequency factors support")
+ #pragma message(" https://github.com/ggerganov/llama.cpp/pull/7225")
+                 GGML_ASSERT(dst->src[2] == nullptr && "phi3 frequency factors not implemented yet");
+
                  GGML_ASSERT(ne10 == ne02);
                  GGML_ASSERT(src0t == dstt);
                  // const int n_past = ((int32_t *) dst->op_params)[0];