llama_cpp 0.15.2 → 0.15.4

@@ -43,19 +43,59 @@
 #include <mutex>
 #include <stdint.h>
 #include <stdio.h>
+#include <stdarg.h>
+#include <stdlib.h>
 #include <string>
 #include <vector>
 
 static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
 
+static void ggml_cuda_default_log_callback(enum ggml_log_level level, const char * msg, void * user_data) {
+    GGML_UNUSED(level);
+    GGML_UNUSED(user_data);
+    fprintf(stderr, "%s", msg);
+}
+
+ggml_log_callback ggml_cuda_log_callback = ggml_cuda_default_log_callback;
+void * ggml_cuda_log_user_data = NULL;
+
+GGML_API void ggml_backend_cuda_log_set_callback(ggml_log_callback log_callback, void * user_data) {
+    ggml_cuda_log_callback = log_callback;
+    ggml_cuda_log_user_data = user_data;
+}
+
+#define GGML_CUDA_LOG_INFO(...) ggml_cuda_log(GGML_LOG_LEVEL_INFO, __VA_ARGS__)
+#define GGML_CUDA_LOG_WARN(...) ggml_cuda_log(GGML_LOG_LEVEL_WARN, __VA_ARGS__)
+#define GGML_CUDA_LOG_ERROR(...) ggml_cuda_log(GGML_LOG_LEVEL_ERROR, __VA_ARGS__)
+
+GGML_ATTRIBUTE_FORMAT(2, 3)
+static void ggml_cuda_log(enum ggml_log_level level, const char * format, ...) {
+    if (ggml_cuda_log_callback != NULL) {
+        va_list args;
+        va_start(args, format);
+        char buffer[128];
+        int len = vsnprintf(buffer, 128, format, args);
+        if (len < 128) {
+            ggml_cuda_log_callback(level, buffer, ggml_cuda_log_user_data);
+        } else {
+            std::vector<char> buffer2(len + 1); // vsnprintf adds a null terminator
+            va_end(args);
+            va_start(args, format);
+            vsnprintf(&buffer2[0], buffer2.size(), format, args);
+            ggml_cuda_log_callback(level, buffer2.data(), ggml_cuda_log_user_data);
+        }
+        va_end(args);
+    }
+}
+
 [[noreturn]]
 void ggml_cuda_error(const char * stmt, const char * func, const char * file, int line, const char * msg) {
     int id = -1; // in case cudaGetDevice fails
     cudaGetDevice(&id);
 
-    fprintf(stderr, "CUDA error: %s\n", msg);
-    fprintf(stderr, "  current device: %d, in function %s at %s:%d\n", id, func, file, line);
-    fprintf(stderr, "  %s\n", stmt);
+    GGML_CUDA_LOG_ERROR("CUDA error: %s\n", msg);
+    GGML_CUDA_LOG_ERROR("  current device: %d, in function %s at %s:%d\n", id, func, file, line);
+    GGML_CUDA_LOG_ERROR("  %s\n", stmt);
     // abort with GGML_ASSERT to get a stack trace
     GGML_ASSERT(!"CUDA error");
 }
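
The block above routes every backend message through ggml_cuda_log, which falls back to the stderr printer unless an application registers its own sink. A minimal usage sketch, assuming ggml.h, ggml-backend.h and ggml-cuda.h are on the include path; the callback name and level prefixes below are illustrative, not part of the library:

#include <cstdio>
#include "ggml.h"
#include "ggml-backend.h"
#include "ggml-cuda.h"

// Illustrative sink: prefix messages by level and forward them to the FILE*
// passed as user_data. Messages emitted via GGML_CUDA_LOG_* already end in '\n'.
static void my_cuda_logger(enum ggml_log_level level, const char * msg, void * user_data) {
    FILE * out = (FILE *) user_data;
    const char * tag = level == GGML_LOG_LEVEL_ERROR ? "[cuda error] "
                     : level == GGML_LOG_LEVEL_WARN  ? "[cuda warn] "
                                                     : "[cuda] ";
    fprintf(out, "%s%s", tag, msg);
}

int main() {
    // Register the sink before the first backend call so the device
    // enumeration output from ggml_cuda_init() goes through it as well.
    ggml_backend_cuda_log_set_callback(my_cuda_logger, stderr);
    ggml_backend_t backend = ggml_backend_cuda_init(0);
    // ... build and evaluate graphs on the backend ...
    ggml_backend_free(backend);
    return 0;
}
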
@@ -79,6 +119,20 @@ int ggml_cuda_get_device() {
     return id;
 }
 
+static cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device) {
+    ggml_cuda_set_device(device);
+#if defined(GGML_USE_HIPBLAS) && defined(GGML_HIP_UMA)
+    auto res = hipMallocManaged(ptr, size);
+    if (res == hipSuccess) {
+        // if error we "need" to know why...
+        CUDA_CHECK(hipMemAdvise(*ptr, size, hipMemAdviseSetCoarseGrain, device));
+    }
+    return res;
+#else
+    return cudaMalloc(ptr, size);
+#endif
+}
+
 static ggml_cuda_device_info ggml_cuda_init() {
 #ifdef __HIP_PLATFORM_AMD__
     // Workaround for a rocBLAS bug when using multiple graphics cards:
@@ -91,7 +145,7 @@ static ggml_cuda_device_info ggml_cuda_init() {
 
     cudaError_t err = cudaGetDeviceCount(&info.device_count);
     if (err != cudaSuccess) {
-        fprintf(stderr, "%s: failed to initialize " GGML_CUDA_NAME ": %s\n", __func__, cudaGetErrorString(err));
+        GGML_CUDA_LOG_ERROR("%s: failed to initialize " GGML_CUDA_NAME ": %s\n", __func__, cudaGetErrorString(err));
         return info;
     }
 
@@ -99,16 +153,16 @@
 
     int64_t total_vram = 0;
 #if defined(GGML_CUDA_FORCE_MMQ)
-    fprintf(stderr, "%s: GGML_CUDA_FORCE_MMQ: yes\n", __func__);
+    GGML_CUDA_LOG_INFO("%s: GGML_CUDA_FORCE_MMQ: yes\n", __func__);
 #else
-    fprintf(stderr, "%s: GGML_CUDA_FORCE_MMQ: no\n", __func__);
+    GGML_CUDA_LOG_INFO("%s: GGML_CUDA_FORCE_MMQ: no\n", __func__);
 #endif
 #if defined(CUDA_USE_TENSOR_CORES)
-    fprintf(stderr, "%s: CUDA_USE_TENSOR_CORES: yes\n", __func__);
+    GGML_CUDA_LOG_INFO("%s: CUDA_USE_TENSOR_CORES: yes\n", __func__);
 #else
-    fprintf(stderr, "%s: CUDA_USE_TENSOR_CORES: no\n", __func__);
+    GGML_CUDA_LOG_INFO("%s: CUDA_USE_TENSOR_CORES: no\n", __func__);
 #endif
-    fprintf(stderr, "%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, info.device_count);
+    GGML_CUDA_LOG_INFO("%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, info.device_count);
     for (int id = 0; id < info.device_count; ++id) {
         int device_vmm = 0;
 
@@ -129,7 +183,7 @@ static ggml_cuda_device_info ggml_cuda_init() {
 
         cudaDeviceProp prop;
         CUDA_CHECK(cudaGetDeviceProperties(&prop, id));
-        fprintf(stderr, "  Device %d: %s, compute capability %d.%d, VMM: %s\n", id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no");
+        GGML_CUDA_LOG_INFO("  Device %d: %s, compute capability %d.%d, VMM: %s\n", id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no");
 
         info.default_tensor_split[id] = total_vram;
         total_vram += prop.totalGlobalMem;
@@ -231,12 +285,12 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
         size_t look_ahead_size = (size_t) (1.05 * size);
         look_ahead_size = 256 * ((look_ahead_size + 255)/256);
         ggml_cuda_set_device(device);
-        CUDA_CHECK(cudaMalloc((void **) &ptr, look_ahead_size));
+        CUDA_CHECK(ggml_cuda_device_malloc(&ptr, look_ahead_size, device));
         *actual_size = look_ahead_size;
         pool_size += look_ahead_size;
 #ifdef DEBUG_CUDA_MALLOC
-        fprintf(stderr, "%s[%d]: %d buffers, max_size = %u MB, pool_size = %u MB, requested %u MB\n", __func__, device, nnz,
-                (uint32_t)(max_size/1024/1024), (uint32_t)(pool_size/1024/1024), (uint32_t)(size/1024/1024));
+        GGML_CUDA_LOG_INFO("%s[%d]: %d buffers, max_size = %u MB, pool_size = %u MB, requested %u MB\n", __func__, device, nnz,
+                           (uint32_t)(max_size / 1024 / 1024), (uint32_t)(pool_size / 1024 / 1024), (uint32_t)(size / 1024 / 1024));
 #endif
         return ptr;
     }
@@ -250,7 +304,7 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
                 return;
             }
         }
-        fprintf(stderr, "WARNING: cuda buffer pool full, increase MAX_CUDA_BUFFERS\n");
+        GGML_CUDA_LOG_WARN("Cuda buffer pool full, increase MAX_CUDA_BUFFERS\n");
         ggml_cuda_set_device(device);
         CUDA_CHECK(cudaFree(ptr));
         pool_size -= size;
@@ -497,9 +551,11 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffe
     size = std::max(size, (size_t)1); // cudaMalloc returns null for size 0
 
     void * dev_ptr;
-    cudaError_t err = cudaMalloc(&dev_ptr, size);
+    cudaError_t err = ggml_cuda_device_malloc(&dev_ptr, size, buft_ctx->device);
     if (err != cudaSuccess) {
-        fprintf(stderr, "%s: allocating %.2f MiB on device %d: cudaMalloc failed: %s\n", __func__, size/1024.0/1024.0, buft_ctx->device, cudaGetErrorString(err));
+        // clear the error
+        cudaGetLastError();
+        GGML_CUDA_LOG_ERROR("%s: allocating %.2f MiB on device %d: cudaMalloc failed: %s\n", __func__, size / 1024.0 / 1024.0, buft_ctx->device, cudaGetErrorString(err));
         return nullptr;
     }
 
@@ -756,7 +812,7 @@ GGML_CALL static void ggml_backend_cuda_split_buffer_init_tensor(ggml_backend_bu
         // currently, init_tensor cannot fail, it needs to be fixed in ggml-backend first
         ggml_cuda_set_device(id);
         char * buf;
-        CUDA_CHECK(cudaMalloc(&buf, size));
+        CUDA_CHECK(ggml_cuda_device_malloc((void**)&buf, size, id));
 
         // set padding to 0 to avoid possible NaN values
         if (size > original_size) {
@@ -1002,8 +1058,8 @@ static void * ggml_cuda_host_malloc(size_t size) {
     if (err != cudaSuccess) {
         // clear the error
        cudaGetLastError();
-        fprintf(stderr, "%s: warning: failed to allocate %.2f MiB of pinned memory: %s\n", __func__,
-                size/1024.0/1024.0, cudaGetErrorString(err));
+        GGML_CUDA_LOG_WARN("%s: failed to allocate %.2f MiB of pinned memory: %s\n", __func__,
+                           size / 1024.0 / 1024.0, cudaGetErrorString(err));
         return nullptr;
     }
 
@@ -1814,7 +1870,7 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co
         }
     }
 #else
-    if (r2 == 1 && r3 == 1 && src0->nb[2]*src0->ne[2] == src0->nb[3] && src1->nb[2]*src1->ne[2] == src1->nb[3]) {
+    if (r2 == 1 && r3 == 1 && ggml_is_contiguous_2(src0) && ggml_is_contiguous_2(src1)) {
         // there is no broadcast and src0, src1 are contiguous across dims 2, 3
         // use cublasGemmStridedBatchedEx
         CUBLAS_CHECK(
@@ -2246,7 +2302,7 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
             break;
         case GGML_OP_MUL_MAT:
             if (dst->src[0]->ne[3] != dst->src[1]->ne[3]) {
-                fprintf(stderr, "%s: cannot compute %s: src0->ne[3] = %" PRId64 ", src1->ne[3] = %" PRId64 " - fallback to CPU\n", __func__, dst->name, dst->src[0]->ne[3], dst->src[1]->ne[3]);
+                GGML_CUDA_LOG_ERROR("%s: cannot compute %s: src0->ne[3] = %" PRId64 ", src1->ne[3] = %" PRId64 " - fallback to CPU\n", __func__, dst->name, dst->src[0]->ne[3], dst->src[1]->ne[3]);
                 return false;
             } else {
                 ggml_cuda_mul_mat(ctx, dst->src[0], dst->src[1], dst);
@@ -2300,7 +2356,7 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
 
     cudaError_t err = cudaGetLastError();
     if (err != cudaSuccess) {
-        fprintf(stderr, "%s: %s failed\n", __func__, ggml_op_desc(dst));
+        GGML_CUDA_LOG_ERROR("%s: %s failed\n", __func__, ggml_op_desc(dst));
         CUDA_CHECK(err);
     }
 
@@ -2468,15 +2524,15 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
 
     bool use_cuda_graph = true;
    bool cuda_graph_update_required = false;
-    // pointer to CUDA cpy kernel, which is required to identify
+    // vector of pointers to CUDA cpy kernels, which are required to identify
     // kernel parameters which need updated in the graph for each token
-    void * ggml_cuda_cpy_fn_ptr = nullptr;
+    std::vector<void *> ggml_cuda_cpy_fn_ptrs;
 
     if (cuda_ctx->cuda_graph->graph == nullptr) {
         if (ggml_cuda_info().devices[cuda_ctx->device].cc < CC_AMPERE) {
             cuda_ctx->cuda_graph->disable_due_to_gpu_arch = true;
 #ifndef NDEBUG
-            fprintf(stderr, "%s: disabling CUDA graphs due to GPU architecture\n", __func__);
+            GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to GPU architecture\n", __func__);
 #endif
         }
     }
@@ -2523,14 +2579,14 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
             if (node->src[0] && ggml_backend_buffer_is_cuda_split(node->src[0]->buffer)) {
                 use_cuda_graph = false; // Split buffers are not supported by CUDA graph capture
 #ifndef NDEBUG
-                fprintf(stderr, "%s: disabling CUDA graphs due to split buffer\n", __func__);
+                GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to split buffer\n", __func__);
 #endif
             }
 
             if (node->op == GGML_OP_MUL_MAT_ID) {
                 use_cuda_graph = false; // This node type is not supported by CUDA graph capture
 #ifndef NDEBUG
-                fprintf(stderr, "%s: disabling CUDA graphs due to mul_mat_id\n", __func__);
+                GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to mul_mat_id\n", __func__);
 #endif
             }
 
@@ -2539,16 +2595,17 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
                 // Changes in batch size or context size can cause changes to the grid size of some kernels.
                 use_cuda_graph = false;
 #ifndef NDEBUG
-                fprintf(stderr, "%s: disabling CUDA graphs due to batch size > 1 [%s] [%ld %ld %ld %ld]\n", __func__, node->name, node->ne[0], node->ne[1], node->ne[2], node->ne[3]);
+                GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to batch size > 1 [%s] [%ld %ld %ld %ld]\n", __func__, node->name, node->ne[0], node->ne[1], node->ne[2], node->ne[3]);
 #endif
             }
 
             if (node->op == GGML_OP_CPY) {
                 // store the copy op parameter which changes with each token.
                 cuda_ctx->cuda_graph->updated_kernel_arg.push_back((char **) &(node->src[1]->data));
-                if (ggml_cuda_cpy_fn_ptr == nullptr) {
-                    // store a pointer to the copy op CUDA kernel to identify it later
-                    ggml_cuda_cpy_fn_ptr = ggml_cuda_cpy_fn(node->src[0], node->src[1]);
+                // store a pointer to each copy op CUDA kernel to identify it later
+                void * ptr = ggml_cuda_cpy_fn(node->src[0], node->src[1]);
+                if (std::find(ggml_cuda_cpy_fn_ptrs.begin(), ggml_cuda_cpy_fn_ptrs.end(), ptr) == ggml_cuda_cpy_fn_ptrs.end()) {
+                    ggml_cuda_cpy_fn_ptrs.push_back(ptr);
                 }
             }
 
@@ -2567,7 +2624,7 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
         if (cuda_ctx->cuda_graph->number_consecutive_updates >= 4) {
             cuda_ctx->cuda_graph->disable_due_to_too_many_updates = true;
 #ifndef NDEBUG
-            fprintf(stderr, "%s: disabling CUDA graphs due to too many consecutive updates\n", __func__);
+            GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to too many consecutive updates\n", __func__);
 #endif
         }
     }
@@ -2605,7 +2662,7 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
 
             bool ok = ggml_cuda_compute_forward(*cuda_ctx, node);
             if (!ok) {
-                fprintf(stderr, "%s: error: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op));
+                GGML_CUDA_LOG_ERROR("%s: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op));
             }
             GGML_ASSERT(ok);
         }
@@ -2624,7 +2681,7 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
                 use_cuda_graph = false;
                 cuda_ctx->cuda_graph->disable_due_to_failed_graph_capture = true;
 #ifndef NDEBUG
-                fprintf(stderr, "%s: disabling CUDA graphs due to failed graph capture\n", __func__);
+                GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to failed graph capture\n", __func__);
 #endif
             } else {
                 graph_evaluated_or_captured = true; // CUDA graph has been captured
@@ -2678,7 +2735,7 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
         if (!cuda_graph_update_required) { // on update steps, the live parameters will already be captured
             int k = 0;
             for (size_t i = 0; i < cuda_ctx->cuda_graph->num_nodes; i++) {
-                if (cuda_ctx->cuda_graph->params[i].func == ggml_cuda_cpy_fn_ptr) {
+                if (count(ggml_cuda_cpy_fn_ptrs.begin(), ggml_cuda_cpy_fn_ptrs.end(), cuda_ctx->cuda_graph->params[i].func) > 0) {
                     char ** updated_kernel_arg_ptr = cuda_ctx->cuda_graph->updated_kernel_arg.at(k++);
                     cuda_ctx->cuda_graph->params[i].kernelParams[1] = updated_kernel_arg_ptr;
                     CUDA_CHECK(cudaGraphKernelNodeSetParams(cuda_ctx->cuda_graph->nodes[i], &cuda_ctx->cuda_graph->params[i]));
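
The two CPY-related hunks work together: while scanning the graph the backend now records one pointer per distinct copy kernel returned by ggml_cuda_cpy_fn (different src/dst type pairs select different specialized kernels), and during graph updates a captured node is patched whenever its function pointer matches any recorded one. A standalone sketch of that collect-then-match pattern, with illustrative names that are not part of the backend:

#include <algorithm>
#include <cstdio>
#include <vector>

// Stand-in for ggml_cuda_cpy_fn_ptrs: unique kernel entry points seen so far.
static std::vector<void *> cpy_fn_ptrs;

// Record a kernel pointer once, mirroring the std::find guard in the diff.
static void remember_kernel(void * fn) {
    if (std::find(cpy_fn_ptrs.begin(), cpy_fn_ptrs.end(), fn) == cpy_fn_ptrs.end()) {
        cpy_fn_ptrs.push_back(fn);
    }
}

// Membership test, mirroring the count(...) > 0 check used when patching
// kernelParams[1] of captured graph nodes.
static bool is_cpy_kernel(void * fn) {
    return std::count(cpy_fn_ptrs.begin(), cpy_fn_ptrs.end(), fn) > 0;
}

int main() {
    int a = 0, b = 0; // dummy objects standing in for distinct kernel entry points
    remember_kernel(&a);
    remember_kernel(&a); // duplicate is ignored
    remember_kernel(&b);
    std::printf("%zu unique kernels, &a matches: %d\n", cpy_fn_ptrs.size(), is_cpy_kernel(&a) ? 1 : 0);
    return 0;
}
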
@@ -2691,7 +2748,7 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
         cudaError_t stat = cudaGraphExecUpdate(cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, &result_info);
         if (stat == cudaErrorGraphExecUpdateFailure) {
 #ifndef NDEBUG
-            fprintf(stderr, "%s: CUDA graph update failed\n", __func__);
+            GGML_CUDA_LOG_ERROR("%s: CUDA graph update failed\n", __func__);
 #endif
             // The pre-existing graph exec cannot be updated due to violated constraints
             // so instead clear error and re-instantiate
@@ -2829,7 +2886,9 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
         case GGML_OP_CONT:
         case GGML_OP_DIAG_MASK_INF:
         case GGML_OP_SOFT_MAX:
+            return true;
         case GGML_OP_ROPE:
+            return ggml_is_contiguous(op->src[0]);
         case GGML_OP_IM2COL:
         case GGML_OP_POOL_2D:
         case GGML_OP_SUM_ROWS:
@@ -2948,13 +3007,13 @@ static ggml_guid_t ggml_backend_cuda_guid() {
 
 GGML_CALL ggml_backend_t ggml_backend_cuda_init(int device) {
     if (device < 0 || device >= ggml_backend_cuda_get_device_count()) {
-        fprintf(stderr, "%s: error: invalid device %d\n", __func__, device);
+        GGML_CUDA_LOG_ERROR("%s: invalid device %d\n", __func__, device);
         return nullptr;
     }
 
     ggml_backend_cuda_context * ctx = new ggml_backend_cuda_context(device);
     if (ctx == nullptr) {
-        fprintf(stderr, "%s: error: failed to allocate context\n", __func__);
+        GGML_CUDA_LOG_ERROR("%s: failed to allocate context\n", __func__);
         return nullptr;
     }
 
@@ -2998,8 +3057,8 @@ GGML_CALL bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size
         // clear the error
         cudaGetLastError();
 
-        fprintf(stderr, "%s: warning: failed to register %.2f MiB of pinned memory: %s\n", __func__,
-                size/1024.0/1024.0, cudaGetErrorString(err));
+        GGML_CUDA_LOG_WARN("%s: failed to register %.2f MiB of pinned memory: %s\n", __func__,
+                           size / 1024.0 / 1024.0, cudaGetErrorString(err));
         return false;
     }
     return true;
@@ -38,6 +38,7 @@ GGML_API GGML_CALL void ggml_backend_cuda_get_device_memory(int device, size_t *
 GGML_API GGML_CALL bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size);
 GGML_API GGML_CALL void ggml_backend_cuda_unregister_host_buffer(void * buffer);
 
+GGML_API void ggml_backend_cuda_log_set_callback(ggml_log_callback log_callback, void * user_data);
 #ifdef __cplusplus
 }
 #endif
@@ -17,6 +17,18 @@
 #define MIN(a, b) ((a) < (b) ? (a) : (b))
 #define MAX(a, b) ((a) > (b) ? (a) : (b))
 
+#if defined(_WIN32)
+
+#define m512bh(p) p
+#define m512i(p) p
+
+#else
+
+#define m512bh(p) (__m512bh)(p)
+#define m512i(p) (__m512i)(p)
+
+#endif
+
 /**
  * Converts brain16 to float32.
  *
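
The _WIN32 branch above suggests the (__m512bh)/(__m512i) vector casts used in the #else branch are not portable to MSVC, so call sites wrap their operands in m512bh()/m512i() instead of casting directly. A hedged sketch of the kind of call site these macros serve, assuming the macros are in scope and the translation unit is built with AVX512-BF16 enabled (e.g. -mavx512bf16 on GCC/Clang); the helper name below is illustrative, not from ggml:

#include <immintrin.h>

// Accumulate one step of a bf16 dot product: load 32 bf16 values from each
// input and let m512bh() adapt the raw __m512i loads to the __m512bh operands
// that _mm512_dpbf16_ps expects, with or without a cast depending on compiler.
static inline __m512 bf16_dot_step(__m512 acc, const void * x, const void * y) {
    return _mm512_dpbf16_ps(acc,
                            m512bh(_mm512_loadu_si512(x)),
                            m512bh(_mm512_loadu_si512(y)));
}
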
@@ -132,6 +144,10 @@ extern "C" {
 #endif
 #endif
 
+#if defined(__ARM_FEATURE_SVE)
+#include <arm_sve.h>
+#endif
+
 // 16-bit float
 // on Arm, we use __fp16
 // on x86, we use uint16_t
@@ -443,6 +459,34 @@ static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
 #include <riscv_vector.h>
 #endif
 
+#if defined(__loongarch64)
+#if defined(__loongarch_asx)
+#include <lasxintrin.h>
+#endif
+#if defined(__loongarch_sx)
+#include <lsxintrin.h>
+#endif
+#endif
+
+#if defined(__loongarch_asx)
+
+typedef union {
+    int32_t i;
+    float f;
+} ft_union;
+
+/* float type data load instructions */
+static __m128 __lsx_vreplfr2vr_s(float val) {
+    ft_union fi_tmpval = {.f = val};
+    return (__m128)__lsx_vreplgr2vr_w(fi_tmpval.i);
+}
+
+static __m256 __lasx_xvreplfr2vr_s(float val) {
+    ft_union fi_tmpval = {.f = val};
+    return (__m256)__lasx_xvreplgr2vr_w(fi_tmpval.i);
+}
+#endif
+
 #ifdef __F16C__
 
 #ifdef _MSC_VER
@@ -1597,7 +1597,6 @@ static void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml
                 {
                     GGML_ASSERT(ne00 == ne10);
 
-                    // TODO: assert that dim2 and dim3 are contiguous
                     GGML_ASSERT(ne12 % ne02 == 0);
                     GGML_ASSERT(ne13 % ne03 == 0);
 
@@ -1677,6 +1676,10 @@
                 } break;
             case GGML_OP_ROPE:
                 {
+#pragma message("TODO: implement phi3 frequency factors support")
+#pragma message("      https://github.com/ggerganov/llama.cpp/pull/7225")
+                    GGML_ASSERT(dst->src[2] == nullptr && "phi3 frequency factors not implemented yet");
+
                     GGML_ASSERT(ne10 == ne02);
                     GGML_ASSERT(src0t == dstt);
                     // const int n_past = ((int32_t *) dst->op_params)[0];