llama_cpp 0.15.1 → 0.15.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -4,7 +4,6 @@
  
  #include "ggml-cuda/common.cuh"
  #include "ggml-cuda/acc.cuh"
- #include "ggml-cuda/alibi.cuh"
  #include "ggml-cuda/arange.cuh"
  #include "ggml-cuda/argsort.cuh"
  #include "ggml-cuda/binbcast.cuh"
@@ -44,19 +43,59 @@
  #include <mutex>
  #include <stdint.h>
  #include <stdio.h>
+ #include <stdarg.h>
+ #include <stdlib.h>
  #include <string>
  #include <vector>
  
  static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
  
+ static void ggml_cuda_default_log_callback(enum ggml_log_level level, const char * msg, void * user_data) {
+     GGML_UNUSED(level);
+     GGML_UNUSED(user_data);
+     fprintf(stderr, "%s", msg);
+ }
+
+ ggml_log_callback ggml_cuda_log_callback = ggml_cuda_default_log_callback;
+ void * ggml_cuda_log_user_data = NULL;
+
+ GGML_API void ggml_backend_cuda_log_set_callback(ggml_log_callback log_callback, void * user_data) {
+     ggml_cuda_log_callback = log_callback;
+     ggml_cuda_log_user_data = user_data;
+ }
+
+ #define GGML_CUDA_LOG_INFO(...) ggml_cuda_log(GGML_LOG_LEVEL_INFO, __VA_ARGS__)
+ #define GGML_CUDA_LOG_WARN(...) ggml_cuda_log(GGML_LOG_LEVEL_WARN, __VA_ARGS__)
+ #define GGML_CUDA_LOG_ERROR(...) ggml_cuda_log(GGML_LOG_LEVEL_ERROR, __VA_ARGS__)
+
+ GGML_ATTRIBUTE_FORMAT(2, 3)
+ static void ggml_cuda_log(enum ggml_log_level level, const char * format, ...) {
+     if (ggml_cuda_log_callback != NULL) {
+         va_list args;
+         va_start(args, format);
+         char buffer[128];
+         int len = vsnprintf(buffer, 128, format, args);
+         if (len < 128) {
+             ggml_cuda_log_callback(level, buffer, ggml_cuda_log_user_data);
+         } else {
+             std::vector<char> buffer2(len + 1); // vsnprintf adds a null terminator
+             va_end(args);
+             va_start(args, format);
+             vsnprintf(&buffer2[0], buffer2.size(), format, args);
+             ggml_cuda_log_callback(level, buffer2.data(), ggml_cuda_log_user_data);
+         }
+         va_end(args);
+     }
+ }
+
  [[noreturn]]
  void ggml_cuda_error(const char * stmt, const char * func, const char * file, int line, const char * msg) {
      int id = -1; // in case cudaGetDevice fails
      cudaGetDevice(&id);
  
-     fprintf(stderr, "CUDA error: %s\n", msg);
-     fprintf(stderr, " current device: %d, in function %s at %s:%d\n", id, func, file, line);
-     fprintf(stderr, " %s\n", stmt);
+     GGML_CUDA_LOG_ERROR("CUDA error: %s\n", msg);
+     GGML_CUDA_LOG_ERROR(" current device: %d, in function %s at %s:%d\n", id, func, file, line);
+     GGML_CUDA_LOG_ERROR(" %s\n", stmt);
      // abort with GGML_ASSERT to get a stack trace
      GGML_ASSERT(!"CUDA error");
  }
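
The hunk above replaces the CUDA backend's hard-coded fprintf(stderr, ...) calls with GGML_CUDA_LOG_INFO/WARN/ERROR macros that funnel through ggml_cuda_log and a user-settable callback, ggml_backend_cuda_log_set_callback (also declared in ggml-cuda.h later in this diff). A minimal usage sketch follows; the callback type ggml_log_callback comes from ggml.h, while my_log_sink and cuda.log are made-up names for illustration only.

// Sketch: redirect CUDA backend log messages to an application-chosen sink.
#include <cstdio>

#include "ggml.h"
#include "ggml-cuda.h"

static void my_log_sink(enum ggml_log_level level, const char * msg, void * user_data) {
    std::FILE * f = static_cast<std::FILE *>(user_data);            // user_data is forwarded unchanged
    std::fprintf(f, "[cuda %d] %s", static_cast<int>(level), msg);  // messages already end with '\n'
}

int main() {
    std::FILE * log_file = std::fopen("cuda.log", "w");
    // Install the sink before initializing the backend so init-time INFO lines are captured too.
    ggml_backend_cuda_log_set_callback(my_log_sink, log_file);
    ggml_backend_t backend = ggml_backend_cuda_init(0);  // device 0
    if (backend) {
        ggml_backend_free(backend);
    }
    std::fclose(log_file);
    return 0;
}

The callback receives the already-formatted message plus the opaque user_data pointer; as shown in the hunk, messages longer than the 128-byte stack buffer are reformatted into a heap buffer before being handed to the callback.
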
@@ -92,7 +131,7 @@ static ggml_cuda_device_info ggml_cuda_init() {
  
      cudaError_t err = cudaGetDeviceCount(&info.device_count);
      if (err != cudaSuccess) {
-         fprintf(stderr, "%s: failed to initialize " GGML_CUDA_NAME ": %s\n", __func__, cudaGetErrorString(err));
+         GGML_CUDA_LOG_ERROR("%s: failed to initialize " GGML_CUDA_NAME ": %s\n", __func__, cudaGetErrorString(err));
          return info;
      }
  
@@ -100,16 +139,16 @@ static ggml_cuda_device_info ggml_cuda_init() {
  
      int64_t total_vram = 0;
  #if defined(GGML_CUDA_FORCE_MMQ)
-     fprintf(stderr, "%s: GGML_CUDA_FORCE_MMQ: yes\n", __func__);
+     GGML_CUDA_LOG_INFO("%s: GGML_CUDA_FORCE_MMQ: yes\n", __func__);
  #else
-     fprintf(stderr, "%s: GGML_CUDA_FORCE_MMQ: no\n", __func__);
+     GGML_CUDA_LOG_INFO("%s: GGML_CUDA_FORCE_MMQ: no\n", __func__);
  #endif
  #if defined(CUDA_USE_TENSOR_CORES)
-     fprintf(stderr, "%s: CUDA_USE_TENSOR_CORES: yes\n", __func__);
+     GGML_CUDA_LOG_INFO("%s: CUDA_USE_TENSOR_CORES: yes\n", __func__);
  #else
-     fprintf(stderr, "%s: CUDA_USE_TENSOR_CORES: no\n", __func__);
+     GGML_CUDA_LOG_INFO("%s: CUDA_USE_TENSOR_CORES: no\n", __func__);
  #endif
-     fprintf(stderr, "%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, info.device_count);
+     GGML_CUDA_LOG_INFO("%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, info.device_count);
      for (int id = 0; id < info.device_count; ++id) {
          int device_vmm = 0;
  
@@ -130,7 +169,7 @@ static ggml_cuda_device_info ggml_cuda_init() {
  
          cudaDeviceProp prop;
          CUDA_CHECK(cudaGetDeviceProperties(&prop, id));
-         fprintf(stderr, " Device %d: %s, compute capability %d.%d, VMM: %s\n", id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no");
+         GGML_CUDA_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s\n", id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no");
  
          info.default_tensor_split[id] = total_vram;
          total_vram += prop.totalGlobalMem;
@@ -236,8 +275,8 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
          *actual_size = look_ahead_size;
          pool_size += look_ahead_size;
  #ifdef DEBUG_CUDA_MALLOC
-         fprintf(stderr, "%s[%d]: %d buffers, max_size = %u MB, pool_size = %u MB, requested %u MB\n", __func__, device, nnz,
-             (uint32_t)(max_size/1024/1024), (uint32_t)(pool_size/1024/1024), (uint32_t)(size/1024/1024));
+         GGML_CUDA_LOG_INFO("%s[%d]: %d buffers, max_size = %u MB, pool_size = %u MB, requested %u MB\n", __func__, device, nnz,
+             (uint32_t)(max_size / 1024 / 1024), (uint32_t)(pool_size / 1024 / 1024), (uint32_t)(size / 1024 / 1024));
  #endif
          return ptr;
      }
@@ -251,7 +290,7 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
                  return;
              }
          }
-         fprintf(stderr, "WARNING: cuda buffer pool full, increase MAX_CUDA_BUFFERS\n");
+         GGML_CUDA_LOG_WARN("Cuda buffer pool full, increase MAX_CUDA_BUFFERS\n");
          ggml_cuda_set_device(device);
          CUDA_CHECK(cudaFree(ptr));
          pool_size -= size;
@@ -500,7 +539,9 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffe
      void * dev_ptr;
      cudaError_t err = cudaMalloc(&dev_ptr, size);
      if (err != cudaSuccess) {
-         fprintf(stderr, "%s: allocating %.2f MiB on device %d: cudaMalloc failed: %s\n", __func__, size/1024.0/1024.0, buft_ctx->device, cudaGetErrorString(err));
+         // clear the error
+         cudaGetLastError();
+         GGML_CUDA_LOG_ERROR("%s: allocating %.2f MiB on device %d: cudaMalloc failed: %s\n", __func__, size / 1024.0 / 1024.0, buft_ctx->device, cudaGetErrorString(err));
          return nullptr;
      }
  
@@ -1003,8 +1044,8 @@ static void * ggml_cuda_host_malloc(size_t size) {
      if (err != cudaSuccess) {
          // clear the error
          cudaGetLastError();
-         fprintf(stderr, "%s: warning: failed to allocate %.2f MiB of pinned memory: %s\n", __func__,
-             size/1024.0/1024.0, cudaGetErrorString(err));
+         GGML_CUDA_LOG_WARN("%s: failed to allocate %.2f MiB of pinned memory: %s\n", __func__,
+             size / 1024.0 / 1024.0, cudaGetErrorString(err));
          return nullptr;
      }
  
@@ -2205,6 +2246,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
              case GGML_UNARY_OP_RELU:
                  ggml_cuda_op_relu(ctx, dst);
                  break;
+             case GGML_UNARY_OP_SIGMOID:
+                 ggml_cuda_op_sigmoid(ctx, dst);
+                 break;
              case GGML_UNARY_OP_HARDSIGMOID:
                  ggml_cuda_op_hardsigmoid(ctx, dst);
                  break;
@@ -2244,7 +2288,7 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
              break;
          case GGML_OP_MUL_MAT:
              if (dst->src[0]->ne[3] != dst->src[1]->ne[3]) {
-                 fprintf(stderr, "%s: cannot compute %s: src0->ne[3] = %" PRId64 ", src1->ne[3] = %" PRId64 " - fallback to CPU\n", __func__, dst->name, dst->src[0]->ne[3], dst->src[1]->ne[3]);
+                 GGML_CUDA_LOG_ERROR("%s: cannot compute %s: src0->ne[3] = %" PRId64 ", src1->ne[3] = %" PRId64 " - fallback to CPU\n", __func__, dst->name, dst->src[0]->ne[3], dst->src[1]->ne[3]);
                  return false;
              } else {
                  ggml_cuda_mul_mat(ctx, dst->src[0], dst->src[1], dst);
@@ -2277,9 +2321,6 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
          case GGML_OP_ROPE:
              ggml_cuda_op_rope(ctx, dst);
              break;
-         case GGML_OP_ALIBI:
-             ggml_cuda_op_alibi(ctx, dst);
-             break;
          case GGML_OP_IM2COL:
              ggml_cuda_op_im2col(ctx, dst);
              break;
@@ -2301,7 +2342,7 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
  
      cudaError_t err = cudaGetLastError();
      if (err != cudaSuccess) {
-         fprintf(stderr, "%s: %s failed\n", __func__, ggml_op_desc(dst));
+         GGML_CUDA_LOG_ERROR("%s: %s failed\n", __func__, ggml_op_desc(dst));
          CUDA_CHECK(err);
      }
  
@@ -2477,7 +2518,7 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
          if (ggml_cuda_info().devices[cuda_ctx->device].cc < CC_AMPERE) {
              cuda_ctx->cuda_graph->disable_due_to_gpu_arch = true;
  #ifndef NDEBUG
-             fprintf(stderr, "%s: disabling CUDA graphs due to GPU architecture\n", __func__);
+             GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to GPU architecture\n", __func__);
  #endif
          }
      }
@@ -2524,14 +2565,14 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
          if (node->src[0] && ggml_backend_buffer_is_cuda_split(node->src[0]->buffer)) {
              use_cuda_graph = false; // Split buffers are not supported by CUDA graph capture
  #ifndef NDEBUG
-             fprintf(stderr, "%s: disabling CUDA graphs due to split buffer\n", __func__);
+             GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to split buffer\n", __func__);
  #endif
          }
  
          if (node->op == GGML_OP_MUL_MAT_ID) {
              use_cuda_graph = false; // This node type is not supported by CUDA graph capture
  #ifndef NDEBUG
-             fprintf(stderr, "%s: disabling CUDA graphs due to mul_mat_id\n", __func__);
+             GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to mul_mat_id\n", __func__);
  #endif
          }
  
@@ -2540,7 +2581,7 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
              // Changes in batch size or context size can cause changes to the grid size of some kernels.
              use_cuda_graph = false;
  #ifndef NDEBUG
-             fprintf(stderr, "%s: disabling CUDA graphs due to batch size > 1 [%s] [%ld %ld %ld %ld]\n", __func__, node->name, node->ne[0], node->ne[1], node->ne[2], node->ne[3]);
+             GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to batch size > 1 [%s] [%ld %ld %ld %ld]\n", __func__, node->name, node->ne[0], node->ne[1], node->ne[2], node->ne[3]);
  #endif
          }
  
@@ -2559,7 +2600,7 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
      }
  
      // Disable CUDA graphs (from the next token) if the use-case is demanding too many consecutive graph updates.
-     if (cuda_graph_update_required) {
+     if (use_cuda_graph && cuda_graph_update_required) {
          cuda_ctx->cuda_graph->number_consecutive_updates++;
      } else {
          cuda_ctx->cuda_graph->number_consecutive_updates = 0;
@@ -2568,7 +2609,7 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
      if (cuda_ctx->cuda_graph->number_consecutive_updates >= 4) {
          cuda_ctx->cuda_graph->disable_due_to_too_many_updates = true;
  #ifndef NDEBUG
-         fprintf(stderr, "%s: disabling CUDA graphs due to too many consecutive updates\n", __func__);
+         GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to too many consecutive updates\n", __func__);
  #endif
      }
  }
@@ -2606,7 +2647,7 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
  
          bool ok = ggml_cuda_compute_forward(*cuda_ctx, node);
          if (!ok) {
-             fprintf(stderr, "%s: error: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op));
+             GGML_CUDA_LOG_ERROR("%s: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op));
          }
          GGML_ASSERT(ok);
      }
@@ -2625,7 +2666,7 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
              use_cuda_graph = false;
              cuda_ctx->cuda_graph->disable_due_to_failed_graph_capture = true;
  #ifndef NDEBUG
-             fprintf(stderr, "%s: disabling CUDA graphs due to failed graph capture\n", __func__);
+             GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to failed graph capture\n", __func__);
  #endif
          } else {
              graph_evaluated_or_captured = true; // CUDA graph has been captured
@@ -2692,7 +2733,7 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
          cudaError_t stat = cudaGraphExecUpdate(cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, &result_info);
          if (stat == cudaErrorGraphExecUpdateFailure) {
  #ifndef NDEBUG
-             fprintf(stderr, "%s: CUDA graph update failed\n", __func__);
+             GGML_CUDA_LOG_ERROR("%s: CUDA graph update failed\n", __func__);
  #endif
              // The pre-existing graph exec cannot be updated due to violated constraints
              // so instead clear error and re-instantiate
@@ -2714,12 +2755,14 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
  }
  
  GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, const ggml_tensor * op) {
+     ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *) backend->context;
      switch (op->op) {
          case GGML_OP_UNARY:
              switch (ggml_get_unary_op(op)) {
                  case GGML_UNARY_OP_GELU:
                  case GGML_UNARY_OP_SILU:
                  case GGML_UNARY_OP_RELU:
+                 case GGML_UNARY_OP_SIGMOID:
                  case GGML_UNARY_OP_HARDSIGMOID:
                  case GGML_UNARY_OP_HARDSWISH:
                  case GGML_UNARY_OP_GELU_QUICK:
@@ -2829,7 +2872,6 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
          case GGML_OP_DIAG_MASK_INF:
          case GGML_OP_SOFT_MAX:
          case GGML_OP_ROPE:
-         case GGML_OP_ALIBI:
          case GGML_OP_IM2COL:
          case GGML_OP_POOL_2D:
          case GGML_OP_SUM_ROWS:
@@ -2841,8 +2883,16 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
          case GGML_OP_ARANGE:
          case GGML_OP_TIMESTEP_EMBEDDING:
          case GGML_OP_LEAKY_RELU:
-         case GGML_OP_FLASH_ATTN_EXT:
              return true;
+         case GGML_OP_FLASH_ATTN_EXT:
+ #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+             return op->src[0]->ne[0] == 64 || op->src[0]->ne[0] == 128;
+ #else
+             if (op->src[0]->ne[0] == 64 || op->src[0]->ne[0] == 128) {
+                 return true;
+             }
+             return ggml_cuda_info().devices[cuda_ctx->device].cc >= CC_VOLTA;
+ #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
          default:
              return false;
      }
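
With this change GGML_OP_FLASH_ATTN_EXT is no longer reported as unconditionally supported: HIP builds accept only head sizes 64 and 128, while CUDA builds accept those head sizes everywhere and any other head size only from Volta (CC_VOLTA) upward. The same predicate, restated standalone for illustration only (the helper name and parameters are not from the diff; head_size corresponds to op->src[0]->ne[0], and CC_VOLTA is assumed to be 700 as in ggml-cuda's common header):

// Illustrative restatement of the support check added above; not library code.
static bool flash_attn_ext_supported(long long head_size, bool is_hip_build, int compute_capability) {
    const int kAssumedCcVolta = 700;  // assumption: mirrors the CC_VOLTA constant in ggml-cuda
    if (head_size == 64 || head_size == 128) {
        return true;                  // dedicated kernels cover these head sizes on all targets
    }
    return !is_hip_build && compute_capability >= kAssumedCcVolta;  // other sizes need Volta or newer
}
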
@@ -2940,13 +2990,13 @@ static ggml_guid_t ggml_backend_cuda_guid() {
  
  GGML_CALL ggml_backend_t ggml_backend_cuda_init(int device) {
      if (device < 0 || device >= ggml_backend_cuda_get_device_count()) {
-         fprintf(stderr, "%s: error: invalid device %d\n", __func__, device);
+         GGML_CUDA_LOG_ERROR("%s: invalid device %d\n", __func__, device);
          return nullptr;
      }
  
      ggml_backend_cuda_context * ctx = new ggml_backend_cuda_context(device);
      if (ctx == nullptr) {
-         fprintf(stderr, "%s: error: failed to allocate context\n", __func__);
+         GGML_CUDA_LOG_ERROR("%s: failed to allocate context\n", __func__);
          return nullptr;
      }
  
@@ -2990,8 +3040,8 @@ GGML_CALL bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size
          // clear the error
          cudaGetLastError();
  
-         fprintf(stderr, "%s: warning: failed to register %.2f MiB of pinned memory: %s\n", __func__,
-             size/1024.0/1024.0, cudaGetErrorString(err));
+         GGML_CUDA_LOG_WARN("%s: failed to register %.2f MiB of pinned memory: %s\n", __func__,
+             size / 1024.0 / 1024.0, cudaGetErrorString(err));
          return false;
      }
      return true;
@@ -38,6 +38,7 @@ GGML_API GGML_CALL void ggml_backend_cuda_get_device_memory(int device, size_t *
  GGML_API GGML_CALL bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size);
  GGML_API GGML_CALL void ggml_backend_cuda_unregister_host_buffer(void * buffer);
  
+ GGML_API void ggml_backend_cuda_log_set_callback(ggml_log_callback log_callback, void * user_data);
  #ifdef __cplusplus
  }
  #endif
@@ -17,6 +17,18 @@
  #define MIN(a, b) ((a) < (b) ? (a) : (b))
  #define MAX(a, b) ((a) > (b) ? (a) : (b))
  
+ #if defined(_WIN32)
+
+ #define m512bh(p) p
+ #define m512i(p) p
+
+ #else
+
+ #define m512bh(p) (__m512bh)(p)
+ #define m512i(p) (__m512i)(p)
+
+ #endif
+
  /**
   * Converts brain16 to float32.
   *
@@ -120,9 +132,16 @@ extern "C" {
  #ifndef __F16C__
  #define __F16C__
  #endif
+ #endif
+
+ // __SSE3__ and __SSSE3__ are not defined in MSVC, but SSE3/SSSE3 are present when AVX/AVX2/AVX512 are available
+ #if defined(_MSC_VER) && (defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__))
  #ifndef __SSE3__
  #define __SSE3__
  #endif
+ #ifndef __SSSE3__
+ #define __SSSE3__
+ #endif
  #endif
  
  // 16-bit float
@@ -436,6 +455,34 @@ static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
  #include <riscv_vector.h>
  #endif
  
+ #if defined(__loongarch64)
+ #if defined(__loongarch_asx)
+ #include <lasxintrin.h>
+ #endif
+ #if defined(__loongarch_sx)
+ #include <lsxintrin.h>
+ #endif
+ #endif
+
+ #if defined(__loongarch_asx)
+
+ typedef union {
+     int32_t i;
+     float f;
+ } ft_union;
+
+ /* float type data load instructions */
+ static __m128 __lsx_vreplfr2vr_s(float val) {
+     ft_union fi_tmpval = {.f = val};
+     return (__m128)__lsx_vreplgr2vr_w(fi_tmpval.i);
+ }
+
+ static __m256 __lasx_xvreplfr2vr_s(float val) {
+     ft_union fi_tmpval = {.f = val};
+     return (__m256)__lasx_xvreplgr2vr_w(fi_tmpval.i);
+ }
+ #endif
+
  #ifdef __F16C__
  
  #ifdef _MSC_VER
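
The new LoongArch helpers broadcast a scalar float by reinterpreting its bits as an int32_t through ft_union and handing the result to the integer replicate intrinsics (__lsx_vreplgr2vr_w / __lasx_xvreplgr2vr_w). For readers without LoongArch hardware, a portable sketch of just the bit-reinterpretation step (not from the diff):

// Portable equivalent of the ft_union trick: obtain a float's raw IEEE-754 bit
// pattern as an int32_t so it can be fed to an integer broadcast/replicate intrinsic.
#include <cstdint>
#include <cstring>

static inline std::int32_t float_bits_as_i32(float val) {
    std::int32_t bits;
    std::memcpy(&bits, &val, sizeof(bits));  // well-defined type punning in C and C++
    return bits;
}
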
@@ -1559,12 +1559,18 @@ static void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml
              case GGML_OP_SOFT_MAX:
                  {
                      float scale;
-                     memcpy(&scale, dst->op_params, sizeof(float));
+                     float max_bias;
  
-                     #pragma message("TODO: add ggml_vk_soft_max() F16/F32 src1 and src2 support")
+                     memcpy(&scale, (float *)dst->op_params + 0, sizeof(float));
+                     memcpy(&max_bias, (float *)dst->op_params + 1, sizeof(float));
+
+                     #pragma message("TODO: add ggml_vk_soft_max() F16 src1 support")
                      #pragma message("ref: https://github.com/ggerganov/llama.cpp/pull/5021")
                      GGML_ASSERT(!src1 || src1t == GGML_TYPE_F32);
-                     GGML_ASSERT(src2 == nullptr);
+
+                     #pragma message("TODO: add ALiBi support")
+                     #pragma message("ref: https://github.com/ggerganov/llama.cpp/pull/7192")
+                     GGML_ASSERT(max_bias == 0.0f);
  
                      ggml_vk_soft_max(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, ne01, ne02, ne03, scale);
                  } break;
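
The Kompute soft-max case now reads two floats from dst->op_params, scale at index 0 and max_bias at index 1, and asserts that ALiBi (max_bias != 0) is still unhandled. A sketch of the producer side, assuming the { scale, max_bias } layout used by ggml_soft_max_ext (the helper below is illustrative, not part of ggml):

// Illustrative producer for the layout consumed above: two packed floats in op_params.
#include <cstring>

#include "ggml.h"

static void set_soft_max_params(struct ggml_tensor * dst, float scale, float max_bias) {
    const float params[2] = { scale, max_bias };  // index 0: scale, index 1: max_bias
    std::memcpy(dst->op_params, params, sizeof(params));
}
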
@@ -1671,6 +1677,10 @@ static void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml
                  } break;
              case GGML_OP_ROPE:
                  {
+                     #pragma message("TODO: implement phi3 frequency factors support")
+                     #pragma message(" https://github.com/ggerganov/llama.cpp/pull/7225")
+                     GGML_ASSERT(dst->src[2] == nullptr && "phi3 frequency factors not implemented yet");
+
                      GGML_ASSERT(ne10 == ne02);
                      GGML_ASSERT(src0t == dstt);
                      // const int n_past = ((int32_t *) dst->op_params)[0];