llama_cpp 0.15.1 → 0.15.3

@@ -4,7 +4,6 @@
 
 #include "ggml-cuda/common.cuh"
 #include "ggml-cuda/acc.cuh"
-#include "ggml-cuda/alibi.cuh"
 #include "ggml-cuda/arange.cuh"
 #include "ggml-cuda/argsort.cuh"
 #include "ggml-cuda/binbcast.cuh"
@@ -44,19 +43,59 @@
 #include <mutex>
 #include <stdint.h>
 #include <stdio.h>
+#include <stdarg.h>
+#include <stdlib.h>
 #include <string>
 #include <vector>
 
 static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
 
+static void ggml_cuda_default_log_callback(enum ggml_log_level level, const char * msg, void * user_data) {
+    GGML_UNUSED(level);
+    GGML_UNUSED(user_data);
+    fprintf(stderr, "%s", msg);
+}
+
+ggml_log_callback ggml_cuda_log_callback = ggml_cuda_default_log_callback;
+void * ggml_cuda_log_user_data = NULL;
+
+GGML_API void ggml_backend_cuda_log_set_callback(ggml_log_callback log_callback, void * user_data) {
+    ggml_cuda_log_callback = log_callback;
+    ggml_cuda_log_user_data = user_data;
+}
+
+#define GGML_CUDA_LOG_INFO(...) ggml_cuda_log(GGML_LOG_LEVEL_INFO, __VA_ARGS__)
+#define GGML_CUDA_LOG_WARN(...) ggml_cuda_log(GGML_LOG_LEVEL_WARN, __VA_ARGS__)
+#define GGML_CUDA_LOG_ERROR(...) ggml_cuda_log(GGML_LOG_LEVEL_ERROR, __VA_ARGS__)
+
+GGML_ATTRIBUTE_FORMAT(2, 3)
+static void ggml_cuda_log(enum ggml_log_level level, const char * format, ...) {
+    if (ggml_cuda_log_callback != NULL) {
+        va_list args;
+        va_start(args, format);
+        char buffer[128];
+        int len = vsnprintf(buffer, 128, format, args);
+        if (len < 128) {
+            ggml_cuda_log_callback(level, buffer, ggml_cuda_log_user_data);
+        } else {
+            std::vector<char> buffer2(len + 1); // vsnprintf adds a null terminator
+            va_end(args);
+            va_start(args, format);
+            vsnprintf(&buffer2[0], buffer2.size(), format, args);
+            ggml_cuda_log_callback(level, buffer2.data(), ggml_cuda_log_user_data);
+        }
+        va_end(args);
+    }
+}
+
 [[noreturn]]
 void ggml_cuda_error(const char * stmt, const char * func, const char * file, int line, const char * msg) {
     int id = -1; // in case cudaGetDevice fails
     cudaGetDevice(&id);
 
-    fprintf(stderr, "CUDA error: %s\n", msg);
-    fprintf(stderr, "  current device: %d, in function %s at %s:%d\n", id, func, file, line);
-    fprintf(stderr, "  %s\n", stmt);
+    GGML_CUDA_LOG_ERROR("CUDA error: %s\n", msg);
+    GGML_CUDA_LOG_ERROR("  current device: %d, in function %s at %s:%d\n", id, func, file, line);
+    GGML_CUDA_LOG_ERROR("  %s\n", stmt);
     // abort with GGML_ASSERT to get a stack trace
     GGML_ASSERT(!"CUDA error");
 }
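
The hunk above replaces the CUDA backend's direct fprintf(stderr, ...) calls with a user-replaceable log callback. A minimal sketch of how an application embedding these sources might redirect that output follows; the callback type and GGML_LOG_LEVEL_* values come from ggml.h, while the logger name, tag strings, and filtering are illustrative only:

    #include <cstdio>
    #include "ggml.h"
    #include "ggml-cuda.h"

    // Forward every ggml-cuda message to stderr with a severity tag (illustrative).
    static void my_cuda_logger(enum ggml_log_level level, const char * msg, void * user_data) {
        (void) user_data;
        const char * tag = level == GGML_LOG_LEVEL_ERROR ? "ERROR"
                         : level == GGML_LOG_LEVEL_WARN  ? "WARN"
                         : "INFO";
        fprintf(stderr, "[ggml-cuda %s] %s", tag, msg); // messages typically already end with '\n'
    }

    int main() {
        // Install the callback before initializing the backend so device enumeration is logged through it.
        ggml_backend_cuda_log_set_callback(my_cuda_logger, /*user_data=*/NULL);
        ggml_backend_t backend = ggml_backend_cuda_init(0); // device 0
        if (backend != NULL) {
            ggml_backend_free(backend);
        }
        return 0;
    }

The signature is ggml_log_callback from ggml.h, the same type llama_log_set accepts, so one sink can serve both the library and the CUDA backend.
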
@@ -92,7 +131,7 @@ static ggml_cuda_device_info ggml_cuda_init() {
 
     cudaError_t err = cudaGetDeviceCount(&info.device_count);
     if (err != cudaSuccess) {
-        fprintf(stderr, "%s: failed to initialize " GGML_CUDA_NAME ": %s\n", __func__, cudaGetErrorString(err));
+        GGML_CUDA_LOG_ERROR("%s: failed to initialize " GGML_CUDA_NAME ": %s\n", __func__, cudaGetErrorString(err));
         return info;
     }
 
@@ -100,16 +139,16 @@ static ggml_cuda_device_info ggml_cuda_init() {
 
     int64_t total_vram = 0;
 #if defined(GGML_CUDA_FORCE_MMQ)
-    fprintf(stderr, "%s: GGML_CUDA_FORCE_MMQ: yes\n", __func__);
+    GGML_CUDA_LOG_INFO("%s: GGML_CUDA_FORCE_MMQ: yes\n", __func__);
 #else
-    fprintf(stderr, "%s: GGML_CUDA_FORCE_MMQ: no\n", __func__);
+    GGML_CUDA_LOG_INFO("%s: GGML_CUDA_FORCE_MMQ: no\n", __func__);
 #endif
 #if defined(CUDA_USE_TENSOR_CORES)
-    fprintf(stderr, "%s: CUDA_USE_TENSOR_CORES: yes\n", __func__);
+    GGML_CUDA_LOG_INFO("%s: CUDA_USE_TENSOR_CORES: yes\n", __func__);
 #else
-    fprintf(stderr, "%s: CUDA_USE_TENSOR_CORES: no\n", __func__);
+    GGML_CUDA_LOG_INFO("%s: CUDA_USE_TENSOR_CORES: no\n", __func__);
 #endif
-    fprintf(stderr, "%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, info.device_count);
+    GGML_CUDA_LOG_INFO("%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, info.device_count);
     for (int id = 0; id < info.device_count; ++id) {
         int device_vmm = 0;
 
@@ -130,7 +169,7 @@ static ggml_cuda_device_info ggml_cuda_init() {
 
         cudaDeviceProp prop;
         CUDA_CHECK(cudaGetDeviceProperties(&prop, id));
-        fprintf(stderr, "  Device %d: %s, compute capability %d.%d, VMM: %s\n", id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no");
+        GGML_CUDA_LOG_INFO("  Device %d: %s, compute capability %d.%d, VMM: %s\n", id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no");
 
         info.default_tensor_split[id] = total_vram;
         total_vram += prop.totalGlobalMem;
@@ -236,8 +275,8 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
         *actual_size = look_ahead_size;
         pool_size += look_ahead_size;
 #ifdef DEBUG_CUDA_MALLOC
-        fprintf(stderr, "%s[%d]: %d buffers, max_size = %u MB, pool_size = %u MB, requested %u MB\n", __func__, device, nnz,
-                (uint32_t)(max_size/1024/1024), (uint32_t)(pool_size/1024/1024), (uint32_t)(size/1024/1024));
+        GGML_CUDA_LOG_INFO("%s[%d]: %d buffers, max_size = %u MB, pool_size = %u MB, requested %u MB\n", __func__, device, nnz,
+                           (uint32_t)(max_size / 1024 / 1024), (uint32_t)(pool_size / 1024 / 1024), (uint32_t)(size / 1024 / 1024));
 #endif
         return ptr;
     }
@@ -251,7 +290,7 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
                 return;
             }
         }
-        fprintf(stderr, "WARNING: cuda buffer pool full, increase MAX_CUDA_BUFFERS\n");
+        GGML_CUDA_LOG_WARN("Cuda buffer pool full, increase MAX_CUDA_BUFFERS\n");
         ggml_cuda_set_device(device);
         CUDA_CHECK(cudaFree(ptr));
         pool_size -= size;
@@ -500,7 +539,9 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffe
     void * dev_ptr;
     cudaError_t err = cudaMalloc(&dev_ptr, size);
     if (err != cudaSuccess) {
-        fprintf(stderr, "%s: allocating %.2f MiB on device %d: cudaMalloc failed: %s\n", __func__, size/1024.0/1024.0, buft_ctx->device, cudaGetErrorString(err));
+        // clear the error
+        cudaGetLastError();
+        GGML_CUDA_LOG_ERROR("%s: allocating %.2f MiB on device %d: cudaMalloc failed: %s\n", __func__, size / 1024.0 / 1024.0, buft_ctx->device, cudaGetErrorString(err));
         return nullptr;
     }
 
@@ -1003,8 +1044,8 @@ static void * ggml_cuda_host_malloc(size_t size) {
     if (err != cudaSuccess) {
         // clear the error
        cudaGetLastError();
-        fprintf(stderr, "%s: warning: failed to allocate %.2f MiB of pinned memory: %s\n", __func__,
-                size/1024.0/1024.0, cudaGetErrorString(err));
+        GGML_CUDA_LOG_WARN("%s: failed to allocate %.2f MiB of pinned memory: %s\n", __func__,
+                           size / 1024.0 / 1024.0, cudaGetErrorString(err));
         return nullptr;
     }
 
@@ -2205,6 +2246,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
                 case GGML_UNARY_OP_RELU:
                     ggml_cuda_op_relu(ctx, dst);
                     break;
+                case GGML_UNARY_OP_SIGMOID:
+                    ggml_cuda_op_sigmoid(ctx, dst);
+                    break;
                 case GGML_UNARY_OP_HARDSIGMOID:
                     ggml_cuda_op_hardsigmoid(ctx, dst);
                     break;
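
For reference, the new GGML_UNARY_OP_SIGMOID case dispatches to ggml_cuda_op_sigmoid, which is implemented with the other unary kernels and is not part of this diff. A rough sketch of what such an element-wise sigmoid looks like in CUDA (contiguous F32 data assumed; names and block size are illustrative, not the actual implementation):

    #include <cuda_runtime.h>

    // Illustrative element-wise sigmoid over k contiguous floats.
    static __global__ void sigmoid_f32(const float * x, float * dst, const int k) {
        const int i = blockDim.x * blockIdx.x + threadIdx.x;
        if (i >= k) {
            return;
        }
        dst[i] = 1.0f / (1.0f + expf(-x[i]));
    }

    static void sigmoid_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
        const int block_size = 256; // illustrative block size
        const int num_blocks = (k + block_size - 1) / block_size;
        sigmoid_f32<<<num_blocks, block_size, 0, stream>>>(x, dst, k);
    }
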
@@ -2244,7 +2288,7 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
            break;
         case GGML_OP_MUL_MAT:
             if (dst->src[0]->ne[3] != dst->src[1]->ne[3]) {
-                fprintf(stderr, "%s: cannot compute %s: src0->ne[3] = %" PRId64 ", src1->ne[3] = %" PRId64 " - fallback to CPU\n", __func__, dst->name, dst->src[0]->ne[3], dst->src[1]->ne[3]);
+                GGML_CUDA_LOG_ERROR("%s: cannot compute %s: src0->ne[3] = %" PRId64 ", src1->ne[3] = %" PRId64 " - fallback to CPU\n", __func__, dst->name, dst->src[0]->ne[3], dst->src[1]->ne[3]);
                 return false;
             } else {
                 ggml_cuda_mul_mat(ctx, dst->src[0], dst->src[1], dst);
@@ -2277,9 +2321,6 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
         case GGML_OP_ROPE:
             ggml_cuda_op_rope(ctx, dst);
             break;
-        case GGML_OP_ALIBI:
-            ggml_cuda_op_alibi(ctx, dst);
-            break;
         case GGML_OP_IM2COL:
             ggml_cuda_op_im2col(ctx, dst);
             break;
@@ -2301,7 +2342,7 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
 
     cudaError_t err = cudaGetLastError();
     if (err != cudaSuccess) {
-        fprintf(stderr, "%s: %s failed\n", __func__, ggml_op_desc(dst));
+        GGML_CUDA_LOG_ERROR("%s: %s failed\n", __func__, ggml_op_desc(dst));
         CUDA_CHECK(err);
     }
 
@@ -2477,7 +2518,7 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
         if (ggml_cuda_info().devices[cuda_ctx->device].cc < CC_AMPERE) {
             cuda_ctx->cuda_graph->disable_due_to_gpu_arch = true;
 #ifndef NDEBUG
-            fprintf(stderr, "%s: disabling CUDA graphs due to GPU architecture\n", __func__);
+            GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to GPU architecture\n", __func__);
 #endif
         }
     }
@@ -2524,14 +2565,14 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
             if (node->src[0] && ggml_backend_buffer_is_cuda_split(node->src[0]->buffer)) {
                 use_cuda_graph = false; // Split buffers are not supported by CUDA graph capture
 #ifndef NDEBUG
-                fprintf(stderr, "%s: disabling CUDA graphs due to split buffer\n", __func__);
+                GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to split buffer\n", __func__);
 #endif
             }
 
             if (node->op == GGML_OP_MUL_MAT_ID) {
                 use_cuda_graph = false; // This node type is not supported by CUDA graph capture
 #ifndef NDEBUG
-                fprintf(stderr, "%s: disabling CUDA graphs due to mul_mat_id\n", __func__);
+                GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to mul_mat_id\n", __func__);
 #endif
             }
 
@@ -2540,7 +2581,7 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
                 // Changes in batch size or context size can cause changes to the grid size of some kernels.
                 use_cuda_graph = false;
 #ifndef NDEBUG
-                fprintf(stderr, "%s: disabling CUDA graphs due to batch size > 1 [%s] [%ld %ld %ld %ld]\n", __func__, node->name, node->ne[0], node->ne[1], node->ne[2], node->ne[3]);
+                GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to batch size > 1 [%s] [%ld %ld %ld %ld]\n", __func__, node->name, node->ne[0], node->ne[1], node->ne[2], node->ne[3]);
 #endif
             }
 
@@ -2559,7 +2600,7 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
         }
 
         // Disable CUDA graphs (from the next token) if the use-case is demanding too many consecutive graph updates.
-        if (cuda_graph_update_required) {
+        if (use_cuda_graph && cuda_graph_update_required) {
             cuda_ctx->cuda_graph->number_consecutive_updates++;
         } else {
             cuda_ctx->cuda_graph->number_consecutive_updates = 0;
@@ -2568,7 +2609,7 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
         if (cuda_ctx->cuda_graph->number_consecutive_updates >= 4) {
             cuda_ctx->cuda_graph->disable_due_to_too_many_updates = true;
 #ifndef NDEBUG
-            fprintf(stderr, "%s: disabling CUDA graphs due to too many consecutive updates\n", __func__);
+            GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to too many consecutive updates\n", __func__);
 #endif
         }
     }
@@ -2606,7 +2647,7 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
 
             bool ok = ggml_cuda_compute_forward(*cuda_ctx, node);
             if (!ok) {
-                fprintf(stderr, "%s: error: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op));
+                GGML_CUDA_LOG_ERROR("%s: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op));
             }
             GGML_ASSERT(ok);
         }
@@ -2625,7 +2666,7 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
                 use_cuda_graph = false;
                 cuda_ctx->cuda_graph->disable_due_to_failed_graph_capture = true;
 #ifndef NDEBUG
-                fprintf(stderr, "%s: disabling CUDA graphs due to failed graph capture\n", __func__);
+                GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to failed graph capture\n", __func__);
 #endif
             } else {
                 graph_evaluated_or_captured = true; // CUDA graph has been captured
@@ -2692,7 +2733,7 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
         cudaError_t stat = cudaGraphExecUpdate(cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, &result_info);
         if (stat == cudaErrorGraphExecUpdateFailure) {
 #ifndef NDEBUG
-            fprintf(stderr, "%s: CUDA graph update failed\n", __func__);
+            GGML_CUDA_LOG_ERROR("%s: CUDA graph update failed\n", __func__);
 #endif
             // The pre-existing graph exec cannot be updated due to violated constraints
             // so instead clear error and re-instantiate
@@ -2714,12 +2755,14 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
 }
 
 GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, const ggml_tensor * op) {
+    ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *) backend->context;
     switch (op->op) {
         case GGML_OP_UNARY:
             switch (ggml_get_unary_op(op)) {
                 case GGML_UNARY_OP_GELU:
                 case GGML_UNARY_OP_SILU:
                 case GGML_UNARY_OP_RELU:
+                case GGML_UNARY_OP_SIGMOID:
                 case GGML_UNARY_OP_HARDSIGMOID:
                 case GGML_UNARY_OP_HARDSWISH:
                 case GGML_UNARY_OP_GELU_QUICK:
@@ -2829,7 +2872,6 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
         case GGML_OP_DIAG_MASK_INF:
         case GGML_OP_SOFT_MAX:
         case GGML_OP_ROPE:
-        case GGML_OP_ALIBI:
         case GGML_OP_IM2COL:
         case GGML_OP_POOL_2D:
         case GGML_OP_SUM_ROWS:
@@ -2841,8 +2883,16 @@
         case GGML_OP_ARANGE:
         case GGML_OP_TIMESTEP_EMBEDDING:
         case GGML_OP_LEAKY_RELU:
-        case GGML_OP_FLASH_ATTN_EXT:
             return true;
+        case GGML_OP_FLASH_ATTN_EXT:
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+            return op->src[0]->ne[0] == 64 || op->src[0]->ne[0] == 128;
+#else
+            if (op->src[0]->ne[0] == 64 || op->src[0]->ne[0] == 128) {
+                return true;
+            }
+            return ggml_cuda_info().devices[cuda_ctx->device].cc >= CC_VOLTA;
+#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
         default:
             return false;
     }
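
The GGML_OP_FLASH_ATTN_EXT entry is no longer an unconditional return true: support now depends on the head size (op->src[0]->ne[0]) and, outside HIP/AMD builds, on the device's compute capability. The same rule restated as a standalone helper for clarity; this is a paraphrase of the diff above with illustrative names, not an API:

    #include <cstdint>

    // head_dim stands for op->src[0]->ne[0], cc for ggml_cuda_info().devices[cuda_ctx->device].cc,
    // is_hip_amd for a build with GGML_USE_HIPBLAS and __HIP_PLATFORM_AMD__ defined.
    static bool flash_attn_ext_supported(int64_t head_dim, int cc, bool is_hip_amd) {
        if (head_dim == 64 || head_dim == 128) {
            return true;  // head sizes 64 and 128 are always accepted
        }
        if (is_hip_amd) {
            return false; // HIP/AMD builds accept only head sizes 64 and 128
        }
        return cc >= 700; // CC_VOLTA: other head sizes need compute capability 7.0 or newer
    }
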
@@ -2940,13 +2990,13 @@ static ggml_guid_t ggml_backend_cuda_guid() {
 
 GGML_CALL ggml_backend_t ggml_backend_cuda_init(int device) {
     if (device < 0 || device >= ggml_backend_cuda_get_device_count()) {
-        fprintf(stderr, "%s: error: invalid device %d\n", __func__, device);
+        GGML_CUDA_LOG_ERROR("%s: invalid device %d\n", __func__, device);
         return nullptr;
     }
 
     ggml_backend_cuda_context * ctx = new ggml_backend_cuda_context(device);
     if (ctx == nullptr) {
-        fprintf(stderr, "%s: error: failed to allocate context\n", __func__);
+        GGML_CUDA_LOG_ERROR("%s: failed to allocate context\n", __func__);
         return nullptr;
     }
 
@@ -2990,8 +3040,8 @@ GGML_CALL bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size
         // clear the error
         cudaGetLastError();
 
-        fprintf(stderr, "%s: warning: failed to register %.2f MiB of pinned memory: %s\n", __func__,
-                size/1024.0/1024.0, cudaGetErrorString(err));
+        GGML_CUDA_LOG_WARN("%s: failed to register %.2f MiB of pinned memory: %s\n", __func__,
+                           size / 1024.0 / 1024.0, cudaGetErrorString(err));
         return false;
     }
     return true;
@@ -38,6 +38,7 @@ GGML_API GGML_CALL void ggml_backend_cuda_get_device_memory(int device, size_t *
 GGML_API GGML_CALL bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size);
 GGML_API GGML_CALL void ggml_backend_cuda_unregister_host_buffer(void * buffer);
 
+GGML_API void ggml_backend_cuda_log_set_callback(ggml_log_callback log_callback, void * user_data);
 #ifdef __cplusplus
 }
 #endif
@@ -17,6 +17,18 @@
 #define MIN(a, b) ((a) < (b) ? (a) : (b))
 #define MAX(a, b) ((a) > (b) ? (a) : (b))
 
+#if defined(_WIN32)
+
+#define m512bh(p) p
+#define m512i(p) p
+
+#else
+
+#define m512bh(p) (__m512bh)(p)
+#define m512i(p) (__m512i)(p)
+
+#endif
+
 /**
  * Converts brain16 to float32.
  *
@@ -120,9 +132,16 @@ extern "C" {
 #ifndef __F16C__
 #define __F16C__
 #endif
+#endif
+
+// __SSE3__ and __SSSE3__ are not defined in MSVC, but SSE3/SSSE3 are present when AVX/AVX2/AVX512 are available
+#if defined(_MSC_VER) && (defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__))
 #ifndef __SSE3__
 #define __SSE3__
 #endif
+#ifndef __SSSE3__
+#define __SSSE3__
+#endif
 #endif
 
 // 16-bit float
@@ -436,6 +455,34 @@ static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
 #include <riscv_vector.h>
 #endif
 
+#if defined(__loongarch64)
+#if defined(__loongarch_asx)
+#include <lasxintrin.h>
+#endif
+#if defined(__loongarch_sx)
+#include <lsxintrin.h>
+#endif
+#endif
+
+#if defined(__loongarch_asx)
+
+typedef union {
+    int32_t i;
+    float f;
+} ft_union;
+
+/* float type data load instructions */
+static __m128 __lsx_vreplfr2vr_s(float val) {
+    ft_union fi_tmpval = {.f = val};
+    return (__m128)__lsx_vreplgr2vr_w(fi_tmpval.i);
+}
+
+static __m256 __lasx_xvreplfr2vr_s(float val) {
+    ft_union fi_tmpval = {.f = val};
+    return (__m256)__lasx_xvreplgr2vr_w(fi_tmpval.i);
+}
+#endif
+
 #ifdef __F16C__
 
 #ifdef _MSC_VER
@@ -1559,12 +1559,18 @@ static void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml
         case GGML_OP_SOFT_MAX:
             {
                 float scale;
-                memcpy(&scale, dst->op_params, sizeof(float));
+                float max_bias;
 
-#pragma message("TODO: add ggml_vk_soft_max() F16/F32 src1 and src2 support")
+                memcpy(&scale, (float *)dst->op_params + 0, sizeof(float));
+                memcpy(&max_bias, (float *)dst->op_params + 1, sizeof(float));
+
+#pragma message("TODO: add ggml_vk_soft_max() F16 src1 support")
 #pragma message("ref: https://github.com/ggerganov/llama.cpp/pull/5021")
                 GGML_ASSERT(!src1 || src1t == GGML_TYPE_F32);
-                GGML_ASSERT(src2 == nullptr);
+
+#pragma message("TODO: add ALiBi support")
+#pragma message("ref: https://github.com/ggerganov/llama.cpp/pull/7192")
+                GGML_ASSERT(max_bias == 0.0f);
 
                 ggml_vk_soft_max(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, ne01, ne02, ne03, scale);
             } break;
@@ -1671,6 +1677,10 @@ static void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml
             } break;
         case GGML_OP_ROPE:
             {
+#pragma message("TODO: implement phi3 frequency factors support")
+#pragma message("      https://github.com/ggerganov/llama.cpp/pull/7225")
+                GGML_ASSERT(dst->src[2] == nullptr && "phi3 frequency factors not implemented yet");
+
                 GGML_ASSERT(ne10 == ne02);
                 GGML_ASSERT(src0t == dstt);
                 // const int n_past = ((int32_t *) dst->op_params)[0];