llama_cpp 0.14.7 → 0.15.1

This diff shows the content of publicly released package versions as published to their respective public registries, and is provided for informational purposes only.
@@ -14,6 +14,7 @@
 #include "ggml-cuda/cpy.cuh"
 #include "ggml-cuda/diagmask.cuh"
 #include "ggml-cuda/dmmv.cuh"
+#include "ggml-cuda/fattn.cuh"
 #include "ggml-cuda/getrows.cuh"
 #include "ggml-cuda/im2col.cuh"
 #include "ggml-cuda/mmq.cuh"
@@ -112,7 +113,7 @@ static ggml_cuda_device_info ggml_cuda_init() {
     for (int id = 0; id < info.device_count; ++id) {
         int device_vmm = 0;
 
-#if !defined(GGML_USE_HIPBLAS)
+#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM)
         CUdevice device;
         CU_CHECK(cuDeviceGet(&device, id));
         CU_CHECK(cuDeviceGetAttribute(&device_vmm, CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED, device));
@@ -140,6 +141,7 @@ static ggml_cuda_device_info ggml_cuda_init() {
         info.devices[id].cc = 100*prop.major + 10*prop.minor;
 #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
         info.devices[id].smpb = prop.sharedMemPerBlock;
+        info.devices[id].nsm  = prop.multiProcessorCount;
     }
 
     for (int id = 0; id < info.device_count; ++id) {
@@ -257,7 +259,7 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
 };
 
 // pool with virtual memory
-#if !defined(GGML_USE_HIPBLAS)
+#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM)
 struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
     static const size_t CUDA_POOL_VMM_MAX_SIZE = 1ull << 35; // 32 GB
 
@@ -354,7 +356,7 @@ struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
 #endif // !defined(GGML_USE_HIPBLAS)
 
 std::unique_ptr<ggml_cuda_pool> ggml_backend_cuda_context::new_pool_for_device(int device) {
-#if !defined(GGML_USE_HIPBLAS)
+#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM)
     if (ggml_cuda_info().devices[device].vmm) {
         return std::unique_ptr<ggml_cuda_pool>(new ggml_cuda_pool_vmm(device));
    }
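The three hunks above extend every `#if !defined(GGML_USE_HIPBLAS)` guard around the virtual-memory-management (VMM) pool with a new GGML_CUDA_NO_VMM opt-out, so a build that defines the macro always falls back to the legacy pool even when the driver reports VMM support. The guarded code probes that support through the CUDA driver API; a minimal, self-contained probe in the same spirit could look like the sketch below (illustrative only, not gem code; link against the driver library, e.g. `nvcc vmm_probe.cu -lcuda`).

// vmm_probe.cu -- hedged sketch: report per-device VMM support,
// mirroring the attribute query made by ggml_cuda_init() above.
#include <cstdio>
#include <cuda.h>

int main() {
    if (cuInit(0) != CUDA_SUCCESS) {
        fprintf(stderr, "cuInit failed\n");
        return 1;
    }
    int device_count = 0;
    cuDeviceGetCount(&device_count);
    for (int id = 0; id < device_count; ++id) {
        CUdevice device;
        int device_vmm = 0;
        cuDeviceGet(&device, id);
        cuDeviceGetAttribute(&device_vmm, CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED, device);
        printf("device %d: VMM %s\n", id, device_vmm ? "supported" : "not supported");
    }
    return 0;
}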
@@ -1645,7 +1647,7 @@ static void ggml_cuda_op_mul_mat(
     }
 }
 
-static void ggml_cuda_mul_mat_vec_p021(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){
+static void ggml_cuda_mul_mat_vec_p021(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     GGML_ASSERT(ggml_is_permuted(src0) && ggml_is_permuted(src1));
     GGML_ASSERT(ggml_backend_buffer_is_cuda(src0->buffer));
     GGML_ASSERT(src0->nb[0] <= src0->nb[1] && src0->nb[2] <= src0->nb[3]); // 0213 permutation
@@ -1668,7 +1670,7 @@ static void ggml_cuda_mul_mat_vec_p021(ggml_backend_cuda_context & ctx, const gg
     ggml_mul_mat_p021_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, ne02, ne12, main_stream);
 }
 
-static void ggml_cuda_mul_mat_vec_nc(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){
+static void ggml_cuda_mul_mat_vec_nc(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     GGML_ASSERT(!ggml_is_transposed(src0));
     GGML_ASSERT(!ggml_is_transposed(src1));
     GGML_ASSERT(!ggml_is_permuted(src0));
@@ -2290,6 +2292,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
         case GGML_OP_ARGSORT:
             ggml_cuda_op_argsort(ctx, dst);
             break;
+        case GGML_OP_FLASH_ATTN_EXT:
+            ggml_cuda_flash_attn_ext(ctx, dst);
+            break;
         default:
             return false;
     }
@@ -2405,32 +2410,304 @@ GGML_CALL static void ggml_backend_cuda_synchronize(ggml_backend_t backend) {
     GGML_UNUSED(backend);
 }
 
+static void set_ggml_graph_node_properties(ggml_tensor * node, ggml_graph_node_properties * graph_node_properties) {
+    graph_node_properties->node_address = node->data;
+    graph_node_properties->node_op = node->op;
+    for (int i = 0; i < GGML_MAX_DIMS; i++) {
+        graph_node_properties->ne[i] = node->ne[i];
+        graph_node_properties->nb[i] = node->nb[i];
+    }
+    for (int i = 0; i < GGML_MAX_SRC; i++) {
+        graph_node_properties->src_address[i] = node->src[i] ? node->src[i]->data : nullptr;
+    }
+}
+
+static bool ggml_graph_node_has_matching_properties(ggml_tensor * node, ggml_graph_node_properties * graph_node_properties) {
+    if (node->data != graph_node_properties->node_address &&
+          node->op != GGML_OP_CPY &&
+          node->op != GGML_OP_VIEW) {
+        return false;
+    }
+
+    if (node->op != graph_node_properties->node_op) {
+        return false;
+    }
+
+    for (int i = 0; i < GGML_MAX_DIMS; i++) {
+        if (node->ne[i] != graph_node_properties->ne[i]) {
+            return false;
+        }
+        if (node->nb[i] != graph_node_properties->nb[i]) {
+            return false;
+        }
+    }
+
+    for (int i = 0; i < GGML_MAX_SRC; i++) {
+        if (node->src[i] &&
+            node->src[i]->data != graph_node_properties->src_address[i] &&
+            node->op != GGML_OP_CPY &&
+            node->op != GGML_OP_VIEW
+        ) {
+            return false;
+        }
+    }
+    return true;
+}
+
 GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
     ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
 
     ggml_cuda_set_device(cuda_ctx->device);
 
-    for (int i = 0; i < cgraph->n_nodes; i++) {
-        ggml_tensor * node = cgraph->nodes[i];
+#ifdef USE_CUDA_GRAPH
+    static const bool disable_cuda_graphs_due_to_env = (getenv("GGML_CUDA_DISABLE_GRAPHS") != nullptr);
 
-        if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
-            continue;
+    // Objects required for CUDA Graph
+    if (cuda_ctx->cuda_graph == nullptr) {
+        cuda_ctx->cuda_graph.reset(new ggml_cuda_graph());
+    }
+
+    bool use_cuda_graph = true;
+    bool cuda_graph_update_required = false;
+    // pointer to CUDA cpy kernel, which is required to identify
+    // kernel parameters which need updated in the graph for each token
+    void * ggml_cuda_cpy_fn_ptr = nullptr;
+
+    if (cuda_ctx->cuda_graph->graph == nullptr) {
+        if (ggml_cuda_info().devices[cuda_ctx->device].cc < CC_AMPERE) {
+            cuda_ctx->cuda_graph->disable_due_to_gpu_arch = true;
+#ifndef NDEBUG
+            fprintf(stderr, "%s: disabling CUDA graphs due to GPU architecture\n", __func__);
+#endif
+        }
+    }
+
+    // Disable CUDA graphs in presence of env var, old GPU, use-case which is changing too rapidly,
+    // or previous graph capture failure.
+    // Also disable for multi-gpu for now. TO DO investigate
+    if (disable_cuda_graphs_due_to_env
+        || cuda_ctx->cuda_graph->disable_due_to_gpu_arch
+        || cuda_ctx->cuda_graph->disable_due_to_too_many_updates
+        || cuda_ctx->cuda_graph->disable_due_to_failed_graph_capture) {
+        use_cuda_graph = false;
+    }
+
+    if (use_cuda_graph) {
+        if (cuda_ctx->cuda_graph->instance == nullptr) {
+            cuda_graph_update_required = true;
+        }
+
+        // Check if the graph size has changed
+        if (cuda_ctx->cuda_graph->ggml_graph_properties.size() != (size_t)cgraph->n_nodes) {
+            cuda_graph_update_required = true;
+            cuda_ctx->cuda_graph->ggml_graph_properties.resize(cgraph->n_nodes);
+        }
+
+        // Loop over nodes in GGML graph to determine if CUDA graph update is required
+        // and store properties to allow this comparison for the next token
+        for (int i = 0; i < cgraph->n_nodes; i++) {
+            bool has_matching_properties = true;
+            if (!cuda_graph_update_required) {
+                has_matching_properties = ggml_graph_node_has_matching_properties(cgraph->nodes[i], &cuda_ctx->cuda_graph->ggml_graph_properties[i]);
+            }
+            if (!has_matching_properties) {
+                cuda_graph_update_required = true;
+            }
+            set_ggml_graph_node_properties(cgraph->nodes[i], &cuda_ctx->cuda_graph->ggml_graph_properties[i]);
+        }
+
+        // Loop over nodes in GGML graph to obtain info needed for CUDA graph
+        cuda_ctx->cuda_graph->updated_kernel_arg.clear();
+        for (int i = 0; i < cgraph->n_nodes; i++) {
+            ggml_tensor * node = cgraph->nodes[i];
+
+            if (node->src[0] && ggml_backend_buffer_is_cuda_split(node->src[0]->buffer)) {
+                use_cuda_graph = false; // Split buffers are not supported by CUDA graph capture
+#ifndef NDEBUG
+                fprintf(stderr, "%s: disabling CUDA graphs due to split buffer\n", __func__);
+#endif
+            }
+
+            if (node->op == GGML_OP_MUL_MAT_ID) {
+                use_cuda_graph = false; // This node type is not supported by CUDA graph capture
+#ifndef NDEBUG
+                fprintf(stderr, "%s: disabling CUDA graphs due to mul_mat_id\n", __func__);
+#endif
+            }
+
+            if (node->op == GGML_OP_ADD && node->src[1] && node->src[1]->ne[1] > 1) {
+                // disable CUDA graphs for batch size > 1 for now.
+                // Changes in batch size or context size can cause changes to the grid size of some kernels.
+                use_cuda_graph = false;
+#ifndef NDEBUG
+                fprintf(stderr, "%s: disabling CUDA graphs due to batch size > 1 [%s] [%ld %ld %ld %ld]\n", __func__, node->name, node->ne[0], node->ne[1], node->ne[2], node->ne[3]);
+#endif
+            }
+
+            if (node->op == GGML_OP_CPY) {
+                // store the copy op parameter which changes with each token.
+                cuda_ctx->cuda_graph->updated_kernel_arg.push_back((char **) &(node->src[1]->data));
+                if (ggml_cuda_cpy_fn_ptr == nullptr) {
+                    // store a pointer to the copy op CUDA kernel to identify it later
+                    ggml_cuda_cpy_fn_ptr = ggml_cuda_cpy_fn(node->src[0], node->src[1]);
+                }
+            }
+
+            if (!use_cuda_graph) {
+                break;
+            }
         }
 
+        // Disable CUDA graphs (from the next token) if the use-case is demanding too many consecutive graph updates.
+        if (cuda_graph_update_required) {
+            cuda_ctx->cuda_graph->number_consecutive_updates++;
+        } else {
+            cuda_ctx->cuda_graph->number_consecutive_updates = 0;
+        }
+
+        if (cuda_ctx->cuda_graph->number_consecutive_updates >= 4) {
+            cuda_ctx->cuda_graph->disable_due_to_too_many_updates = true;
 #ifndef NDEBUG
-        assert(node->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device));
-        for (int j = 0; j < GGML_MAX_SRC; j++) {
-            if (node->src[j] != nullptr) {
-                assert(node->src[j]->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) || ggml_backend_buffer_is_cuda_split(node->src[j]->buffer));
+            fprintf(stderr, "%s: disabling CUDA graphs due to too many consecutive updates\n", __func__);
+#endif
+        }
+    }
+
+    if (use_cuda_graph && cuda_graph_update_required) { // Start CUDA graph capture
+        CUDA_CHECK(cudaStreamBeginCapture(cuda_ctx->stream(), cudaStreamCaptureModeRelaxed));
+    }
+
+#else
+    bool use_cuda_graph = false;
+    bool cuda_graph_update_required = false;
+#endif // USE_CUDA_GRAPH
+
+    bool graph_evaluated_or_captured = false;
+
+    while (!graph_evaluated_or_captured) {
+        // Only perform the graph execution if CUDA graphs are not enabled, or we are capturing the graph.
+        // With the use of CUDA graphs, the execution will be performed by the graph launch.
+        if (!use_cuda_graph || cuda_graph_update_required) {
+            for (int i = 0; i < cgraph->n_nodes; i++) {
+                ggml_tensor * node = cgraph->nodes[i];
+
+                if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
+                    continue;
+                }
+
+#ifndef NDEBUG
+                assert(node->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device));
+                for (int j = 0; j < GGML_MAX_SRC; j++) {
+                    if (node->src[j] != nullptr) {
+                        assert(node->src[j]->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) || ggml_backend_buffer_is_cuda_split(node->src[j]->buffer));
+                    }
+                }
+#endif
+
+                bool ok = ggml_cuda_compute_forward(*cuda_ctx, node);
+                if (!ok) {
+                    fprintf(stderr, "%s: error: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op));
+                }
+                GGML_ASSERT(ok);
             }
         }
+
+#ifdef USE_CUDA_GRAPH
+        if (use_cuda_graph && cuda_graph_update_required) { // End CUDA graph capture
+            if (cuda_ctx->cuda_graph->graph != nullptr) {
+                CUDA_CHECK(cudaGraphDestroy(cuda_ctx->cuda_graph->graph));
+                cuda_ctx->cuda_graph->graph = nullptr;
+            }
+            CUDA_CHECK(cudaStreamEndCapture(cuda_ctx->stream(), &cuda_ctx->cuda_graph->graph));
+
+#if 0
+            if (disable_cuda_graphs_due_to_failed_capture) {
+                use_cuda_graph = false;
+                cuda_ctx->cuda_graph->disable_due_to_failed_graph_capture = true;
+#ifndef NDEBUG
+                fprintf(stderr, "%s: disabling CUDA graphs due to failed graph capture\n", __func__);
 #endif
+            } else {
+                graph_evaluated_or_captured = true; // CUDA graph has been captured
+            }
+#endif
+            graph_evaluated_or_captured = true; // CUDA graph has been captured
+        } else {
+            graph_evaluated_or_captured = true; // ggml graph has been directly evaluated
+        }
+    }
+
+    if (use_cuda_graph) {
+        if (cuda_ctx->cuda_graph->instance == nullptr) { // Create executable graph from captured graph.
+            CUDA_CHECK(cudaGraphInstantiate(&cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, NULL, NULL, 0));
+        }
+
+        // Perform update to graph (if required for this token), and change copy parameter (required for every token)
+
+        if (cuda_graph_update_required) {
+            // Extract nodes from graph
+            if (cuda_ctx->cuda_graph->num_nodes == 0) {
+                // First call with null argument gets number of nodes in graph
+                CUDA_CHECK(cudaGraphGetNodes(cuda_ctx->cuda_graph->graph, nullptr, &cuda_ctx->cuda_graph->num_nodes));
+            }
+            // Subsequent call with non-null argument gets nodes
+            cuda_ctx->cuda_graph->nodes.resize(cuda_ctx->cuda_graph->num_nodes);
+            cuda_ctx->cuda_graph->params.resize(cuda_ctx->cuda_graph->num_nodes);
+            if (cuda_ctx->cuda_graph->num_nodes > 0) {
+                CUDA_CHECK(cudaGraphGetNodes(cuda_ctx->cuda_graph->graph, cuda_ctx->cuda_graph->nodes.data(), &cuda_ctx->cuda_graph->num_nodes));
+
+                // Loop over nodes, and extract kernel parameters from each node
+                for (size_t i = 0; i < cuda_ctx->cuda_graph->num_nodes; i++) {
+                    cudaGraphNodeType node_type;
+                    CUDA_CHECK(cudaGraphNodeGetType(cuda_ctx->cuda_graph->nodes[i], &node_type));
+                    if (node_type == cudaGraphNodeTypeKernel) {
+                        cudaError_t stat = cudaGraphKernelNodeGetParams(cuda_ctx->cuda_graph->nodes[i], &cuda_ctx->cuda_graph->params[i]); // Get params using runtime
+                        if (stat == cudaErrorInvalidDeviceFunction) {
+                            // Fails due to incorrect handling by CUDA runtime of CUDA BLAS node.
+                            // We don't need to update blas nodes, so clear error and move on.
+                            cudaGetLastError();
+                        } else {
+                            GGML_ASSERT(stat == cudaSuccess);
+                        }
+                    }
+                }
+            }
+        }
+
+        // One of the arguments to the copy kernel is updated for each token, hence we need to
+        // replace that argument with the updated value in the CUDA graph
+        if (!cuda_graph_update_required) { // on update steps, the live parameters will already be captured
+            int k = 0;
+            for (size_t i = 0; i < cuda_ctx->cuda_graph->num_nodes; i++) {
+                if (cuda_ctx->cuda_graph->params[i].func == ggml_cuda_cpy_fn_ptr) {
+                    char ** updated_kernel_arg_ptr = cuda_ctx->cuda_graph->updated_kernel_arg.at(k++);
+                    cuda_ctx->cuda_graph->params[i].kernelParams[1] = updated_kernel_arg_ptr;
+                    CUDA_CHECK(cudaGraphKernelNodeSetParams(cuda_ctx->cuda_graph->nodes[i], &cuda_ctx->cuda_graph->params[i]));
+                }
+            }
+        }
 
-        bool ok = ggml_cuda_compute_forward(*cuda_ctx, node);
-        if (!ok) {
-            fprintf(stderr, "%s: error: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op));
+        // Update graph executable
+        cudaGraphExecUpdateResultInfo result_info;
+        cudaError_t stat = cudaGraphExecUpdate(cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, &result_info);
+        if (stat == cudaErrorGraphExecUpdateFailure) {
+#ifndef NDEBUG
+            fprintf(stderr, "%s: CUDA graph update failed\n", __func__);
+#endif
+            // The pre-existing graph exec cannot be updated due to violated constraints
+            // so instead clear error and re-instantiate
+            cudaGetLastError();
+            CUDA_CHECK(cudaGraphExecDestroy(cuda_ctx->cuda_graph->instance));
+            cuda_ctx->cuda_graph->instance = nullptr;
+            CUDA_CHECK(cudaGraphInstantiate(&cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, NULL, NULL, 0));
+        } else {
+            GGML_ASSERT(stat == cudaSuccess);
         }
-        GGML_ASSERT(ok);
+        // Launch graph
+        CUDA_CHECK(cudaGraphLaunch(cuda_ctx->cuda_graph->instance, cuda_ctx->stream()));
+#else
+        graph_evaluated_or_captured = true;
+#endif // USE_CUDA_GRAPH
     }
 
     return GGML_STATUS_SUCCESS;
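The large hunk above is the new CUDA graph path: for each token the backend compares every ggml node against the properties stored on the previous step, re-captures the stream into a cudaGraph_t only when something changed, otherwise patches the copy kernel's pointer argument in the already-instantiated graph, and replays everything with a single cudaGraphLaunch. The runtime pattern it relies on, stripped of all ggml bookkeeping, is sketched below (a hedged illustration with a placeholder kernel, not the backend's real code).

// cuda_graph_sketch.cu -- minimal stream-capture lifecycle, illustrative only.
// dummy_kernel and N are placeholders and do not exist in the gem.
#include <cstdio>
#include <cuda_runtime.h>

#define CHECK(call) do { cudaError_t e = (call); if (e != cudaSuccess) { \
    fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(e)); return 1; } } while (0)

__global__ void dummy_kernel(float * x, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) x[i] += 1.0f;
}

int main() {
    const int N = 1 << 20;
    float * d = nullptr;
    CHECK(cudaMalloc(&d, N * sizeof(float)));

    cudaStream_t stream;
    CHECK(cudaStreamCreate(&stream));

    // 1. Capture: kernels launched on the stream are recorded, not executed.
    cudaGraph_t graph;
    CHECK(cudaStreamBeginCapture(stream, cudaStreamCaptureModeRelaxed));
    dummy_kernel<<<(N + 255) / 256, 256, 0, stream>>>(d, N);
    CHECK(cudaStreamEndCapture(stream, &graph));

    // 2. Instantiate once, then relaunch cheaply on later steps.
    cudaGraphExec_t instance;
    CHECK(cudaGraphInstantiate(&instance, graph, NULL, NULL, 0));
    for (int step = 0; step < 3; ++step) {
        CHECK(cudaGraphLaunch(instance, stream));
    }
    CHECK(cudaStreamSynchronize(stream));

    // 3. When the topology is unchanged but a kernel argument drifts,
    //    cudaGraphKernelNodeSetParams / cudaGraphExecUpdate can patch the
    //    executable graph instead of re-capturing, which is what the backend
    //    does per token for the copy kernel argument.
    CHECK(cudaGraphExecDestroy(instance));
    CHECK(cudaGraphDestroy(graph));
    CHECK(cudaStreamDestroy(stream));
    CHECK(cudaFree(d));
    printf("done\n");
    return 0;
}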
@@ -2564,6 +2841,7 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
         case GGML_OP_ARANGE:
         case GGML_OP_TIMESTEP_EMBEDDING:
         case GGML_OP_LEAKY_RELU:
+        case GGML_OP_FLASH_ATTN_EXT:
             return true;
         default:
             return false;
@@ -17,6 +17,83 @@
 #define MIN(a, b) ((a) < (b) ? (a) : (b))
 #define MAX(a, b) ((a) > (b) ? (a) : (b))
 
+/**
+ * Converts brain16 to float32.
+ *
+ * The bfloat16 floating point format has the following structure:
+ *
+ *       ┌sign
+ *       │
+ *       │   ┌exponent
+ *       │   │
+ *       │   │      ┌mantissa
+ *       │   │      │
+ *       │┌──┴───┐┌─┴───┐
+ *     0b0000000000000000 brain16
+ *
+ * Since bf16 has the same number of exponent bits as a 32bit float,
+ * encoding and decoding numbers becomes relatively straightforward.
+ *
+ *       ┌sign
+ *       │
+ *       │   ┌exponent
+ *       │   │
+ *       │   │      ┌mantissa
+ *       │   │      │
+ *       │┌──┴───┐┌─┴───────────────────┐
+ *     0b00000000000000000000000000000000 IEEE binary32
+ *
+ * For comparison, the standard fp16 format has fewer exponent bits.
+ *
+ *       ┌sign
+ *       │
+ *       │  ┌exponent
+ *       │  │
+ *       │  │    ┌mantissa
+ *       │  │    │
+ *       │┌─┴─┐┌─┴──────┐
+ *     0b0000000000000000 IEEE binary16
+ *
+ * @see IEEE 754-2008
+ */
+static inline float ggml_compute_bf16_to_fp32(ggml_bf16_t h) {
+    union {
+        float f;
+        uint32_t i;
+    } u;
+    u.i = (uint32_t)h.bits << 16;
+    return u.f;
+}
+
+/**
+ * Converts float32 to brain16.
+ *
+ * This function is binary identical to AMD Zen4 VCVTNEPS2BF16.
+ * Subnormals shall be flushed to zero, and NANs will be quiet.
+ * This code should vectorize nicely if using modern compilers.
+ */
+static inline ggml_bf16_t ggml_compute_fp32_to_bf16(float s) {
+    ggml_bf16_t h;
+    union {
+        float f;
+        uint32_t i;
+    } u;
+    u.f = s;
+    if ((u.i & 0x7fffffff) > 0x7f800000) { /* nan */
+        h.bits = (u.i >> 16) | 64; /* force to quiet */
+        return h;
+    }
+    if (!(u.i & 0x7f800000)) { /* subnormal */
+        h.bits = (u.i & 0x80000000) >> 16; /* flush to zero */
+        return h;
+    }
+    h.bits = (u.i + (0x7fff + ((u.i >> 16) & 1))) >> 16;
+    return h;
+}
+
+#define GGML_FP32_TO_BF16(x) ggml_compute_fp32_to_bf16(x)
+#define GGML_BF16_TO_FP32(x) ggml_compute_bf16_to_fp32(x)
+
 #ifdef __cplusplus
 extern "C" {
 #endif
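The new ggml_compute_fp32_to_bf16 keeps the top 16 bits of the IEEE binary32 encoding, rounding to nearest-even, flushing subnormals to zero and quieting NaNs; the reverse conversion simply shifts the 16 bits back into the high half of a binary32. The standalone sketch below (illustrative only; a plain uint16_t stands in for ggml_bf16_t) shows the round trip and the precision loss it implies.

// bf16_demo.cpp -- hedged sketch of the conversion logic shown above.
#include <cstdint>
#include <cstdio>
#include <cstring>

static uint16_t fp32_to_bf16(float s) {
    uint32_t i;
    memcpy(&i, &s, sizeof(i));                    // reinterpret the float bits
    if ((i & 0x7fffffff) > 0x7f800000) {          // NaN: force it quiet
        return (uint16_t)((i >> 16) | 64);
    }
    if (!(i & 0x7f800000)) {                      // subnormal: flush to signed zero
        return (uint16_t)((i & 0x80000000) >> 16);
    }
    return (uint16_t)((i + (0x7fff + ((i >> 16) & 1))) >> 16); // round to nearest even
}

static float bf16_to_fp32(uint16_t bits) {
    uint32_t i = (uint32_t)bits << 16;            // bf16 is the top half of a binary32
    float f;
    memcpy(&f, &i, sizeof(f));
    return f;
}

int main() {
    const float x = 3.14159265f;
    const uint16_t b = fp32_to_bf16(x);
    // With 8 mantissa bits, only ~2-3 decimal digits survive,
    // e.g. 3.14159265 -> 0x4049 -> 3.140625
    printf("%.8f -> 0x%04x -> %.8f\n", x, b, bf16_to_fp32(b));
    return 0;
}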
@@ -313,7 +390,7 @@ inline static int32x4_t ggml_vdotq_s32(int32x4_t acc, int8x16_t a, int8x16_t b)
 
 #endif // defined(__ARM_NEON)
 
-#if defined(__ARM_NEON) && !defined(__MSC_VER)
+#if defined(__ARM_NEON) && !defined(_MSC_VER)
 
 #define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
 #define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
@@ -1427,6 +1427,7 @@ static void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml
         for (int i = node_start; i < node_end; ++i) {
             struct ggml_tensor * src0 = gf->nodes[i]->src[0];
             struct ggml_tensor * src1 = gf->nodes[i]->src[1];
+            struct ggml_tensor * src2 = gf->nodes[i]->src[2]; GGML_UNUSED(src2);
             struct ggml_tensor * dst = gf->nodes[i];
             GGML_ASSERT(dst->data != nullptr);
 
@@ -1559,6 +1560,12 @@ static void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml
                 {
                     float scale;
                     memcpy(&scale, dst->op_params, sizeof(float));
+
+#pragma message("TODO: add ggml_vk_soft_max() F16/F32 src1 and src2 support")
+#pragma message("ref: https://github.com/ggerganov/llama.cpp/pull/5021")
+                    GGML_ASSERT(!src1 || src1t == GGML_TYPE_F32);
+                    GGML_ASSERT(src2 == nullptr);
+
                     ggml_vk_soft_max(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, ne01, ne02, ne03, scale);
                 } break;
             case GGML_OP_DIAG_MASK_INF: