llama_cpp 0.14.7 → 0.15.1

ggml-cuda.cu
@@ -14,6 +14,7 @@
  #include "ggml-cuda/cpy.cuh"
  #include "ggml-cuda/diagmask.cuh"
  #include "ggml-cuda/dmmv.cuh"
+ #include "ggml-cuda/fattn.cuh"
  #include "ggml-cuda/getrows.cuh"
  #include "ggml-cuda/im2col.cuh"
  #include "ggml-cuda/mmq.cuh"
@@ -112,7 +113,7 @@ static ggml_cuda_device_info ggml_cuda_init() {
  for (int id = 0; id < info.device_count; ++id) {
  int device_vmm = 0;

- #if !defined(GGML_USE_HIPBLAS)
+ #if !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM)
  CUdevice device;
  CU_CHECK(cuDeviceGet(&device, id));
  CU_CHECK(cuDeviceGetAttribute(&device_vmm, CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED, device));
@@ -140,6 +141,7 @@ static ggml_cuda_device_info ggml_cuda_init() {
  info.devices[id].cc = 100*prop.major + 10*prop.minor;
  #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
  info.devices[id].smpb = prop.sharedMemPerBlock;
+ info.devices[id].nsm = prop.multiProcessorCount;
  }

  for (int id = 0; id < info.device_count; ++id) {
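The new nsm field caches the device's streaming multiprocessor count next to the existing compute-capability and shared-memory fields. A standalone sketch of where these values come from, mirroring the query loop in ggml_cuda_init() (names outside the diff are illustrative, not the file's own):

    // sketch: query per-device properties with the CUDA runtime API
    #include <cstdio>
    #include <cuda_runtime.h>

    int main() {
        int device_count = 0;
        if (cudaGetDeviceCount(&device_count) != cudaSuccess) {
            return 1;
        }
        for (int id = 0; id < device_count; ++id) {
            cudaDeviceProp prop;
            if (cudaGetDeviceProperties(&prop, id) != cudaSuccess) {
                continue;
            }
            // multiProcessorCount is the value stored in info.devices[id].nsm above
            std::printf("device %d: %d SMs, %zu bytes shared memory per block\n",
                        id, prop.multiProcessorCount, prop.sharedMemPerBlock);
        }
        return 0;
    }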
@@ -257,7 +259,7 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
  };

  // pool with virtual memory
- #if !defined(GGML_USE_HIPBLAS)
+ #if !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM)
  struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
  static const size_t CUDA_POOL_VMM_MAX_SIZE = 1ull << 35; // 32 GB

@@ -354,7 +356,7 @@ struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
  #endif // !defined(GGML_USE_HIPBLAS)

  std::unique_ptr<ggml_cuda_pool> ggml_backend_cuda_context::new_pool_for_device(int device) {
- #if !defined(GGML_USE_HIPBLAS)
+ #if !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM)
  if (ggml_cuda_info().devices[device].vmm) {
  return std::unique_ptr<ggml_cuda_pool>(new ggml_cuda_pool_vmm(device));
  }
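All three pool-related hunks extend the same guard: defining GGML_CUDA_NO_VMM at build time compiles out the CUDA virtual-memory pool, and allocation falls back to the legacy pool that already exists elsewhere in this file. A hypothetical, self-contained sketch of that selection pattern (the type and function names here are illustrative, not the file's own):

    // sketch: compile-time opt-out (GGML_CUDA_NO_VMM) plus a runtime capability check
    #include <cstdio>
    #include <memory>

    struct pool            { virtual ~pool() = default; virtual const char * name() const = 0; };
    struct vmm_pool : pool { const char * name() const override { return "vmm";    } };
    struct leg_pool : pool { const char * name() const override { return "legacy"; } };

    static std::unique_ptr<pool> make_pool_for_device(bool device_has_vmm) {
    #if !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM)
        if (device_has_vmm) {
            return std::unique_ptr<pool>(new vmm_pool()); // growable pool backed by CUDA VMM
        }
    #endif
        return std::unique_ptr<pool>(new leg_pool());     // fallback: plain-allocation pool
    }

    int main() {
        std::printf("selected pool: %s\n", make_pool_for_device(true)->name());
        return 0;
    }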
@@ -1645,7 +1647,7 @@ static void ggml_cuda_op_mul_mat(
  }
  }

- static void ggml_cuda_mul_mat_vec_p021(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){
+ static void ggml_cuda_mul_mat_vec_p021(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
  GGML_ASSERT(ggml_is_permuted(src0) && ggml_is_permuted(src1));
  GGML_ASSERT(ggml_backend_buffer_is_cuda(src0->buffer));
  GGML_ASSERT(src0->nb[0] <= src0->nb[1] && src0->nb[2] <= src0->nb[3]); // 0213 permutation
@@ -1668,7 +1670,7 @@ static void ggml_cuda_mul_mat_vec_p021(ggml_backend_cuda_context & ctx, const gg
  ggml_mul_mat_p021_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, ne02, ne12, main_stream);
  }

- static void ggml_cuda_mul_mat_vec_nc(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){
+ static void ggml_cuda_mul_mat_vec_nc(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
  GGML_ASSERT(!ggml_is_transposed(src0));
  GGML_ASSERT(!ggml_is_transposed(src1));
  GGML_ASSERT(!ggml_is_permuted(src0));
@@ -2290,6 +2292,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
  case GGML_OP_ARGSORT:
  ggml_cuda_op_argsort(ctx, dst);
  break;
+ case GGML_OP_FLASH_ATTN_EXT:
+ ggml_cuda_flash_attn_ext(ctx, dst);
+ break;
  default:
  return false;
  }
@@ -2405,32 +2410,304 @@ GGML_CALL static void ggml_backend_cuda_synchronize(ggml_backend_t backend) {
  GGML_UNUSED(backend);
  }

+ static void set_ggml_graph_node_properties(ggml_tensor * node, ggml_graph_node_properties * graph_node_properties) {
+ graph_node_properties->node_address = node->data;
+ graph_node_properties->node_op = node->op;
+ for (int i = 0; i < GGML_MAX_DIMS; i++) {
+ graph_node_properties->ne[i] = node->ne[i];
+ graph_node_properties->nb[i] = node->nb[i];
+ }
+ for (int i = 0; i < GGML_MAX_SRC; i++) {
+ graph_node_properties->src_address[i] = node->src[i] ? node->src[i]->data : nullptr;
+ }
+ }
+
+ static bool ggml_graph_node_has_matching_properties(ggml_tensor * node, ggml_graph_node_properties * graph_node_properties) {
+ if (node->data != graph_node_properties->node_address &&
+ node->op != GGML_OP_CPY &&
+ node->op != GGML_OP_VIEW) {
+ return false;
+ }
+
+ if (node->op != graph_node_properties->node_op) {
+ return false;
+ }
+
+ for (int i = 0; i < GGML_MAX_DIMS; i++) {
+ if (node->ne[i] != graph_node_properties->ne[i]) {
+ return false;
+ }
+ if (node->nb[i] != graph_node_properties->nb[i]) {
+ return false;
+ }
+ }
+
+ for (int i = 0; i < GGML_MAX_SRC; i++) {
+ if (node->src[i] &&
+ node->src[i]->data != graph_node_properties->src_address[i] &&
+ node->op != GGML_OP_CPY &&
+ node->op != GGML_OP_VIEW
+ ) {
+ return false;
+ }
+ }
+ return true;
+ }
+
  GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
  ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;

  ggml_cuda_set_device(cuda_ctx->device);

- for (int i = 0; i < cgraph->n_nodes; i++) {
- ggml_tensor * node = cgraph->nodes[i];
+ #ifdef USE_CUDA_GRAPH
+ static const bool disable_cuda_graphs_due_to_env = (getenv("GGML_CUDA_DISABLE_GRAPHS") != nullptr);

- if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
- continue;
+ // Objects required for CUDA Graph
+ if (cuda_ctx->cuda_graph == nullptr) {
+ cuda_ctx->cuda_graph.reset(new ggml_cuda_graph());
+ }
+
+ bool use_cuda_graph = true;
+ bool cuda_graph_update_required = false;
+ // pointer to CUDA cpy kernel, which is required to identify
+ // kernel parameters which need updated in the graph for each token
+ void * ggml_cuda_cpy_fn_ptr = nullptr;
+
+ if (cuda_ctx->cuda_graph->graph == nullptr) {
+ if (ggml_cuda_info().devices[cuda_ctx->device].cc < CC_AMPERE) {
+ cuda_ctx->cuda_graph->disable_due_to_gpu_arch = true;
+ #ifndef NDEBUG
+ fprintf(stderr, "%s: disabling CUDA graphs due to GPU architecture\n", __func__);
+ #endif
+ }
+ }
+
+ // Disable CUDA graphs in presence of env var, old GPU, use-case which is changing too rapidly,
+ // or previous graph capture failure.
+ // Also disable for multi-gpu for now. TO DO investigate
+ if (disable_cuda_graphs_due_to_env
+ || cuda_ctx->cuda_graph->disable_due_to_gpu_arch
+ || cuda_ctx->cuda_graph->disable_due_to_too_many_updates
+ || cuda_ctx->cuda_graph->disable_due_to_failed_graph_capture) {
+ use_cuda_graph = false;
+ }
+
+ if (use_cuda_graph) {
+ if (cuda_ctx->cuda_graph->instance == nullptr) {
+ cuda_graph_update_required = true;
+ }
+
+ // Check if the graph size has changed
+ if (cuda_ctx->cuda_graph->ggml_graph_properties.size() != (size_t)cgraph->n_nodes) {
+ cuda_graph_update_required = true;
+ cuda_ctx->cuda_graph->ggml_graph_properties.resize(cgraph->n_nodes);
+ }
+
+ // Loop over nodes in GGML graph to determine if CUDA graph update is required
+ // and store properties to allow this comparison for the next token
+ for (int i = 0; i < cgraph->n_nodes; i++) {
+ bool has_matching_properties = true;
+ if (!cuda_graph_update_required) {
+ has_matching_properties = ggml_graph_node_has_matching_properties(cgraph->nodes[i], &cuda_ctx->cuda_graph->ggml_graph_properties[i]);
+ }
+ if (!has_matching_properties) {
+ cuda_graph_update_required = true;
+ }
+ set_ggml_graph_node_properties(cgraph->nodes[i], &cuda_ctx->cuda_graph->ggml_graph_properties[i]);
+ }
+
+ // Loop over nodes in GGML graph to obtain info needed for CUDA graph
+ cuda_ctx->cuda_graph->updated_kernel_arg.clear();
+ for (int i = 0; i < cgraph->n_nodes; i++) {
+ ggml_tensor * node = cgraph->nodes[i];
+
+ if (node->src[0] && ggml_backend_buffer_is_cuda_split(node->src[0]->buffer)) {
+ use_cuda_graph = false; // Split buffers are not supported by CUDA graph capture
+ #ifndef NDEBUG
+ fprintf(stderr, "%s: disabling CUDA graphs due to split buffer\n", __func__);
+ #endif
+ }
+
+ if (node->op == GGML_OP_MUL_MAT_ID) {
+ use_cuda_graph = false; // This node type is not supported by CUDA graph capture
+ #ifndef NDEBUG
+ fprintf(stderr, "%s: disabling CUDA graphs due to mul_mat_id\n", __func__);
+ #endif
+ }
+
+ if (node->op == GGML_OP_ADD && node->src[1] && node->src[1]->ne[1] > 1) {
+ // disable CUDA graphs for batch size > 1 for now.
+ // Changes in batch size or context size can cause changes to the grid size of some kernels.
+ use_cuda_graph = false;
+ #ifndef NDEBUG
+ fprintf(stderr, "%s: disabling CUDA graphs due to batch size > 1 [%s] [%ld %ld %ld %ld]\n", __func__, node->name, node->ne[0], node->ne[1], node->ne[2], node->ne[3]);
+ #endif
+ }
+
+ if (node->op == GGML_OP_CPY) {
+ // store the copy op parameter which changes with each token.
+ cuda_ctx->cuda_graph->updated_kernel_arg.push_back((char **) &(node->src[1]->data));
+ if (ggml_cuda_cpy_fn_ptr == nullptr) {
+ // store a pointer to the copy op CUDA kernel to identify it later
+ ggml_cuda_cpy_fn_ptr = ggml_cuda_cpy_fn(node->src[0], node->src[1]);
+ }
+ }
+
+ if (!use_cuda_graph) {
+ break;
+ }
  }

+ // Disable CUDA graphs (from the next token) if the use-case is demanding too many consecutive graph updates.
+ if (cuda_graph_update_required) {
+ cuda_ctx->cuda_graph->number_consecutive_updates++;
+ } else {
+ cuda_ctx->cuda_graph->number_consecutive_updates = 0;
+ }
+
+ if (cuda_ctx->cuda_graph->number_consecutive_updates >= 4) {
+ cuda_ctx->cuda_graph->disable_due_to_too_many_updates = true;
  #ifndef NDEBUG
- assert(node->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device));
- for (int j = 0; j < GGML_MAX_SRC; j++) {
- if (node->src[j] != nullptr) {
- assert(node->src[j]->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) || ggml_backend_buffer_is_cuda_split(node->src[j]->buffer));
+ fprintf(stderr, "%s: disabling CUDA graphs due to too many consecutive updates\n", __func__);
+ #endif
+ }
+ }
+
+ if (use_cuda_graph && cuda_graph_update_required) { // Start CUDA graph capture
+ CUDA_CHECK(cudaStreamBeginCapture(cuda_ctx->stream(), cudaStreamCaptureModeRelaxed));
+ }
+
+ #else
+ bool use_cuda_graph = false;
+ bool cuda_graph_update_required = false;
+ #endif // USE_CUDA_GRAPH
+
+ bool graph_evaluated_or_captured = false;
+
+ while (!graph_evaluated_or_captured) {
+ // Only perform the graph execution if CUDA graphs are not enabled, or we are capturing the graph.
+ // With the use of CUDA graphs, the execution will be performed by the graph launch.
+ if (!use_cuda_graph || cuda_graph_update_required) {
+ for (int i = 0; i < cgraph->n_nodes; i++) {
+ ggml_tensor * node = cgraph->nodes[i];
+
+ if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
+ continue;
+ }
+
+ #ifndef NDEBUG
+ assert(node->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device));
+ for (int j = 0; j < GGML_MAX_SRC; j++) {
+ if (node->src[j] != nullptr) {
+ assert(node->src[j]->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) || ggml_backend_buffer_is_cuda_split(node->src[j]->buffer));
+ }
+ }
+ #endif
+
+ bool ok = ggml_cuda_compute_forward(*cuda_ctx, node);
+ if (!ok) {
+ fprintf(stderr, "%s: error: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op));
+ }
+ GGML_ASSERT(ok);
  }
  }
+
+ #ifdef USE_CUDA_GRAPH
+ if (use_cuda_graph && cuda_graph_update_required) { // End CUDA graph capture
+ if (cuda_ctx->cuda_graph->graph != nullptr) {
+ CUDA_CHECK(cudaGraphDestroy(cuda_ctx->cuda_graph->graph));
+ cuda_ctx->cuda_graph->graph = nullptr;
+ }
+ CUDA_CHECK(cudaStreamEndCapture(cuda_ctx->stream(), &cuda_ctx->cuda_graph->graph));
+
+ #if 0
+ if (disable_cuda_graphs_due_to_failed_capture) {
+ use_cuda_graph = false;
+ cuda_ctx->cuda_graph->disable_due_to_failed_graph_capture = true;
+ #ifndef NDEBUG
+ fprintf(stderr, "%s: disabling CUDA graphs due to failed graph capture\n", __func__);
  #endif
+ } else {
+ graph_evaluated_or_captured = true; // CUDA graph has been captured
+ }
+ #endif
+ graph_evaluated_or_captured = true; // CUDA graph has been captured
+ } else {
+ graph_evaluated_or_captured = true; // ggml graph has been directly evaluated
+ }
+ }
+
+ if (use_cuda_graph) {
+ if (cuda_ctx->cuda_graph->instance == nullptr) { // Create executable graph from captured graph.
+ CUDA_CHECK(cudaGraphInstantiate(&cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, NULL, NULL, 0));
+ }
+
+ // Perform update to graph (if required for this token), and change copy parameter (required for every token)
+
+ if (cuda_graph_update_required) {
+ // Extract nodes from graph
+ if (cuda_ctx->cuda_graph->num_nodes == 0) {
+ // First call with null argument gets number of nodes in graph
+ CUDA_CHECK(cudaGraphGetNodes(cuda_ctx->cuda_graph->graph, nullptr, &cuda_ctx->cuda_graph->num_nodes));
+ }
+ // Subsequent call with non-null argument gets nodes
+ cuda_ctx->cuda_graph->nodes.resize(cuda_ctx->cuda_graph->num_nodes);
+ cuda_ctx->cuda_graph->params.resize(cuda_ctx->cuda_graph->num_nodes);
+ if (cuda_ctx->cuda_graph->num_nodes > 0) {
+ CUDA_CHECK(cudaGraphGetNodes(cuda_ctx->cuda_graph->graph, cuda_ctx->cuda_graph->nodes.data(), &cuda_ctx->cuda_graph->num_nodes));
+
+ // Loop over nodes, and extract kernel parameters from each node
+ for (size_t i = 0; i < cuda_ctx->cuda_graph->num_nodes; i++) {
+ cudaGraphNodeType node_type;
+ CUDA_CHECK(cudaGraphNodeGetType(cuda_ctx->cuda_graph->nodes[i], &node_type));
+ if (node_type == cudaGraphNodeTypeKernel) {
+ cudaError_t stat = cudaGraphKernelNodeGetParams(cuda_ctx->cuda_graph->nodes[i], &cuda_ctx->cuda_graph->params[i]); // Get params using runtime
+ if (stat == cudaErrorInvalidDeviceFunction) {
+ // Fails due to incorrect handling by CUDA runtime of CUDA BLAS node.
+ // We don't need to update blas nodes, so clear error and move on.
+ cudaGetLastError();
+ } else {
+ GGML_ASSERT(stat == cudaSuccess);
+ }
+ }
+ }
+ }
+ }
+
+ // One of the arguments to the copy kernel is updated for each token, hence we need to
+ // replace that argument with the updated value in the CUDA graph
+ if (!cuda_graph_update_required) { // on update steps, the live parameters will already be captured
+ int k = 0;
+ for (size_t i = 0; i < cuda_ctx->cuda_graph->num_nodes; i++) {
+ if (cuda_ctx->cuda_graph->params[i].func == ggml_cuda_cpy_fn_ptr) {
+ char ** updated_kernel_arg_ptr = cuda_ctx->cuda_graph->updated_kernel_arg.at(k++);
+ cuda_ctx->cuda_graph->params[i].kernelParams[1] = updated_kernel_arg_ptr;
+ CUDA_CHECK(cudaGraphKernelNodeSetParams(cuda_ctx->cuda_graph->nodes[i], &cuda_ctx->cuda_graph->params[i]));
+ }
+ }
+ }

- bool ok = ggml_cuda_compute_forward(*cuda_ctx, node);
- if (!ok) {
- fprintf(stderr, "%s: error: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op));
+ // Update graph executable
+ cudaGraphExecUpdateResultInfo result_info;
+ cudaError_t stat = cudaGraphExecUpdate(cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, &result_info);
+ if (stat == cudaErrorGraphExecUpdateFailure) {
+ #ifndef NDEBUG
+ fprintf(stderr, "%s: CUDA graph update failed\n", __func__);
+ #endif
+ // The pre-existing graph exec cannot be updated due to violated constraints
+ // so instead clear error and re-instantiate
+ cudaGetLastError();
+ CUDA_CHECK(cudaGraphExecDestroy(cuda_ctx->cuda_graph->instance));
+ cuda_ctx->cuda_graph->instance = nullptr;
+ CUDA_CHECK(cudaGraphInstantiate(&cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, NULL, NULL, 0));
+ } else {
+ GGML_ASSERT(stat == cudaSuccess);
  }
- GGML_ASSERT(ok);
+ // Launch graph
+ CUDA_CHECK(cudaGraphLaunch(cuda_ctx->cuda_graph->instance, cuda_ctx->stream()));
+ #else
+ graph_evaluated_or_captured = true;
+ #endif // USE_CUDA_GRAPH
  }

  return GGML_STATUS_SUCCESS;
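This hunk is the bulk of the new CUDA graph support: node properties are fingerprinted per token to decide whether the captured graph can be reused, the ggml graph is replayed into a capturing stream when an update is needed, and the executable graph is then either patched (copy-kernel pointer arguments) or updated/re-instantiated before being launched. A minimal, self-contained sketch of that capture/instantiate/update/launch lifecycle, using the same CUDA 12 runtime calls as the code above (the toy kernel and variable names are illustrative, not part of the diff):

    #include <cstdio>
    #include <cuda_runtime.h>

    __global__ void scale(float * x, float v) { x[threadIdx.x] *= v; }

    #define CHECK(call) do { cudaError_t e = (call); if (e != cudaSuccess) { \
        std::printf("CUDA error: %s\n", cudaGetErrorString(e)); return 1; } } while (0)

    int main() {
        float * d = nullptr;
        CHECK(cudaMalloc(&d, 32 * sizeof(float)));
        cudaStream_t stream;
        CHECK(cudaStreamCreate(&stream));

        cudaGraph_t graph = nullptr;
        cudaGraphExec_t instance = nullptr;

        for (int token = 0; token < 3; ++token) {
            // 1. capture: record the work issued on the stream instead of running it
            CHECK(cudaStreamBeginCapture(stream, cudaStreamCaptureModeRelaxed));
            scale<<<1, 32, 0, stream>>>(d, 2.0f);
            if (graph != nullptr) { CHECK(cudaGraphDestroy(graph)); }
            CHECK(cudaStreamEndCapture(stream, &graph));

            if (instance == nullptr) {
                // 2. first time: build an executable graph from the captured graph
                CHECK(cudaGraphInstantiate(&instance, graph, NULL, NULL, 0));
            } else {
                // 3. later: try to update the existing executable in place,
                //    re-instantiating only if the update is rejected
                cudaGraphExecUpdateResultInfo info;
                cudaError_t stat = cudaGraphExecUpdate(instance, graph, &info);
                if (stat != cudaSuccess) {
                    cudaGetLastError(); // clear the error, as the backend above does
                    CHECK(cudaGraphExecDestroy(instance));
                    CHECK(cudaGraphInstantiate(&instance, graph, NULL, NULL, 0));
                }
            }

            // 4. launch: replays all captured work with a single API call
            CHECK(cudaGraphLaunch(instance, stream));
            CHECK(cudaStreamSynchronize(stream));
        }

        CHECK(cudaGraphExecDestroy(instance));
        CHECK(cudaGraphDestroy(graph));
        CHECK(cudaFree(d));
        CHECK(cudaStreamDestroy(stream));
        return 0;
    }

The backend above avoids the per-token re-capture shown in this sketch whenever the graph topology is unchanged, which is exactly what the node-property comparison is for.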
@@ -2564,6 +2841,7 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
  case GGML_OP_ARANGE:
  case GGML_OP_TIMESTEP_EMBEDDING:
  case GGML_OP_LEAKY_RELU:
+ case GGML_OP_FLASH_ATTN_EXT:
  return true;
  default:
  return false;
ggml-impl.h
@@ -17,6 +17,83 @@
  #define MIN(a, b) ((a) < (b) ? (a) : (b))
  #define MAX(a, b) ((a) > (b) ? (a) : (b))

+ /**
+  * Converts brain16 to float32.
+  *
+  * The bfloat16 floating point format has the following structure:
+  *
+  *       ┌sign
+  *       │
+  *       │   ┌exponent
+  *       │   │
+  *       │   │      ┌mantissa
+  *       │   │      │
+  *       │┌──┴───┐┌─┴───┐
+  *     0b0000000000000000 brain16
+  *
+  * Since bf16 has the same number of exponent bits as a 32bit float,
+  * encoding and decoding numbers becomes relatively straightforward.
+  *
+  *       ┌sign
+  *       │
+  *       │   ┌exponent
+  *       │   │
+  *       │   │      ┌mantissa
+  *       │   │      │
+  *       │┌──┴───┐┌─┴───────────────────┐
+  *     0b00000000000000000000000000000000 IEEE binary32
+  *
+  * For comparison, the standard fp16 format has fewer exponent bits.
+  *
+  *       ┌sign
+  *       │
+  *       │  ┌exponent
+  *       │  │
+  *       │  │    ┌mantissa
+  *       │  │    │
+  *       │┌─┴─┐┌─┴──────┐
+  *     0b0000000000000000 IEEE binary16
+  *
+  * @see IEEE 754-2008
+  */
+ static inline float ggml_compute_bf16_to_fp32(ggml_bf16_t h) {
+     union {
+         float f;
+         uint32_t i;
+     } u;
+     u.i = (uint32_t)h.bits << 16;
+     return u.f;
+ }
+
+ /**
+  * Converts float32 to brain16.
+  *
+  * This function is binary identical to AMD Zen4 VCVTNEPS2BF16.
+  * Subnormals shall be flushed to zero, and NANs will be quiet.
+  * This code should vectorize nicely if using modern compilers.
+  */
+ static inline ggml_bf16_t ggml_compute_fp32_to_bf16(float s) {
+     ggml_bf16_t h;
+     union {
+         float f;
+         uint32_t i;
+     } u;
+     u.f = s;
+     if ((u.i & 0x7fffffff) > 0x7f800000) { /* nan */
+         h.bits = (u.i >> 16) | 64; /* force to quiet */
+         return h;
+     }
+     if (!(u.i & 0x7f800000)) { /* subnormal */
+         h.bits = (u.i & 0x80000000) >> 16; /* flush to zero */
+         return h;
+     }
+     h.bits = (u.i + (0x7fff + ((u.i >> 16) & 1))) >> 16;
+     return h;
+ }
+
+ #define GGML_FP32_TO_BF16(x) ggml_compute_fp32_to_bf16(x)
+ #define GGML_BF16_TO_FP32(x) ggml_compute_bf16_to_fp32(x)
+
  #ifdef __cplusplus
  extern "C" {
  #endif
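The fp32-to-bf16 helper above truncates the low 16 bits with round-to-nearest-even: it adds 0x7fff plus the lowest kept mantissa bit before shifting. A small standalone check of that behaviour (this re-implementation is a sketch for illustration and does not use the ggml types above):

    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    // standalone copies of the two conversions above
    static uint16_t fp32_to_bf16(float s) {
        uint32_t i; std::memcpy(&i, &s, sizeof(i));
        if ((i & 0x7fffffff) > 0x7f800000) return (uint16_t)((i >> 16) | 64);        // quiet NaN
        if (!(i & 0x7f800000))             return (uint16_t)((i & 0x80000000) >> 16); // flush subnormal
        return (uint16_t)((i + (0x7fff + ((i >> 16) & 1))) >> 16);                    // round to nearest even
    }

    static float bf16_to_fp32(uint16_t h) {
        uint32_t i = (uint32_t)h << 16;
        float f; std::memcpy(&f, &i, sizeof(f));
        return f;
    }

    int main() {
        // 1.0f survives exactly; 1 + 2^-8 is a tie and rounds to even, back to 1.0;
        // 3.14159265f loses mantissa bits beyond the 7 kept by bf16
        const float xs[] = {1.0f, 1.00390625f, 3.14159265f};
        for (float x : xs) {
            float y = bf16_to_fp32(fp32_to_bf16(x));
            std::printf("%.8f -> bf16 -> %.8f\n", x, y);
        }
        return 0;
    }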
@@ -313,7 +390,7 @@ inline static int32x4_t ggml_vdotq_s32(int32x4_t acc, int8x16_t a, int8x16_t b)

  #endif // defined(__ARM_NEON)

- #if defined(__ARM_NEON) && !defined(__MSC_VER)
+ #if defined(__ARM_NEON) && !defined(_MSC_VER)

  #define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
  #define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
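This hunk fixes a typo in a predefined-macro name: MSVC defines _MSC_VER with a single leading underscore, so the old !defined(__MSC_VER) test was always true and never actually excluded MSVC from the GGML_COMPUTE_FP16_TO_FP32/FP32_TO_FP16 definitions shown here. A tiny sketch of the corrected guard shape (the EXAMPLE_* name is hypothetical):

    // hypothetical sketch; only __ARM_NEON and _MSC_VER are real predefined/feature macros
    #if defined(__ARM_NEON) && !defined(_MSC_VER)
        // GCC/Clang on ARM: keep the native conversion path
        #define EXAMPLE_USE_NATIVE_FP16 1
    #else
        // MSVC, or no NEON: a fallback conversion path is used instead
        #define EXAMPLE_USE_NATIVE_FP16 0
    #endif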
ggml-kompute.cpp
@@ -1427,6 +1427,7 @@ static void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml
  for (int i = node_start; i < node_end; ++i) {
  struct ggml_tensor * src0 = gf->nodes[i]->src[0];
  struct ggml_tensor * src1 = gf->nodes[i]->src[1];
+ struct ggml_tensor * src2 = gf->nodes[i]->src[2]; GGML_UNUSED(src2);
  struct ggml_tensor * dst = gf->nodes[i];
  GGML_ASSERT(dst->data != nullptr);

@@ -1559,6 +1560,12 @@ static void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml
  {
  float scale;
  memcpy(&scale, dst->op_params, sizeof(float));
+
+ #pragma message("TODO: add ggml_vk_soft_max() F16/F32 src1 and src2 support")
+ #pragma message("ref: https://github.com/ggerganov/llama.cpp/pull/5021")
+ GGML_ASSERT(!src1 || src1t == GGML_TYPE_F32);
+ GGML_ASSERT(src2 == nullptr);
+
  ggml_vk_soft_max(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, ne01, ne02, ne03, scale);
  } break;
  case GGML_OP_DIAG_MASK_INF: