llama_cpp 0.10.1 → 0.10.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -31,6 +31,7 @@
  #define CUDA_R_16F HIPBLAS_R_16F
  #define CUDA_R_32F HIPBLAS_R_32F
  #define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width)
+ #define cublasComputeType_t hipblasDatatype_t //deprecated, new hipblasComputeType_t not in 5.6
  #define cublasCreate hipblasCreate
  #define cublasGemmEx hipblasGemmEx
  #define cublasGemmBatchedEx hipblasGemmBatchedEx
@@ -40,6 +41,7 @@
  #define cublasSetStream hipblasSetStream
  #define cublasSgemm hipblasSgemm
  #define cublasStatus_t hipblasStatus_t
+ #define cudaDataType_t hipblasDatatype_t //deprecated, new hipblasDatatype not in 5.6
  #define cudaDeviceCanAccessPeer hipDeviceCanAccessPeer
  #define cudaDeviceDisablePeerAccess hipDeviceDisablePeerAccess
  #define cudaDeviceEnablePeerAccess hipDeviceEnablePeerAccess
@@ -58,8 +60,13 @@
  #define cudaGetDeviceProperties hipGetDeviceProperties
  #define cudaGetErrorString hipGetErrorString
  #define cudaGetLastError hipGetLastError
+ #ifdef GGML_HIP_UMA
+ #define cudaMalloc hipMallocManaged
+ #define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size)
+ #else
  #define cudaMalloc hipMalloc
  #define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size, hipHostMallocDefault)
+ #endif
  #define cudaMemcpy hipMemcpy
  #define cudaMemcpy2DAsync hipMemcpy2DAsync
  #define cudaMemcpyAsync hipMemcpyAsync
@@ -78,6 +85,7 @@
  #define cudaStreamWaitEvent(stream, event, flags) hipStreamWaitEvent(stream, event, flags)
  #define cudaStream_t hipStream_t
  #define cudaSuccess hipSuccess
+ #define __trap abort
  #else
  #include <cuda_runtime.h>
  #include <cublas_v2.h>
@@ -510,6 +518,14 @@ static size_t g_scratch_offset = 0;

  static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};

+ [[noreturn]]
+ static __device__ void bad_arch() {
+ printf("ERROR: ggml-cuda was compiled without support for the current GPU architecture.\n");
+ __trap();
+
+ (void) bad_arch; // suppress unused function warning
+ }
+
  static __device__ __forceinline__ float warp_reduce_sum(float x) {
  #pragma unroll
  for (int mask = 16; mask > 0; mask >>= 1) {
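The hunk above introduces a [[noreturn]] device helper that the later hunks substitute for the old `assert(false); return 0.0f;` fallback in architecture-guarded device code. A minimal sketch of the pattern, with a hypothetical `vec_dot_example` function and capability gate standing in for the real kernels (illustrative only, not part of the diff):

    // Sketch: why [[noreturn]] removes the need for a dummy return value.
    [[noreturn]] static __device__ void bad_arch_sketch() {
        printf("ERROR: compiled without support for this GPU architecture.\n");
        __trap();                    // abort the running kernel
        (void) bad_arch_sketch;      // suppress unused-function warnings
    }

    static __device__ __forceinline__ float vec_dot_example(float a, float b) {
    #if __CUDA_ARCH__ >= 610         // hypothetical DP4A-style capability gate
        return a * b;                // fast path on supported architectures
    #else
        bad_arch_sketch();           // never returns, so no `return 0.0f;` is needed
    #endif
    }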
@@ -1970,8 +1986,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q4_0_q8_1_imp
  // second part effectively subtracts 8 from each quant value
  return d4 * (sumi * ds8f.x - (8*vdr/QI4_0) * ds8f.y);
  #else
- assert(false);
- return 0.0f; // only to satisfy the compiler
+ bad_arch();
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
  }

@@ -2008,8 +2023,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q4_1_q8_1_imp
  // scale second part of sum by QI8_1/(vdr * QR4_1) to compensate for multiple threads adding it
  return sumi * d4d8 + m4s8 / (QI8_1 / (vdr * QR4_1));
  #else
- assert(false);
- return 0.0f; // only to satisfy the compiler
+ bad_arch();
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
  }

@@ -2044,8 +2058,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q5_0_q8_1_imp
  // second part effectively subtracts 16 from each quant value
  return d5 * (sumi * ds8f.x - (16*vdr/QI5_0) * ds8f.y);
  #else
- assert(false);
- return 0.0f; // only to satisfy the compiler
+ bad_arch();
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
  }

@@ -2090,8 +2103,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q5_1_q8_1_imp
  return sumi*d5d8 + m5s8 / (QI5_1 / vdr);

  #else
- assert(false);
- return 0.0f; // only to satisfy the compiler
+ bad_arch();
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
  }

@@ -2112,8 +2124,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q8_0_q8_1_imp

  return d8_0*d8_1 * sumi;
  #else
- assert(false);
- return 0.0f; // only to satisfy the compiler
+ bad_arch();
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
  }

@@ -2143,8 +2154,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q8_1_q8_1_imp
  // scale second part of sum by QI8_1/ vdr to compensate for multiple threads adding it
  return sumi*d8d8 + m8s8 / (QI8_1 / vdr);
  #else
- assert(false);
- return 0.0f; // only to satisfy the compiler
+ bad_arch();
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
  }

@@ -2179,8 +2189,7 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmvq(

  return dm2f.x*sumf_d - dm2f.y*sumf_m;
  #else
- assert(false);
- return 0.0f; // only to satisfy the compiler
+ bad_arch();
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
  }

@@ -2217,8 +2226,7 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmq(

  return d8 * (dm2f.x*sumi_d - dm2f.y*sumi_m);
  #else
- assert(false);
- return 0.0f; // only to satisfy the compiler
+ bad_arch();
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
  }

@@ -2258,8 +2266,7 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmvq(

  return d3 * sumf;
  #else
- assert(false);
- return 0.0f; // only to satisfy the compiler
+ bad_arch();
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
  }

@@ -2284,8 +2291,7 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmq(

  return d3*d8 * sumi;
  #else
- assert(false);
- return 0.0f; // only to satisfy the compiler
+ bad_arch();
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
  }

@@ -2318,8 +2324,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_vmmq(
  return dm4f.x*sumf_d - dm4f.y*sumf_m;

  #else
- assert(false);
- return 0.0f; // only to satisfy the compiler
+ bad_arch();
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
  }

@@ -2352,8 +2357,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_mmq(
  return dm4f.x*sumf_d - dm4f.y*sumf_m;

  #else
- assert(false);
- return 0.0f; // only to satisfy the compiler
+ bad_arch();
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
  }

@@ -2393,8 +2397,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_vmmq(
  return dm5f.x*sumf_d - dm5f.y*sumf_m;

  #else
- assert(false);
- return 0.0f; // only to satisfy the compiler
+ bad_arch();
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
  }

@@ -2427,8 +2430,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_mmq(
  return dm4f.x*sumf_d - dm4f.y*sumf_m;

  #else
- assert(false);
- return 0.0f; // only to satisfy the compiler
+ bad_arch();
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
  }

@@ -2458,8 +2460,7 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmvq(

  return d*sumf;
  #else
- assert(false);
- return 0.0f; // only to satisfy the compiler
+ bad_arch();
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
  }

@@ -2490,8 +2491,7 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmq(
  return d6 * sumf_d;

  #else
- assert(false);
- return 0.0f; // only to satisfy the compiler
+ bad_arch();
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
  }

@@ -3357,8 +3357,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
  return dall * sumf_d - dmin * sumf_m;

  #else
- assert(false);
- return 0.0f; // only to satisfy the compiler
+ bad_arch();
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A

  #endif
@@ -3541,8 +3540,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
  return d * sumf_d;

  #else
- assert(false);
- return 0.0f; // only to satisfy the compiler
+ bad_arch();
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A

  #endif
@@ -3952,7 +3950,7 @@ template <bool need_check> static __global__ void
  (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
  #else
  (void) vec_dot_q4_0_q8_1_mul_mat;
- assert(false);
+ bad_arch();
  #endif // __CUDA_ARCH__ >= CC_VOLTA
  }

@@ -4021,7 +4019,7 @@ template <bool need_check> static __global__ void
  (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
  #else
  (void) vec_dot_q4_1_q8_1_mul_mat;
- assert(false);
+ bad_arch();
  #endif // __CUDA_ARCH__ >= CC_VOLTA
  }

@@ -4088,7 +4086,7 @@ template <bool need_check> static __global__ void
  (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
  #else
  (void) vec_dot_q5_0_q8_1_mul_mat;
- assert(false);
+ bad_arch();
  #endif // __CUDA_ARCH__ >= CC_VOLTA
  }

@@ -4155,7 +4153,7 @@ mul_mat_q5_1(
  (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
  #else
  (void) vec_dot_q5_1_q8_1_mul_mat;
- assert(false);
+ bad_arch();
  #endif // __CUDA_ARCH__ >= CC_VOLTA
  }

@@ -4222,7 +4220,7 @@ template <bool need_check> static __global__ void
  (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
  #else
  (void) vec_dot_q8_0_q8_1_mul_mat;
- assert(false);
+ bad_arch();
  #endif // __CUDA_ARCH__ >= CC_VOLTA
  }

@@ -4289,7 +4287,7 @@ mul_mat_q2_K(
  (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
  #else
  (void) vec_dot_q2_K_q8_1_mul_mat;
- assert(false);
+ bad_arch();
  #endif // __CUDA_ARCH__ >= CC_VOLTA
  }

@@ -4358,7 +4356,7 @@ template <bool need_check> static __global__ void
  (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
  #else
  (void) vec_dot_q3_K_q8_1_mul_mat;
- assert(false);
+ bad_arch();
  #endif // __CUDA_ARCH__ >= CC_VOLTA
  }

@@ -4427,7 +4425,7 @@ template <bool need_check> static __global__ void
  (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
  #else
  (void) vec_dot_q4_K_q8_1_mul_mat;
- assert(false);
+ bad_arch();
  #endif // __CUDA_ARCH__ >= CC_VOLTA
  }

@@ -4494,7 +4492,7 @@ mul_mat_q5_K(
  (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
  #else
  (void) vec_dot_q5_K_q8_1_mul_mat;
- assert(false);
+ bad_arch();
  #endif // __CUDA_ARCH__ >= CC_VOLTA
  }

@@ -4563,7 +4561,7 @@ template <bool need_check> static __global__ void
  (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
  #else
  (void) vec_dot_q6_K_q8_1_mul_mat;
- assert(false);
+ bad_arch();
  #endif // __CUDA_ARCH__ >= CC_VOLTA
  }

@@ -4998,7 +4996,16 @@ static __global__ void rope_neox(
  const int ib = col / n_dims;
  const int ic = col % n_dims;

- const int i = row*ncols + ib*n_dims + ic/2;
+ if (ib > 0) {
+ const int i = row*ncols + ib*n_dims + ic;
+
+ dst[i + 0] = x[i + 0];
+ dst[i + 1] = x[i + 1];
+
+ return;
+ }
+
+ const int i = row*ncols + ib*n_dims + ic/2;
  const int i2 = row/p_delta_rows;

  float cur_rot = inv_ndims * ic - ib;
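In the rope_neox hunk above, elements past the first n_dims of a row (ib > 0) are now copied through unchanged instead of being rotated. A rough host-side sketch of the same partial-rotation idea, with illustrative names and a single fixed angle (the real kernel computes a per-dimension angle and runs per-thread):

    // Sketch: rotate only the first n_dims elements of a row, pass the rest through.
    static void rope_neox_row_sketch(const float * x, float * dst, int ncols, int n_dims,
                                     float cos_theta, float sin_theta) {
        for (int j = 0; j < n_dims/2; ++j) {             // NeoX pairs j with j + n_dims/2
            const float x0 = x[j];
            const float x1 = x[j + n_dims/2];
            dst[j]            = x0*cos_theta - x1*sin_theta;
            dst[j + n_dims/2] = x0*sin_theta + x1*cos_theta;
        }
        for (int j = n_dims; j < ncols; ++j) {
            dst[j] = x[j];                               // columns beyond n_dims are not rotated
        }
    }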
@@ -6814,6 +6821,7 @@ static void ggml_cuda_op_get_rows(
  break;
  default:
  // TODO: k-quants
+ fprintf(stderr, "%s: unsupported type: %s\n", __func__, ggml_type_name(src0->type));
  GGML_ASSERT(false);
  break;
  }
@@ -7057,6 +7065,7 @@ inline void ggml_cuda_op_upscale(

  (void) src1;
  (void) dst;
+ (void) src1_dd;
  }

  inline void ggml_cuda_op_pad(
@@ -7073,6 +7082,7 @@ inline void ggml_cuda_op_pad(

  (void) src1;
  (void) dst;
+ (void) src1_dd;
  }

  inline void ggml_cuda_op_rms_norm(
@@ -7376,7 +7386,7 @@ inline void ggml_cuda_op_mul_mat_cublas(

  const int compute_capability = g_compute_capabilities[id];

- if (compute_capability >= CC_VOLTA && (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && row_diff == src0->ne[1]) {
+ if (compute_capability >= CC_VOLTA && (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && row_diff == src0->ne[1] && dst->op_params[0] == GGML_PREC_DEFAULT) {
  // convert src0 and src1 to fp16, multiply as fp16, convert dst to fp32
  half * src0_as_f16 = nullptr;
  size_t src0_as = 0;
@@ -7690,17 +7700,9 @@ inline void ggml_cuda_op_scale(
  const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {

  GGML_ASSERT(src0->type == GGML_TYPE_F32);
- GGML_ASSERT(src1->type == GGML_TYPE_F32);
  GGML_ASSERT( dst->type == GGML_TYPE_F32);

- float scale;
- // HACK: support for ggml backend interface
- if (src1->backend == GGML_BACKEND_CPU) {
- scale = ((float *) src1->data)[0];
- } else {
- // TODO: pass pointer to kernel instead of copying to host
- CUDA_CHECK(cudaMemcpy(&scale, src1->data, sizeof(float), cudaMemcpyDeviceToHost));
- }
+ const float scale = ((float *) dst->op_params)[0];

  scale_f32_cuda(src0_dd, dst_dd, scale, ggml_nelements(src0), main_stream);
  CUDA_CHECK(cudaGetLastError());
@@ -7747,8 +7749,6 @@ static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * s
  const bool src1_on_device = use_src1 && src1->backend == GGML_BACKEND_GPU;
  const bool dst_on_device = dst->backend == GGML_BACKEND_GPU;

- const bool src1_stays_on_host = use_src1 && dst->op == GGML_OP_SCALE;
-
  // dd = data device
  float * src0_ddf = nullptr;
  float * src1_ddf = nullptr;
@@ -7769,7 +7769,7 @@ static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * s
  CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_ddf, src0, 0, 0, 0, nrows0, main_stream));
  }

- if (use_src1 && !src1_stays_on_host) {
+ if (use_src1) {
  if (src1_on_device) {
  src1_ddf = (float *) src1_extra->data_device[g_main_device];
  } else {
@@ -7817,6 +7817,11 @@ static void ggml_cuda_set_peer_access(const int n_tokens) {
  }

  #ifdef NDEBUG
+ for (int id = 0; id < g_device_count; ++id) {
+ CUDA_CHECK(ggml_cuda_set_device(id));
+ CUDA_CHECK(cudaDeviceSynchronize());
+ }
+
  for (int id = 0; id < g_device_count; ++id) {
  CUDA_CHECK(ggml_cuda_set_device(id));

@@ -7868,8 +7873,6 @@ static void ggml_cuda_op_mul_mat(
  const int nb2 = dst->nb[2];
  const int nb3 = dst->nb[3];

- ggml_cuda_set_peer_access(ne11);
-
  GGML_ASSERT(dst->backend != GGML_BACKEND_GPU_SPLIT);
  GGML_ASSERT(src1->backend != GGML_BACKEND_GPU_SPLIT);

@@ -8300,27 +8303,27 @@ static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor
  }

  static __global__ void k_compute_batched_ptrs(
- const half * src0_as_f16, const half * src1_as_f16, half * dst_f16,
+ const half * src0_as_f16, const half * src1_as_f16, char * dst,
  const void ** ptrs_src, void ** ptrs_dst,
- int ne12, int ne13,
- int ne23,
- int nb02, int nb03,
- int nb12, int nb13,
- int nb2, int nb3,
- int r2, int r3) {
- int i13 = blockIdx.x * blockDim.x + threadIdx.x;
- int i12 = blockIdx.y * blockDim.y + threadIdx.y;
+ int64_t ne12, int64_t ne13,
+ int64_t ne23,
+ size_t nb02, size_t nb03,
+ size_t nb12, size_t nb13,
+ size_t nbd2, size_t nbd3,
+ int64_t r2, int64_t r3) {
+ int64_t i13 = blockIdx.x * blockDim.x + threadIdx.x;
+ int64_t i12 = blockIdx.y * blockDim.y + threadIdx.y;

  if (i13 >= ne13 || i12 >= ne12) {
  return;
  }

- int i03 = i13 / r3;
- int i02 = i12 / r2;
+ int64_t i03 = i13 / r3;
+ int64_t i02 = i12 / r2;

  ptrs_src[0*ne23 + i12 + i13*ne12] = (const char *) src0_as_f16 + i02*nb02 + i03*nb03;
  ptrs_src[1*ne23 + i12 + i13*ne12] = (const char *) src1_as_f16 + i12*nb12/2 + i13*nb13/2;
- ptrs_dst[0*ne23 + i12 + i13*ne12] = ( char *) dst_f16 + i12* nb2/2 + i13* nb3/2;
+ ptrs_dst[0*ne23 + i12 + i13*ne12] = ( char *) dst + i12*nbd2 + i13*nbd3;
  }

  static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -8376,7 +8379,41 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
  to_fp16_cuda(src1_ddf, src1_as_f16, ne1, main_stream);

  size_t dst_as = 0;
- half * dst_f16 = (half *) ggml_cuda_pool_malloc(ne * sizeof(half), &dst_as);
+
+ half * dst_f16 = nullptr;
+ char * dst_t = nullptr;
+
+ cublasComputeType_t cu_compute_type = CUBLAS_COMPUTE_16F;
+ cudaDataType_t cu_data_type = CUDA_R_16F;
+
+ // dst strides
+ size_t nbd2 = dst->nb[2];
+ size_t nbd3 = dst->nb[3];
+
+ const half alpha_f16 = 1.0f;
+ const half beta_f16 = 0.0f;
+
+ const float alpha_f32 = 1.0f;
+ const float beta_f32 = 0.0f;
+
+ const void * alpha = &alpha_f16;
+ const void * beta = &beta_f16;
+
+ if (dst->op_params[0] == GGML_PREC_DEFAULT) {
+ dst_f16 = (half *) ggml_cuda_pool_malloc(ne * sizeof(half), &dst_as);
+ dst_t = (char *) dst_f16;
+
+ nbd2 /= sizeof(float) / sizeof(half);
+ nbd3 /= sizeof(float) / sizeof(half);
+ } else {
+ dst_t = (char *) dst_ddf;
+
+ cu_compute_type = CUBLAS_COMPUTE_32F;
+ cu_data_type = CUDA_R_32F;
+
+ alpha = &alpha_f32;
+ beta = &beta_f32;
+ }

  GGML_ASSERT(ne12 % ne02 == 0);
  GGML_ASSERT(ne13 % ne03 == 0);
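The hunk above lets the batched cuBLAS path keep its FP16 output with FP16 accumulation by default, or write FP32 directly when the mul_mat node requests higher precision through dst->op_params[0]. A condensed sketch of that selection, assuming the FP16 inputs used by the surrounding code (illustrative, not the diff itself):

    // Sketch: pick output/accumulator types for the GemmEx-family calls based on precision.
    // A and B stay CUDA_R_16F either way; only C's type and the compute type change.
    const bool fp16_out = /* dst->op_params[0] == GGML_PREC_DEFAULT */ true;

    cublasComputeType_t compute_type = fp16_out ? CUBLAS_COMPUTE_16F : CUBLAS_COMPUTE_32F;
    cudaDataType_t      c_type       = fp16_out ? CUDA_R_16F         : CUDA_R_32F;

    const half  alpha_h = 1.0f, beta_h = 0.0f;   // alpha/beta must match the compute type
    const float alpha_f = 1.0f, beta_f = 0.0f;
    const void * alpha  = fp16_out ? (const void *) &alpha_h : (const void *) &alpha_f;
    const void * beta   = fp16_out ? (const void *) &beta_h  : (const void *) &beta_f;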
@@ -8385,9 +8422,6 @@
  const int64_t r2 = ne12/ne02;
  const int64_t r3 = ne13/ne03;

- const half alpha_f16 = 1.0f;
- const half beta_f16 = 0.0f;
-
  #if 0
  // use cublasGemmEx
  {
@@ -8397,12 +8431,12 @@
  int i02 = i12 / r2;

  CUBLAS_CHECK(
- cublasGemmEx(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
+ cublasGemmEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N,
  ne01, ne11, ne10,
- &alpha_f16, (const char *) src0_as_f16 + i02*src0->nb[2] + i03*src0->nb[3] , CUDA_R_16F, nb01/sizeof(half),
- (const char *) src1_as_f16 + i12*src1->nb[2]/2 + i13*src1->nb[3]/2, CUDA_R_16F, nb11/sizeof(float),
- &beta_f16, ( char *) dst_f16 + i12* dst->nb[2]/2 + i13* dst->nb[3]/2, CUDA_R_16F, ne01,
- CUBLAS_COMPUTE_16F,
+ alpha, (const char *) src0_as_f16 + i02*src0->nb[2] + i03*src0->nb[3] , CUDA_R_16F, nb01/sizeof(half),
+ (const char *) src1_as_f16 + i12*src1->nb[2]/2 + i13*src1->nb[3]/2, CUDA_R_16F, nb11/sizeof(float),
+ beta, ( char *) dst_t + i12*nbd2 + i13*nbd3, cu_data_type, ne01,
+ cu_compute_type,
  CUBLAS_GEMM_DEFAULT_TENSOR_OP));
  }
  }
@@ -8414,11 +8448,11 @@
  CUBLAS_CHECK(
  cublasGemmStridedBatchedEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N,
  ne01, ne11, ne10,
- &alpha_f16, (const char *) src0_as_f16, CUDA_R_16F, nb01/sizeof(half), src0->nb[2]/sizeof(half), // strideA
- (const char *) src1_as_f16, CUDA_R_16F, nb11/sizeof(float), src1->nb[2]/sizeof(float), // strideB
- &beta_f16, ( char *) dst_f16, CUDA_R_16F, ne01, dst->nb[2]/sizeof(float), // strideC
+ alpha, (const char *) src0_as_f16, CUDA_R_16F, nb01/sizeof(half), src0->nb[2]/sizeof(half), // strideA
+ (const char *) src1_as_f16, CUDA_R_16F, nb11/sizeof(float), src1->nb[2]/sizeof(float), // strideB
+ beta, ( char *) dst_t, cu_data_type, ne01, dst->nb[2]/sizeof(float), // strideC
  ne12*ne13,
- CUBLAS_COMPUTE_16F,
+ cu_compute_type,
  CUBLAS_GEMM_DEFAULT_TENSOR_OP));
  } else {
  // use cublasGemmBatchedEx
@@ -8435,24 +8469,24 @@

  dim3 block_dims(ne13, ne12);
  k_compute_batched_ptrs<<<1, block_dims, 0, main_stream>>>(
- src0_as_f16, src1_as_f16, dst_f16,
+ src0_as_f16, src1_as_f16, dst_t,
  ptrs_src, ptrs_dst,
  ne12, ne13,
  ne23,
  nb02, nb03,
  nb12, nb13,
- dst->nb[2], dst->nb[3],
+ nbd2, nbd3,
  r2, r3);
  CUDA_CHECK(cudaGetLastError());

  CUBLAS_CHECK(
  cublasGemmBatchedEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N,
  ne01, ne11, ne10,
- &alpha_f16, (const void **) (ptrs_src + 0*ne23), CUDA_R_16F, nb01/sizeof(half),
- (const void **) (ptrs_src + 1*ne23), CUDA_R_16F, nb11/sizeof(float),
- &beta_f16, ( void **) (ptrs_dst + 0*ne23), CUDA_R_16F, ne01,
+ alpha, (const void **) (ptrs_src + 0*ne23), CUDA_R_16F, nb01/sizeof(half),
+ (const void **) (ptrs_src + 1*ne23), CUDA_R_16F, nb11/sizeof(float),
+ beta, ( void **) (ptrs_dst + 0*ne23), cu_data_type, ne01,
  ne23,
- CUBLAS_COMPUTE_16F,
+ cu_compute_type,
  CUBLAS_GEMM_DEFAULT_TENSOR_OP));

  if (ptrs_src_s != 0) {
@@ -8464,11 +8498,14 @@
  }
  #endif

- const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
- to_fp32_cuda(dst_f16, dst_ddf, ne, main_stream);
+ if (dst->op_params[0] == GGML_PREC_DEFAULT) {
+ const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
+ to_fp32_cuda(dst_f16, dst_ddf, ne, main_stream);
+
+ ggml_cuda_pool_free(dst_f16, dst_as);
+ }

  ggml_cuda_pool_free(src1_as_f16, src1_as);
- ggml_cuda_pool_free(dst_f16, dst_as);
  }

  static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -8732,7 +8769,8 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s
  // TODO: mmq/mmv support
  #endif

- GGML_ASSERT(dst->backend == GGML_BACKEND_GPU);
+ const int64_t nb11 = src1->nb[1];
+ const int64_t nb1 = dst->nb[1];

  const struct ggml_tensor * ids = src0;
  const int32_t id = ((int32_t *) dst->op_params)[0];
@@ -8740,10 +8778,12 @@

  std::vector<char> ids_host(ggml_nbytes(ids));

+ const cudaStream_t stream = g_cudaStreams[g_main_device][0];
+
  if (ids->backend == GGML_BACKEND_GPU) {
  const char * ids_dev = (const char *)((const ggml_tensor_extra_gpu *)ids->extra)->data_device[g_main_device];
- CUDA_CHECK(cudaMemcpyAsync(ids_host.data(), ids_dev, ggml_nbytes(ids), cudaMemcpyDeviceToHost, g_cudaStreams[g_main_device][0]));
- CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[g_main_device][0]));
+ CUDA_CHECK(cudaMemcpyAsync(ids_host.data(), ids_dev, ggml_nbytes(ids), cudaMemcpyDeviceToHost, stream));
+ CUDA_CHECK(cudaStreamSynchronize(stream));
  } else {
  memcpy(ids_host.data(), ids->data, ggml_nbytes(ids));
  }
@@ -8757,37 +8797,110 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s
  ggml_tensor src1_row = *src1;
  ggml_tensor dst_row = *dst;

- src1_row.ne[1] = 1;
- dst_row.ne[1] = 1;
-
- src1_row.nb[2] = src1_row.nb[1];
- dst_row.nb[2] = dst_row.nb[1];
-
- src1_row.nb[3] = src1_row.nb[1];
- dst_row.nb[3] = dst_row.nb[1];
+ src1_row.backend = GGML_BACKEND_GPU;
+ dst_row.backend = GGML_BACKEND_GPU;

  src1_row.extra = &src1_row_extra;
  dst_row.extra = &dst_row_extra;

+ char * src1_original = src1->backend == GGML_BACKEND_CPU ?
+ (char *) src1->data : (char *) src1_extra->data_device[g_main_device];
+ char * dst_original = dst->backend == GGML_BACKEND_CPU ?
+ (char *) dst->data : (char *) dst_extra->data_device[g_main_device];
+
+ if (src1->ne[1] == 1) {
+ GGML_ASSERT(src1->backend == GGML_BACKEND_GPU);
+ GGML_ASSERT(dst->backend == GGML_BACKEND_GPU);
+
+ for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
+ //int32_t row_id;
+ //CUDA_CHECK(cudaMemcpyAsync(&row_id, ids_dev + i01*ids->nb[1] + id*ids->nb[0], sizeof(int32_t), cudaMemcpyDeviceToHost, g_cudaStreams[g_main_device][0]));
+ //CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[g_main_device][0]));
+
+ const int32_t row_id = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]);

- for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
- //int32_t row_id;
- //CUDA_CHECK(cudaMemcpyAsync(&row_id, ids_dev + i01*ids->nb[1] + id*ids->nb[0], sizeof(int32_t), cudaMemcpyDeviceToHost, g_cudaStreams[g_main_device][0]));
- //CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[g_main_device][0]));
+ GGML_ASSERT(row_id >= 0 && row_id < n_as);

- const int32_t row_id = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]);
+ const struct ggml_tensor * src0_row = dst->src[row_id + 2];

- GGML_ASSERT(row_id >= 0 && row_id < n_as);
+ src1_row_extra.data_device[g_main_device] = src1_original + i01*src1->nb[1];
+ src1_row.data = (char *) src1->data + i01*src1->nb[1]; // TODO why is this set?
+
+ dst_row_extra.data_device[g_main_device] = dst_original + i01*dst->nb[1];
+ dst_row.data = (char *) dst->data + i01*dst->nb[1]; // TODO why is this set?
+
+ ggml_cuda_mul_mat(src0_row, &src1_row, &dst_row);
+ }
+ } else {
+ size_t as_src1, as_dst;
+ char * src1_contiguous = (char *) ggml_cuda_pool_malloc(sizeof(float)*ggml_nelements(src1), &as_src1);
+ char * dst_contiguous = (char *) ggml_cuda_pool_malloc(sizeof(float)*ggml_nelements(dst), &as_dst);

- const struct ggml_tensor * src0_row = dst->src[row_id + 2];
+ src1_row_extra.data_device[g_main_device] = src1_contiguous;
+ dst_row_extra.data_device[g_main_device] = dst_contiguous;

- src1_row_extra.data_device[g_main_device] = (char *) src1_extra->data_device[g_main_device] + i01*src1->nb[1];
- src1_row.data = (char *) src1->data + i01*src1->nb[1];
+ const cudaMemcpyKind src1_kind = src1->backend == GGML_BACKEND_CPU ?
+ cudaMemcpyHostToDevice : cudaMemcpyDeviceToDevice;
+ const cudaMemcpyKind dst_kind = dst->backend == GGML_BACKEND_CPU ?
+ cudaMemcpyHostToDevice : cudaMemcpyDeviceToDevice;

- dst_row_extra.data_device[g_main_device] = (char *) dst_extra->data_device[g_main_device] + i01*dst->nb[1];
- dst_row.data = (char *) dst->data + i01*dst->nb[1];
+ for (int32_t row_id = 0; row_id < n_as; ++row_id) {
+ const struct ggml_tensor * src0_row = dst->src[row_id + 2];

- ggml_cuda_mul_mat(src0_row, &src1_row, &dst_row);
+ int64_t num_src1_rows = 0;
+ for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
+ const int32_t row_id_i = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]);
+
+ if (row_id_i != row_id) {
+ continue;
+ }
+
+ GGML_ASSERT(row_id >= 0 && row_id < n_as);
+
+ CUDA_CHECK(cudaMemcpyAsync(src1_contiguous + num_src1_rows*nb11, src1_original + i01*nb11,
+ nb11, src1_kind, stream));
+ num_src1_rows++;
+ }
+
+ if (num_src1_rows == 0) {
+ continue;
+ }
+
+ src1_row.ne[1] = num_src1_rows;
+ dst_row.ne[1] = num_src1_rows;
+
+ src1_row.nb[1] = nb11;
+ src1_row.nb[2] = num_src1_rows*nb11;
+ src1_row.nb[3] = num_src1_rows*nb11;
+
+ dst_row.nb[1] = nb1;
+ dst_row.nb[2] = num_src1_rows*nb1;
+ dst_row.nb[3] = num_src1_rows*nb1;
+
+ ggml_cuda_mul_mat(src0_row, &src1_row, &dst_row);
+
+ num_src1_rows = 0;
+ for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
+ const int32_t row_id_i = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]);
+
+ if (row_id_i != row_id) {
+ continue;
+ }
+
+ GGML_ASSERT(row_id >= 0 && row_id < n_as);
+
+ CUDA_CHECK(cudaMemcpyAsync(dst_original + i01*nb1, dst_contiguous + num_src1_rows*nb1,
+ nb1, dst_kind, stream));
+ num_src1_rows++;
+ }
+ }
+
+ ggml_cuda_pool_free(src1_contiguous, as_src1);
+ ggml_cuda_pool_free(dst_contiguous, as_dst);
+ }
+
+ if (dst->backend == GGML_BACKEND_CPU) {
+ CUDA_CHECK(cudaStreamSynchronize(stream));
  }
  }

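For batches with more than one row, the rewritten ggml_cuda_mul_mat_id above gathers the rows routed to each expert into a contiguous buffer, runs one matrix multiplication per expert, and scatters the results back. A simplified CPU-side sketch of that flow, with hypothetical types and helpers (row-major float rows, an `expert_ids` array, and a `matmul_rows` callback) standing in for the CUDA pool buffers and ggml_cuda_mul_mat:

    // Sketch of the per-expert gather -> matmul -> scatter pattern used above.
    #include <vector>
    #include <cstring>

    void mul_mat_id_sketch(int n_as, int n_rows, int row_size,
                           const int * expert_ids,          // expert chosen for each input row
                           const float * src, float * dst,  // n_rows x row_size each
                           void (*matmul_rows)(int expert, const float * in, float * out, int nrows)) {
        std::vector<float> in_buf ((size_t) n_rows * row_size);
        std::vector<float> out_buf((size_t) n_rows * row_size);

        for (int expert = 0; expert < n_as; ++expert) {
            // gather the rows routed to this expert into a contiguous buffer
            int n = 0;
            for (int r = 0; r < n_rows; ++r) {
                if (expert_ids[r] == expert) {
                    std::memcpy(&in_buf[(size_t) n * row_size], &src[(size_t) r * row_size],
                                row_size * sizeof(float));
                    n++;
                }
            }
            if (n == 0) {
                continue;                                    // no rows for this expert
            }

            matmul_rows(expert, in_buf.data(), out_buf.data(), n);   // one matmul per expert

            // scatter the results back to their original row positions
            n = 0;
            for (int r = 0; r < n_rows; ++r) {
                if (expert_ids[r] == expert) {
                    std::memcpy(&dst[(size_t) r * row_size], &out_buf[(size_t) n * row_size],
                                row_size * sizeof(float));
                    n++;
                }
            }
        }
    }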
@@ -8958,7 +9071,7 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {

  char * buf;
  CUDA_CHECK(cudaMalloc(&buf, size));
- char * buf_host = (char*)data + offset_split;
+ char * buf_host = (char *)data + offset_split;

  // set padding to 0 to avoid possible NaN values
  if (size > original_size) {
@@ -8980,7 +9093,7 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
  }

  void ggml_cuda_free_data(struct ggml_tensor * tensor) {
- if (!tensor || (tensor->backend != GGML_BACKEND_GPU && tensor->backend != GGML_BACKEND_GPU_SPLIT) ) {
+ if (!tensor || !tensor->extra || (tensor->backend != GGML_BACKEND_GPU && tensor->backend != GGML_BACKEND_GPU_SPLIT) ) {
  return;
  }

@@ -9103,11 +9216,10 @@ void ggml_cuda_assign_scratch_offset(struct ggml_tensor * tensor, size_t offset)

  ggml_tensor_extra_gpu * extra = ggml_cuda_alloc_temp_tensor_extra();

- const bool inplace = (tensor->src[0] != nullptr && tensor->src[0]->data == tensor->data) ||
- tensor->op == GGML_OP_VIEW;
+ const bool inplace = tensor->view_src != nullptr;

- if (inplace && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) {
- ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra;
+ if (inplace && (tensor->view_src->backend == GGML_BACKEND_GPU || tensor->view_src->backend == GGML_BACKEND_GPU_SPLIT)) {
+ ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->view_src->extra;
  char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
  size_t view_offset = 0;
  if (tensor->op == GGML_OP_VIEW) {
@@ -9187,14 +9299,14 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
  || (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT))
  || (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_GPU);

- if (!any_on_device && tensor->op != GGML_OP_MUL_MAT) {
+ if (!any_on_device && tensor->op != GGML_OP_MUL_MAT && tensor->op != GGML_OP_MUL_MAT_ID) {
  return false;
  }

  if (tensor->op == GGML_OP_MUL_MAT) {
  if (tensor->src[0]->ne[3] != tensor->src[1]->ne[3]) {
  #ifndef NDEBUG
- fprintf(stderr, "%s: cannot compute %s: src0->ne[3] = " PRId64 ", src1->ne[3] = " PRId64 " - fallback to CPU\n", __func__, tensor->name, tensor->src[0]->ne[3], tensor->src[1]->ne[3]);
+ fprintf(stderr, "%s: cannot compute %s: src0->ne[3] = %" PRId64 ", src1->ne[3] = %" PRId64 " - fallback to CPU\n", __func__, tensor->name, tensor->src[0]->ne[3], tensor->src[1]->ne[3]);
  #endif
  return false;
  }
@@ -9323,6 +9435,10 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
  return false;
  }

+ if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT) {
+ ggml_cuda_set_peer_access(tensor->src[1]->ne[1]);
+ }
+
  if (params->ith != 0) {
  return true;
  }
@@ -9396,7 +9512,7 @@ static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, g
  ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;

  if (tensor->view_src != NULL && tensor->view_offs == 0) {
- assert(tensor->view_src->buffer->buft == buffer->buft); // TODO
+ assert(tensor->view_src->buffer->buft == buffer->buft);
  tensor->backend = tensor->view_src->backend;
  tensor->extra = tensor->view_src->extra;
  return;
@@ -9427,23 +9543,34 @@
  }

  static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
- GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
- GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
  GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);

- CUDA_CHECK(cudaMemcpy((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice));
+ ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;

- UNUSED(buffer);
+ ggml_cuda_set_device(ctx->device);
+ CUDA_CHECK(cudaDeviceSynchronize());
+
+ CUDA_CHECK(cudaMemcpy((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice));
  }

  static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
- GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
- GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
  GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);

+ ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
+
+ ggml_cuda_set_device(ctx->device);
+ CUDA_CHECK(cudaDeviceSynchronize());
+
  CUDA_CHECK(cudaMemcpy(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost));
+ }

- UNUSED(buffer);
+ static void ggml_backend_cuda_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
+ ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
+
+ ggml_cuda_set_device(ctx->device);
+ CUDA_CHECK(cudaDeviceSynchronize());
+
+ CUDA_CHECK(cudaMemset(ctx->dev_ptr, value, buffer->size));
  }

  static struct ggml_backend_buffer_i cuda_backend_buffer_interface = {
@@ -9454,6 +9581,7 @@ static struct ggml_backend_buffer_i cuda_backend_buffer_interface = {
  /* .get_tensor = */ ggml_backend_cuda_buffer_get_tensor,
  /* .cpy_tensor_from = */ NULL,
  /* .cpy_tensor_to = */ NULL,
+ /* .clear = */ ggml_backend_cuda_buffer_clear,
  };

  // cuda buffer type
@@ -9505,35 +9633,36 @@ static bool ggml_backend_cuda_buffer_type_supports_backend(ggml_backend_buffer_t
  UNUSED(buft);
  }

- static ggml_backend_buffer_type_i cuda_backend_buffer_type_interface = {
+ static ggml_backend_buffer_type_i ggml_backend_cuda_buffer_type_interface = {
  /* .alloc_buffer = */ ggml_backend_cuda_buffer_type_alloc_buffer,
  /* .get_alignment = */ ggml_backend_cuda_buffer_type_get_alignment,
  /* .get_alloc_size = */ ggml_backend_cuda_buffer_type_get_alloc_size,
  /* .supports_backend = */ ggml_backend_cuda_buffer_type_supports_backend,
+ /* .is_host = */ nullptr,
  };

  ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) {
- static struct ggml_backend_buffer_type ggml_backend_buffer_type_cuda[GGML_CUDA_MAX_DEVICES];
- static bool ggml_backend_buffer_type_cuda_initialized = false;
- if (!ggml_backend_buffer_type_cuda_initialized) {
+ static struct ggml_backend_buffer_type ggml_backend_cuda_buffer_types[GGML_CUDA_MAX_DEVICES];
+
+ static bool ggml_backend_cuda_buffer_type_initialized = false;
+
+ if (!ggml_backend_cuda_buffer_type_initialized) {
  for (int i = 0; i < GGML_CUDA_MAX_DEVICES; i++) {
- ggml_backend_buffer_type_cuda[i] = {
- /* .iface = */ cuda_backend_buffer_type_interface,
+ ggml_backend_cuda_buffer_types[i] = {
+ /* .iface = */ ggml_backend_cuda_buffer_type_interface,
  /* .context = */ (ggml_backend_buffer_type_context_t) (intptr_t) i,
  };
  }
- ggml_backend_buffer_type_cuda_initialized = true;
+ ggml_backend_cuda_buffer_type_initialized = true;
  }

- return &ggml_backend_buffer_type_cuda[device];
+ return &ggml_backend_cuda_buffer_types[device];
  }

  // host buffer type

  static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
- ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
- CUDA_CHECK(cudaFreeHost(ctx->dev_ptr));
- delete ctx;
+ CUDA_CHECK(cudaFreeHost(buffer->context));
  }

  static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
@@ -9546,24 +9675,21 @@ static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggm
  buffer->iface.free_buffer = ggml_backend_cuda_host_buffer_free_buffer;

  return buffer;
-
- UNUSED(buft);
  }

- struct ggml_backend_buffer_type_i cuda_backend_host_buffer_type_interface = {
- /* .alloc_buffer = */ ggml_backend_cuda_host_buffer_type_alloc_buffer,
- /* .get_alignment = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
- /* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
- /* .supports_backend = */ ggml_backend_cpu_buffer_type()->iface.supports_backend,
- };
-
  ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() {
- static struct ggml_backend_buffer_type ggml_backend_buffer_type_cuda_host = {
- /* .iface = */ cuda_backend_host_buffer_type_interface,
+ static struct ggml_backend_buffer_type ggml_backend_cuda_buffer_type_host = {
+ /* .iface = */ {
+ /* .alloc_buffer = */ ggml_backend_cuda_host_buffer_type_alloc_buffer,
+ /* .get_alignment = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
+ /* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
+ /* .supports_backend = */ ggml_backend_cpu_buffer_type()->iface.supports_backend,
+ /* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host,
+ },
  /* .context = */ nullptr,
  };

- return &ggml_backend_buffer_type_cuda_host;
+ return &ggml_backend_cuda_buffer_type_host;
  }

  // backend
@@ -9595,8 +9721,6 @@ static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_tens
  ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;

  GGML_ASSERT(tensor->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
- GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
- GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
  GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);

  CUDA_CHECK(cudaMemcpyAsync((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice, g_cudaStreams[cuda_ctx->device][0]));
@@ -9606,8 +9730,6 @@ static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend, const ggm
  ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;

  GGML_ASSERT(tensor->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
- GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
- GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
  GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);

  CUDA_CHECK(cudaMemcpyAsync(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost, g_cudaStreams[cuda_ctx->device][0]));