llama_cpp 0.10.1 → 0.10.2

@@ -31,6 +31,7 @@
  #define CUDA_R_16F HIPBLAS_R_16F
  #define CUDA_R_32F HIPBLAS_R_32F
  #define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width)
+ #define cublasComputeType_t hipblasDatatype_t //deprecated, new hipblasComputeType_t not in 5.6
  #define cublasCreate hipblasCreate
  #define cublasGemmEx hipblasGemmEx
  #define cublasGemmBatchedEx hipblasGemmBatchedEx
@@ -40,6 +41,7 @@
  #define cublasSetStream hipblasSetStream
  #define cublasSgemm hipblasSgemm
  #define cublasStatus_t hipblasStatus_t
+ #define cudaDataType_t hipblasDatatype_t //deprecated, new hipblasDatatype not in 5.6
  #define cudaDeviceCanAccessPeer hipDeviceCanAccessPeer
  #define cudaDeviceDisablePeerAccess hipDeviceDisablePeerAccess
  #define cudaDeviceEnablePeerAccess hipDeviceEnablePeerAccess
@@ -58,8 +60,13 @@
  #define cudaGetDeviceProperties hipGetDeviceProperties
  #define cudaGetErrorString hipGetErrorString
  #define cudaGetLastError hipGetLastError
+ #ifdef GGML_HIP_UMA
+ #define cudaMalloc hipMallocManaged
+ #define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size)
+ #else
  #define cudaMalloc hipMalloc
  #define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size, hipHostMallocDefault)
+ #endif
  #define cudaMemcpy hipMemcpy
  #define cudaMemcpy2DAsync hipMemcpy2DAsync
  #define cudaMemcpyAsync hipMemcpyAsync
@@ -78,6 +85,7 @@
  #define cudaStreamWaitEvent(stream, event, flags) hipStreamWaitEvent(stream, event, flags)
  #define cudaStream_t hipStream_t
  #define cudaSuccess hipSuccess
+ #define __trap abort
  #else
  #include <cuda_runtime.h>
  #include <cublas_v2.h>
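
Note: the new GGML_HIP_UMA branch above routes device allocations through hipMallocManaged, so on AMD APUs the "device" buffers live in unified memory instead of a dedicated carve-out. A minimal, hedged sketch of that macro-shim pattern (the alloc_device helper is illustrative, not from the source):

```cpp
// Sketch of the CUDA->HIP allocation shim with the new UMA branch.
// Build with hipcc and -DGGML_HIP_UMA to route cudaMalloc through
// hipMallocManaged; otherwise it stays a plain hipMalloc. Error checking omitted.
#if defined(GGML_USE_HIPBLAS)
#include <hip/hip_runtime.h>
#ifdef GGML_HIP_UMA
#define cudaMalloc hipMallocManaged
#define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size)
#else
#define cudaMalloc hipMalloc
#define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size, hipHostMallocDefault)
#endif
#else
#include <cuda_runtime.h>
#endif

static void * alloc_device(size_t n) {   // illustrative helper
    void * ptr = nullptr;
    cudaMalloc(&ptr, n);                 // expands to hipMallocManaged under GGML_HIP_UMA
    return ptr;
}
```
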
@@ -510,6 +518,14 @@ static size_t g_scratch_offset = 0;

  static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};

+ [[noreturn]]
+ static __device__ void bad_arch() {
+ printf("ERROR: ggml-cuda was compiled without support for the current GPU architecture.\n");
+ __trap();
+
+ (void) bad_arch; // suppress unused function warning
+ }
+
  static __device__ __forceinline__ float warp_reduce_sum(float x) {
  #pragma unroll
  for (int mask = 16; mask > 0; mask >>= 1) {
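
Note: bad_arch() above is [[noreturn]] and traps on the device, so the unsupported-architecture branches below can call it instead of the old assert(false); return 0.0f; pair and no longer need a dummy return value to satisfy the compiler. A minimal sketch of the before/after pattern (vec_dot_stub and the 610 threshold stand in for the real kernels and MIN_CC_DP4A):

```cpp
// Illustrative stub, not from the source: a compute-capability fallback
// once a [[noreturn]] device trap is available.
[[noreturn]] static __device__ void bad_arch() {
    printf("ERROR: compiled without support for this GPU architecture.\n");
    __trap();                        // abort the kernel; never returns
}

static __device__ __forceinline__ float vec_dot_stub(int sumi, float d) {
#if __CUDA_ARCH__ >= 610             // stand-in for MIN_CC_DP4A
    return d * sumi;                 // fast path with dp4a-accumulated sums
#else
    bad_arch();                      // was: assert(false); return 0.0f; // only to satisfy the compiler
#endif
}
```
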
@@ -1970,8 +1986,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q4_0_q8_1_imp
  // second part effectively subtracts 8 from each quant value
  return d4 * (sumi * ds8f.x - (8*vdr/QI4_0) * ds8f.y);
  #else
- assert(false);
- return 0.0f; // only to satisfy the compiler
+ bad_arch();
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
  }

@@ -2008,8 +2023,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q4_1_q8_1_imp
  // scale second part of sum by QI8_1/(vdr * QR4_1) to compensate for multiple threads adding it
  return sumi * d4d8 + m4s8 / (QI8_1 / (vdr * QR4_1));
  #else
- assert(false);
- return 0.0f; // only to satisfy the compiler
+ bad_arch();
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
  }

@@ -2044,8 +2058,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q5_0_q8_1_imp
  // second part effectively subtracts 16 from each quant value
  return d5 * (sumi * ds8f.x - (16*vdr/QI5_0) * ds8f.y);
  #else
- assert(false);
- return 0.0f; // only to satisfy the compiler
+ bad_arch();
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
  }

@@ -2090,8 +2103,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q5_1_q8_1_imp
  return sumi*d5d8 + m5s8 / (QI5_1 / vdr);

  #else
- assert(false);
- return 0.0f; // only to satisfy the compiler
+ bad_arch();
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
  }

@@ -2112,8 +2124,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q8_0_q8_1_imp

  return d8_0*d8_1 * sumi;
  #else
- assert(false);
- return 0.0f; // only to satisfy the compiler
+ bad_arch();
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
  }

@@ -2143,8 +2154,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q8_1_q8_1_imp
  // scale second part of sum by QI8_1/ vdr to compensate for multiple threads adding it
  return sumi*d8d8 + m8s8 / (QI8_1 / vdr);
  #else
- assert(false);
- return 0.0f; // only to satisfy the compiler
+ bad_arch();
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
  }

@@ -2179,8 +2189,7 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmvq(

  return dm2f.x*sumf_d - dm2f.y*sumf_m;
  #else
- assert(false);
- return 0.0f; // only to satisfy the compiler
+ bad_arch();
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
  }

@@ -2217,8 +2226,7 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmq(

  return d8 * (dm2f.x*sumi_d - dm2f.y*sumi_m);
  #else
- assert(false);
- return 0.0f; // only to satisfy the compiler
+ bad_arch();
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
  }

@@ -2258,8 +2266,7 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmvq(

  return d3 * sumf;
  #else
- assert(false);
- return 0.0f; // only to satisfy the compiler
+ bad_arch();
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
  }

@@ -2284,8 +2291,7 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmq(

  return d3*d8 * sumi;
  #else
- assert(false);
- return 0.0f; // only to satisfy the compiler
+ bad_arch();
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
  }

@@ -2318,8 +2324,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_vmmq(
  return dm4f.x*sumf_d - dm4f.y*sumf_m;

  #else
- assert(false);
- return 0.0f; // only to satisfy the compiler
+ bad_arch();
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
  }

@@ -2352,8 +2357,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_mmq(
  return dm4f.x*sumf_d - dm4f.y*sumf_m;

  #else
- assert(false);
- return 0.0f; // only to satisfy the compiler
+ bad_arch();
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
  }

@@ -2393,8 +2397,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_vmmq(
  return dm5f.x*sumf_d - dm5f.y*sumf_m;

  #else
- assert(false);
- return 0.0f; // only to satisfy the compiler
+ bad_arch();
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
  }

@@ -2427,8 +2430,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_mmq(
  return dm4f.x*sumf_d - dm4f.y*sumf_m;

  #else
- assert(false);
- return 0.0f; // only to satisfy the compiler
+ bad_arch();
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
  }

@@ -2458,8 +2460,7 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmvq(

  return d*sumf;
  #else
- assert(false);
- return 0.0f; // only to satisfy the compiler
+ bad_arch();
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
  }

@@ -2490,8 +2491,7 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmq(
  return d6 * sumf_d;

  #else
- assert(false);
- return 0.0f; // only to satisfy the compiler
+ bad_arch();
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
  }

@@ -3357,8 +3357,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
  return dall * sumf_d - dmin * sumf_m;

  #else
- assert(false);
- return 0.0f; // only to satisfy the compiler
+ bad_arch();
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A

  #endif
@@ -3541,8 +3540,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
  return d * sumf_d;

  #else
- assert(false);
- return 0.0f; // only to satisfy the compiler
+ bad_arch();
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A

  #endif
@@ -3952,7 +3950,7 @@ template <bool need_check> static __global__ void
  (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
  #else
  (void) vec_dot_q4_0_q8_1_mul_mat;
- assert(false);
+ bad_arch();
  #endif // __CUDA_ARCH__ >= CC_VOLTA
  }

@@ -4021,7 +4019,7 @@ template <bool need_check> static __global__ void
  (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
  #else
  (void) vec_dot_q4_1_q8_1_mul_mat;
- assert(false);
+ bad_arch();
  #endif // __CUDA_ARCH__ >= CC_VOLTA
  }

@@ -4088,7 +4086,7 @@ template <bool need_check> static __global__ void
  (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
  #else
  (void) vec_dot_q5_0_q8_1_mul_mat;
- assert(false);
+ bad_arch();
  #endif // __CUDA_ARCH__ >= CC_VOLTA
  }

@@ -4155,7 +4153,7 @@ mul_mat_q5_1(
  (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
  #else
  (void) vec_dot_q5_1_q8_1_mul_mat;
- assert(false);
+ bad_arch();
  #endif // __CUDA_ARCH__ >= CC_VOLTA
  }

@@ -4222,7 +4220,7 @@ template <bool need_check> static __global__ void
  (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
  #else
  (void) vec_dot_q8_0_q8_1_mul_mat;
- assert(false);
+ bad_arch();
  #endif // __CUDA_ARCH__ >= CC_VOLTA
  }

@@ -4289,7 +4287,7 @@ mul_mat_q2_K(
  (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
  #else
  (void) vec_dot_q2_K_q8_1_mul_mat;
- assert(false);
+ bad_arch();
  #endif // __CUDA_ARCH__ >= CC_VOLTA
  }

@@ -4358,7 +4356,7 @@ template <bool need_check> static __global__ void
  (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
  #else
  (void) vec_dot_q3_K_q8_1_mul_mat;
- assert(false);
+ bad_arch();
  #endif // __CUDA_ARCH__ >= CC_VOLTA
  }

@@ -4427,7 +4425,7 @@ template <bool need_check> static __global__ void
  (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
  #else
  (void) vec_dot_q4_K_q8_1_mul_mat;
- assert(false);
+ bad_arch();
  #endif // __CUDA_ARCH__ >= CC_VOLTA
  }

@@ -4494,7 +4492,7 @@ mul_mat_q5_K(
  (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
  #else
  (void) vec_dot_q5_K_q8_1_mul_mat;
- assert(false);
+ bad_arch();
  #endif // __CUDA_ARCH__ >= CC_VOLTA
  }

@@ -4563,7 +4561,7 @@ template <bool need_check> static __global__ void
  (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
  #else
  (void) vec_dot_q6_K_q8_1_mul_mat;
- assert(false);
+ bad_arch();
  #endif // __CUDA_ARCH__ >= CC_VOLTA
  }

@@ -4998,7 +4996,16 @@ static __global__ void rope_neox(
  const int ib = col / n_dims;
  const int ic = col % n_dims;

- const int i = row*ncols + ib*n_dims + ic/2;
+ if (ib > 0) {
+ const int i = row*ncols + ib*n_dims + ic;
+
+ dst[i + 0] = x[i + 0];
+ dst[i + 1] = x[i + 1];
+
+ return;
+ }
+
+ const int i = row*ncols + ib*n_dims + ic/2;
  const int i2 = row/p_delta_rows;

  float cur_rot = inv_ndims * ic - ib;
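
Note: the rope_neox change above handles partial rotary embeddings: columns past the first n_dims of a row (ib > 0) are now copied through unchanged instead of being rotated with a wrong index. A host-side sketch of the same control flow (names and the loop structure are illustrative; the kernel does one pair per thread):

```cpp
// Host-side sketch of the rope_neox indexing fix: only the first n_dims
// columns of each row are rotated, the rest are passed through.
static void rope_neox_row_sketch(const float * x, float * dst,
                                 int ncols, int n_dims, int row) {
    for (int col = 0; col < ncols; col += 2) {
        const int ib = col / n_dims;                      // block index
        const int ic = col % n_dims;                      // index inside the block
        if (ib > 0) {                                     // outside the rotated section
            const int i = row*ncols + ib*n_dims + ic;
            dst[i + 0] = x[i + 0];                        // plain copy
            dst[i + 1] = x[i + 1];
            continue;
        }
        const int i = row*ncols + ib*n_dims + ic/2;
        dst[i]            = x[i];                         // placeholders for the rotated pair
        dst[i + n_dims/2] = x[i + n_dims/2];
    }
}
```
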
@@ -6814,6 +6821,7 @@ static void ggml_cuda_op_get_rows(
  break;
  default:
  // TODO: k-quants
+ fprintf(stderr, "%s: unsupported type: %s\n", __func__, ggml_type_name(src0->type));
  GGML_ASSERT(false);
  break;
  }
@@ -7057,6 +7065,7 @@ inline void ggml_cuda_op_upscale(

  (void) src1;
  (void) dst;
+ (void) src1_dd;
  }

  inline void ggml_cuda_op_pad(
@@ -7073,6 +7082,7 @@ inline void ggml_cuda_op_pad(

  (void) src1;
  (void) dst;
+ (void) src1_dd;
  }

  inline void ggml_cuda_op_rms_norm(
@@ -7376,7 +7386,7 @@ inline void ggml_cuda_op_mul_mat_cublas(

  const int compute_capability = g_compute_capabilities[id];

- if (compute_capability >= CC_VOLTA && (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && row_diff == src0->ne[1]) {
+ if (compute_capability >= CC_VOLTA && (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && row_diff == src0->ne[1] && dst->op_params[0] == GGML_PREC_DEFAULT) {
  // convert src0 and src1 to fp16, multiply as fp16, convert dst to fp32
  half * src0_as_f16 = nullptr;
  size_t src0_as = 0;
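
Note: the extra dst->op_params[0] == GGML_PREC_DEFAULT condition above keeps the FP16 cuBLAS path only for matmuls that have not requested higher precision. On the graph-building side that flag is normally set with ggml_mul_mat_set_prec; a hedged fragment (assumes a ggml_context ctx and tensors k, q already exist, and the ggml API of this release):

```cpp
// Fragment, not from this diff: ask for FP32 accumulation on one matmul so the
// CUDA backend skips the FP16 path gated above.
struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q);
ggml_mul_mat_set_prec(kq, GGML_PREC_F32);   // recorded in kq->op_params[0]
```
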
@@ -7690,17 +7700,9 @@ inline void ggml_cuda_op_scale(
  const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {

  GGML_ASSERT(src0->type == GGML_TYPE_F32);
- GGML_ASSERT(src1->type == GGML_TYPE_F32);
  GGML_ASSERT( dst->type == GGML_TYPE_F32);

- float scale;
- // HACK: support for ggml backend interface
- if (src1->backend == GGML_BACKEND_CPU) {
- scale = ((float *) src1->data)[0];
- } else {
- // TODO: pass pointer to kernel instead of copying to host
- CUDA_CHECK(cudaMemcpy(&scale, src1->data, sizeof(float), cudaMemcpyDeviceToHost));
- }
+ const float scale = ((float *) dst->op_params)[0];

  scale_f32_cuda(src0_dd, dst_dd, scale, ggml_nelements(src0), main_stream);
  CUDA_CHECK(cudaGetLastError());
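
Note: GGML_OP_SCALE now carries its factor in dst->op_params instead of a second tensor, so the backend reads a plain float rather than copying a scalar from host or device memory. A hedged sketch of reading a float parameter back out of op_params (the helper name is illustrative; it assumes ggml.h is on the include path):

```cpp
// op_params is a small int32 array embedded in the tensor; float parameters
// are stored by bit-copy, so memcpy them back out.
#include <cstring>
#include "ggml.h"

static float op_params_f32(const struct ggml_tensor * t, int i) {
    float v;
    memcpy(&v, (const char *) t->op_params + i*sizeof(float), sizeof(float));
    return v;
}
// usage in an op handler: const float scale = op_params_f32(dst, 0);
```
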
@@ -7747,8 +7749,6 @@ static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * s
  const bool src1_on_device = use_src1 && src1->backend == GGML_BACKEND_GPU;
  const bool dst_on_device = dst->backend == GGML_BACKEND_GPU;

- const bool src1_stays_on_host = use_src1 && dst->op == GGML_OP_SCALE;
-
  // dd = data device
  float * src0_ddf = nullptr;
  float * src1_ddf = nullptr;
@@ -7769,7 +7769,7 @@ static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * s
  CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_ddf, src0, 0, 0, 0, nrows0, main_stream));
  }

- if (use_src1 && !src1_stays_on_host) {
+ if (use_src1) {
  if (src1_on_device) {
  src1_ddf = (float *) src1_extra->data_device[g_main_device];
  } else {
@@ -7817,6 +7817,11 @@ static void ggml_cuda_set_peer_access(const int n_tokens) {
  }

  #ifdef NDEBUG
+ for (int id = 0; id < g_device_count; ++id) {
+ CUDA_CHECK(ggml_cuda_set_device(id));
+ CUDA_CHECK(cudaDeviceSynchronize());
+ }
+
  for (int id = 0; id < g_device_count; ++id) {
  CUDA_CHECK(ggml_cuda_set_device(id));

@@ -7868,8 +7873,6 @@ static void ggml_cuda_op_mul_mat(
  const int nb2 = dst->nb[2];
  const int nb3 = dst->nb[3];

- ggml_cuda_set_peer_access(ne11);
-
  GGML_ASSERT(dst->backend != GGML_BACKEND_GPU_SPLIT);
  GGML_ASSERT(src1->backend != GGML_BACKEND_GPU_SPLIT);

@@ -8300,27 +8303,27 @@ static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor
  }

  static __global__ void k_compute_batched_ptrs(
- const half * src0_as_f16, const half * src1_as_f16, half * dst_f16,
+ const half * src0_as_f16, const half * src1_as_f16, char * dst,
  const void ** ptrs_src, void ** ptrs_dst,
- int ne12, int ne13,
- int ne23,
- int nb02, int nb03,
- int nb12, int nb13,
- int nb2, int nb3,
- int r2, int r3) {
- int i13 = blockIdx.x * blockDim.x + threadIdx.x;
- int i12 = blockIdx.y * blockDim.y + threadIdx.y;
+ int64_t ne12, int64_t ne13,
+ int64_t ne23,
+ size_t nb02, size_t nb03,
+ size_t nb12, size_t nb13,
+ size_t nbd2, size_t nbd3,
+ int64_t r2, int64_t r3) {
+ int64_t i13 = blockIdx.x * blockDim.x + threadIdx.x;
+ int64_t i12 = blockIdx.y * blockDim.y + threadIdx.y;

  if (i13 >= ne13 || i12 >= ne12) {
  return;
  }

- int i03 = i13 / r3;
- int i02 = i12 / r2;
+ int64_t i03 = i13 / r3;
+ int64_t i02 = i12 / r2;

  ptrs_src[0*ne23 + i12 + i13*ne12] = (const char *) src0_as_f16 + i02*nb02 + i03*nb03;
  ptrs_src[1*ne23 + i12 + i13*ne12] = (const char *) src1_as_f16 + i12*nb12/2 + i13*nb13/2;
- ptrs_dst[0*ne23 + i12 + i13*ne12] = ( char *) dst_f16 + i12* nb2/2 + i13* nb3/2;
+ ptrs_dst[0*ne23 + i12 + i13*ne12] = ( char *) dst + i12*nbd2 + i13*nbd3;
  }

  static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
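
Note: k_compute_batched_ptrs now takes 64-bit extents and size_t byte strides, so the pointer offsets it computes for large batched tensors cannot overflow 32-bit arithmetic. A small sketch of the failure mode being avoided (the sizes are illustrative):

```cpp
// Why the kernel arguments were widened: a per-slice byte stride times a batch
// index can exceed INT_MAX, so the offset math must be done in 64-bit.
#include <climits>
#include <cstdint>
#include <cstdio>

int main() {
    const size_t  nb12 = 64ull * 1024 * 1024;   // bytes per src1 batch slice (illustrative)
    const int64_t i12  = 40;                    // batch index
    const int64_t off  = i12 * (int64_t) nb12;  // ~2.5 GiB: fine in 64-bit, too big for int
    printf("offset = %lld bytes (INT_MAX = %d)\n", (long long) off, INT_MAX);
    return 0;
}
```
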
@@ -8376,7 +8379,41 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
  to_fp16_cuda(src1_ddf, src1_as_f16, ne1, main_stream);

  size_t dst_as = 0;
- half * dst_f16 = (half *) ggml_cuda_pool_malloc(ne * sizeof(half), &dst_as);
+
+ half * dst_f16 = nullptr;
+ char * dst_t = nullptr;
+
+ cublasComputeType_t cu_compute_type = CUBLAS_COMPUTE_16F;
+ cudaDataType_t cu_data_type = CUDA_R_16F;
+
+ // dst strides
+ size_t nbd2 = dst->nb[2];
+ size_t nbd3 = dst->nb[3];
+
+ const half alpha_f16 = 1.0f;
+ const half beta_f16 = 0.0f;
+
+ const float alpha_f32 = 1.0f;
+ const float beta_f32 = 0.0f;
+
+ const void * alpha = &alpha_f16;
+ const void * beta = &beta_f16;
+
+ if (dst->op_params[0] == GGML_PREC_DEFAULT) {
+ dst_f16 = (half *) ggml_cuda_pool_malloc(ne * sizeof(half), &dst_as);
+ dst_t = (char *) dst_f16;
+
+ nbd2 /= sizeof(float) / sizeof(half);
+ nbd3 /= sizeof(float) / sizeof(half);
+ } else {
+ dst_t = (char *) dst_ddf;
+
+ cu_compute_type = CUBLAS_COMPUTE_32F;
+ cu_data_type = CUDA_R_32F;
+
+ alpha = &alpha_f32;
+ beta = &beta_f32;
+ }

  GGML_ASSERT(ne12 % ne02 == 0);
  GGML_ASSERT(ne13 % ne03 == 0);
@@ -8385,9 +8422,6 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
  const int64_t r2 = ne12/ne02;
  const int64_t r3 = ne13/ne03;

- const half alpha_f16 = 1.0f;
- const half beta_f16 = 0.0f;
-
  #if 0
  // use cublasGemmEx
  {
@@ -8397,12 +8431,12 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
  int i02 = i12 / r2;

  CUBLAS_CHECK(
- cublasGemmEx(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
+ cublasGemmEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N,
  ne01, ne11, ne10,
- &alpha_f16, (const char *) src0_as_f16 + i02*src0->nb[2] + i03*src0->nb[3] , CUDA_R_16F, nb01/sizeof(half),
- (const char *) src1_as_f16 + i12*src1->nb[2]/2 + i13*src1->nb[3]/2, CUDA_R_16F, nb11/sizeof(float),
- &beta_f16, ( char *) dst_f16 + i12* dst->nb[2]/2 + i13* dst->nb[3]/2, CUDA_R_16F, ne01,
- CUBLAS_COMPUTE_16F,
+ alpha, (const char *) src0_as_f16 + i02*src0->nb[2] + i03*src0->nb[3] , CUDA_R_16F, nb01/sizeof(half),
+ (const char *) src1_as_f16 + i12*src1->nb[2]/2 + i13*src1->nb[3]/2, CUDA_R_16F, nb11/sizeof(float),
+ beta, ( char *) dst_t + i12*nbd2 + i13*nbd3, cu_data_type, ne01,
+ cu_compute_type,
  CUBLAS_GEMM_DEFAULT_TENSOR_OP));
  }
  }
@@ -8414,11 +8448,11 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
  CUBLAS_CHECK(
  cublasGemmStridedBatchedEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N,
  ne01, ne11, ne10,
- &alpha_f16, (const char *) src0_as_f16, CUDA_R_16F, nb01/sizeof(half), src0->nb[2]/sizeof(half), // strideA
- (const char *) src1_as_f16, CUDA_R_16F, nb11/sizeof(float), src1->nb[2]/sizeof(float), // strideB
- &beta_f16, ( char *) dst_f16, CUDA_R_16F, ne01, dst->nb[2]/sizeof(float), // strideC
+ alpha, (const char *) src0_as_f16, CUDA_R_16F, nb01/sizeof(half), src0->nb[2]/sizeof(half), // strideA
+ (const char *) src1_as_f16, CUDA_R_16F, nb11/sizeof(float), src1->nb[2]/sizeof(float), // strideB
+ beta, ( char *) dst_t, cu_data_type, ne01, dst->nb[2]/sizeof(float), // strideC
  ne12*ne13,
- CUBLAS_COMPUTE_16F,
+ cu_compute_type,
  CUBLAS_GEMM_DEFAULT_TENSOR_OP));
  } else {
  // use cublasGemmBatchedEx
@@ -8435,24 +8469,24 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const

  dim3 block_dims(ne13, ne12);
  k_compute_batched_ptrs<<<1, block_dims, 0, main_stream>>>(
- src0_as_f16, src1_as_f16, dst_f16,
+ src0_as_f16, src1_as_f16, dst_t,
  ptrs_src, ptrs_dst,
  ne12, ne13,
  ne23,
  nb02, nb03,
  nb12, nb13,
- dst->nb[2], dst->nb[3],
+ nbd2, nbd3,
  r2, r3);
  CUDA_CHECK(cudaGetLastError());

  CUBLAS_CHECK(
  cublasGemmBatchedEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N,
  ne01, ne11, ne10,
- &alpha_f16, (const void **) (ptrs_src + 0*ne23), CUDA_R_16F, nb01/sizeof(half),
- (const void **) (ptrs_src + 1*ne23), CUDA_R_16F, nb11/sizeof(float),
- &beta_f16, ( void **) (ptrs_dst + 0*ne23), CUDA_R_16F, ne01,
+ alpha, (const void **) (ptrs_src + 0*ne23), CUDA_R_16F, nb01/sizeof(half),
+ (const void **) (ptrs_src + 1*ne23), CUDA_R_16F, nb11/sizeof(float),
+ beta, ( void **) (ptrs_dst + 0*ne23), cu_data_type, ne01,
  ne23,
- CUBLAS_COMPUTE_16F,
+ cu_compute_type,
  CUBLAS_GEMM_DEFAULT_TENSOR_OP));

  if (ptrs_src_s != 0) {
@@ -8464,11 +8498,14 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
  }
  #endif

- const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
- to_fp32_cuda(dst_f16, dst_ddf, ne, main_stream);
+ if (dst->op_params[0] == GGML_PREC_DEFAULT) {
+ const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
+ to_fp32_cuda(dst_f16, dst_ddf, ne, main_stream);
+
+ ggml_cuda_pool_free(dst_f16, dst_as);
+ }

  ggml_cuda_pool_free(src1_as_f16, src1_as);
- ggml_cuda_pool_free(dst_f16, dst_as);
  }

  static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
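
Note: the batched cuBLAS path now either writes an FP16 scratch buffer and converts it to FP32 afterwards (GGML_PREC_DEFAULT) or writes the FP32 destination directly. The cublasGemmEx family takes the scaling factors, output type, and compute type as runtime arguments, so one call site serves both precisions. A compact, hedged sketch (standalone helper, illustrative layout with both inputs FP16 and leading dimensions k and m):

```cpp
// Sketch of the precision switch: alpha/beta must match the compute type
// (half for CUBLAS_COMPUTE_16F, float for CUBLAS_COMPUTE_32F).
#include <cublas_v2.h>
#include <cuda_fp16.h>

void gemm_f16_inputs(cublasHandle_t handle, const half * A, const half * B, void * C,
                     int m, int n, int k, bool fp32_out) {
    const half  alpha_f16 = 1.0f, beta_f16 = 0.0f;
    const float alpha_f32 = 1.0f, beta_f32 = 0.0f;

    const void * alpha = fp32_out ? (const void *) &alpha_f32 : (const void *) &alpha_f16;
    const void * beta  = fp32_out ? (const void *) &beta_f32  : (const void *) &beta_f16;
    const cudaDataType_t      c_type  = fp32_out ? CUDA_R_32F : CUDA_R_16F;
    const cublasComputeType_t compute = fp32_out ? CUBLAS_COMPUTE_32F : CUBLAS_COMPUTE_16F;

    cublasGemmEx(handle, CUBLAS_OP_T, CUBLAS_OP_N, m, n, k,
                 alpha, A, CUDA_R_16F, k,
                        B, CUDA_R_16F, k,
                 beta,  C, c_type,     m,
                 compute, CUBLAS_GEMM_DEFAULT_TENSOR_OP);
}
```
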
@@ -8732,7 +8769,8 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s
  // TODO: mmq/mmv support
  #endif

- GGML_ASSERT(dst->backend == GGML_BACKEND_GPU);
+ const int64_t nb11 = src1->nb[1];
+ const int64_t nb1 = dst->nb[1];

  const struct ggml_tensor * ids = src0;
  const int32_t id = ((int32_t *) dst->op_params)[0];
@@ -8740,10 +8778,12 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s

  std::vector<char> ids_host(ggml_nbytes(ids));

+ const cudaStream_t stream = g_cudaStreams[g_main_device][0];
+
  if (ids->backend == GGML_BACKEND_GPU) {
  const char * ids_dev = (const char *)((const ggml_tensor_extra_gpu *)ids->extra)->data_device[g_main_device];
- CUDA_CHECK(cudaMemcpyAsync(ids_host.data(), ids_dev, ggml_nbytes(ids), cudaMemcpyDeviceToHost, g_cudaStreams[g_main_device][0]));
- CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[g_main_device][0]));
+ CUDA_CHECK(cudaMemcpyAsync(ids_host.data(), ids_dev, ggml_nbytes(ids), cudaMemcpyDeviceToHost, stream));
+ CUDA_CHECK(cudaStreamSynchronize(stream));
  } else {
  memcpy(ids_host.data(), ids->data, ggml_nbytes(ids));
  }
@@ -8757,37 +8797,110 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s
  ggml_tensor src1_row = *src1;
  ggml_tensor dst_row = *dst;

- src1_row.ne[1] = 1;
- dst_row.ne[1] = 1;
-
- src1_row.nb[2] = src1_row.nb[1];
- dst_row.nb[2] = dst_row.nb[1];
-
- src1_row.nb[3] = src1_row.nb[1];
- dst_row.nb[3] = dst_row.nb[1];
+ src1_row.backend = GGML_BACKEND_GPU;
+ dst_row.backend = GGML_BACKEND_GPU;

  src1_row.extra = &src1_row_extra;
  dst_row.extra = &dst_row_extra;

+ char * src1_original = src1->backend == GGML_BACKEND_CPU ?
+ (char *) src1->data : (char *) src1_extra->data_device[g_main_device];
+ char * dst_original = dst->backend == GGML_BACKEND_CPU ?
+ (char *) dst->data : (char *) dst_extra->data_device[g_main_device];
+
+ if (src1->ne[1] == 1) {
+ GGML_ASSERT(src1->backend == GGML_BACKEND_GPU);
+ GGML_ASSERT(dst->backend == GGML_BACKEND_GPU);
+
+ for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
+ //int32_t row_id;
+ //CUDA_CHECK(cudaMemcpyAsync(&row_id, ids_dev + i01*ids->nb[1] + id*ids->nb[0], sizeof(int32_t), cudaMemcpyDeviceToHost, g_cudaStreams[g_main_device][0]));
+ //CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[g_main_device][0]));
+
+ const int32_t row_id = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]);

- for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
- //int32_t row_id;
- //CUDA_CHECK(cudaMemcpyAsync(&row_id, ids_dev + i01*ids->nb[1] + id*ids->nb[0], sizeof(int32_t), cudaMemcpyDeviceToHost, g_cudaStreams[g_main_device][0]));
- //CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[g_main_device][0]));
+ GGML_ASSERT(row_id >= 0 && row_id < n_as);

- const int32_t row_id = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]);
+ const struct ggml_tensor * src0_row = dst->src[row_id + 2];

- GGML_ASSERT(row_id >= 0 && row_id < n_as);
+ src1_row_extra.data_device[g_main_device] = src1_original + i01*src1->nb[1];
+ src1_row.data = (char *) src1->data + i01*src1->nb[1]; // TODO why is this set?
+
+ dst_row_extra.data_device[g_main_device] = dst_original + i01*dst->nb[1];
+ dst_row.data = (char *) dst->data + i01*dst->nb[1]; // TODO why is this set?
+
+ ggml_cuda_mul_mat(src0_row, &src1_row, &dst_row);
+ }
+ } else {
+ size_t as_src1, as_dst;
+ char * src1_contiguous = (char *) ggml_cuda_pool_malloc(sizeof(float)*ggml_nelements(src1), &as_src1);
+ char * dst_contiguous = (char *) ggml_cuda_pool_malloc(sizeof(float)*ggml_nelements(dst), &as_dst);

- const struct ggml_tensor * src0_row = dst->src[row_id + 2];
+ src1_row_extra.data_device[g_main_device] = src1_contiguous;
+ dst_row_extra.data_device[g_main_device] = dst_contiguous;

- src1_row_extra.data_device[g_main_device] = (char *) src1_extra->data_device[g_main_device] + i01*src1->nb[1];
- src1_row.data = (char *) src1->data + i01*src1->nb[1];
+ const cudaMemcpyKind src1_kind = src1->backend == GGML_BACKEND_CPU ?
+ cudaMemcpyHostToDevice : cudaMemcpyDeviceToDevice;
+ const cudaMemcpyKind dst_kind = dst->backend == GGML_BACKEND_CPU ?
+ cudaMemcpyHostToDevice : cudaMemcpyDeviceToDevice;

- dst_row_extra.data_device[g_main_device] = (char *) dst_extra->data_device[g_main_device] + i01*dst->nb[1];
- dst_row.data = (char *) dst->data + i01*dst->nb[1];
+ for (int32_t row_id = 0; row_id < n_as; ++row_id) {
+ const struct ggml_tensor * src0_row = dst->src[row_id + 2];

- ggml_cuda_mul_mat(src0_row, &src1_row, &dst_row);
+ int64_t num_src1_rows = 0;
+ for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
+ const int32_t row_id_i = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]);
+
+ if (row_id_i != row_id) {
+ continue;
+ }
+
+ GGML_ASSERT(row_id >= 0 && row_id < n_as);
+
+ CUDA_CHECK(cudaMemcpyAsync(src1_contiguous + num_src1_rows*nb11, src1_original + i01*nb11,
+ nb11, src1_kind, stream));
+ num_src1_rows++;
+ }
+
+ if (num_src1_rows == 0) {
+ continue;
+ }
+
+ src1_row.ne[1] = num_src1_rows;
+ dst_row.ne[1] = num_src1_rows;
+
+ src1_row.nb[1] = nb11;
+ src1_row.nb[2] = num_src1_rows*nb11;
+ src1_row.nb[3] = num_src1_rows*nb11;
+
+ dst_row.nb[1] = nb1;
+ dst_row.nb[2] = num_src1_rows*nb1;
+ dst_row.nb[3] = num_src1_rows*nb1;
+
+ ggml_cuda_mul_mat(src0_row, &src1_row, &dst_row);
+
+ num_src1_rows = 0;
+ for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
+ const int32_t row_id_i = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]);
+
+ if (row_id_i != row_id) {
+ continue;
+ }
+
+ GGML_ASSERT(row_id >= 0 && row_id < n_as);
+
+ CUDA_CHECK(cudaMemcpyAsync(dst_original + i01*nb1, dst_contiguous + num_src1_rows*nb1,
+ nb1, dst_kind, stream));
+ num_src1_rows++;
+ }
+ }
+
+ ggml_cuda_pool_free(src1_contiguous, as_src1);
+ ggml_cuda_pool_free(dst_contiguous, as_dst);
+ }
+
+ if (dst->backend == GGML_BACKEND_CPU) {
+ CUDA_CHECK(cudaStreamSynchronize(stream));
  }
  }
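
Note: for batches with more than one src1 row, ggml_cuda_mul_mat_id above now gathers the rows routed to each expert into a contiguous scratch buffer, runs one matmul per expert, and scatters the results back to their original positions. A host-side sketch of that gather/compute/scatter flow (plain C++, illustrative names; ids[i] is the expert chosen for row i):

```cpp
#include <cstdint>
#include <vector>

// Per-expert gather -> matmul -> scatter, mirroring the new ne[1] > 1 branch.
void mul_mat_id_sketch(int n_as, const std::vector<int32_t> & ids,
                       const std::vector<std::vector<float>> & src1_rows,
                       std::vector<std::vector<float>> & dst_rows) {
    for (int32_t expert = 0; expert < n_as; ++expert) {
        std::vector<size_t> picked;                       // rows routed to this expert
        for (size_t i = 0; i < ids.size(); ++i) {
            if (ids[i] == expert) picked.push_back(i);
        }
        if (picked.empty()) continue;                     // idle expert: nothing to do

        // gather into a contiguous batch (src1_contiguous in the diff)
        std::vector<std::vector<float>> batch;
        for (size_t i : picked) batch.push_back(src1_rows[i]);

        // ... one matmul of the whole batch against this expert's weights ...

        // scatter results back (dst_contiguous -> dst in the diff)
        for (size_t k = 0; k < picked.size(); ++k) {
            dst_rows[picked[k]] = batch[k];               // placeholder for the matmul output
        }
    }
}
```
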
@@ -8958,7 +9071,7 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {

  char * buf;
  CUDA_CHECK(cudaMalloc(&buf, size));
- char * buf_host = (char*)data + offset_split;
+ char * buf_host = (char *)data + offset_split;

  // set padding to 0 to avoid possible NaN values
  if (size > original_size) {
@@ -8980,7 +9093,7 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
  }

  void ggml_cuda_free_data(struct ggml_tensor * tensor) {
- if (!tensor || (tensor->backend != GGML_BACKEND_GPU && tensor->backend != GGML_BACKEND_GPU_SPLIT) ) {
+ if (!tensor || !tensor->extra || (tensor->backend != GGML_BACKEND_GPU && tensor->backend != GGML_BACKEND_GPU_SPLIT) ) {
  return;
  }

@@ -9103,11 +9216,10 @@ void ggml_cuda_assign_scratch_offset(struct ggml_tensor * tensor, size_t offset)

  ggml_tensor_extra_gpu * extra = ggml_cuda_alloc_temp_tensor_extra();

- const bool inplace = (tensor->src[0] != nullptr && tensor->src[0]->data == tensor->data) ||
- tensor->op == GGML_OP_VIEW;
+ const bool inplace = tensor->view_src != nullptr;

- if (inplace && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) {
- ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra;
+ if (inplace && (tensor->view_src->backend == GGML_BACKEND_GPU || tensor->view_src->backend == GGML_BACKEND_GPU_SPLIT)) {
+ ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->view_src->extra;
  char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
  size_t view_offset = 0;
  if (tensor->op == GGML_OP_VIEW) {
@@ -9187,14 +9299,14 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
  || (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT))
  || (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_GPU);

- if (!any_on_device && tensor->op != GGML_OP_MUL_MAT) {
+ if (!any_on_device && tensor->op != GGML_OP_MUL_MAT && tensor->op != GGML_OP_MUL_MAT_ID) {
  return false;
  }

  if (tensor->op == GGML_OP_MUL_MAT) {
  if (tensor->src[0]->ne[3] != tensor->src[1]->ne[3]) {
  #ifndef NDEBUG
- fprintf(stderr, "%s: cannot compute %s: src0->ne[3] = " PRId64 ", src1->ne[3] = " PRId64 " - fallback to CPU\n", __func__, tensor->name, tensor->src[0]->ne[3], tensor->src[1]->ne[3]);
+ fprintf(stderr, "%s: cannot compute %s: src0->ne[3] = %" PRId64 ", src1->ne[3] = %" PRId64 " - fallback to CPU\n", __func__, tensor->name, tensor->src[0]->ne[3], tensor->src[1]->ne[3]);
  #endif
  return false;
  }
@@ -9323,6 +9435,10 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
  return false;
  }

+ if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT) {
+ ggml_cuda_set_peer_access(tensor->src[1]->ne[1]);
+ }
+
  if (params->ith != 0) {
  return true;
  }
@@ -9396,7 +9512,7 @@ static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, g
  ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;

  if (tensor->view_src != NULL && tensor->view_offs == 0) {
- assert(tensor->view_src->buffer->buft == buffer->buft); // TODO
+ assert(tensor->view_src->buffer->buft == buffer->buft);
  tensor->backend = tensor->view_src->backend;
  tensor->extra = tensor->view_src->extra;
  return;
@@ -9427,23 +9543,34 @@ static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, g
  }

  static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
- GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
- GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
  GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);

- CUDA_CHECK(cudaMemcpy((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice));
+ ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;

- UNUSED(buffer);
+ ggml_cuda_set_device(ctx->device);
+ CUDA_CHECK(cudaDeviceSynchronize());
+
+ CUDA_CHECK(cudaMemcpy((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice));
  }

  static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
- GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
- GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
  GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);

+ ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
+
+ ggml_cuda_set_device(ctx->device);
+ CUDA_CHECK(cudaDeviceSynchronize());
+
  CUDA_CHECK(cudaMemcpy(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost));
+ }

- UNUSED(buffer);
+ static void ggml_backend_cuda_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
+ ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
+
+ ggml_cuda_set_device(ctx->device);
+ CUDA_CHECK(cudaDeviceSynchronize());
+
+ CUDA_CHECK(cudaMemset(ctx->dev_ptr, value, buffer->size));
  }

  static struct ggml_backend_buffer_i cuda_backend_buffer_interface = {
@@ -9454,6 +9581,7 @@ static struct ggml_backend_buffer_i cuda_backend_buffer_interface = {
  /* .get_tensor = */ ggml_backend_cuda_buffer_get_tensor,
  /* .cpy_tensor_from = */ NULL,
  /* .cpy_tensor_to = */ NULL,
+ /* .clear = */ ggml_backend_cuda_buffer_clear,
  };

  // cuda buffer type
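
Note: the backend buffer interface gains a .clear callback, implemented by selecting the buffer's device, synchronizing, and memsetting the whole allocation. A minimal standalone sketch (DeviceBuffer is an illustrative stand-in for the backend's buffer context, not the ggml API):

```cpp
#include <cuda_runtime.h>
#include <cstdint>

struct DeviceBuffer {
    int    device;
    void * dev_ptr;
    size_t size;
};

// Clear a device buffer the way the new callback does. Error checking omitted.
static void buffer_clear(DeviceBuffer & buf, uint8_t value) {
    cudaSetDevice(buf.device);      // operate on the buffer's device
    cudaDeviceSynchronize();        // wait for in-flight work touching the buffer
    cudaMemset(buf.dev_ptr, value, buf.size);
}
```
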
@@ -9505,35 +9633,36 @@ static bool ggml_backend_cuda_buffer_type_supports_backend(ggml_backend_buffer_t
  UNUSED(buft);
  }

- static ggml_backend_buffer_type_i cuda_backend_buffer_type_interface = {
+ static ggml_backend_buffer_type_i ggml_backend_cuda_buffer_type_interface = {
  /* .alloc_buffer = */ ggml_backend_cuda_buffer_type_alloc_buffer,
  /* .get_alignment = */ ggml_backend_cuda_buffer_type_get_alignment,
  /* .get_alloc_size = */ ggml_backend_cuda_buffer_type_get_alloc_size,
  /* .supports_backend = */ ggml_backend_cuda_buffer_type_supports_backend,
+ /* .is_host = */ nullptr,
  };

  ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) {
- static struct ggml_backend_buffer_type ggml_backend_buffer_type_cuda[GGML_CUDA_MAX_DEVICES];
- static bool ggml_backend_buffer_type_cuda_initialized = false;
- if (!ggml_backend_buffer_type_cuda_initialized) {
+ static struct ggml_backend_buffer_type ggml_backend_cuda_buffer_types[GGML_CUDA_MAX_DEVICES];
+
+ static bool ggml_backend_cuda_buffer_type_initialized = false;
+
+ if (!ggml_backend_cuda_buffer_type_initialized) {
  for (int i = 0; i < GGML_CUDA_MAX_DEVICES; i++) {
- ggml_backend_buffer_type_cuda[i] = {
- /* .iface = */ cuda_backend_buffer_type_interface,
+ ggml_backend_cuda_buffer_types[i] = {
+ /* .iface = */ ggml_backend_cuda_buffer_type_interface,
  /* .context = */ (ggml_backend_buffer_type_context_t) (intptr_t) i,
  };
  }
- ggml_backend_buffer_type_cuda_initialized = true;
+ ggml_backend_cuda_buffer_type_initialized = true;
  }

- return &ggml_backend_buffer_type_cuda[device];
+ return &ggml_backend_cuda_buffer_types[device];
  }

  // host buffer type

  static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
- ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
- CUDA_CHECK(cudaFreeHost(ctx->dev_ptr));
- delete ctx;
+ CUDA_CHECK(cudaFreeHost(buffer->context));
  }

  static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
@@ -9546,24 +9675,21 @@ static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggm
  buffer->iface.free_buffer = ggml_backend_cuda_host_buffer_free_buffer;

  return buffer;
-
- UNUSED(buft);
  }

- struct ggml_backend_buffer_type_i cuda_backend_host_buffer_type_interface = {
- /* .alloc_buffer = */ ggml_backend_cuda_host_buffer_type_alloc_buffer,
- /* .get_alignment = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
- /* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
- /* .supports_backend = */ ggml_backend_cpu_buffer_type()->iface.supports_backend,
- };
-
  ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() {
- static struct ggml_backend_buffer_type ggml_backend_buffer_type_cuda_host = {
- /* .iface = */ cuda_backend_host_buffer_type_interface,
+ static struct ggml_backend_buffer_type ggml_backend_cuda_buffer_type_host = {
+ /* .iface = */ {
+ /* .alloc_buffer = */ ggml_backend_cuda_host_buffer_type_alloc_buffer,
+ /* .get_alignment = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
+ /* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
+ /* .supports_backend = */ ggml_backend_cpu_buffer_type()->iface.supports_backend,
+ /* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host,
+ },
  /* .context = */ nullptr,
  };

- return &ggml_backend_buffer_type_cuda_host;
+ return &ggml_backend_cuda_buffer_type_host;
  }

  // backend
@@ -9595,8 +9721,6 @@ static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_tens
  ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;

  GGML_ASSERT(tensor->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
- GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
- GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
  GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);

  CUDA_CHECK(cudaMemcpyAsync((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice, g_cudaStreams[cuda_ctx->device][0]));
@@ -9606,8 +9730,6 @@ static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend, const ggm
  ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;

  GGML_ASSERT(tensor->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
- GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
- GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
  GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);

  CUDA_CHECK(cudaMemcpyAsync(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost, g_cudaStreams[cuda_ctx->device][0]));