llama_cpp 0.10.1 → 0.10.2
- checksums.yaml +4 -4
- data/CHANGELOG.md +6 -0
- data/ext/llama_cpp/llama_cpp.cpp +16 -1
- data/ext/llama_cpp/src/ggml-alloc.c +12 -4
- data/ext/llama_cpp/src/ggml-backend-impl.h +12 -8
- data/ext/llama_cpp/src/ggml-backend.c +75 -5
- data/ext/llama_cpp/src/ggml-backend.h +7 -0
- data/ext/llama_cpp/src/ggml-cuda.cu +284 -162
- data/ext/llama_cpp/src/ggml-metal.h +3 -0
- data/ext/llama_cpp/src/ggml-metal.m +190 -44
- data/ext/llama_cpp/src/ggml-metal.metal +11 -2
- data/ext/llama_cpp/src/ggml.c +262 -89
- data/ext/llama_cpp/src/ggml.h +24 -10
- data/ext/llama_cpp/src/llama.cpp +926 -780
- data/ext/llama_cpp/src/llama.h +8 -3
- data/lib/llama_cpp/version.rb +2 -2
- metadata +2 -2
data/ext/llama_cpp/src/ggml-cuda.cu

@@ -31,6 +31,7 @@
 #define CUDA_R_16F HIPBLAS_R_16F
 #define CUDA_R_32F HIPBLAS_R_32F
 #define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width)
+#define cublasComputeType_t hipblasDatatype_t //deprecated, new hipblasComputeType_t not in 5.6
 #define cublasCreate hipblasCreate
 #define cublasGemmEx hipblasGemmEx
 #define cublasGemmBatchedEx hipblasGemmBatchedEx
@@ -40,6 +41,7 @@
 #define cublasSetStream hipblasSetStream
 #define cublasSgemm hipblasSgemm
 #define cublasStatus_t hipblasStatus_t
+#define cudaDataType_t hipblasDatatype_t //deprecated, new hipblasDatatype not in 5.6
 #define cudaDeviceCanAccessPeer hipDeviceCanAccessPeer
 #define cudaDeviceDisablePeerAccess hipDeviceDisablePeerAccess
 #define cudaDeviceEnablePeerAccess hipDeviceEnablePeerAccess
@@ -58,8 +60,13 @@
 #define cudaGetDeviceProperties hipGetDeviceProperties
 #define cudaGetErrorString hipGetErrorString
 #define cudaGetLastError hipGetLastError
+#ifdef GGML_HIP_UMA
+#define cudaMalloc hipMallocManaged
+#define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size)
+#else
 #define cudaMalloc hipMalloc
 #define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size, hipHostMallocDefault)
+#endif
 #define cudaMemcpy hipMemcpy
 #define cudaMemcpy2DAsync hipMemcpy2DAsync
 #define cudaMemcpyAsync hipMemcpyAsync
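The GGML_HIP_UMA branch above routes cudaMalloc to hipMallocManaged so that, on systems with unified memory, ggml's "device" allocations become host-visible managed memory. A minimal sketch of the same compile-time macro-mapping idea, assuming a HIP toolchain; only the GGML_HIP_UMA name is taken from the diff, the rest is illustrative:

    // Build with hipcc; defining GGML_HIP_UMA selects managed (unified) allocations.
    #include <hip/hip_runtime.h>

    #ifdef GGML_HIP_UMA
    #define cudaMalloc hipMallocManaged   // host-visible, migrated on demand
    #else
    #define cudaMalloc hipMalloc          // plain device allocation
    #endif
    #define cudaFree    hipFree
    #define cudaSuccess hipSuccess

    int main() {
        float * buf = nullptr;
        // The CUDA-named call below expands to the HIP call chosen above.
        if (cudaMalloc((void **) &buf, 1024 * sizeof(float)) != cudaSuccess) {
            return 1;
        }
        cudaFree(buf);
        return 0;
    }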
@@ -78,6 +85,7 @@
 #define cudaStreamWaitEvent(stream, event, flags) hipStreamWaitEvent(stream, event, flags)
 #define cudaStream_t hipStream_t
 #define cudaSuccess hipSuccess
+#define __trap abort
 #else
 #include <cuda_runtime.h>
 #include <cublas_v2.h>
@@ -510,6 +518,14 @@ static size_t g_scratch_offset = 0;
 
 static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};
 
+[[noreturn]]
+static __device__ void bad_arch() {
+    printf("ERROR: ggml-cuda was compiled without support for the current GPU architecture.\n");
+    __trap();
+
+    (void) bad_arch; // suppress unused function warning
+}
+
 static __device__ __forceinline__ float warp_reduce_sum(float x) {
 #pragma unroll
     for (int mask = 16; mask > 0; mask >>= 1) {
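The bad_arch() helper added above is marked [[noreturn]] and ends in __trap() (aliased to abort on HIP), which is what lets the unsupported-architecture branches later in this diff drop their placeholder "return 0.0f;" lines. A standalone sketch of the pattern in plain CUDA; unsupported_arch, dot_dp4a and k are invented names, not from ggml:

    #include <cstdio>

    [[noreturn]] static __device__ void unsupported_arch() {
        printf("unsupported GPU architecture\n");
        __trap(); // abort the kernel; no value ever needs to be returned
    }

    static __device__ float dot_dp4a(int a, int b) {
    #if __CUDA_ARCH__ >= 610
        return (float) __dp4a(a, b, 0);  // DP4A fast path on sm_61+
    #else
        unsupported_arch();              // [[noreturn]]: no dummy return required
    #endif
    }

    __global__ void k(const int * a, const int * b, float * out) {
        out[threadIdx.x] = dot_dp4a(a[threadIdx.x], b[threadIdx.x]);
    }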
@@ -1970,8 +1986,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q4_0_q8_1_imp
     // second part effectively subtracts 8 from each quant value
     return d4 * (sumi * ds8f.x - (8*vdr/QI4_0) * ds8f.y);
 #else
-
-    return 0.0f; // only to satisfy the compiler
+    bad_arch();
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
@@ -2008,8 +2023,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q4_1_q8_1_imp
     // scale second part of sum by QI8_1/(vdr * QR4_1) to compensate for multiple threads adding it
     return sumi * d4d8 + m4s8 / (QI8_1 / (vdr * QR4_1));
 #else
-
-    return 0.0f; // only to satisfy the compiler
+    bad_arch();
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
@@ -2044,8 +2058,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q5_0_q8_1_imp
     // second part effectively subtracts 16 from each quant value
     return d5 * (sumi * ds8f.x - (16*vdr/QI5_0) * ds8f.y);
 #else
-
-    return 0.0f; // only to satisfy the compiler
+    bad_arch();
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
@@ -2090,8 +2103,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q5_1_q8_1_imp
     return sumi*d5d8 + m5s8 / (QI5_1 / vdr);
 
 #else
-
-    return 0.0f; // only to satisfy the compiler
+    bad_arch();
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
@@ -2112,8 +2124,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q8_0_q8_1_imp
 
     return d8_0*d8_1 * sumi;
 #else
-
-    return 0.0f; // only to satisfy the compiler
+    bad_arch();
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
@@ -2143,8 +2154,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q8_1_q8_1_imp
     // scale second part of sum by QI8_1/ vdr to compensate for multiple threads adding it
     return sumi*d8d8 + m8s8 / (QI8_1 / vdr);
 #else
-
-    return 0.0f; // only to satisfy the compiler
+    bad_arch();
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
@@ -2179,8 +2189,7 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmvq(
 
     return dm2f.x*sumf_d - dm2f.y*sumf_m;
 #else
-
-    return 0.0f; // only to satisfy the compiler
+    bad_arch();
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
@@ -2217,8 +2226,7 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmq(
 
     return d8 * (dm2f.x*sumi_d - dm2f.y*sumi_m);
 #else
-
-    return 0.0f; // only to satisfy the compiler
+    bad_arch();
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
@@ -2258,8 +2266,7 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmvq(
 
     return d3 * sumf;
 #else
-
-    return 0.0f; // only to satisfy the compiler
+    bad_arch();
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
@@ -2284,8 +2291,7 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmq(
 
     return d3*d8 * sumi;
 #else
-
-    return 0.0f; // only to satisfy the compiler
+    bad_arch();
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
@@ -2318,8 +2324,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_vmmq(
     return dm4f.x*sumf_d - dm4f.y*sumf_m;
 
 #else
-
-    return 0.0f; // only to satisfy the compiler
+    bad_arch();
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
@@ -2352,8 +2357,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_mmq(
     return dm4f.x*sumf_d - dm4f.y*sumf_m;
 
 #else
-
-    return 0.0f; // only to satisfy the compiler
+    bad_arch();
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
@@ -2393,8 +2397,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_vmmq(
     return dm5f.x*sumf_d - dm5f.y*sumf_m;
 
 #else
-
-    return 0.0f; // only to satisfy the compiler
+    bad_arch();
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
@@ -2427,8 +2430,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_mmq(
     return dm4f.x*sumf_d - dm4f.y*sumf_m;
 
 #else
-
-    return 0.0f; // only to satisfy the compiler
+    bad_arch();
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
@@ -2458,8 +2460,7 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmvq(
 
     return d*sumf;
 #else
-
-    return 0.0f; // only to satisfy the compiler
+    bad_arch();
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
@@ -2490,8 +2491,7 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmq(
     return d6 * sumf_d;
 
 #else
-
-    return 0.0f; // only to satisfy the compiler
+    bad_arch();
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
@@ -3357,8 +3357,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
     return dall * sumf_d - dmin * sumf_m;
 
 #else
-
-    return 0.0f; // only to satisfy the compiler
+    bad_arch();
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 
 #endif
@@ -3541,8 +3540,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
     return d * sumf_d;
 
 #else
-
-    return 0.0f; // only to satisfy the compiler
+    bad_arch();
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 
 #endif
@@ -3952,7 +3950,7 @@ template <bool need_check> static __global__ void
         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 #else
     (void) vec_dot_q4_0_q8_1_mul_mat;
-
+    bad_arch();
 #endif // __CUDA_ARCH__ >= CC_VOLTA
 }
 
@@ -4021,7 +4019,7 @@ template <bool need_check> static __global__ void
         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 #else
     (void) vec_dot_q4_1_q8_1_mul_mat;
-
+    bad_arch();
 #endif // __CUDA_ARCH__ >= CC_VOLTA
 }
 
@@ -4088,7 +4086,7 @@ template <bool need_check> static __global__ void
         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 #else
     (void) vec_dot_q5_0_q8_1_mul_mat;
-
+    bad_arch();
 #endif // __CUDA_ARCH__ >= CC_VOLTA
 }
 
@@ -4155,7 +4153,7 @@ mul_mat_q5_1(
         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 #else
     (void) vec_dot_q5_1_q8_1_mul_mat;
-
+    bad_arch();
 #endif // __CUDA_ARCH__ >= CC_VOLTA
 }
 
@@ -4222,7 +4220,7 @@ template <bool need_check> static __global__ void
         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 #else
     (void) vec_dot_q8_0_q8_1_mul_mat;
-
+    bad_arch();
 #endif // __CUDA_ARCH__ >= CC_VOLTA
 }
 
@@ -4289,7 +4287,7 @@ mul_mat_q2_K(
         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 #else
     (void) vec_dot_q2_K_q8_1_mul_mat;
-
+    bad_arch();
 #endif // __CUDA_ARCH__ >= CC_VOLTA
 }
 
@@ -4358,7 +4356,7 @@ template <bool need_check> static __global__ void
         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 #else
     (void) vec_dot_q3_K_q8_1_mul_mat;
-
+    bad_arch();
 #endif // __CUDA_ARCH__ >= CC_VOLTA
 }
 
@@ -4427,7 +4425,7 @@ template <bool need_check> static __global__ void
         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 #else
     (void) vec_dot_q4_K_q8_1_mul_mat;
-
+    bad_arch();
 #endif // __CUDA_ARCH__ >= CC_VOLTA
 }
 
@@ -4494,7 +4492,7 @@ mul_mat_q5_K(
         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 #else
     (void) vec_dot_q5_K_q8_1_mul_mat;
-
+    bad_arch();
 #endif // __CUDA_ARCH__ >= CC_VOLTA
 }
 
@@ -4563,7 +4561,7 @@ template <bool need_check> static __global__ void
         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 #else
     (void) vec_dot_q6_K_q8_1_mul_mat;
-
+    bad_arch();
 #endif // __CUDA_ARCH__ >= CC_VOLTA
 }
 
@@ -4998,7 +4996,16 @@ static __global__ void rope_neox(
     const int ib = col / n_dims;
     const int ic = col % n_dims;
 
-
+    if (ib > 0) {
+        const int i = row*ncols + ib*n_dims + ic;
+
+        dst[i + 0] = x[i + 0];
+        dst[i + 1] = x[i + 1];
+
+        return;
+    }
+
+    const int i = row*ncols + ib*n_dims + ic/2;
     const int i2 = row/p_delta_rows;
 
     float cur_rot = inv_ndims * ic - ib;
@@ -6814,6 +6821,7 @@ static void ggml_cuda_op_get_rows(
             break;
         default:
             // TODO: k-quants
+            fprintf(stderr, "%s: unsupported type: %s\n", __func__, ggml_type_name(src0->type));
             GGML_ASSERT(false);
             break;
     }
@@ -7057,6 +7065,7 @@ inline void ggml_cuda_op_upscale(
 
     (void) src1;
     (void) dst;
+    (void) src1_dd;
 }
 
 inline void ggml_cuda_op_pad(
@@ -7073,6 +7082,7 @@ inline void ggml_cuda_op_pad(
 
     (void) src1;
     (void) dst;
+    (void) src1_dd;
 }
 
 inline void ggml_cuda_op_rms_norm(
@@ -7376,7 +7386,7 @@ inline void ggml_cuda_op_mul_mat_cublas(
 
     const int compute_capability = g_compute_capabilities[id];
 
-    if (compute_capability >= CC_VOLTA && (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && row_diff == src0->ne[1]) {
+    if (compute_capability >= CC_VOLTA && (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && row_diff == src0->ne[1] && dst->op_params[0] == GGML_PREC_DEFAULT) {
         // convert src0 and src1 to fp16, multiply as fp16, convert dst to fp32
         half * src0_as_f16 = nullptr;
         size_t src0_as = 0;
@@ -7690,17 +7700,9 @@ inline void ggml_cuda_op_scale(
     const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
 
     GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT(src1->type == GGML_TYPE_F32);
     GGML_ASSERT( dst->type == GGML_TYPE_F32);
 
-    float scale;
-    // HACK: support for ggml backend interface
-    if (src1->backend == GGML_BACKEND_CPU) {
-        scale = ((float *) src1->data)[0];
-    } else {
-        // TODO: pass pointer to kernel instead of copying to host
-        CUDA_CHECK(cudaMemcpy(&scale, src1->data, sizeof(float), cudaMemcpyDeviceToHost));
-    }
+    const float scale = ((float *) dst->op_params)[0];
 
     scale_f32_cuda(src0_dd, dst_dd, scale, ggml_nelements(src0), main_stream);
     CUDA_CHECK(cudaGetLastError());
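The ggml_cuda_op_scale hunk above now reads the scale factor from the operator's op_params block instead of a separate src1 tensor, so no device-to-host copy is needed. A sketch of packing a float into an int32 op_params array and reading it back; fake_tensor, set_scale and get_scale are invented names (ggml itself uses a plain pointer cast, as the diff shows):

    #include <cstdint>
    #include <cstring>

    struct fake_tensor {
        int32_t op_params[16];  // small per-op parameter block, as in ggml tensors
    };

    static void set_scale(fake_tensor * t, float scale) {
        memcpy(&t->op_params[0], &scale, sizeof(float));  // well-defined type pun
    }

    static float get_scale(const fake_tensor * t) {
        float scale;
        memcpy(&scale, &t->op_params[0], sizeof(float));
        return scale;
    }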
@@ -7747,8 +7749,6 @@ static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * s
     const bool src1_on_device = use_src1 && src1->backend == GGML_BACKEND_GPU;
     const bool dst_on_device = dst->backend == GGML_BACKEND_GPU;
 
-    const bool src1_stays_on_host = use_src1 && dst->op == GGML_OP_SCALE;
-
     // dd = data device
     float * src0_ddf = nullptr;
     float * src1_ddf = nullptr;
@@ -7769,7 +7769,7 @@ static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * s
         CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_ddf, src0, 0, 0, 0, nrows0, main_stream));
     }
 
-    if (use_src1
+    if (use_src1) {
         if (src1_on_device) {
             src1_ddf = (float *) src1_extra->data_device[g_main_device];
         } else {
@@ -7817,6 +7817,11 @@ static void ggml_cuda_set_peer_access(const int n_tokens) {
     }
 
 #ifdef NDEBUG
+    for (int id = 0; id < g_device_count; ++id) {
+        CUDA_CHECK(ggml_cuda_set_device(id));
+        CUDA_CHECK(cudaDeviceSynchronize());
+    }
+
     for (int id = 0; id < g_device_count; ++id) {
         CUDA_CHECK(ggml_cuda_set_device(id));
 
@@ -7868,8 +7873,6 @@ static void ggml_cuda_op_mul_mat(
     const int nb2 = dst->nb[2];
     const int nb3 = dst->nb[3];
 
-    ggml_cuda_set_peer_access(ne11);
-
     GGML_ASSERT(dst->backend != GGML_BACKEND_GPU_SPLIT);
     GGML_ASSERT(src1->backend != GGML_BACKEND_GPU_SPLIT);
 
@@ -8300,27 +8303,27 @@ static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor
 }
 
 static __global__ void k_compute_batched_ptrs(
-        const half * src0_as_f16, const half * src1_as_f16,
+        const half * src0_as_f16, const half * src1_as_f16, char * dst,
         const void ** ptrs_src, void ** ptrs_dst,
-
-
-
-
-
-
-
-
+        int64_t ne12, int64_t ne13,
+        int64_t ne23,
+        size_t nb02, size_t nb03,
+        size_t nb12, size_t nb13,
+        size_t nbd2, size_t nbd3,
+        int64_t r2, int64_t r3) {
+    int64_t i13 = blockIdx.x * blockDim.x + threadIdx.x;
+    int64_t i12 = blockIdx.y * blockDim.y + threadIdx.y;
 
     if (i13 >= ne13 || i12 >= ne12) {
         return;
     }
 
-
-
+    int64_t i03 = i13 / r3;
+    int64_t i02 = i12 / r2;
 
     ptrs_src[0*ne23 + i12 + i13*ne12] = (const char *) src0_as_f16 + i02*nb02 + i03*nb03;
     ptrs_src[1*ne23 + i12 + i13*ne12] = (const char *) src1_as_f16 + i12*nb12/2 + i13*nb13/2;
-    ptrs_dst[0*ne23 + i12 + i13*ne12] = ( char *)
+    ptrs_dst[0*ne23 + i12 + i13*ne12] = ( char *) dst + i12*nbd2 + i13*nbd3;
 }
 
 static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -8376,7 +8379,41 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
     to_fp16_cuda(src1_ddf, src1_as_f16, ne1, main_stream);
 
     size_t dst_as = 0;
-
+
+    half * dst_f16 = nullptr;
+    char * dst_t = nullptr;
+
+    cublasComputeType_t cu_compute_type = CUBLAS_COMPUTE_16F;
+    cudaDataType_t cu_data_type = CUDA_R_16F;
+
+    // dst strides
+    size_t nbd2 = dst->nb[2];
+    size_t nbd3 = dst->nb[3];
+
+    const half alpha_f16 = 1.0f;
+    const half beta_f16 = 0.0f;
+
+    const float alpha_f32 = 1.0f;
+    const float beta_f32 = 0.0f;
+
+    const void * alpha = &alpha_f16;
+    const void * beta = &beta_f16;
+
+    if (dst->op_params[0] == GGML_PREC_DEFAULT) {
+        dst_f16 = (half *) ggml_cuda_pool_malloc(ne * sizeof(half), &dst_as);
+        dst_t = (char *) dst_f16;
+
+        nbd2 /= sizeof(float) / sizeof(half);
+        nbd3 /= sizeof(float) / sizeof(half);
+    } else {
+        dst_t = (char *) dst_ddf;
+
+        cu_compute_type = CUBLAS_COMPUTE_32F;
+        cu_data_type = CUDA_R_32F;
+
+        alpha = &alpha_f32;
+        beta = &beta_f32;
+    }
 
     GGML_ASSERT(ne12 % ne02 == 0);
     GGML_ASSERT(ne13 % ne03 == 0);
@@ -8385,9 +8422,6 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
     const int64_t r2 = ne12/ne02;
     const int64_t r3 = ne13/ne03;
 
-    const half alpha_f16 = 1.0f;
-    const half beta_f16 = 0.0f;
-
 #if 0
     // use cublasGemmEx
     {
@@ -8397,12 +8431,12 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
             int i02 = i12 / r2;
 
             CUBLAS_CHECK(
-            cublasGemmEx(g_cublas_handles[
+            cublasGemmEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N,
                 ne01, ne11, ne10,
-
-
-
-
+                alpha, (const char *) src0_as_f16 + i02*src0->nb[2] + i03*src0->nb[3] , CUDA_R_16F, nb01/sizeof(half),
+                       (const char *) src1_as_f16 + i12*src1->nb[2]/2 + i13*src1->nb[3]/2, CUDA_R_16F, nb11/sizeof(float),
+                beta,  ( char *) dst_t + i12*nbd2 + i13*nbd3, cu_data_type, ne01,
+                cu_compute_type,
                 CUBLAS_GEMM_DEFAULT_TENSOR_OP));
         }
     }
@@ -8414,11 +8448,11 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
         CUBLAS_CHECK(
         cublasGemmStridedBatchedEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N,
                 ne01, ne11, ne10,
-
-
-
+                alpha, (const char *) src0_as_f16, CUDA_R_16F, nb01/sizeof(half), src0->nb[2]/sizeof(half), // strideA
+                       (const char *) src1_as_f16, CUDA_R_16F, nb11/sizeof(float), src1->nb[2]/sizeof(float), // strideB
+                beta,  ( char *) dst_t, cu_data_type, ne01, dst->nb[2]/sizeof(float), // strideC
                 ne12*ne13,
-
+                cu_compute_type,
                 CUBLAS_GEMM_DEFAULT_TENSOR_OP));
     } else {
         // use cublasGemmBatchedEx
@@ -8435,24 +8469,24 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
 
         dim3 block_dims(ne13, ne12);
         k_compute_batched_ptrs<<<1, block_dims, 0, main_stream>>>(
-                src0_as_f16, src1_as_f16,
+                src0_as_f16, src1_as_f16, dst_t,
                 ptrs_src, ptrs_dst,
                 ne12, ne13,
                 ne23,
                 nb02, nb03,
                 nb12, nb13,
-
+                nbd2, nbd3,
                 r2, r3);
         CUDA_CHECK(cudaGetLastError());
 
         CUBLAS_CHECK(
         cublasGemmBatchedEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N,
                 ne01, ne11, ne10,
-
-
-
+                alpha, (const void **) (ptrs_src + 0*ne23), CUDA_R_16F, nb01/sizeof(half),
+                       (const void **) (ptrs_src + 1*ne23), CUDA_R_16F, nb11/sizeof(float),
+                beta,  ( void **) (ptrs_dst + 0*ne23), cu_data_type, ne01,
                 ne23,
-
+                cu_compute_type,
                 CUBLAS_GEMM_DEFAULT_TENSOR_OP));
 
         if (ptrs_src_s != 0) {
@@ -8464,11 +8498,14 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
     }
 #endif
 
-
-
+    if (dst->op_params[0] == GGML_PREC_DEFAULT) {
+        const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
+        to_fp32_cuda(dst_f16, dst_ddf, ne, main_stream);
+
+        ggml_cuda_pool_free(dst_f16, dst_as);
+    }
 
     ggml_cuda_pool_free(src1_as_f16, src1_as);
-    ggml_cuda_pool_free(dst_f16, dst_as);
 }
 
 static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
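The batched-cuBLAS hunks above pick FP16 or FP32 compute and output types from the operator's precision flag (GGML_PREC_DEFAULT selects the FP16 path). A hedged sketch of the same runtime selection around cublasGemmStridedBatchedEx; gemm_batched_fp16 and its parameters are invented, not part of ggml or this gem:

    #include <cublas_v2.h>
    #include <cuda_fp16.h>

    // Pick FP16 or FP32 accumulation for a batched half-precision GEMM.
    static cublasStatus_t gemm_batched_fp16(
            cublasHandle_t handle, bool high_prec,
            const half * A, const half * B, void * C,   // C points to half or float data
            int m, int n, int k, int batch,
            long long strideA, long long strideB, long long strideC) {
        const half  alpha_f16 = 1.0f, beta_f16 = 0.0f;
        const float alpha_f32 = 1.0f, beta_f32 = 0.0f;

        const cublasComputeType_t compute = high_prec ? CUBLAS_COMPUTE_32F : CUBLAS_COMPUTE_16F;
        const cudaDataType_t      ctype   = high_prec ? CUDA_R_32F         : CUDA_R_16F;
        const void * alpha = high_prec ? (const void *) &alpha_f32 : (const void *) &alpha_f16;
        const void * beta  = high_prec ? (const void *) &beta_f32  : (const void *) &beta_f16;

        // A is m x k, B is k x n, C is m x n, all column-major, batched with fixed strides.
        return cublasGemmStridedBatchedEx(handle, CUBLAS_OP_N, CUBLAS_OP_N,
            m, n, k,
            alpha, A, CUDA_R_16F, m, strideA,
                   B, CUDA_R_16F, k, strideB,
            beta,  C, ctype,      m, strideC,
            batch, compute, CUBLAS_GEMM_DEFAULT_TENSOR_OP);
    }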
@@ -8732,7 +8769,8 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s
     // TODO: mmq/mmv support
 #endif
 
-
+    const int64_t nb11 = src1->nb[1];
+    const int64_t nb1 = dst->nb[1];
 
     const struct ggml_tensor * ids = src0;
     const int32_t id = ((int32_t *) dst->op_params)[0];
@@ -8740,10 +8778,12 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s
 
     std::vector<char> ids_host(ggml_nbytes(ids));
 
+    const cudaStream_t stream = g_cudaStreams[g_main_device][0];
+
     if (ids->backend == GGML_BACKEND_GPU) {
         const char * ids_dev = (const char *)((const ggml_tensor_extra_gpu *)ids->extra)->data_device[g_main_device];
-        CUDA_CHECK(cudaMemcpyAsync(ids_host.data(), ids_dev, ggml_nbytes(ids), cudaMemcpyDeviceToHost,
-        CUDA_CHECK(cudaStreamSynchronize(
+        CUDA_CHECK(cudaMemcpyAsync(ids_host.data(), ids_dev, ggml_nbytes(ids), cudaMemcpyDeviceToHost, stream));
+        CUDA_CHECK(cudaStreamSynchronize(stream));
     } else {
         memcpy(ids_host.data(), ids->data, ggml_nbytes(ids));
     }
@@ -8757,37 +8797,110 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s
     ggml_tensor src1_row = *src1;
     ggml_tensor dst_row = *dst;
 
-    src1_row.
-    dst_row.
-
-    src1_row.nb[2] = src1_row.nb[1];
-    dst_row.nb[2] = dst_row.nb[1];
-
-    src1_row.nb[3] = src1_row.nb[1];
-    dst_row.nb[3] = dst_row.nb[1];
+    src1_row.backend = GGML_BACKEND_GPU;
+    dst_row.backend = GGML_BACKEND_GPU;
 
     src1_row.extra = &src1_row_extra;
     dst_row.extra = &dst_row_extra;
 
+    char * src1_original = src1->backend == GGML_BACKEND_CPU ?
+        (char *) src1->data : (char *) src1_extra->data_device[g_main_device];
+    char * dst_original = dst->backend == GGML_BACKEND_CPU ?
+        (char *) dst->data : (char *) dst_extra->data_device[g_main_device];
+
+    if (src1->ne[1] == 1) {
+        GGML_ASSERT(src1->backend == GGML_BACKEND_GPU);
+        GGML_ASSERT(dst->backend == GGML_BACKEND_GPU);
+
+        for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
+            //int32_t row_id;
+            //CUDA_CHECK(cudaMemcpyAsync(&row_id, ids_dev + i01*ids->nb[1] + id*ids->nb[0], sizeof(int32_t), cudaMemcpyDeviceToHost, g_cudaStreams[g_main_device][0]));
+            //CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[g_main_device][0]));
+
+            const int32_t row_id = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]);
 
-
-    //int32_t row_id;
-    //CUDA_CHECK(cudaMemcpyAsync(&row_id, ids_dev + i01*ids->nb[1] + id*ids->nb[0], sizeof(int32_t), cudaMemcpyDeviceToHost, g_cudaStreams[g_main_device][0]));
-    //CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[g_main_device][0]));
+            GGML_ASSERT(row_id >= 0 && row_id < n_as);
 
-
+            const struct ggml_tensor * src0_row = dst->src[row_id + 2];
 
-
+            src1_row_extra.data_device[g_main_device] = src1_original + i01*src1->nb[1];
+            src1_row.data = (char *) src1->data + i01*src1->nb[1]; // TODO why is this set?
+
+            dst_row_extra.data_device[g_main_device] = dst_original + i01*dst->nb[1];
+            dst_row.data = (char *) dst->data + i01*dst->nb[1]; // TODO why is this set?
+
+            ggml_cuda_mul_mat(src0_row, &src1_row, &dst_row);
+        }
+    } else {
+        size_t as_src1, as_dst;
+        char * src1_contiguous = (char *) ggml_cuda_pool_malloc(sizeof(float)*ggml_nelements(src1), &as_src1);
+        char * dst_contiguous = (char *) ggml_cuda_pool_malloc(sizeof(float)*ggml_nelements(dst), &as_dst);
 
-
+        src1_row_extra.data_device[g_main_device] = src1_contiguous;
+        dst_row_extra.data_device[g_main_device] = dst_contiguous;
 
-
-
+        const cudaMemcpyKind src1_kind = src1->backend == GGML_BACKEND_CPU ?
+            cudaMemcpyHostToDevice : cudaMemcpyDeviceToDevice;
+        const cudaMemcpyKind dst_kind = dst->backend == GGML_BACKEND_CPU ?
+            cudaMemcpyHostToDevice : cudaMemcpyDeviceToDevice;
 
-
-
+        for (int32_t row_id = 0; row_id < n_as; ++row_id) {
+            const struct ggml_tensor * src0_row = dst->src[row_id + 2];
 
-
+            int64_t num_src1_rows = 0;
+            for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
+                const int32_t row_id_i = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]);
+
+                if (row_id_i != row_id) {
+                    continue;
+                }
+
+                GGML_ASSERT(row_id >= 0 && row_id < n_as);
+
+                CUDA_CHECK(cudaMemcpyAsync(src1_contiguous + num_src1_rows*nb11, src1_original + i01*nb11,
+                                        nb11, src1_kind, stream));
+                num_src1_rows++;
+            }
+
+            if (num_src1_rows == 0) {
+                continue;
+            }
+
+            src1_row.ne[1] = num_src1_rows;
+            dst_row.ne[1] = num_src1_rows;
+
+            src1_row.nb[1] = nb11;
+            src1_row.nb[2] = num_src1_rows*nb11;
+            src1_row.nb[3] = num_src1_rows*nb11;
+
+            dst_row.nb[1] = nb1;
+            dst_row.nb[2] = num_src1_rows*nb1;
+            dst_row.nb[3] = num_src1_rows*nb1;
+
+            ggml_cuda_mul_mat(src0_row, &src1_row, &dst_row);
+
+            num_src1_rows = 0;
+            for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
+                const int32_t row_id_i = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]);
+
+                if (row_id_i != row_id) {
+                    continue;
+                }
+
+                GGML_ASSERT(row_id >= 0 && row_id < n_as);
+
+                CUDA_CHECK(cudaMemcpyAsync(dst_original + i01*nb1, dst_contiguous + num_src1_rows*nb1,
+                                        nb1, dst_kind, stream));
+                num_src1_rows++;
+            }
+        }
+
+        ggml_cuda_pool_free(src1_contiguous, as_src1);
+        ggml_cuda_pool_free(dst_contiguous, as_dst);
+    }
+
+    if (dst->backend == GGML_BACKEND_CPU) {
+        CUDA_CHECK(cudaStreamSynchronize(stream));
     }
 }
 
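The ggml_cuda_mul_mat_id hunk above gathers the src1 rows routed to each expert into a contiguous buffer, runs one matrix multiplication per expert, and scatters the results back. A simplified CPU sketch of that gather/compute/scatter idea; mul_mat_id_rows and its parameters are invented, and the output row size is assumed equal to the input row size for brevity:

    #include <algorithm>
    #include <cstdint>
    #include <vector>

    // Process the rows of `in` assigned to `expert` as one contiguous batch.
    void mul_mat_id_rows(int expert, const std::vector<int32_t> & row_to_expert,
                         const float * in, float * out, int64_t row_size,
                         void (*mul_mat)(const float * src, float * dst, int64_t n_rows, int64_t row_size)) {
        std::vector<int64_t> rows;                       // which input rows this expert owns
        for (int64_t i = 0; i < (int64_t) row_to_expert.size(); ++i) {
            if (row_to_expert[i] == expert) rows.push_back(i);
        }
        if (rows.empty()) return;

        std::vector<float> gathered(rows.size() * row_size);
        for (size_t r = 0; r < rows.size(); ++r) {       // gather into a contiguous block
            std::copy(in + rows[r]*row_size, in + (rows[r] + 1)*row_size, gathered.data() + r*row_size);
        }

        std::vector<float> result(rows.size() * row_size);
        mul_mat(gathered.data(), result.data(), (int64_t) rows.size(), row_size);

        for (size_t r = 0; r < rows.size(); ++r) {       // scatter results back to their rows
            std::copy(result.data() + r*row_size, result.data() + (r + 1)*row_size, out + rows[r]*row_size);
        }
    }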
@@ -8958,7 +9071,7 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
 
         char * buf;
         CUDA_CHECK(cudaMalloc(&buf, size));
-        char * buf_host = (char*)data + offset_split;
+        char * buf_host = (char *)data + offset_split;
 
         // set padding to 0 to avoid possible NaN values
         if (size > original_size) {
@@ -8980,7 +9093,7 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
 }
 
 void ggml_cuda_free_data(struct ggml_tensor * tensor) {
-    if (!tensor || (tensor->backend != GGML_BACKEND_GPU && tensor->backend != GGML_BACKEND_GPU_SPLIT) ) {
+    if (!tensor || !tensor->extra || (tensor->backend != GGML_BACKEND_GPU && tensor->backend != GGML_BACKEND_GPU_SPLIT) ) {
         return;
     }
 
@@ -9103,11 +9216,10 @@ void ggml_cuda_assign_scratch_offset(struct ggml_tensor * tensor, size_t offset)
 
     ggml_tensor_extra_gpu * extra = ggml_cuda_alloc_temp_tensor_extra();
 
-    const bool inplace =
-        tensor->op == GGML_OP_VIEW;
+    const bool inplace = tensor->view_src != nullptr;
 
-    if (inplace && (tensor->
-        ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->
+    if (inplace && (tensor->view_src->backend == GGML_BACKEND_GPU || tensor->view_src->backend == GGML_BACKEND_GPU_SPLIT)) {
+        ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->view_src->extra;
         char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
         size_t view_offset = 0;
         if (tensor->op == GGML_OP_VIEW) {
@@ -9187,14 +9299,14 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
         || (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT))
         || (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_GPU);
 
-    if (!any_on_device && tensor->op != GGML_OP_MUL_MAT) {
+    if (!any_on_device && tensor->op != GGML_OP_MUL_MAT && tensor->op != GGML_OP_MUL_MAT_ID) {
         return false;
     }
 
     if (tensor->op == GGML_OP_MUL_MAT) {
         if (tensor->src[0]->ne[3] != tensor->src[1]->ne[3]) {
 #ifndef NDEBUG
-            fprintf(stderr, "%s: cannot compute %s: src0->ne[3] = " PRId64 ", src1->ne[3] = " PRId64 " - fallback to CPU\n", __func__, tensor->name, tensor->src[0]->ne[3], tensor->src[1]->ne[3]);
+            fprintf(stderr, "%s: cannot compute %s: src0->ne[3] = %" PRId64 ", src1->ne[3] = %" PRId64 " - fallback to CPU\n", __func__, tensor->name, tensor->src[0]->ne[3], tensor->src[1]->ne[3]);
 #endif
             return false;
         }
@@ -9323,6 +9435,10 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
         return false;
     }
 
+    if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT) {
+        ggml_cuda_set_peer_access(tensor->src[1]->ne[1]);
+    }
+
     if (params->ith != 0) {
         return true;
     }
@@ -9396,7 +9512,7 @@ static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, g
     ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
 
     if (tensor->view_src != NULL && tensor->view_offs == 0) {
-        assert(tensor->view_src->buffer->buft == buffer->buft);
+        assert(tensor->view_src->buffer->buft == buffer->buft);
         tensor->backend = tensor->view_src->backend;
         tensor->extra = tensor->view_src->extra;
         return;
@@ -9427,23 +9543,34 @@ static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, g
 }
 
 static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
-    GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
-    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
     GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
 
-
+    ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
 
-
+    ggml_cuda_set_device(ctx->device);
+    CUDA_CHECK(cudaDeviceSynchronize());
+
+    CUDA_CHECK(cudaMemcpy((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice));
 }
 
 static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
-    GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
-    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
     GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
 
+    ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
+
+    ggml_cuda_set_device(ctx->device);
+    CUDA_CHECK(cudaDeviceSynchronize());
+
     CUDA_CHECK(cudaMemcpy(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost));
+}
 
-
+static void ggml_backend_cuda_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
+    ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
+
+    ggml_cuda_set_device(ctx->device);
+    CUDA_CHECK(cudaDeviceSynchronize());
+
+    CUDA_CHECK(cudaMemset(ctx->dev_ptr, value, buffer->size));
 }
 
 static struct ggml_backend_buffer_i cuda_backend_buffer_interface = {
@@ -9454,6 +9581,7 @@ static struct ggml_backend_buffer_i cuda_backend_buffer_interface = {
     /* .get_tensor = */ ggml_backend_cuda_buffer_get_tensor,
     /* .cpy_tensor_from = */ NULL,
     /* .cpy_tensor_to = */ NULL,
+    /* .clear = */ ggml_backend_cuda_buffer_clear,
 };
 
 // cuda buffer type
@@ -9505,35 +9633,36 @@ static bool ggml_backend_cuda_buffer_type_supports_backend(ggml_backend_buffer_t
     UNUSED(buft);
 }
 
-static ggml_backend_buffer_type_i
+static ggml_backend_buffer_type_i ggml_backend_cuda_buffer_type_interface = {
     /* .alloc_buffer = */ ggml_backend_cuda_buffer_type_alloc_buffer,
     /* .get_alignment = */ ggml_backend_cuda_buffer_type_get_alignment,
     /* .get_alloc_size = */ ggml_backend_cuda_buffer_type_get_alloc_size,
     /* .supports_backend = */ ggml_backend_cuda_buffer_type_supports_backend,
+    /* .is_host = */ nullptr,
 };
 
 ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) {
-    static struct ggml_backend_buffer_type
-
-
+    static struct ggml_backend_buffer_type ggml_backend_cuda_buffer_types[GGML_CUDA_MAX_DEVICES];
+
+    static bool ggml_backend_cuda_buffer_type_initialized = false;
+
+    if (!ggml_backend_cuda_buffer_type_initialized) {
         for (int i = 0; i < GGML_CUDA_MAX_DEVICES; i++) {
-
-            /* .iface = */
+            ggml_backend_cuda_buffer_types[i] = {
                /* .iface = */ ggml_backend_cuda_buffer_type_interface,
                /* .context = */ (ggml_backend_buffer_type_context_t) (intptr_t) i,
             };
         }
-
+        ggml_backend_cuda_buffer_type_initialized = true;
     }
 
-    return &
+    return &ggml_backend_cuda_buffer_types[device];
 }
 
 // host buffer type
 
 static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
-
-    CUDA_CHECK(cudaFreeHost(ctx->dev_ptr));
-    delete ctx;
+    CUDA_CHECK(cudaFreeHost(buffer->context));
 }
 
 static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
@@ -9546,24 +9675,21 @@ static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggm
     buffer->iface.free_buffer = ggml_backend_cuda_host_buffer_free_buffer;
 
     return buffer;
-
-    UNUSED(buft);
 }
 
-struct ggml_backend_buffer_type_i cuda_backend_host_buffer_type_interface = {
-    /* .alloc_buffer = */ ggml_backend_cuda_host_buffer_type_alloc_buffer,
-    /* .get_alignment = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
-    /* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
-    /* .supports_backend = */ ggml_backend_cpu_buffer_type()->iface.supports_backend,
-};
-
 ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() {
-    static struct ggml_backend_buffer_type
-        /* .iface = */
+    static struct ggml_backend_buffer_type ggml_backend_cuda_buffer_type_host = {
+        /* .iface = */ {
+            /* .alloc_buffer = */ ggml_backend_cuda_host_buffer_type_alloc_buffer,
+            /* .get_alignment = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
+            /* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
+            /* .supports_backend = */ ggml_backend_cpu_buffer_type()->iface.supports_backend,
+            /* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host,
+        },
        /* .context = */ nullptr,
     };
 
-    return &
+    return &ggml_backend_cuda_buffer_type_host;
 }
 
 // backend
@@ -9595,8 +9721,6 @@ static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_tens
    ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
 
    GGML_ASSERT(tensor->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
-   GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
-   GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
    GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
 
    CUDA_CHECK(cudaMemcpyAsync((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice, g_cudaStreams[cuda_ctx->device][0]));
@@ -9606,8 +9730,6 @@ static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend, const ggm
    ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
 
    GGML_ASSERT(tensor->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
-   GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
-   GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
    GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
 
    CUDA_CHECK(cudaMemcpyAsync(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost, g_cudaStreams[cuda_ctx->device][0]));
|