llama_cpp 0.10.1 → 0.10.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +6 -0
- data/ext/llama_cpp/llama_cpp.cpp +16 -1
- data/ext/llama_cpp/src/ggml-alloc.c +12 -4
- data/ext/llama_cpp/src/ggml-backend-impl.h +12 -8
- data/ext/llama_cpp/src/ggml-backend.c +75 -5
- data/ext/llama_cpp/src/ggml-backend.h +7 -0
- data/ext/llama_cpp/src/ggml-cuda.cu +284 -162
- data/ext/llama_cpp/src/ggml-metal.h +3 -0
- data/ext/llama_cpp/src/ggml-metal.m +190 -44
- data/ext/llama_cpp/src/ggml-metal.metal +11 -2
- data/ext/llama_cpp/src/ggml.c +262 -89
- data/ext/llama_cpp/src/ggml.h +24 -10
- data/ext/llama_cpp/src/llama.cpp +926 -780
- data/ext/llama_cpp/src/llama.h +8 -3
- data/lib/llama_cpp/version.rb +2 -2
- metadata +2 -2
data/ext/llama_cpp/src/ggml-cuda.cu
@@ -31,6 +31,7 @@
 #define CUDA_R_16F HIPBLAS_R_16F
 #define CUDA_R_32F HIPBLAS_R_32F
 #define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width)
+#define cublasComputeType_t hipblasDatatype_t //deprecated, new hipblasComputeType_t not in 5.6
 #define cublasCreate hipblasCreate
 #define cublasGemmEx hipblasGemmEx
 #define cublasGemmBatchedEx hipblasGemmBatchedEx
@@ -40,6 +41,7 @@
 #define cublasSetStream hipblasSetStream
 #define cublasSgemm hipblasSgemm
 #define cublasStatus_t hipblasStatus_t
+#define cudaDataType_t hipblasDatatype_t //deprecated, new hipblasDatatype not in 5.6
 #define cudaDeviceCanAccessPeer hipDeviceCanAccessPeer
 #define cudaDeviceDisablePeerAccess hipDeviceDisablePeerAccess
 #define cudaDeviceEnablePeerAccess hipDeviceEnablePeerAccess
@@ -58,8 +60,13 @@
 #define cudaGetDeviceProperties hipGetDeviceProperties
 #define cudaGetErrorString hipGetErrorString
 #define cudaGetLastError hipGetLastError
+#ifdef GGML_HIP_UMA
+#define cudaMalloc hipMallocManaged
+#define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size)
+#else
 #define cudaMalloc hipMalloc
 #define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size, hipHostMallocDefault)
+#endif
 #define cudaMemcpy hipMemcpy
 #define cudaMemcpy2DAsync hipMemcpy2DAsync
 #define cudaMemcpyAsync hipMemcpyAsync
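The GGML_HIP_UMA branch above routes device allocations to HIP managed (unified) memory, which is useful on AMD APUs where CPU and GPU share RAM; the default branch keeps plain device memory. A minimal sketch of the same compile-time allocator selection, outside of ggml (the my_device_malloc macro, MY_USE_UMA flag, and alloc_device helper are illustrative, not part of the library):

    // HIP-only sketch: pick the allocator at compile time, mirroring the macro swap above.
    #include <hip/hip_runtime.h>

    #ifdef MY_USE_UMA
    // unified memory: the same pointer is usable from host and device code
    #define my_device_malloc(ptr, size) hipMallocManaged(ptr, size)
    #else
    #define my_device_malloc(ptr, size) hipMalloc(ptr, size)
    #endif

    static void * alloc_device(size_t size) {
        void * ptr = nullptr;
        if (my_device_malloc(&ptr, size) != hipSuccess) {
            return nullptr; // allocation failed
        }
        return ptr;
    }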
@@ -78,6 +85,7 @@
 #define cudaStreamWaitEvent(stream, event, flags) hipStreamWaitEvent(stream, event, flags)
 #define cudaStream_t hipStream_t
 #define cudaSuccess hipSuccess
+#define __trap abort
 #else
 #include <cuda_runtime.h>
 #include <cublas_v2.h>
@@ -510,6 +518,14 @@ static size_t g_scratch_offset = 0;
 
 static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};
 
+[[noreturn]]
+static __device__ void bad_arch() {
+    printf("ERROR: ggml-cuda was compiled without support for the current GPU architecture.\n");
+    __trap();
+
+    (void) bad_arch; // suppress unused function warning
+}
+
 static __device__ __forceinline__ float warp_reduce_sum(float x) {
 #pragma unroll
     for (int mask = 16; mask > 0; mask >>= 1) {
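The hunk above introduces the pattern reused throughout the kernels below: a [[noreturn]] device helper that prints a diagnostic and calls __trap() (remapped to abort under HIP), replacing the old silent "return 0.0f; // only to satisfy the compiler" fallbacks, so a build that lacks support for the running GPU architecture fails loudly instead of producing zeros. A standalone sketch of the idea, with a hypothetical scale_kernel that is not part of ggml:

    #include <cstdio>

    // The helper never returns, so the unsupported branch needs no dummy return value.
    [[noreturn]] static __device__ void unsupported_arch() {
        printf("ERROR: kernel compiled without support for this GPU architecture\n");
        __trap();
    }

    static __global__ void scale_kernel(float * x, int n, float s) {
    #if __CUDA_ARCH__ >= 610
        const int i = blockIdx.x*blockDim.x + threadIdx.x;
        if (i < n) {
            x[i] *= s;
        }
    #else
        (void) x; (void) n; (void) s;
        unsupported_arch(); // abort instead of silently computing garbage
    #endif
    }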
@@ -1970,8 +1986,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q4_0_q8_1_imp
     // second part effectively subtracts 8 from each quant value
     return d4 * (sumi * ds8f.x - (8*vdr/QI4_0) * ds8f.y);
 #else
-    assert(false);
-    return 0.0f; // only to satisfy the compiler
+    bad_arch();
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
@@ -2008,8 +2023,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q4_1_q8_1_imp
     // scale second part of sum by QI8_1/(vdr * QR4_1) to compensate for multiple threads adding it
     return sumi * d4d8 + m4s8 / (QI8_1 / (vdr * QR4_1));
 #else
-    assert(false);
-    return 0.0f; // only to satisfy the compiler
+    bad_arch();
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
@@ -2044,8 +2058,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q5_0_q8_1_imp
     // second part effectively subtracts 16 from each quant value
     return d5 * (sumi * ds8f.x - (16*vdr/QI5_0) * ds8f.y);
 #else
-    assert(false);
-    return 0.0f; // only to satisfy the compiler
+    bad_arch();
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
@@ -2090,8 +2103,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q5_1_q8_1_imp
     return sumi*d5d8 + m5s8 / (QI5_1 / vdr);
 
 #else
-    assert(false);
-    return 0.0f; // only to satisfy the compiler
+    bad_arch();
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
@@ -2112,8 +2124,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q8_0_q8_1_imp
 
     return d8_0*d8_1 * sumi;
 #else
-    assert(false);
-    return 0.0f; // only to satisfy the compiler
+    bad_arch();
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
@@ -2143,8 +2154,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q8_1_q8_1_imp
     // scale second part of sum by QI8_1/ vdr to compensate for multiple threads adding it
     return sumi*d8d8 + m8s8 / (QI8_1 / vdr);
 #else
-    assert(false);
-    return 0.0f; // only to satisfy the compiler
+    bad_arch();
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
@@ -2179,8 +2189,7 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmvq(
 
     return dm2f.x*sumf_d - dm2f.y*sumf_m;
 #else
-    assert(false);
-    return 0.0f; // only to satisfy the compiler
+    bad_arch();
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
@@ -2217,8 +2226,7 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmq(
 
     return d8 * (dm2f.x*sumi_d - dm2f.y*sumi_m);
 #else
-    assert(false);
-    return 0.0f; // only to satisfy the compiler
+    bad_arch();
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
@@ -2258,8 +2266,7 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmvq(
 
     return d3 * sumf;
 #else
-    assert(false);
-    return 0.0f; // only to satisfy the compiler
+    bad_arch();
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
@@ -2284,8 +2291,7 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmq(
 
     return d3*d8 * sumi;
 #else
-    assert(false);
-    return 0.0f; // only to satisfy the compiler
+    bad_arch();
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
@@ -2318,8 +2324,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_vmmq(
     return dm4f.x*sumf_d - dm4f.y*sumf_m;
 
 #else
-    assert(false);
-    return 0.0f; // only to satisfy the compiler
+    bad_arch();
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
@@ -2352,8 +2357,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_mmq(
     return dm4f.x*sumf_d - dm4f.y*sumf_m;
 
 #else
-    assert(false);
-    return 0.0f; // only to satisfy the compiler
+    bad_arch();
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
@@ -2393,8 +2397,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_vmmq(
     return dm5f.x*sumf_d - dm5f.y*sumf_m;
 
 #else
-    assert(false);
-    return 0.0f; // only to satisfy the compiler
+    bad_arch();
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
@@ -2427,8 +2430,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_mmq(
     return dm4f.x*sumf_d - dm4f.y*sumf_m;
 
 #else
-    assert(false);
-    return 0.0f; // only to satisfy the compiler
+    bad_arch();
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
@@ -2458,8 +2460,7 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmvq(
 
     return d*sumf;
 #else
-    assert(false);
-    return 0.0f; // only to satisfy the compiler
+    bad_arch();
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
@@ -2490,8 +2491,7 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmq(
     return d6 * sumf_d;
 
 #else
-    assert(false);
-    return 0.0f; // only to satisfy the compiler
+    bad_arch();
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
@@ -3357,8 +3357,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
     return dall * sumf_d - dmin * sumf_m;
 
 #else
-    assert(false);
-    return 0.0f; // only to satisfy the compiler
+    bad_arch();
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 
 #endif
@@ -3541,8 +3540,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
     return d * sumf_d;
 
 #else
-    assert(false);
-    return 0.0f; // only to satisfy the compiler
+    bad_arch();
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 
 #endif
@@ -3952,7 +3950,7 @@ template <bool need_check> static __global__ void
         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 #else
     (void) vec_dot_q4_0_q8_1_mul_mat;
-    assert(false);
+    bad_arch();
 #endif // __CUDA_ARCH__ >= CC_VOLTA
 }
 
@@ -4021,7 +4019,7 @@ template <bool need_check> static __global__ void
         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 #else
     (void) vec_dot_q4_1_q8_1_mul_mat;
-    assert(false);
+    bad_arch();
 #endif // __CUDA_ARCH__ >= CC_VOLTA
 }
 
@@ -4088,7 +4086,7 @@ template <bool need_check> static __global__ void
         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 #else
     (void) vec_dot_q5_0_q8_1_mul_mat;
-    assert(false);
+    bad_arch();
 #endif // __CUDA_ARCH__ >= CC_VOLTA
 }
 
@@ -4155,7 +4153,7 @@ mul_mat_q5_1(
         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 #else
     (void) vec_dot_q5_1_q8_1_mul_mat;
-    assert(false);
+    bad_arch();
 #endif // __CUDA_ARCH__ >= CC_VOLTA
 }
 
@@ -4222,7 +4220,7 @@ template <bool need_check> static __global__ void
         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 #else
     (void) vec_dot_q8_0_q8_1_mul_mat;
-    assert(false);
+    bad_arch();
 #endif // __CUDA_ARCH__ >= CC_VOLTA
 }
 
@@ -4289,7 +4287,7 @@ mul_mat_q2_K(
         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 #else
     (void) vec_dot_q2_K_q8_1_mul_mat;
-    assert(false);
+    bad_arch();
 #endif // __CUDA_ARCH__ >= CC_VOLTA
 }
 
@@ -4358,7 +4356,7 @@ template <bool need_check> static __global__ void
         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 #else
     (void) vec_dot_q3_K_q8_1_mul_mat;
-    assert(false);
+    bad_arch();
 #endif // __CUDA_ARCH__ >= CC_VOLTA
 }
 
@@ -4427,7 +4425,7 @@ template <bool need_check> static __global__ void
         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 #else
     (void) vec_dot_q4_K_q8_1_mul_mat;
-    assert(false);
+    bad_arch();
 #endif // __CUDA_ARCH__ >= CC_VOLTA
 }
 
@@ -4494,7 +4492,7 @@ mul_mat_q5_K(
         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 #else
     (void) vec_dot_q5_K_q8_1_mul_mat;
-    assert(false);
+    bad_arch();
 #endif // __CUDA_ARCH__ >= CC_VOLTA
 }
 
@@ -4563,7 +4561,7 @@ template <bool need_check> static __global__ void
         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 #else
     (void) vec_dot_q6_K_q8_1_mul_mat;
-    assert(false);
+    bad_arch();
 #endif // __CUDA_ARCH__ >= CC_VOLTA
 }
 
@@ -4998,7 +4996,16 @@ static __global__ void rope_neox(
     const int ib = col / n_dims;
     const int ic = col % n_dims;
 
-
+    if (ib > 0) {
+        const int i = row*ncols + ib*n_dims + ic;
+
+        dst[i + 0] = x[i + 0];
+        dst[i + 1] = x[i + 1];
+
+        return;
+    }
+
+    const int i = row*ncols + ib*n_dims + ic/2;
     const int i2 = row/p_delta_rows;
 
     float cur_rot = inv_ndims * ic - ib;
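The rope_neox change above implements partial rotation: column blocks past the first n_dims rotated dimensions (ib > 0) are copied through unchanged and the thread returns early, instead of being rotated. A rough host-side sketch of the same control flow for a single row (plain C++; the frequency formula is illustrative, and the pair layout is the NeoX split into first and second half of the rotated block):

    #include <cmath>

    static void rope_neox_row(const float * x, float * dst, int ncols, int n_dims, float theta_base) {
        for (int col = 0; col < ncols; col += 2) {
            const int ib = col / n_dims; // which n_dims-sized block this column falls in
            const int ic = col % n_dims; // position inside that block

            if (ib > 0) {
                // past the rotated dimensions: identity copy, mirroring the early return in the kernel
                dst[col + 0] = x[col + 0];
                dst[col + 1] = x[col + 1];
                continue;
            }

            // rotated part: NeoX pairs element ic/2 with its counterpart half a block away
            const int i0 = ic/2;
            const int i1 = ic/2 + n_dims/2;
            const float theta = theta_base / std::pow(10000.0f, (float) ic / n_dims); // illustrative frequency
            const float c = std::cos(theta);
            const float s = std::sin(theta);
            dst[i0] = x[i0]*c - x[i1]*s;
            dst[i1] = x[i0]*s + x[i1]*c;
        }
    }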
@@ -6814,6 +6821,7 @@ static void ggml_cuda_op_get_rows(
             break;
         default:
             // TODO: k-quants
+            fprintf(stderr, "%s: unsupported type: %s\n", __func__, ggml_type_name(src0->type));
             GGML_ASSERT(false);
             break;
     }
@@ -7057,6 +7065,7 @@ inline void ggml_cuda_op_upscale(
 
     (void) src1;
     (void) dst;
+    (void) src1_dd;
 }
 
 inline void ggml_cuda_op_pad(
@@ -7073,6 +7082,7 @@ inline void ggml_cuda_op_pad(
 
     (void) src1;
     (void) dst;
+    (void) src1_dd;
 }
 
 inline void ggml_cuda_op_rms_norm(
@@ -7376,7 +7386,7 @@ inline void ggml_cuda_op_mul_mat_cublas(
 
     const int compute_capability = g_compute_capabilities[id];
 
-    if (compute_capability >= CC_VOLTA && (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && row_diff == src0->ne[1]) {
+    if (compute_capability >= CC_VOLTA && (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && row_diff == src0->ne[1] && dst->op_params[0] == GGML_PREC_DEFAULT) {
         // convert src0 and src1 to fp16, multiply as fp16, convert dst to fp32
         half * src0_as_f16 = nullptr;
         size_t src0_as = 0;
@@ -7690,17 +7700,9 @@ inline void ggml_cuda_op_scale(
     const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
 
     GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT(src1->type == GGML_TYPE_F32);
     GGML_ASSERT( dst->type == GGML_TYPE_F32);
 
-    float scale;
-    // HACK: support for ggml backend interface
-    if (src1->backend == GGML_BACKEND_CPU) {
-        scale = ((float *) src1->data)[0];
-    } else {
-        // TODO: pass pointer to kernel instead of copying to host
-        CUDA_CHECK(cudaMemcpy(&scale, src1->data, sizeof(float), cudaMemcpyDeviceToHost));
-    }
+    const float scale = ((float *) dst->op_params)[0];
 
     scale_f32_cuda(src0_dd, dst_dd, scale, ggml_nelements(src0), main_stream);
     CUDA_CHECK(cudaGetLastError());
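After this hunk the scale factor is read from the op's op_params instead of a separate src1 tensor, so the backend no longer needs the CPU/GPU branch or a device-to-host copy just to learn the scalar. A minimal sketch of the op_params idiom, with a hypothetical my_tensor struct standing in for ggml_tensor:

    #include <cstdint>
    #include <cstring>

    struct my_tensor {
        int32_t op_params[16]; // raw parameter storage; scalars are memcpy'd in
    };

    static void set_scale(my_tensor * t, float scale) {
        // store the float bit pattern in the first op_params slot
        std::memcpy(&t->op_params[0], &scale, sizeof(scale));
    }

    static float get_scale(const my_tensor * t) {
        float scale;
        std::memcpy(&scale, &t->op_params[0], sizeof(scale));
        return scale; // available on the host with no device round-trip
    }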
@@ -7747,8 +7749,6 @@ static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * s
     const bool src1_on_device = use_src1 && src1->backend == GGML_BACKEND_GPU;
     const bool dst_on_device = dst->backend == GGML_BACKEND_GPU;
 
-    const bool src1_stays_on_host = use_src1 && dst->op == GGML_OP_SCALE;
-
     // dd = data device
     float * src0_ddf = nullptr;
     float * src1_ddf = nullptr;
@@ -7769,7 +7769,7 @@ static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * s
         CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_ddf, src0, 0, 0, 0, nrows0, main_stream));
     }
 
-    if (use_src1
+    if (use_src1) {
         if (src1_on_device) {
             src1_ddf = (float *) src1_extra->data_device[g_main_device];
         } else {
@@ -7817,6 +7817,11 @@ static void ggml_cuda_set_peer_access(const int n_tokens) {
     }
 
 #ifdef NDEBUG
+    for (int id = 0; id < g_device_count; ++id) {
+        CUDA_CHECK(ggml_cuda_set_device(id));
+        CUDA_CHECK(cudaDeviceSynchronize());
+    }
+
     for (int id = 0; id < g_device_count; ++id) {
         CUDA_CHECK(ggml_cuda_set_device(id));
 
@@ -7868,8 +7873,6 @@ static void ggml_cuda_op_mul_mat(
     const int nb2 = dst->nb[2];
     const int nb3 = dst->nb[3];
 
-    ggml_cuda_set_peer_access(ne11);
-
     GGML_ASSERT(dst->backend != GGML_BACKEND_GPU_SPLIT);
     GGML_ASSERT(src1->backend != GGML_BACKEND_GPU_SPLIT);
 
@@ -8300,27 +8303,27 @@ static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor
 }
 
 static __global__ void k_compute_batched_ptrs(
-        const half * src0_as_f16, const half * src1_as_f16,
+        const half * src0_as_f16, const half * src1_as_f16, char * dst,
         const void ** ptrs_src, void ** ptrs_dst,
-
-
-
-
-
-
-
-
+        int64_t ne12, int64_t ne13,
+        int64_t ne23,
+        size_t nb02, size_t nb03,
+        size_t nb12, size_t nb13,
+        size_t nbd2, size_t nbd3,
+        int64_t r2, int64_t r3) {
+    int64_t i13 = blockIdx.x * blockDim.x + threadIdx.x;
+    int64_t i12 = blockIdx.y * blockDim.y + threadIdx.y;
 
     if (i13 >= ne13 || i12 >= ne12) {
         return;
     }
 
-
-
+    int64_t i03 = i13 / r3;
+    int64_t i02 = i12 / r2;
 
     ptrs_src[0*ne23 + i12 + i13*ne12] = (const char *) src0_as_f16 + i02*nb02 + i03*nb03;
     ptrs_src[1*ne23 + i12 + i13*ne12] = (const char *) src1_as_f16 + i12*nb12/2 + i13*nb13/2;
-    ptrs_dst[0*ne23 + i12 + i13*ne12] = ( char *)
+    ptrs_dst[0*ne23 + i12 + i13*ne12] = ( char *) dst + i12*nbd2 + i13*nbd3;
 }
 
 static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -8376,7 +8379,41 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
     to_fp16_cuda(src1_ddf, src1_as_f16, ne1, main_stream);
 
     size_t dst_as = 0;
-
+
+    half * dst_f16 = nullptr;
+    char * dst_t = nullptr;
+
+    cublasComputeType_t cu_compute_type = CUBLAS_COMPUTE_16F;
+    cudaDataType_t cu_data_type = CUDA_R_16F;
+
+    // dst strides
+    size_t nbd2 = dst->nb[2];
+    size_t nbd3 = dst->nb[3];
+
+    const half alpha_f16 = 1.0f;
+    const half beta_f16 = 0.0f;
+
+    const float alpha_f32 = 1.0f;
+    const float beta_f32 = 0.0f;
+
+    const void * alpha = &alpha_f16;
+    const void * beta = &beta_f16;
+
+    if (dst->op_params[0] == GGML_PREC_DEFAULT) {
+        dst_f16 = (half *) ggml_cuda_pool_malloc(ne * sizeof(half), &dst_as);
+        dst_t = (char *) dst_f16;
+
+        nbd2 /= sizeof(float) / sizeof(half);
+        nbd3 /= sizeof(float) / sizeof(half);
+    } else {
+        dst_t = (char *) dst_ddf;
+
+        cu_compute_type = CUBLAS_COMPUTE_32F;
+        cu_data_type = CUDA_R_32F;
+
+        alpha = &alpha_f32;
+        beta = &beta_f32;
+    }
 
     GGML_ASSERT(ne12 % ne02 == 0);
     GGML_ASSERT(ne13 % ne03 == 0);
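The setup above keys the batched GEMM off the requested precision: with GGML_PREC_DEFAULT it multiplies in FP16 into a temporary half-precision buffer (converted to FP32 afterwards), otherwise it accumulates in FP32 and writes the FP32 destination directly. A condensed sketch of that selection (the gemm_config struct and pick_gemm_config helper are hypothetical; the real code passes these values to cublasGemmStridedBatchedEx / cublasGemmBatchedEx):

    #include <cublas_v2.h>
    #include <cuda_fp16.h>

    struct gemm_config {
        cublasComputeType_t compute_type; // accumulation precision inside cuBLAS
        cudaDataType_t      c_type;       // element type of the output matrix
        const void *        alpha;
        const void *        beta;
        bool                convert_f16_to_f32_after; // post-process temporary FP16 result
    };

    static gemm_config pick_gemm_config(bool prec_default) {
        static const half  alpha_f16 = 1.0f;
        static const half  beta_f16  = 0.0f;
        static const float alpha_f32 = 1.0f;
        static const float beta_f32  = 0.0f;

        if (prec_default) {
            // fast path: FP16 math, FP16 output, separate conversion kernel afterwards
            return { CUBLAS_COMPUTE_16F, CUDA_R_16F, &alpha_f16, &beta_f16, true };
        }
        // higher precision requested: FP32 accumulation, FP32 output written in place
        return { CUBLAS_COMPUTE_32F, CUDA_R_32F, &alpha_f32, &beta_f32, false };
    }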
@@ -8385,9 +8422,6 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
     const int64_t r2 = ne12/ne02;
     const int64_t r3 = ne13/ne03;
 
-    const half alpha_f16 = 1.0f;
-    const half beta_f16 = 0.0f;
-
 #if 0
     // use cublasGemmEx
     {
@@ -8397,12 +8431,12 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
             int i02 = i12 / r2;
 
             CUBLAS_CHECK(
-                cublasGemmEx(g_cublas_handles[
+                cublasGemmEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N,
                     ne01, ne11, ne10,
-
-
-
-
+                    alpha, (const char *) src0_as_f16 + i02*src0->nb[2] + i03*src0->nb[3] , CUDA_R_16F, nb01/sizeof(half),
+                    (const char *) src1_as_f16 + i12*src1->nb[2]/2 + i13*src1->nb[3]/2, CUDA_R_16F, nb11/sizeof(float),
+                    beta, ( char *) dst_t + i12*nbd2 + i13*nbd3, cu_data_type, ne01,
+                    cu_compute_type,
                     CUBLAS_GEMM_DEFAULT_TENSOR_OP));
         }
     }
@@ -8414,11 +8448,11 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
         CUBLAS_CHECK(
         cublasGemmStridedBatchedEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N,
                 ne01, ne11, ne10,
-
-
-
+                alpha, (const char *) src0_as_f16, CUDA_R_16F, nb01/sizeof(half), src0->nb[2]/sizeof(half), // strideA
+                (const char *) src1_as_f16, CUDA_R_16F, nb11/sizeof(float), src1->nb[2]/sizeof(float), // strideB
+                beta, ( char *) dst_t, cu_data_type, ne01, dst->nb[2]/sizeof(float), // strideC
                 ne12*ne13,
-
+                cu_compute_type,
                 CUBLAS_GEMM_DEFAULT_TENSOR_OP));
     } else {
         // use cublasGemmBatchedEx
@@ -8435,24 +8469,24 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
 
         dim3 block_dims(ne13, ne12);
         k_compute_batched_ptrs<<<1, block_dims, 0, main_stream>>>(
-                src0_as_f16, src1_as_f16,
+                src0_as_f16, src1_as_f16, dst_t,
                 ptrs_src, ptrs_dst,
                 ne12, ne13,
                 ne23,
                 nb02, nb03,
                 nb12, nb13,
-
+                nbd2, nbd3,
                 r2, r3);
         CUDA_CHECK(cudaGetLastError());
 
         CUBLAS_CHECK(
         cublasGemmBatchedEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N,
                 ne01, ne11, ne10,
-
-
-
+                alpha, (const void **) (ptrs_src + 0*ne23), CUDA_R_16F, nb01/sizeof(half),
+                (const void **) (ptrs_src + 1*ne23), CUDA_R_16F, nb11/sizeof(float),
+                beta, ( void **) (ptrs_dst + 0*ne23), cu_data_type, ne01,
                 ne23,
-
+                cu_compute_type,
                 CUBLAS_GEMM_DEFAULT_TENSOR_OP));
 
         if (ptrs_src_s != 0) {
@@ -8464,11 +8498,14 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
     }
 #endif
 
-
-
+    if (dst->op_params[0] == GGML_PREC_DEFAULT) {
+        const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
+        to_fp32_cuda(dst_f16, dst_ddf, ne, main_stream);
+
+        ggml_cuda_pool_free(dst_f16, dst_as);
+    }
 
     ggml_cuda_pool_free(src1_as_f16, src1_as);
-    ggml_cuda_pool_free(dst_f16, dst_as);
 }
 
 static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -8732,7 +8769,8 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s
     // TODO: mmq/mmv support
 #endif
 
-
+    const int64_t nb11 = src1->nb[1];
+    const int64_t nb1 = dst->nb[1];
 
     const struct ggml_tensor * ids = src0;
     const int32_t id = ((int32_t *) dst->op_params)[0];
@@ -8740,10 +8778,12 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s
 
     std::vector<char> ids_host(ggml_nbytes(ids));
 
+    const cudaStream_t stream = g_cudaStreams[g_main_device][0];
+
     if (ids->backend == GGML_BACKEND_GPU) {
         const char * ids_dev = (const char *)((const ggml_tensor_extra_gpu *)ids->extra)->data_device[g_main_device];
-        CUDA_CHECK(cudaMemcpyAsync(ids_host.data(), ids_dev, ggml_nbytes(ids), cudaMemcpyDeviceToHost,
-        CUDA_CHECK(cudaStreamSynchronize(
+        CUDA_CHECK(cudaMemcpyAsync(ids_host.data(), ids_dev, ggml_nbytes(ids), cudaMemcpyDeviceToHost, stream));
+        CUDA_CHECK(cudaStreamSynchronize(stream));
     } else {
         memcpy(ids_host.data(), ids->data, ggml_nbytes(ids));
     }
@@ -8757,37 +8797,110 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s
     ggml_tensor src1_row = *src1;
     ggml_tensor dst_row = *dst;
 
-    src1_row.
-    dst_row.
-
-    src1_row.nb[2] = src1_row.nb[1];
-    dst_row.nb[2] = dst_row.nb[1];
-
-    src1_row.nb[3] = src1_row.nb[1];
-    dst_row.nb[3] = dst_row.nb[1];
+    src1_row.backend = GGML_BACKEND_GPU;
+    dst_row.backend = GGML_BACKEND_GPU;
 
     src1_row.extra = &src1_row_extra;
     dst_row.extra = &dst_row_extra;
 
+    char * src1_original = src1->backend == GGML_BACKEND_CPU ?
+        (char *) src1->data : (char *) src1_extra->data_device[g_main_device];
+    char * dst_original = dst->backend == GGML_BACKEND_CPU ?
+        (char *) dst->data : (char *) dst_extra->data_device[g_main_device];
+
+    if (src1->ne[1] == 1) {
+        GGML_ASSERT(src1->backend == GGML_BACKEND_GPU);
+        GGML_ASSERT(dst->backend == GGML_BACKEND_GPU);
+
+        for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
+            //int32_t row_id;
+            //CUDA_CHECK(cudaMemcpyAsync(&row_id, ids_dev + i01*ids->nb[1] + id*ids->nb[0], sizeof(int32_t), cudaMemcpyDeviceToHost, g_cudaStreams[g_main_device][0]));
+            //CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[g_main_device][0]));
+
+            const int32_t row_id = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]);
 
-
-    //int32_t row_id;
-    //CUDA_CHECK(cudaMemcpyAsync(&row_id, ids_dev + i01*ids->nb[1] + id*ids->nb[0], sizeof(int32_t), cudaMemcpyDeviceToHost, g_cudaStreams[g_main_device][0]));
-    //CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[g_main_device][0]));
+            GGML_ASSERT(row_id >= 0 && row_id < n_as);
 
-
+            const struct ggml_tensor * src0_row = dst->src[row_id + 2];
 
-
+            src1_row_extra.data_device[g_main_device] = src1_original + i01*src1->nb[1];
+            src1_row.data = (char *) src1->data + i01*src1->nb[1]; // TODO why is this set?
+
+            dst_row_extra.data_device[g_main_device] = dst_original + i01*dst->nb[1];
+            dst_row.data = (char *) dst->data + i01*dst->nb[1]; // TODO why is this set?
+
+            ggml_cuda_mul_mat(src0_row, &src1_row, &dst_row);
+        }
+    } else {
+        size_t as_src1, as_dst;
+        char * src1_contiguous = (char *) ggml_cuda_pool_malloc(sizeof(float)*ggml_nelements(src1), &as_src1);
+        char * dst_contiguous = (char *) ggml_cuda_pool_malloc(sizeof(float)*ggml_nelements(dst), &as_dst);
 
-
+        src1_row_extra.data_device[g_main_device] = src1_contiguous;
+        dst_row_extra.data_device[g_main_device] = dst_contiguous;
 
-
-
+        const cudaMemcpyKind src1_kind = src1->backend == GGML_BACKEND_CPU ?
+            cudaMemcpyHostToDevice : cudaMemcpyDeviceToDevice;
+        const cudaMemcpyKind dst_kind = dst->backend == GGML_BACKEND_CPU ?
+            cudaMemcpyHostToDevice : cudaMemcpyDeviceToDevice;
 
-
-
+        for (int32_t row_id = 0; row_id < n_as; ++row_id) {
+            const struct ggml_tensor * src0_row = dst->src[row_id + 2];
 
-
+            int64_t num_src1_rows = 0;
+            for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
+                const int32_t row_id_i = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]);
+
+                if (row_id_i != row_id) {
+                    continue;
+                }
+
+                GGML_ASSERT(row_id >= 0 && row_id < n_as);
+
+                CUDA_CHECK(cudaMemcpyAsync(src1_contiguous + num_src1_rows*nb11, src1_original + i01*nb11,
+                    nb11, src1_kind, stream));
+                num_src1_rows++;
+            }
+
+            if (num_src1_rows == 0) {
+                continue;
+            }
+
+            src1_row.ne[1] = num_src1_rows;
+            dst_row.ne[1] = num_src1_rows;
+
+            src1_row.nb[1] = nb11;
+            src1_row.nb[2] = num_src1_rows*nb11;
+            src1_row.nb[3] = num_src1_rows*nb11;
+
+            dst_row.nb[1] = nb1;
+            dst_row.nb[2] = num_src1_rows*nb1;
+            dst_row.nb[3] = num_src1_rows*nb1;
+
+            ggml_cuda_mul_mat(src0_row, &src1_row, &dst_row);
+
+            num_src1_rows = 0;
+            for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
+                const int32_t row_id_i = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]);
+
+                if (row_id_i != row_id) {
+                    continue;
+                }
+
+                GGML_ASSERT(row_id >= 0 && row_id < n_as);
+
+                CUDA_CHECK(cudaMemcpyAsync(dst_original + i01*nb1, dst_contiguous + num_src1_rows*nb1,
+                    nb1, dst_kind, stream));
+                num_src1_rows++;
+            }
+        }
+
+        ggml_cuda_pool_free(src1_contiguous, as_src1);
+        ggml_cuda_pool_free(dst_contiguous, as_dst);
+    }
+
+    if (dst->backend == GGML_BACKEND_CPU) {
+        CUDA_CHECK(cudaStreamSynchronize(stream));
     }
 }
 
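For batches with more than one src1 row, the new path above gathers every row routed to a given expert into a contiguous scratch buffer, runs one matrix multiplication per expert, and scatters the results back to their original positions. A CPU-side sketch of that gather/compute/scatter flow (the matmul_rows callback and plain memcpy stand in for ggml_cuda_mul_mat and cudaMemcpyAsync):

    #include <cstdint>
    #include <cstring>
    #include <vector>

    // ids[i] selects which expert processes input row i.
    typedef void (*matmul_rows_t)(int expert, const float * src_rows, float * dst_rows, int64_t n_rows);

    static void mul_mat_id_batched(int n_experts, const int32_t * ids,
                                   const float * src, float * dst,
                                   int64_t n_rows, int64_t row_in, int64_t row_out,
                                   matmul_rows_t matmul_rows) {
        std::vector<float> src_gather(n_rows*row_in);
        std::vector<float> dst_gather(n_rows*row_out);

        for (int e = 0; e < n_experts; ++e) {
            // gather all rows routed to expert e into a contiguous block
            int64_t n = 0;
            for (int64_t i = 0; i < n_rows; ++i) {
                if (ids[i] != e) continue;
                std::memcpy(&src_gather[n*row_in], &src[i*row_in], row_in*sizeof(float));
                n++;
            }
            if (n == 0) continue;

            // one matrix multiplication per expert over its n gathered rows
            matmul_rows(e, src_gather.data(), dst_gather.data(), n);

            // scatter the n result rows back to their original positions
            n = 0;
            for (int64_t i = 0; i < n_rows; ++i) {
                if (ids[i] != e) continue;
                std::memcpy(&dst[i*row_out], &dst_gather[n*row_out], row_out*sizeof(float));
                n++;
            }
        }
    }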
@@ -8958,7 +9071,7 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
 
         char * buf;
         CUDA_CHECK(cudaMalloc(&buf, size));
-        char * buf_host = (char*)data + offset_split;
+        char * buf_host = (char *)data + offset_split;
 
         // set padding to 0 to avoid possible NaN values
         if (size > original_size) {
@@ -8980,7 +9093,7 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
 }
 
 void ggml_cuda_free_data(struct ggml_tensor * tensor) {
-    if (!tensor || (tensor->backend != GGML_BACKEND_GPU && tensor->backend != GGML_BACKEND_GPU_SPLIT) ) {
+    if (!tensor || !tensor->extra || (tensor->backend != GGML_BACKEND_GPU && tensor->backend != GGML_BACKEND_GPU_SPLIT) ) {
         return;
     }
 
@@ -9103,11 +9216,10 @@ void ggml_cuda_assign_scratch_offset(struct ggml_tensor * tensor, size_t offset)
 
     ggml_tensor_extra_gpu * extra = ggml_cuda_alloc_temp_tensor_extra();
 
-    const bool inplace =
-        tensor->op == GGML_OP_VIEW;
+    const bool inplace = tensor->view_src != nullptr;
 
-    if (inplace && (tensor->
-        ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->
+    if (inplace && (tensor->view_src->backend == GGML_BACKEND_GPU || tensor->view_src->backend == GGML_BACKEND_GPU_SPLIT)) {
+        ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->view_src->extra;
         char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
         size_t view_offset = 0;
         if (tensor->op == GGML_OP_VIEW) {
@@ -9187,14 +9299,14 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
         || (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT))
         || (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_GPU);
 
-    if (!any_on_device && tensor->op != GGML_OP_MUL_MAT) {
+    if (!any_on_device && tensor->op != GGML_OP_MUL_MAT && tensor->op != GGML_OP_MUL_MAT_ID) {
         return false;
     }
 
     if (tensor->op == GGML_OP_MUL_MAT) {
         if (tensor->src[0]->ne[3] != tensor->src[1]->ne[3]) {
 #ifndef NDEBUG
-            fprintf(stderr, "%s: cannot compute %s: src0->ne[3] = " PRId64 ", src1->ne[3] = " PRId64 " - fallback to CPU\n", __func__, tensor->name, tensor->src[0]->ne[3], tensor->src[1]->ne[3]);
+            fprintf(stderr, "%s: cannot compute %s: src0->ne[3] = %" PRId64 ", src1->ne[3] = %" PRId64 " - fallback to CPU\n", __func__, tensor->name, tensor->src[0]->ne[3], tensor->src[1]->ne[3]);
 #endif
             return false;
         }
@@ -9323,6 +9435,10 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
         return false;
     }
 
+    if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT) {
+        ggml_cuda_set_peer_access(tensor->src[1]->ne[1]);
+    }
+
     if (params->ith != 0) {
         return true;
     }
@@ -9396,7 +9512,7 @@ static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, g
     ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
 
     if (tensor->view_src != NULL && tensor->view_offs == 0) {
-        assert(tensor->view_src->buffer->buft == buffer->buft);
+        assert(tensor->view_src->buffer->buft == buffer->buft);
         tensor->backend = tensor->view_src->backend;
         tensor->extra = tensor->view_src->extra;
         return;
@@ -9427,23 +9543,34 @@ static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, g
 }
 
 static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
-    GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
-    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
     GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
 
-
+    ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
 
-
+    ggml_cuda_set_device(ctx->device);
+    CUDA_CHECK(cudaDeviceSynchronize());
+
+    CUDA_CHECK(cudaMemcpy((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice));
 }
 
 static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
-    GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
-    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
     GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
 
+    ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
+
+    ggml_cuda_set_device(ctx->device);
+    CUDA_CHECK(cudaDeviceSynchronize());
+
     CUDA_CHECK(cudaMemcpy(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost));
+}
 
-
+static void ggml_backend_cuda_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
+    ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
+
+    ggml_cuda_set_device(ctx->device);
+    CUDA_CHECK(cudaDeviceSynchronize());
+
+    CUDA_CHECK(cudaMemset(ctx->dev_ptr, value, buffer->size));
 }
 
 static struct ggml_backend_buffer_i cuda_backend_buffer_interface = {
@@ -9454,6 +9581,7 @@ static struct ggml_backend_buffer_i cuda_backend_buffer_interface = {
     /* .get_tensor = */ ggml_backend_cuda_buffer_get_tensor,
     /* .cpy_tensor_from = */ NULL,
     /* .cpy_tensor_to = */ NULL,
+    /* .clear = */ ggml_backend_cuda_buffer_clear,
 };
 
 // cuda buffer type
@@ -9505,35 +9633,36 @@ static bool ggml_backend_cuda_buffer_type_supports_backend(ggml_backend_buffer_t
     UNUSED(buft);
 }
 
-static ggml_backend_buffer_type_i
+static ggml_backend_buffer_type_i ggml_backend_cuda_buffer_type_interface = {
     /* .alloc_buffer = */ ggml_backend_cuda_buffer_type_alloc_buffer,
     /* .get_alignment = */ ggml_backend_cuda_buffer_type_get_alignment,
     /* .get_alloc_size = */ ggml_backend_cuda_buffer_type_get_alloc_size,
     /* .supports_backend = */ ggml_backend_cuda_buffer_type_supports_backend,
+    /* .is_host = */ nullptr,
 };
 
 ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) {
-    static struct ggml_backend_buffer_type
-
-
+    static struct ggml_backend_buffer_type ggml_backend_cuda_buffer_types[GGML_CUDA_MAX_DEVICES];
+
+    static bool ggml_backend_cuda_buffer_type_initialized = false;
+
+    if (!ggml_backend_cuda_buffer_type_initialized) {
     for (int i = 0; i < GGML_CUDA_MAX_DEVICES; i++) {
-
-        /* .iface = */
+        ggml_backend_cuda_buffer_types[i] = {
+            /* .iface = */ ggml_backend_cuda_buffer_type_interface,
             /* .context = */ (ggml_backend_buffer_type_context_t) (intptr_t) i,
         };
     }
-
+        ggml_backend_cuda_buffer_type_initialized = true;
     }
 
-    return &
+    return &ggml_backend_cuda_buffer_types[device];
 }
 
 // host buffer type
 
 static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
-
-    CUDA_CHECK(cudaFreeHost(ctx->dev_ptr));
-    delete ctx;
+    CUDA_CHECK(cudaFreeHost(buffer->context));
 }
 
 static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
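ggml_backend_cuda_buffer_type now fills its per-device table lazily, on the first call, instead of relying on static initializers. The same shape in isolation (device_info and MAX_DEVICES are hypothetical; like the original, the first call is not thread-safe):

    #include <cstdint>

    #define MAX_DEVICES 16

    struct device_info {
        int64_t id; // per-device data; the real table stores a buffer-type interface + context
    };

    static const device_info * get_device_info(int device) {
        static device_info infos[MAX_DEVICES];
        static bool initialized = false;

        if (!initialized) {
            // build the whole table the first time any device is requested
            for (int i = 0; i < MAX_DEVICES; i++) {
                infos[i] = { /* .id = */ (int64_t) i };
            }
            initialized = true;
        }

        return &infos[device];
    }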
@@ -9546,24 +9675,21 @@ static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggm
     buffer->iface.free_buffer = ggml_backend_cuda_host_buffer_free_buffer;
 
     return buffer;
-
-    UNUSED(buft);
 }
 
-struct ggml_backend_buffer_type_i cuda_backend_host_buffer_type_interface = {
-    /* .alloc_buffer = */ ggml_backend_cuda_host_buffer_type_alloc_buffer,
-    /* .get_alignment = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
-    /* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
-    /* .supports_backend = */ ggml_backend_cpu_buffer_type()->iface.supports_backend,
-};
-
 ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() {
-    static struct ggml_backend_buffer_type
-        /* .iface = */
+    static struct ggml_backend_buffer_type ggml_backend_cuda_buffer_type_host = {
+        /* .iface = */ {
+            /* .alloc_buffer = */ ggml_backend_cuda_host_buffer_type_alloc_buffer,
+            /* .get_alignment = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
+            /* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
+            /* .supports_backend = */ ggml_backend_cpu_buffer_type()->iface.supports_backend,
+            /* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host,
+        },
         /* .context = */ nullptr,
     };
 
-    return &
+    return &ggml_backend_cuda_buffer_type_host;
 }
 
 // backend
@@ -9595,8 +9721,6 @@ static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_tens
     ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
 
     GGML_ASSERT(tensor->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
-    GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
-    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
     GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
 
     CUDA_CHECK(cudaMemcpyAsync((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice, g_cudaStreams[cuda_ctx->device][0]));
@@ -9606,8 +9730,6 @@ static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend, const ggm
     ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
 
     GGML_ASSERT(tensor->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
-    GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
-    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
     GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
 
     CUDA_CHECK(cudaMemcpyAsync(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost, g_cudaStreams[cuda_ctx->device][0]));