llama_cpp 0.5.3 → 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +9 -0
- data/README.md +6 -5
- data/examples/chat.rb +13 -13
- data/examples/embedding.rb +9 -9
- data/ext/llama_cpp/llama_cpp.cpp +547 -272
- data/ext/llama_cpp/src/ggml-alloc.c +8 -2
- data/ext/llama_cpp/src/ggml-alloc.h +1 -0
- data/ext/llama_cpp/src/ggml-cuda.cu +209 -82
- data/ext/llama_cpp/src/ggml-cuda.h +1 -0
- data/ext/llama_cpp/src/ggml-metal.h +4 -0
- data/ext/llama_cpp/src/ggml-metal.m +163 -84
- data/ext/llama_cpp/src/ggml-metal.metal +121 -38
- data/ext/llama_cpp/src/ggml.c +1596 -842
- data/ext/llama_cpp/src/ggml.h +116 -35
- data/ext/llama_cpp/src/llama.cpp +1015 -586
- data/ext/llama_cpp/src/llama.h +304 -119
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +5 -9
- data/sig/llama_cpp.rbs +65 -34
- metadata +3 -3
--- a/data/ext/llama_cpp/src/ggml-alloc.c
+++ b/data/ext/llama_cpp/src/ggml-alloc.c
@@ -77,7 +77,7 @@ struct free_block {
     size_t size;
 };
 
-#define MAX_FREE_BLOCKS
+#define MAX_FREE_BLOCKS 256
 
 struct ggml_allocr {
     void * data;
@@ -187,6 +187,7 @@ void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor)
     }
 
     tensor->data = addr;
+    AT_PRINTF("%s: allocated data at %p\n", __func__, tensor->data);
 
 #ifdef GGML_ALLOCATOR_DEBUG
     add_allocated_tensor(alloc, tensor);
@@ -218,7 +219,8 @@ static void ggml_allocr_free_tensor(struct ggml_allocr * alloc, struct ggml_tens
 
     size_t size = ggml_allocr_get_alloc_size(alloc, tensor);
     size = aligned_offset(NULL, size, alloc->alignment);
-    AT_PRINTF("%s: freeing %s (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, size, alloc->n_free_blocks);
+    AT_PRINTF("%s: freeing %s at %p (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, ptr, size, alloc->n_free_blocks);
+    AT_PRINTF("%s: alloc->data = %p alloc->data+alloc->size = %p alloc->data+alloc->max_size = %p\n", __func__, alloc->data, (char*)alloc->data + alloc->size, (char*)alloc->data + alloc->max_size);
 
 #ifdef GGML_ALLOCATOR_DEBUG
     remove_allocated_tensor(alloc, tensor);
@@ -631,3 +633,7 @@ static size_t ggml_allocr_alloc_graph_tensors_n(
 size_t ggml_allocr_alloc_graph(struct ggml_allocr * alloc, struct ggml_cgraph * graph) {
     return ggml_allocr_alloc_graph_tensors_n(alloc, &graph, 1, NULL, NULL);
 }
+
+size_t ggml_allocr_max_size(struct ggml_allocr * alloc) {
+    return alloc->max_size;
+}
--- a/data/ext/llama_cpp/src/ggml-alloc.h
+++ b/data/ext/llama_cpp/src/ggml-alloc.h
@@ -19,6 +19,7 @@ GGML_API bool ggml_allocr_is_measure(struct ggml_allocr * alloc);
 GGML_API void ggml_allocr_reset(struct ggml_allocr * alloc);
 GGML_API void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor);
 GGML_API size_t ggml_allocr_alloc_graph(struct ggml_allocr * alloc, struct ggml_cgraph * graph);
+GGML_API size_t ggml_allocr_max_size(struct ggml_allocr * alloc);
 
 
 #ifdef __cplusplus
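ggml-alloc gains a public accessor, ggml_allocr_max_size, for the allocator's recorded high-water mark. A minimal usage sketch follows; it is not taken from the gem and assumes the existing measuring-allocator calls ggml_allocr_new_measure and ggml_allocr_free declared in ggml-alloc.h:

```cpp
// Hypothetical sketch: sizing a compute buffer with the new ggml_allocr_max_size().
// Assumes the measuring-allocator API shipped in ggml-alloc.h with this gem.
#include "ggml.h"
#include "ggml-alloc.h"

static size_t measure_compute_buffer(struct ggml_cgraph * graph) {
    // a "measure" allocator hands out virtual addresses and only records sizes
    struct ggml_allocr * alloc = ggml_allocr_new_measure(/*alignment =*/ 32);

    // walk the graph once to record the worst-case memory requirement
    ggml_allocr_alloc_graph(alloc, graph);

    // new in 0.6.0: query the recorded high-water mark directly
    const size_t max_size = ggml_allocr_max_size(alloc);

    ggml_allocr_free(alloc);
    return max_size;
}
```

A caller could use the returned value to size the real compute buffer before allocating the graph against it.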
--- a/data/ext/llama_cpp/src/ggml-cuda.cu
+++ b/data/ext/llama_cpp/src/ggml-cuda.cu
@@ -1,3 +1,4 @@
+#include <algorithm>
 #include <cstddef>
 #include <cstdint>
 #include <limits>
@@ -14,9 +15,11 @@
 // for rocblas_initialize()
 #include "rocblas/rocblas.h"
 #endif // __HIP_PLATFORM_AMD__
+#define CUBLAS_COMPUTE_16F HIPBLAS_R_16F
 #define CUBLAS_COMPUTE_32F HIPBLAS_R_32F
 #define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F
 #define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT
+#define CUBLAS_GEMM_DEFAULT_TENSOR_OP HIPBLAS_GEMM_DEFAULT
 #define CUBLAS_OP_N HIPBLAS_OP_N
 #define CUBLAS_OP_T HIPBLAS_OP_T
 #define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS
@@ -235,8 +238,12 @@ static __device__ __forceinline__ int get_int_from_uint8_aligned(const uint8_t *
     return *((int *) (x8 + sizeof(int) * i32)); // assume at least 4 byte alignment
 }
 
+template<typename T>
+using to_t_cuda_t = void (*)(const void * __restrict__ x, T * __restrict__ y, int k, cudaStream_t stream);
+typedef to_t_cuda_t<float> to_fp32_cuda_t;
+typedef to_t_cuda_t<half> to_fp16_cuda_t;
+
 typedef void (*dequantize_kernel_t)(const void * vx, const int ib, const int iqs, dfloat2 & v);
-typedef void (*to_fp32_cuda_t)(const void * __restrict__ x, float * __restrict__ y, int k, cudaStream_t stream);
 typedef void (*dot_kernel_k_t)(const void * __restrict__ vx, const int ib, const int iqs, const float * __restrict__ y, float & v);
 typedef void (*cpy_kernel_t)(const char * cx, char * cdst);
 typedef void (*ggml_cuda_func_t)(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
@@ -461,7 +468,7 @@ static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0};
 static bool g_mul_mat_q = true;
 
 static void * g_scratch_buffer = nullptr;
-static size_t g_scratch_size =
+static size_t g_scratch_size = 0; // disabled by default
 static size_t g_scratch_offset = 0;
 
 static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};
@@ -1515,6 +1522,14 @@ static __device__ void convert_f16(const void * vx, const int ib, const int iqs,
     v.y = x[ib + iqs + 1];
 }
 
+static __device__ void convert_f32(const void * vx, const int ib, const int iqs, dfloat2 & v){
+    const float * x = (const float *) vx;
+
+    // automatic half -> float type cast if dfloat == float
+    v.x = x[ib + iqs + 0];
+    v.y = x[ib + iqs + 1];
+}
+
 static __global__ void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, const int kx, const int kx_padded) {
     const int ix = blockDim.x*blockIdx.x + threadIdx.x;
 
@@ -1554,8 +1569,8 @@ static __global__ void quantize_q8_1(const float * __restrict__ x, void * __rest
     reinterpret_cast<half&>(y[ib].ds.y) = sum;
 }
 
-template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
-static __global__ void dequantize_block(const void * __restrict__ vx,
+template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
+static __global__ void dequantize_block(const void * __restrict__ vx, dst_t * __restrict__ y, const int k) {
     const int i = blockDim.x*blockIdx.x + 2*threadIdx.x;
 
     if (i >= k) {
@@ -4355,8 +4370,10 @@ static __global__ void cpy_f32_f16(const char * cx, char * cdst, const int ne,
 }
 
 // rope == RoPE == rotary positional embedding
-
-
+
+template<typename T, bool has_pos>
+static __global__ void rope(const T * x, T * dst, const int ncols, const int32_t * pos, const float freq_scale,
+                            const int p_delta_rows, const float theta_scale) {
     const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y);
 
     if (col >= ncols) {
@@ -4365,8 +4382,11 @@ static __global__ void rope_f32(const float * x, float * dst, const int ncols, c
 
     const int row = blockDim.x*blockIdx.x + threadIdx.x;
     const int i = row*ncols + col;
+    const int i2 = row/p_delta_rows;
 
-    const
+    const int p = has_pos ? pos[i2] : 0;
+    const float p0 = p*freq_scale;
+    const float theta = p0*powf(theta_scale, col/2);
     const float sin_theta = sinf(theta);
     const float cos_theta = cosf(theta);
 
@@ -4377,8 +4397,9 @@ static __global__ void rope_f32(const float * x, float * dst, const int ncols, c
     dst[i + 1] = x0*sin_theta + x1*cos_theta;
 }
 
-
-
+template<typename T, bool has_pos>
+static __global__ void rope_neox(const T * x, T * dst, const int ncols, const int32_t * pos, const float freq_scale,
+                                 const int p_delta_rows, const float theta_scale) {
     const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y);
 
     if (col >= ncols) {
@@ -4387,8 +4408,11 @@ static __global__ void rope_neox_f32(const float * x, float * dst, const int nco
 
     const int row = blockDim.x*blockIdx.x + threadIdx.x;
     const int i = row*ncols + col/2;
+    const int i2 = row/p_delta_rows;
 
-    const
+    const int p = has_pos ? pos[i2] : 0;
+    const float p0 = p*freq_scale;
+    const float theta = p0*powf(theta_scale, col/2);
     const float sin_theta = sinf(theta);
     const float cos_theta = cosf(theta);
 
@@ -4399,8 +4423,8 @@ static __global__ void rope_neox_f32(const float * x, float * dst, const int nco
     dst[i + ncols/2] = x0*sin_theta + x1*cos_theta;
 }
 
-static __global__ void rope_glm_f32(const float * x, float * dst, const int ncols, const float
-    const
+static __global__ void rope_glm_f32(const float * x, float * dst, const int ncols, const int32_t * pos, const float freq_scale,
+                                    const int p_delta_rows, const float theta_scale, const int n_ctx) {
     const int col = blockDim.x*blockIdx.x + threadIdx.x;
     const int half_n_dims = ncols/4;
 
@@ -4410,11 +4434,13 @@ static __global__ void rope_glm_f32(const float * x, float * dst, const int ncol
 
     const int row = blockDim.y*blockIdx.y + threadIdx.y;
     const int i = row*ncols + col;
+    const int i2 = row/p_delta_rows;
 
     const float col_theta_scale = powf(theta_scale, col);
-
+    // FIXME: this is likely wrong
+    const int p = pos != nullptr ? pos[i2] : 0;
 
-    const float theta = min(p,
+    const float theta = min(p, n_ctx - 2)*freq_scale*col_theta_scale;
     const float sin_theta = sinf(theta);
     const float cos_theta = cosf(theta);
 
@@ -4424,7 +4450,7 @@ static __global__ void rope_glm_f32(const float * x, float * dst, const int ncol
     dst[i + 0] = x0*cos_theta - x1*sin_theta;
     dst[i + half_n_dims] = x0*sin_theta + x1*cos_theta;
 
-    const float block_theta = max(p -
+    const float block_theta = ((float)max(p - n_ctx - 2, 0))*col_theta_scale;
     const float sin_block_theta = sinf(block_theta);
     const float cos_block_theta = cosf(block_theta);
 
@@ -4826,6 +4852,11 @@ static void convert_fp16_to_fp32_cuda(const void * vx, float * y, const int k, c
     dequantize_block<1, 1, convert_f16><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
 }
 
+static void convert_fp32_to_fp16_cuda(const void * vx, half * y, const int k, cudaStream_t stream) {
+    const int num_blocks = (k + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE;
+    dequantize_block<1, 1, convert_f32><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
+}
+
 static void convert_mul_mat_vec_f16_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
@@ -4835,6 +4866,15 @@ static void convert_mul_mat_vec_f16_cuda(const void * vx, const dfloat * y, floa
         <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
 }
 
+static to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
+    switch (type) {
+        case GGML_TYPE_F32:
+            return convert_fp32_to_fp16_cuda;
+        default:
+            return nullptr;
+    }
+}
+
 static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
     switch (type) {
         case GGML_TYPE_Q4_0:
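The conversion kernels above are now templated on the destination type (dst_t), and ggml_get_to_fp16_cuda returns an FP32-to-FP16 converter from a type switch. The standalone CUDA sketch below is illustrative only (names and sizes are made up, not the gem's code); it shows the same pattern of instantiating one element-wise kernel for different output types:

```cpp
// Minimal sketch of a conversion kernel templated on its destination type.
#include <cuda_fp16.h>
#include <cuda_runtime.h>
#include <cstdio>

template <typename dst_t>
__global__ void convert_block(const float * x, dst_t * y, int k) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < k) {
        y[i] = (dst_t) x[i];   // float -> dst_t, resolved at compile time
    }
}

int main() {
    const int k = 8;
    float hx[k];
    for (int i = 0; i < k; ++i) hx[i] = 0.5f * i;

    float * dx; half * dy;
    cudaMalloc(&dx, k * sizeof(float));
    cudaMalloc(&dy, k * sizeof(half));
    cudaMemcpy(dx, hx, k * sizeof(float), cudaMemcpyHostToDevice);

    convert_block<half><<<1, 32>>>(dx, dy, k);   // fp32 -> fp16, like the new to_fp16 table

    half hy[k];
    cudaMemcpy(hy, dy, k * sizeof(half), cudaMemcpyDeviceToHost);
    std::printf("y[3] = %f\n", __half2float(hy[3]));   // prints 1.500000

    cudaFree(dx); cudaFree(dy);
    return 0;
}
```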
--- a/data/ext/llama_cpp/src/ggml-cuda.cu
+++ b/data/ext/llama_cpp/src/ggml-cuda.cu
@@ -5361,31 +5401,41 @@ static void scale_f32_cuda(const float * x, float * dst, const float scale, cons
     scale_f32<<<num_blocks, CUDA_SCALE_BLOCK_SIZE, 0, stream>>>(x, dst, scale, k);
 }
 
-
-
+template<typename T>
+static void rope_cuda(const T * x, T * dst, const int ncols, const int nrows, const int32_t * pos, const float freq_scale,
+                      const int p_delta_rows, const float theta_scale, cudaStream_t stream) {
     GGML_ASSERT(ncols % 2 == 0);
     const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
     const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
     const dim3 block_nums(nrows, num_blocks_x, 1);
-
+    if (pos == nullptr) {
+        rope<T, false><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, pos, freq_scale, p_delta_rows, theta_scale);
+    } else {
+        rope<T, true><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, pos, freq_scale, p_delta_rows, theta_scale);
+    }
 }
 
-
-
+template<typename T>
+static void rope_neox_cuda(const T * x, T * dst, const int ncols, const int nrows, const int32_t * pos, const float freq_scale,
+                           const int p_delta_rows, const float theta_scale, cudaStream_t stream) {
     GGML_ASSERT(ncols % 2 == 0);
     const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
     const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
     const dim3 block_nums(nrows, num_blocks_x, 1);
-
+    if (pos == nullptr) {
+        rope_neox<T, false><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, pos, freq_scale, p_delta_rows, theta_scale);
+    } else {
+        rope_neox<T, true><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, pos, freq_scale, p_delta_rows, theta_scale);
+    }
 }
 
-static void rope_glm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float
-    const
+static void rope_glm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const int32_t * pos, const float freq_scale,
+                              const int p_delta_rows, const float theta_scale, const int n_ctx, cudaStream_t stream) {
     GGML_ASSERT(ncols % 4 == 0);
     const dim3 block_dims(CUDA_ROPE_BLOCK_SIZE/4, 1, 1);
     const int num_blocks_x = (ncols + CUDA_ROPE_BLOCK_SIZE - 1) / CUDA_ROPE_BLOCK_SIZE;
     const dim3 block_nums(num_blocks_x, nrows, 1);
-    rope_glm_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols,
+    rope_glm_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, pos, freq_scale, p_delta_rows, theta_scale, n_ctx);
 }
 
 static void alibi_f32_cuda(const float * x, float * dst, const int ncols, const int nrows,
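The rope launchers above pick a <T, true> or <T, false> instantiation once, based on whether pos is null, so the per-element check inside the kernel is resolved at compile time. A plain C++ sketch of that dispatch idiom (illustrative only, not the gem's code):

```cpp
// "Check once at runtime, branch at compile time inside the hot loop" dispatch.
#include <cstdio>

template <bool has_pos>
static int position_at(const int * pos, int i) {
    // when has_pos is false the compiler removes the load of pos[i] entirely
    return has_pos ? pos[i] : 0;
}

static void run(const int * pos, int n) {
    if (pos == nullptr) {
        for (int i = 0; i < n; ++i) std::printf("%d ", position_at<false>(pos, i));
    } else {
        for (int i = 0; i < n; ++i) std::printf("%d ", position_at<true>(pos, i));
    }
    std::printf("\n");
}

int main() {
    const int positions[] = {7, 11, 13};
    run(nullptr, 3);    // prints: 0 0 0
    run(positions, 3);  // prints: 7 11 13
    return 0;
}
```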
--- a/data/ext/llama_cpp/src/ggml-cuda.cu
+++ b/data/ext/llama_cpp/src/ggml-cuda.cu
@@ -6016,8 +6066,6 @@ inline void ggml_cuda_op_mul_mat_cublas(
     GGML_ASSERT(src1_ddf_i != nullptr);
     GGML_ASSERT(dst_dd_i != nullptr);
 
-    const float alpha = 1.0f;
-    const float beta = 0.0f;
 
     const int64_t ne00 = src0->ne[0];
 
@@ -6026,16 +6074,6 @@ inline void ggml_cuda_op_mul_mat_cublas(
     const int64_t ne0 = dst->ne[0];
     const int64_t row_diff = row_high - row_low;
 
-    float * src0_ddq_as_f32;
-    size_t src0_as = 0;
-
-    if (src0->type != GGML_TYPE_F32) {
-        const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(src0->type);
-        src0_ddq_as_f32 = (float *) ggml_cuda_pool_malloc(row_diff*ne00 * sizeof(float), &src0_as); // NOLINT
-        to_fp32_cuda(src0_dd_i, src0_ddq_as_f32, row_diff*ne00, stream);
-    }
-    const float * src0_ddf_i = src0->type == GGML_TYPE_F32 ? (const float *) src0_dd_i : src0_ddq_as_f32;
-
     int id;
     CUDA_CHECK(cudaGetDevice(&id));
 
@@ -6043,16 +6081,72 @@ inline void ggml_cuda_op_mul_mat_cublas(
     // ldc == nrows of the matrix that cuBLAS writes into
     int ldc = dst->backend == GGML_BACKEND_GPU && id == g_main_device ? ne0 : row_diff;
 
-
-
-
-
-
-
-
+    const int compute_capability = g_compute_capabilities[id];
+
+    if (compute_capability >= CC_TURING && src0->type == GGML_TYPE_F16 && ggml_is_contiguous(src0) && ldc == row_diff) {
+        // convert src1 to fp16, multiply as fp16, convert dst to fp32
+        half * src1_as_f16 = nullptr;
+        size_t src1_as = 0;
+        if (src1->type != GGML_TYPE_F16) {
+            const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type);
+            GGML_ASSERT(to_fp16_cuda != nullptr);
+            size_t ne = src1_ncols*ne10;
+            src1_as_f16 = (half *) ggml_cuda_pool_malloc(ne * sizeof(half), &src1_as);
+            to_fp16_cuda(src1_ddf_i, src1_as_f16, ne, stream);
+        }
+        const half * src1_ptr = src1->type == GGML_TYPE_F16 ? (const half *) src1_ddq_i : src1_as_f16;
+
+        size_t dst_as = 0;
+        half * dst_f16 = (half *) ggml_cuda_pool_malloc(row_diff*src1_ncols * sizeof(half), &dst_as);
+
+        const half alpha_f16 = 1.0f;
+        const half beta_f16 = 0.0f;
+
+        CUBLAS_CHECK(cublasSetStream(g_cublas_handles[id], stream));
+        CUBLAS_CHECK(
+            cublasGemmEx(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
+                    row_diff, src1_ncols, ne10,
+                    &alpha_f16, src0_dd_i, CUDA_R_16F, ne00,
+                                src1_ptr,  CUDA_R_16F, ne10,
+                    &beta_f16,  dst_f16,   CUDA_R_16F, ldc,
+                    CUBLAS_COMPUTE_16F,
+                    CUBLAS_GEMM_DEFAULT_TENSOR_OP));
+
+        const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
+        to_fp32_cuda(dst_f16, dst_dd_i, row_diff*src1_ncols, stream);
+
+        ggml_cuda_pool_free(dst_f16, dst_as);
 
-
-
+        if (src1_as != 0) {
+            ggml_cuda_pool_free(src1_as_f16, src1_as);
+        }
+    }
+    else {
+        float * src0_ddq_as_f32 = nullptr;
+        size_t src0_as = 0;
+
+        if (src0->type != GGML_TYPE_F32) {
+            const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(src0->type);
+            GGML_ASSERT(to_fp32_cuda != nullptr);
+            src0_ddq_as_f32 = (float *) ggml_cuda_pool_malloc(row_diff*ne00 * sizeof(float), &src0_as); // NOLINT
+            to_fp32_cuda(src0_dd_i, src0_ddq_as_f32, row_diff*ne00, stream);
+        }
+        const float * src0_ddf_i = src0->type == GGML_TYPE_F32 ? (const float *) src0_dd_i : src0_ddq_as_f32;
+
+        const float alpha = 1.0f;
+        const float beta = 0.0f;
+
+        CUBLAS_CHECK(cublasSetStream(g_cublas_handles[id], stream));
+        CUBLAS_CHECK(
+            cublasSgemm(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
+                    row_diff, src1_ncols, ne10,
+                    &alpha, src0_ddf_i, ne00,
+                            src1_ddf_i, ne10,
+                    &beta,  dst_dd_i,   ldc));
+
+        if (src0_as != 0) {
+            ggml_cuda_pool_free(src0_ddq_as_f32, src0_as);
+        }
     }
 
     (void) dst;
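On GPUs at or above the CC_TURING compute capability, F16 weights are now multiplied directly in half precision with cublasGemmEx and the result is converted back to FP32, instead of first dequantizing src0 to FP32 for cublasSgemm. Below is a self-contained, hedged illustration of such a call with toy sizes, column-major layout, and minimal error handling; it is not the gem's code:

```cpp
// Standalone FP16 GEMM through cublasGemmEx, the call used by the new fast path above.
#include <cublas_v2.h>
#include <cuda_fp16.h>
#include <cuda_runtime.h>
#include <cstdio>
#include <vector>

int main() {
    const int m = 4, n = 4, k = 4;
    std::vector<half> ha(m * k, __float2half(1.0f));   // A: all ones
    std::vector<half> hb(k * n, __float2half(2.0f));   // B: all twos
    std::vector<half> hc(m * n, __float2half(0.0f));

    half *da, *db, *dc;
    cudaMalloc(&da, ha.size() * sizeof(half));
    cudaMalloc(&db, hb.size() * sizeof(half));
    cudaMalloc(&dc, hc.size() * sizeof(half));
    cudaMemcpy(da, ha.data(), ha.size() * sizeof(half), cudaMemcpyHostToDevice);
    cudaMemcpy(db, hb.data(), hb.size() * sizeof(half), cudaMemcpyHostToDevice);

    cublasHandle_t handle;
    cublasCreate(&handle);

    // with CUBLAS_COMPUTE_16F the scaling factors must also be half
    const half alpha = __float2half(1.0f);
    const half beta  = __float2half(0.0f);

    // column-major C(m x n) = A(m x k) * B(k x n), FP16 storage and FP16 accumulation
    cublasGemmEx(handle, CUBLAS_OP_N, CUBLAS_OP_N,
                 m, n, k,
                 &alpha, da, CUDA_R_16F, m,
                         db, CUDA_R_16F, k,
                 &beta,  dc, CUDA_R_16F, m,
                 CUBLAS_COMPUTE_16F, CUBLAS_GEMM_DEFAULT_TENSOR_OP);

    cudaMemcpy(hc.data(), dc, hc.size() * sizeof(half), cudaMemcpyDeviceToHost);
    std::printf("C[0][0] = %.1f (expected %.1f)\n", __half2float(hc[0]), 2.0f * k);

    cublasDestroy(handle);
    cudaFree(da); cudaFree(db); cudaFree(dc);
    return 0;
}
```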
--- a/data/ext/llama_cpp/src/ggml-cuda.cu
+++ b/data/ext/llama_cpp/src/ggml-cuda.cu
@@ -6064,14 +6158,16 @@ inline void ggml_cuda_op_rope(
     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
     const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
 
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+    GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
+    GGML_ASSERT(src0->type == dst->type);
 
     const int64_t ne00 = src0->ne[0];
     const int64_t ne01 = src0->ne[1];
+    const int64_t ne2 = dst->ne[2];
     const int64_t nrows = ggml_nrows(src0);
 
-    const int n_past = ((int32_t *) dst->op_params)[0];
+    //const int n_past = ((int32_t *) dst->op_params)[0];
     const int n_dims = ((int32_t *) dst->op_params)[1];
     const int mode = ((int32_t *) dst->op_params)[2];
     const int n_ctx = ((int32_t *) dst->op_params)[3];
@@ -6082,19 +6178,38 @@ inline void ggml_cuda_op_rope(
     memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float));
 
     const float theta_scale = powf(freq_base, -2.0f/n_dims);
-
+
+    const int32_t * pos = nullptr;
+    if ((mode & 1) == 0) {
+        GGML_ASSERT(src1->type == GGML_TYPE_I32);
+        GGML_ASSERT(src1->ne[0] == ne2);
+        pos = (const int32_t *) src1_dd;
+    }
 
     const bool is_neox = mode & 2;
     const bool is_glm = mode & 4;
 
     // compute
     if (is_glm) {
-
+        GGML_ASSERT(false);
+        rope_glm_f32_cuda(src0_dd, dst_dd, ne00, nrows, pos, freq_scale, ne01, theta_scale, n_ctx, main_stream);
     } else if (is_neox) {
         GGML_ASSERT(ne00 == n_dims && "ne00 != n_dims is not implemented for CUDA yet");
-
+        if (src0->type == GGML_TYPE_F32) {
+            rope_neox_cuda((const float *)src0_dd, (float *)dst_dd, ne00, nrows, pos, freq_scale, ne01, theta_scale, main_stream);
+        } else if (src0->type == GGML_TYPE_F16) {
+            rope_neox_cuda((const half *)src0_dd, (half *)dst_dd, ne00, nrows, pos, freq_scale, ne01, theta_scale, main_stream);
+        } else {
+            GGML_ASSERT(false);
+        }
     } else {
-
+        if (src0->type == GGML_TYPE_F32) {
+            rope_cuda((const float *)src0_dd, (float *)dst_dd, ne00, nrows, pos, freq_scale, ne01, theta_scale, main_stream);
+        } else if (src0->type == GGML_TYPE_F16) {
+            rope_cuda((const half *)src0_dd, (half *)dst_dd, ne00, nrows, pos, freq_scale, ne01, theta_scale, main_stream);
+        } else {
+            GGML_ASSERT(false);
+        }
     }
 
     (void) src1;
@@ -6265,7 +6380,7 @@ static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * s
     }
 }
 
-void ggml_cuda_set_peer_access(const int n_tokens) {
+static void ggml_cuda_set_peer_access(const int n_tokens) {
     static bool peer_access_enabled = false;
 
     const bool enable_peer_access = n_tokens <= GGML_CUDA_PEER_MAX_BATCH_SIZE;
@@ -6593,27 +6708,27 @@ static void ggml_cuda_op_mul_mat(
     }
 }
 
-void ggml_cuda_add(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+static void ggml_cuda_add(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_add);
 }
 
-void ggml_cuda_mul(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+static void ggml_cuda_mul(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_mul);
 }
 
-void ggml_cuda_gelu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+static void ggml_cuda_gelu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_gelu);
 }
 
-void ggml_cuda_silu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+static void ggml_cuda_silu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_silu);
 }
 
-void ggml_cuda_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+static void ggml_cuda_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_norm);
 }
 
-void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+static void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_rms_norm);
 }
 
@@ -6624,17 +6739,13 @@ bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_te
     const int64_t ne1 = dst->ne[1];
 
     // TODO: find the optimal values for these
-
-
-
-
-        return true;
-    }
-
-    return false;
+    return (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) &&
+            src1->type == GGML_TYPE_F32 &&
+             dst->type == GGML_TYPE_F32 &&
+            (ne0 >= 32 && ne1 >= 32 && ne10 >= 32);
 }
 
-void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){
+static void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){
     GGML_ASSERT(ggml_is_permuted(src0) && ggml_is_permuted(src1));
     GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT);
     GGML_ASSERT(src0->nb[0] <= src0->nb[1] && src0->nb[2] <= src0->nb[3]); // 0213 permutation
@@ -6663,7 +6774,7 @@ void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tensor * sr
     ggml_mul_mat_p021_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, ne02, ne12, main_stream);
 }
 
-void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){
+static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){
     GGML_ASSERT(!ggml_is_contiguous(src0) && ggml_is_contiguous(src1));
     GGML_ASSERT(!ggml_is_permuted(src0));
     GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT);
@@ -6697,7 +6808,7 @@ void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1
     ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, ne12, channel_stride_x, main_stream);
 }
 
-void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     bool all_on_device = (src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT) &&
         src1->backend == GGML_BACKEND_GPU && dst->backend == GGML_BACKEND_GPU;
 
@@ -6741,11 +6852,11 @@ void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_
     }
 }
 
-void ggml_cuda_scale(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+static void ggml_cuda_scale(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_scale);
 }
 
-void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+static void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     const int64_t ne = ggml_nelements(src0);
     GGML_ASSERT(ne == ggml_nelements(src1));
 
@@ -6787,35 +6898,37 @@ void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens
         ggml_cpy_f32_f16_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02,
                               ne10, ne11, nb10, nb11, nb12, main_stream);
     } else {
+        fprintf(stderr, "%s: unsupported type combination (%s to %s)\n", __func__,
+                ggml_type_name(src0->type), ggml_type_name(src1->type));
         GGML_ASSERT(false);
     }
 
     (void) dst;
 }
 
-void ggml_cuda_dup(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+static void ggml_cuda_dup(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     ggml_cuda_cpy(src0, dst, nullptr);
     (void) src1;
 }
 
-void ggml_cuda_diag_mask_inf(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+static void ggml_cuda_diag_mask_inf(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_diag_mask_inf);
 }
 
-void ggml_cuda_soft_max(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+static void ggml_cuda_soft_max(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_soft_max);
 }
 
-void ggml_cuda_rope(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+static void ggml_cuda_rope(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     GGML_ASSERT(ggml_is_contiguous(src0)); // TODO: this restriction is temporary until non-cont support is implemented
     ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_rope);
 }
 
-void ggml_cuda_alibi(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+static void ggml_cuda_alibi(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_alibi);
 }
 
-void ggml_cuda_nop(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+static void ggml_cuda_nop(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     (void) src0;
     (void) src1;
     (void) dst;
@@ -6938,11 +7051,13 @@ static struct ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() {
     return extra;
 }
 
-void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bool force_inplace, bool no_alloc) {
+static void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bool force_inplace, bool no_alloc) {
     if (scratch && g_scratch_size == 0) {
         return;
     }
 
+    tensor->backend = GGML_BACKEND_GPU;
+
     // recursively assign CUDA buffers until a compute tensor is found
     if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_CPU) {
         const ggml_op src0_op = tensor->src[0]->op;
@@ -6954,8 +7069,6 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo
         ggml_cuda_assign_buffers_impl(tensor->src[1], scratch, force_inplace, no_alloc);
     }
 
-    tensor->backend = GGML_BACKEND_GPU;
-
     if (scratch && no_alloc) {
         return;
     }
@@ -7040,6 +7153,15 @@ void ggml_cuda_assign_scratch_offset(struct ggml_tensor * tensor, size_t offset)
     tensor->extra = extra;
 }
 
+void ggml_cuda_copy_to_device(struct ggml_tensor * tensor) {
+    GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
+    GGML_ASSERT(ggml_is_contiguous(tensor));
+
+    struct ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
+    CUDA_CHECK(ggml_cuda_set_device(g_main_device));
+    CUDA_CHECK(cudaMemcpy(extra->data_device[g_main_device], tensor->data, ggml_nbytes(tensor), cudaMemcpyHostToDevice));
+}
+
 void ggml_cuda_assign_buffers(struct ggml_tensor * tensor) {
     ggml_cuda_assign_buffers_impl(tensor, true, false, false);
 }
@@ -7075,7 +7197,12 @@ void ggml_cuda_set_mul_mat_q(const bool mul_mat_q) {
 }
 
 void ggml_cuda_set_scratch_size(const size_t scratch_size) {
-
+    // this is a hack to not completely break llama.cpp when using multiple models or contexts simultaneously
+    // it still won't always work as expected, but it's better than nothing
+    if (scratch_size > g_scratch_size) {
+        ggml_cuda_free_scratch();
+    }
+    g_scratch_size = std::max(g_scratch_size, scratch_size);
 }
 
 void ggml_cuda_free_scratch() {
--- a/data/ext/llama_cpp/src/ggml-cuda.h
+++ b/data/ext/llama_cpp/src/ggml-cuda.h
@@ -31,6 +31,7 @@ GGML_API void ggml_cuda_assign_buffers_force_inplace(struct ggml_tensor * tens
 
 GGML_API void ggml_cuda_assign_buffers_no_alloc(struct ggml_tensor * tensor);
 GGML_API void ggml_cuda_assign_scratch_offset(struct ggml_tensor * tensor, size_t offset);
+GGML_API void ggml_cuda_copy_to_device(struct ggml_tensor * tensor);
 
 GGML_API void ggml_cuda_set_main_device(int main_device);
 GGML_API void ggml_cuda_set_mul_mat_q(bool mul_mat_q);
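ggml_cuda_copy_to_device becomes part of the public CUDA interface: it uploads the host data of a tensor that is already marked GPU-resident. The call sequence below is a hypothetical illustration only (the ordering is an assumption, not taken from llama.cpp; the other two calls are existing declarations in this header):

```cpp
// Hypothetical sketch of the new ggml_cuda_copy_to_device().
#include "ggml.h"
#include "ggml-cuda.h"

static void upload_to_gpu(struct ggml_tensor * t, size_t scratch_offset) {
    ggml_cuda_assign_buffers_no_alloc(t);                // mark as GPU-backed, no copy yet
    ggml_cuda_assign_scratch_offset(t, scratch_offset);  // give it a slot in the scratch buffer
    ggml_cuda_copy_to_device(t);                         // new in 0.6.0: host -> device memcpy
}
```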
--- a/data/ext/llama_cpp/src/ggml-metal.h
+++ b/data/ext/llama_cpp/src/ggml-metal.h
@@ -19,6 +19,8 @@
 
 #pragma once
 
+#include "ggml.h"
+
 #include <stddef.h>
 #include <stdbool.h>
 
@@ -33,6 +35,8 @@ struct ggml_cgraph;
 extern "C" {
 #endif
 
+void ggml_metal_log_set_callback(ggml_log_callback log_callback, void * user_data);
+
 struct ggml_metal_context;
 
 // number of command buffers to use
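ggml-metal.h now pulls in ggml.h and exposes ggml_metal_log_set_callback, so Metal backend diagnostics can be routed through user code. A hedged sketch, assuming the ggml_log_callback signature (enum ggml_log_level, const char *, void *) that ggml.h gains in this release:

```cpp
// Sketch only: routing Metal backend log output through a custom callback.
#include <cstdio>
#include "ggml.h"
#include "ggml-metal.h"

static void metal_logger(enum ggml_log_level level, const char * text, void * user_data) {
    (void) user_data;
    // forward everything to stderr, prefixed with the numeric log level
    std::fprintf(stderr, "[metal/%d] %s", (int) level, text);
}

int main() {
    ggml_metal_log_set_callback(metal_logger, /*user_data =*/ nullptr);
    // ... initialize the Metal context as usual; its messages now go through metal_logger
    return 0;
}
```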
|