llama_cpp 0.9.2 → 0.9.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +7 -1
- data/ext/llama_cpp/llama_cpp.cpp +12 -0
- data/ext/llama_cpp/src/ggml-alloc.c +378 -208
- data/ext/llama_cpp/src/ggml-alloc.h +68 -16
- data/ext/llama_cpp/src/ggml-backend-impl.h +87 -0
- data/ext/llama_cpp/src/ggml-backend.c +578 -13
- data/ext/llama_cpp/src/ggml-backend.h +70 -77
- data/ext/llama_cpp/src/ggml-cuda.cu +194 -8
- data/ext/llama_cpp/src/ggml-impl.h +13 -7
- data/ext/llama_cpp/src/ggml-metal.h +1 -1
- data/ext/llama_cpp/src/ggml-metal.m +113 -32
- data/ext/llama_cpp/src/ggml-metal.metal +107 -1
- data/ext/llama_cpp/src/ggml-quants.c +173 -73
- data/ext/llama_cpp/src/ggml.c +826 -1482
- data/ext/llama_cpp/src/ggml.h +63 -45
- data/ext/llama_cpp/src/llama.cpp +364 -38
- data/ext/llama_cpp/src/llama.h +6 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +2 -0
- metadata +3 -2
data/ext/llama_cpp/src/ggml-backend.h

@@ -1,51 +1,20 @@
 #pragma once
 
 #include "ggml.h"
+#include "ggml-alloc.h"
 
 #ifdef __cplusplus
 extern "C" {
 #endif
-    struct ggml_backend;
-    struct ggml_backend_buffer;
-
-    // type-erased backend-specific types / wrappers
-    typedef void * ggml_backend_context_t;
-    typedef void * ggml_backend_graph_plan_t;
-    typedef void * ggml_backend_buffer_context_t;
-
-    // avoid accessing internals of these types
-    typedef struct ggml_backend * ggml_backend_t;
-    typedef struct ggml_backend_buffer * ggml_backend_buffer_t;
 
     //
-    // backend buffer
+    // Backend buffer
     //
 
-    struct ggml_backend_buffer_i {
-        void   (*free_buffer)   (ggml_backend_buffer_t buffer);
-        void * (*get_base)      (ggml_backend_buffer_t buffer); // get base pointer
-        size_t (*get_alloc_size)(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); // pre-allocation callback
-        void   (*init_tensor)   (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); // post-allocation callback
-        void   (*free_tensor)   (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); // pre-free callback
-    };
-
-    // TODO: hide behind API
-    struct ggml_backend_buffer {
-        struct ggml_backend_buffer_i iface;
-
-        ggml_backend_t                backend;
-        ggml_backend_buffer_context_t context;
-
-        size_t size;
-    };
+    struct ggml_backend_buffer;
+    typedef struct ggml_backend_buffer * ggml_backend_buffer_t;
 
     // backend buffer functions
-    GGML_API ggml_backend_buffer_t ggml_backend_buffer_init(
-            struct ggml_backend            * backend,
-            struct ggml_backend_buffer_i     iface,
-            ggml_backend_buffer_context_t    context,
-            size_t                           size);
-
     GGML_API void   ggml_backend_buffer_free          (ggml_backend_buffer_t buffer);
     GGML_API size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer);
     GGML_API void * ggml_backend_buffer_get_base      (ggml_backend_buffer_t buffer);
@@ -55,50 +24,13 @@ extern "C" {
     GGML_API void   ggml_backend_buffer_free_tensor   (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
 
     //
-    // backend
+    // Backend
     //
 
-    struct ggml_backend_i {
-        const char * (*get_name)(ggml_backend_t backend);
-
-        void (*free)(ggml_backend_t backend);
-
-        // buffer allocation
-        ggml_backend_buffer_t (*alloc_buffer)(ggml_backend_t backend, size_t size);
-
-        // get buffer alignment
-        size_t (*get_alignment)(ggml_backend_t backend);
-
-        // tensor data access
-        // these functions can be asynchronous, helper functions are provided for synchronous access that automatically call synchronize
-        void (*set_tensor_async)(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
-        void (*get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
-        void (*synchronize)     (ggml_backend_t backend);
-
-        // (optional) copy tensor between different backends, allow for single-copy tranfers
-        void (*cpy_tensor_from)(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst);
-        void (*cpy_tensor_to)  (ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst);
-
-        // compute graph with a plan
-        ggml_backend_graph_plan_t (*graph_plan_create) (ggml_backend_t backend, struct ggml_cgraph * cgraph);
-        void (*graph_plan_free)   (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
-        void (*graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
-
-        // compute graph without a plan
-        void (*graph_compute)(ggml_backend_t backend, struct ggml_cgraph * cgraph);
-
-        // check if the backend supports an operation
-        bool (*supports_op)(ggml_backend_t backend, const struct ggml_tensor * op);
-    };
-
-    // TODO: hide behind API
-    struct ggml_backend {
-        struct ggml_backend_i iface;
-
-        ggml_backend_context_t context;
-    };
+    struct ggml_backend;
+    typedef struct ggml_backend * ggml_backend_t;
+    typedef void * ggml_backend_graph_plan_t;
 
-    // backend helper functions
     GGML_API ggml_backend_t ggml_get_backend(const struct ggml_tensor * tensor);
 
     GGML_API const char * ggml_backend_name(ggml_backend_t backend);
@@ -133,11 +65,72 @@ extern "C" {
     GGML_API ggml_backend_t ggml_backend_cpu_init(void);
 
     GGML_API bool ggml_backend_is_cpu(ggml_backend_t backend);
-
     GGML_API void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads);
 
+    // Create a backend buffer from an existing pointer
     GGML_API ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(ggml_backend_t backend_cpu, void * ptr, size_t size);
 
+
+    //
+    // Backend scheduler
+    //
+
+    // The backend scheduler allows for multiple backends to be used together
+    // Handles compute buffer allocation, assignment of tensors to backends, and copying of tensors between backends
+    // The backends are selected based on:
+    // - the backend that supports the operation
+    // - the location of the pre-allocated tensors (e.g. the weights)
+    /*
+      Example usage:
+
+        sched = ggml_backend_sched_new({backend_gpu, backend_gpu2, backend_cpu}, num_backends);
+        // sched is initialized with measure allocators and cannot be used until allocated with a measure graph
+
+        // initialize buffers from a measure graph
+        measure_graph = build_graph(sched); // use the allocr to allocate inputs as needed
+
+        // in build_graph:
+        build_graph(...) {
+            // allocating tensors in a specific backend (optional, recommended: pre-allocate inputs in a different buffer)
+            alloc_cpu = ggml_backend_sched_get_allocr(sched, backend_cpu);
+            ggml_allocr_alloc(alloc_cpu, tensor);
+
+            // manually assigning nodes to a backend (optional, shouldn't be needed in most cases)
+            struct ggml_tensor * node = ggml_mul_mat(ctx, ...);
+            ggml_backend_sched_set_node_backend(sched, node, backend_gpu);
+        }
+
+        // allocate backend buffers from measure graph
+        ggml_backend_sched_init_measure(sched, measure_graph);
+
+        // the scheduler is now ready to compute graphs
+
+        // compute
+        graph = build_graph(sched);
+        ggml_backend_sched_graph_compute(sched, graph);
+    */
+
+    struct ggml_backend_sched;
+    typedef struct ggml_backend_sched * ggml_backend_sched_t;
+
+    // Initialize a backend scheduler
+    GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, int n_backends);
+
+    GGML_API void ggml_backend_sched_free(ggml_backend_sched_t sched);
+
+    // Initialize backend buffers from a measure graph
+    GGML_API void ggml_backend_sched_init_measure(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph);
+
+    GGML_API ggml_tallocr_t        ggml_backend_sched_get_tallocr(ggml_backend_sched_t sched, ggml_backend_t backend);
+    GGML_API ggml_backend_buffer_t ggml_backend_sched_get_buffer (ggml_backend_sched_t sched, ggml_backend_t backend);
+
+    GGML_API void ggml_backend_sched_set_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend);
+
+    // Allocate a graph on the backend scheduler
+    GGML_API void ggml_backend_sched_graph_compute(
+            ggml_backend_sched_t sched,
+            struct ggml_cgraph * graph);
+
 #ifdef __cplusplus
 }
 #endif
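The scheduler declarations added above replace the old concrete `ggml_backend`/`ggml_backend_buffer` structs, which move into the new `data/ext/llama_cpp/src/ggml-backend-impl.h`. As a rough, hypothetical sketch (not part of this diff) of how a caller might drive the new API, following the example-usage comment embedded in the header; `build_graph` is a user-supplied callback, and `ggml_backend_free` is assumed to come from an unchanged part of the header:

```c
#include "ggml.h"
#include "ggml-backend.h"

// Sketch only: run one graph on a CPU-only scheduler.
// build_graph() is assumed to construct the same ggml_cgraph each time it is called.
static void compute_with_sched(struct ggml_cgraph * (*build_graph)(ggml_backend_sched_t sched)) {
    ggml_backend_t backends[1] = { ggml_backend_cpu_init() };
    ggml_backend_sched_t sched = ggml_backend_sched_new(backends, 1);

    // measure pass: the graph is only used to size the compute buffers
    struct ggml_cgraph * measure_graph = build_graph(sched);
    ggml_backend_sched_init_measure(sched, measure_graph);

    // real pass: rebuild the graph and let the scheduler assign and compute it
    struct ggml_cgraph * graph = build_graph(sched);
    ggml_backend_sched_graph_compute(sched, graph);

    ggml_backend_sched_free(sched);
    ggml_backend_free(backends[0]);
}
```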
data/ext/llama_cpp/src/ggml-cuda.cu

@@ -81,12 +81,15 @@
 
 #include "ggml-cuda.h"
 #include "ggml.h"
+#include "ggml-backend-impl.h"
 
 #define MIN_CC_DP4A   610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
 #define CC_VOLTA      700
 #define CC_OFFSET_AMD 1000000
 #define CC_RDNA2      (CC_OFFSET_AMD + 1030)
 
+#define GGML_CUDA_MAX_NODES 8192
+
 // define this if you want to always fallback to MMQ kernels and not use cuBLAS for matrix multiplication
 // on modern hardware, using cuBLAS is recommended as it utilizes F16 tensor cores which are very performant
 // for large computational tasks. the drawback is that this requires some extra amount of VRAM:
@@ -433,6 +436,8 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_
 #define CUDA_MUL_BLOCK_SIZE 256
 #define CUDA_GELU_BLOCK_SIZE 256
 #define CUDA_SILU_BLOCK_SIZE 256
+#define CUDA_RELU_BLOCK_SIZE 256
+#define CUDA_SQR_BLOCK_SIZE 256
 #define CUDA_CPY_BLOCK_SIZE 32
 #define CUDA_SCALE_BLOCK_SIZE 256
 #define CUDA_CLAMP_BLOCK_SIZE 256
@@ -553,6 +558,24 @@ static __global__ void silu_f32(const float * x, float * dst, const int k) {
     dst[i] = x[i] / (1.0f + expf(-x[i]));
 }
 
+static __global__ void relu_f32(const float * x, float * dst, const int k) {
+    const int i = blockDim.x*blockIdx.x + threadIdx.x;
+
+    if (i >= k) {
+        return;
+    }
+    dst[i] = fmaxf(x[i], 0);
+}
+
+static __global__ void sqr_f32(const float * x, float * dst, const int k) {
+    const int i = blockDim.x*blockIdx.x + threadIdx.x;
+
+    if (i >= k) {
+        return;
+    }
+    dst[i] = x[i] * x[i];
+}
+
 static __device__ __forceinline__ float2 warp_reduce_sum(float2 a) {
 #pragma unroll
     for (int mask = 16; mask > 0; mask >>= 1) {
@@ -4468,6 +4491,13 @@ static __device__ void cpy_1_f32_f16(const char * cxi, char * cdsti) {
     *dsti = __float2half(*xi);
 }
 
+static __device__ void cpy_1_f16_f16(const char * cxi, char * cdsti) {
+    const half * xi = (const half *) cxi;
+    half * dsti = (half *) cdsti;
+
+    *dsti = *xi;
+}
+
 template <cpy_kernel_t cpy_1>
 static __global__ void cpy_f32_f16(const char * cx, char * cdst, const int ne,
                                    const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
@@ -4721,6 +4751,25 @@ static __global__ void clamp_f32(const float * x, float * dst, const float min,
     dst[i] = x[i] < min ? min : (x[i] > max ? max : x[i]);
 }
 
+static __global__ void im2col_f32_f16(
+        const float * x, half * dst,
+        int ofs0, int ofs1, int IW, int IH, int CHW,
+        int s0, int s1, int p0, int p1, int d0, int d1) {
+    const int iiw = blockIdx.z * s0 + threadIdx.z * d0 - p0;
+    const int iih = blockIdx.y * s1 + threadIdx.y * d1 - p1;
+
+    const int offset_dst =
+        (threadIdx.x * gridDim.y * gridDim.z + blockIdx.y * gridDim.z + blockIdx.z) * CHW +
+        (blockIdx.x * (blockDim.y * blockDim.z) + threadIdx.y * blockDim.z + threadIdx.z);
+
+    if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) {
+        dst[offset_dst] = __float2half(0.0f);
+    } else {
+        const int offset_src = threadIdx.x * ofs0 + blockIdx.x * ofs1;
+        dst[offset_dst] = __float2half(x[offset_src + iih * IW + iiw]);
+    }
+}
+
 template<int qk, int qr, dequantize_kernel_t dq>
 static void get_rows_cuda(const void * x, const int32_t * y, float * dst, const int nrows, const int ncols, cudaStream_t stream) {
     const dim3 block_dims(CUDA_GET_ROWS_BLOCK_SIZE, 1, 1);
@@ -4759,6 +4808,16 @@ static void silu_f32_cuda(const float * x, float * dst, const int k, cudaStream_
     silu_f32<<<num_blocks, CUDA_SILU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
 }
 
+static void relu_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
+    const int num_blocks = (k + CUDA_RELU_BLOCK_SIZE - 1) / CUDA_RELU_BLOCK_SIZE;
+    relu_f32<<<num_blocks, CUDA_RELU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
+}
+
+static void sqr_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
+    const int num_blocks = (k + CUDA_SQR_BLOCK_SIZE - 1) / CUDA_SQR_BLOCK_SIZE;
+    sqr_f32<<<num_blocks, CUDA_SQR_BLOCK_SIZE, 0, stream>>>(x, dst, k);
+}
+
 static void norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % WARP_SIZE == 0);
     if (ncols < 1024) {
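The two launchers above use the usual round-up division so the grid covers all `k` elements, while the kernels' `if (i >= k) return;` guard discards the overhang. A minimal standalone sketch of that arithmetic (illustrative only, not code from this diff):

```c
#include <stdio.h>

// Round-up division: how many blocks of block_size are needed to cover k elements.
static int num_blocks_for(int k, int block_size) {
    return (k + block_size - 1) / block_size;
}

int main(void) {
    const int block_size = 256; // matches CUDA_RELU_BLOCK_SIZE / CUDA_SQR_BLOCK_SIZE added above
    printf("%d\n", num_blocks_for(1,   block_size)); // 1
    printf("%d\n", num_blocks_for(256, block_size)); // 1
    printf("%d\n", num_blocks_for(257, block_size)); // 2 (the last block is mostly idle)
    return 0;
}
```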
@@ -5611,6 +5670,16 @@ static void ggml_cpy_f32_f16_cuda(
         (cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12);
 }
 
+static void ggml_cpy_f16_f16_cuda(
+    const char * cx, char * cdst, const int ne,
+    const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
+    const int ne10, const int ne11, const int nb10, const int nb11, const int nb12, cudaStream_t stream) {
+
+    const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
+    cpy_f32_f16<cpy_1_f16_f16><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
+        (cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12);
+}
+
 static void scale_f32_cuda(const float * x, float * dst, const float scale, const int k, cudaStream_t stream) {
     const int num_blocks = (k + CUDA_SCALE_BLOCK_SIZE - 1) / CUDA_SCALE_BLOCK_SIZE;
     scale_f32<<<num_blocks, CUDA_SCALE_BLOCK_SIZE, 0, stream>>>(x, dst, scale, k);
@@ -5694,6 +5763,15 @@ static void soft_max_f32_cuda(const float * x, float * dst, const int ncols_x, c
     soft_max_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols_x);
 }
 
+static void im2col_f32_f16_cuda(const float * x, half * dst,
+    int OH, int IW, int IH, int OW, int IC,
+    int KH, int KW, int N, int ofs0, int ofs1,
+    int s0, int s1, int p0, int p1, int d0, int d1, cudaStream_t stream) {
+    dim3 block_nums(IC, OH, OW);
+    dim3 block_dims(N, KH, KW);
+    im2col_f32_f16<<<block_nums, block_dims, 0, stream>>>(x, dst, ofs0, ofs1, IW, IH, (IC * KH * KW), s0, s1, p0, p1, d0, d1);
+}
+
 // buffer pool for cuda
 #define MAX_CUDA_BUFFERS 256
 
@@ -5762,7 +5840,7 @@ static void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) {
         return ptr;
     }
 #ifdef DEBUG_CUDA_MALLOC
-    fprintf(stderr, "%s: %d buffers, max_size = %u MB, tot_size = %u MB, requested %u MB\n", __func__, nnz,
+    fprintf(stderr, "%s: %d buffers, max_size = %u MiB, tot_size = %u MiB, requested %u MiB\n", __func__, nnz,
             (uint32_t)(max_size/1024/1024), (uint32_t)(tot_size/1024/1024), (uint32_t)(size/1024/1024));
 #endif
     void * ptr;
@@ -5900,7 +5978,7 @@ void * ggml_cuda_host_malloc(size_t size) {
         // The allocation error can be bypassed. A null ptr will assigned out of this function.
         // This can fixed the OOM error in WSL.
         cudaGetLastError();
-        fprintf(stderr, "WARNING: failed to allocate %.2f MB of pinned memory: %s\n",
+        fprintf(stderr, "WARNING: failed to allocate %.2f MiB of pinned memory: %s\n",
             size/1024.0/1024.0, cudaGetErrorString(err));
         return nullptr;
     }
@@ -6128,6 +6206,34 @@ inline void ggml_cuda_op_silu(
     (void) src1_dd;
 }
 
+inline void ggml_cuda_op_relu(
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+    const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    relu_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
+
+    (void) src1;
+    (void) dst;
+    (void) src1_dd;
+}
+
+inline void ggml_cuda_op_sqr(
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+    const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    sqr_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
+
+    (void) src1;
+    (void) dst;
+    (void) src1_dd;
+}
+
 inline void ggml_cuda_op_norm(
     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
     const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
@@ -6250,6 +6356,7 @@ static int64_t get_row_rounding(ggml_type type) {
         case GGML_TYPE_Q8_0:
             return max_compute_capability >= CC_RDNA2 ? 128 : 64;
         case GGML_TYPE_F16:
+        case GGML_TYPE_F32:
             return 1;
         case GGML_TYPE_Q2_K:
             return max_compute_capability >= CC_RDNA2 ? 128 : 32;
@@ -6272,6 +6379,7 @@ static int64_t get_row_rounding(ggml_type type) {
         case GGML_TYPE_Q8_0:
             return 64;
         case GGML_TYPE_F16:
+        case GGML_TYPE_F32:
             return 1;
         case GGML_TYPE_Q2_K:
         case GGML_TYPE_Q3_K:
@@ -6463,8 +6571,7 @@ inline void ggml_cuda_op_mul_mat_cublas(
         src1_as_f16 = (half *) ggml_cuda_pool_malloc(ne * sizeof(half), &src1_as);
         to_fp16_cuda(src1_ddf_i, src1_as_f16, ne, stream);
     }
-    const half * src1_ptr = src1->type == GGML_TYPE_F16 ? (const half *) src1_ddq_i : src1_as_f16;
-
+    const half * src1_ptr = src1->type == GGML_TYPE_F16 ? (const half *) src1_ddf_i : src1_as_f16;
     size_t dst_as = 0;
     half * dst_f16 = (half *) ggml_cuda_pool_malloc(row_diff*src1_ncols * sizeof(half), &dst_as);
 
@@ -6639,6 +6746,45 @@ inline void ggml_cuda_op_alibi(
     (void) src1_dd;
 }
 
+inline void ggml_cuda_op_im2col(
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+    const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F16);
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F16);
+
+    const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
+    const int32_t s1 = ((const int32_t*)(dst->op_params))[1];
+    const int32_t p0 = ((const int32_t*)(dst->op_params))[2];
+    const int32_t p1 = ((const int32_t*)(dst->op_params))[3];
+    const int32_t d0 = ((const int32_t*)(dst->op_params))[4];
+    const int32_t d1 = ((const int32_t*)(dst->op_params))[5];
+
+    const bool is_2D = ((const int32_t*)(dst->op_params))[6] == 1;
+
+    const int64_t N  = src1->ne[is_2D ? 3 : 2];
+    const int64_t IC = src1->ne[is_2D ? 2 : 1];
+    const int64_t IH = is_2D ? src1->ne[1] : 1;
+    const int64_t IW =         src1->ne[0];
+
+    const int64_t KH = is_2D ? src0->ne[1] : 1;
+    const int64_t KW =         src0->ne[0];
+
+    const int64_t OH = is_2D ? dst->ne[2] : 1;
+    const int64_t OW =         dst->ne[1];
+
+    const size_t ofs0 = src1->nb[is_2D ? 3 : 2] / 4; // nb is byte offset, src is type float32
+    const size_t ofs1 = src1->nb[is_2D ? 2 : 1] / 4; // nb is byte offset, src is type float32
+
+    im2col_f32_f16_cuda(src1_dd, (half*) dst_dd,
+        OH, IW, IH, OW, IC, KH, KW, N,
+        ofs0, ofs1, s0, s1, p0, p1, d0, d1, main_stream);
+
+    (void) src0;
+    (void) src0_dd;
+}
+
 inline void ggml_cuda_op_diag_mask_inf(
     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
     const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
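For context, `ggml_cuda_op_im2col` reads OH/OW straight from `dst->ne`; those extents are conventionally derived with the standard convolution output-size rule from the input size, kernel size, stride, padding, and dilation. A small sketch of that rule (an assumption for illustration, not code from this diff):

```c
#include <stdio.h>

// Standard convolution output size for one spatial dimension:
// input size `in`, kernel size `k`, stride `s`, padding `p`, dilation `d`.
static int conv_out_size(int in, int k, int s, int p, int d) {
    return (in + 2*p - d*(k - 1) - 1) / s + 1;
}

int main(void) {
    // e.g. a 64x48 input with a 3x3 kernel, stride 1, padding 1, dilation 1 keeps its size
    printf("OH = %d, OW = %d\n", conv_out_size(48, 3, 1, 1, 1), conv_out_size(64, 3, 1, 1, 1));
    return 0;
}
```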
@@ -7160,6 +7306,14 @@ static void ggml_cuda_silu(const ggml_tensor * src0, const ggml_tensor * src1, g
     ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_silu);
 }
 
+static void ggml_cuda_relu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_relu);
+}
+
+static void ggml_cuda_sqr(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_sqr);
+}
+
 static void ggml_cuda_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_norm);
 }
@@ -7543,6 +7697,9 @@ static void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, gg
     } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16) {
         ggml_cpy_f32_f16_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02,
                               ne10, ne11, nb10, nb11, nb12, main_stream);
+    } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16) {
+        ggml_cpy_f16_f16_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02,
+                              ne10, ne11, nb10, nb11, nb12, main_stream);
     } else {
         fprintf(stderr, "%s: unsupported type combination (%s to %s)\n", __func__,
                 ggml_type_name(src0->type), ggml_type_name(src1->type));
@@ -7574,6 +7731,10 @@ static void ggml_cuda_alibi(const ggml_tensor * src0, const ggml_tensor * src1,
     ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_alibi);
 }
 
+static void ggml_cuda_im2col(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_im2col);
+}
+
 static void ggml_cuda_nop(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     (void) src0;
     (void) src1;
@@ -7685,11 +7846,11 @@ static size_t g_temp_tensor_extra_index = 0;
 
 static ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() {
     if (g_temp_tensor_extras == nullptr) {
-        g_temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_MAX_NODES];
+        g_temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_CUDA_MAX_NODES];
     }
 
     size_t alloc_index = g_temp_tensor_extra_index;
-    g_temp_tensor_extra_index = (g_temp_tensor_extra_index + 1) % GGML_MAX_NODES;
+    g_temp_tensor_extra_index = (g_temp_tensor_extra_index + 1) % GGML_CUDA_MAX_NODES;
     ggml_tensor_extra_gpu * extra = &g_temp_tensor_extras[alloc_index];
     memset(extra, 0, sizeof(*extra));
 
@@ -7867,6 +8028,15 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
         return false;
     }
 
+    if (tensor->op == GGML_OP_MUL_MAT) {
+        if (tensor->src[0]->ne[3] != tensor->src[1]->ne[3]) {
+#ifndef NDEBUG
+            fprintf(stderr, "%s: cannot compute %s: src0->ne[3] = %d, src1->ne[3] = %d - fallback to CPU\n", __func__, tensor->name, tensor->src[0]->ne[3], tensor->src[1]->ne[3]);
+#endif
+            return false;
+        }
+    }
+
     switch (tensor->op) {
         case GGML_OP_REPEAT:
             func = ggml_cuda_repeat;
@@ -7891,6 +8061,9 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
                 case GGML_UNARY_OP_SILU:
                     func = ggml_cuda_silu;
                     break;
+                case GGML_UNARY_OP_RELU:
+                    func = ggml_cuda_relu;
+                    break;
                 default:
                     return false;
             } break;
@@ -7909,6 +8082,9 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
         case GGML_OP_SCALE:
             func = ggml_cuda_scale;
             break;
+        case GGML_OP_SQR:
+            func = ggml_cuda_sqr;
+            break;
         case GGML_OP_CLAMP:
             if (!any_on_device) {
                 return false;
@@ -7939,6 +8115,9 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
         case GGML_OP_ALIBI:
             func = ggml_cuda_alibi;
             break;
+        case GGML_OP_IM2COL:
+            func = ggml_cuda_im2col;
+            break;
         default:
             return false;
     }
@@ -7998,11 +8177,11 @@ struct ggml_backend_buffer_context_cuda {
 
     ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() {
         if (temp_tensor_extras == nullptr) {
-            temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_MAX_NODES];
+            temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_CUDA_MAX_NODES];
         }
 
         size_t alloc_index = temp_tensor_extra_index;
-        temp_tensor_extra_index = (temp_tensor_extra_index + 1) % GGML_MAX_NODES;
+        temp_tensor_extra_index = (temp_tensor_extra_index + 1) % GGML_CUDA_MAX_NODES;
         ggml_tensor_extra_gpu * extra = &temp_tensor_extras[alloc_index];
         memset(extra, 0, sizeof(*extra));
 
@@ -8088,7 +8267,12 @@ static ggml_backend_buffer_t ggml_backend_cuda_alloc_buffer(ggml_backend_t backe
     ggml_cuda_set_device(g_main_device);
 
     ggml_backend_buffer_context_cuda * ctx = new ggml_backend_buffer_context_cuda;
+
+    size = std::max(size, (size_t)1); // cudaMalloc returns null for size 0
+
+    ggml_cuda_set_device(g_main_device);
     CUDA_CHECK(cudaMalloc(&ctx->device, size));
+
     return ggml_backend_buffer_init(backend, cuda_backend_buffer_interface, ctx, size);
 }
 
@@ -8155,6 +8339,8 @@ static void ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph
     for (int i = 0; i < cgraph->n_nodes; i++) {
         ggml_tensor * node = cgraph->nodes[i];
 
+        if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE)
+            continue;
         assert(node->backend == GGML_BACKEND_GPU);
         for (int j = 0; j < GGML_MAX_SRC; j++) {
             if (node->src[j] != nullptr) {
data/ext/llama_cpp/src/ggml-impl.h

@@ -39,12 +39,6 @@ extern "C" {
 #endif
 #endif
 
-#undef MIN
-#undef MAX
-
-#define MIN(a, b) ((a) < (b) ? (a) : (b))
-#define MAX(a, b) ((a) > (b) ? (a) : (b))
-
 // 16-bit float
 // on Arm, we use __fp16
 // on x86, we use uint16_t
@@ -230,7 +224,19 @@ inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
 
 #endif
 
-
+#define GGML_HASHTABLE_FULL ((size_t)-1)
+#define GGML_HASHTABLE_ALREADY_EXISTS ((size_t)-2)
+
+bool   ggml_hash_contains      (const struct ggml_hash_set hash_set, struct ggml_tensor * key);
+
+// returns GGML_HASHTABLE_FULL if table is full, otherwise the current index of the key or where it should be inserted
+size_t ggml_hash_find          (const struct ggml_hash_set hash_set, struct ggml_tensor * key);
+
+// returns GGML_HAHSHTABLE_ALREADY_EXISTS if key already exists, index otherwise, asserts if table is full
+size_t ggml_hash_insert        (      struct ggml_hash_set hash_set, struct ggml_tensor * key);
+
+// return index, asserts if table is full
+size_t ggml_hash_find_or_insert(      struct ggml_hash_set hash_set, struct ggml_tensor * key);
 
 #ifdef __cplusplus
 }