llama_cpp 0.9.2 → 0.9.3
- checksums.yaml +4 -4
- data/CHANGELOG.md +7 -1
- data/ext/llama_cpp/llama_cpp.cpp +12 -0
- data/ext/llama_cpp/src/ggml-alloc.c +378 -208
- data/ext/llama_cpp/src/ggml-alloc.h +68 -16
- data/ext/llama_cpp/src/ggml-backend-impl.h +87 -0
- data/ext/llama_cpp/src/ggml-backend.c +578 -13
- data/ext/llama_cpp/src/ggml-backend.h +70 -77
- data/ext/llama_cpp/src/ggml-cuda.cu +194 -8
- data/ext/llama_cpp/src/ggml-impl.h +13 -7
- data/ext/llama_cpp/src/ggml-metal.h +1 -1
- data/ext/llama_cpp/src/ggml-metal.m +113 -32
- data/ext/llama_cpp/src/ggml-metal.metal +107 -1
- data/ext/llama_cpp/src/ggml-quants.c +173 -73
- data/ext/llama_cpp/src/ggml.c +826 -1482
- data/ext/llama_cpp/src/ggml.h +63 -45
- data/ext/llama_cpp/src/llama.cpp +364 -38
- data/ext/llama_cpp/src/llama.h +6 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +2 -0
- metadata +3 -2
--- a/data/ext/llama_cpp/src/ggml-backend.h
+++ b/data/ext/llama_cpp/src/ggml-backend.h
@@ -1,51 +1,20 @@
 #pragma once

 #include "ggml.h"
+#include "ggml-alloc.h"

 #ifdef __cplusplus
 extern "C" {
 #endif
-    struct ggml_backend;
-    struct ggml_backend_buffer;
-
-    // type-erased backend-specific types / wrappers
-    typedef void * ggml_backend_context_t;
-    typedef void * ggml_backend_graph_plan_t;
-    typedef void * ggml_backend_buffer_context_t;
-
-    // avoid accessing internals of these types
-    typedef struct ggml_backend * ggml_backend_t;
-    typedef struct ggml_backend_buffer * ggml_backend_buffer_t;

     //
-    //
+    // Backend buffer
     //

-    struct ggml_backend_buffer_i {
-
-        void * (*get_base)      (ggml_backend_buffer_t buffer); // get base pointer
-        size_t (*get_alloc_size)(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); // pre-allocation callback
-        void   (*init_tensor)   (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); // post-allocation callback
-        void   (*free_tensor)   (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); // pre-free callback
-    };
-
-    // TODO: hide behind API
-    struct ggml_backend_buffer {
-        struct ggml_backend_buffer_i iface;
-
-        ggml_backend_t backend;
-        ggml_backend_buffer_context_t context;
-
-        size_t size;
-    };
+    struct ggml_backend_buffer;
+    typedef struct ggml_backend_buffer * ggml_backend_buffer_t;

     // backend buffer functions
-    GGML_API ggml_backend_buffer_t ggml_backend_buffer_init(
-            struct ggml_backend           * backend,
-            struct ggml_backend_buffer_i    iface,
-            ggml_backend_buffer_context_t   context,
-            size_t                          size);
-
     GGML_API void   ggml_backend_buffer_free          (ggml_backend_buffer_t buffer);
     GGML_API size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer);
     GGML_API void * ggml_backend_buffer_get_base      (ggml_backend_buffer_t buffer);
@@ -55,50 +24,13 @@ extern "C" {
     GGML_API void   ggml_backend_buffer_free_tensor   (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);

     //
-    //
+    // Backend
     //

-    struct ggml_backend_i {
-
-
-        void (*free)(ggml_backend_t backend);
-
-        // buffer allocation
-        ggml_backend_buffer_t (*alloc_buffer)(ggml_backend_t backend, size_t size);
-
-        // get buffer alignment
-        size_t (*get_alignment)(ggml_backend_t backend);
-
-        // tensor data access
-        // these functions can be asynchronous, helper functions are provided for synchronous access that automatically call synchronize
-        void (*set_tensor_async)(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
-        void (*get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
-        void (*synchronize)     (ggml_backend_t backend);
-
-        // (optional) copy tensor between different backends, allow for single-copy tranfers
-        void (*cpy_tensor_from)(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst);
-        void (*cpy_tensor_to)  (ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst);
-
-        // compute graph with a plan
-        ggml_backend_graph_plan_t (*graph_plan_create) (ggml_backend_t backend, struct ggml_cgraph * cgraph);
-        void                      (*graph_plan_free)   (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
-        void                      (*graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
-
-        // compute graph without a plan
-        void (*graph_compute)(ggml_backend_t backend, struct ggml_cgraph * cgraph);
-
-        // check if the backend supports an operation
-        bool (*supports_op)(ggml_backend_t backend, const struct ggml_tensor * op);
-    };
-
-    // TODO: hide behind API
-    struct ggml_backend {
-        struct ggml_backend_i iface;
-
-        ggml_backend_context_t context;
-    };
+    struct ggml_backend;
+    typedef struct ggml_backend * ggml_backend_t;
+    typedef void * ggml_backend_graph_plan_t;

-    // backend helper functions
     GGML_API ggml_backend_t ggml_get_backend(const struct ggml_tensor * tensor);

     GGML_API const char * ggml_backend_name(ggml_backend_t backend);
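After the refactor above, ggml-backend.h exposes only opaque handles plus the free/query functions; the interface structs move to the new ggml-backend-impl.h. A minimal C sketch of how the remaining public calls fit together, using only declarations visible in this diff (the CPU-backend helpers appear in the next hunk) plus ggml_backend_free, which is part of the same header but not shown here; error handling omitted:

```c
#include <stdio.h>
#include <stdlib.h>
#include "ggml-backend.h"

int main(void) {
    // create the CPU backend and query it through the opaque handle
    ggml_backend_t backend = ggml_backend_cpu_init();
    printf("backend: %s (cpu: %d)\n", ggml_backend_name(backend), ggml_backend_is_cpu(backend));
    ggml_backend_cpu_set_n_threads(backend, 4);

    // wrap an existing host allocation in a backend buffer
    size_t size = 16 * 1024 * 1024;
    void * mem = malloc(size);
    ggml_backend_buffer_t buf = ggml_backend_cpu_buffer_from_ptr(backend, mem, size);
    printf("base = %p, alignment = %zu\n",
           ggml_backend_buffer_get_base(buf), ggml_backend_buffer_get_alignment(buf));

    ggml_backend_buffer_free(buf);
    free(mem);                    // the buffer only wrapped this pointer, so we free it ourselves
    ggml_backend_free(backend);
    return 0;
}
```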
@@ -133,11 +65,72 @@ extern "C" {
     GGML_API ggml_backend_t ggml_backend_cpu_init(void);

     GGML_API bool ggml_backend_is_cpu(ggml_backend_t backend);
-
     GGML_API void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads);

+    // Create a backend buffer from an existing pointer
     GGML_API ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(ggml_backend_t backend_cpu, void * ptr, size_t size);

+
+    //
+    // Backend scheduler
+    //
+
+    // The backend scheduler allows for multiple backends to be used together
+    // Handles compute buffer allocation, assignment of tensors to backends, and copying of tensors between backends
+    // The backends are selected based on:
+    // - the backend that supports the operation
+    // - the location of the pre-allocated tensors (e.g. the weights)
+    /*
+      Example usage:
+
+        sched = ggml_backend_sched_new({backend_gpu, backend_gpu2, backend_cpu}, num_backends);
+        // sched is initialized with measure allocators and cannot be used until allocated with a measure graph
+
+        // initialize buffers from a measure graph
+        measure_graph = build_graph(sched); // use the allocr to allocate inputs as needed
+
+        // in build_graph:
+        build_graph(...) {
+            // allocating tensors in a specific backend (optional, recommended: pre-allocate inputs in a different buffer)
+            alloc_cpu = ggml_backend_sched_get_allocr(sched, backend_cpu);
+            ggml_allocr_alloc(alloc_cpu, tensor);
+
+            // manually assigning nodes to a backend (optional, shouldn't be needed in most cases)
+            struct ggml_tensor * node = ggml_mul_mat(ctx, ...);
+            ggml_backend_sched_set_node_backend(sched, node, backend_gpu);
+        }
+
+        // allocate backend buffers from measure graph
+        ggml_backend_sched_init_measure(sched, measure_graph);
+
+        // the scheduler is now ready to compute graphs
+
+        // compute
+        graph = build_graph(sched);
+        ggml_backend_sched_graph_compute(sched, graph);
+    */
+
+    struct ggml_backend_sched;
+    typedef struct ggml_backend_sched * ggml_backend_sched_t;
+
+    // Initialize a backend scheduler
+    GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, int n_backends);
+
+    GGML_API void ggml_backend_sched_free(ggml_backend_sched_t sched);
+
+    // Initialize backend buffers from a measure graph
+    GGML_API void ggml_backend_sched_init_measure(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph);
+
+    GGML_API ggml_tallocr_t        ggml_backend_sched_get_tallocr(ggml_backend_sched_t sched, ggml_backend_t backend);
+    GGML_API ggml_backend_buffer_t ggml_backend_sched_get_buffer (ggml_backend_sched_t sched, ggml_backend_t backend);
+
+    GGML_API void ggml_backend_sched_set_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend);
+
+    // Allocate a graph on the backend scheduler
+    GGML_API void ggml_backend_sched_graph_compute(
+            ggml_backend_sched_t sched,
+            struct ggml_cgraph * graph);
+
 #ifdef __cplusplus
 }
 #endif
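The scheduler hunk above documents the intended call sequence in a comment. Below is that flow written out as a hedged C sketch: only the ggml_backend_sched_* entry points declared in this hunk are assumed, while build_graph and the two backend handles are placeholders supplied by the caller.

```c
#include "ggml.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"

// hypothetical helper: builds the model graph, allocating inputs through the
// scheduler's per-backend allocator as the header comment suggests
extern struct ggml_cgraph * build_graph(ggml_backend_sched_t sched);

void run_with_scheduler(ggml_backend_t backend_gpu, ggml_backend_t backend_cpu) {
    ggml_backend_t backends[2] = { backend_gpu, backend_cpu };
    ggml_backend_sched_t sched = ggml_backend_sched_new(backends, 2);

    // measure pass: the scheduler sizes its compute buffers from this graph
    struct ggml_cgraph * measure_graph = build_graph(sched);
    ggml_backend_sched_init_measure(sched, measure_graph);

    // normal computation: rebuild the graph and let the scheduler split it
    // across the backends and copy tensors between them as needed
    struct ggml_cgraph * graph = build_graph(sched);
    ggml_backend_sched_graph_compute(sched, graph);

    ggml_backend_sched_free(sched);
}
```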
--- a/data/ext/llama_cpp/src/ggml-cuda.cu
+++ b/data/ext/llama_cpp/src/ggml-cuda.cu
@@ -81,12 +81,15 @@

 #include "ggml-cuda.h"
 #include "ggml.h"
+#include "ggml-backend-impl.h"

 #define MIN_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
 #define CC_VOLTA 700
 #define CC_OFFSET_AMD 1000000
 #define CC_RDNA2 (CC_OFFSET_AMD + 1030)

+#define GGML_CUDA_MAX_NODES 8192
+
 // define this if you want to always fallback to MMQ kernels and not use cuBLAS for matrix multiplication
 // on modern hardware, using cuBLAS is recommended as it utilizes F16 tensor cores which are very performant
 // for large computational tasks. the drawback is that this requires some extra amount of VRAM:
@@ -433,6 +436,8 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_
 #define CUDA_MUL_BLOCK_SIZE 256
 #define CUDA_GELU_BLOCK_SIZE 256
 #define CUDA_SILU_BLOCK_SIZE 256
+#define CUDA_RELU_BLOCK_SIZE 256
+#define CUDA_SQR_BLOCK_SIZE 256
 #define CUDA_CPY_BLOCK_SIZE 32
 #define CUDA_SCALE_BLOCK_SIZE 256
 #define CUDA_CLAMP_BLOCK_SIZE 256
@@ -553,6 +558,24 @@ static __global__ void silu_f32(const float * x, float * dst, const int k) {
     dst[i] = x[i] / (1.0f + expf(-x[i]));
 }

+static __global__ void relu_f32(const float * x, float * dst, const int k) {
+    const int i = blockDim.x*blockIdx.x + threadIdx.x;
+
+    if (i >= k) {
+        return;
+    }
+    dst[i] = fmaxf(x[i], 0);
+}
+
+static __global__ void sqr_f32(const float * x, float * dst, const int k) {
+    const int i = blockDim.x*blockIdx.x + threadIdx.x;
+
+    if (i >= k) {
+        return;
+    }
+    dst[i] = x[i] * x[i];
+}
+
 static __device__ __forceinline__ float2 warp_reduce_sum(float2 a) {
 #pragma unroll
     for (int mask = 16; mask > 0; mask >>= 1) {
@@ -4468,6 +4491,13 @@ static __device__ void cpy_1_f32_f16(const char * cxi, char * cdsti) {
     *dsti = __float2half(*xi);
 }

+static __device__ void cpy_1_f16_f16(const char * cxi, char * cdsti) {
+    const half * xi = (const half *) cxi;
+    half * dsti = (half *) cdsti;
+
+    *dsti = *xi;
+}
+
 template <cpy_kernel_t cpy_1>
 static __global__ void cpy_f32_f16(const char * cx, char * cdst, const int ne,
     const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
@@ -4721,6 +4751,25 @@ static __global__ void clamp_f32(const float * x, float * dst, const float min,
     dst[i] = x[i] < min ? min : (x[i] > max ? max : x[i]);
 }

+static __global__ void im2col_f32_f16(
+        const float * x, half * dst,
+        int ofs0, int ofs1, int IW, int IH, int CHW,
+        int s0, int s1, int p0, int p1, int d0, int d1) {
+    const int iiw = blockIdx.z * s0 + threadIdx.z * d0 - p0;
+    const int iih = blockIdx.y * s1 + threadIdx.y * d1 - p1;
+
+    const int offset_dst =
+        (threadIdx.x * gridDim.y * gridDim.z + blockIdx.y * gridDim.z + blockIdx.z) * CHW +
+        (blockIdx.x * (blockDim.y * blockDim.z) + threadIdx.y * blockDim.z + threadIdx.z);
+
+    if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) {
+        dst[offset_dst] = __float2half(0.0f);
+    } else {
+        const int offset_src = threadIdx.x * ofs0 + blockIdx.x * ofs1;
+        dst[offset_dst] = __float2half(x[offset_src + iih * IW + iiw]);
+    }
+}
+
 template<int qk, int qr, dequantize_kernel_t dq>
 static void get_rows_cuda(const void * x, const int32_t * y, float * dst, const int nrows, const int ncols, cudaStream_t stream) {
     const dim3 block_dims(CUDA_GET_ROWS_BLOCK_SIZE, 1, 1);
@@ -4759,6 +4808,16 @@ static void silu_f32_cuda(const float * x, float * dst, const int k, cudaStream_
     silu_f32<<<num_blocks, CUDA_SILU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
 }

+static void relu_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
+    const int num_blocks = (k + CUDA_RELU_BLOCK_SIZE - 1) / CUDA_RELU_BLOCK_SIZE;
+    relu_f32<<<num_blocks, CUDA_RELU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
+}
+
+static void sqr_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
+    const int num_blocks = (k + CUDA_SQR_BLOCK_SIZE - 1) / CUDA_SQR_BLOCK_SIZE;
+    sqr_f32<<<num_blocks, CUDA_SQR_BLOCK_SIZE, 0, stream>>>(x, dst, k);
+}
+
 static void norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % WARP_SIZE == 0);
     if (ncols < 1024) {
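The new launchers follow the same pattern as the existing element-wise ones: ceil(k / block_size) blocks of block_size threads, with threads past the end returning early in the kernel. A small stand-alone C sketch of that grid-size arithmetic and of what relu_f32/sqr_f32 compute per element (names here are local to the example, not part of ggml):

```c
#include <assert.h>
#include <math.h>

#define BLOCK_SIZE 256  // mirrors CUDA_RELU_BLOCK_SIZE / CUDA_SQR_BLOCK_SIZE

// ceil(k / BLOCK_SIZE): the block count used by the *_f32_cuda launchers
static int num_blocks(int k) {
    return (k + BLOCK_SIZE - 1) / BLOCK_SIZE;
}

// per-element reference for the two new kernels
static float relu_ref(float x) { return fmaxf(x, 0.0f); }
static float sqr_ref (float x) { return x * x; }

int main(void) {
    assert(num_blocks(1)   == 1);   // one partial block
    assert(num_blocks(256) == 1);   // exactly one full block
    assert(num_blocks(257) == 2);   // spills into a second block
    assert(relu_ref(-3.0f) == 0.0f);
    assert(sqr_ref(-3.0f)  == 9.0f);
    return 0;
}
```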
@@ -5611,6 +5670,16 @@ static void ggml_cpy_f32_f16_cuda(
         (cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12);
 }

+static void ggml_cpy_f16_f16_cuda(
+    const char * cx, char * cdst, const int ne,
+    const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
+    const int ne10, const int ne11, const int nb10, const int nb11, const int nb12, cudaStream_t stream) {
+
+    const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
+    cpy_f32_f16<cpy_1_f16_f16><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
+        (cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12);
+}
+
 static void scale_f32_cuda(const float * x, float * dst, const float scale, const int k, cudaStream_t stream) {
     const int num_blocks = (k + CUDA_SCALE_BLOCK_SIZE - 1) / CUDA_SCALE_BLOCK_SIZE;
     scale_f32<<<num_blocks, CUDA_SCALE_BLOCK_SIZE, 0, stream>>>(x, dst, scale, k);
@@ -5694,6 +5763,15 @@ static void soft_max_f32_cuda(const float * x, float * dst, const int ncols_x, c
     soft_max_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols_x);
 }

+static void im2col_f32_f16_cuda(const float * x, half * dst,
+    int OH, int IW, int IH, int OW, int IC,
+    int KH, int KW, int N, int ofs0, int ofs1,
+    int s0, int s1, int p0, int p1, int d0, int d1, cudaStream_t stream) {
+    dim3 block_nums(IC, OH, OW);
+    dim3 block_dims(N, KH, KW);
+    im2col_f32_f16<<<block_nums, block_dims, 0, stream>>>(x, dst, ofs0, ofs1, IW, IH, (IC * KH * KW), s0, s1, p0, p1, d0, d1);
+}
+
 // buffer pool for cuda
 #define MAX_CUDA_BUFFERS 256

@@ -5762,7 +5840,7 @@ static void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) {
         return ptr;
     }
 #ifdef DEBUG_CUDA_MALLOC
-    fprintf(stderr, "%s: %d buffers, max_size = %u
+    fprintf(stderr, "%s: %d buffers, max_size = %u MiB, tot_size = %u MiB, requested %u MiB\n", __func__, nnz,
             (uint32_t)(max_size/1024/1024), (uint32_t)(tot_size/1024/1024), (uint32_t)(size/1024/1024));
 #endif
     void * ptr;
@@ -5900,7 +5978,7 @@ void * ggml_cuda_host_malloc(size_t size) {
         // The allocation error can be bypassed. A null ptr will assigned out of this function.
         // This can fixed the OOM error in WSL.
         cudaGetLastError();
-        fprintf(stderr, "WARNING: failed to allocate %.2f
+        fprintf(stderr, "WARNING: failed to allocate %.2f MiB of pinned memory: %s\n",
             size/1024.0/1024.0, cudaGetErrorString(err));
         return nullptr;
     }
@@ -6128,6 +6206,34 @@ inline void ggml_cuda_op_silu(
     (void) src1_dd;
 }

+inline void ggml_cuda_op_relu(
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+    const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    relu_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
+
+    (void) src1;
+    (void) dst;
+    (void) src1_dd;
+}
+
+inline void ggml_cuda_op_sqr(
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+    const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    sqr_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
+
+    (void) src1;
+    (void) dst;
+    (void) src1_dd;
+}
+
 inline void ggml_cuda_op_norm(
     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
     const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
@@ -6250,6 +6356,7 @@ static int64_t get_row_rounding(ggml_type type) {
         case GGML_TYPE_Q8_0:
             return max_compute_capability >= CC_RDNA2 ? 128 : 64;
         case GGML_TYPE_F16:
+        case GGML_TYPE_F32:
             return 1;
         case GGML_TYPE_Q2_K:
             return max_compute_capability >= CC_RDNA2 ? 128 : 32;
@@ -6272,6 +6379,7 @@ static int64_t get_row_rounding(ggml_type type) {
         case GGML_TYPE_Q8_0:
             return 64;
         case GGML_TYPE_F16:
+        case GGML_TYPE_F32:
             return 1;
         case GGML_TYPE_Q2_K:
         case GGML_TYPE_Q3_K:
@@ -6463,8 +6571,7 @@ inline void ggml_cuda_op_mul_mat_cublas(
             src1_as_f16 = (half *) ggml_cuda_pool_malloc(ne * sizeof(half), &src1_as);
             to_fp16_cuda(src1_ddf_i, src1_as_f16, ne, stream);
         }
-        const half * src1_ptr = src1->type == GGML_TYPE_F16 ? (const half *)
-
+        const half * src1_ptr = src1->type == GGML_TYPE_F16 ? (const half *) src1_ddf_i : src1_as_f16;
         size_t dst_as = 0;
         half * dst_f16 = (half *) ggml_cuda_pool_malloc(row_diff*src1_ncols * sizeof(half), &dst_as);

@@ -6639,6 +6746,45 @@ inline void ggml_cuda_op_alibi(
     (void) src1_dd;
 }

+inline void ggml_cuda_op_im2col(
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+    const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F16);
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F16);
+
+    const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
+    const int32_t s1 = ((const int32_t*)(dst->op_params))[1];
+    const int32_t p0 = ((const int32_t*)(dst->op_params))[2];
+    const int32_t p1 = ((const int32_t*)(dst->op_params))[3];
+    const int32_t d0 = ((const int32_t*)(dst->op_params))[4];
+    const int32_t d1 = ((const int32_t*)(dst->op_params))[5];
+
+    const bool is_2D = ((const int32_t*)(dst->op_params))[6] == 1;
+
+    const int64_t N  = src1->ne[is_2D ? 3 : 2];
+    const int64_t IC = src1->ne[is_2D ? 2 : 1];
+    const int64_t IH = is_2D ? src1->ne[1] : 1;
+    const int64_t IW = src1->ne[0];
+
+    const int64_t KH = is_2D ? src0->ne[1] : 1;
+    const int64_t KW = src0->ne[0];
+
+    const int64_t OH = is_2D ? dst->ne[2] : 1;
+    const int64_t OW = dst->ne[1];
+
+    const size_t ofs0 = src1->nb[is_2D ? 3 : 2] / 4; // nb is byte offset, src is type float32
+    const size_t ofs1 = src1->nb[is_2D ? 2 : 1] / 4; // nb is byte offset, src is type float32
+
+    im2col_f32_f16_cuda(src1_dd, (half*) dst_dd,
+        OH, IW, IH, OW, IC, KH, KW, N,
+        ofs0, ofs1, s0, s1, p0, p1, d0, d1, main_stream);
+
+    (void) src0;
+    (void) src0_dd;
+}
+
 inline void ggml_cuda_op_diag_mask_inf(
     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
     const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
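ggml_cuda_op_im2col above unpacks the stride/padding/dilation parameters and hands them to the kernel added earlier, which writes one row of IC*KH*KW patch values for each of the N*OH*OW output positions. As a reading aid, here is a plain-C sketch of the same indexing for the 2D, contiguous-NCHW, float32 case (float output instead of half; names are local to the example and this is illustrative only, not the code ggml uses):

```c
#include <stddef.h>

// im2col, 2D float32: dst has N*OH*OW rows of IC*KH*KW patch values.
// Mirrors the index math of im2col_f32_f16 for a contiguous NCHW input.
static void im2col_2d_ref(const float *x, float *dst,
                          int N, int IC, int IH, int IW,
                          int KH, int KW, int OH, int OW,
                          int s0, int s1, int p0, int p1, int d0, int d1) {
    const int    CHW  = IC * KH * KW;            // length of one dst row
    const size_t ofs0 = (size_t)IC * IH * IW;    // stride between batch images
    const size_t ofs1 = (size_t)IH * IW;         // stride between channels

    for (int n = 0; n < N; n++)
    for (int oh = 0; oh < OH; oh++)
    for (int ow = 0; ow < OW; ow++)
    for (int ic = 0; ic < IC; ic++)
    for (int kh = 0; kh < KH; kh++)
    for (int kw = 0; kw < KW; kw++) {
        const int iih = oh * s1 + kh * d1 - p1;  // input row sampled by this tap
        const int iiw = ow * s0 + kw * d0 - p0;  // input column sampled by this tap
        const size_t odst =
            ((size_t)n * OH * OW + (size_t)oh * OW + ow) * CHW +
            ((size_t)ic * KH * KW + (size_t)kh * KW + kw);
        if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) {
            dst[odst] = 0.0f;                    // zero padding
        } else {
            dst[odst] = x[n * ofs0 + ic * ofs1 + (size_t)iih * IW + iiw];
        }
    }
}
```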
@@ -7160,6 +7306,14 @@ static void ggml_cuda_silu(const ggml_tensor * src0, const ggml_tensor * src1, g
     ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_silu);
 }

+static void ggml_cuda_relu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_relu);
+}
+
+static void ggml_cuda_sqr(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_sqr);
+}
+
 static void ggml_cuda_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_norm);
 }
@@ -7543,6 +7697,9 @@ static void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, gg
     } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16) {
         ggml_cpy_f32_f16_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02,
                               ne10, ne11, nb10, nb11, nb12, main_stream);
+    } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16) {
+        ggml_cpy_f16_f16_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02,
+                              ne10, ne11, nb10, nb11, nb12, main_stream);
     } else {
         fprintf(stderr, "%s: unsupported type combination (%s to %s)\n", __func__,
                 ggml_type_name(src0->type), ggml_type_name(src1->type));
@@ -7574,6 +7731,10 @@ static void ggml_cuda_alibi(const ggml_tensor * src0, const ggml_tensor * src1,
     ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_alibi);
 }

+static void ggml_cuda_im2col(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_im2col);
+}
+
 static void ggml_cuda_nop(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     (void) src0;
     (void) src1;
@@ -7685,11 +7846,11 @@ static size_t g_temp_tensor_extra_index = 0;

 static ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() {
     if (g_temp_tensor_extras == nullptr) {
-        g_temp_tensor_extras = new ggml_tensor_extra_gpu[
+        g_temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_CUDA_MAX_NODES];
     }

     size_t alloc_index = g_temp_tensor_extra_index;
-    g_temp_tensor_extra_index = (g_temp_tensor_extra_index + 1) %
+    g_temp_tensor_extra_index = (g_temp_tensor_extra_index + 1) % GGML_CUDA_MAX_NODES;
     ggml_tensor_extra_gpu * extra = &g_temp_tensor_extras[alloc_index];
     memset(extra, 0, sizeof(*extra));

@@ -7867,6 +8028,15 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
         return false;
     }

+    if (tensor->op == GGML_OP_MUL_MAT) {
+        if (tensor->src[0]->ne[3] != tensor->src[1]->ne[3]) {
+#ifndef NDEBUG
+            fprintf(stderr, "%s: cannot compute %s: src0->ne[3] = %d, src1->ne[3] = %d - fallback to CPU\n", __func__, tensor->name, tensor->src[0]->ne[3], tensor->src[1]->ne[3]);
+#endif
+            return false;
+        }
+    }
+
     switch (tensor->op) {
         case GGML_OP_REPEAT:
             func = ggml_cuda_repeat;
@@ -7891,6 +8061,9 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
                 case GGML_UNARY_OP_SILU:
                     func = ggml_cuda_silu;
                     break;
+                case GGML_UNARY_OP_RELU:
+                    func = ggml_cuda_relu;
+                    break;
                 default:
                     return false;
             } break;
@@ -7909,6 +8082,9 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
         case GGML_OP_SCALE:
             func = ggml_cuda_scale;
             break;
+        case GGML_OP_SQR:
+            func = ggml_cuda_sqr;
+            break;
         case GGML_OP_CLAMP:
             if (!any_on_device) {
                 return false;
@@ -7939,6 +8115,9 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
         case GGML_OP_ALIBI:
             func = ggml_cuda_alibi;
             break;
+        case GGML_OP_IM2COL:
+            func = ggml_cuda_im2col;
+            break;
         default:
             return false;
     }
@@ -7998,11 +8177,11 @@ struct ggml_backend_buffer_context_cuda {

     ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() {
         if (temp_tensor_extras == nullptr) {
-            temp_tensor_extras = new ggml_tensor_extra_gpu[
+            temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_CUDA_MAX_NODES];
         }

         size_t alloc_index = temp_tensor_extra_index;
-        temp_tensor_extra_index = (temp_tensor_extra_index + 1) %
+        temp_tensor_extra_index = (temp_tensor_extra_index + 1) % GGML_CUDA_MAX_NODES;
         ggml_tensor_extra_gpu * extra = &temp_tensor_extras[alloc_index];
         memset(extra, 0, sizeof(*extra));

@@ -8088,7 +8267,12 @@ static ggml_backend_buffer_t ggml_backend_cuda_alloc_buffer(ggml_backend_t backe
     ggml_cuda_set_device(g_main_device);

     ggml_backend_buffer_context_cuda * ctx = new ggml_backend_buffer_context_cuda;
+
+    size = std::max(size, (size_t)1); // cudaMalloc returns null for size 0
+
+    ggml_cuda_set_device(g_main_device);
     CUDA_CHECK(cudaMalloc(&ctx->device, size));
+
     return ggml_backend_buffer_init(backend, cuda_backend_buffer_interface, ctx, size);
 }

@@ -8155,6 +8339,8 @@ static void ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph
     for (int i = 0; i < cgraph->n_nodes; i++) {
         ggml_tensor * node = cgraph->nodes[i];

+        if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE)
+            continue;
         assert(node->backend == GGML_BACKEND_GPU);
         for (int j = 0; j < GGML_MAX_SRC; j++) {
             if (node->src[j] != nullptr) {
--- a/data/ext/llama_cpp/src/ggml-impl.h
+++ b/data/ext/llama_cpp/src/ggml-impl.h
@@ -39,12 +39,6 @@ extern "C" {
 #endif
 #endif

-#undef MIN
-#undef MAX
-
-#define MIN(a, b) ((a) < (b) ? (a) : (b))
-#define MAX(a, b) ((a) > (b) ? (a) : (b))
-
 // 16-bit float
 // on Arm, we use __fp16
 // on x86, we use uint16_t
@@ -230,7 +224,19 @@ inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {

 #endif

-
+#define GGML_HASHTABLE_FULL ((size_t)-1)
+#define GGML_HASHTABLE_ALREADY_EXISTS ((size_t)-2)
+
+bool   ggml_hash_contains      (const struct ggml_hash_set hash_set, struct ggml_tensor * key);
+
+// returns GGML_HASHTABLE_FULL if table is full, otherwise the current index of the key or where it should be inserted
+size_t ggml_hash_find          (const struct ggml_hash_set hash_set, struct ggml_tensor * key);
+
+// returns GGML_HAHSHTABLE_ALREADY_EXISTS if key already exists, index otherwise, asserts if table is full
+size_t ggml_hash_insert        (      struct ggml_hash_set hash_set, struct ggml_tensor * key);
+
+// return index, asserts if table is full
+size_t ggml_hash_find_or_insert(      struct ggml_hash_set hash_set, struct ggml_tensor * key);

 #ifdef __cplusplus
 }
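The new ggml-impl.h declarations expose a small fixed-size hash set over tensor pointers, with two sentinel return values. The ggml_hash_set layout itself is not shown in this diff, so the sketch below only illustrates the return-value convention (a FULL sentinel when probing finds no free slot, an ALREADY_EXISTS sentinel when inserting a key that is present) with a stand-alone open-addressing table over void pointers; it is not the ggml implementation, and all names are local to the example.

```c
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

#define TABLE_FULL           ((size_t)-1)  // plays the role of GGML_HASHTABLE_FULL
#define TABLE_ALREADY_EXISTS ((size_t)-2)  // plays the role of GGML_HASHTABLE_ALREADY_EXISTS
#define TABLE_SIZE 8

static const void * keys[TABLE_SIZE];      // NULL marks an empty slot

// linear probing: return the slot holding key, or the slot where it would go,
// or TABLE_FULL if every slot is taken by other keys
static size_t table_find(const void * key) {
    size_t h = ((uintptr_t)key >> 4) % TABLE_SIZE;
    for (size_t i = 0; i < TABLE_SIZE; i++) {
        size_t slot = (h + i) % TABLE_SIZE;
        if (keys[slot] == NULL || keys[slot] == key) {
            return slot;
        }
    }
    return TABLE_FULL;
}

// insert key; report TABLE_ALREADY_EXISTS for duplicates, TABLE_FULL when out of slots
static size_t table_insert(const void * key) {
    size_t slot = table_find(key);
    if (slot == TABLE_FULL) return TABLE_FULL;
    if (keys[slot] == key)  return TABLE_ALREADY_EXISTS;
    keys[slot] = key;
    return slot;
}

int main(void) {
    int a, b;
    printf("insert a -> %zu\n", table_insert(&a)); // fresh slot index
    printf("insert a -> %zu\n", table_insert(&a)); // prints (size_t)-2: already exists
    printf("insert b -> %zu\n", table_insert(&b)); // another slot index
    return 0;
}
```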