llama_cpp 0.9.2 → 0.9.3

ggml-backend.h:

@@ -1,51 +1,20 @@
 #pragma once
 
 #include "ggml.h"
+#include "ggml-alloc.h"
 
 #ifdef __cplusplus
 extern "C" {
 #endif
-struct ggml_backend;
-struct ggml_backend_buffer;
-
-// type-erased backend-specific types / wrappers
-typedef void * ggml_backend_context_t;
-typedef void * ggml_backend_graph_plan_t;
-typedef void * ggml_backend_buffer_context_t;
-
-// avoid accessing internals of these types
-typedef struct ggml_backend * ggml_backend_t;
-typedef struct ggml_backend_buffer * ggml_backend_buffer_t;
 
 //
-// backend buffer
+// Backend buffer
 //
 
-struct ggml_backend_buffer_i {
-    void   (*free_buffer)   (ggml_backend_buffer_t buffer);
-    void * (*get_base)      (ggml_backend_buffer_t buffer); // get base pointer
-    size_t (*get_alloc_size)(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); // pre-allocation callback
-    void   (*init_tensor)   (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); // post-allocation callback
-    void   (*free_tensor)   (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); // pre-free callback
-};
-
-// TODO: hide behind API
-struct ggml_backend_buffer {
-    struct ggml_backend_buffer_i iface;
-
-    ggml_backend_t backend;
-    ggml_backend_buffer_context_t context;
-
-    size_t size;
-};
+struct ggml_backend_buffer;
+typedef struct ggml_backend_buffer * ggml_backend_buffer_t;
 
 // backend buffer functions
-GGML_API ggml_backend_buffer_t ggml_backend_buffer_init(
-        struct ggml_backend * backend,
-        struct ggml_backend_buffer_i iface,
-        ggml_backend_buffer_context_t context,
-        size_t size);
-
 GGML_API void   ggml_backend_buffer_free          (ggml_backend_buffer_t buffer);
 GGML_API size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer);
 GGML_API void * ggml_backend_buffer_get_base      (ggml_backend_buffer_t buffer);
@@ -55,50 +24,13 @@ extern "C" {
 GGML_API void   ggml_backend_buffer_free_tensor   (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
 
 //
-// backend
+// Backend
 //
 
-struct ggml_backend_i {
-    const char * (*get_name)(ggml_backend_t backend);
-
-    void (*free)(ggml_backend_t backend);
-
-    // buffer allocation
-    ggml_backend_buffer_t (*alloc_buffer)(ggml_backend_t backend, size_t size);
-
-    // get buffer alignment
-    size_t (*get_alignment)(ggml_backend_t backend);
-
-    // tensor data access
-    // these functions can be asynchronous, helper functions are provided for synchronous access that automatically call synchronize
-    void (*set_tensor_async)(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
-    void (*get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
-    void (*synchronize)     (ggml_backend_t backend);
-
-    // (optional) copy tensor between different backends, allow for single-copy tranfers
-    void (*cpy_tensor_from)(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst);
-    void (*cpy_tensor_to)  (ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst);
-
-    // compute graph with a plan
-    ggml_backend_graph_plan_t (*graph_plan_create) (ggml_backend_t backend, struct ggml_cgraph * cgraph);
-    void                      (*graph_plan_free)   (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
-    void                      (*graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
-
-    // compute graph without a plan
-    void (*graph_compute)(ggml_backend_t backend, struct ggml_cgraph * cgraph);
-
-    // check if the backend supports an operation
-    bool (*supports_op)(ggml_backend_t backend, const struct ggml_tensor * op);
-};
-
-// TODO: hide behind API
-struct ggml_backend {
-    struct ggml_backend_i iface;
-
-    ggml_backend_context_t context;
-};
+struct ggml_backend;
+typedef struct ggml_backend * ggml_backend_t;
+typedef void * ggml_backend_graph_plan_t;
 
-// backend helper functions
 GGML_API ggml_backend_t ggml_get_backend(const struct ggml_tensor * tensor);
 
 GGML_API const char * ggml_backend_name(ggml_backend_t backend);
@@ -133,11 +65,72 @@ extern "C" {
 GGML_API ggml_backend_t ggml_backend_cpu_init(void);
 
 GGML_API bool ggml_backend_is_cpu(ggml_backend_t backend);
-
 GGML_API void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads);
 
+// Create a backend buffer from an existing pointer
 GGML_API ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(ggml_backend_t backend_cpu, void * ptr, size_t size);
 
+
+//
+// Backend scheduler
+//
+
+// The backend scheduler allows for multiple backends to be used together
+// Handles compute buffer allocation, assignment of tensors to backends, and copying of tensors between backends
+// The backends are selected based on:
+// - the backend that supports the operation
+// - the location of the pre-allocated tensors (e.g. the weights)
+/*
+  Example usage:
+
+    sched = ggml_backend_sched_new({backend_gpu, backend_gpu2, backend_cpu}, num_backends);
+    // sched is initialized with measure allocators and cannot be used until allocated with a measure graph
+
+    // initialize buffers from a measure graph
+    measure_graph = build_graph(sched); // use the allocr to allocate inputs as needed
+
+    // in build_graph:
+    build_graph(...) {
+        // allocating tensors in a specific backend (optional, recommended: pre-allocate inputs in a different buffer)
+        alloc_cpu = ggml_backend_sched_get_allocr(sched, backend_cpu);
+        ggml_allocr_alloc(alloc_cpu, tensor);
+
+        // manually assigning nodes to a backend (optional, shouldn't be needed in most cases)
+        struct ggml_tensor * node = ggml_mul_mat(ctx, ...);
+        ggml_backend_sched_set_node_backend(sched, node, backend_gpu);
+    }
+
+    // allocate backend buffers from measure graph
+    ggml_backend_sched_init_measure(sched, measure_graph);
+
+    // the scheduler is now ready to compute graphs
+
+    // compute
+    graph = build_graph(sched);
+    ggml_backend_sched_graph_compute(sched, graph);
+*/
+
+struct ggml_backend_sched;
+typedef struct ggml_backend_sched * ggml_backend_sched_t;
+
+// Initialize a backend scheduler
+GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, int n_backends);
+
+GGML_API void ggml_backend_sched_free(ggml_backend_sched_t sched);
+
+// Initialize backend buffers from a measure graph
+GGML_API void ggml_backend_sched_init_measure(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph);
+
+GGML_API ggml_tallocr_t        ggml_backend_sched_get_tallocr(ggml_backend_sched_t sched, ggml_backend_t backend);
+GGML_API ggml_backend_buffer_t ggml_backend_sched_get_buffer (ggml_backend_sched_t sched, ggml_backend_t backend);
+
+GGML_API void ggml_backend_sched_set_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend);
+
+// Allocate a graph on the backend scheduler
+GGML_API void ggml_backend_sched_graph_compute(
+        ggml_backend_sched_t sched,
+        struct ggml_cgraph * graph);
+
 #ifdef __cplusplus
 }
 #endif
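
The net effect of this header change is that ggml_backend_t, ggml_backend_buffer_t and the new ggml_backend_sched_t become opaque handles, with their internals moved behind the API (note the new ggml-backend-impl.h include in ggml-cuda.cu below). A minimal sketch of the scheduler workflow, condensed from the usage comment above; it is for illustration only, uses a single CPU backend to keep it short, and assumes a user-supplied build_graph() that constructs a ggml_cgraph:

    // illustration only: build_graph is a placeholder for user code
    static void compute_with_scheduler(struct ggml_cgraph * (*build_graph)(ggml_backend_sched_t)) {
        ggml_backend_t backend_cpu = ggml_backend_cpu_init();
        ggml_backend_cpu_set_n_threads(backend_cpu, 4);

        ggml_backend_t backends[] = { backend_cpu };
        ggml_backend_sched_t sched = ggml_backend_sched_new(backends, 1);

        // reserve compute buffers from a measure graph before doing any real work
        struct ggml_cgraph * measure_graph = build_graph(sched);
        ggml_backend_sched_init_measure(sched, measure_graph);

        // the scheduler can now compute graphs; tensors are assigned to backends automatically
        struct ggml_cgraph * graph = build_graph(sched);
        ggml_backend_sched_graph_compute(sched, graph);

        ggml_backend_sched_free(sched);
        ggml_backend_free(backend_cpu);
    }

In real use the backends array would list GPU backends first and the CPU backend last, matching the multi-backend example in the header comment.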
ggml-cuda.cu:

@@ -81,12 +81,15 @@
 
 #include "ggml-cuda.h"
 #include "ggml.h"
+#include "ggml-backend-impl.h"
 
 #define MIN_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
 #define CC_VOLTA 700
 #define CC_OFFSET_AMD 1000000
 #define CC_RDNA2 (CC_OFFSET_AMD + 1030)
 
+#define GGML_CUDA_MAX_NODES 8192
+
 // define this if you want to always fallback to MMQ kernels and not use cuBLAS for matrix multiplication
 // on modern hardware, using cuBLAS is recommended as it utilizes F16 tensor cores which are very performant
 // for large computational tasks. the drawback is that this requires some extra amount of VRAM:

@@ -433,6 +436,8 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_
 #define CUDA_MUL_BLOCK_SIZE 256
 #define CUDA_GELU_BLOCK_SIZE 256
 #define CUDA_SILU_BLOCK_SIZE 256
+#define CUDA_RELU_BLOCK_SIZE 256
+#define CUDA_SQR_BLOCK_SIZE 256
 #define CUDA_CPY_BLOCK_SIZE 32
 #define CUDA_SCALE_BLOCK_SIZE 256
 #define CUDA_CLAMP_BLOCK_SIZE 256
@@ -553,6 +558,24 @@ static __global__ void silu_f32(const float * x, float * dst, const int k) {
     dst[i] = x[i] / (1.0f + expf(-x[i]));
 }
 
+static __global__ void relu_f32(const float * x, float * dst, const int k) {
+    const int i = blockDim.x*blockIdx.x + threadIdx.x;
+
+    if (i >= k) {
+        return;
+    }
+    dst[i] = fmaxf(x[i], 0);
+}
+
+static __global__ void sqr_f32(const float * x, float * dst, const int k) {
+    const int i = blockDim.x*blockIdx.x + threadIdx.x;
+
+    if (i >= k) {
+        return;
+    }
+    dst[i] = x[i] * x[i];
+}
+
 static __device__ __forceinline__ float2 warp_reduce_sum(float2 a) {
 #pragma unroll
     for (int mask = 16; mask > 0; mask >>= 1) {

@@ -4468,6 +4491,13 @@ static __device__ void cpy_1_f32_f16(const char * cxi, char * cdsti) {
     *dsti = __float2half(*xi);
 }
 
+static __device__ void cpy_1_f16_f16(const char * cxi, char * cdsti) {
+    const half * xi = (const half *) cxi;
+    half * dsti = (half *) cdsti;
+
+    *dsti = *xi;
+}
+
 template <cpy_kernel_t cpy_1>
 static __global__ void cpy_f32_f16(const char * cx, char * cdst, const int ne,
                                    const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
@@ -4721,6 +4751,25 @@ static __global__ void clamp_f32(const float * x, float * dst, const float min,
     dst[i] = x[i] < min ? min : (x[i] > max ? max : x[i]);
 }
 
+static __global__ void im2col_f32_f16(
+        const float * x, half * dst,
+        int ofs0, int ofs1, int IW, int IH, int CHW,
+        int s0, int s1, int p0, int p1, int d0, int d1) {
+    const int iiw = blockIdx.z * s0 + threadIdx.z * d0 - p0;
+    const int iih = blockIdx.y * s1 + threadIdx.y * d1 - p1;
+
+    const int offset_dst =
+        (threadIdx.x * gridDim.y * gridDim.z + blockIdx.y * gridDim.z + blockIdx.z) * CHW +
+        (blockIdx.x * (blockDim.y * blockDim.z) + threadIdx.y * blockDim.z + threadIdx.z);
+
+    if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) {
+        dst[offset_dst] = __float2half(0.0f);
+    } else {
+        const int offset_src = threadIdx.x * ofs0 + blockIdx.x * ofs1;
+        dst[offset_dst] = __float2half(x[offset_src + iih * IW + iiw]);
+    }
+}
+
 template<int qk, int qr, dequantize_kernel_t dq>
 static void get_rows_cuda(const void * x, const int32_t * y, float * dst, const int nrows, const int ncols, cudaStream_t stream) {
     const dim3 block_dims(CUDA_GET_ROWS_BLOCK_SIZE, 1, 1);

@@ -4759,6 +4808,16 @@ static void silu_f32_cuda(const float * x, float * dst, const int k, cudaStream_
     silu_f32<<<num_blocks, CUDA_SILU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
 }
 
+static void relu_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
+    const int num_blocks = (k + CUDA_RELU_BLOCK_SIZE - 1) / CUDA_RELU_BLOCK_SIZE;
+    relu_f32<<<num_blocks, CUDA_RELU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
+}
+
+static void sqr_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
+    const int num_blocks = (k + CUDA_SQR_BLOCK_SIZE - 1) / CUDA_SQR_BLOCK_SIZE;
+    sqr_f32<<<num_blocks, CUDA_SQR_BLOCK_SIZE, 0, stream>>>(x, dst, k);
+}
+
 static void norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % WARP_SIZE == 0);
     if (ncols < 1024) {
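
The new RELU and SQR kernels follow the same one-thread-per-element launch pattern as the existing GELU/SILU ops: the grid size is the ceiling of k divided by the block size, and the `i >= k` guard keeps the last partial block from writing out of bounds. A standalone sketch of that pattern, using a hypothetical neg_f32 op purely for illustration:

    // illustration of the element-wise launch pattern used above (neg_f32 is hypothetical)
    #include <cuda_runtime.h>

    #define EXAMPLE_BLOCK_SIZE 256

    static __global__ void neg_f32(const float * x, float * dst, const int k) {
        const int i = blockDim.x*blockIdx.x + threadIdx.x;
        if (i >= k) {
            return; // threads past the end of the tensor do nothing
        }
        dst[i] = -x[i];
    }

    static void neg_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
        // e.g. k = 1000 -> (1000 + 255) / 256 = 4 blocks = 1024 threads, the last 24 idle
        const int num_blocks = (k + EXAMPLE_BLOCK_SIZE - 1) / EXAMPLE_BLOCK_SIZE;
        neg_f32<<<num_blocks, EXAMPLE_BLOCK_SIZE, 0, stream>>>(x, dst, k);
    }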
@@ -5611,6 +5670,16 @@ static void ggml_cpy_f32_f16_cuda(
         (cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12);
 }
 
+static void ggml_cpy_f16_f16_cuda(
+    const char * cx, char * cdst, const int ne,
+    const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
+    const int ne10, const int ne11, const int nb10, const int nb11, const int nb12, cudaStream_t stream) {
+
+    const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
+    cpy_f32_f16<cpy_1_f16_f16><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
+        (cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12);
+}
+
 static void scale_f32_cuda(const float * x, float * dst, const float scale, const int k, cudaStream_t stream) {
     const int num_blocks = (k + CUDA_SCALE_BLOCK_SIZE - 1) / CUDA_SCALE_BLOCK_SIZE;
     scale_f32<<<num_blocks, CUDA_SCALE_BLOCK_SIZE, 0, stream>>>(x, dst, scale, k);

@@ -5694,6 +5763,15 @@ static void soft_max_f32_cuda(const float * x, float * dst, const int ncols_x, c
     soft_max_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols_x);
 }
 
+static void im2col_f32_f16_cuda(const float * x, half * dst,
+    int OH, int IW, int IH, int OW, int IC,
+    int KH, int KW, int N, int ofs0, int ofs1,
+    int s0, int s1, int p0, int p1, int d0, int d1, cudaStream_t stream) {
+    dim3 block_nums(IC, OH, OW);
+    dim3 block_dims(N, KH, KW);
+    im2col_f32_f16<<<block_nums, block_dims, 0, stream>>>(x, dst, ofs0, ofs1, IW, IH, (IC * KH * KW), s0, s1, p0, p1, d0, d1);
+}
+
 // buffer pool for cuda
 #define MAX_CUDA_BUFFERS 256
 
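
For the new IM2COL path, the launcher maps one CUDA thread to one element of the unrolled matrix: the grid is (IC, OH, OW) and the block is (N, KH, KW), so every combination of channel, output position, kernel offset, and batch index gets its own thread, and out-of-range input positions are written as zero padding. The output extents OH/OW are read from dst->ne in ggml_cuda_op_im2col further down; for reference they follow the usual convolution size relation (the helper below is hypothetical, shown only to make that relation concrete):

    // standard convolution output-size relation assumed by the IM2COL shapes
    static inline int conv_out_size(int in_size, int kernel_size, int stride, int pad, int dilation) {
        return (in_size + 2*pad - dilation*(kernel_size - 1) - 1) / stride + 1;
    }
    // e.g. IH = 32, KH = 3, s1 = 1, p1 = 1, d1 = 1  ->  OH = (32 + 2 - 2 - 1)/1 + 1 = 32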
@@ -5762,7 +5840,7 @@ static void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) {
         return ptr;
     }
 #ifdef DEBUG_CUDA_MALLOC
-    fprintf(stderr, "%s: %d buffers, max_size = %u MB, tot_size = %u MB, requested %u MB\n", __func__, nnz,
+    fprintf(stderr, "%s: %d buffers, max_size = %u MiB, tot_size = %u MiB, requested %u MiB\n", __func__, nnz,
             (uint32_t)(max_size/1024/1024), (uint32_t)(tot_size/1024/1024), (uint32_t)(size/1024/1024));
 #endif
     void * ptr;

@@ -5900,7 +5978,7 @@ void * ggml_cuda_host_malloc(size_t size) {
         // The allocation error can be bypassed. A null ptr will assigned out of this function.
         // This can fixed the OOM error in WSL.
         cudaGetLastError();
-        fprintf(stderr, "WARNING: failed to allocate %.2f MB of pinned memory: %s\n",
+        fprintf(stderr, "WARNING: failed to allocate %.2f MiB of pinned memory: %s\n",
             size/1024.0/1024.0, cudaGetErrorString(err));
         return nullptr;
     }
@@ -6128,6 +6206,34 @@ inline void ggml_cuda_op_silu(
     (void) src1_dd;
 }
 
+inline void ggml_cuda_op_relu(
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+    const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    relu_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
+
+    (void) src1;
+    (void) dst;
+    (void) src1_dd;
+}
+
+inline void ggml_cuda_op_sqr(
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+    const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    sqr_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
+
+    (void) src1;
+    (void) dst;
+    (void) src1_dd;
+}
+
 inline void ggml_cuda_op_norm(
     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
     const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {

@@ -6250,6 +6356,7 @@ static int64_t get_row_rounding(ggml_type type) {
         case GGML_TYPE_Q8_0:
             return max_compute_capability >= CC_RDNA2 ? 128 : 64;
         case GGML_TYPE_F16:
+        case GGML_TYPE_F32:
             return 1;
         case GGML_TYPE_Q2_K:
             return max_compute_capability >= CC_RDNA2 ? 128 : 32;

@@ -6272,6 +6379,7 @@ static int64_t get_row_rounding(ggml_type type) {
         case GGML_TYPE_Q8_0:
             return 64;
         case GGML_TYPE_F16:
+        case GGML_TYPE_F32:
            return 1;
         case GGML_TYPE_Q2_K:
         case GGML_TYPE_Q3_K:

@@ -6463,8 +6571,7 @@ inline void ggml_cuda_op_mul_mat_cublas(
             src1_as_f16 = (half *) ggml_cuda_pool_malloc(ne * sizeof(half), &src1_as);
             to_fp16_cuda(src1_ddf_i, src1_as_f16, ne, stream);
         }
-        const half * src1_ptr = src1->type == GGML_TYPE_F16 ? (const half *) src1_ddq_i : src1_as_f16;
-
+        const half * src1_ptr = src1->type == GGML_TYPE_F16 ? (const half *) src1_ddf_i : src1_as_f16;
         size_t dst_as = 0;
         half * dst_f16 = (half *) ggml_cuda_pool_malloc(row_diff*src1_ncols * sizeof(half), &dst_as);
 
@@ -6639,6 +6746,45 @@ inline void ggml_cuda_op_alibi(
     (void) src1_dd;
 }
 
+inline void ggml_cuda_op_im2col(
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+    const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F16);
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F16);
+
+    const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
+    const int32_t s1 = ((const int32_t*)(dst->op_params))[1];
+    const int32_t p0 = ((const int32_t*)(dst->op_params))[2];
+    const int32_t p1 = ((const int32_t*)(dst->op_params))[3];
+    const int32_t d0 = ((const int32_t*)(dst->op_params))[4];
+    const int32_t d1 = ((const int32_t*)(dst->op_params))[5];
+
+    const bool is_2D = ((const int32_t*)(dst->op_params))[6] == 1;
+
+    const int64_t N  = src1->ne[is_2D ? 3 : 2];
+    const int64_t IC = src1->ne[is_2D ? 2 : 1];
+    const int64_t IH = is_2D ? src1->ne[1] : 1;
+    const int64_t IW = src1->ne[0];
+
+    const int64_t KH = is_2D ? src0->ne[1] : 1;
+    const int64_t KW = src0->ne[0];
+
+    const int64_t OH = is_2D ? dst->ne[2] : 1;
+    const int64_t OW = dst->ne[1];
+
+    const size_t ofs0 = src1->nb[is_2D ? 3 : 2] / 4; // nb is byte offset, src is type float32
+    const size_t ofs1 = src1->nb[is_2D ? 2 : 1] / 4; // nb is byte offset, src is type float32
+
+    im2col_f32_f16_cuda(src1_dd, (half*) dst_dd,
+        OH, IW, IH, OW, IC, KH, KW, N,
+        ofs0, ofs1, s0, s1, p0, p1, d0, d1, main_stream);
+
+    (void) src0;
+    (void) src0_dd;
+}
+
 inline void ggml_cuda_op_diag_mask_inf(
     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
     const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
@@ -7160,6 +7306,14 @@ static void ggml_cuda_silu(const ggml_tensor * src0, const ggml_tensor * src1, g
     ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_silu);
 }
 
+static void ggml_cuda_relu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_relu);
+}
+
+static void ggml_cuda_sqr(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_sqr);
+}
+
 static void ggml_cuda_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_norm);
 }

@@ -7543,6 +7697,9 @@ static void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, gg
     } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16) {
         ggml_cpy_f32_f16_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02,
                               ne10, ne11, nb10, nb11, nb12, main_stream);
+    } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16) {
+        ggml_cpy_f16_f16_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02,
+                              ne10, ne11, nb10, nb11, nb12, main_stream);
     } else {
         fprintf(stderr, "%s: unsupported type combination (%s to %s)\n", __func__,
                 ggml_type_name(src0->type), ggml_type_name(src1->type));

@@ -7574,6 +7731,10 @@ static void ggml_cuda_alibi(const ggml_tensor * src0, const ggml_tensor * src1,
     ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_alibi);
 }
 
+static void ggml_cuda_im2col(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_im2col);
+}
+
 static void ggml_cuda_nop(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     (void) src0;
     (void) src1;
@@ -7685,11 +7846,11 @@ static size_t g_temp_tensor_extra_index = 0;
 
 static ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() {
     if (g_temp_tensor_extras == nullptr) {
-        g_temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_MAX_NODES];
+        g_temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_CUDA_MAX_NODES];
     }
 
     size_t alloc_index = g_temp_tensor_extra_index;
-    g_temp_tensor_extra_index = (g_temp_tensor_extra_index + 1) % GGML_MAX_NODES;
+    g_temp_tensor_extra_index = (g_temp_tensor_extra_index + 1) % GGML_CUDA_MAX_NODES;
     ggml_tensor_extra_gpu * extra = &g_temp_tensor_extras[alloc_index];
     memset(extra, 0, sizeof(*extra));
 

@@ -7867,6 +8028,15 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
         return false;
     }
 
+    if (tensor->op == GGML_OP_MUL_MAT) {
+        if (tensor->src[0]->ne[3] != tensor->src[1]->ne[3]) {
+#ifndef NDEBUG
+            fprintf(stderr, "%s: cannot compute %s: src0->ne[3] = %d, src1->ne[3] = %d - fallback to CPU\n", __func__, tensor->name, tensor->src[0]->ne[3], tensor->src[1]->ne[3]);
+#endif
+            return false;
+        }
+    }
+
     switch (tensor->op) {
         case GGML_OP_REPEAT:
             func = ggml_cuda_repeat;
@@ -7891,6 +8061,9 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
                 case GGML_UNARY_OP_SILU:
                     func = ggml_cuda_silu;
                     break;
+                case GGML_UNARY_OP_RELU:
+                    func = ggml_cuda_relu;
+                    break;
                 default:
                     return false;
             } break;

@@ -7909,6 +8082,9 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
         case GGML_OP_SCALE:
             func = ggml_cuda_scale;
             break;
+        case GGML_OP_SQR:
+            func = ggml_cuda_sqr;
+            break;
         case GGML_OP_CLAMP:
             if (!any_on_device) {
                 return false;

@@ -7939,6 +8115,9 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
         case GGML_OP_ALIBI:
             func = ggml_cuda_alibi;
             break;
+        case GGML_OP_IM2COL:
+            func = ggml_cuda_im2col;
+            break;
         default:
             return false;
     }
@@ -7998,11 +8177,11 @@ struct ggml_backend_buffer_context_cuda {
 
     ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() {
         if (temp_tensor_extras == nullptr) {
-            temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_MAX_NODES];
+            temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_CUDA_MAX_NODES];
         }
 
         size_t alloc_index = temp_tensor_extra_index;
-        temp_tensor_extra_index = (temp_tensor_extra_index + 1) % GGML_MAX_NODES;
+        temp_tensor_extra_index = (temp_tensor_extra_index + 1) % GGML_CUDA_MAX_NODES;
         ggml_tensor_extra_gpu * extra = &temp_tensor_extras[alloc_index];
         memset(extra, 0, sizeof(*extra));
 

@@ -8088,7 +8267,12 @@ static ggml_backend_buffer_t ggml_backend_cuda_alloc_buffer(ggml_backend_t backe
     ggml_cuda_set_device(g_main_device);
 
     ggml_backend_buffer_context_cuda * ctx = new ggml_backend_buffer_context_cuda;
+
+    size = std::max(size, (size_t)1); // cudaMalloc returns null for size 0
+
+    ggml_cuda_set_device(g_main_device);
     CUDA_CHECK(cudaMalloc(&ctx->device, size));
+
     return ggml_backend_buffer_init(backend, cuda_backend_buffer_interface, ctx, size);
 }
 

@@ -8155,6 +8339,8 @@ static void ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph
     for (int i = 0; i < cgraph->n_nodes; i++) {
         ggml_tensor * node = cgraph->nodes[i];
 
+        if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE)
+            continue;
         assert(node->backend == GGML_BACKEND_GPU);
         for (int j = 0; j < GGML_MAX_SRC; j++) {
             if (node->src[j] != nullptr) {
ggml-impl.h:

@@ -39,12 +39,6 @@ extern "C" {
 #endif
 #endif
 
-#undef MIN
-#undef MAX
-
-#define MIN(a, b) ((a) < (b) ? (a) : (b))
-#define MAX(a, b) ((a) > (b) ? (a) : (b))
-
 // 16-bit float
 // on Arm, we use __fp16
 // on x86, we use uint16_t
@@ -230,7 +224,19 @@ inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
 
 #endif
 
-// TODO: backend v2 PR
+#define GGML_HASHTABLE_FULL ((size_t)-1)
+#define GGML_HASHTABLE_ALREADY_EXISTS ((size_t)-2)
+
+bool   ggml_hash_contains      (const struct ggml_hash_set hash_set, struct ggml_tensor * key);
+
+// returns GGML_HASHTABLE_FULL if table is full, otherwise the current index of the key or where it should be inserted
+size_t ggml_hash_find          (const struct ggml_hash_set hash_set, struct ggml_tensor * key);
+
+// returns GGML_HAHSHTABLE_ALREADY_EXISTS if key already exists, index otherwise, asserts if table is full
+size_t ggml_hash_insert        (      struct ggml_hash_set hash_set, struct ggml_tensor * key);
+
+// return index, asserts if table is full
+size_t ggml_hash_find_or_insert(      struct ggml_hash_set hash_set, struct ggml_tensor * key);
 
 #ifdef __cplusplus
 }
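
The old "backend v2 PR" placeholder is replaced by a small hash-set API keyed on tensor pointers, with two sentinel return values. A hedged sketch of how the return codes are meant to be read; how a struct ggml_hash_set instance is created is internal and not part of this diff, so the hash_set below is assumed to come from that internal code:

    // assumes `hash_set` was obtained from ggml's internal graph bookkeeping
    static void mark_visited(struct ggml_hash_set hash_set, struct ggml_tensor * t) {
        if (ggml_hash_contains(hash_set, t)) {
            return; // already tracked
        }
        size_t idx = ggml_hash_insert(hash_set, t);
        if (idx == GGML_HASHTABLE_ALREADY_EXISTS) {
            return; // inserted by an earlier call; nothing to do
        }
        // otherwise idx is the slot the key was placed in; a full table asserts inside ggml_hash_insert
        (void) idx;
    }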
ggml-metal.h:

@@ -26,7 +26,7 @@
 #include <stdbool.h>
 
 // max memory buffers that can be mapped to the device
-#define GGML_METAL_MAX_BUFFERS 16
+#define GGML_METAL_MAX_BUFFERS 64
 #define GGML_METAL_MAX_COMMAND_BUFFERS 32
 
 struct ggml_tensor;