llama_cpp 0.9.2 → 0.9.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,51 +1,20 @@
 #pragma once

 #include "ggml.h"
+#include "ggml-alloc.h"

 #ifdef __cplusplus
 extern "C" {
 #endif
-    struct ggml_backend;
-    struct ggml_backend_buffer;
-
-    // type-erased backend-specific types / wrappers
-    typedef void * ggml_backend_context_t;
-    typedef void * ggml_backend_graph_plan_t;
-    typedef void * ggml_backend_buffer_context_t;
-
-    // avoid accessing internals of these types
-    typedef struct ggml_backend * ggml_backend_t;
-    typedef struct ggml_backend_buffer * ggml_backend_buffer_t;

     //
-    // backend buffer
+    // Backend buffer
     //

-    struct ggml_backend_buffer_i {
-        void   (*free_buffer)   (ggml_backend_buffer_t buffer);
-        void * (*get_base)      (ggml_backend_buffer_t buffer); // get base pointer
-        size_t (*get_alloc_size)(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); // pre-allocation callback
-        void   (*init_tensor)   (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); // post-allocation callback
-        void   (*free_tensor)   (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); // pre-free callback
-    };
-
-    // TODO: hide behind API
-    struct ggml_backend_buffer {
-        struct ggml_backend_buffer_i iface;
-
-        ggml_backend_t                backend;
-        ggml_backend_buffer_context_t context;
-
-        size_t size;
-    };
+    struct ggml_backend_buffer;
+    typedef struct ggml_backend_buffer * ggml_backend_buffer_t;

     // backend buffer functions
-    GGML_API ggml_backend_buffer_t ggml_backend_buffer_init(
-            struct ggml_backend           * backend,
-            struct ggml_backend_buffer_i    iface,
-            ggml_backend_buffer_context_t   context,
-            size_t                          size);
-
     GGML_API void   ggml_backend_buffer_free          (ggml_backend_buffer_t buffer);
     GGML_API size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer);
     GGML_API void * ggml_backend_buffer_get_base      (ggml_backend_buffer_t buffer);
@@ -55,50 +24,13 @@ extern "C" {
     GGML_API void   ggml_backend_buffer_free_tensor   (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);

     //
-    // backend
+    // Backend
     //

-    struct ggml_backend_i {
-        const char * (*get_name)(ggml_backend_t backend);
-
-        void (*free)(ggml_backend_t backend);
-
-        // buffer allocation
-        ggml_backend_buffer_t (*alloc_buffer)(ggml_backend_t backend, size_t size);
-
-        // get buffer alignment
-        size_t (*get_alignment)(ggml_backend_t backend);
-
-        // tensor data access
-        // these functions can be asynchronous; helper functions are provided for synchronous access that automatically call synchronize
-        void (*set_tensor_async)(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
-        void (*get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
-        void (*synchronize)     (ggml_backend_t backend);
-
-        // (optional) copy tensor between different backends, allows for single-copy transfers
-        void (*cpy_tensor_from)(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst);
-        void (*cpy_tensor_to)  (ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst);
-
-        // compute graph with a plan
-        ggml_backend_graph_plan_t (*graph_plan_create) (ggml_backend_t backend, struct ggml_cgraph * cgraph);
-        void                      (*graph_plan_free)   (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
-        void                      (*graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
-
-        // compute graph without a plan
-        void (*graph_compute)(ggml_backend_t backend, struct ggml_cgraph * cgraph);
-
-        // check if the backend supports an operation
-        bool (*supports_op)(ggml_backend_t backend, const struct ggml_tensor * op);
-    };
-
-    // TODO: hide behind API
-    struct ggml_backend {
-        struct ggml_backend_i iface;
-
-        ggml_backend_context_t context;
-    };
+    struct ggml_backend;
+    typedef struct ggml_backend * ggml_backend_t;
+    typedef void * ggml_backend_graph_plan_t;

-    // backend helper functions
     GGML_API ggml_backend_t ggml_get_backend(const struct ggml_tensor * tensor);

     GGML_API const char * ggml_backend_name(ggml_backend_t backend);
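
With the implementation structs moved behind an internal header, client code now goes through the GGML_API accessors only. A minimal sketch of that usage (ggml_backend_alloc_buffer and ggml_backend_free are assumed from unchanged parts of this header):

```cpp
// sketch only: allocate a buffer through the opaque handle API and query it
#include "ggml.h"
#include "ggml-backend.h"

void buffer_roundtrip(ggml_backend_t backend) {
    ggml_backend_buffer_t buf = ggml_backend_alloc_buffer(backend, 4096);

    void * base  = ggml_backend_buffer_get_base(buf);      // backend-owned base pointer
    size_t align = ggml_backend_buffer_get_alignment(buf); // required tensor alignment
    (void) base; (void) align;

    ggml_backend_buffer_free(buf);
}
```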
@@ -133,11 +65,72 @@ extern "C" {
     GGML_API ggml_backend_t ggml_backend_cpu_init(void);

     GGML_API bool ggml_backend_is_cpu(ggml_backend_t backend);
-
     GGML_API void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads);

+    // Create a backend buffer from an existing pointer
     GGML_API ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(ggml_backend_t backend_cpu, void * ptr, size_t size);

+
+    //
+    // Backend scheduler
+    //
+
+    // The backend scheduler allows multiple backends to be used together
+    // Handles compute buffer allocation, assignment of tensors to backends, and copying of tensors between backends
+    // The backends are selected based on:
+    // - the backend that supports the operation
+    // - the location of the pre-allocated tensors (e.g. the weights)
+    /*
+      Example usage:
+
+        sched = ggml_backend_sched_new({backend_gpu, backend_gpu2, backend_cpu}, num_backends);
+        // sched is initialized with measure allocators and cannot be used until allocated with a measure graph
+
+        // initialize buffers from a measure graph
+        measure_graph = build_graph(sched); // use the allocator to allocate inputs as needed
+
+        // in build_graph:
+        build_graph(...) {
+            // allocating tensors in a specific backend (optional, recommended: pre-allocate inputs in a different buffer)
+            alloc_cpu = ggml_backend_sched_get_tallocr(sched, backend_cpu);
+            ggml_tallocr_alloc(alloc_cpu, tensor);
+
+            // manually assigning nodes to a backend (optional, shouldn't be needed in most cases)
+            struct ggml_tensor * node = ggml_mul_mat(ctx, ...);
+            ggml_backend_sched_set_node_backend(sched, node, backend_gpu);
+        }
+
+        // allocate backend buffers from measure graph
+        ggml_backend_sched_init_measure(sched, measure_graph);
+
+        // the scheduler is now ready to compute graphs
+
+        // compute
+        graph = build_graph(sched);
+        ggml_backend_sched_graph_compute(sched, graph);
+    */
+
+    struct ggml_backend_sched;
+    typedef struct ggml_backend_sched * ggml_backend_sched_t;
+
+    // Initialize a backend scheduler
+    GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, int n_backends);
+
+    GGML_API void ggml_backend_sched_free(ggml_backend_sched_t sched);
+
+    // Initialize backend buffers from a measure graph
+    GGML_API void ggml_backend_sched_init_measure(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph);
+
+    GGML_API ggml_tallocr_t        ggml_backend_sched_get_tallocr(ggml_backend_sched_t sched, ggml_backend_t backend);
+    GGML_API ggml_backend_buffer_t ggml_backend_sched_get_buffer (ggml_backend_sched_t sched, ggml_backend_t backend);
+
+    GGML_API void ggml_backend_sched_set_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend);
+
+    // Allocate and compute a graph on the backend scheduler
+    GGML_API void ggml_backend_sched_graph_compute(
+            ggml_backend_sched_t sched,
+            struct ggml_cgraph * graph);
+
 #ifdef __cplusplus
 }
 #endif
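
A compilable variant of the usage comment above, restricted to the CPU backend so it relies only on this header; build_graph stands in for user code that records the same ggml_cgraph on each call, and ggml_backend_free is assumed from an unchanged part of the header (a sketch, not the library's own example):

```cpp
#include "ggml.h"
#include "ggml-backend.h"

// user code: records the compute graph, allocating inputs through the
// scheduler's per-backend tallocr as described in the comment above
extern struct ggml_cgraph * build_graph(ggml_backend_sched_t sched);

void compute_with_sched(void) {
    ggml_backend_t backend_cpu = ggml_backend_cpu_init();
    ggml_backend_t backends[1] = { backend_cpu };

    ggml_backend_sched_t sched = ggml_backend_sched_new(backends, 1);

    // measure pass: sizes the compute buffers for this graph shape
    struct ggml_cgraph * measure_graph = build_graph(sched);
    ggml_backend_sched_init_measure(sched, measure_graph);

    // compute pass: rebuild the graph and run it
    struct ggml_cgraph * graph = build_graph(sched);
    ggml_backend_sched_graph_compute(sched, graph);

    ggml_backend_sched_free(sched);
    ggml_backend_free(backend_cpu);
}
```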
@@ -81,12 +81,15 @@

 #include "ggml-cuda.h"
 #include "ggml.h"
+#include "ggml-backend-impl.h"

 #define MIN_CC_DP4A   610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
 #define CC_VOLTA      700
 #define CC_OFFSET_AMD 1000000
 #define CC_RDNA2      (CC_OFFSET_AMD + 1030)

+#define GGML_CUDA_MAX_NODES 8192
+
 // define this if you want to always fall back to MMQ kernels and not use cuBLAS for matrix multiplication
 // on modern hardware, using cuBLAS is recommended as it utilizes F16 tensor cores which are very performant
 // for large computational tasks. the drawback is that this requires some extra amount of VRAM:
@@ -433,6 +436,8 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_
 #define CUDA_MUL_BLOCK_SIZE 256
 #define CUDA_GELU_BLOCK_SIZE 256
 #define CUDA_SILU_BLOCK_SIZE 256
+#define CUDA_RELU_BLOCK_SIZE 256
+#define CUDA_SQR_BLOCK_SIZE 256
 #define CUDA_CPY_BLOCK_SIZE 32
 #define CUDA_SCALE_BLOCK_SIZE 256
 #define CUDA_CLAMP_BLOCK_SIZE 256
@@ -553,6 +558,24 @@ static __global__ void silu_f32(const float * x, float * dst, const int k) {
     dst[i] = x[i] / (1.0f + expf(-x[i]));
 }

+static __global__ void relu_f32(const float * x, float * dst, const int k) {
+    const int i = blockDim.x*blockIdx.x + threadIdx.x;
+
+    if (i >= k) {
+        return;
+    }
+    dst[i] = fmaxf(x[i], 0);
+}
+
+static __global__ void sqr_f32(const float * x, float * dst, const int k) {
+    const int i = blockDim.x*blockIdx.x + threadIdx.x;
+
+    if (i >= k) {
+        return;
+    }
+    dst[i] = x[i] * x[i];
+}
+
 static __device__ __forceinline__ float2 warp_reduce_sum(float2 a) {
 #pragma unroll
     for (int mask = 16; mask > 0; mask >>= 1) {
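
Both new kernels follow the file's standard one-thread-per-element pattern: compute a global index, bounds-check against k, write one output. A host-side reference of the same math, for illustration only:

```cpp
#include <algorithm>

// each CUDA thread above performs exactly one iteration of these loops
static void relu_ref(const float * x, float * dst, int k) {
    for (int i = 0; i < k; ++i) {
        dst[i] = std::max(x[i], 0.0f); // matches fmaxf(x[i], 0)
    }
}

static void sqr_ref(const float * x, float * dst, int k) {
    for (int i = 0; i < k; ++i) {
        dst[i] = x[i] * x[i];
    }
}
```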
@@ -4468,6 +4491,13 @@ static __device__ void cpy_1_f32_f16(const char * cxi, char * cdsti) {
     *dsti = __float2half(*xi);
 }

+static __device__ void cpy_1_f16_f16(const char * cxi, char * cdsti) {
+    const half * xi = (const half *) cxi;
+    half * dsti = (half *) cdsti;
+
+    *dsti = *xi;
+}
+
 template <cpy_kernel_t cpy_1>
 static __global__ void cpy_f32_f16(const char * cx, char * cdst, const int ne,
                                    const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
@@ -4721,6 +4751,25 @@ static __global__ void clamp_f32(const float * x, float * dst, const float min,
     dst[i] = x[i] < min ? min : (x[i] > max ? max : x[i]);
 }

+static __global__ void im2col_f32_f16(
+        const float * x, half * dst,
+        int ofs0, int ofs1, int IW, int IH, int CHW,
+        int s0, int s1, int p0, int p1, int d0, int d1) {
+    const int iiw = blockIdx.z * s0 + threadIdx.z * d0 - p0;
+    const int iih = blockIdx.y * s1 + threadIdx.y * d1 - p1;
+
+    const int offset_dst =
+        (threadIdx.x * gridDim.y * gridDim.z + blockIdx.y * gridDim.z + blockIdx.z) * CHW +
+        (blockIdx.x * (blockDim.y * blockDim.z) + threadIdx.y * blockDim.z + threadIdx.z);
+
+    if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) {
+        dst[offset_dst] = __float2half(0.0f);
+    } else {
+        const int offset_src = threadIdx.x * ofs0 + blockIdx.x * ofs1;
+        dst[offset_dst] = __float2half(x[offset_src + iih * IW + iiw]);
+    }
+}
+
 template<int qk, int qr, dequantize_kernel_t dq>
 static void get_rows_cuda(const void * x, const int32_t * y, float * dst, const int nrows, const int ncols, cudaStream_t stream) {
     const dim3 block_dims(CUDA_GET_ROWS_BLOCK_SIZE, 1, 1);
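
The launch configuration (shown in a later hunk) maps blockIdx to (channel, output row, output column) and threadIdx to (batch, kernel row, kernel column), so each thread writes one patch element. A plain host-side reference of the same index math, using float output instead of half for brevity (illustration only):

```cpp
#include <cstdint>

// CPU reference mirroring im2col_f32_f16's indexing; ofs0/ofs1 are the batch
// and channel strides of the source, in elements
static void im2col_ref(const float * x, float * dst,
                       int N, int IC, int IH, int IW, int OH, int OW,
                       int KH, int KW, int s0, int s1, int p0, int p1,
                       int d0, int d1, int64_t ofs0, int64_t ofs1) {
    const int CHW = IC * KH * KW;
    for (int n = 0; n < N; ++n)
    for (int ic = 0; ic < IC; ++ic)
    for (int oh = 0; oh < OH; ++oh)
    for (int ow = 0; ow < OW; ++ow)
    for (int kh = 0; kh < KH; ++kh)
    for (int kw = 0; kw < KW; ++kw) {
        const int iiw = ow * s0 + kw * d0 - p0; // input column for this tap
        const int iih = oh * s1 + kh * d1 - p1; // input row for this tap
        const int64_t odst = ((int64_t) n * OH * OW + (int64_t) oh * OW + ow) * CHW
                           + ic * KH * KW + kh * KW + kw;
        dst[odst] = (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW)
            ? 0.0f // zero padding outside the image
            : x[n * ofs0 + ic * ofs1 + (int64_t) iih * IW + iiw];
    }
}
```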
@@ -4759,6 +4808,16 @@ static void silu_f32_cuda(const float * x, float * dst, const int k, cudaStream_
     silu_f32<<<num_blocks, CUDA_SILU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
 }

+static void relu_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
+    const int num_blocks = (k + CUDA_RELU_BLOCK_SIZE - 1) / CUDA_RELU_BLOCK_SIZE;
+    relu_f32<<<num_blocks, CUDA_RELU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
+}
+
+static void sqr_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
+    const int num_blocks = (k + CUDA_SQR_BLOCK_SIZE - 1) / CUDA_SQR_BLOCK_SIZE;
+    sqr_f32<<<num_blocks, CUDA_SQR_BLOCK_SIZE, 0, stream>>>(x, dst, k);
+}
+
 static void norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % WARP_SIZE == 0);
     if (ncols < 1024) {
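
The launchers size the grid with the usual ceil-division idiom, so the final, possibly partial block still covers the tail of the array; the in-kernel i >= k check discards the overhang. The arithmetic, spelled out:

```cpp
// ceil division: smallest number of fixed-size blocks covering k elements
constexpr int ceil_div(int k, int block_size) {
    return (k + block_size - 1) / block_size;
}

static_assert(ceil_div(1000, 256) == 4, "1000 elements need 4 blocks of 256");
static_assert(ceil_div(1024, 256) == 4, "an exact multiple adds no extra block");
```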
@@ -5611,6 +5670,16 @@ static void ggml_cpy_f32_f16_cuda(
         (cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12);
 }

+static void ggml_cpy_f16_f16_cuda(
+    const char * cx, char * cdst, const int ne,
+    const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
+    const int ne10, const int ne11, const int nb10, const int nb11, const int nb12, cudaStream_t stream) {
+
+    const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
+    cpy_f32_f16<cpy_1_f16_f16><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
+        (cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12);
+}
+
 static void scale_f32_cuda(const float * x, float * dst, const float scale, const int k, cudaStream_t stream) {
     const int num_blocks = (k + CUDA_SCALE_BLOCK_SIZE - 1) / CUDA_SCALE_BLOCK_SIZE;
     scale_f32<<<num_blocks, CUDA_SCALE_BLOCK_SIZE, 0, stream>>>(x, dst, scale, k);
@@ -5694,6 +5763,15 @@ static void soft_max_f32_cuda(const float * x, float * dst, const int ncols_x, c
     soft_max_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols_x);
 }

+static void im2col_f32_f16_cuda(const float * x, half * dst,
+        int OH, int IW, int IH, int OW, int IC,
+        int KH, int KW, int N, int ofs0, int ofs1,
+        int s0, int s1, int p0, int p1, int d0, int d1, cudaStream_t stream) {
+    dim3 block_nums(IC, OH, OW);
+    dim3 block_dims(N, KH, KW);
+    im2col_f32_f16<<<block_nums, block_dims, 0, stream>>>(x, dst, ofs0, ofs1, IW, IH, (IC * KH * KW), s0, s1, p0, p1, d0, d1);
+}
+
 // buffer pool for cuda
 #define MAX_CUDA_BUFFERS 256

@@ -5762,7 +5840,7 @@ static void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) {
         return ptr;
     }
 #ifdef DEBUG_CUDA_MALLOC
-    fprintf(stderr, "%s: %d buffers, max_size = %u MB, tot_size = %u MB, requested %u MB\n", __func__, nnz,
+    fprintf(stderr, "%s: %d buffers, max_size = %u MiB, tot_size = %u MiB, requested %u MiB\n", __func__, nnz,
             (uint32_t)(max_size/1024/1024), (uint32_t)(tot_size/1024/1024), (uint32_t)(size/1024/1024));
 #endif
     void * ptr;
@@ -5900,7 +5978,7 @@ void * ggml_cuda_host_malloc(size_t size) {
         // The allocation error can be bypassed; a null ptr will be assigned out of this function.
         // This can fix the OOM error in WSL.
         cudaGetLastError();
-        fprintf(stderr, "WARNING: failed to allocate %.2f MB of pinned memory: %s\n",
+        fprintf(stderr, "WARNING: failed to allocate %.2f MiB of pinned memory: %s\n",
             size/1024.0/1024.0, cudaGetErrorString(err));
         return nullptr;
     }
@@ -6128,6 +6206,34 @@ inline void ggml_cuda_op_silu(
     (void) src1_dd;
 }

+inline void ggml_cuda_op_relu(
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+    const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    relu_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
+
+    (void) src1;
+    (void) dst;
+    (void) src1_dd;
+}
+
+inline void ggml_cuda_op_sqr(
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+    const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    sqr_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
+
+    (void) src1;
+    (void) dst;
+    (void) src1_dd;
+}
+
 inline void ggml_cuda_op_norm(
     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
     const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
@@ -6250,6 +6356,7 @@ static int64_t get_row_rounding(ggml_type type) {
         case GGML_TYPE_Q8_0:
             return max_compute_capability >= CC_RDNA2 ? 128 : 64;
         case GGML_TYPE_F16:
+        case GGML_TYPE_F32:
             return 1;
         case GGML_TYPE_Q2_K:
             return max_compute_capability >= CC_RDNA2 ? 128 : 32;
@@ -6272,6 +6379,7 @@ static int64_t get_row_rounding(ggml_type type) {
         case GGML_TYPE_Q8_0:
             return 64;
         case GGML_TYPE_F16:
+        case GGML_TYPE_F32:
             return 1;
         case GGML_TYPE_Q2_K:
         case GGML_TYPE_Q3_K:
@@ -6463,8 +6571,7 @@ inline void ggml_cuda_op_mul_mat_cublas(
         src1_as_f16 = (half *) ggml_cuda_pool_malloc(ne * sizeof(half), &src1_as);
         to_fp16_cuda(src1_ddf_i, src1_as_f16, ne, stream);
     }
-    const half * src1_ptr = src1->type == GGML_TYPE_F16 ? (const half *) src1_ddq_i : src1_as_f16;
-
+    const half * src1_ptr = src1->type == GGML_TYPE_F16 ? (const half *) src1_ddf_i : src1_as_f16;
     size_t dst_as = 0;
     half * dst_f16 = (half *) ggml_cuda_pool_malloc(row_diff*src1_ncols * sizeof(half), &dst_as);

@@ -6639,6 +6746,45 @@ inline void ggml_cuda_op_alibi(
     (void) src1_dd;
 }

+inline void ggml_cuda_op_im2col(
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+    const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F16);
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F16);
+
+    const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
+    const int32_t s1 = ((const int32_t*)(dst->op_params))[1];
+    const int32_t p0 = ((const int32_t*)(dst->op_params))[2];
+    const int32_t p1 = ((const int32_t*)(dst->op_params))[3];
+    const int32_t d0 = ((const int32_t*)(dst->op_params))[4];
+    const int32_t d1 = ((const int32_t*)(dst->op_params))[5];
+
+    const bool is_2D = ((const int32_t*)(dst->op_params))[6] == 1;
+
+    const int64_t N  = src1->ne[is_2D ? 3 : 2];
+    const int64_t IC = src1->ne[is_2D ? 2 : 1];
+    const int64_t IH = is_2D ? src1->ne[1] : 1;
+    const int64_t IW = src1->ne[0];
+
+    const int64_t KH = is_2D ? src0->ne[1] : 1;
+    const int64_t KW = src0->ne[0];
+
+    const int64_t OH = is_2D ? dst->ne[2] : 1;
+    const int64_t OW = dst->ne[1];
+
+    const size_t ofs0 = src1->nb[is_2D ? 3 : 2] / 4; // nb is byte offset, src is type float32
+    const size_t ofs1 = src1->nb[is_2D ? 2 : 1] / 4; // nb is byte offset, src is type float32
+
+    im2col_f32_f16_cuda(src1_dd, (half*) dst_dd,
+        OH, IW, IH, OW, IC, KH, KW, N,
+        ofs0, ofs1, s0, s1, p0, p1, d0, d1, main_stream);
+
+    (void) src0;
+    (void) src0_dd;
+}
+
 inline void ggml_cuda_op_diag_mask_inf(
     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
     const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
@@ -7160,6 +7306,14 @@ static void ggml_cuda_silu(const ggml_tensor * src0, const ggml_tensor * src1, g
     ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_silu);
 }

+static void ggml_cuda_relu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_relu);
+}
+
+static void ggml_cuda_sqr(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_sqr);
+}
+
 static void ggml_cuda_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_norm);
 }
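
Each one-line wrapper routes through ggml_cuda_op_flatten with the corresponding ggml_cuda_op_* callback. The callback shape, inferred from the signatures above (the typedef name here is an assumption, not necessarily the one in the source):

```cpp
#include <cuda_runtime.h>
#include "ggml.h"

// shape shared by ggml_cuda_op_relu, ggml_cuda_op_sqr, ggml_cuda_op_norm, ...
typedef void (*cuda_op_fn_t)(
    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
    const float * src0_dd, const float * src1_dd, float * dst_dd,
    const cudaStream_t & main_stream);
```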
@@ -7543,6 +7697,9 @@ static void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, gg
     } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16) {
         ggml_cpy_f32_f16_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02,
                               ne10, ne11, nb10, nb11, nb12, main_stream);
+    } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16) {
+        ggml_cpy_f16_f16_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02,
+                              ne10, ne11, nb10, nb11, nb12, main_stream);
     } else {
         fprintf(stderr, "%s: unsupported type combination (%s to %s)\n", __func__,
                 ggml_type_name(src0->type), ggml_type_name(src1->type));
@@ -7574,6 +7731,10 @@ static void ggml_cuda_alibi(const ggml_tensor * src0, const ggml_tensor * src1,
     ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_alibi);
 }

+static void ggml_cuda_im2col(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_im2col);
+}
+
 static void ggml_cuda_nop(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     (void) src0;
     (void) src1;
@@ -7685,11 +7846,11 @@ static size_t g_temp_tensor_extra_index = 0;

 static ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() {
     if (g_temp_tensor_extras == nullptr) {
-        g_temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_MAX_NODES];
+        g_temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_CUDA_MAX_NODES];
     }

     size_t alloc_index = g_temp_tensor_extra_index;
-    g_temp_tensor_extra_index = (g_temp_tensor_extra_index + 1) % GGML_MAX_NODES;
+    g_temp_tensor_extra_index = (g_temp_tensor_extra_index + 1) % GGML_CUDA_MAX_NODES;
     ggml_tensor_extra_gpu * extra = &g_temp_tensor_extras[alloc_index];
     memset(extra, 0, sizeof(*extra));
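
The temp-extra pool is a fixed ring: the index advances modulo the pool size and the oldest slot is silently reused, so switching the modulus from the graph-wide GGML_MAX_NODES to the CUDA-specific GGML_CUDA_MAX_NODES (8192) simply enlarges that ring. The pattern in isolation (a sketch):

```cpp
#include <cstddef>

// fixed-capacity ring index: wraps around and reuses the oldest slot
struct ring_index {
    size_t i = 0;
    size_t next(size_t pool_size) {
        const size_t cur = i;
        i = (i + 1) % pool_size;
        return cur;
    }
};
```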
@@ -7867,6 +8028,15 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
         return false;
     }

+    if (tensor->op == GGML_OP_MUL_MAT) {
+        if (tensor->src[0]->ne[3] != tensor->src[1]->ne[3]) {
+#ifndef NDEBUG
+            fprintf(stderr, "%s: cannot compute %s: src0->ne[3] = %d, src1->ne[3] = %d - fallback to CPU\n", __func__, tensor->name, tensor->src[0]->ne[3], tensor->src[1]->ne[3]);
+#endif
+            return false;
+        }
+    }
+
     switch (tensor->op) {
         case GGML_OP_REPEAT:
             func = ggml_cuda_repeat;
@@ -7891,6 +8061,9 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
                 case GGML_UNARY_OP_SILU:
                     func = ggml_cuda_silu;
                     break;
+                case GGML_UNARY_OP_RELU:
+                    func = ggml_cuda_relu;
+                    break;
                 default:
                     return false;
             } break;
@@ -7909,6 +8082,9 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
         case GGML_OP_SCALE:
             func = ggml_cuda_scale;
             break;
+        case GGML_OP_SQR:
+            func = ggml_cuda_sqr;
+            break;
         case GGML_OP_CLAMP:
             if (!any_on_device) {
                 return false;
@@ -7939,6 +8115,9 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
         case GGML_OP_ALIBI:
             func = ggml_cuda_alibi;
             break;
+        case GGML_OP_IM2COL:
+            func = ggml_cuda_im2col;
+            break;
         default:
             return false;
     }
@@ -7998,11 +8177,11 @@ struct ggml_backend_buffer_context_cuda {

     ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() {
         if (temp_tensor_extras == nullptr) {
-            temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_MAX_NODES];
+            temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_CUDA_MAX_NODES];
         }

         size_t alloc_index = temp_tensor_extra_index;
-        temp_tensor_extra_index = (temp_tensor_extra_index + 1) % GGML_MAX_NODES;
+        temp_tensor_extra_index = (temp_tensor_extra_index + 1) % GGML_CUDA_MAX_NODES;
         ggml_tensor_extra_gpu * extra = &temp_tensor_extras[alloc_index];
         memset(extra, 0, sizeof(*extra));

@@ -8088,7 +8267,12 @@ static ggml_backend_buffer_t ggml_backend_cuda_alloc_buffer(ggml_backend_t backe
     ggml_cuda_set_device(g_main_device);

     ggml_backend_buffer_context_cuda * ctx = new ggml_backend_buffer_context_cuda;
+
+    size = std::max(size, (size_t)1); // cudaMalloc returns null for size 0
+
+    ggml_cuda_set_device(g_main_device);
     CUDA_CHECK(cudaMalloc(&ctx->device, size));
+
     return ggml_backend_buffer_init(backend, cuda_backend_buffer_interface, ctx, size);
 }
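
Per the new comment, cudaMalloc hands back a null pointer for a zero-byte request, which downstream code does not expect; clamping the request to one byte sidesteps that. The guard in isolation (a sketch):

```cpp
#include <algorithm>
#include <cstddef>

// request at least one byte so the device allocation never yields a null pointer
size_t clamp_alloc_size(size_t requested) {
    return std::max(requested, (size_t) 1);
}
```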
@@ -8155,6 +8339,8 @@ static void ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph
     for (int i = 0; i < cgraph->n_nodes; i++) {
         ggml_tensor * node = cgraph->nodes[i];

+        if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE)
+            continue;
         assert(node->backend == GGML_BACKEND_GPU);
         for (int j = 0; j < GGML_MAX_SRC; j++) {
             if (node->src[j] != nullptr) {
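
Reshape, transpose, view, and permute nodes only reinterpret their parent's data, so the graph executor can skip them outright. A hypothetical predicate equivalent to the new check:

```cpp
#include "ggml.h"

// ops that merely create views carry no computation of their own
static bool is_view_op(enum ggml_op op) {
    return op == GGML_OP_RESHAPE || op == GGML_OP_TRANSPOSE ||
           op == GGML_OP_VIEW    || op == GGML_OP_PERMUTE;
}
```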
@@ -39,12 +39,6 @@ extern "C" {
 #endif
 #endif

-#undef MIN
-#undef MAX
-
-#define MIN(a, b) ((a) < (b) ? (a) : (b))
-#define MAX(a, b) ((a) > (b) ? (a) : (b))
-
 // 16-bit float
 // on Arm, we use __fp16
 // on x86, we use uint16_t
@@ -230,7 +224,19 @@ inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {

 #endif

-// TODO: backend v2 PR
+#define GGML_HASHTABLE_FULL ((size_t)-1)
+#define GGML_HASHTABLE_ALREADY_EXISTS ((size_t)-2)
+
+bool   ggml_hash_contains      (const struct ggml_hash_set hash_set, struct ggml_tensor * key);
+
+// returns GGML_HASHTABLE_FULL if table is full, otherwise the current index of the key or where it should be inserted
+size_t ggml_hash_find          (const struct ggml_hash_set hash_set, struct ggml_tensor * key);
+
+// returns GGML_HASHTABLE_ALREADY_EXISTS if key already exists, index otherwise, asserts if table is full
+size_t ggml_hash_insert        (      struct ggml_hash_set hash_set, struct ggml_tensor * key);
+
+// return index, asserts if table is full
+size_t ggml_hash_find_or_insert(      struct ggml_hash_set hash_set, struct ggml_tensor * key);

 #ifdef __cplusplus
 }
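
These helpers (in what appears to be ggml-impl.h) support visit-once traversals over graph tensors. A hypothetical usage sketch, assuming a ggml_hash_set constructed elsewhere with enough capacity for the graph:

```cpp
#include "ggml-impl.h"

extern void process(struct ggml_tensor * t); // user code, runs once per tensor

static void visit_once(struct ggml_hash_set set, struct ggml_tensor * t) {
    if (ggml_hash_insert(set, t) == GGML_HASHTABLE_ALREADY_EXISTS) {
        return; // already visited
    }
    process(t); // first visit
}
```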
@@ -26,7 +26,7 @@
 #include <stdbool.h>

 // max memory buffers that can be mapped to the device
-#define GGML_METAL_MAX_BUFFERS 16
+#define GGML_METAL_MAX_BUFFERS 64
 #define GGML_METAL_MAX_COMMAND_BUFFERS 32

 struct ggml_tensor;