llama_cpp 0.12.6 → 0.12.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -23,6 +23,9 @@
23
23
  #include <limits.h>
24
24
  #include <stdarg.h>
25
25
  #include <signal.h>
26
+ #if defined(__gnu_linux__)
27
+ #include <syscall.h>
28
+ #endif
26
29
 
27
30
  #ifdef GGML_USE_METAL
28
31
  #include <unistd.h>
@@ -270,6 +273,8 @@ inline static void * ggml_calloc(size_t num, size_t size) {
270
273
  #include <Accelerate/Accelerate.h>
271
274
  #if defined(GGML_USE_CLBLAST) // allow usage of CLBlast alongside Accelerate functions
272
275
  #include "ggml-opencl.h"
276
+ #elif defined(GGML_USE_VULKAN)
277
+ #include "ggml-vulkan.h"
273
278
  #endif
274
279
  #elif defined(GGML_USE_OPENBLAS)
275
280
  #if defined(GGML_BLAS_USE_MKL)
@@ -318,7 +323,7 @@ float ggml_table_f32_f16[1 << 16];
318
323
  // note: do not use these inside ggml.c
319
324
  // these are meant to be used via the ggml.h API
320
325
  // Public conversion wrapper: returns the result of the GGML_FP16_TO_FP32 macro applied to x.
  // In this diff the explicit (float) cast on the macro result is dropped; the function's
  // declared return type is still float.
  float ggml_fp16_to_fp32(ggml_fp16_t x) {
321
- return (float) GGML_FP16_TO_FP32(x);
326
+ return GGML_FP16_TO_FP32(x);
322
327
  }
323
328
 
324
329
  ggml_fp16_t ggml_fp32_to_fp16(float x) {
@@ -673,6 +678,30 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
673
678
  .vec_dot_type = GGML_TYPE_Q8_K,
674
679
  .nrows = 1,
675
680
  },
681
+ [GGML_TYPE_IQ1_S] = {
682
+ .type_name = "iq1_s",
683
+ .blck_size = QK_K,
684
+ .type_size = sizeof(block_iq1_s),
685
+ .is_quantized = true,
686
+ .to_float = (ggml_to_float_t) dequantize_row_iq1_s,
687
+ .from_float = NULL,
688
+ .from_float_reference = NULL,
689
+ .vec_dot = ggml_vec_dot_iq1_s_q8_K,
690
+ .vec_dot_type = GGML_TYPE_Q8_K,
691
+ .nrows = 1,
692
+ },
693
+ [GGML_TYPE_IQ4_NL] = {
694
+ .type_name = "iq4_nl",
695
+ .blck_size = QK4_NL,
696
+ .type_size = sizeof(block_iq4_nl),
697
+ .is_quantized = true,
698
+ .to_float = (ggml_to_float_t) dequantize_row_iq4_nl,
699
+ .from_float = quantize_row_iq4_nl,
700
+ .from_float_reference = (ggml_from_float_t)quantize_row_iq4_nl_reference,
701
+ .vec_dot = ggml_vec_dot_iq4_nl_q8_0,
702
+ .vec_dot_type = GGML_TYPE_Q8_0,
703
+ .nrows = 1,
704
+ },
676
705
  [GGML_TYPE_Q8_K] = {
677
706
  .type_name = "q8_K",
678
707
  .blck_size = QK_K,
@@ -769,7 +798,7 @@ inline static float vaddvq_f32(float32x4_t v) {
769
798
  #define GGML_F16x8 float16x8_t
770
799
  #define GGML_F16x8_ZERO vdupq_n_f16(0.0f)
771
800
  #define GGML_F16x8_SET1(x) vdupq_n_f16(x)
772
- #define GGML_F16x8_LOAD vld1q_f16
801
+ #define GGML_F16x8_LOAD(x) vld1q_f16((const __fp16 *)(x))
773
802
  #define GGML_F16x8_STORE vst1q_f16
774
803
  #define GGML_F16x8_FMA(a, b, c) vfmaq_f16(a, b, c)
775
804
  #define GGML_F16x8_ADD vaddq_f16
@@ -812,7 +841,7 @@ inline static float vaddvq_f32(float32x4_t v) {
812
841
  #define GGML_F32Cx4 float32x4_t
813
842
  #define GGML_F32Cx4_ZERO vdupq_n_f32(0.0f)
814
843
  #define GGML_F32Cx4_SET1(x) vdupq_n_f32(x)
815
- #define GGML_F32Cx4_LOAD(x) vcvt_f32_f16(vld1_f16(x))
844
+ #define GGML_F32Cx4_LOAD(x) vcvt_f32_f16(vld1_f16((const __fp16 *)(x)))
816
845
  #define GGML_F32Cx4_STORE(x, y) vst1_f16(x, vcvt_f16_f32(y))
817
846
  #define GGML_F32Cx4_FMA(a, b, c) vfmaq_f32(a, b, c)
818
847
  #define GGML_F32Cx4_ADD vaddq_f32
@@ -868,7 +897,7 @@ do { \
868
897
  const __m128 t0 = _mm_add_ps(_mm256_castps256_ps128(x[0]), \
869
898
  _mm256_extractf128_ps(x[0], 1)); \
870
899
  const __m128 t1 = _mm_hadd_ps(t0, t0); \
871
- res = _mm_cvtss_f32(_mm_hadd_ps(t1, t1)); \
900
+ res = (ggml_float) _mm_cvtss_f32(_mm_hadd_ps(t1, t1)); \
872
901
  } while (0)
873
902
  // TODO: is this optimal ?
874
903
 
@@ -1149,7 +1178,7 @@ inline static void __wasm_f16x4_store(ggml_fp16_t * p, v128_t x) {
1149
1178
  x[i] = _mm_add_ps(x[i], x[offset+i]); \
1150
1179
  } \
1151
1180
  const __m128 t0 = _mm_hadd_ps(x[0], x[0]); \
1152
- res = _mm_cvtss_f32(_mm_hadd_ps(t0, t0)); \
1181
+ res = (ggml_float) _mm_cvtss_f32(_mm_hadd_ps(t0, t0)); \
1153
1182
  }
1154
1183
  // TODO: is this optimal ?
1155
1184
 
@@ -1954,9 +1983,16 @@ struct ggml_numa_node {
1954
1983
  };
1955
1984
 
1956
1985
  // Process-wide NUMA description filled in by ggml_numa_init(): the requested strategy,
  // the per-node table, node/CPU counts, the node the calling process was found on, and
  // the CPU affinity mask captured at init time.
  struct ggml_numa_nodes {
1986
+ enum ggml_numa_strategy numa_strategy; // strategy passed to ggml_numa_init()
1957
1987
  struct ggml_numa_node nodes[GGML_NUMA_MAX_NODES];
1958
1988
  uint32_t n_nodes; // number of nodes found; ggml_numa_init() treats > 0 as "already initialized"
1959
1989
  uint32_t total_cpus; // hardware threads on system
1990
+ uint32_t current_node; // node on which main process is executing
1991
+ #if defined(__gnu_linux__)
1992
+ cpu_set_t cpuset; // cpuset from numactl
1993
+ #else
1994
+ uint32_t cpuset; // no NUMA support outside of Linux at this time. Use a portable datatype
1995
+ #endif
1960
1996
  };
1961
1997
 
1962
1998
  //
@@ -1990,18 +2026,40 @@ inline static void ggml_critical_section_end(void) {
1990
2026
  atomic_fetch_sub(&g_state_barrier, 1);
1991
2027
  }
1992
2028
 
1993
- void ggml_numa_init(void) {
2029
// ggml_get_numa_affinity: returns the CPU affinity mask of the calling thread.
// On GNU/Linux the mask is queried with pthread_getaffinity_np(); on every other
// platform a placeholder value of 0 is returned because NUMA is not supported there.
+ #if defined(__gnu_linux__)
2030
+ static cpu_set_t ggml_get_numa_affinity(void) {
2031
+ cpu_set_t cpuset;
2032
+ pthread_t thread;
2033
+ thread = pthread_self();
2034
+ CPU_ZERO(&cpuset); // cleared first, so the set is empty if the query below does not fill it
2035
+ pthread_getaffinity_np(thread, sizeof(cpu_set_t), &cpuset); // NOTE(review): return value is not checked
2036
+ return cpuset;
2037
+ }
2038
+ #else
2039
+ static uint32_t ggml_get_numa_affinity(void) {
2040
+ return 0; // no NUMA support
2041
+ }
2042
+ #endif
2043
+
2044
+ void ggml_numa_init(enum ggml_numa_strategy numa_flag) {
1994
2045
  if (g_state.numa.n_nodes > 0) {
1995
2046
  fprintf(stderr, "ggml_numa_init: NUMA already initialized\n");
1996
2047
 
1997
2048
  return;
1998
2049
  }
1999
2050
 
2000
- #ifdef __linux__
2051
+ #if defined(__gnu_linux__)
2001
2052
  struct stat st;
2002
2053
  char path[256];
2003
2054
  int rv;
2004
2055
 
2056
+ // set numa scheme
2057
+ g_state.numa.numa_strategy = numa_flag;
2058
+
2059
+ GGML_PRINT_DEBUG("numa strategy %u\n",g_state.numa.numa_strategy);
2060
+
2061
+ g_state.numa.cpuset = ggml_get_numa_affinity();
2062
+
2005
2063
  // enumerate nodes
2006
2064
  while (g_state.numa.n_nodes < GGML_NUMA_MAX_NODES) {
2007
2065
  rv = snprintf(path, sizeof(path), "/sys/devices/system/node/node%u", g_state.numa.n_nodes);
@@ -2020,11 +2078,23 @@ void ggml_numa_init(void) {
2020
2078
 
2021
2079
  GGML_PRINT_DEBUG("found %u numa nodes, %u CPUs\n", g_state.numa.n_nodes, g_state.numa.total_cpus);
2022
2080
 
2023
- if (g_state.numa.n_nodes < 1 || g_state.numa.total_cpus < 1) {
2081
+ // figure out which node we're on
2082
+ uint current_cpu;
2083
+ int getcpu_ret = 0;
2084
+ #if __GLIBC__ > 2 || (__GLIBC__ == 2 && __GLIBC_MINOR__ > 28)
2085
+ getcpu_ret = getcpu(&current_cpu, &g_state.numa.current_node);
2086
+ #else
2087
+ // old glibc doesn't have a wrapper for this call. Fall back on direct syscall
2088
+ getcpu_ret = syscall(SYS_getcpu,&current_cpu,&g_state.numa.current_node);
2089
+ #endif
2090
+
2091
+ if (g_state.numa.n_nodes < 1 || g_state.numa.total_cpus < 1 || getcpu_ret != 0) {
2024
2092
  g_state.numa.n_nodes = 0;
2025
2093
  return;
2026
2094
  }
2027
2095
 
2096
+ GGML_PRINT_DEBUG("found our process on numa node %u, CPU %u\n", g_state.numa.current_node, current_cpu);
2097
+
2028
2098
  for (uint32_t n = 0; n < g_state.numa.n_nodes; ++n) {
2029
2099
  struct ggml_numa_node * node = &g_state.numa.nodes[n];
2030
2100
  GGML_PRINT_DEBUG("CPUs on node %u:", n);
@@ -2051,6 +2121,7 @@ void ggml_numa_init(void) {
2051
2121
  }
2052
2122
  }
2053
2123
  #else
2124
+ GGML_UNUSED(numa_flag);
2054
2125
  // TODO
2055
2126
  #endif
2056
2127
  }
@@ -2231,6 +2302,8 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) {
2231
2302
  case GGML_FTYPE_MOSTLY_IQ2_XXS: wtype = GGML_TYPE_IQ2_XXS; break;
2232
2303
  case GGML_FTYPE_MOSTLY_IQ2_XS: wtype = GGML_TYPE_IQ2_XS; break;
2233
2304
  case GGML_FTYPE_MOSTLY_IQ3_XXS: wtype = GGML_TYPE_IQ3_XXS; break;
2305
+ case GGML_FTYPE_MOSTLY_IQ1_S: wtype = GGML_TYPE_IQ1_S; break;
2306
+ case GGML_FTYPE_MOSTLY_IQ4_NL: wtype = GGML_TYPE_IQ4_NL; break;
2234
2307
  case GGML_FTYPE_UNKNOWN: wtype = GGML_TYPE_COUNT; break;
2235
2308
  case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16: wtype = GGML_TYPE_COUNT; break;
2236
2309
  }
@@ -3184,7 +3257,7 @@ const char * ggml_get_name(const struct ggml_tensor * tensor) {
3184
3257
  }
3185
3258
 
3186
3259
  // Copies name into tensor->name, truncating it if it does not fit, and returns tensor.
  // The copy is bounded to sizeof(tensor->name) - 1 bytes so that the last byte of the
  // buffer stays reserved for the terminator written on the following line; the stored
  // name is therefore always NUL-terminated.
  struct ggml_tensor * ggml_set_name(struct ggml_tensor * tensor, const char * name) {
3187
- strncpy(tensor->name, name, sizeof(tensor->name));
3260
+ strncpy(tensor->name, name, sizeof(tensor->name) - 1);
3188
3261
  tensor->name[sizeof(tensor->name) - 1] = '\0';
3189
3262
  return tensor;
3190
3263
  }
@@ -5060,16 +5133,28 @@ static struct ggml_tensor * ggml_soft_max_impl(
5060
5133
  struct ggml_context * ctx,
5061
5134
  struct ggml_tensor * a,
5062
5135
  struct ggml_tensor * mask,
5136
+ struct ggml_tensor * pos,
5063
5137
  float scale,
5138
+ float max_bias,
5064
5139
  bool inplace) {
5065
5140
  GGML_ASSERT(ggml_is_contiguous(a));
5141
+
5066
5142
  if (mask) {
5067
5143
  GGML_ASSERT(ggml_is_contiguous(mask));
5068
- GGML_ASSERT(mask->ne[2] == 1);
5069
- GGML_ASSERT(mask->ne[3] == 1);
5144
+ GGML_ASSERT(ggml_is_matrix(mask));
5070
5145
  GGML_ASSERT(ggml_can_repeat_rows(mask, a));
5071
5146
  }
5072
5147
 
5148
+ if (pos) {
5149
+ GGML_ASSERT(ggml_is_vector(pos));
5150
+ GGML_ASSERT(pos->type == GGML_TYPE_F32);
5151
+ GGML_ASSERT(pos->ne[0] == a->ne[0]);
5152
+ }
5153
+
5154
+ if (max_bias > 0.0f) {
5155
+ GGML_ASSERT(pos);
5156
+ }
5157
+
5073
5158
  bool is_node = false;
5074
5159
 
5075
5160
  if (a->grad) {
@@ -5078,13 +5163,14 @@ static struct ggml_tensor * ggml_soft_max_impl(
5078
5163
 
5079
5164
  struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
5080
5165
 
5081
- float params[] = { scale };
5166
+ float params[] = { scale, max_bias };
5082
5167
  ggml_set_op_params(result, params, sizeof(params));
5083
5168
 
5084
5169
  result->op = GGML_OP_SOFT_MAX;
5085
5170
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5086
5171
  result->src[0] = a;
5087
5172
  result->src[1] = mask;
5173
+ result->src[2] = pos;
5088
5174
 
5089
5175
  return result;
5090
5176
  }
@@ -5092,21 +5178,23 @@ static struct ggml_tensor * ggml_soft_max_impl(
5092
5178
  // Builds a GGML_OP_SOFT_MAX node over a with default settings: no mask, no pos tensor,
  // scale 1.0f and max_bias 0.0f. Not in place: ggml_soft_max_impl() creates the result
  // with ggml_dup_tensor() when inplace is false.
  struct ggml_tensor * ggml_soft_max(
5093
5179
  struct ggml_context * ctx,
5094
5180
  struct ggml_tensor * a) {
5095
- return ggml_soft_max_impl(ctx, a, NULL, 1.0f, false);
5181
+ return ggml_soft_max_impl(ctx, a, NULL, NULL, 1.0f, 0.0f, false);
5096
5182
  }
5097
5183
 
5098
5184
  // Same defaults as ggml_soft_max() (no mask, no pos, scale 1.0f, max_bias 0.0f), but
  // passes inplace = true, so ggml_soft_max_impl() makes the result a view of a
  // (ggml_view_tensor) instead of a duplicate.
  struct ggml_tensor * ggml_soft_max_inplace(
5099
5185
  struct ggml_context * ctx,
5100
5186
  struct ggml_tensor * a) {
5101
- return ggml_soft_max_impl(ctx, a, NULL, 1.0f, true);
5187
+ return ggml_soft_max_impl(ctx, a, NULL, NULL, 1.0f, 0.0f, true);
5102
5188
  }
5103
5189
 
5104
5190
  // Extended soft max: forwards a, mask, pos, scale and max_bias to ggml_soft_max_impl()
  // with inplace = false. This version of the signature adds the pos and max_bias parameters.
  // Preconditions asserted by ggml_soft_max_impl():
  //   - a is contiguous
  //   - mask (optional, may be NULL) is a contiguous matrix whose rows can be repeated to a
  //   - pos (optional, may be NULL) is an F32 vector with pos->ne[0] == a->ne[0]
  //   - pos must be non-NULL when max_bias > 0.0f
  // scale and max_bias are stored, in that order, in the result's op params.
  struct ggml_tensor * ggml_soft_max_ext(
5105
5191
  struct ggml_context * ctx,
5106
5192
  struct ggml_tensor * a,
5107
5193
  struct ggml_tensor * mask,
5108
- float scale) {
5109
- return ggml_soft_max_impl(ctx, a, mask, scale, false);
5194
+ struct ggml_tensor * pos,
5195
+ float scale,
5196
+ float max_bias) {
5197
+ return ggml_soft_max_impl(ctx, a, mask, pos, scale, max_bias, false);
5110
5198
  }
5111
5199
 
5112
5200
  // ggml_soft_max_back
@@ -5556,7 +5644,9 @@ struct ggml_tensor * ggml_conv_2d(
5556
5644
  ggml_reshape_2d(ctx, im2col, im2col->ne[0], im2col->ne[3] * im2col->ne[2] * im2col->ne[1]), // [N, OH, OW, IC * KH * KW] => [N*OH*OW, IC * KH * KW]
5557
5645
  ggml_reshape_2d(ctx, a, (a->ne[0] * a->ne[1] * a->ne[2]), a->ne[3])); // [OC,IC, KH, KW] => [OC, IC * KH * KW]
5558
5646
 
5559
- result = ggml_reshape_4d(ctx, result, im2col->ne[1], im2col->ne[2], a->ne[3], im2col->ne[3]); // [N, OC, OH, OW]
5647
+ result = ggml_reshape_4d(ctx, result, im2col->ne[1], im2col->ne[2], im2col->ne[3], a->ne[3]); // [OC, N, OH, OW]
5648
+ result = ggml_cont(ctx, ggml_permute(ctx, result, 0, 1, 3, 2)); // [N, OC, OH, OW]
5649
+
5560
5650
 
5561
5651
  return result;
5562
5652
  }
@@ -6562,8 +6652,10 @@ void ggml_set_param(
6562
6652
 
6563
6653
  static void ggml_compute_forward_dup_same_cont(
6564
6654
  const struct ggml_compute_params * params,
6565
- const struct ggml_tensor * src0,
6566
6655
  struct ggml_tensor * dst) {
6656
+
6657
+ const struct ggml_tensor * src0 = dst->src[0];
6658
+
6567
6659
  GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0));
6568
6660
  GGML_ASSERT(ggml_is_contiguous(dst) && ggml_is_contiguous(src0));
6569
6661
  GGML_ASSERT(src0->type == dst->type);
@@ -6594,8 +6686,10 @@ static void ggml_compute_forward_dup_same_cont(
6594
6686
  }
6595
6687
  static void ggml_compute_forward_dup_f16(
6596
6688
  const struct ggml_compute_params * params,
6597
- const struct ggml_tensor * src0,
6598
6689
  struct ggml_tensor * dst) {
6690
+
6691
+ const struct ggml_tensor * src0 = dst->src[0];
6692
+
6599
6693
  GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0));
6600
6694
 
6601
6695
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
@@ -6608,7 +6702,7 @@ static void ggml_compute_forward_dup_f16(
6608
6702
  const int nth = params->nth; // number of threads
6609
6703
 
6610
6704
  if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst) && src0->type == dst->type) {
6611
- ggml_compute_forward_dup_same_cont(params, src0, dst);
6705
+ ggml_compute_forward_dup_same_cont(params, dst);
6612
6706
  return;
6613
6707
  }
6614
6708
 
@@ -6865,8 +6959,10 @@ static void ggml_compute_forward_dup_f16(
6865
6959
 
6866
6960
  static void ggml_compute_forward_dup_f32(
6867
6961
  const struct ggml_compute_params * params,
6868
- const struct ggml_tensor * src0,
6869
6962
  struct ggml_tensor * dst) {
6963
+
6964
+ const struct ggml_tensor * src0 = dst->src[0];
6965
+
6870
6966
  GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0));
6871
6967
 
6872
6968
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
@@ -6879,7 +6975,7 @@ static void ggml_compute_forward_dup_f32(
6879
6975
  const int nth = params->nth; // number of threads
6880
6976
 
6881
6977
  if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst) && src0->type == dst->type) {
6882
- ggml_compute_forward_dup_same_cont(params, src0, dst);
6978
+ ggml_compute_forward_dup_same_cont(params, dst);
6883
6979
  return;
6884
6980
  }
6885
6981
 
@@ -7115,8 +7211,10 @@ static void ggml_compute_forward_dup_f32(
7115
7211
  // A simplified version of ggml_compute_forward_dup that doesn't do float upcasting, and just plain old memcpy.
7116
7212
  static void ggml_compute_forward_dup_bytes(
7117
7213
  const struct ggml_compute_params * params,
7118
- const struct ggml_tensor * src0,
7119
7214
  struct ggml_tensor * dst) {
7215
+
7216
+ const struct ggml_tensor * src0 = dst->src[0];
7217
+
7120
7218
  GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0));
7121
7219
  GGML_ASSERT(src0->type == dst->type);
7122
7220
 
@@ -7125,7 +7223,7 @@ static void ggml_compute_forward_dup_bytes(
7125
7223
  }
7126
7224
 
7127
7225
  if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst)) {
7128
- ggml_compute_forward_dup_same_cont(params, src0, dst);
7226
+ ggml_compute_forward_dup_same_cont(params, dst);
7129
7227
  return;
7130
7228
  }
7131
7229
 
@@ -7264,21 +7362,23 @@ static void ggml_compute_forward_dup_bytes(
7264
7362
 
7265
7363
  static void ggml_compute_forward_dup(
7266
7364
  const struct ggml_compute_params * params,
7267
- const struct ggml_tensor * src0,
7268
7365
  struct ggml_tensor * dst) {
7366
+
7367
+ const struct ggml_tensor * src0 = dst->src[0];
7368
+
7269
7369
  if (src0->type == dst->type) {
7270
- ggml_compute_forward_dup_bytes(params, src0, dst);
7370
+ ggml_compute_forward_dup_bytes(params, dst);
7271
7371
  return;
7272
7372
  }
7273
7373
 
7274
7374
  switch (src0->type) {
7275
7375
  case GGML_TYPE_F16:
7276
7376
  {
7277
- ggml_compute_forward_dup_f16(params, src0, dst);
7377
+ ggml_compute_forward_dup_f16(params, dst);
7278
7378
  } break;
7279
7379
  case GGML_TYPE_F32:
7280
7380
  {
7281
- ggml_compute_forward_dup_f32(params, src0, dst);
7381
+ ggml_compute_forward_dup_f32(params, dst);
7282
7382
  } break;
7283
7383
  default:
7284
7384
  {
@@ -7291,9 +7391,11 @@ static void ggml_compute_forward_dup(
7291
7391
 
7292
7392
  static void ggml_compute_forward_add_f32(
7293
7393
  const struct ggml_compute_params * params,
7294
- const struct ggml_tensor * src0,
7295
- const struct ggml_tensor * src1,
7296
7394
  struct ggml_tensor * dst) {
7395
+
7396
+ const struct ggml_tensor * src0 = dst->src[0];
7397
+ const struct ggml_tensor * src1 = dst->src[1];
7398
+
7297
7399
  GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst));
7298
7400
 
7299
7401
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
@@ -7379,9 +7481,11 @@ static void ggml_compute_forward_add_f32(
7379
7481
 
7380
7482
  static void ggml_compute_forward_add_f16_f32(
7381
7483
  const struct ggml_compute_params * params,
7382
- const struct ggml_tensor * src0,
7383
- const struct ggml_tensor * src1,
7384
7484
  struct ggml_tensor * dst) {
7485
+
7486
+ const struct ggml_tensor * src0 = dst->src[0];
7487
+ const struct ggml_tensor * src1 = dst->src[1];
7488
+
7385
7489
  GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));
7386
7490
 
7387
7491
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
@@ -7456,9 +7560,11 @@ static void ggml_compute_forward_add_f16_f32(
7456
7560
 
7457
7561
  static void ggml_compute_forward_add_f16_f16(
7458
7562
  const struct ggml_compute_params * params,
7459
- const struct ggml_tensor * src0,
7460
- const struct ggml_tensor * src1,
7461
7563
  struct ggml_tensor * dst) {
7564
+
7565
+ const struct ggml_tensor * src0 = dst->src[0];
7566
+ const struct ggml_tensor * src1 = dst->src[1];
7567
+
7462
7568
  GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));
7463
7569
 
7464
7570
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
@@ -7510,9 +7616,11 @@ static void ggml_compute_forward_add_f16_f16(
7510
7616
 
7511
7617
  static void ggml_compute_forward_add_q_f32(
7512
7618
  const struct ggml_compute_params * params,
7513
- const struct ggml_tensor * src0,
7514
- const struct ggml_tensor * src1,
7515
7619
  struct ggml_tensor * dst) {
7620
+
7621
+ const struct ggml_tensor * src0 = dst->src[0];
7622
+ const struct ggml_tensor * src1 = dst->src[1];
7623
+
7516
7624
  GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));
7517
7625
 
7518
7626
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
@@ -7588,14 +7696,16 @@ static void ggml_compute_forward_add_q_f32(
7588
7696
 
7589
7697
  static void ggml_compute_forward_add(
7590
7698
  const struct ggml_compute_params * params,
7591
- const struct ggml_tensor * src0,
7592
- const struct ggml_tensor * src1,
7593
7699
  struct ggml_tensor * dst) {
7700
+
7701
+ const struct ggml_tensor * src0 = dst->src[0];
7702
+ const struct ggml_tensor * src1 = dst->src[1];
7703
+
7594
7704
  switch (src0->type) {
7595
7705
  case GGML_TYPE_F32:
7596
7706
  {
7597
7707
  if (src1->type == GGML_TYPE_F32) {
7598
- ggml_compute_forward_add_f32(params, src0, src1, dst);
7708
+ ggml_compute_forward_add_f32(params, dst);
7599
7709
  }
7600
7710
  else {
7601
7711
  GGML_ASSERT(false);
@@ -7604,10 +7714,10 @@ static void ggml_compute_forward_add(
7604
7714
  case GGML_TYPE_F16:
7605
7715
  {
7606
7716
  if (src1->type == GGML_TYPE_F16) {
7607
- ggml_compute_forward_add_f16_f16(params, src0, src1, dst);
7717
+ ggml_compute_forward_add_f16_f16(params, dst);
7608
7718
  }
7609
7719
  else if (src1->type == GGML_TYPE_F32) {
7610
- ggml_compute_forward_add_f16_f32(params, src0, src1, dst);
7720
+ ggml_compute_forward_add_f16_f32(params, dst);
7611
7721
  }
7612
7722
  else {
7613
7723
  GGML_ASSERT(false);
@@ -7626,8 +7736,10 @@ static void ggml_compute_forward_add(
7626
7736
  case GGML_TYPE_IQ2_XXS:
7627
7737
  case GGML_TYPE_IQ2_XS:
7628
7738
  case GGML_TYPE_IQ3_XXS:
7739
+ case GGML_TYPE_IQ1_S:
7740
+ case GGML_TYPE_IQ4_NL:
7629
7741
  {
7630
- ggml_compute_forward_add_q_f32(params, src0, src1, dst);
7742
+ ggml_compute_forward_add_q_f32(params, dst);
7631
7743
  } break;
7632
7744
  default:
7633
7745
  {
@@ -7640,9 +7752,11 @@ static void ggml_compute_forward_add(
7640
7752
 
7641
7753
  static void ggml_compute_forward_add1_f32(
7642
7754
  const struct ggml_compute_params * params,
7643
- const struct ggml_tensor * src0,
7644
- const struct ggml_tensor * src1,
7645
7755
  struct ggml_tensor * dst) {
7756
+
7757
+ const struct ggml_tensor * src0 = dst->src[0];
7758
+ const struct ggml_tensor * src1 = dst->src[1];
7759
+
7646
7760
  GGML_ASSERT(ggml_are_same_shape(src0, dst));
7647
7761
  GGML_ASSERT(ggml_is_scalar(src1));
7648
7762
 
@@ -7692,9 +7806,11 @@ static void ggml_compute_forward_add1_f32(
7692
7806
 
7693
7807
  static void ggml_compute_forward_add1_f16_f32(
7694
7808
  const struct ggml_compute_params * params,
7695
- const struct ggml_tensor * src0,
7696
- const struct ggml_tensor * src1,
7697
7809
  struct ggml_tensor * dst) {
7810
+
7811
+ const struct ggml_tensor * src0 = dst->src[0];
7812
+ const struct ggml_tensor * src1 = dst->src[1];
7813
+
7698
7814
  GGML_ASSERT(ggml_are_same_shape(src0, dst));
7699
7815
  GGML_ASSERT(ggml_is_scalar(src1));
7700
7816
 
@@ -7742,9 +7858,11 @@ static void ggml_compute_forward_add1_f16_f32(
7742
7858
 
7743
7859
  static void ggml_compute_forward_add1_f16_f16(
7744
7860
  const struct ggml_compute_params * params,
7745
- const struct ggml_tensor * src0,
7746
- const struct ggml_tensor * src1,
7747
7861
  struct ggml_tensor * dst) {
7862
+
7863
+ const struct ggml_tensor * src0 = dst->src[0];
7864
+ const struct ggml_tensor * src1 = dst->src[1];
7865
+
7748
7866
  GGML_ASSERT(ggml_are_same_shape(src0, dst));
7749
7867
  GGML_ASSERT(ggml_is_scalar(src1));
7750
7868
 
@@ -7792,9 +7910,11 @@ static void ggml_compute_forward_add1_f16_f16(
7792
7910
 
7793
7911
  static void ggml_compute_forward_add1_q_f32(
7794
7912
  const struct ggml_compute_params * params,
7795
- const struct ggml_tensor * src0,
7796
- const struct ggml_tensor * src1,
7797
7913
  struct ggml_tensor * dst) {
7914
+
7915
+ const struct ggml_tensor * src0 = dst->src[0];
7916
+ const struct ggml_tensor * src1 = dst->src[1];
7917
+
7798
7918
  GGML_ASSERT(ggml_are_same_shape(src0, dst));
7799
7919
  GGML_ASSERT(ggml_is_scalar(src1));
7800
7920
 
@@ -7859,21 +7979,23 @@ static void ggml_compute_forward_add1_q_f32(
7859
7979
 
7860
7980
  static void ggml_compute_forward_add1(
7861
7981
  const struct ggml_compute_params * params,
7862
- const struct ggml_tensor * src0,
7863
- const struct ggml_tensor * src1,
7864
7982
  struct ggml_tensor * dst) {
7983
+
7984
+ const struct ggml_tensor * src0 = dst->src[0];
7985
+ const struct ggml_tensor * src1 = dst->src[1];
7986
+
7865
7987
  switch (src0->type) {
7866
7988
  case GGML_TYPE_F32:
7867
7989
  {
7868
- ggml_compute_forward_add1_f32(params, src0, src1, dst);
7990
+ ggml_compute_forward_add1_f32(params, dst);
7869
7991
  } break;
7870
7992
  case GGML_TYPE_F16:
7871
7993
  {
7872
7994
  if (src1->type == GGML_TYPE_F16) {
7873
- ggml_compute_forward_add1_f16_f16(params, src0, src1, dst);
7995
+ ggml_compute_forward_add1_f16_f16(params, dst);
7874
7996
  }
7875
7997
  else if (src1->type == GGML_TYPE_F32) {
7876
- ggml_compute_forward_add1_f16_f32(params, src0, src1, dst);
7998
+ ggml_compute_forward_add1_f16_f32(params, dst);
7877
7999
  }
7878
8000
  else {
7879
8001
  GGML_ASSERT(false);
@@ -7893,8 +8015,10 @@ static void ggml_compute_forward_add1(
7893
8015
  case GGML_TYPE_IQ2_XXS:
7894
8016
  case GGML_TYPE_IQ2_XS:
7895
8017
  case GGML_TYPE_IQ3_XXS:
8018
+ case GGML_TYPE_IQ1_S:
8019
+ case GGML_TYPE_IQ4_NL:
7896
8020
  {
7897
- ggml_compute_forward_add1_q_f32(params, src0, src1, dst);
8021
+ ggml_compute_forward_add1_q_f32(params, dst);
7898
8022
  } break;
7899
8023
  default:
7900
8024
  {
@@ -7907,9 +8031,11 @@ static void ggml_compute_forward_add1(
7907
8031
 
7908
8032
  static void ggml_compute_forward_acc_f32(
7909
8033
  const struct ggml_compute_params * params,
7910
- const struct ggml_tensor * src0,
7911
- const struct ggml_tensor * src1,
7912
8034
  struct ggml_tensor * dst) {
8035
+
8036
+ const struct ggml_tensor * src0 = dst->src[0];
8037
+ const struct ggml_tensor * src1 = dst->src[1];
8038
+
7913
8039
  GGML_ASSERT(ggml_are_same_shape(src0, dst));
7914
8040
  GGML_ASSERT(ggml_is_contiguous(dst) && ggml_is_contiguous(src0));
7915
8041
 
@@ -7989,14 +8115,14 @@ static void ggml_compute_forward_acc_f32(
7989
8115
 
7990
8116
  static void ggml_compute_forward_acc(
7991
8117
  const struct ggml_compute_params * params,
7992
- const struct ggml_tensor * src0,
7993
- const struct ggml_tensor * src1,
7994
8118
  struct ggml_tensor * dst) {
7995
8119
 
8120
+ const struct ggml_tensor * src0 = dst->src[0];
8121
+
7996
8122
  switch (src0->type) {
7997
8123
  case GGML_TYPE_F32:
7998
8124
  {
7999
- ggml_compute_forward_acc_f32(params, src0, src1, dst);
8125
+ ggml_compute_forward_acc_f32(params, dst);
8000
8126
  } break;
8001
8127
  case GGML_TYPE_F16:
8002
8128
  case GGML_TYPE_Q4_0:
@@ -8013,6 +8139,8 @@ static void ggml_compute_forward_acc(
8013
8139
  case GGML_TYPE_IQ2_XXS:
8014
8140
  case GGML_TYPE_IQ2_XS:
8015
8141
  case GGML_TYPE_IQ3_XXS:
8142
+ case GGML_TYPE_IQ1_S:
8143
+ case GGML_TYPE_IQ4_NL:
8016
8144
  default:
8017
8145
  {
8018
8146
  GGML_ASSERT(false);
@@ -8024,9 +8152,11 @@ static void ggml_compute_forward_acc(
8024
8152
 
8025
8153
  static void ggml_compute_forward_sub_f32(
8026
8154
  const struct ggml_compute_params * params,
8027
- const struct ggml_tensor * src0,
8028
- const struct ggml_tensor * src1,
8029
8155
  struct ggml_tensor * dst) {
8156
+
8157
+ const struct ggml_tensor * src0 = dst->src[0];
8158
+ const struct ggml_tensor * src1 = dst->src[1];
8159
+
8030
8160
  assert(params->ith == 0);
8031
8161
  assert(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));
8032
8162
 
@@ -8084,13 +8214,14 @@ static void ggml_compute_forward_sub_f32(
8084
8214
 
8085
8215
  static void ggml_compute_forward_sub(
8086
8216
  const struct ggml_compute_params * params,
8087
- const struct ggml_tensor * src0,
8088
- const struct ggml_tensor * src1,
8089
8217
  struct ggml_tensor * dst) {
8218
+
8219
+ const struct ggml_tensor * src0 = dst->src[0];
8220
+
8090
8221
  switch (src0->type) {
8091
8222
  case GGML_TYPE_F32:
8092
8223
  {
8093
- ggml_compute_forward_sub_f32(params, src0, src1, dst);
8224
+ ggml_compute_forward_sub_f32(params, dst);
8094
8225
  } break;
8095
8226
  default:
8096
8227
  {
@@ -8103,9 +8234,11 @@ static void ggml_compute_forward_sub(
8103
8234
 
8104
8235
  static void ggml_compute_forward_mul_f32(
8105
8236
  const struct ggml_compute_params * params,
8106
- const struct ggml_tensor * src0,
8107
- const struct ggml_tensor * src1,
8108
8237
  struct ggml_tensor * dst) {
8238
+
8239
+ const struct ggml_tensor * src0 = dst->src[0];
8240
+ const struct ggml_tensor * src1 = dst->src[1];
8241
+
8109
8242
  GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst));
8110
8243
 
8111
8244
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
@@ -8186,15 +8319,17 @@ static void ggml_compute_forward_mul_f32(
8186
8319
 
8187
8320
  static void ggml_compute_forward_mul(
8188
8321
  const struct ggml_compute_params * params,
8189
- const struct ggml_tensor * src0,
8190
- const struct ggml_tensor * src1,
8191
8322
  struct ggml_tensor * dst) {
8323
+
8324
+ const struct ggml_tensor * src0 = dst->src[0];
8325
+ const struct ggml_tensor * src1 = dst->src[1];
8326
+
8192
8327
  GGML_ASSERT(src1->type == GGML_TYPE_F32 && "only f32 src1 supported for now");
8193
8328
 
8194
8329
  switch (src0->type) {
8195
8330
  case GGML_TYPE_F32:
8196
8331
  {
8197
- ggml_compute_forward_mul_f32(params, src0, src1, dst);
8332
+ ggml_compute_forward_mul_f32(params, dst);
8198
8333
  } break;
8199
8334
  default:
8200
8335
  {
@@ -8207,9 +8342,11 @@ static void ggml_compute_forward_mul(
8207
8342
 
8208
8343
  static void ggml_compute_forward_div_f32(
8209
8344
  const struct ggml_compute_params * params,
8210
- const struct ggml_tensor * src0,
8211
- const struct ggml_tensor * src1,
8212
8345
  struct ggml_tensor * dst) {
8346
+
8347
+ const struct ggml_tensor * src0 = dst->src[0];
8348
+ const struct ggml_tensor * src1 = dst->src[1];
8349
+
8213
8350
  GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst));
8214
8351
 
8215
8352
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
@@ -8280,13 +8417,14 @@ static void ggml_compute_forward_div_f32(
8280
8417
 
8281
8418
  static void ggml_compute_forward_div(
8282
8419
  const struct ggml_compute_params * params,
8283
- const struct ggml_tensor * src0,
8284
- const struct ggml_tensor * src1,
8285
8420
  struct ggml_tensor * dst) {
8421
+
8422
+ const struct ggml_tensor * src0 = dst->src[0];
8423
+
8286
8424
  switch (src0->type) {
8287
8425
  case GGML_TYPE_F32:
8288
8426
  {
8289
- ggml_compute_forward_div_f32(params, src0, src1, dst);
8427
+ ggml_compute_forward_div_f32(params, dst);
8290
8428
  } break;
8291
8429
  default:
8292
8430
  {
@@ -8299,8 +8437,10 @@ static void ggml_compute_forward_div(
8299
8437
 
8300
8438
  static void ggml_compute_forward_sqr_f32(
8301
8439
  const struct ggml_compute_params * params,
8302
- const struct ggml_tensor * src0,
8303
8440
  struct ggml_tensor * dst) {
8441
+
8442
+ const struct ggml_tensor * src0 = dst->src[0];
8443
+
8304
8444
  assert(params->ith == 0);
8305
8445
  assert(ggml_are_same_shape(src0, dst));
8306
8446
 
@@ -8323,12 +8463,14 @@ static void ggml_compute_forward_sqr_f32(
8323
8463
 
8324
8464
  static void ggml_compute_forward_sqr(
8325
8465
  const struct ggml_compute_params * params,
8326
- const struct ggml_tensor * src0,
8327
8466
  struct ggml_tensor * dst) {
8467
+
8468
+ const struct ggml_tensor * src0 = dst->src[0];
8469
+
8328
8470
  switch (src0->type) {
8329
8471
  case GGML_TYPE_F32:
8330
8472
  {
8331
- ggml_compute_forward_sqr_f32(params, src0, dst);
8473
+ ggml_compute_forward_sqr_f32(params, dst);
8332
8474
  } break;
8333
8475
  default:
8334
8476
  {
@@ -8341,8 +8483,10 @@ static void ggml_compute_forward_sqr(
8341
8483
 
8342
8484
  static void ggml_compute_forward_sqrt_f32(
8343
8485
  const struct ggml_compute_params * params,
8344
- const struct ggml_tensor * src0,
8345
8486
  struct ggml_tensor * dst) {
8487
+
8488
+ const struct ggml_tensor * src0 = dst->src[0];
8489
+
8346
8490
  assert(params->ith == 0);
8347
8491
  assert(ggml_are_same_shape(src0, dst));
8348
8492
 
@@ -8365,12 +8509,14 @@ static void ggml_compute_forward_sqrt_f32(
8365
8509
 
8366
8510
  static void ggml_compute_forward_sqrt(
8367
8511
  const struct ggml_compute_params * params,
8368
- const struct ggml_tensor * src0,
8369
8512
  struct ggml_tensor * dst) {
8513
+
8514
+ const struct ggml_tensor * src0 = dst->src[0];
8515
+
8370
8516
  switch (src0->type) {
8371
8517
  case GGML_TYPE_F32:
8372
8518
  {
8373
- ggml_compute_forward_sqrt_f32(params, src0, dst);
8519
+ ggml_compute_forward_sqrt_f32(params, dst);
8374
8520
  } break;
8375
8521
  default:
8376
8522
  {
@@ -8383,8 +8529,10 @@ static void ggml_compute_forward_sqrt(
8383
8529
 
8384
8530
  static void ggml_compute_forward_log_f32(
8385
8531
  const struct ggml_compute_params * params,
8386
- const struct ggml_tensor * src0,
8387
8532
  struct ggml_tensor * dst) {
8533
+
8534
+ const struct ggml_tensor * src0 = dst->src[0];
8535
+
8388
8536
  GGML_ASSERT(params->ith == 0);
8389
8537
  GGML_ASSERT(ggml_are_same_shape(src0, dst));
8390
8538
 
@@ -8407,12 +8555,14 @@ static void ggml_compute_forward_log_f32(
8407
8555
 
8408
8556
  static void ggml_compute_forward_log(
8409
8557
  const struct ggml_compute_params * params,
8410
- const struct ggml_tensor * src0,
8411
8558
  struct ggml_tensor * dst) {
8559
+
8560
+ const struct ggml_tensor * src0 = dst->src[0];
8561
+
8412
8562
  switch (src0->type) {
8413
8563
  case GGML_TYPE_F32:
8414
8564
  {
8415
- ggml_compute_forward_log_f32(params, src0, dst);
8565
+ ggml_compute_forward_log_f32(params, dst);
8416
8566
  } break;
8417
8567
  default:
8418
8568
  {
@@ -8425,8 +8575,10 @@ static void ggml_compute_forward_log(
8425
8575
 
8426
8576
  static void ggml_compute_forward_sum_f32(
8427
8577
  const struct ggml_compute_params * params,
8428
- const struct ggml_tensor * src0,
8429
8578
  struct ggml_tensor * dst) {
8579
+
8580
+ const struct ggml_tensor * src0 = dst->src[0];
8581
+
8430
8582
  assert(params->ith == 0);
8431
8583
  assert(ggml_is_scalar(dst));
8432
8584
 
@@ -8458,8 +8610,10 @@ static void ggml_compute_forward_sum_f32(
8458
8610
 
8459
8611
  static void ggml_compute_forward_sum_f16(
8460
8612
  const struct ggml_compute_params * params,
8461
- const struct ggml_tensor * src0,
8462
8613
  struct ggml_tensor * dst) {
8614
+
8615
+ const struct ggml_tensor * src0 = dst->src[0];
8616
+
8463
8617
  assert(params->ith == 0);
8464
8618
  assert(ggml_is_scalar(dst));
8465
8619
 
@@ -8490,16 +8644,18 @@ static void ggml_compute_forward_sum_f16(
8490
8644
 
8491
8645
  static void ggml_compute_forward_sum(
8492
8646
  const struct ggml_compute_params * params,
8493
- const struct ggml_tensor * src0,
8494
8647
  struct ggml_tensor * dst) {
8648
+
8649
+ const struct ggml_tensor * src0 = dst->src[0];
8650
+
8495
8651
  switch (src0->type) {
8496
8652
  case GGML_TYPE_F32:
8497
8653
  {
8498
- ggml_compute_forward_sum_f32(params, src0, dst);
8654
+ ggml_compute_forward_sum_f32(params, dst);
8499
8655
  } break;
8500
8656
  case GGML_TYPE_F16:
8501
8657
  {
8502
- ggml_compute_forward_sum_f16(params, src0, dst);
8658
+ ggml_compute_forward_sum_f16(params, dst);
8503
8659
  } break;
8504
8660
  default:
8505
8661
  {
@@ -8512,8 +8668,10 @@ static void ggml_compute_forward_sum(
8512
8668
 
8513
8669
  static void ggml_compute_forward_sum_rows_f32(
8514
8670
  const struct ggml_compute_params * params,
8515
- const struct ggml_tensor * src0,
8516
8671
  struct ggml_tensor * dst) {
8672
+
8673
+ const struct ggml_tensor * src0 = dst->src[0];
8674
+
8517
8675
  GGML_ASSERT(params->ith == 0);
8518
8676
 
8519
8677
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
@@ -8545,12 +8703,14 @@ static void ggml_compute_forward_sum_rows_f32(
8545
8703
 
8546
8704
  static void ggml_compute_forward_sum_rows(
8547
8705
  const struct ggml_compute_params * params,
8548
- const struct ggml_tensor * src0,
8549
8706
  struct ggml_tensor * dst) {
8707
+
8708
+ const struct ggml_tensor * src0 = dst->src[0];
8709
+
8550
8710
  switch (src0->type) {
8551
8711
  case GGML_TYPE_F32:
8552
8712
  {
8553
- ggml_compute_forward_sum_rows_f32(params, src0, dst);
8713
+ ggml_compute_forward_sum_rows_f32(params, dst);
8554
8714
  } break;
8555
8715
  default:
8556
8716
  {
@@ -8563,8 +8723,10 @@ static void ggml_compute_forward_sum_rows(
8563
8723
 
8564
8724
  static void ggml_compute_forward_mean_f32(
8565
8725
  const struct ggml_compute_params * params,
8566
- const struct ggml_tensor * src0,
8567
8726
  struct ggml_tensor * dst) {
8727
+
8728
+ const struct ggml_tensor * src0 = dst->src[0];
8729
+
8568
8730
  assert(params->ith == 0);
8569
8731
 
8570
8732
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
@@ -8600,12 +8762,14 @@ static void ggml_compute_forward_mean_f32(
8600
8762
 
8601
8763
  static void ggml_compute_forward_mean(
8602
8764
  const struct ggml_compute_params * params,
8603
- const struct ggml_tensor * src0,
8604
8765
  struct ggml_tensor * dst) {
8766
+
8767
+ const struct ggml_tensor * src0 = dst->src[0];
8768
+
8605
8769
  switch (src0->type) {
8606
8770
  case GGML_TYPE_F32:
8607
8771
  {
8608
- ggml_compute_forward_mean_f32(params, src0, dst);
8772
+ ggml_compute_forward_mean_f32(params, dst);
8609
8773
  } break;
8610
8774
  default:
8611
8775
  {
@@ -8618,8 +8782,10 @@ static void ggml_compute_forward_mean(
8618
8782
 
8619
8783
  static void ggml_compute_forward_argmax_f32(
8620
8784
  const struct ggml_compute_params * params,
8621
- const struct ggml_tensor * src0,
8622
8785
  struct ggml_tensor * dst) {
8786
+
8787
+ const struct ggml_tensor * src0 = dst->src[0];
8788
+
8623
8789
  assert(params->ith == 0);
8624
8790
 
8625
8791
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
@@ -8646,12 +8812,14 @@ static void ggml_compute_forward_argmax_f32(
8646
8812
 
8647
8813
  static void ggml_compute_forward_argmax(
8648
8814
  const struct ggml_compute_params * params,
8649
- const struct ggml_tensor * src0,
8650
8815
  struct ggml_tensor * dst) {
8816
+
8817
+ const struct ggml_tensor * src0 = dst->src[0];
8818
+
8651
8819
  switch (src0->type) {
8652
8820
  case GGML_TYPE_F32:
8653
8821
  {
8654
- ggml_compute_forward_argmax_f32(params, src0, dst);
8822
+ ggml_compute_forward_argmax_f32(params, dst);
8655
8823
  } break;
8656
8824
  default:
8657
8825
  {
@@ -8664,8 +8832,10 @@ static void ggml_compute_forward_argmax(
8664
8832
 
8665
8833
  static void ggml_compute_forward_repeat_f32(
8666
8834
  const struct ggml_compute_params * params,
8667
- const struct ggml_tensor * src0,
8668
8835
  struct ggml_tensor * dst) {
8836
+
8837
+ const struct ggml_tensor * src0 = dst->src[0];
8838
+
8669
8839
  GGML_ASSERT(params->ith == 0);
8670
8840
  GGML_ASSERT(ggml_can_repeat(src0, dst));
8671
8841
 
@@ -8707,8 +8877,10 @@ static void ggml_compute_forward_repeat_f32(
8707
8877
 
8708
8878
  static void ggml_compute_forward_repeat_f16(
8709
8879
  const struct ggml_compute_params * params,
8710
- const struct ggml_tensor * src0,
8711
8880
  struct ggml_tensor * dst) {
8881
+
8882
+ const struct ggml_tensor * src0 = dst->src[0];
8883
+
8712
8884
  GGML_ASSERT(params->ith == 0);
8713
8885
  GGML_ASSERT(ggml_can_repeat(src0, dst));
8714
8886
 
@@ -8753,18 +8925,20 @@ static void ggml_compute_forward_repeat_f16(
8753
8925
 
8754
8926
  static void ggml_compute_forward_repeat(
8755
8927
  const struct ggml_compute_params * params,
8756
- const struct ggml_tensor * src0,
8757
8928
  struct ggml_tensor * dst) {
8929
+
8930
+ const struct ggml_tensor * src0 = dst->src[0];
8931
+
8758
8932
  switch (src0->type) {
8759
8933
  case GGML_TYPE_F16:
8760
8934
  case GGML_TYPE_I16:
8761
8935
  {
8762
- ggml_compute_forward_repeat_f16(params, src0, dst);
8936
+ ggml_compute_forward_repeat_f16(params, dst);
8763
8937
  } break;
8764
8938
  case GGML_TYPE_F32:
8765
8939
  case GGML_TYPE_I32:
8766
8940
  {
8767
- ggml_compute_forward_repeat_f32(params, src0, dst);
8941
+ ggml_compute_forward_repeat_f32(params, dst);
8768
8942
  } break;
8769
8943
  default:
8770
8944
  {
@@ -8777,8 +8951,10 @@ static void ggml_compute_forward_repeat(
8777
8951
 
8778
8952
  static void ggml_compute_forward_repeat_back_f32(
8779
8953
  const struct ggml_compute_params * params,
8780
- const struct ggml_tensor * src0,
8781
8954
  struct ggml_tensor * dst) {
8955
+
8956
+ const struct ggml_tensor * src0 = dst->src[0];
8957
+
8782
8958
  GGML_ASSERT(params->ith == 0);
8783
8959
  GGML_ASSERT(ggml_can_repeat(dst, src0));
8784
8960
 
@@ -8834,12 +9010,14 @@ static void ggml_compute_forward_repeat_back_f32(
8834
9010
 
8835
9011
  static void ggml_compute_forward_repeat_back(
8836
9012
  const struct ggml_compute_params * params,
8837
- const struct ggml_tensor * src0,
8838
9013
  struct ggml_tensor * dst) {
9014
+
9015
+ const struct ggml_tensor * src0 = dst->src[0];
9016
+
8839
9017
  switch (src0->type) {
8840
9018
  case GGML_TYPE_F32:
8841
9019
  {
8842
- ggml_compute_forward_repeat_back_f32(params, src0, dst);
9020
+ ggml_compute_forward_repeat_back_f32(params, dst);
8843
9021
  } break;
8844
9022
  default:
8845
9023
  {
@@ -8852,10 +9030,11 @@ static void ggml_compute_forward_repeat_back(
8852
9030
 
8853
9031
  static void ggml_compute_forward_concat_f32(
8854
9032
  const struct ggml_compute_params * params,
8855
- const struct ggml_tensor * src0,
8856
- const struct ggml_tensor * src1,
8857
9033
  struct ggml_tensor * dst) {
8858
9034
 
9035
+ const struct ggml_tensor * src0 = dst->src[0];
9036
+ const struct ggml_tensor * src1 = dst->src[1];
9037
+
8859
9038
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
8860
9039
  return;
8861
9040
  }
@@ -8900,14 +9079,15 @@ static void ggml_compute_forward_concat_f32(
8900
9079
 
8901
9080
  static void ggml_compute_forward_concat(
8902
9081
  const struct ggml_compute_params* params,
8903
- const struct ggml_tensor* src0,
8904
- const struct ggml_tensor* src1,
8905
9082
  struct ggml_tensor* dst) {
9083
+
9084
+ const struct ggml_tensor * src0 = dst->src[0];
9085
+
8906
9086
  switch (src0->type) {
8907
9087
  case GGML_TYPE_F32:
8908
9088
  case GGML_TYPE_I32:
8909
9089
  {
8910
- ggml_compute_forward_concat_f32(params, src0, src1, dst);
9090
+ ggml_compute_forward_concat_f32(params, dst);
8911
9091
  } break;
8912
9092
  default:
8913
9093
  {
@@ -8920,8 +9100,10 @@ static void ggml_compute_forward_concat(
8920
9100
 
8921
9101
  static void ggml_compute_forward_abs_f32(
8922
9102
  const struct ggml_compute_params * params,
8923
- const struct ggml_tensor * src0,
8924
9103
  struct ggml_tensor * dst) {
9104
+
9105
+ const struct ggml_tensor * src0 = dst->src[0];
9106
+
8925
9107
  assert(params->ith == 0);
8926
9108
  assert(ggml_are_same_shape(src0, dst));
8927
9109
 
@@ -8944,12 +9126,14 @@ static void ggml_compute_forward_abs_f32(
8944
9126
 
8945
9127
  static void ggml_compute_forward_abs(
8946
9128
  const struct ggml_compute_params * params,
8947
- const struct ggml_tensor * src0,
8948
9129
  struct ggml_tensor * dst) {
9130
+
9131
+ const struct ggml_tensor * src0 = dst->src[0];
9132
+
8949
9133
  switch (src0->type) {
8950
9134
  case GGML_TYPE_F32:
8951
9135
  {
8952
- ggml_compute_forward_abs_f32(params, src0, dst);
9136
+ ggml_compute_forward_abs_f32(params, dst);
8953
9137
  } break;
8954
9138
  default:
8955
9139
  {
@@ -8962,8 +9146,10 @@ static void ggml_compute_forward_abs(
8962
9146
 
8963
9147
  static void ggml_compute_forward_sgn_f32(
8964
9148
  const struct ggml_compute_params * params,
8965
- const struct ggml_tensor * src0,
8966
9149
  struct ggml_tensor * dst) {
9150
+
9151
+ const struct ggml_tensor * src0 = dst->src[0];
9152
+
8967
9153
  assert(params->ith == 0);
8968
9154
  assert(ggml_are_same_shape(src0, dst));
8969
9155
 
@@ -8986,12 +9172,14 @@ static void ggml_compute_forward_sgn_f32(
8986
9172
 
8987
9173
  static void ggml_compute_forward_sgn(
8988
9174
  const struct ggml_compute_params * params,
8989
- const struct ggml_tensor * src0,
8990
9175
  struct ggml_tensor * dst) {
9176
+
9177
+ const struct ggml_tensor * src0 = dst->src[0];
9178
+
8991
9179
  switch (src0->type) {
8992
9180
  case GGML_TYPE_F32:
8993
9181
  {
8994
- ggml_compute_forward_sgn_f32(params, src0, dst);
9182
+ ggml_compute_forward_sgn_f32(params, dst);
8995
9183
  } break;
8996
9184
  default:
8997
9185
  {
@@ -9004,8 +9192,10 @@ static void ggml_compute_forward_sgn(
9004
9192
 
9005
9193
  static void ggml_compute_forward_neg_f32(
9006
9194
  const struct ggml_compute_params * params,
9007
- const struct ggml_tensor * src0,
9008
9195
  struct ggml_tensor * dst) {
9196
+
9197
+ const struct ggml_tensor * src0 = dst->src[0];
9198
+
9009
9199
  assert(params->ith == 0);
9010
9200
  assert(ggml_are_same_shape(src0, dst));
9011
9201
 
@@ -9028,12 +9218,14 @@ static void ggml_compute_forward_neg_f32(
9028
9218
 
9029
9219
  static void ggml_compute_forward_neg(
9030
9220
  const struct ggml_compute_params * params,
9031
- const struct ggml_tensor * src0,
9032
9221
  struct ggml_tensor * dst) {
9222
+
9223
+ const struct ggml_tensor * src0 = dst->src[0];
9224
+
9033
9225
  switch (src0->type) {
9034
9226
  case GGML_TYPE_F32:
9035
9227
  {
9036
- ggml_compute_forward_neg_f32(params, src0, dst);
9228
+ ggml_compute_forward_neg_f32(params, dst);
9037
9229
  } break;
9038
9230
  default:
9039
9231
  {
@@ -9046,8 +9238,10 @@ static void ggml_compute_forward_neg(
9046
9238
 
9047
9239
  static void ggml_compute_forward_step_f32(
9048
9240
  const struct ggml_compute_params * params,
9049
- const struct ggml_tensor * src0,
9050
9241
  struct ggml_tensor * dst) {
9242
+
9243
+ const struct ggml_tensor * src0 = dst->src[0];
9244
+
9051
9245
  assert(params->ith == 0);
9052
9246
  assert(ggml_are_same_shape(src0, dst));
9053
9247
 
@@ -9070,12 +9264,14 @@ static void ggml_compute_forward_step_f32(
9070
9264
 
9071
9265
  static void ggml_compute_forward_step(
9072
9266
  const struct ggml_compute_params * params,
9073
- const struct ggml_tensor * src0,
9074
9267
  struct ggml_tensor * dst) {
9268
+
9269
+ const struct ggml_tensor * src0 = dst->src[0];
9270
+
9075
9271
  switch (src0->type) {
9076
9272
  case GGML_TYPE_F32:
9077
9273
  {
9078
- ggml_compute_forward_step_f32(params, src0, dst);
9274
+ ggml_compute_forward_step_f32(params, dst);
9079
9275
  } break;
9080
9276
  default:
9081
9277
  {
@@ -9088,8 +9284,10 @@ static void ggml_compute_forward_step(
9088
9284
 
9089
9285
  static void ggml_compute_forward_tanh_f32(
9090
9286
  const struct ggml_compute_params * params,
9091
- const struct ggml_tensor * src0,
9092
9287
  struct ggml_tensor * dst) {
9288
+
9289
+ const struct ggml_tensor * src0 = dst->src[0];
9290
+
9093
9291
  assert(params->ith == 0);
9094
9292
  assert(ggml_are_same_shape(src0, dst));
9095
9293
 
@@ -9112,12 +9310,14 @@ static void ggml_compute_forward_tanh_f32(
9112
9310
 
9113
9311
  static void ggml_compute_forward_tanh(
9114
9312
  const struct ggml_compute_params * params,
9115
- const struct ggml_tensor * src0,
9116
9313
  struct ggml_tensor * dst) {
9314
+
9315
+ const struct ggml_tensor * src0 = dst->src[0];
9316
+
9117
9317
  switch (src0->type) {
9118
9318
  case GGML_TYPE_F32:
9119
9319
  {
9120
- ggml_compute_forward_tanh_f32(params, src0, dst);
9320
+ ggml_compute_forward_tanh_f32(params, dst);
9121
9321
  } break;
9122
9322
  default:
9123
9323
  {
@@ -9130,8 +9330,10 @@ static void ggml_compute_forward_tanh(
9130
9330
 
9131
9331
  static void ggml_compute_forward_elu_f32(
9132
9332
  const struct ggml_compute_params * params,
9133
- const struct ggml_tensor * src0,
9134
9333
  struct ggml_tensor * dst) {
9334
+
9335
+ const struct ggml_tensor * src0 = dst->src[0];
9336
+
9135
9337
  assert(params->ith == 0);
9136
9338
  assert(ggml_are_same_shape(src0, dst));
9137
9339
 
@@ -9154,12 +9356,14 @@ static void ggml_compute_forward_elu_f32(
9154
9356
 
9155
9357
  static void ggml_compute_forward_elu(
9156
9358
  const struct ggml_compute_params * params,
9157
- const struct ggml_tensor * src0,
9158
9359
  struct ggml_tensor * dst) {
9360
+
9361
+ const struct ggml_tensor * src0 = dst->src[0];
9362
+
9159
9363
  switch (src0->type) {
9160
9364
  case GGML_TYPE_F32:
9161
9365
  {
9162
- ggml_compute_forward_elu_f32(params, src0, dst);
9366
+ ggml_compute_forward_elu_f32(params, dst);
9163
9367
  } break;
9164
9368
  default:
9165
9369
  {
@@ -9172,8 +9376,10 @@ static void ggml_compute_forward_elu(
9172
9376
 
9173
9377
  static void ggml_compute_forward_relu_f32(
9174
9378
  const struct ggml_compute_params * params,
9175
- const struct ggml_tensor * src0,
9176
9379
  struct ggml_tensor * dst) {
9380
+
9381
+ const struct ggml_tensor * src0 = dst->src[0];
9382
+
9177
9383
  assert(params->ith == 0);
9178
9384
  assert(ggml_are_same_shape(src0, dst));
9179
9385
 
@@ -9196,12 +9402,14 @@ static void ggml_compute_forward_relu_f32(
9196
9402
 
9197
9403
  static void ggml_compute_forward_relu(
9198
9404
  const struct ggml_compute_params * params,
9199
- const struct ggml_tensor * src0,
9200
9405
  struct ggml_tensor * dst) {
9406
+
9407
+ const struct ggml_tensor * src0 = dst->src[0];
9408
+
9201
9409
  switch (src0->type) {
9202
9410
  case GGML_TYPE_F32:
9203
9411
  {
9204
- ggml_compute_forward_relu_f32(params, src0, dst);
9412
+ ggml_compute_forward_relu_f32(params, dst);
9205
9413
  } break;
9206
9414
  default:
9207
9415
  {
@@ -9214,8 +9422,10 @@ static void ggml_compute_forward_relu(
9214
9422
 
9215
9423
  static void ggml_compute_forward_gelu_f32(
9216
9424
  const struct ggml_compute_params * params,
9217
- const struct ggml_tensor * src0,
9218
9425
  struct ggml_tensor * dst) {
9426
+
9427
+ const struct ggml_tensor * src0 = dst->src[0];
9428
+
9219
9429
  GGML_ASSERT(ggml_is_contiguous_except_dim_1(src0));
9220
9430
  GGML_ASSERT(ggml_is_contiguous_except_dim_1(dst));
9221
9431
  GGML_ASSERT(ggml_are_same_shape(src0, dst));
@@ -9255,12 +9465,14 @@ static void ggml_compute_forward_gelu_f32(
9255
9465
 
9256
9466
  static void ggml_compute_forward_gelu(
9257
9467
  const struct ggml_compute_params * params,
9258
- const struct ggml_tensor * src0,
9259
9468
  struct ggml_tensor * dst) {
9469
+
9470
+ const struct ggml_tensor * src0 = dst->src[0];
9471
+
9260
9472
  switch (src0->type) {
9261
9473
  case GGML_TYPE_F32:
9262
9474
  {
9263
- ggml_compute_forward_gelu_f32(params, src0, dst);
9475
+ ggml_compute_forward_gelu_f32(params, dst);
9264
9476
  } break;
9265
9477
  default:
9266
9478
  {
@@ -9273,8 +9485,10 @@ static void ggml_compute_forward_gelu(
9273
9485
 
9274
9486
  static void ggml_compute_forward_gelu_quick_f32(
9275
9487
  const struct ggml_compute_params * params,
9276
- const struct ggml_tensor * src0,
9277
9488
  struct ggml_tensor * dst) {
9489
+
9490
+ const struct ggml_tensor * src0 = dst->src[0];
9491
+
9278
9492
  GGML_ASSERT(ggml_is_contiguous_except_dim_1(src0));
9279
9493
  GGML_ASSERT(ggml_is_contiguous_except_dim_1(dst));
9280
9494
  GGML_ASSERT(ggml_are_same_shape(src0, dst));
@@ -9314,12 +9528,14 @@ static void ggml_compute_forward_gelu_quick_f32(
9314
9528
 
9315
9529
  static void ggml_compute_forward_gelu_quick(
9316
9530
  const struct ggml_compute_params * params,
9317
- const struct ggml_tensor * src0,
9318
9531
  struct ggml_tensor * dst) {
9532
+
9533
+ const struct ggml_tensor * src0 = dst->src[0];
9534
+
9319
9535
  switch (src0->type) {
9320
9536
  case GGML_TYPE_F32:
9321
9537
  {
9322
- ggml_compute_forward_gelu_quick_f32(params, src0, dst);
9538
+ ggml_compute_forward_gelu_quick_f32(params, dst);
9323
9539
  } break;
9324
9540
  default:
9325
9541
  {
@@ -9332,8 +9548,10 @@ static void ggml_compute_forward_gelu_quick(
9332
9548
 
9333
9549
  static void ggml_compute_forward_silu_f32(
9334
9550
  const struct ggml_compute_params * params,
9335
- const struct ggml_tensor * src0,
9336
9551
  struct ggml_tensor * dst) {
9552
+
9553
+ const struct ggml_tensor * src0 = dst->src[0];
9554
+
9337
9555
  GGML_ASSERT(ggml_is_contiguous_except_dim_1(src0));
9338
9556
  GGML_ASSERT(ggml_is_contiguous_except_dim_1(dst));
9339
9557
  GGML_ASSERT(ggml_are_same_shape(src0, dst));
@@ -9373,12 +9591,14 @@ static void ggml_compute_forward_silu_f32(
9373
9591
 
9374
9592
  static void ggml_compute_forward_silu(
9375
9593
  const struct ggml_compute_params * params,
9376
- const struct ggml_tensor * src0,
9377
9594
  struct ggml_tensor * dst) {
9595
+
9596
+ const struct ggml_tensor * src0 = dst->src[0];
9597
+
9378
9598
  switch (src0->type) {
9379
9599
  case GGML_TYPE_F32:
9380
9600
  {
9381
- ggml_compute_forward_silu_f32(params, src0, dst);
9601
+ ggml_compute_forward_silu_f32(params, dst);
9382
9602
  } break;
9383
9603
  default:
9384
9604
  {
@@ -9390,8 +9610,10 @@ static void ggml_compute_forward_silu(
9390
9610
 
9391
9611
  static void ggml_compute_forward_leaky_relu_f32(
9392
9612
  const struct ggml_compute_params * params,
9393
- const struct ggml_tensor * src0,
9394
9613
  struct ggml_tensor * dst) {
9614
+
9615
+ const struct ggml_tensor * src0 = dst->src[0];
9616
+
9395
9617
  assert(params->ith == 0);
9396
9618
  assert(ggml_are_same_shape(src0, dst));
9397
9619
 
@@ -9417,12 +9639,14 @@ static void ggml_compute_forward_leaky_relu_f32(
9417
9639
 
9418
9640
  static void ggml_compute_forward_leaky_relu(
9419
9641
  const struct ggml_compute_params * params,
9420
- const struct ggml_tensor * src0,
9421
9642
  struct ggml_tensor * dst) {
9643
+
9644
+ const struct ggml_tensor * src0 = dst->src[0];
9645
+
9422
9646
  switch (src0->type) {
9423
9647
  case GGML_TYPE_F32:
9424
9648
  {
9425
- ggml_compute_forward_leaky_relu_f32(params, src0, dst);
9649
+ ggml_compute_forward_leaky_relu_f32(params, dst);
9426
9650
  } break;
9427
9651
  default:
9428
9652
  {
@@ -9435,9 +9659,11 @@ static void ggml_compute_forward_leaky_relu(
9435
9659
 
9436
9660
  static void ggml_compute_forward_silu_back_f32(
9437
9661
  const struct ggml_compute_params * params,
9438
- const struct ggml_tensor * src0,
9439
- const struct ggml_tensor * grad,
9440
9662
  struct ggml_tensor * dst) {
9663
+
9664
+ const struct ggml_tensor * src0 = dst->src[0];
9665
+ const struct ggml_tensor * grad = dst->src[1];
9666
+
9441
9667
  GGML_ASSERT(ggml_is_contiguous_except_dim_1(grad));
9442
9668
  GGML_ASSERT(ggml_is_contiguous_except_dim_1(src0));
9443
9669
  GGML_ASSERT(ggml_is_contiguous_except_dim_1(dst));
@@ -9480,13 +9706,14 @@ static void ggml_compute_forward_silu_back_f32(
9480
9706
 
9481
9707
  static void ggml_compute_forward_silu_back(
9482
9708
  const struct ggml_compute_params * params,
9483
- const struct ggml_tensor * src0,
9484
- const struct ggml_tensor * grad,
9485
9709
  struct ggml_tensor * dst) {
9710
+
9711
+ const struct ggml_tensor * src0 = dst->src[0];
9712
+
9486
9713
  switch (src0->type) {
9487
9714
  case GGML_TYPE_F32:
9488
9715
  {
9489
- ggml_compute_forward_silu_back_f32(params, src0, grad, dst);
9716
+ ggml_compute_forward_silu_back_f32(params, dst);
9490
9717
  } break;
9491
9718
  default:
9492
9719
  {
@@ -9498,8 +9725,10 @@ static void ggml_compute_forward_silu_back(
9498
9725
 
9499
9726
  static void ggml_compute_forward_hardswish_f32(
9500
9727
  const struct ggml_compute_params * params,
9501
- const struct ggml_tensor * src0,
9502
9728
  struct ggml_tensor * dst) {
9729
+
9730
+ const struct ggml_tensor * src0 = dst->src[0];
9731
+
9503
9732
  assert(params->ith == 0);
9504
9733
  assert(ggml_are_same_shape(src0, dst));
9505
9734
 
@@ -9521,12 +9750,14 @@ static void ggml_compute_forward_hardswish_f32(
9521
9750
  }
9522
9751
  static void ggml_compute_forward_hardswish(
9523
9752
  const struct ggml_compute_params * params,
9524
- const struct ggml_tensor * src0,
9525
9753
  struct ggml_tensor * dst) {
9754
+
9755
+ const struct ggml_tensor * src0 = dst->src[0];
9756
+
9526
9757
  switch (src0->type) {
9527
9758
  case GGML_TYPE_F32:
9528
9759
  {
9529
- ggml_compute_forward_hardswish_f32(params, src0, dst);
9760
+ ggml_compute_forward_hardswish_f32(params, dst);
9530
9761
  } break;
9531
9762
  default:
9532
9763
  {
@@ -9537,8 +9768,10 @@ static void ggml_compute_forward_hardswish(
9537
9768
 
9538
9769
  static void ggml_compute_forward_hardsigmoid_f32(
9539
9770
  const struct ggml_compute_params * params,
9540
- const struct ggml_tensor * src0,
9541
9771
  struct ggml_tensor * dst) {
9772
+
9773
+ const struct ggml_tensor * src0 = dst->src[0];
9774
+
9542
9775
  assert(params->ith == 0);
9543
9776
  assert(ggml_are_same_shape(src0, dst));
9544
9777
 
@@ -9561,12 +9794,14 @@ static void ggml_compute_forward_hardsigmoid_f32(
9561
9794
 
9562
9795
  static void ggml_compute_forward_hardsigmoid(
9563
9796
  const struct ggml_compute_params * params,
9564
- const struct ggml_tensor * src0,
9565
9797
  struct ggml_tensor * dst) {
9798
+
9799
+ const struct ggml_tensor * src0 = dst->src[0];
9800
+
9566
9801
  switch (src0->type) {
9567
9802
  case GGML_TYPE_F32:
9568
9803
  {
9569
- ggml_compute_forward_hardsigmoid_f32(params, src0, dst);
9804
+ ggml_compute_forward_hardsigmoid_f32(params, dst);
9570
9805
  } break;
9571
9806
  default:
9572
9807
  {
@@ -9580,8 +9815,10 @@ static void ggml_compute_forward_hardsigmoid(
9580
9815
 
9581
9816
  static void ggml_compute_forward_norm_f32(
9582
9817
  const struct ggml_compute_params * params,
9583
- const struct ggml_tensor * src0,
9584
9818
  struct ggml_tensor * dst) {
9819
+
9820
+ const struct ggml_tensor * src0 = dst->src[0];
9821
+
9585
9822
  GGML_ASSERT(ggml_are_same_shape(src0, dst));
9586
9823
 
9587
9824
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
@@ -9633,12 +9870,14 @@ static void ggml_compute_forward_norm_f32(
9633
9870
 
9634
9871
  static void ggml_compute_forward_norm(
9635
9872
  const struct ggml_compute_params * params,
9636
- const struct ggml_tensor * src0,
9637
9873
  struct ggml_tensor * dst) {
9874
+
9875
+ const struct ggml_tensor * src0 = dst->src[0];
9876
+
9638
9877
  switch (src0->type) {
9639
9878
  case GGML_TYPE_F32:
9640
9879
  {
9641
- ggml_compute_forward_norm_f32(params, src0, dst);
9880
+ ggml_compute_forward_norm_f32(params, dst);
9642
9881
  } break;
9643
9882
  default:
9644
9883
  {
@@ -9651,8 +9890,10 @@ static void ggml_compute_forward_norm(
9651
9890
 
9652
9891
  static void ggml_compute_forward_rms_norm_f32(
9653
9892
  const struct ggml_compute_params * params,
9654
- const struct ggml_tensor * src0,
9655
9893
  struct ggml_tensor * dst) {
9894
+
9895
+ const struct ggml_tensor * src0 = dst->src[0];
9896
+
9656
9897
  GGML_ASSERT(ggml_are_same_shape(src0, dst));
9657
9898
 
9658
9899
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
@@ -9701,12 +9942,14 @@ static void ggml_compute_forward_rms_norm_f32(
9701
9942
 
9702
9943
  static void ggml_compute_forward_rms_norm(
9703
9944
  const struct ggml_compute_params * params,
9704
- const struct ggml_tensor * src0,
9705
9945
  struct ggml_tensor * dst) {
9946
+
9947
+ const struct ggml_tensor * src0 = dst->src[0];
9948
+
9706
9949
  switch (src0->type) {
9707
9950
  case GGML_TYPE_F32:
9708
9951
  {
9709
- ggml_compute_forward_rms_norm_f32(params, src0, dst);
9952
+ ggml_compute_forward_rms_norm_f32(params, dst);
9710
9953
  } break;
9711
9954
  default:
9712
9955
  {
@@ -9717,9 +9960,11 @@ static void ggml_compute_forward_rms_norm(
9717
9960
 
9718
9961
  static void ggml_compute_forward_rms_norm_back_f32(
9719
9962
  const struct ggml_compute_params * params,
9720
- const struct ggml_tensor * src0,
9721
- const struct ggml_tensor * src1,
9722
9963
  struct ggml_tensor * dst) {
9964
+
9965
+ const struct ggml_tensor * src0 = dst->src[0];
9966
+ const struct ggml_tensor * src1 = dst->src[1];
9967
+
9723
9968
  GGML_ASSERT(ggml_are_same_shape(src0, dst) && ggml_are_same_shape(src0, src1));
9724
9969
 
9725
9970
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
@@ -9874,13 +10119,14 @@ static void ggml_compute_forward_rms_norm_back_f32(
9874
10119
 
9875
10120
  static void ggml_compute_forward_rms_norm_back(
9876
10121
  const struct ggml_compute_params * params,
9877
- const struct ggml_tensor * src0,
9878
- const struct ggml_tensor * src1,
9879
10122
  struct ggml_tensor * dst) {
10123
+
10124
+ const struct ggml_tensor * src0 = dst->src[0];
10125
+
9880
10126
  switch (src0->type) {
9881
10127
  case GGML_TYPE_F32:
9882
10128
  {
9883
- ggml_compute_forward_rms_norm_back_f32(params, src0, src1, dst);
10129
+ ggml_compute_forward_rms_norm_back_f32(params, dst);
9884
10130
  } break;
9885
10131
  default:
9886
10132
  {
@@ -9893,8 +10139,10 @@ static void ggml_compute_forward_rms_norm_back(
9893
10139
 
9894
10140
  static void ggml_compute_forward_group_norm_f32(
9895
10141
  const struct ggml_compute_params * params,
9896
- const struct ggml_tensor * src0,
9897
10142
  struct ggml_tensor * dst) {
10143
+
10144
+ const struct ggml_tensor * src0 = dst->src[0];
10145
+
9898
10146
  GGML_ASSERT(ggml_are_same_shape(src0, dst));
9899
10147
 
9900
10148
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
@@ -9965,12 +10213,14 @@ static void ggml_compute_forward_group_norm_f32(
9965
10213
 
9966
10214
  static void ggml_compute_forward_group_norm(
9967
10215
  const struct ggml_compute_params * params,
9968
- const struct ggml_tensor * src0,
9969
10216
  struct ggml_tensor * dst) {
10217
+
10218
+ const struct ggml_tensor * src0 = dst->src[0];
10219
+
9970
10220
  switch (src0->type) {
9971
10221
  case GGML_TYPE_F32:
9972
10222
  {
9973
- ggml_compute_forward_group_norm_f32(params, src0, dst);
10223
+ ggml_compute_forward_group_norm_f32(params, dst);
9974
10224
  } break;
9975
10225
  default:
9976
10226
  {
@@ -10016,9 +10266,11 @@ static bool ggml_compute_forward_mul_mat_use_blas(struct ggml_tensor * dst) {
10016
10266
 
10017
10267
  static void ggml_compute_forward_mul_mat(
10018
10268
  const struct ggml_compute_params * params,
10019
- const struct ggml_tensor * src0,
10020
- const struct ggml_tensor * src1,
10021
10269
  struct ggml_tensor * dst) {
10270
+
10271
+ const struct ggml_tensor * src0 = dst->src[0];
10272
+ const struct ggml_tensor * src1 = dst->src[1];
10273
+
10022
10274
  int64_t t0 = ggml_perf_time_us();
10023
10275
  UNUSED(t0);
10024
10276
 
@@ -10263,10 +10515,11 @@ static void ggml_compute_forward_mul_mat(
10263
10515
 
10264
10516
  static void ggml_compute_forward_mul_mat_id(
10265
10517
  const struct ggml_compute_params * params,
10266
- const struct ggml_tensor * ids,
10267
- const struct ggml_tensor * src1,
10268
10518
  struct ggml_tensor * dst) {
10269
10519
 
10520
+ const struct ggml_tensor * ids = dst->src[0];
10521
+ const struct ggml_tensor * src1 = dst->src[1];
10522
+
10270
10523
  const struct ggml_tensor * src0 = dst->src[2]; // only for GGML_TENSOR_BINARY_OP_LOCALS
10271
10524
 
10272
10525
  GGML_TENSOR_BINARY_OP_LOCALS
@@ -10457,9 +10710,11 @@ static void ggml_compute_forward_mul_mat_id(
10457
10710
 
10458
10711
  static void ggml_compute_forward_out_prod_f32(
10459
10712
  const struct ggml_compute_params * params,
10460
- const struct ggml_tensor * src0,
10461
- const struct ggml_tensor * src1,
10462
10713
  struct ggml_tensor * dst) {
10714
+
10715
+ const struct ggml_tensor * src0 = dst->src[0];
10716
+ const struct ggml_tensor * src1 = dst->src[1];
10717
+
10463
10718
  // int64_t t0 = ggml_perf_time_us();
10464
10719
  // UNUSED(t0);
10465
10720
 
@@ -10649,9 +10904,11 @@ static void ggml_compute_forward_out_prod_f32(
10649
10904
 
10650
10905
  static void ggml_compute_forward_out_prod_q_f32(
10651
10906
  const struct ggml_compute_params * params,
10652
- const struct ggml_tensor * src0,
10653
- const struct ggml_tensor * src1,
10654
10907
  struct ggml_tensor * dst) {
10908
+
10909
+ const struct ggml_tensor * src0 = dst->src[0];
10910
+ const struct ggml_tensor * src1 = dst->src[1];
10911
+
10655
10912
  // int64_t t0 = ggml_perf_time_us();
10656
10913
  // UNUSED(t0);
10657
10914
 
@@ -10762,9 +11019,10 @@ static void ggml_compute_forward_out_prod_q_f32(
10762
11019
 
10763
11020
  static void ggml_compute_forward_out_prod(
10764
11021
  const struct ggml_compute_params * params,
10765
- const struct ggml_tensor * src0,
10766
- const struct ggml_tensor * src1,
10767
11022
  struct ggml_tensor * dst) {
11023
+
11024
+ const struct ggml_tensor * src0 = dst->src[0];
11025
+
10768
11026
  switch (src0->type) {
10769
11027
  case GGML_TYPE_Q4_0:
10770
11028
  case GGML_TYPE_Q4_1:
@@ -10779,17 +11037,19 @@ static void ggml_compute_forward_out_prod(
10779
11037
  case GGML_TYPE_IQ2_XXS:
10780
11038
  case GGML_TYPE_IQ2_XS:
10781
11039
  case GGML_TYPE_IQ3_XXS:
11040
+ case GGML_TYPE_IQ1_S:
11041
+ case GGML_TYPE_IQ4_NL:
10782
11042
  {
10783
- ggml_compute_forward_out_prod_q_f32(params, src0, src1, dst);
11043
+ ggml_compute_forward_out_prod_q_f32(params, dst);
10784
11044
  } break;
10785
11045
  case GGML_TYPE_F16:
10786
11046
  {
10787
11047
  GGML_ASSERT(false); // todo
10788
- // ggml_compute_forward_out_prod_f16_f32(params, src0, src1, dst);
11048
+ // ggml_compute_forward_out_prod_f16_f32(params, dst);
10789
11049
  } break;
10790
11050
  case GGML_TYPE_F32:
10791
11051
  {
10792
- ggml_compute_forward_out_prod_f32(params, src0, src1, dst);
11052
+ ggml_compute_forward_out_prod_f32(params, dst);
10793
11053
  } break;
10794
11054
  default:
10795
11055
  {
@@ -10802,8 +11062,10 @@ static void ggml_compute_forward_out_prod(
10802
11062
 
10803
11063
  static void ggml_compute_forward_scale_f32(
10804
11064
  const struct ggml_compute_params * params,
10805
- const struct ggml_tensor * src0,
10806
11065
  struct ggml_tensor * dst) {
11066
+
11067
+ const struct ggml_tensor * src0 = dst->src[0];
11068
+
10807
11069
  GGML_ASSERT(ggml_is_contiguous(src0));
10808
11070
  GGML_ASSERT(ggml_is_contiguous(dst));
10809
11071
  GGML_ASSERT(ggml_are_same_shape(src0, dst));
@@ -10844,12 +11106,14 @@ static void ggml_compute_forward_scale_f32(
10844
11106
 
10845
11107
  static void ggml_compute_forward_scale(
10846
11108
  const struct ggml_compute_params * params,
10847
- const struct ggml_tensor * src0,
10848
11109
  struct ggml_tensor * dst) {
11110
+
11111
+ const struct ggml_tensor * src0 = dst->src[0];
11112
+
10849
11113
  switch (src0->type) {
10850
11114
  case GGML_TYPE_F32:
10851
11115
  {
10852
- ggml_compute_forward_scale_f32(params, src0, dst);
11116
+ ggml_compute_forward_scale_f32(params, dst);
10853
11117
  } break;
10854
11118
  default:
10855
11119
  {
@@ -10862,9 +11126,11 @@ static void ggml_compute_forward_scale(
10862
11126
 
10863
11127
  static void ggml_compute_forward_set_f32(
10864
11128
  const struct ggml_compute_params * params,
10865
- const struct ggml_tensor * src0,
10866
- const struct ggml_tensor * src1,
10867
11129
  struct ggml_tensor * dst) {
11130
+
11131
+ const struct ggml_tensor * src0 = dst->src[0];
11132
+ const struct ggml_tensor * src1 = dst->src[1];
11133
+
10868
11134
  GGML_ASSERT(ggml_are_same_shape(src0, dst));
10869
11135
  GGML_ASSERT(ggml_is_contiguous(dst) && ggml_is_contiguous(src0));
10870
11136
 
@@ -10935,14 +11201,14 @@ static void ggml_compute_forward_set_f32(
10935
11201
 
10936
11202
  static void ggml_compute_forward_set(
10937
11203
  const struct ggml_compute_params * params,
10938
- const struct ggml_tensor * src0,
10939
- const struct ggml_tensor * src1,
10940
11204
  struct ggml_tensor * dst) {
10941
11205
 
11206
+ const struct ggml_tensor * src0 = dst->src[0];
11207
+
10942
11208
  switch (src0->type) {
10943
11209
  case GGML_TYPE_F32:
10944
11210
  {
10945
- ggml_compute_forward_set_f32(params, src0, src1, dst);
11211
+ ggml_compute_forward_set_f32(params, dst);
10946
11212
  } break;
10947
11213
  case GGML_TYPE_F16:
10948
11214
  case GGML_TYPE_Q4_0:
@@ -10959,6 +11225,8 @@ static void ggml_compute_forward_set(
10959
11225
  case GGML_TYPE_IQ2_XXS:
10960
11226
  case GGML_TYPE_IQ2_XS:
10961
11227
  case GGML_TYPE_IQ3_XXS:
11228
+ case GGML_TYPE_IQ1_S:
11229
+ case GGML_TYPE_IQ4_NL:
10962
11230
  default:
10963
11231
  {
10964
11232
  GGML_ASSERT(false);
@@ -10970,29 +11238,25 @@ static void ggml_compute_forward_set(
10970
11238
 
10971
11239
  static void ggml_compute_forward_cpy(
10972
11240
  const struct ggml_compute_params * params,
10973
- const struct ggml_tensor * src0,
10974
11241
  struct ggml_tensor * dst) {
10975
- ggml_compute_forward_dup(params, src0, dst);
11242
+ ggml_compute_forward_dup(params, dst);
10976
11243
  }
10977
11244
 
10978
11245
  // ggml_compute_forward_cont
10979
11246
 
10980
11247
  static void ggml_compute_forward_cont(
10981
11248
  const struct ggml_compute_params * params,
10982
- const struct ggml_tensor * src0,
10983
11249
  struct ggml_tensor * dst) {
10984
- ggml_compute_forward_dup(params, src0, dst);
11250
+ ggml_compute_forward_dup(params, dst);
10985
11251
  }
10986
11252
 
10987
11253
  // ggml_compute_forward_reshape
10988
11254
 
10989
11255
  static void ggml_compute_forward_reshape(
10990
11256
  const struct ggml_compute_params * params,
10991
- const struct ggml_tensor * src0,
10992
11257
  struct ggml_tensor * dst) {
10993
11258
  // NOP
10994
11259
  UNUSED(params);
10995
- UNUSED(src0);
10996
11260
  UNUSED(dst);
10997
11261
  }
10998
11262
 
@@ -11000,39 +11264,41 @@ static void ggml_compute_forward_reshape(
11000
11264
 
11001
11265
  static void ggml_compute_forward_view(
11002
11266
  const struct ggml_compute_params * params,
11003
- const struct ggml_tensor * src0) {
11267
+ const struct ggml_tensor * dst) {
11004
11268
  // NOP
11005
11269
  UNUSED(params);
11006
- UNUSED(src0);
11270
+ UNUSED(dst);
11007
11271
  }
11008
11272
 
11009
11273
  // ggml_compute_forward_permute
11010
11274
 
11011
11275
  static void ggml_compute_forward_permute(
11012
11276
  const struct ggml_compute_params * params,
11013
- const struct ggml_tensor * src0) {
11277
+ const struct ggml_tensor * dst) {
11014
11278
  // NOP
11015
11279
  UNUSED(params);
11016
- UNUSED(src0);
11280
+ UNUSED(dst);
11017
11281
  }
11018
11282
 
11019
11283
  // ggml_compute_forward_transpose
11020
11284
 
11021
11285
  static void ggml_compute_forward_transpose(
11022
11286
  const struct ggml_compute_params * params,
11023
- const struct ggml_tensor * src0) {
11287
+ const struct ggml_tensor * dst) {
11024
11288
  // NOP
11025
11289
  UNUSED(params);
11026
- UNUSED(src0);
11290
+ UNUSED(dst);
11027
11291
  }
11028
11292
 
11029
11293
  // ggml_compute_forward_get_rows
11030
11294
 
11031
11295
  static void ggml_compute_forward_get_rows_q(
11032
11296
  const struct ggml_compute_params * params,
11033
- const struct ggml_tensor * src0,
11034
- const struct ggml_tensor * src1,
11035
11297
  struct ggml_tensor * dst) {
11298
+
11299
+ const struct ggml_tensor * src0 = dst->src[0];
11300
+ const struct ggml_tensor * src1 = dst->src[1];
11301
+
11036
11302
  assert(params->ith == 0);
11037
11303
 
11038
11304
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
@@ -11068,9 +11334,11 @@ static void ggml_compute_forward_get_rows_q(
11068
11334
 
11069
11335
  static void ggml_compute_forward_get_rows_f16(
11070
11336
  const struct ggml_compute_params * params,
11071
- const struct ggml_tensor * src0,
11072
- const struct ggml_tensor * src1,
11073
11337
  struct ggml_tensor * dst) {
11338
+
11339
+ const struct ggml_tensor * src0 = dst->src[0];
11340
+ const struct ggml_tensor * src1 = dst->src[1];
11341
+
11074
11342
  assert(params->ith == 0);
11075
11343
 
11076
11344
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
@@ -11103,9 +11371,11 @@ static void ggml_compute_forward_get_rows_f16(
11103
11371
 
11104
11372
  static void ggml_compute_forward_get_rows_f32(
11105
11373
  const struct ggml_compute_params * params,
11106
- const struct ggml_tensor * src0,
11107
- const struct ggml_tensor * src1,
11108
11374
  struct ggml_tensor * dst) {
11375
+
11376
+ const struct ggml_tensor * src0 = dst->src[0];
11377
+ const struct ggml_tensor * src1 = dst->src[1];
11378
+
11109
11379
  assert(params->ith == 0);
11110
11380
 
11111
11381
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
@@ -11138,9 +11408,10 @@ static void ggml_compute_forward_get_rows_f32(
11138
11408
 
11139
11409
  static void ggml_compute_forward_get_rows(
11140
11410
  const struct ggml_compute_params * params,
11141
- const struct ggml_tensor * src0,
11142
- const struct ggml_tensor * src1,
11143
11411
  struct ggml_tensor * dst) {
11412
+
11413
+ const struct ggml_tensor * src0 = dst->src[0];
11414
+
11144
11415
  switch (src0->type) {
11145
11416
  case GGML_TYPE_Q4_0:
11146
11417
  case GGML_TYPE_Q4_1:
@@ -11156,17 +11427,19 @@ static void ggml_compute_forward_get_rows(
11156
11427
  case GGML_TYPE_IQ2_XXS:
11157
11428
  case GGML_TYPE_IQ2_XS:
11158
11429
  case GGML_TYPE_IQ3_XXS:
11430
+ case GGML_TYPE_IQ1_S:
11431
+ case GGML_TYPE_IQ4_NL:
11159
11432
  {
11160
- ggml_compute_forward_get_rows_q(params, src0, src1, dst);
11433
+ ggml_compute_forward_get_rows_q(params, dst);
11161
11434
  } break;
11162
11435
  case GGML_TYPE_F16:
11163
11436
  {
11164
- ggml_compute_forward_get_rows_f16(params, src0, src1, dst);
11437
+ ggml_compute_forward_get_rows_f16(params, dst);
11165
11438
  } break;
11166
11439
  case GGML_TYPE_F32:
11167
11440
  case GGML_TYPE_I32:
11168
11441
  {
11169
- ggml_compute_forward_get_rows_f32(params, src0, src1, dst);
11442
+ ggml_compute_forward_get_rows_f32(params, dst);
11170
11443
  } break;
11171
11444
  default:
11172
11445
  {
@@ -11197,9 +11470,11 @@ static void ggml_compute_forward_get_rows(
11197
11470
 
11198
11471
  static void ggml_compute_forward_get_rows_back_f32_f16(
11199
11472
  const struct ggml_compute_params * params,
11200
- const struct ggml_tensor * src0,
11201
- const struct ggml_tensor * src1,
11202
11473
  struct ggml_tensor * dst) {
11474
+
11475
+ const struct ggml_tensor * src0 = dst->src[0];
11476
+ const struct ggml_tensor * src1 = dst->src[1];
11477
+
11203
11478
  GGML_ASSERT(params->ith == 0);
11204
11479
  GGML_ASSERT(ggml_is_contiguous(dst));
11205
11480
 
@@ -11234,9 +11509,11 @@ static void ggml_compute_forward_get_rows_back_f32_f16(
11234
11509
 
11235
11510
  static void ggml_compute_forward_get_rows_back_f32(
11236
11511
  const struct ggml_compute_params * params,
11237
- const struct ggml_tensor * src0,
11238
- const struct ggml_tensor * src1,
11239
11512
  struct ggml_tensor * dst) {
11513
+
11514
+ const struct ggml_tensor * src0 = dst->src[0];
11515
+ const struct ggml_tensor * src1 = dst->src[1];
11516
+
11240
11517
  GGML_ASSERT(params->ith == 0);
11241
11518
  GGML_ASSERT(ggml_is_contiguous(dst));
11242
11519
 
@@ -11271,17 +11548,18 @@ static void ggml_compute_forward_get_rows_back_f32(
11271
11548
 
11272
11549
  static void ggml_compute_forward_get_rows_back(
11273
11550
  const struct ggml_compute_params * params,
11274
- const struct ggml_tensor * src0,
11275
- const struct ggml_tensor * src1,
11276
11551
  struct ggml_tensor * dst) {
11552
+
11553
+ const struct ggml_tensor * src0 = dst->src[0];
11554
+
11277
11555
  switch (src0->type) {
11278
11556
  case GGML_TYPE_F16:
11279
11557
  {
11280
- ggml_compute_forward_get_rows_back_f32_f16(params, src0, src1, dst);
11558
+ ggml_compute_forward_get_rows_back_f32_f16(params, dst);
11281
11559
  } break;
11282
11560
  case GGML_TYPE_F32:
11283
11561
  {
11284
- ggml_compute_forward_get_rows_back_f32(params, src0, src1, dst);
11562
+ ggml_compute_forward_get_rows_back_f32(params, dst);
11285
11563
  } break;
11286
11564
  default:
11287
11565
  {
@@ -11312,8 +11590,10 @@ static void ggml_compute_forward_get_rows_back(
11312
11590
 
11313
11591
  static void ggml_compute_forward_diag_f32(
11314
11592
  const struct ggml_compute_params * params,
11315
- const struct ggml_tensor * src0,
11316
11593
  struct ggml_tensor * dst) {
11594
+
11595
+ const struct ggml_tensor * src0 = dst->src[0];
11596
+
11317
11597
  GGML_ASSERT(params->ith == 0);
11318
11598
 
11319
11599
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
@@ -11352,12 +11632,14 @@ static void ggml_compute_forward_diag_f32(
11352
11632
 
11353
11633
  static void ggml_compute_forward_diag(
11354
11634
  const struct ggml_compute_params * params,
11355
- const struct ggml_tensor * src0,
11356
11635
  struct ggml_tensor * dst) {
11636
+
11637
+ const struct ggml_tensor * src0 = dst->src[0];
11638
+
11357
11639
  switch (src0->type) {
11358
11640
  case GGML_TYPE_F32:
11359
11641
  {
11360
- ggml_compute_forward_diag_f32(params, src0, dst);
11642
+ ggml_compute_forward_diag_f32(params, dst);
11361
11643
  } break;
11362
11644
  default:
11363
11645
  {
@@ -11370,10 +11652,11 @@ static void ggml_compute_forward_diag(
11370
11652
 
11371
11653
  static void ggml_compute_forward_diag_mask_f32(
11372
11654
  const struct ggml_compute_params * params,
11373
- const struct ggml_tensor * src0,
11374
11655
  struct ggml_tensor * dst,
11375
11656
  const float value) {
11376
11657
 
11658
+ const struct ggml_tensor * src0 = dst->src[0];
11659
+
11377
11660
  const int ith = params->ith;
11378
11661
  const int nth = params->nth;
11379
11662
 
@@ -11423,12 +11706,14 @@ static void ggml_compute_forward_diag_mask_f32(
11423
11706
 
11424
11707
  static void ggml_compute_forward_diag_mask_inf(
11425
11708
  const struct ggml_compute_params * params,
11426
- const struct ggml_tensor * src0,
11427
11709
  struct ggml_tensor * dst) {
11710
+
11711
+ const struct ggml_tensor * src0 = dst->src[0];
11712
+
11428
11713
  switch (src0->type) {
11429
11714
  case GGML_TYPE_F32:
11430
11715
  {
11431
- ggml_compute_forward_diag_mask_f32(params, src0, dst, -INFINITY);
11716
+ ggml_compute_forward_diag_mask_f32(params, dst, -INFINITY);
11432
11717
  } break;
11433
11718
  default:
11434
11719
  {
@@ -11439,12 +11724,14 @@ static void ggml_compute_forward_diag_mask_inf(
11439
11724
 
11440
11725
  static void ggml_compute_forward_diag_mask_zero(
11441
11726
  const struct ggml_compute_params * params,
11442
- const struct ggml_tensor * src0,
11443
11727
  struct ggml_tensor * dst) {
11728
+
11729
+ const struct ggml_tensor * src0 = dst->src[0];
11730
+
11444
11731
  switch (src0->type) {
11445
11732
  case GGML_TYPE_F32:
11446
11733
  {
11447
- ggml_compute_forward_diag_mask_f32(params, src0, dst, 0);
11734
+ ggml_compute_forward_diag_mask_f32(params, dst, 0);
11448
11735
  } break;
11449
11736
  default:
11450
11737
  {
@@ -11457,9 +11744,12 @@ static void ggml_compute_forward_diag_mask_zero(
11457
11744
 
11458
11745
  static void ggml_compute_forward_soft_max_f32(
11459
11746
  const struct ggml_compute_params * params,
11460
- const struct ggml_tensor * src0,
11461
- const struct ggml_tensor * src1,
11462
11747
  struct ggml_tensor * dst) {
11748
+
11749
+ const struct ggml_tensor * src0 = dst->src[0];
11750
+ const struct ggml_tensor * src1 = dst->src[1];
11751
+ const struct ggml_tensor * src2 = dst->src[2];
11752
+
11463
11753
  assert(ggml_is_contiguous(dst));
11464
11754
  assert(ggml_are_same_shape(src0, dst));
11465
11755
 
@@ -11467,16 +11757,29 @@ static void ggml_compute_forward_soft_max_f32(
11467
11757
  return;
11468
11758
  }
11469
11759
 
11470
- float scale = 1.0f;
11471
- memcpy(&scale, (float *) dst->op_params + 0, sizeof(float));
11760
+ float scale = 1.0f;
11761
+ float max_bias = 0.0f;
11762
+
11763
+ memcpy(&scale, (float *) dst->op_params + 0, sizeof(float));
11764
+ memcpy(&max_bias, (float *) dst->op_params + 1, sizeof(float));
11472
11765
 
11473
11766
  // TODO: handle transposed/permuted matrices
11474
11767
 
11475
11768
  const int ith = params->ith;
11476
11769
  const int nth = params->nth;
11477
11770
 
11771
+ GGML_TENSOR_UNARY_OP_LOCALS
11772
+
11478
11773
  const int64_t ne11 = src1 ? src1->ne[1] : 1;
11479
11774
 
11775
+ // TODO: is this supposed to be ceil instead of floor?
11776
+ // https://huggingface.co/mosaicml/mpt-7b/blob/main/attention.py#L370
11777
+ const uint32_t n_head_kv = ne02;
11778
+ const uint32_t n_head_log2 = 1u << (uint32_t) floor(log2(n_head_kv));
11779
+
11780
+ const float m0 = powf(2.0f, -(max_bias ) / n_head_log2);
11781
+ const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
11782
+
11480
11783
  const int nc = src0->ne[0];
11481
11784
  const int nr = ggml_nrows(src0);
11482
11785
 
@@ -11489,6 +11792,9 @@ static void ggml_compute_forward_soft_max_f32(
11489
11792
 
11490
11793
  float * wp = (float *) params->wdata + (nc + CACHE_LINE_SIZE_F32) * ith;
11491
11794
 
11795
+ // when max_bias <= 0.0f, src2 is not used and we default it to src0 to avoid branching
11796
+ float * pos = src2 ? (float *) src2->data : src0->data;
11797
+
11492
11798
  for (int i1 = ir0; i1 < ir1; i1++) {
11493
11799
  float * sp = (float *)((char *) src0->data + i1*src0->nb[1]);
11494
11800
  float * dp = (float *)((char *) dst->data + i1*dst->nb[1]);
@@ -11502,6 +11808,16 @@ static void ggml_compute_forward_soft_max_f32(
11502
11808
  ggml_vec_acc_f32(nc, wp, mp);
11503
11809
  }
11504
11810
 
11811
+ // ALiBi bias
11812
+ if (max_bias > 0.0f) {
11813
+ const uint32_t h = (i1/ne01)%ne02; // head
11814
+ const float slope = h < n_head_log2 ? powf(m0, h + 1) : powf(m1, 2*(h - n_head_log2) + 1);
11815
+
11816
+ for (int i = 0; i < nc; i++) {
11817
+ wp[i] = wp[i] + slope*pos[i];
11818
+ }
11819
+ }
11820
+
11505
11821
  #ifndef NDEBUG
11506
11822
  for (int i = 0; i < nc; ++i) {
11507
11823
  //printf("p[%d] = %f\n", i, p[i]);
@@ -11544,13 +11860,14 @@ static void ggml_compute_forward_soft_max_f32(
11544
11860
 
11545
11861
  static void ggml_compute_forward_soft_max(
11546
11862
  const struct ggml_compute_params * params,
11547
- const struct ggml_tensor * src0,
11548
- const struct ggml_tensor * src1,
11549
11863
  struct ggml_tensor * dst) {
11864
+
11865
+ const struct ggml_tensor * src0 = dst->src[0];
11866
+
11550
11867
  switch (src0->type) {
11551
11868
  case GGML_TYPE_F32:
11552
11869
  {
11553
- ggml_compute_forward_soft_max_f32(params, src0, src1, dst);
11870
+ ggml_compute_forward_soft_max_f32(params, dst);
11554
11871
  } break;
11555
11872
  default:
11556
11873
  {
@@ -11563,9 +11880,11 @@ static void ggml_compute_forward_soft_max(
11563
11880
 
11564
11881
  static void ggml_compute_forward_soft_max_back_f32(
11565
11882
  const struct ggml_compute_params * params,
11566
- const struct ggml_tensor * src0,
11567
- const struct ggml_tensor * src1,
11568
11883
  struct ggml_tensor * dst) {
11884
+
11885
+ const struct ggml_tensor * src0 = dst->src[0];
11886
+ const struct ggml_tensor * src1 = dst->src[1];
11887
+
11569
11888
  GGML_ASSERT(ggml_is_contiguous(src0));
11570
11889
  GGML_ASSERT(ggml_is_contiguous(src1));
11571
11890
  GGML_ASSERT(ggml_is_contiguous(dst));
@@ -11640,13 +11959,14 @@ static void ggml_compute_forward_soft_max_back_f32(
11640
11959
 
11641
11960
  static void ggml_compute_forward_soft_max_back(
11642
11961
  const struct ggml_compute_params * params,
11643
- const struct ggml_tensor * src0,
11644
- const struct ggml_tensor * src1,
11645
11962
  struct ggml_tensor * dst) {
11963
+
11964
+ const struct ggml_tensor * src0 = dst->src[0];
11965
+
11646
11966
  switch (src0->type) {
11647
11967
  case GGML_TYPE_F32:
11648
11968
  {
11649
- ggml_compute_forward_soft_max_back_f32(params, src0, src1, dst);
11969
+ ggml_compute_forward_soft_max_back_f32(params, dst);
11650
11970
  } break;
11651
11971
  default:
11652
11972
  {
@@ -11659,8 +11979,10 @@ static void ggml_compute_forward_soft_max_back(
11659
11979
 
11660
11980
  static void ggml_compute_forward_alibi_f32(
11661
11981
  const struct ggml_compute_params * params,
11662
- const struct ggml_tensor * src0,
11663
11982
  struct ggml_tensor * dst) {
11983
+
11984
+ const struct ggml_tensor * src0 = dst->src[0];
11985
+
11664
11986
  assert(params->ith == 0);
11665
11987
 
11666
11988
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
@@ -11694,22 +12016,20 @@ static void ggml_compute_forward_alibi_f32(
11694
12016
  const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
11695
12017
  const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);
11696
12018
 
11697
- for (int64_t i = 0; i < ne0; i++) {
11698
- for (int64_t j = 0; j < ne1; j++) {
11699
- for (int64_t k = 0; k < ne2_ne3; k++) {
11700
- float * const src = (float *)((char *) src0->data + i*nb0 + j*nb1 + k*nb2);
11701
- float * pdst = (float *)((char *) dst->data + i*nb0 + j*nb1 + k*nb2);
11702
-
11703
- // TODO: k*nb2 or k*nb3
11704
-
11705
- float m_k;
12019
+ for (int64_t k = 0; k < ne2_ne3; k++) {
12020
+ // TODO: k*nb2 or k*nb3
12021
+ float m_k;
11706
12022
 
11707
- if (k < n_heads_log2_floor) {
11708
- m_k = powf(m0, k + 1);
11709
- } else {
11710
- m_k = powf(m1, 2 * (k - n_heads_log2_floor) + 1);
11711
- }
12023
+ if (k < n_heads_log2_floor) {
12024
+ m_k = powf(m0, k + 1);
12025
+ } else {
12026
+ m_k = powf(m1, 2 * (k - n_heads_log2_floor) + 1);
12027
+ }
11712
12028
 
12029
+ for (int64_t i = 0; i < ne0; i++) {
12030
+ for (int64_t j = 0; j < ne1; j++) {
12031
+ float * const src = (float *)((char *) src0->data + i*nb0 + j*nb1 + k*nb2);
12032
+ float * pdst = (float *)((char *) dst->data + i*nb0 + j*nb1 + k*nb2);
11713
12033
  pdst[0] = i * m_k + src[0];
11714
12034
  }
11715
12035
  }
@@ -11718,8 +12038,10 @@ static void ggml_compute_forward_alibi_f32(
11718
12038
 
11719
12039
  static void ggml_compute_forward_alibi_f16(
11720
12040
  const struct ggml_compute_params * params,
11721
- const struct ggml_tensor * src0,
11722
12041
  struct ggml_tensor * dst) {
12042
+
12043
+ const struct ggml_tensor * src0 = dst->src[0];
12044
+
11723
12045
  assert(params->ith == 0);
11724
12046
 
11725
12047
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
@@ -11754,21 +12076,20 @@ static void ggml_compute_forward_alibi_f16(
11754
12076
  const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
11755
12077
  const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);
11756
12078
 
11757
- for (int i = 0; i < ne0; i++) {
11758
- for (int j = 0; j < ne1; j++) {
11759
- for (int k = 0; k < ne2_ne3; k++) {
11760
- ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i*nb0 + j*nb1 + k*nb2);
11761
- float * pdst = (float *)((char *) dst->data + i*nb0 + j*nb1 + k*nb2);
11762
-
11763
- // TODO: k*nb2 or k*nb3
12079
+ for (int k = 0; k < ne2_ne3; k++) {
12080
+ // TODO: k*nb2 or k*nb3
12081
+ float m_k;
11764
12082
 
11765
- float m_k;
12083
+ if (k < n_heads_log2_floor) {
12084
+ m_k = powf(m0, k + 1);
12085
+ } else {
12086
+ m_k = powf(m1, 2 * (k - n_heads_log2_floor) + 1);
12087
+ }
11766
12088
 
11767
- if (k < n_heads_log2_floor) {
11768
- m_k = powf(m0, k + 1);
11769
- } else {
11770
- m_k = powf(m1, 2 * (k - n_heads_log2_floor) + 1);
11771
- }
12089
+ for (int i = 0; i < ne0; i++) {
12090
+ for (int j = 0; j < ne1; j++) {
12091
+ ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i*nb0 + j*nb1 + k*nb2);
12092
+ float * pdst = (float *)((char *) dst->data + i*nb0 + j*nb1 + k*nb2);
11772
12093
 
11773
12094
  // we return F32
11774
12095
  pdst[0] = i * m_k + GGML_FP16_TO_FP32(src[0]);
@@ -11779,16 +12100,18 @@ static void ggml_compute_forward_alibi_f16(
11779
12100
 
11780
12101
  static void ggml_compute_forward_alibi(
11781
12102
  const struct ggml_compute_params * params,
11782
- const struct ggml_tensor * src0,
11783
12103
  struct ggml_tensor * dst) {
12104
+
12105
+ const struct ggml_tensor * src0 = dst->src[0];
12106
+
11784
12107
  switch (src0->type) {
11785
12108
  case GGML_TYPE_F16:
11786
12109
  {
11787
- ggml_compute_forward_alibi_f16(params, src0, dst);
12110
+ ggml_compute_forward_alibi_f16(params, dst);
11788
12111
  } break;
11789
12112
  case GGML_TYPE_F32:
11790
12113
  {
11791
- ggml_compute_forward_alibi_f32(params, src0, dst);
12114
+ ggml_compute_forward_alibi_f32(params, dst);
11792
12115
  } break;
11793
12116
  case GGML_TYPE_Q4_0:
11794
12117
  case GGML_TYPE_Q4_1:
@@ -11804,6 +12127,8 @@ static void ggml_compute_forward_alibi(
11804
12127
  case GGML_TYPE_IQ2_XXS:
11805
12128
  case GGML_TYPE_IQ2_XS:
11806
12129
  case GGML_TYPE_IQ3_XXS:
12130
+ case GGML_TYPE_IQ1_S:
12131
+ case GGML_TYPE_IQ4_NL:
11807
12132
  case GGML_TYPE_Q8_K:
11808
12133
  case GGML_TYPE_I8:
11809
12134
  case GGML_TYPE_I16:
@@ -11819,8 +12144,10 @@ static void ggml_compute_forward_alibi(
11819
12144
 
11820
12145
  static void ggml_compute_forward_clamp_f32(
11821
12146
  const struct ggml_compute_params * params,
11822
- const struct ggml_tensor * src0,
11823
12147
  struct ggml_tensor * dst) {
12148
+
12149
+ const struct ggml_tensor * src0 = dst->src[0];
12150
+
11824
12151
  assert(params->ith == 0);
11825
12152
 
11826
12153
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
@@ -11859,12 +12186,14 @@ static void ggml_compute_forward_clamp_f32(
11859
12186
 
11860
12187
  static void ggml_compute_forward_clamp(
11861
12188
  const struct ggml_compute_params * params,
11862
- const struct ggml_tensor * src0,
11863
12189
  struct ggml_tensor * dst) {
12190
+
12191
+ const struct ggml_tensor * src0 = dst->src[0];
12192
+
11864
12193
  switch (src0->type) {
11865
12194
  case GGML_TYPE_F32:
11866
12195
  {
11867
- ggml_compute_forward_clamp_f32(params, src0, dst);
12196
+ ggml_compute_forward_clamp_f32(params, dst);
11868
12197
  } break;
11869
12198
  case GGML_TYPE_F16:
11870
12199
  case GGML_TYPE_Q4_0:
@@ -11881,6 +12210,8 @@ static void ggml_compute_forward_clamp(
11881
12210
  case GGML_TYPE_IQ2_XXS:
11882
12211
  case GGML_TYPE_IQ2_XS:
11883
12212
  case GGML_TYPE_IQ3_XXS:
12213
+ case GGML_TYPE_IQ1_S:
12214
+ case GGML_TYPE_IQ4_NL:
11884
12215
  case GGML_TYPE_Q8_K:
11885
12216
  case GGML_TYPE_I8:
11886
12217
  case GGML_TYPE_I16:
@@ -11952,10 +12283,12 @@ GGML_CALL void ggml_rope_yarn_corr_dims(
11952
12283
 
11953
12284
  static void ggml_compute_forward_rope_f32(
11954
12285
  const struct ggml_compute_params * params,
11955
- const struct ggml_tensor * src0,
11956
- const struct ggml_tensor * src1,
11957
12286
  struct ggml_tensor * dst,
11958
12287
  const bool forward) {
12288
+
12289
+ const struct ggml_tensor * src0 = dst->src[0];
12290
+ const struct ggml_tensor * src1 = dst->src[1];
12291
+
11959
12292
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
11960
12293
  return;
11961
12294
  }
@@ -12128,10 +12461,12 @@ static void ggml_compute_forward_rope_f32(
12128
12461
 
12129
12462
  static void ggml_compute_forward_rope_f16(
12130
12463
  const struct ggml_compute_params * params,
12131
- const struct ggml_tensor * src0,
12132
- const struct ggml_tensor * src1,
12133
12464
  struct ggml_tensor * dst,
12134
12465
  const bool forward) {
12466
+
12467
+ const struct ggml_tensor * src0 = dst->src[0];
12468
+ const struct ggml_tensor * src1 = dst->src[1];
12469
+
12135
12470
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
12136
12471
  return;
12137
12472
  }
@@ -12293,17 +12628,18 @@ static void ggml_compute_forward_rope_f16(
12293
12628
 
12294
12629
  static void ggml_compute_forward_rope(
12295
12630
  const struct ggml_compute_params * params,
12296
- const struct ggml_tensor * src0,
12297
- const struct ggml_tensor * src1,
12298
12631
  struct ggml_tensor * dst) {
12632
+
12633
+ const struct ggml_tensor * src0 = dst->src[0];
12634
+
12299
12635
  switch (src0->type) {
12300
12636
  case GGML_TYPE_F16:
12301
12637
  {
12302
- ggml_compute_forward_rope_f16(params, src0, src1, dst, true);
12638
+ ggml_compute_forward_rope_f16(params, dst, true);
12303
12639
  } break;
12304
12640
  case GGML_TYPE_F32:
12305
12641
  {
12306
- ggml_compute_forward_rope_f32(params, src0, src1, dst, true);
12642
+ ggml_compute_forward_rope_f32(params, dst, true);
12307
12643
  } break;
12308
12644
  default:
12309
12645
  {
@@ -12316,17 +12652,18 @@ static void ggml_compute_forward_rope(
12316
12652
 
12317
12653
  static void ggml_compute_forward_rope_back(
12318
12654
  const struct ggml_compute_params * params,
12319
- const struct ggml_tensor * src0,
12320
- const struct ggml_tensor * src1,
12321
12655
  struct ggml_tensor * dst) {
12656
+
12657
+ const struct ggml_tensor * src0 = dst->src[0];
12658
+
12322
12659
  switch (src0->type) {
12323
12660
  case GGML_TYPE_F16:
12324
12661
  {
12325
- ggml_compute_forward_rope_f16(params, src0, src1, dst, false);
12662
+ ggml_compute_forward_rope_f16(params, dst, false);
12326
12663
  } break;
12327
12664
  case GGML_TYPE_F32:
12328
12665
  {
12329
- ggml_compute_forward_rope_f32(params, src0, src1, dst, false);
12666
+ ggml_compute_forward_rope_f32(params, dst, false);
12330
12667
  } break;
12331
12668
  default:
12332
12669
  {
@@ -12339,9 +12676,11 @@ static void ggml_compute_forward_rope_back(
12339
12676
 
12340
12677
  static void ggml_compute_forward_conv_transpose_1d_f16_f32(
12341
12678
  const struct ggml_compute_params * params,
12342
- const struct ggml_tensor * src0,
12343
- const struct ggml_tensor * src1,
12344
12679
  struct ggml_tensor * dst) {
12680
+
12681
+ const struct ggml_tensor * src0 = dst->src[0];
12682
+ const struct ggml_tensor * src1 = dst->src[1];
12683
+
12345
12684
  GGML_ASSERT(src0->type == GGML_TYPE_F16);
12346
12685
  GGML_ASSERT(src1->type == GGML_TYPE_F32);
12347
12686
  GGML_ASSERT( dst->type == GGML_TYPE_F32);
@@ -12436,9 +12775,11 @@ static void ggml_compute_forward_conv_transpose_1d_f16_f32(
12436
12775
 
12437
12776
  static void ggml_compute_forward_conv_transpose_1d_f32(
12438
12777
  const struct ggml_compute_params * params,
12439
- const struct ggml_tensor * src0,
12440
- const struct ggml_tensor * src1,
12441
12778
  struct ggml_tensor * dst) {
12779
+
12780
+ const struct ggml_tensor * src0 = dst->src[0];
12781
+ const struct ggml_tensor * src1 = dst->src[1];
12782
+
12442
12783
  GGML_ASSERT(src0->type == GGML_TYPE_F32);
12443
12784
  GGML_ASSERT(src1->type == GGML_TYPE_F32);
12444
12785
  GGML_ASSERT( dst->type == GGML_TYPE_F32);
@@ -12533,17 +12874,18 @@ static void ggml_compute_forward_conv_transpose_1d_f32(
12533
12874
 
12534
12875
  static void ggml_compute_forward_conv_transpose_1d(
12535
12876
  const struct ggml_compute_params * params,
12536
- const struct ggml_tensor * src0,
12537
- const struct ggml_tensor * src1,
12538
12877
  struct ggml_tensor * dst) {
12878
+
12879
+ const struct ggml_tensor * src0 = dst->src[0];
12880
+
12539
12881
  switch (src0->type) {
12540
12882
  case GGML_TYPE_F16:
12541
12883
  {
12542
- ggml_compute_forward_conv_transpose_1d_f16_f32(params, src0, src1, dst);
12884
+ ggml_compute_forward_conv_transpose_1d_f16_f32(params, dst);
12543
12885
  } break;
12544
12886
  case GGML_TYPE_F32:
12545
12887
  {
12546
- ggml_compute_forward_conv_transpose_1d_f32(params, src0, src1, dst);
12888
+ ggml_compute_forward_conv_transpose_1d_f32(params, dst);
12547
12889
  } break;
12548
12890
  default:
12549
12891
  {
@@ -12557,9 +12899,11 @@ static void ggml_compute_forward_conv_transpose_1d(
12557
12899
  // dst: result [N, OH, OW, IC*KH*KW]
12558
12900
  static void ggml_compute_forward_im2col_f32(
12559
12901
  const struct ggml_compute_params * params,
12560
- const struct ggml_tensor * src0,
12561
- const struct ggml_tensor * src1,
12562
12902
  struct ggml_tensor * dst) {
12903
+
12904
+ const struct ggml_tensor * src0 = dst->src[0];
12905
+ const struct ggml_tensor * src1 = dst->src[1];
12906
+
12563
12907
  GGML_ASSERT(src0->type == GGML_TYPE_F16);
12564
12908
  GGML_ASSERT(src1->type == GGML_TYPE_F32);
12565
12909
  GGML_ASSERT( dst->type == GGML_TYPE_F32);
@@ -12643,9 +12987,11 @@ static void ggml_compute_forward_im2col_f32(
12643
12987
  // dst: result [N, OH, OW, IC*KH*KW]
12644
12988
  static void ggml_compute_forward_im2col_f16(
12645
12989
  const struct ggml_compute_params * params,
12646
- const struct ggml_tensor * src0,
12647
- const struct ggml_tensor * src1,
12648
12990
  struct ggml_tensor * dst) {
12991
+
12992
+ const struct ggml_tensor * src0 = dst->src[0];
12993
+ const struct ggml_tensor * src1 = dst->src[1];
12994
+
12649
12995
  GGML_ASSERT(src0->type == GGML_TYPE_F16);
12650
12996
  GGML_ASSERT(src1->type == GGML_TYPE_F32);
12651
12997
  GGML_ASSERT( dst->type == GGML_TYPE_F16);
@@ -12725,17 +13071,15 @@ static void ggml_compute_forward_im2col_f16(
12725
13071
 
12726
13072
  static void ggml_compute_forward_im2col(
12727
13073
  const struct ggml_compute_params * params,
12728
- const struct ggml_tensor * src0,
12729
- const struct ggml_tensor * src1,
12730
13074
  struct ggml_tensor * dst) {
12731
13075
  switch (dst->type) {
12732
13076
  case GGML_TYPE_F16:
12733
13077
  {
12734
- ggml_compute_forward_im2col_f16(params, src0, src1, dst);
13078
+ ggml_compute_forward_im2col_f16(params, dst);
12735
13079
  } break;
12736
13080
  case GGML_TYPE_F32:
12737
13081
  {
12738
- ggml_compute_forward_im2col_f32(params, src0, src1, dst);
13082
+ ggml_compute_forward_im2col_f32(params, dst);
12739
13083
  } break;
12740
13084
  default:
12741
13085
  {
@@ -12749,9 +13093,11 @@ static void ggml_compute_forward_im2col(
12749
13093
 
12750
13094
  static void ggml_compute_forward_conv_transpose_2d(
12751
13095
  const struct ggml_compute_params * params,
12752
- const struct ggml_tensor * src0,
12753
- const struct ggml_tensor * src1,
12754
13096
  struct ggml_tensor * dst) {
13097
+
13098
+ const struct ggml_tensor * src0 = dst->src[0];
13099
+ const struct ggml_tensor * src1 = dst->src[1];
13100
+
12755
13101
  GGML_ASSERT(src0->type == GGML_TYPE_F16);
12756
13102
  GGML_ASSERT(src1->type == GGML_TYPE_F32);
12757
13103
  GGML_ASSERT( dst->type == GGML_TYPE_F32);
@@ -12855,9 +13201,11 @@ static void ggml_compute_forward_conv_transpose_2d(
12855
13201
  static void ggml_compute_forward_pool_1d_sk_p0(
12856
13202
  const struct ggml_compute_params * params,
12857
13203
  const enum ggml_op_pool op,
12858
- const struct ggml_tensor * src,
12859
13204
  const int k,
12860
13205
  struct ggml_tensor * dst) {
13206
+
13207
+ const struct ggml_tensor * src = dst->src[0];
13208
+
12861
13209
  assert(src->type == GGML_TYPE_F32);
12862
13210
  assert(params->ith == 0);
12863
13211
 
@@ -12906,7 +13254,6 @@ static void ggml_compute_forward_pool_1d_sk_p0(
12906
13254
 
12907
13255
  static void ggml_compute_forward_pool_1d(
12908
13256
  const struct ggml_compute_params * params,
12909
- const struct ggml_tensor * src0,
12910
13257
  struct ggml_tensor * dst) {
12911
13258
 
12912
13259
  const int32_t * opts = (const int32_t *)dst->op_params;
@@ -12917,15 +13264,17 @@ static void ggml_compute_forward_pool_1d(
12917
13264
  GGML_ASSERT(p0 == 0); // padding not supported
12918
13265
  GGML_ASSERT(k0 == s0); // only s = k supported
12919
13266
 
12920
- ggml_compute_forward_pool_1d_sk_p0(params, op, src0, k0, dst);
13267
+ ggml_compute_forward_pool_1d_sk_p0(params, op, k0, dst);
12921
13268
  }
12922
13269
 
12923
13270
  // ggml_compute_forward_pool_2d
12924
13271
 
12925
13272
  static void ggml_compute_forward_pool_2d(
12926
13273
  const struct ggml_compute_params * params,
12927
- const struct ggml_tensor * src,
12928
13274
  struct ggml_tensor * dst) {
13275
+
13276
+ const struct ggml_tensor * src = dst->src[0];
13277
+
12929
13278
  GGML_ASSERT(src->type == GGML_TYPE_F32);
12930
13279
  GGML_ASSERT(params->ith == 0);
12931
13280
 
@@ -12998,9 +13347,10 @@ static void ggml_compute_forward_pool_2d(
12998
13347
 
12999
13348
  static void ggml_compute_forward_upscale_f32(
13000
13349
  const struct ggml_compute_params * params,
13001
- const struct ggml_tensor * src0,
13002
13350
  struct ggml_tensor * dst) {
13003
13351
 
13352
+ const struct ggml_tensor * src0 = dst->src[0];
13353
+
13004
13354
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
13005
13355
  return;
13006
13356
  }
@@ -13037,12 +13387,14 @@ static void ggml_compute_forward_upscale_f32(
13037
13387
 
13038
13388
  static void ggml_compute_forward_upscale(
13039
13389
  const struct ggml_compute_params * params,
13040
- const struct ggml_tensor * src0,
13041
13390
  struct ggml_tensor * dst) {
13391
+
13392
+ const struct ggml_tensor * src0 = dst->src[0];
13393
+
13042
13394
  switch (src0->type) {
13043
13395
  case GGML_TYPE_F32:
13044
13396
  {
13045
- ggml_compute_forward_upscale_f32(params, src0, dst);
13397
+ ggml_compute_forward_upscale_f32(params, dst);
13046
13398
  } break;
13047
13399
  default:
13048
13400
  {
@@ -13055,9 +13407,10 @@ static void ggml_compute_forward_upscale(
13055
13407
 
13056
13408
  static void ggml_compute_forward_pad_f32(
13057
13409
  const struct ggml_compute_params * params,
13058
- const struct ggml_tensor * src0,
13059
13410
  struct ggml_tensor * dst) {
13060
13411
 
13412
+ const struct ggml_tensor * src0 = dst->src[0];
13413
+
13061
13414
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
13062
13415
  return;
13063
13416
  }
@@ -13095,12 +13448,14 @@ static void ggml_compute_forward_pad_f32(
13095
13448
 
13096
13449
  static void ggml_compute_forward_pad(
13097
13450
  const struct ggml_compute_params * params,
13098
- const struct ggml_tensor * src0,
13099
13451
  struct ggml_tensor * dst) {
13452
+
13453
+ const struct ggml_tensor * src0 = dst->src[0];
13454
+
13100
13455
  switch (src0->type) {
13101
13456
  case GGML_TYPE_F32:
13102
13457
  {
13103
- ggml_compute_forward_pad_f32(params, src0, dst);
13458
+ ggml_compute_forward_pad_f32(params, dst);
13104
13459
  } break;
13105
13460
  default:
13106
13461
  {
@@ -13113,9 +13468,10 @@ static void ggml_compute_forward_pad(
13113
13468
 
13114
13469
  static void ggml_compute_forward_argsort_f32(
13115
13470
  const struct ggml_compute_params * params,
13116
- const struct ggml_tensor * src0,
13117
13471
  struct ggml_tensor * dst) {
13118
13472
 
13473
+ const struct ggml_tensor * src0 = dst->src[0];
13474
+
13119
13475
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
13120
13476
  return;
13121
13477
  }
@@ -13155,13 +13511,14 @@ static void ggml_compute_forward_argsort_f32(
13155
13511
 
13156
13512
  static void ggml_compute_forward_argsort(
13157
13513
  const struct ggml_compute_params * params,
13158
- const struct ggml_tensor * src0,
13159
13514
  struct ggml_tensor * dst) {
13160
13515
 
13516
+ const struct ggml_tensor * src0 = dst->src[0];
13517
+
13161
13518
  switch (src0->type) {
13162
13519
  case GGML_TYPE_F32:
13163
13520
  {
13164
- ggml_compute_forward_argsort_f32(params, src0, dst);
13521
+ ggml_compute_forward_argsort_f32(params, dst);
13165
13522
  } break;
13166
13523
  default:
13167
13524
  {
@@ -13174,11 +13531,13 @@ static void ggml_compute_forward_argsort(
13174
13531
 
13175
13532
  static void ggml_compute_forward_flash_attn_f32(
13176
13533
  const struct ggml_compute_params * params,
13177
- const struct ggml_tensor * q,
13178
- const struct ggml_tensor * k,
13179
- const struct ggml_tensor * v,
13180
13534
  const bool masked,
13181
13535
  struct ggml_tensor * dst) {
13536
+
13537
+ const struct ggml_tensor * q = dst->src[0];
13538
+ const struct ggml_tensor * k = dst->src[1];
13539
+ const struct ggml_tensor * v = dst->src[2];
13540
+
13182
13541
  int64_t t0 = ggml_perf_time_us();
13183
13542
  UNUSED(t0);
13184
13543
 
@@ -13364,11 +13723,13 @@ static void ggml_compute_forward_flash_attn_f32(
13364
13723
 
13365
13724
  static void ggml_compute_forward_flash_attn_f16(
13366
13725
  const struct ggml_compute_params * params,
13367
- const struct ggml_tensor * q,
13368
- const struct ggml_tensor * k,
13369
- const struct ggml_tensor * v,
13370
13726
  const bool masked,
13371
13727
  struct ggml_tensor * dst) {
13728
+
13729
+ const struct ggml_tensor * q = dst->src[0];
13730
+ const struct ggml_tensor * k = dst->src[1];
13731
+ const struct ggml_tensor * v = dst->src[2];
13732
+
13372
13733
  int64_t t0 = ggml_perf_time_us();
13373
13734
  UNUSED(t0);
13374
13735
 
@@ -13590,19 +13951,19 @@ static void ggml_compute_forward_flash_attn_f16(
13590
13951
 
13591
13952
  static void ggml_compute_forward_flash_attn(
13592
13953
  const struct ggml_compute_params * params,
13593
- const struct ggml_tensor * q,
13594
- const struct ggml_tensor * k,
13595
- const struct ggml_tensor * v,
13596
13954
  const bool masked,
13597
13955
  struct ggml_tensor * dst) {
13956
+
13957
+ const struct ggml_tensor * q = dst->src[0];
13958
+
13598
13959
  switch (q->type) {
13599
13960
  case GGML_TYPE_F16:
13600
13961
  {
13601
- ggml_compute_forward_flash_attn_f16(params, q, k, v, masked, dst);
13962
+ ggml_compute_forward_flash_attn_f16(params, masked, dst);
13602
13963
  } break;
13603
13964
  case GGML_TYPE_F32:
13604
13965
  {
13605
- ggml_compute_forward_flash_attn_f32(params, q, k, v, masked, dst);
13966
+ ggml_compute_forward_flash_attn_f32(params, masked, dst);
13606
13967
  } break;
13607
13968
  default:
13608
13969
  {
@@ -13615,12 +13976,14 @@ static void ggml_compute_forward_flash_attn(
13615
13976
 
13616
13977
  static void ggml_compute_forward_flash_ff_f16(
13617
13978
  const struct ggml_compute_params * params,
13618
- const struct ggml_tensor * a, // F16
13619
- const struct ggml_tensor * b0, // F16 fc_w
13620
- const struct ggml_tensor * b1, // F32 fc_b
13621
- const struct ggml_tensor * c0, // F16 proj_w
13622
- const struct ggml_tensor * c1, // F32 proj_b
13623
13979
  struct ggml_tensor * dst) {
13980
+
13981
+ const struct ggml_tensor * a = dst->src[0]; // F16
13982
+ const struct ggml_tensor * b0 = dst->src[1]; // F16 fc_w
13983
+ const struct ggml_tensor * b1 = dst->src[2]; // F32 fc_b
13984
+ const struct ggml_tensor * c0 = dst->src[3]; // F16 proj_w
13985
+ const struct ggml_tensor * c1 = dst->src[4]; // F32 proj_b
13986
+
13624
13987
  int64_t t0 = ggml_perf_time_us();
13625
13988
  UNUSED(t0);
13626
13989
 
@@ -13748,16 +14111,14 @@ static void ggml_compute_forward_flash_ff_f16(
13748
14111
 
13749
14112
  static void ggml_compute_forward_flash_ff(
13750
14113
  const struct ggml_compute_params * params,
13751
- const struct ggml_tensor * a,
13752
- const struct ggml_tensor * b0,
13753
- const struct ggml_tensor * b1,
13754
- const struct ggml_tensor * c0,
13755
- const struct ggml_tensor * c1,
13756
14114
  struct ggml_tensor * dst) {
14115
+
14116
+ const struct ggml_tensor * b0 = dst->src[1];
14117
+
13757
14118
  switch (b0->type) {
13758
14119
  case GGML_TYPE_F16:
13759
14120
  {
13760
- ggml_compute_forward_flash_ff_f16(params, a, b0, b1, c0, c1, dst);
14121
+ ggml_compute_forward_flash_ff_f16(params, dst);
13761
14122
  } break;
13762
14123
  case GGML_TYPE_F32:
13763
14124
  {
@@ -13774,12 +14135,14 @@ static void ggml_compute_forward_flash_ff(
13774
14135
 
13775
14136
  static void ggml_compute_forward_flash_attn_back_f32(
13776
14137
  const struct ggml_compute_params * params,
13777
- const struct ggml_tensor * q,
13778
- const struct ggml_tensor * k,
13779
- const struct ggml_tensor * v,
13780
- const struct ggml_tensor * d,
13781
14138
  const bool masked,
13782
14139
  struct ggml_tensor * dst) {
14140
+
14141
+ const struct ggml_tensor * q = dst->src[0];
14142
+ const struct ggml_tensor * k = dst->src[1];
14143
+ const struct ggml_tensor * v = dst->src[2];
14144
+ const struct ggml_tensor * d = dst->src[3];
14145
+
13783
14146
  int64_t t0 = ggml_perf_time_us();
13784
14147
  UNUSED(t0);
13785
14148
 
@@ -14127,16 +14490,15 @@ static void ggml_compute_forward_flash_attn_back_f32(
14127
14490
 
14128
14491
  static void ggml_compute_forward_flash_attn_back(
14129
14492
  const struct ggml_compute_params * params,
14130
- const struct ggml_tensor * q,
14131
- const struct ggml_tensor * k,
14132
- const struct ggml_tensor * v,
14133
- const struct ggml_tensor * d,
14134
14493
  const bool masked,
14135
14494
  struct ggml_tensor * dst) {
14495
+
14496
+ const struct ggml_tensor * q = dst->src[0];
14497
+
14136
14498
  switch (q->type) {
14137
14499
  case GGML_TYPE_F32:
14138
14500
  {
14139
- ggml_compute_forward_flash_attn_back_f32(params, q, k, v, d, masked, dst);
14501
+ ggml_compute_forward_flash_attn_back_f32(params, masked, dst);
14140
14502
  } break;
14141
14503
  default:
14142
14504
  {
@@ -14149,8 +14511,10 @@ static void ggml_compute_forward_flash_attn_back(
14149
14511
 
14150
14512
  static void ggml_compute_forward_win_part_f32(
14151
14513
  const struct ggml_compute_params * params,
14152
- const struct ggml_tensor * src0,
14153
14514
  struct ggml_tensor * dst) {
14515
+
14516
+ const struct ggml_tensor * src0 = dst->src[0];
14517
+
14154
14518
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
14155
14519
  return;
14156
14520
  }
@@ -14193,12 +14557,14 @@ static void ggml_compute_forward_win_part_f32(
14193
14557
 
14194
14558
  static void ggml_compute_forward_win_part(
14195
14559
  const struct ggml_compute_params * params,
14196
- const struct ggml_tensor * src0,
14197
14560
  struct ggml_tensor * dst) {
14561
+
14562
+ const struct ggml_tensor * src0 = dst->src[0];
14563
+
14198
14564
  switch (src0->type) {
14199
14565
  case GGML_TYPE_F32:
14200
14566
  {
14201
- ggml_compute_forward_win_part_f32(params, src0, dst);
14567
+ ggml_compute_forward_win_part_f32(params, dst);
14202
14568
  } break;
14203
14569
  default:
14204
14570
  {
@@ -14211,8 +14577,10 @@ static void ggml_compute_forward_win_part(
14211
14577
 
14212
14578
  static void ggml_compute_forward_win_unpart_f32(
14213
14579
  const struct ggml_compute_params * params,
14214
- const struct ggml_tensor * src0,
14215
14580
  struct ggml_tensor * dst) {
14581
+
14582
+ const struct ggml_tensor * src0 = dst->src[0];
14583
+
14216
14584
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
14217
14585
  return;
14218
14586
  }
@@ -14253,12 +14621,14 @@ static void ggml_compute_forward_win_unpart_f32(
14253
14621
 
14254
14622
  static void ggml_compute_forward_win_unpart(
14255
14623
  const struct ggml_compute_params * params,
14256
- const struct ggml_tensor * src0,
14257
14624
  struct ggml_tensor * dst) {
14625
+
14626
+ const struct ggml_tensor * src0 = dst->src[0];
14627
+
14258
14628
  switch (src0->type) {
14259
14629
  case GGML_TYPE_F32:
14260
14630
  {
14261
- ggml_compute_forward_win_unpart_f32(params, src0, dst);
14631
+ ggml_compute_forward_win_unpart_f32(params, dst);
14262
14632
  } break;
14263
14633
  default:
14264
14634
  {
@@ -14271,58 +14641,58 @@ static void ggml_compute_forward_win_unpart(
14271
14641
 
14272
14642
  static void ggml_compute_forward_unary(
14273
14643
  const struct ggml_compute_params * params,
14274
- const struct ggml_tensor * src0,
14275
14644
  struct ggml_tensor * dst) {
14645
+
14276
14646
  const enum ggml_unary_op op = ggml_get_unary_op(dst);
14277
14647
 
14278
14648
  switch (op) {
14279
14649
  case GGML_UNARY_OP_ABS:
14280
14650
  {
14281
- ggml_compute_forward_abs(params, src0, dst);
14651
+ ggml_compute_forward_abs(params, dst);
14282
14652
  } break;
14283
14653
  case GGML_UNARY_OP_SGN:
14284
14654
  {
14285
- ggml_compute_forward_sgn(params, src0, dst);
14655
+ ggml_compute_forward_sgn(params, dst);
14286
14656
  } break;
14287
14657
  case GGML_UNARY_OP_NEG:
14288
14658
  {
14289
- ggml_compute_forward_neg(params, src0, dst);
14659
+ ggml_compute_forward_neg(params, dst);
14290
14660
  } break;
14291
14661
  case GGML_UNARY_OP_STEP:
14292
14662
  {
14293
- ggml_compute_forward_step(params, src0, dst);
14663
+ ggml_compute_forward_step(params, dst);
14294
14664
  } break;
14295
14665
  case GGML_UNARY_OP_TANH:
14296
14666
  {
14297
- ggml_compute_forward_tanh(params, src0, dst);
14667
+ ggml_compute_forward_tanh(params, dst);
14298
14668
  } break;
14299
14669
  case GGML_UNARY_OP_ELU:
14300
14670
  {
14301
- ggml_compute_forward_elu(params, src0, dst);
14671
+ ggml_compute_forward_elu(params, dst);
14302
14672
  } break;
14303
14673
  case GGML_UNARY_OP_RELU:
14304
14674
  {
14305
- ggml_compute_forward_relu(params, src0, dst);
14675
+ ggml_compute_forward_relu(params, dst);
14306
14676
  } break;
14307
14677
  case GGML_UNARY_OP_GELU:
14308
14678
  {
14309
- ggml_compute_forward_gelu(params, src0, dst);
14679
+ ggml_compute_forward_gelu(params, dst);
14310
14680
  } break;
14311
14681
  case GGML_UNARY_OP_GELU_QUICK:
14312
14682
  {
14313
- ggml_compute_forward_gelu_quick(params, src0, dst);
14683
+ ggml_compute_forward_gelu_quick(params, dst);
14314
14684
  } break;
14315
14685
  case GGML_UNARY_OP_SILU:
14316
14686
  {
14317
- ggml_compute_forward_silu(params, src0, dst);
14687
+ ggml_compute_forward_silu(params, dst);
14318
14688
  } break;
14319
14689
  case GGML_UNARY_OP_HARDSWISH:
14320
14690
  {
14321
- ggml_compute_forward_hardswish(params, src0, dst);
14691
+ ggml_compute_forward_hardswish(params, dst);
14322
14692
  } break;
14323
14693
  case GGML_UNARY_OP_HARDSIGMOID:
14324
14694
  {
14325
- ggml_compute_forward_hardsigmoid(params, src0, dst);
14695
+ ggml_compute_forward_hardsigmoid(params, dst);
14326
14696
  } break;
14327
14697
  default:
14328
14698
  {
@@ -14335,8 +14705,10 @@ static void ggml_compute_forward_unary(
14335
14705
 
14336
14706
  static void ggml_compute_forward_get_rel_pos_f16(
14337
14707
  const struct ggml_compute_params * params,
14338
- const struct ggml_tensor * src0,
14339
14708
  struct ggml_tensor * dst) {
14709
+
14710
+ const struct ggml_tensor * src0 = dst->src[0];
14711
+
14340
14712
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
14341
14713
  return;
14342
14714
  }
@@ -14362,12 +14734,14 @@ static void ggml_compute_forward_get_rel_pos_f16(
14362
14734
 
14363
14735
  static void ggml_compute_forward_get_rel_pos(
14364
14736
  const struct ggml_compute_params * params,
14365
- const struct ggml_tensor * src0,
14366
14737
  struct ggml_tensor * dst) {
14738
+
14739
+ const struct ggml_tensor * src0 = dst->src[0];
14740
+
14367
14741
  switch (src0->type) {
14368
14742
  case GGML_TYPE_F16:
14369
14743
  {
14370
- ggml_compute_forward_get_rel_pos_f16(params, src0, dst);
14744
+ ggml_compute_forward_get_rel_pos_f16(params, dst);
14371
14745
  } break;
14372
14746
  default:
14373
14747
  {
@@ -14380,11 +14754,12 @@ static void ggml_compute_forward_get_rel_pos(
14380
14754
 
14381
14755
  static void ggml_compute_forward_add_rel_pos_f32(
14382
14756
  const struct ggml_compute_params * params,
14383
- const struct ggml_tensor * src0,
14384
- const struct ggml_tensor * src1,
14385
- const struct ggml_tensor * src2,
14386
14757
  struct ggml_tensor * dst) {
14387
14758
 
14759
+ const struct ggml_tensor * src0 = dst->src[0];
14760
+ const struct ggml_tensor * src1 = dst->src[1];
14761
+ const struct ggml_tensor * src2 = dst->src[2];
14762
+
14388
14763
  const bool inplace = (bool) ((int32_t *) dst->op_params)[0];
14389
14764
  if (!inplace && params->type == GGML_TASK_INIT) {
14390
14765
  if (params->ith != 0) {
@@ -14448,14 +14823,14 @@ static void ggml_compute_forward_add_rel_pos_f32(
14448
14823
 
14449
14824
  static void ggml_compute_forward_add_rel_pos(
14450
14825
  const struct ggml_compute_params * params,
14451
- const struct ggml_tensor * src0,
14452
- const struct ggml_tensor * src1,
14453
- const struct ggml_tensor * src2,
14454
14826
  struct ggml_tensor * dst) {
14827
+
14828
+ const struct ggml_tensor * src0 = dst->src[0];
14829
+
14455
14830
  switch (src0->type) {
14456
14831
  case GGML_TYPE_F32:
14457
14832
  {
14458
- ggml_compute_forward_add_rel_pos_f32(params, src0, src1, src2, dst);
14833
+ ggml_compute_forward_add_rel_pos_f32(params, dst);
14459
14834
  } break;
14460
14835
  default:
14461
14836
  {
@@ -14468,9 +14843,11 @@ static void ggml_compute_forward_add_rel_pos(
14468
14843
 
14469
14844
  static void ggml_compute_forward_map_unary_f32(
14470
14845
  const struct ggml_compute_params * params,
14471
- const struct ggml_tensor * src0,
14472
14846
  struct ggml_tensor * dst,
14473
14847
  const ggml_unary_op_f32_t fun) {
14848
+
14849
+ const struct ggml_tensor * src0 = dst->src[0];
14850
+
14474
14851
  GGML_ASSERT(ggml_are_same_shape(src0, dst));
14475
14852
 
14476
14853
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
@@ -14492,13 +14869,15 @@ static void ggml_compute_forward_map_unary_f32(
14492
14869
 
14493
14870
  static void ggml_compute_forward_map_unary(
14494
14871
  const struct ggml_compute_params * params,
14495
- const struct ggml_tensor * src0,
14496
14872
  struct ggml_tensor * dst,
14497
14873
  const ggml_unary_op_f32_t fun) {
14874
+
14875
+ const struct ggml_tensor * src0 = dst->src[0];
14876
+
14498
14877
  switch (src0->type) {
14499
14878
  case GGML_TYPE_F32:
14500
14879
  {
14501
- ggml_compute_forward_map_unary_f32(params, src0, dst, fun);
14880
+ ggml_compute_forward_map_unary_f32(params, dst, fun);
14502
14881
  } break;
14503
14882
  default:
14504
14883
  {
@@ -14511,10 +14890,12 @@ static void ggml_compute_forward_map_unary(
14511
14890
 
14512
14891
  static void ggml_compute_forward_map_binary_f32(
14513
14892
  const struct ggml_compute_params * params,
14514
- const struct ggml_tensor * src0,
14515
- const struct ggml_tensor * src1,
14516
14893
  struct ggml_tensor * dst,
14517
14894
  const ggml_binary_op_f32_t fun) {
14895
+
14896
+ const struct ggml_tensor * src0 = dst->src[0];
14897
+ const struct ggml_tensor * src1 = dst->src[1];
14898
+
14518
14899
  assert(params->ith == 0);
14519
14900
  assert(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));
14520
14901
 
@@ -14539,14 +14920,15 @@ static void ggml_compute_forward_map_binary_f32(
14539
14920
 
14540
14921
  static void ggml_compute_forward_map_binary(
14541
14922
  const struct ggml_compute_params * params,
14542
- const struct ggml_tensor * src0,
14543
- const struct ggml_tensor * src1,
14544
14923
  struct ggml_tensor * dst,
14545
14924
  const ggml_binary_op_f32_t fun) {
14925
+
14926
+ const struct ggml_tensor * src0 = dst->src[0];
14927
+
14546
14928
  switch (src0->type) {
14547
14929
  case GGML_TYPE_F32:
14548
14930
  {
14549
- ggml_compute_forward_map_binary_f32(params, src0, src1, dst, fun);
14931
+ ggml_compute_forward_map_binary_f32(params, dst, fun);
14550
14932
  } break;
14551
14933
  default:
14552
14934
  {
@@ -14559,9 +14941,11 @@ static void ggml_compute_forward_map_binary(
14559
14941
 
14560
14942
  static void ggml_compute_forward_map_custom1_f32(
14561
14943
  const struct ggml_compute_params * params,
14562
- const struct ggml_tensor * a,
14563
14944
  struct ggml_tensor * dst,
14564
14945
  const ggml_custom1_op_f32_t fun) {
14946
+
14947
+ const struct ggml_tensor * a = dst->src[0];
14948
+
14565
14949
  assert(params->ith == 0);
14566
14950
 
14567
14951
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
@@ -14575,10 +14959,12 @@ static void ggml_compute_forward_map_custom1_f32(
14575
14959
 
14576
14960
  static void ggml_compute_forward_map_custom2_f32(
14577
14961
  const struct ggml_compute_params * params,
14578
- const struct ggml_tensor * a,
14579
- const struct ggml_tensor * b,
14580
14962
  struct ggml_tensor * dst,
14581
14963
  const ggml_custom2_op_f32_t fun) {
14964
+
14965
+ const struct ggml_tensor * a = dst->src[0];
14966
+ const struct ggml_tensor * b = dst->src[1];
14967
+
14582
14968
  assert(params->ith == 0);
14583
14969
 
14584
14970
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
@@ -14592,11 +14978,13 @@ static void ggml_compute_forward_map_custom2_f32(
14592
14978
 
14593
14979
  static void ggml_compute_forward_map_custom3_f32(
14594
14980
  const struct ggml_compute_params * params,
14595
- const struct ggml_tensor * a,
14596
- const struct ggml_tensor * b,
14597
- const struct ggml_tensor * c,
14598
14981
  struct ggml_tensor * dst,
14599
14982
  const ggml_custom3_op_f32_t fun) {
14983
+
14984
+ const struct ggml_tensor * a = dst->src[0];
14985
+ const struct ggml_tensor * b = dst->src[1];
14986
+ const struct ggml_tensor * c = dst->src[2];
14987
+
14600
14988
  assert(params->ith == 0);
14601
14989
 
14602
14990
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
@@ -14610,8 +14998,10 @@ static void ggml_compute_forward_map_custom3_f32(
14610
14998
 
14611
14999
  static void ggml_compute_forward_map_custom1(
14612
15000
  const struct ggml_compute_params * params,
14613
- const struct ggml_tensor * a,
14614
15001
  struct ggml_tensor * dst) {
15002
+
15003
+ const struct ggml_tensor * a = dst->src[0];
15004
+
14615
15005
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
14616
15006
  return;
14617
15007
  }
@@ -14625,9 +15015,11 @@ static void ggml_compute_forward_map_custom1(
14625
15015
 
14626
15016
  static void ggml_compute_forward_map_custom2(
14627
15017
  const struct ggml_compute_params * params,
14628
- const struct ggml_tensor * a,
14629
- const struct ggml_tensor * b,
14630
15018
  struct ggml_tensor * dst) {
15019
+
15020
+ const struct ggml_tensor * a = dst->src[0];
15021
+ const struct ggml_tensor * b = dst->src[1];
15022
+
14631
15023
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
14632
15024
  return;
14633
15025
  }
@@ -14641,10 +15033,12 @@ static void ggml_compute_forward_map_custom2(
14641
15033
 
14642
15034
  static void ggml_compute_forward_map_custom3(
14643
15035
  const struct ggml_compute_params * params,
14644
- const struct ggml_tensor * a,
14645
- const struct ggml_tensor * b,
14646
- const struct ggml_tensor * c,
14647
15036
  struct ggml_tensor * dst) {
15037
+
15038
+ const struct ggml_tensor * a = dst->src[0];
15039
+ const struct ggml_tensor * b = dst->src[1];
15040
+ const struct ggml_tensor * c = dst->src[2];
15041
+
14648
15042
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
14649
15043
  return;
14650
15044
  }
@@ -14658,9 +15052,11 @@ static void ggml_compute_forward_map_custom3(
14658
15052
 
14659
15053
  static void ggml_compute_forward_cross_entropy_loss_f32(
14660
15054
  const struct ggml_compute_params * params,
14661
- const struct ggml_tensor * src0,
14662
- const struct ggml_tensor * src1,
14663
15055
  struct ggml_tensor * dst) {
15056
+
15057
+ const struct ggml_tensor * src0 = dst->src[0];
15058
+ const struct ggml_tensor * src1 = dst->src[1];
15059
+
14664
15060
  GGML_ASSERT(ggml_is_contiguous(src0));
14665
15061
  GGML_ASSERT(ggml_is_contiguous(src1));
14666
15062
  GGML_ASSERT(ggml_is_scalar(dst));
@@ -14764,13 +15160,14 @@ static void ggml_compute_forward_cross_entropy_loss_f32(
14764
15160
 
14765
15161
  static void ggml_compute_forward_cross_entropy_loss(
14766
15162
  const struct ggml_compute_params * params,
14767
- const struct ggml_tensor * src0,
14768
- const struct ggml_tensor * src1,
14769
15163
  struct ggml_tensor * dst) {
15164
+
15165
+ const struct ggml_tensor * src0 = dst->src[0];
15166
+
14770
15167
  switch (src0->type) {
14771
15168
  case GGML_TYPE_F32:
14772
15169
  {
14773
- ggml_compute_forward_cross_entropy_loss_f32(params, src0, src1, dst);
15170
+ ggml_compute_forward_cross_entropy_loss_f32(params, dst);
14774
15171
  } break;
14775
15172
  default:
14776
15173
  {
@@ -14783,10 +15180,12 @@ static void ggml_compute_forward_cross_entropy_loss(
14783
15180
 
14784
15181
  static void ggml_compute_forward_cross_entropy_loss_back_f32(
14785
15182
  const struct ggml_compute_params * params,
14786
- const struct ggml_tensor * src0,
14787
- const struct ggml_tensor * src1,
14788
- const struct ggml_tensor * opt0,
14789
15183
  struct ggml_tensor * dst) {
15184
+
15185
+ const struct ggml_tensor * src0 = dst->src[0];
15186
+ const struct ggml_tensor * src1 = dst->src[1];
15187
+ const struct ggml_tensor * opt0 = dst->src[2];
15188
+
14790
15189
  GGML_ASSERT(ggml_is_contiguous(dst));
14791
15190
  GGML_ASSERT(ggml_is_contiguous(src0));
14792
15191
  GGML_ASSERT(ggml_is_contiguous(src1));
@@ -14873,14 +15272,14 @@ static void ggml_compute_forward_cross_entropy_loss_back_f32(
14873
15272
 
14874
15273
  static void ggml_compute_forward_cross_entropy_loss_back(
14875
15274
  const struct ggml_compute_params * params,
14876
- const struct ggml_tensor * src0,
14877
- const struct ggml_tensor * src1,
14878
- const struct ggml_tensor * opt0,
14879
15275
  struct ggml_tensor * dst) {
15276
+
15277
+ const struct ggml_tensor * src0 = dst->src[0];
15278
+
14880
15279
  switch (src0->type) {
14881
15280
  case GGML_TYPE_F32:
14882
15281
  {
14883
- ggml_compute_forward_cross_entropy_loss_back_f32(params, src0, src1, opt0, dst);
15282
+ ggml_compute_forward_cross_entropy_loss_back_f32(params, dst);
14884
15283
  } break;
14885
15284
  default:
14886
15285
  {
@@ -14928,312 +15327,312 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
14928
15327
  switch (tensor->op) {
14929
15328
  case GGML_OP_DUP:
14930
15329
  {
14931
- ggml_compute_forward_dup(params, tensor->src[0], tensor);
15330
+ ggml_compute_forward_dup(params, tensor);
14932
15331
  } break;
14933
15332
  case GGML_OP_ADD:
14934
15333
  {
14935
- ggml_compute_forward_add(params, tensor->src[0], tensor->src[1], tensor);
15334
+ ggml_compute_forward_add(params, tensor);
14936
15335
  } break;
14937
15336
  case GGML_OP_ADD1:
14938
15337
  {
14939
- ggml_compute_forward_add1(params, tensor->src[0], tensor->src[1], tensor);
15338
+ ggml_compute_forward_add1(params, tensor);
14940
15339
  } break;
14941
15340
  case GGML_OP_ACC:
14942
15341
  {
14943
- ggml_compute_forward_acc(params, tensor->src[0], tensor->src[1], tensor);
15342
+ ggml_compute_forward_acc(params, tensor);
14944
15343
  } break;
14945
15344
  case GGML_OP_SUB:
14946
15345
  {
14947
- ggml_compute_forward_sub(params, tensor->src[0], tensor->src[1], tensor);
15346
+ ggml_compute_forward_sub(params, tensor);
14948
15347
  } break;
14949
15348
  case GGML_OP_MUL:
14950
15349
  {
14951
- ggml_compute_forward_mul(params, tensor->src[0], tensor->src[1], tensor);
15350
+ ggml_compute_forward_mul(params, tensor);
14952
15351
  } break;
14953
15352
  case GGML_OP_DIV:
14954
15353
  {
14955
- ggml_compute_forward_div(params, tensor->src[0], tensor->src[1], tensor);
15354
+ ggml_compute_forward_div(params, tensor);
14956
15355
  } break;
14957
15356
  case GGML_OP_SQR:
14958
15357
  {
14959
- ggml_compute_forward_sqr(params, tensor->src[0], tensor);
15358
+ ggml_compute_forward_sqr(params, tensor);
14960
15359
  } break;
14961
15360
  case GGML_OP_SQRT:
14962
15361
  {
14963
- ggml_compute_forward_sqrt(params, tensor->src[0], tensor);
15362
+ ggml_compute_forward_sqrt(params, tensor);
14964
15363
  } break;
14965
15364
  case GGML_OP_LOG:
14966
15365
  {
14967
- ggml_compute_forward_log(params, tensor->src[0], tensor);
15366
+ ggml_compute_forward_log(params, tensor);
14968
15367
  } break;
14969
15368
  case GGML_OP_SUM:
14970
15369
  {
14971
- ggml_compute_forward_sum(params, tensor->src[0], tensor);
15370
+ ggml_compute_forward_sum(params, tensor);
14972
15371
  } break;
14973
15372
  case GGML_OP_SUM_ROWS:
14974
15373
  {
14975
- ggml_compute_forward_sum_rows(params, tensor->src[0], tensor);
15374
+ ggml_compute_forward_sum_rows(params, tensor);
14976
15375
  } break;
14977
15376
  case GGML_OP_MEAN:
14978
15377
  {
14979
- ggml_compute_forward_mean(params, tensor->src[0], tensor);
15378
+ ggml_compute_forward_mean(params, tensor);
14980
15379
  } break;
14981
15380
  case GGML_OP_ARGMAX:
14982
15381
  {
14983
- ggml_compute_forward_argmax(params, tensor->src[0], tensor);
15382
+ ggml_compute_forward_argmax(params, tensor);
14984
15383
  } break;
14985
15384
  case GGML_OP_REPEAT:
14986
15385
  {
14987
- ggml_compute_forward_repeat(params, tensor->src[0], tensor);
15386
+ ggml_compute_forward_repeat(params, tensor);
14988
15387
  } break;
14989
15388
  case GGML_OP_REPEAT_BACK:
14990
15389
  {
14991
- ggml_compute_forward_repeat_back(params, tensor->src[0], tensor);
15390
+ ggml_compute_forward_repeat_back(params, tensor);
14992
15391
  } break;
14993
15392
  case GGML_OP_CONCAT:
14994
15393
  {
14995
- ggml_compute_forward_concat(params, tensor->src[0], tensor->src[1], tensor);
15394
+ ggml_compute_forward_concat(params, tensor);
14996
15395
  } break;
14997
15396
  case GGML_OP_SILU_BACK:
14998
15397
  {
14999
- ggml_compute_forward_silu_back(params, tensor->src[0], tensor->src[1], tensor);
15398
+ ggml_compute_forward_silu_back(params, tensor);
15000
15399
  } break;
15001
15400
  case GGML_OP_NORM:
15002
15401
  {
15003
- ggml_compute_forward_norm(params, tensor->src[0], tensor);
15402
+ ggml_compute_forward_norm(params, tensor);
15004
15403
  } break;
15005
15404
  case GGML_OP_RMS_NORM:
15006
15405
  {
15007
- ggml_compute_forward_rms_norm(params, tensor->src[0], tensor);
15406
+ ggml_compute_forward_rms_norm(params, tensor);
15008
15407
  } break;
15009
15408
  case GGML_OP_RMS_NORM_BACK:
15010
15409
  {
15011
- ggml_compute_forward_rms_norm_back(params, tensor->src[0], tensor->src[1], tensor);
15410
+ ggml_compute_forward_rms_norm_back(params, tensor);
15012
15411
  } break;
15013
15412
  case GGML_OP_GROUP_NORM:
15014
15413
  {
15015
- ggml_compute_forward_group_norm(params, tensor->src[0], tensor);
15414
+ ggml_compute_forward_group_norm(params, tensor);
15016
15415
  } break;
15017
15416
  case GGML_OP_MUL_MAT:
15018
15417
  {
15019
- ggml_compute_forward_mul_mat(params, tensor->src[0], tensor->src[1], tensor);
15418
+ ggml_compute_forward_mul_mat(params, tensor);
15020
15419
  } break;
15021
15420
  case GGML_OP_MUL_MAT_ID:
15022
15421
  {
15023
- ggml_compute_forward_mul_mat_id(params, tensor->src[0], tensor->src[1], tensor);
15422
+ ggml_compute_forward_mul_mat_id(params, tensor);
15024
15423
  } break;
15025
15424
  case GGML_OP_OUT_PROD:
15026
15425
  {
15027
- ggml_compute_forward_out_prod(params, tensor->src[0], tensor->src[1], tensor);
15426
+ ggml_compute_forward_out_prod(params, tensor);
15028
15427
  } break;
15029
15428
  case GGML_OP_SCALE:
15030
15429
  {
15031
- ggml_compute_forward_scale(params, tensor->src[0], tensor);
15430
+ ggml_compute_forward_scale(params, tensor);
15032
15431
  } break;
15033
15432
  case GGML_OP_SET:
15034
15433
  {
15035
- ggml_compute_forward_set(params, tensor->src[0], tensor->src[1], tensor);
15434
+ ggml_compute_forward_set(params, tensor);
15036
15435
  } break;
15037
15436
  case GGML_OP_CPY:
15038
15437
  {
15039
- ggml_compute_forward_cpy(params, tensor->src[0], tensor);
15438
+ ggml_compute_forward_cpy(params, tensor);
15040
15439
  } break;
15041
15440
  case GGML_OP_CONT:
15042
15441
  {
15043
- ggml_compute_forward_cont(params, tensor->src[0], tensor);
15442
+ ggml_compute_forward_cont(params, tensor);
15044
15443
  } break;
15045
15444
  case GGML_OP_RESHAPE:
15046
15445
  {
15047
- ggml_compute_forward_reshape(params, tensor->src[0], tensor);
15446
+ ggml_compute_forward_reshape(params, tensor);
15048
15447
  } break;
15049
15448
  case GGML_OP_VIEW:
15050
15449
  {
15051
- ggml_compute_forward_view(params, tensor->src[0]);
15450
+ ggml_compute_forward_view(params, tensor);
15052
15451
  } break;
15053
15452
  case GGML_OP_PERMUTE:
15054
15453
  {
15055
- ggml_compute_forward_permute(params, tensor->src[0]);
15454
+ ggml_compute_forward_permute(params, tensor);
15056
15455
  } break;
15057
15456
  case GGML_OP_TRANSPOSE:
15058
15457
  {
15059
- ggml_compute_forward_transpose(params, tensor->src[0]);
15458
+ ggml_compute_forward_transpose(params, tensor);
15060
15459
  } break;
15061
15460
  case GGML_OP_GET_ROWS:
15062
15461
  {
15063
- ggml_compute_forward_get_rows(params, tensor->src[0], tensor->src[1], tensor);
15462
+ ggml_compute_forward_get_rows(params, tensor);
15064
15463
  } break;
15065
15464
  case GGML_OP_GET_ROWS_BACK:
15066
15465
  {
15067
- ggml_compute_forward_get_rows_back(params, tensor->src[0], tensor->src[1], tensor);
15466
+ ggml_compute_forward_get_rows_back(params, tensor);
15068
15467
  } break;
15069
15468
  case GGML_OP_DIAG:
15070
15469
  {
15071
- ggml_compute_forward_diag(params, tensor->src[0], tensor);
15470
+ ggml_compute_forward_diag(params, tensor);
15072
15471
  } break;
15073
15472
  case GGML_OP_DIAG_MASK_INF:
15074
15473
  {
15075
- ggml_compute_forward_diag_mask_inf(params, tensor->src[0], tensor);
15474
+ ggml_compute_forward_diag_mask_inf(params, tensor);
15076
15475
  } break;
15077
15476
  case GGML_OP_DIAG_MASK_ZERO:
15078
15477
  {
15079
- ggml_compute_forward_diag_mask_zero(params, tensor->src[0], tensor);
15478
+ ggml_compute_forward_diag_mask_zero(params, tensor);
15080
15479
  } break;
15081
15480
  case GGML_OP_SOFT_MAX:
15082
15481
  {
15083
- ggml_compute_forward_soft_max(params, tensor->src[0], tensor->src[1], tensor);
15482
+ ggml_compute_forward_soft_max(params, tensor);
15084
15483
  } break;
15085
15484
  case GGML_OP_SOFT_MAX_BACK:
15086
15485
  {
15087
- ggml_compute_forward_soft_max_back(params, tensor->src[0], tensor->src[1], tensor);
15486
+ ggml_compute_forward_soft_max_back(params, tensor);
15088
15487
  } break;
15089
15488
  case GGML_OP_ROPE:
15090
15489
  {
15091
- ggml_compute_forward_rope(params, tensor->src[0], tensor->src[1], tensor);
15490
+ ggml_compute_forward_rope(params, tensor);
15092
15491
  } break;
15093
15492
  case GGML_OP_ROPE_BACK:
15094
15493
  {
15095
- ggml_compute_forward_rope_back(params, tensor->src[0], tensor->src[1], tensor);
15494
+ ggml_compute_forward_rope_back(params, tensor);
15096
15495
  } break;
15097
15496
  case GGML_OP_ALIBI:
15098
15497
  {
15099
- ggml_compute_forward_alibi(params, tensor->src[0], tensor);
15498
+ ggml_compute_forward_alibi(params, tensor);
15100
15499
  } break;
15101
15500
  case GGML_OP_CLAMP:
15102
15501
  {
15103
- ggml_compute_forward_clamp(params, tensor->src[0], tensor);
15502
+ ggml_compute_forward_clamp(params, tensor);
15104
15503
  } break;
15105
15504
  case GGML_OP_CONV_TRANSPOSE_1D:
15106
15505
  {
15107
- ggml_compute_forward_conv_transpose_1d(params, tensor->src[0], tensor->src[1], tensor);
15506
+ ggml_compute_forward_conv_transpose_1d(params, tensor);
15108
15507
  } break;
15109
15508
  case GGML_OP_IM2COL:
15110
15509
  {
15111
- ggml_compute_forward_im2col(params, tensor->src[0], tensor->src[1], tensor);
15510
+ ggml_compute_forward_im2col(params, tensor);
15112
15511
  } break;
15113
15512
  case GGML_OP_CONV_TRANSPOSE_2D:
15114
15513
  {
15115
- ggml_compute_forward_conv_transpose_2d(params, tensor->src[0], tensor->src[1], tensor);
15514
+ ggml_compute_forward_conv_transpose_2d(params, tensor);
15116
15515
  } break;
15117
15516
  case GGML_OP_POOL_1D:
15118
15517
  {
15119
- ggml_compute_forward_pool_1d(params, tensor->src[0], tensor);
15518
+ ggml_compute_forward_pool_1d(params, tensor);
15120
15519
  } break;
15121
15520
  case GGML_OP_POOL_2D:
15122
15521
  {
15123
- ggml_compute_forward_pool_2d(params, tensor->src[0], tensor);
15522
+ ggml_compute_forward_pool_2d(params, tensor);
15124
15523
  } break;
15125
15524
  case GGML_OP_UPSCALE:
15126
15525
  {
15127
- ggml_compute_forward_upscale(params, tensor->src[0], tensor);
15526
+ ggml_compute_forward_upscale(params, tensor);
15128
15527
  } break;
15129
15528
  case GGML_OP_PAD:
15130
15529
  {
15131
- ggml_compute_forward_pad(params, tensor->src[0], tensor);
15530
+ ggml_compute_forward_pad(params, tensor);
15132
15531
  } break;
15133
15532
  case GGML_OP_ARGSORT:
15134
15533
  {
15135
- ggml_compute_forward_argsort(params, tensor->src[0], tensor);
15534
+ ggml_compute_forward_argsort(params, tensor);
15136
15535
  } break;
15137
15536
  case GGML_OP_LEAKY_RELU:
15138
15537
  {
15139
- ggml_compute_forward_leaky_relu(params, tensor->src[0], tensor);
15538
+ ggml_compute_forward_leaky_relu(params, tensor);
15140
15539
  } break;
15141
15540
  case GGML_OP_FLASH_ATTN:
15142
15541
  {
15143
15542
  const int32_t t = ggml_get_op_params_i32(tensor, 0);
15144
15543
  GGML_ASSERT(t == 0 || t == 1);
15145
15544
  const bool masked = t != 0;
15146
- ggml_compute_forward_flash_attn(params, tensor->src[0], tensor->src[1], tensor->src[2], masked, tensor);
15545
+ ggml_compute_forward_flash_attn(params, masked, tensor);
15147
15546
  } break;
15148
15547
  case GGML_OP_FLASH_FF:
15149
15548
  {
15150
- ggml_compute_forward_flash_ff(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor->src[3], tensor->src[4], tensor);
15549
+ ggml_compute_forward_flash_ff(params, tensor);
15151
15550
  } break;
15152
15551
  case GGML_OP_FLASH_ATTN_BACK:
15153
15552
  {
15154
15553
  int32_t t = ggml_get_op_params_i32(tensor, 0);
15155
15554
  GGML_ASSERT(t == 0 || t == 1);
15156
15555
  bool masked = t != 0;
15157
- ggml_compute_forward_flash_attn_back(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor->src[3], masked, tensor);
15556
+ ggml_compute_forward_flash_attn_back(params, masked, tensor);
15158
15557
  } break;
15159
15558
  case GGML_OP_WIN_PART:
15160
15559
  {
15161
- ggml_compute_forward_win_part(params, tensor->src[0], tensor);
15560
+ ggml_compute_forward_win_part(params, tensor);
15162
15561
  } break;
15163
15562
  case GGML_OP_WIN_UNPART:
15164
15563
  {
15165
- ggml_compute_forward_win_unpart(params, tensor->src[0], tensor);
15564
+ ggml_compute_forward_win_unpart(params, tensor);
15166
15565
  } break;
15167
15566
  case GGML_OP_UNARY:
15168
15567
  {
15169
- ggml_compute_forward_unary(params, tensor->src[0], tensor);
15568
+ ggml_compute_forward_unary(params, tensor);
15170
15569
  } break;
15171
15570
  case GGML_OP_GET_REL_POS:
15172
15571
  {
15173
- ggml_compute_forward_get_rel_pos(params, tensor->src[0], tensor);
15572
+ ggml_compute_forward_get_rel_pos(params, tensor);
15174
15573
  } break;
15175
15574
  case GGML_OP_ADD_REL_POS:
15176
15575
  {
15177
- ggml_compute_forward_add_rel_pos(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor);
15576
+ ggml_compute_forward_add_rel_pos(params, tensor);
15178
15577
  } break;
15179
15578
  case GGML_OP_MAP_UNARY:
15180
15579
  {
15181
15580
  ggml_unary_op_f32_t fun;
15182
15581
  memcpy(&fun, tensor->op_params, sizeof(fun));
15183
- ggml_compute_forward_map_unary(params, tensor->src[0], tensor, fun);
15582
+ ggml_compute_forward_map_unary(params, tensor, fun);
15184
15583
  }
15185
15584
  break;
15186
15585
  case GGML_OP_MAP_BINARY:
15187
15586
  {
15188
15587
  ggml_binary_op_f32_t fun;
15189
15588
  memcpy(&fun, tensor->op_params, sizeof(fun));
15190
- ggml_compute_forward_map_binary(params, tensor->src[0], tensor->src[1], tensor, fun);
15589
+ ggml_compute_forward_map_binary(params, tensor, fun);
15191
15590
  }
15192
15591
  break;
15193
15592
  case GGML_OP_MAP_CUSTOM1_F32:
15194
15593
  {
15195
15594
  ggml_custom1_op_f32_t fun;
15196
15595
  memcpy(&fun, tensor->op_params, sizeof(fun));
15197
- ggml_compute_forward_map_custom1_f32(params, tensor->src[0], tensor, fun);
15596
+ ggml_compute_forward_map_custom1_f32(params, tensor, fun);
15198
15597
  }
15199
15598
  break;
15200
15599
  case GGML_OP_MAP_CUSTOM2_F32:
15201
15600
  {
15202
15601
  ggml_custom2_op_f32_t fun;
15203
15602
  memcpy(&fun, tensor->op_params, sizeof(fun));
15204
- ggml_compute_forward_map_custom2_f32(params, tensor->src[0], tensor->src[1], tensor, fun);
15603
+ ggml_compute_forward_map_custom2_f32(params, tensor, fun);
15205
15604
  }
15206
15605
  break;
15207
15606
  case GGML_OP_MAP_CUSTOM3_F32:
15208
15607
  {
15209
15608
  ggml_custom3_op_f32_t fun;
15210
15609
  memcpy(&fun, tensor->op_params, sizeof(fun));
15211
- ggml_compute_forward_map_custom3_f32(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor, fun);
15610
+ ggml_compute_forward_map_custom3_f32(params, tensor, fun);
15212
15611
  }
15213
15612
  break;
15214
15613
  case GGML_OP_MAP_CUSTOM1:
15215
15614
  {
15216
- ggml_compute_forward_map_custom1(params, tensor->src[0], tensor);
15615
+ ggml_compute_forward_map_custom1(params, tensor);
15217
15616
  }
15218
15617
  break;
15219
15618
  case GGML_OP_MAP_CUSTOM2:
15220
15619
  {
15221
- ggml_compute_forward_map_custom2(params, tensor->src[0], tensor->src[1], tensor);
15620
+ ggml_compute_forward_map_custom2(params, tensor);
15222
15621
  }
15223
15622
  break;
15224
15623
  case GGML_OP_MAP_CUSTOM3:
15225
15624
  {
15226
- ggml_compute_forward_map_custom3(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor);
15625
+ ggml_compute_forward_map_custom3(params, tensor);
15227
15626
  }
15228
15627
  break;
15229
15628
  case GGML_OP_CROSS_ENTROPY_LOSS:
15230
15629
  {
15231
- ggml_compute_forward_cross_entropy_loss(params, tensor->src[0], tensor->src[1], tensor);
15630
+ ggml_compute_forward_cross_entropy_loss(params, tensor);
15232
15631
  }
15233
15632
  break;
15234
15633
  case GGML_OP_CROSS_ENTROPY_LOSS_BACK:
15235
15634
  {
15236
- ggml_compute_forward_cross_entropy_loss_back(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor);
15635
+ ggml_compute_forward_cross_entropy_loss_back(params, tensor);
15237
15636
  }
15238
15637
  break;
15239
15638
  case GGML_OP_NONE:
@@ -16637,27 +17036,47 @@ typedef pthread_t ggml_thread_t;
16637
17036
  #endif
16638
17037
 
16639
17038
  // Android's libc implementation "bionic" does not support setting affinity
16640
- #if defined(__linux__) && !defined(__BIONIC__)
16641
- static void set_numa_thread_affinity(int thread_n, int n_threads) {
17039
+ #if defined(__gnu_linux__)
17040
+ static void set_numa_thread_affinity(int thread_n) {
16642
17041
  if (!ggml_is_numa()) {
16643
17042
  return;
16644
17043
  }
16645
17044
 
16646
- // run thread on node_num thread_n / (threads per node)
16647
- const int node_num = thread_n / ((n_threads + g_state.numa.n_nodes - 1) / g_state.numa.n_nodes);
16648
- struct ggml_numa_node * node = &g_state.numa.nodes[node_num];
17045
+ int node_num;
17046
+ int rv;
16649
17047
  size_t setsize = CPU_ALLOC_SIZE(g_state.numa.total_cpus);
16650
17048
 
17049
+ switch(g_state.numa.numa_strategy) {
17050
+ case GGML_NUMA_STRATEGY_DISTRIBUTE:
17051
+ // run thread on node_num thread_n / (threads per node)
17052
+ node_num = thread_n % g_state.numa.n_nodes;
17053
+ break;
17054
+ case GGML_NUMA_STRATEGY_ISOLATE:
17055
+ // run thread on current_node
17056
+ node_num = g_state.numa.current_node;
17057
+ break;
17058
+ case GGML_NUMA_STRATEGY_NUMACTL:
17059
+ // use the cpuset that numactl gave us
17060
+ rv = pthread_setaffinity_np(pthread_self(), setsize, &g_state.numa.cpuset);
17061
+ if (rv) {
17062
+ fprintf(stderr, "warning: pthread_setaffinity_np() failed: %s\n",strerror(rv));
17063
+ }
17064
+ return;
17065
+ default:
17066
+ return;
17067
+ }
17068
+
17069
+ struct ggml_numa_node * node = &g_state.numa.nodes[node_num];
17070
+
16651
17071
  cpu_set_t * cpus = CPU_ALLOC(g_state.numa.total_cpus);
16652
17072
  CPU_ZERO_S(setsize, cpus);
16653
17073
  for (size_t i = 0; i < node->n_cpus; ++i) {
16654
17074
  CPU_SET_S(node->cpus[i], setsize, cpus);
16655
17075
  }
16656
17076
 
16657
- int rv = pthread_setaffinity_np(pthread_self(), setsize, cpus);
17077
+ rv = pthread_setaffinity_np(pthread_self(), setsize, cpus);
16658
17078
  if (rv) {
16659
- fprintf(stderr, "warning: pthread_setaffinity_np() failed: %s\n",
16660
- strerror(rv));
17079
+ fprintf(stderr, "warning: pthread_setaffinity_np() failed: %s\n", strerror(rv));
16661
17080
  }
16662
17081
 
16663
17082
  CPU_FREE(cpus);
@@ -16678,8 +17097,7 @@ static void clear_numa_thread_affinity(void) {
16678
17097
 
16679
17098
  int rv = pthread_setaffinity_np(pthread_self(), setsize, cpus);
16680
17099
  if (rv) {
16681
- fprintf(stderr, "warning: pthread_setaffinity_np() failed: %s\n",
16682
- strerror(rv));
17100
+ fprintf(stderr, "warning: pthread_setaffinity_np() failed: %s\n", strerror(rv));
16683
17101
  }
16684
17102
 
16685
17103
  CPU_FREE(cpus);
@@ -16687,7 +17105,7 @@ static void clear_numa_thread_affinity(void) {
16687
17105
  #else
16688
17106
  // TODO: Windows etc.
16689
17107
  // (the linux implementation may also work on BSD, someone should test)
16690
- static void set_numa_thread_affinity(int thread_n, int n_threads) { UNUSED(thread_n); UNUSED(n_threads); }
17108
+ static void set_numa_thread_affinity(int thread_n) { UNUSED(thread_n); }
16691
17109
  static void clear_numa_thread_affinity(void) {}
16692
17110
  #endif
16693
17111
 
@@ -16987,7 +17405,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
16987
17405
 
16988
17406
  const int n_threads = state->shared->n_threads;
16989
17407
 
16990
- set_numa_thread_affinity(state->ith, n_threads);
17408
+ set_numa_thread_affinity(state->ith);
16991
17409
 
16992
17410
  int node_n = -1;
16993
17411
  int task_phase = GGML_TASK_FINALIZE;
@@ -17793,7 +18211,7 @@ struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context *
17793
18211
 
17794
18212
  ptr += ggml_nbytes(tensor);
17795
18213
 
17796
- fprintf(stderr, "%s: loaded leaf %d: '%16s', %9zu bytes\n", __func__, i, tensor->name, ggml_nbytes(tensor));
18214
+ fprintf(stderr, "%s: loaded leaf %u: '%16s', %9zu bytes\n", __func__, i, tensor->name, ggml_nbytes(tensor));
17797
18215
  }
17798
18216
  }
17799
18217
 
@@ -17896,7 +18314,7 @@ struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context *
17896
18314
 
17897
18315
  result->nodes[i] = tensor;
17898
18316
 
17899
- fprintf(stderr, "%s: loaded node %d: '%16s', %9zu bytes\n", __func__, i, tensor->name, ggml_nbytes(tensor));
18317
+ fprintf(stderr, "%s: loaded node %u: '%16s', %9zu bytes\n", __func__, i, tensor->name, ggml_nbytes(tensor));
17900
18318
  }
17901
18319
  }
17902
18320
  }
@@ -18521,7 +18939,9 @@ static enum ggml_opt_result linesearch_backtracking(
18521
18939
  (*step) *= width;
18522
18940
  }
18523
18941
 
18524
- GGML_UNREACHABLE();
18942
+ GGML_ASSERT(false && "line search failed");
18943
+
18944
+ return GGML_LINESEARCH_FAIL;
18525
18945
  }
18526
18946
 
18527
18947
  static enum ggml_opt_result ggml_opt_lbfgs(
@@ -18789,7 +19209,9 @@ static enum ggml_opt_result ggml_opt_lbfgs(
18789
19209
  step[0] = 1.0;
18790
19210
  }
18791
19211
 
18792
- GGML_UNREACHABLE();
19212
+ GGML_ASSERT(false && "lbfgs failed");
19213
+
19214
+ return GGML_OPT_DID_NOT_CONVERGE;
18793
19215
  }
18794
19216
 
18795
19217
  struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type) {
@@ -19037,8 +19459,9 @@ void ggml_quantize_init(enum ggml_type type) {
19037
19459
  ggml_critical_section_start();
19038
19460
 
19039
19461
  switch (type) {
19040
- case GGML_TYPE_IQ2_XXS: iq2xs_init_impl(256); break;
19041
- case GGML_TYPE_IQ2_XS: iq2xs_init_impl(512); break;
19462
+ case GGML_TYPE_IQ2_XXS:
19463
+ case GGML_TYPE_IQ2_XS:
19464
+ case GGML_TYPE_IQ1_S: iq2xs_init_impl(type); break;
19042
19465
  case GGML_TYPE_IQ3_XXS: iq3xs_init_impl(256); break;
19043
19466
  default: // nothing
19044
19467
  break;
@@ -19050,8 +19473,10 @@ void ggml_quantize_init(enum ggml_type type) {
19050
19473
  void ggml_quantize_free(void) {
19051
19474
  ggml_critical_section_start();
19052
19475
 
19053
- iq2xs_free_impl(256);
19054
- iq2xs_free_impl(512);
19476
+ iq2xs_free_impl(GGML_TYPE_IQ2_XXS);
19477
+ iq2xs_free_impl(GGML_TYPE_IQ2_XS);
19478
+ iq2xs_free_impl(GGML_TYPE_IQ1_S);
19479
+ iq3xs_free_impl(256);
19055
19480
 
19056
19481
  ggml_critical_section_end();
19057
19482
  }
@@ -19186,7 +19611,8 @@ size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t *
19186
19611
  bool ggml_quantize_requires_imatrix(enum ggml_type type) {
19187
19612
  return
19188
19613
  type == GGML_TYPE_IQ2_XXS ||
19189
- type == GGML_TYPE_IQ2_XS;
19614
+ type == GGML_TYPE_IQ2_XS ||
19615
+ type == GGML_TYPE_IQ1_S;
19190
19616
  }
19191
19617
 
19192
19618
  size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start,
@@ -19311,6 +19737,24 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i
19311
19737
  result = quantize_iq3_xxs(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
19312
19738
  GGML_ASSERT(result == row_size * nrows);
19313
19739
  } break;
19740
+ case GGML_TYPE_IQ1_S:
19741
+ {
19742
+ GGML_ASSERT(start % QK_K == 0);
19743
+ GGML_ASSERT(start % n_per_row == 0);
19744
+ size_t start_row = start / n_per_row;
19745
+ size_t row_size = ggml_row_size(type, n_per_row);
19746
+ result = quantize_iq1_s(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
19747
+ GGML_ASSERT(result == row_size * nrows);
19748
+ } break;
19749
+ case GGML_TYPE_IQ4_NL:
19750
+ {
19751
+ GGML_ASSERT(start % QK4_NL == 0);
19752
+ GGML_ASSERT(start % n_per_row == 0);
19753
+ size_t start_row = start / n_per_row;
19754
+ size_t row_size = ggml_row_size(type, n_per_row);
19755
+ result = quantize_iq4_nl(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
19756
+ GGML_ASSERT(result == row_size * nrows);
19757
+ } break;
19314
19758
  case GGML_TYPE_F16:
19315
19759
  {
19316
19760
  size_t elemsize = sizeof(ggml_fp16_t);