llama_cpp 0.12.6 → 0.12.7

@@ -23,6 +23,9 @@
 #include <limits.h>
 #include <stdarg.h>
 #include <signal.h>
+#if defined(__gnu_linux__)
+#include <syscall.h>
+#endif
 
 #ifdef GGML_USE_METAL
 #include <unistd.h>
@@ -270,6 +273,8 @@ inline static void * ggml_calloc(size_t num, size_t size) {
 #include <Accelerate/Accelerate.h>
 #if defined(GGML_USE_CLBLAST) // allow usage of CLBlast alongside Accelerate functions
 #include "ggml-opencl.h"
+#elif defined(GGML_USE_VULKAN)
+#include "ggml-vulkan.h"
 #endif
 #elif defined(GGML_USE_OPENBLAS)
 #if defined(GGML_BLAS_USE_MKL)
@@ -318,7 +323,7 @@ float ggml_table_f32_f16[1 << 16];
 // note: do not use these inside ggml.c
 // these are meant to be used via the ggml.h API
 float ggml_fp16_to_fp32(ggml_fp16_t x) {
-    return (float) GGML_FP16_TO_FP32(x);
+    return GGML_FP16_TO_FP32(x);
 }
 
 ggml_fp16_t ggml_fp32_to_fp16(float x) {
@@ -673,6 +678,30 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .vec_dot_type = GGML_TYPE_Q8_K,
         .nrows = 1,
     },
+    [GGML_TYPE_IQ1_S] = {
+        .type_name = "iq1_s",
+        .blck_size = QK_K,
+        .type_size = sizeof(block_iq1_s),
+        .is_quantized = true,
+        .to_float = (ggml_to_float_t) dequantize_row_iq1_s,
+        .from_float = NULL,
+        .from_float_reference = NULL,
+        .vec_dot = ggml_vec_dot_iq1_s_q8_K,
+        .vec_dot_type = GGML_TYPE_Q8_K,
+        .nrows = 1,
+    },
+    [GGML_TYPE_IQ4_NL] = {
+        .type_name = "iq4_nl",
+        .blck_size = QK4_NL,
+        .type_size = sizeof(block_iq4_nl),
+        .is_quantized = true,
+        .to_float = (ggml_to_float_t) dequantize_row_iq4_nl,
+        .from_float = quantize_row_iq4_nl,
+        .from_float_reference = (ggml_from_float_t)quantize_row_iq4_nl_reference,
+        .vec_dot = ggml_vec_dot_iq4_nl_q8_0,
+        .vec_dot_type = GGML_TYPE_Q8_0,
+        .nrows = 1,
+    },
     [GGML_TYPE_Q8_K] = {
         .type_name = "q8_K",
         .blck_size = QK_K,
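Note: the two new entries register the IQ1_S (1.5-bit) and IQ4_NL (4-bit non-linear) quantization formats in the dispatch table; IQ1_S deliberately leaves `from_float` NULL, so the generic quantize path cannot produce it. The sketch below shows how code elsewhere in ggml.c typically consults this table; it is illustrative only, reusing the field names declared above.

```c
// Illustrative helper in the spirit of ggml_row_size(): bytes needed for a
// row of n elements of a given type, derived from the type_traits table.
static size_t example_row_size(enum ggml_type type, int64_t n) {
    const ggml_type_traits_t tt = type_traits[type];
    // each block of tt.blck_size elements occupies tt.type_size bytes
    return tt.type_size * (size_t)(n / tt.blck_size);
}
```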
@@ -769,7 +798,7 @@ inline static float vaddvq_f32(float32x4_t v) {
 #define GGML_F16x8 float16x8_t
 #define GGML_F16x8_ZERO vdupq_n_f16(0.0f)
 #define GGML_F16x8_SET1(x) vdupq_n_f16(x)
-#define GGML_F16x8_LOAD vld1q_f16
+#define GGML_F16x8_LOAD(x) vld1q_f16((const __fp16 *)(x))
 #define GGML_F16x8_STORE vst1q_f16
 #define GGML_F16x8_FMA(a, b, c) vfmaq_f16(a, b, c)
 #define GGML_F16x8_ADD vaddq_f16
@@ -812,7 +841,7 @@ inline static float vaddvq_f32(float32x4_t v) {
 #define GGML_F32Cx4 float32x4_t
 #define GGML_F32Cx4_ZERO vdupq_n_f32(0.0f)
 #define GGML_F32Cx4_SET1(x) vdupq_n_f32(x)
-#define GGML_F32Cx4_LOAD(x) vcvt_f32_f16(vld1_f16(x))
+#define GGML_F32Cx4_LOAD(x) vcvt_f32_f16(vld1_f16((const __fp16 *)(x)))
 #define GGML_F32Cx4_STORE(x, y) vst1_f16(x, vcvt_f16_f32(y))
 #define GGML_F32Cx4_FMA(a, b, c) vfmaq_f32(a, b, c)
 #define GGML_F32Cx4_ADD vaddq_f32
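Note: `ggml_fp16_t` is a plain `uint16_t`, while `vld1q_f16`/`vld1_f16` take a `__fp16` pointer, so callers passing `ggml_fp16_t *` need the explicit cast the new macros add. A minimal sketch of the pattern, assuming an AArch64 toolchain with `__fp16` support:

```c
#include <arm_neon.h>
#include <stdint.h>

typedef uint16_t ggml_fp16_t; // ggml stores halves as raw 16-bit integers

// reinterpret the integer storage as IEEE half floats for the NEON load
static inline float16x8_t load_f16x8(const ggml_fp16_t * p) {
    return vld1q_f16((const __fp16 *)(p));
}
```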
@@ -868,7 +897,7 @@ do { \
     const __m128 t0 = _mm_add_ps(_mm256_castps256_ps128(x[0]), \
                                  _mm256_extractf128_ps(x[0], 1)); \
     const __m128 t1 = _mm_hadd_ps(t0, t0); \
-    res = _mm_cvtss_f32(_mm_hadd_ps(t1, t1)); \
+    res = (ggml_float) _mm_cvtss_f32(_mm_hadd_ps(t1, t1)); \
 } while (0)
 // TODO: is this optimal ?
 
@@ -1149,7 +1178,7 @@ inline static void __wasm_f16x4_store(ggml_fp16_t * p, v128_t x) {
         x[i] = _mm_add_ps(x[i], x[offset+i]); \
     } \
     const __m128 t0 = _mm_hadd_ps(x[0], x[0]); \
-    res = _mm_cvtss_f32(_mm_hadd_ps(t0, t0)); \
+    res = (ggml_float) _mm_cvtss_f32(_mm_hadd_ps(t0, t0)); \
 }
 // TODO: is this optimal ?
 
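Note: in ggml.c the reduction target `res` is a `ggml_float` (a typedef for `double`), while `_mm_cvtss_f32` returns `float`; the added casts make the widening explicit and silence implicit-conversion warnings. A minimal sketch of the same horizontal-sum pattern, assuming SSE3 is enabled:

```c
#include <pmmintrin.h> // SSE3: _mm_hadd_ps

typedef double ggml_float; // matches the accumulator typedef ggml.c uses

static inline ggml_float hsum4(__m128 v) {
    const __m128 t = _mm_hadd_ps(v, v);
    return (ggml_float) _mm_cvtss_f32(_mm_hadd_ps(t, t));
}
```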
@@ -1954,9 +1983,16 @@ struct ggml_numa_node {
 };
 
 struct ggml_numa_nodes {
+    enum ggml_numa_strategy numa_strategy;
     struct ggml_numa_node nodes[GGML_NUMA_MAX_NODES];
     uint32_t n_nodes;
     uint32_t total_cpus; // hardware threads on system
+    uint32_t current_node; // node on which main process is executing
+#if defined(__gnu_linux__)
+    cpu_set_t cpuset; // cpuset from numactl
+#else
+    uint32_t cpuset; // no NUMA support outside of Linux at this time; use a portable datatype
+#endif
 };
 
 //
@@ -1990,18 +2026,40 @@ inline static void ggml_critical_section_end(void) {
     atomic_fetch_sub(&g_state_barrier, 1);
 }
 
-void ggml_numa_init(void) {
+#if defined(__gnu_linux__)
+static cpu_set_t ggml_get_numa_affinity(void) {
+    cpu_set_t cpuset;
+    pthread_t thread;
+    thread = pthread_self();
+    CPU_ZERO(&cpuset);
+    pthread_getaffinity_np(thread, sizeof(cpu_set_t), &cpuset);
+    return cpuset;
+}
+#else
+static uint32_t ggml_get_numa_affinity(void) {
+    return 0; // no NUMA support
+}
+#endif
+
+void ggml_numa_init(enum ggml_numa_strategy numa_flag) {
     if (g_state.numa.n_nodes > 0) {
         fprintf(stderr, "ggml_numa_init: NUMA already initialized\n");
 
         return;
     }
 
-#ifdef __linux__
+#if defined(__gnu_linux__)
     struct stat st;
     char path[256];
     int rv;
 
+    // set numa scheme
+    g_state.numa.numa_strategy = numa_flag;
+
+    GGML_PRINT_DEBUG("numa strategy %u\n", g_state.numa.numa_strategy);
+
+    g_state.numa.cpuset = ggml_get_numa_affinity();
+
     // enumerate nodes
     while (g_state.numa.n_nodes < GGML_NUMA_MAX_NODES) {
         rv = snprintf(path, sizeof(path), "/sys/devices/system/node/node%u", g_state.numa.n_nodes);
@@ -2020,11 +2078,23 @@ void ggml_numa_init(void) {
 
     GGML_PRINT_DEBUG("found %u numa nodes, %u CPUs\n", g_state.numa.n_nodes, g_state.numa.total_cpus);
 
-    if (g_state.numa.n_nodes < 1 || g_state.numa.total_cpus < 1) {
+    // figure out which node we're on
+    uint current_cpu;
+    int getcpu_ret = 0;
+#if __GLIBC__ > 2 || (__GLIBC__ == 2 && __GLIBC_MINOR__ > 28)
+    getcpu_ret = getcpu(&current_cpu, &g_state.numa.current_node);
+#else
+    // old glibc doesn't have a wrapper for this call; fall back on a direct syscall
+    getcpu_ret = syscall(SYS_getcpu, &current_cpu, &g_state.numa.current_node);
+#endif
+
+    if (g_state.numa.n_nodes < 1 || g_state.numa.total_cpus < 1 || getcpu_ret != 0) {
        g_state.numa.n_nodes = 0;
        return;
    }
 
+    GGML_PRINT_DEBUG("found our process on numa node %u, CPU %u\n", g_state.numa.current_node, current_cpu);
+
     for (uint32_t n = 0; n < g_state.numa.n_nodes; ++n) {
         struct ggml_numa_node * node = &g_state.numa.nodes[n];
         GGML_PRINT_DEBUG("CPUs on node %u:", n);
@@ -2051,6 +2121,7 @@ void ggml_numa_init(void) {
         }
     }
 #else
+    GGML_UNUSED(numa_flag);
     // TODO
 #endif
 }
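Note: `ggml_numa_init` now takes a strategy, snapshots the caller's CPU affinity, and records the NUMA node the process is on, using the `getcpu(2)` glibc wrapper when glibc is newer than 2.28 and a raw `syscall(SYS_getcpu, ...)` otherwise. A hedged usage sketch follows; the `GGML_NUMA_STRATEGY_DISTRIBUTE` constant is an assumption about the `ggml_numa_strategy` enum in ggml.h (it mirrors llama.cpp's `--numa distribute` option) and is not itself shown in this diff.

```c
#include "ggml.h"

int main(void) {
    // initialize NUMA bookkeeping once, before any compute threads exist;
    // on non-Linux builds the flag is consumed and the call is otherwise a no-op
    ggml_numa_init(GGML_NUMA_STRATEGY_DISTRIBUTE);
    // ... create ggml contexts and graphs as usual ...
    return 0;
}
```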
@@ -2231,6 +2302,8 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) {
         case GGML_FTYPE_MOSTLY_IQ2_XXS: wtype = GGML_TYPE_IQ2_XXS; break;
         case GGML_FTYPE_MOSTLY_IQ2_XS: wtype = GGML_TYPE_IQ2_XS; break;
         case GGML_FTYPE_MOSTLY_IQ3_XXS: wtype = GGML_TYPE_IQ3_XXS; break;
+        case GGML_FTYPE_MOSTLY_IQ1_S: wtype = GGML_TYPE_IQ1_S; break;
+        case GGML_FTYPE_MOSTLY_IQ4_NL: wtype = GGML_TYPE_IQ4_NL; break;
         case GGML_FTYPE_UNKNOWN: wtype = GGML_TYPE_COUNT; break;
         case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16: wtype = GGML_TYPE_COUNT; break;
     }
@@ -3184,7 +3257,7 @@ const char * ggml_get_name(const struct ggml_tensor * tensor) {
 }
 
 struct ggml_tensor * ggml_set_name(struct ggml_tensor * tensor, const char * name) {
-    strncpy(tensor->name, name, sizeof(tensor->name));
+    strncpy(tensor->name, name, sizeof(tensor->name) - 1);
     tensor->name[sizeof(tensor->name) - 1] = '\0';
     return tensor;
 }
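Note: `strncpy` does not write a terminator when the source is at least as long as the limit, and GCC flags the old call with `-Wstringop-truncation`. Copying at most `size - 1` bytes and terminating unconditionally, as the fix does, is the standard safe pattern:

```c
#include <string.h>

// bounded copy that always leaves dst NUL-terminated (may truncate src)
static void copy_bounded(char * dst, size_t dst_size, const char * src) {
    strncpy(dst, src, dst_size - 1);
    dst[dst_size - 1] = '\0';
}
```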
@@ -5060,16 +5133,28 @@ static struct ggml_tensor * ggml_soft_max_impl(
         struct ggml_context * ctx,
         struct ggml_tensor * a,
         struct ggml_tensor * mask,
+        struct ggml_tensor * pos,
         float scale,
+        float max_bias,
         bool inplace) {
     GGML_ASSERT(ggml_is_contiguous(a));
+
     if (mask) {
         GGML_ASSERT(ggml_is_contiguous(mask));
-        GGML_ASSERT(mask->ne[2] == 1);
-        GGML_ASSERT(mask->ne[3] == 1);
+        GGML_ASSERT(ggml_is_matrix(mask));
         GGML_ASSERT(ggml_can_repeat_rows(mask, a));
     }
 
+    if (pos) {
+        GGML_ASSERT(ggml_is_vector(pos));
+        GGML_ASSERT(pos->type == GGML_TYPE_F32);
+        GGML_ASSERT(pos->ne[0] == a->ne[0]);
+    }
+
+    if (max_bias > 0.0f) {
+        GGML_ASSERT(pos);
+    }
+
     bool is_node = false;
 
     if (a->grad) {
@@ -5078,13 +5163,14 @@ static struct ggml_tensor * ggml_soft_max_impl(
 
     struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
 
-    float params[] = { scale };
+    float params[] = { scale, max_bias };
     ggml_set_op_params(result, params, sizeof(params));
 
     result->op = GGML_OP_SOFT_MAX;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
     result->src[0] = a;
     result->src[1] = mask;
+    result->src[2] = pos;
 
     return result;
 }
@@ -5092,21 +5178,23 @@ static struct ggml_tensor * ggml_soft_max_impl(
 struct ggml_tensor * ggml_soft_max(
         struct ggml_context * ctx,
         struct ggml_tensor * a) {
-    return ggml_soft_max_impl(ctx, a, NULL, 1.0f, false);
+    return ggml_soft_max_impl(ctx, a, NULL, NULL, 1.0f, 0.0f, false);
 }
 
 struct ggml_tensor * ggml_soft_max_inplace(
         struct ggml_context * ctx,
         struct ggml_tensor * a) {
-    return ggml_soft_max_impl(ctx, a, NULL, 1.0f, true);
+    return ggml_soft_max_impl(ctx, a, NULL, NULL, 1.0f, 0.0f, true);
 }
 
 struct ggml_tensor * ggml_soft_max_ext(
         struct ggml_context * ctx,
         struct ggml_tensor * a,
         struct ggml_tensor * mask,
-        float scale) {
-    return ggml_soft_max_impl(ctx, a, mask, scale, false);
+        struct ggml_tensor * pos,
+        float scale,
+        float max_bias) {
+    return ggml_soft_max_impl(ctx, a, mask, pos, scale, max_bias, false);
 }
 
 // ggml_soft_max_back
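Note: the extended softmax now accepts a `pos` vector and a `max_bias` scalar so an ALiBi-style position bias can be fused into the kernel; per the new asserts, `mask` must be a matrix and `pos` an F32 vector with `pos->ne[0] == a->ne[0]`. A hedged call sketch (the tensor arguments are illustrative and built elsewhere):

```c
#include <math.h>
#include "ggml.h"

// fuse scaling, masking, and positional bias into one softmax node
static struct ggml_tensor * attn_softmax(
        struct ggml_context * ctx,
        struct ggml_tensor  * kq,       // attention scores
        struct ggml_tensor  * kq_mask,  // F32 matrix, repeated across rows
        struct ggml_tensor  * kq_pos,   // F32 vector, ne[0] == kq->ne[0]
        int                   n_embd_head,
        float                 max_bias) { // 0.0f disables the position bias
    return ggml_soft_max_ext(ctx, kq, kq_mask, kq_pos,
                             1.0f/sqrtf((float) n_embd_head), max_bias);
}
```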
@@ -5556,7 +5644,9 @@ struct ggml_tensor * ggml_conv_2d(
             ggml_reshape_2d(ctx, im2col, im2col->ne[0], im2col->ne[3] * im2col->ne[2] * im2col->ne[1]), // [N, OH, OW, IC * KH * KW] => [N*OH*OW, IC * KH * KW]
             ggml_reshape_2d(ctx, a, (a->ne[0] * a->ne[1] * a->ne[2]), a->ne[3])); // [OC,IC, KH, KW] => [OC, IC * KH * KW]
 
-    result = ggml_reshape_4d(ctx, result, im2col->ne[1], im2col->ne[2], a->ne[3], im2col->ne[3]); // [N, OC, OH, OW]
+    result = ggml_reshape_4d(ctx, result, im2col->ne[1], im2col->ne[2], im2col->ne[3], a->ne[3]); // [OC, N, OH, OW]
+    result = ggml_cont(ctx, ggml_permute(ctx, result, 0, 1, 3, 2)); // [N, OC, OH, OW]
+
 
     return result;
 }
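Note: the old single reshape placed the output-channel axis (`a->ne[3]`) in the wrong slot, so the element count was right but the dimension order was not. Reading the fix (ggml's `ne[]` lists the fastest-varying dimension first), the matmul output is laid out as `ne = [OW, OH, N, OC]`; the reshape now labels it that way, and the permute/cont pair moves OC ahead of N to reach the documented result order:

```c
// Illustrative trace of the fixed tail of ggml_conv_2d:
//   matmul output, viewed 4-D : ne = [OW, OH, N, OC]  (logical [OC, N, OH, OW])
//   ggml_permute(.., 0,1,3,2) : ne = [OW, OH, OC, N]  (logical [N, OC, OH, OW])
//   ggml_cont(..)             : materializes the permuted layout contiguously
```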
@@ -6562,8 +6652,10 @@ void ggml_set_param(
 
 static void ggml_compute_forward_dup_same_cont(
         const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
         struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
     GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0));
     GGML_ASSERT(ggml_is_contiguous(dst) && ggml_is_contiguous(src0));
     GGML_ASSERT(src0->type == dst->type);
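Note: from this point on the diff applies one mechanical refactor to every forward kernel: sources are no longer passed as separate arguments but are read from the destination node itself, shrinking every op to a single `(params, dst)` signature. The shape of the new convention, illustratively:

```c
// the dst node carries its operands, so a forward op needs nothing else
static void ggml_compute_forward_example(
        const struct ggml_compute_params * params,
        struct ggml_tensor * dst) {
    const struct ggml_tensor * src0 = dst->src[0]; // first input
    const struct ggml_tensor * src1 = dst->src[1]; // second input, may be NULL
    (void) params; (void) src0; (void) src1;
    // ... kernel body unchanged ...
}
```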
@@ -6594,8 +6686,10 @@ static void ggml_compute_forward_dup_same_cont(
 }
 static void ggml_compute_forward_dup_f16(
         const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
         struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
     GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0));
 
     if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
@@ -6608,7 +6702,7 @@ static void ggml_compute_forward_dup_f16(
     const int nth = params->nth; // number of threads
 
     if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst) && src0->type == dst->type) {
-        ggml_compute_forward_dup_same_cont(params, src0, dst);
+        ggml_compute_forward_dup_same_cont(params, dst);
         return;
     }
 
@@ -6865,8 +6959,10 @@ static void ggml_compute_forward_dup_f16(
 
 static void ggml_compute_forward_dup_f32(
         const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
         struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
     GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0));
 
     if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
@@ -6879,7 +6975,7 @@ static void ggml_compute_forward_dup_f32(
     const int nth = params->nth; // number of threads
 
     if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst) && src0->type == dst->type) {
-        ggml_compute_forward_dup_same_cont(params, src0, dst);
+        ggml_compute_forward_dup_same_cont(params, dst);
         return;
     }
 
@@ -7115,8 +7211,10 @@ static void ggml_compute_forward_dup_f32(
 // A simplified version of ggml_compute_forward_dup that doesn't do float upcasting, and just plain old memcpy.
 static void ggml_compute_forward_dup_bytes(
         const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
         struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
     GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0));
     GGML_ASSERT(src0->type == dst->type);
 
@@ -7125,7 +7223,7 @@ static void ggml_compute_forward_dup_bytes(
     }
 
     if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst)) {
-        ggml_compute_forward_dup_same_cont(params, src0, dst);
+        ggml_compute_forward_dup_same_cont(params, dst);
         return;
     }
 
@@ -7264,21 +7362,23 @@ static void ggml_compute_forward_dup_bytes(
 
 static void ggml_compute_forward_dup(
         const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
         struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
     if (src0->type == dst->type) {
-        ggml_compute_forward_dup_bytes(params, src0, dst);
+        ggml_compute_forward_dup_bytes(params, dst);
         return;
     }
 
     switch (src0->type) {
         case GGML_TYPE_F16:
             {
-                ggml_compute_forward_dup_f16(params, src0, dst);
+                ggml_compute_forward_dup_f16(params, dst);
             } break;
         case GGML_TYPE_F32:
             {
-                ggml_compute_forward_dup_f32(params, src0, dst);
+                ggml_compute_forward_dup_f32(params, dst);
             } break;
         default:
             {
@@ -7291,9 +7391,11 @@ static void ggml_compute_forward_dup(
 
 static void ggml_compute_forward_add_f32(
         const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        const struct ggml_tensor * src1,
         struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+    const struct ggml_tensor * src1 = dst->src[1];
+
     GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst));
 
     if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
@@ -7379,9 +7481,11 @@ static void ggml_compute_forward_add_f32(
 
 static void ggml_compute_forward_add_f16_f32(
         const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        const struct ggml_tensor * src1,
         struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+    const struct ggml_tensor * src1 = dst->src[1];
+
     GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));
 
     if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
@@ -7456,9 +7560,11 @@ static void ggml_compute_forward_add_f16_f32(
 
 static void ggml_compute_forward_add_f16_f16(
         const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        const struct ggml_tensor * src1,
         struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+    const struct ggml_tensor * src1 = dst->src[1];
+
     GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));
 
     if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
@@ -7510,9 +7616,11 @@ static void ggml_compute_forward_add_f16_f16(
 
 static void ggml_compute_forward_add_q_f32(
         const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        const struct ggml_tensor * src1,
         struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+    const struct ggml_tensor * src1 = dst->src[1];
+
     GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));
 
     if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
@@ -7588,14 +7696,16 @@ static void ggml_compute_forward_add_q_f32(
 
 static void ggml_compute_forward_add(
         const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        const struct ggml_tensor * src1,
         struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+    const struct ggml_tensor * src1 = dst->src[1];
+
     switch (src0->type) {
         case GGML_TYPE_F32:
             {
                 if (src1->type == GGML_TYPE_F32) {
-                    ggml_compute_forward_add_f32(params, src0, src1, dst);
+                    ggml_compute_forward_add_f32(params, dst);
                 }
                 else {
                     GGML_ASSERT(false);
@@ -7604,10 +7714,10 @@ static void ggml_compute_forward_add(
         case GGML_TYPE_F16:
             {
                 if (src1->type == GGML_TYPE_F16) {
-                    ggml_compute_forward_add_f16_f16(params, src0, src1, dst);
+                    ggml_compute_forward_add_f16_f16(params, dst);
                 }
                 else if (src1->type == GGML_TYPE_F32) {
-                    ggml_compute_forward_add_f16_f32(params, src0, src1, dst);
+                    ggml_compute_forward_add_f16_f32(params, dst);
                 }
                 else {
                     GGML_ASSERT(false);
@@ -7626,8 +7736,10 @@ static void ggml_compute_forward_add(
         case GGML_TYPE_IQ2_XXS:
         case GGML_TYPE_IQ2_XS:
         case GGML_TYPE_IQ3_XXS:
+        case GGML_TYPE_IQ1_S:
+        case GGML_TYPE_IQ4_NL:
             {
-                ggml_compute_forward_add_q_f32(params, src0, src1, dst);
+                ggml_compute_forward_add_q_f32(params, dst);
             } break;
         default:
             {
@@ -7640,9 +7752,11 @@ static void ggml_compute_forward_add(
 
 static void ggml_compute_forward_add1_f32(
         const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        const struct ggml_tensor * src1,
         struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+    const struct ggml_tensor * src1 = dst->src[1];
+
     GGML_ASSERT(ggml_are_same_shape(src0, dst));
     GGML_ASSERT(ggml_is_scalar(src1));
 
@@ -7692,9 +7806,11 @@ static void ggml_compute_forward_add1_f32(
 
 static void ggml_compute_forward_add1_f16_f32(
         const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        const struct ggml_tensor * src1,
         struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+    const struct ggml_tensor * src1 = dst->src[1];
+
     GGML_ASSERT(ggml_are_same_shape(src0, dst));
     GGML_ASSERT(ggml_is_scalar(src1));
 
@@ -7742,9 +7858,11 @@ static void ggml_compute_forward_add1_f16_f32(
 
 static void ggml_compute_forward_add1_f16_f16(
         const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        const struct ggml_tensor * src1,
         struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+    const struct ggml_tensor * src1 = dst->src[1];
+
     GGML_ASSERT(ggml_are_same_shape(src0, dst));
     GGML_ASSERT(ggml_is_scalar(src1));
 
@@ -7792,9 +7910,11 @@ static void ggml_compute_forward_add1_f16_f16(
 
 static void ggml_compute_forward_add1_q_f32(
         const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        const struct ggml_tensor * src1,
         struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+    const struct ggml_tensor * src1 = dst->src[1];
+
     GGML_ASSERT(ggml_are_same_shape(src0, dst));
     GGML_ASSERT(ggml_is_scalar(src1));
 
@@ -7859,21 +7979,23 @@ static void ggml_compute_forward_add1_q_f32(
 
 static void ggml_compute_forward_add1(
         const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        const struct ggml_tensor * src1,
         struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+    const struct ggml_tensor * src1 = dst->src[1];
+
     switch (src0->type) {
        case GGML_TYPE_F32:
             {
-                ggml_compute_forward_add1_f32(params, src0, src1, dst);
+                ggml_compute_forward_add1_f32(params, dst);
             } break;
         case GGML_TYPE_F16:
             {
                 if (src1->type == GGML_TYPE_F16) {
-                    ggml_compute_forward_add1_f16_f16(params, src0, src1, dst);
+                    ggml_compute_forward_add1_f16_f16(params, dst);
                 }
                 else if (src1->type == GGML_TYPE_F32) {
-                    ggml_compute_forward_add1_f16_f32(params, src0, src1, dst);
+                    ggml_compute_forward_add1_f16_f32(params, dst);
                 }
                 else {
                     GGML_ASSERT(false);
@@ -7893,8 +8015,10 @@ static void ggml_compute_forward_add1(
         case GGML_TYPE_IQ2_XXS:
         case GGML_TYPE_IQ2_XS:
         case GGML_TYPE_IQ3_XXS:
+        case GGML_TYPE_IQ1_S:
+        case GGML_TYPE_IQ4_NL:
             {
-                ggml_compute_forward_add1_q_f32(params, src0, src1, dst);
+                ggml_compute_forward_add1_q_f32(params, dst);
             } break;
         default:
             {
@@ -7907,9 +8031,11 @@ static void ggml_compute_forward_add1(
 
 static void ggml_compute_forward_acc_f32(
         const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        const struct ggml_tensor * src1,
         struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+    const struct ggml_tensor * src1 = dst->src[1];
+
     GGML_ASSERT(ggml_are_same_shape(src0, dst));
     GGML_ASSERT(ggml_is_contiguous(dst) && ggml_is_contiguous(src0));
 
@@ -7989,14 +8115,14 @@ static void ggml_compute_forward_acc_f32(
 
 static void ggml_compute_forward_acc(
         const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        const struct ggml_tensor * src1,
         struct ggml_tensor * dst) {
 
+    const struct ggml_tensor * src0 = dst->src[0];
+
     switch (src0->type) {
         case GGML_TYPE_F32:
             {
-                ggml_compute_forward_acc_f32(params, src0, src1, dst);
+                ggml_compute_forward_acc_f32(params, dst);
             } break;
         case GGML_TYPE_F16:
         case GGML_TYPE_Q4_0:
@@ -8013,6 +8139,8 @@ static void ggml_compute_forward_acc(
         case GGML_TYPE_IQ2_XXS:
         case GGML_TYPE_IQ2_XS:
         case GGML_TYPE_IQ3_XXS:
+        case GGML_TYPE_IQ1_S:
+        case GGML_TYPE_IQ4_NL:
         default:
             {
                 GGML_ASSERT(false);
@@ -8024,9 +8152,11 @@ static void ggml_compute_forward_acc(
 
 static void ggml_compute_forward_sub_f32(
         const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        const struct ggml_tensor * src1,
         struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+    const struct ggml_tensor * src1 = dst->src[1];
+
     assert(params->ith == 0);
     assert(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));
 
@@ -8084,13 +8214,14 @@ static void ggml_compute_forward_sub_f32(
 
 static void ggml_compute_forward_sub(
         const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        const struct ggml_tensor * src1,
         struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
     switch (src0->type) {
         case GGML_TYPE_F32:
             {
-                ggml_compute_forward_sub_f32(params, src0, src1, dst);
+                ggml_compute_forward_sub_f32(params, dst);
             } break;
         default:
             {
@@ -8103,9 +8234,11 @@ static void ggml_compute_forward_sub(
 
 static void ggml_compute_forward_mul_f32(
         const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        const struct ggml_tensor * src1,
         struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+    const struct ggml_tensor * src1 = dst->src[1];
+
     GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst));
 
     if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
@@ -8186,15 +8319,17 @@ static void ggml_compute_forward_mul_f32(
 
 static void ggml_compute_forward_mul(
         const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        const struct ggml_tensor * src1,
         struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+    const struct ggml_tensor * src1 = dst->src[1];
+
     GGML_ASSERT(src1->type == GGML_TYPE_F32 && "only f32 src1 supported for now");
 
     switch (src0->type) {
         case GGML_TYPE_F32:
             {
-                ggml_compute_forward_mul_f32(params, src0, src1, dst);
+                ggml_compute_forward_mul_f32(params, dst);
             } break;
         default:
             {
@@ -8207,9 +8342,11 @@ static void ggml_compute_forward_mul(
 
 static void ggml_compute_forward_div_f32(
         const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        const struct ggml_tensor * src1,
         struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+    const struct ggml_tensor * src1 = dst->src[1];
+
     GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst));
 
     if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
@@ -8280,13 +8417,14 @@ static void ggml_compute_forward_div_f32(
 
 static void ggml_compute_forward_div(
         const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        const struct ggml_tensor * src1,
         struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
     switch (src0->type) {
         case GGML_TYPE_F32:
             {
-                ggml_compute_forward_div_f32(params, src0, src1, dst);
+                ggml_compute_forward_div_f32(params, dst);
             } break;
         default:
             {
@@ -8299,8 +8437,10 @@ static void ggml_compute_forward_div(
 
 static void ggml_compute_forward_sqr_f32(
         const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
         struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
     assert(params->ith == 0);
     assert(ggml_are_same_shape(src0, dst));
 
@@ -8323,12 +8463,14 @@ static void ggml_compute_forward_sqr_f32(
 
 static void ggml_compute_forward_sqr(
         const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
         struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
     switch (src0->type) {
         case GGML_TYPE_F32:
             {
-                ggml_compute_forward_sqr_f32(params, src0, dst);
+                ggml_compute_forward_sqr_f32(params, dst);
             } break;
         default:
             {
@@ -8341,8 +8483,10 @@ static void ggml_compute_forward_sqr(
 
 static void ggml_compute_forward_sqrt_f32(
         const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
         struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
     assert(params->ith == 0);
     assert(ggml_are_same_shape(src0, dst));
 
@@ -8365,12 +8509,14 @@ static void ggml_compute_forward_sqrt_f32(
 
 static void ggml_compute_forward_sqrt(
         const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
         struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
     switch (src0->type) {
         case GGML_TYPE_F32:
             {
-                ggml_compute_forward_sqrt_f32(params, src0, dst);
+                ggml_compute_forward_sqrt_f32(params, dst);
             } break;
         default:
             {
@@ -8383,8 +8529,10 @@ static void ggml_compute_forward_sqrt(
 
 static void ggml_compute_forward_log_f32(
         const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
         struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
     GGML_ASSERT(params->ith == 0);
     GGML_ASSERT(ggml_are_same_shape(src0, dst));
 
@@ -8407,12 +8555,14 @@ static void ggml_compute_forward_log_f32(
 
 static void ggml_compute_forward_log(
         const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
         struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
     switch (src0->type) {
         case GGML_TYPE_F32:
             {
-                ggml_compute_forward_log_f32(params, src0, dst);
+                ggml_compute_forward_log_f32(params, dst);
             } break;
         default:
             {
@@ -8425,8 +8575,10 @@ static void ggml_compute_forward_log(
 
 static void ggml_compute_forward_sum_f32(
         const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
         struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
     assert(params->ith == 0);
     assert(ggml_is_scalar(dst));
 
@@ -8458,8 +8610,10 @@ static void ggml_compute_forward_sum_f32(
 
 static void ggml_compute_forward_sum_f16(
         const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
         struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
     assert(params->ith == 0);
     assert(ggml_is_scalar(dst));
 
@@ -8490,16 +8644,18 @@ static void ggml_compute_forward_sum_f16(
 
 static void ggml_compute_forward_sum(
         const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
         struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
     switch (src0->type) {
         case GGML_TYPE_F32:
             {
-                ggml_compute_forward_sum_f32(params, src0, dst);
+                ggml_compute_forward_sum_f32(params, dst);
             } break;
         case GGML_TYPE_F16:
             {
-                ggml_compute_forward_sum_f16(params, src0, dst);
+                ggml_compute_forward_sum_f16(params, dst);
             } break;
         default:
             {
@@ -8512,8 +8668,10 @@ static void ggml_compute_forward_sum(
 
 static void ggml_compute_forward_sum_rows_f32(
         const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
         struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
     GGML_ASSERT(params->ith == 0);
 
     if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
@@ -8545,12 +8703,14 @@ static void ggml_compute_forward_sum_rows_f32(
 
 static void ggml_compute_forward_sum_rows(
         const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
         struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
     switch (src0->type) {
         case GGML_TYPE_F32:
             {
-                ggml_compute_forward_sum_rows_f32(params, src0, dst);
+                ggml_compute_forward_sum_rows_f32(params, dst);
             } break;
         default:
             {
@@ -8563,8 +8723,10 @@ static void ggml_compute_forward_sum_rows(
 
 static void ggml_compute_forward_mean_f32(
         const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
         struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
     assert(params->ith == 0);
 
     if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
@@ -8600,12 +8762,14 @@ static void ggml_compute_forward_mean_f32(
 
 static void ggml_compute_forward_mean(
         const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
         struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
     switch (src0->type) {
         case GGML_TYPE_F32:
             {
-                ggml_compute_forward_mean_f32(params, src0, dst);
+                ggml_compute_forward_mean_f32(params, dst);
             } break;
         default:
             {
@@ -8618,8 +8782,10 @@ static void ggml_compute_forward_mean(
 
 static void ggml_compute_forward_argmax_f32(
         const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
         struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
     assert(params->ith == 0);
 
     if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
@@ -8646,12 +8812,14 @@ static void ggml_compute_forward_argmax_f32(
 
 static void ggml_compute_forward_argmax(
         const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
         struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
     switch (src0->type) {
         case GGML_TYPE_F32:
             {
-                ggml_compute_forward_argmax_f32(params, src0, dst);
+                ggml_compute_forward_argmax_f32(params, dst);
             } break;
         default:
             {
@@ -8664,8 +8832,10 @@ static void ggml_compute_forward_argmax(
 
 static void ggml_compute_forward_repeat_f32(
         const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
         struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
     GGML_ASSERT(params->ith == 0);
     GGML_ASSERT(ggml_can_repeat(src0, dst));
 
@@ -8707,8 +8877,10 @@ static void ggml_compute_forward_repeat_f32(
 
 static void ggml_compute_forward_repeat_f16(
         const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
         struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
     GGML_ASSERT(params->ith == 0);
     GGML_ASSERT(ggml_can_repeat(src0, dst));
 
@@ -8753,18 +8925,20 @@ static void ggml_compute_forward_repeat_f16(
 
 static void ggml_compute_forward_repeat(
         const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
         struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
     switch (src0->type) {
         case GGML_TYPE_F16:
         case GGML_TYPE_I16:
             {
-                ggml_compute_forward_repeat_f16(params, src0, dst);
+                ggml_compute_forward_repeat_f16(params, dst);
             } break;
         case GGML_TYPE_F32:
         case GGML_TYPE_I32:
             {
-                ggml_compute_forward_repeat_f32(params, src0, dst);
+                ggml_compute_forward_repeat_f32(params, dst);
             } break;
         default:
             {
@@ -8777,8 +8951,10 @@ static void ggml_compute_forward_repeat(
 
 static void ggml_compute_forward_repeat_back_f32(
         const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
         struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
     GGML_ASSERT(params->ith == 0);
     GGML_ASSERT(ggml_can_repeat(dst, src0));
 
@@ -8834,12 +9010,14 @@ static void ggml_compute_forward_repeat_back_f32(
 
 static void ggml_compute_forward_repeat_back(
         const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
         struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
     switch (src0->type) {
         case GGML_TYPE_F32:
             {
-                ggml_compute_forward_repeat_back_f32(params, src0, dst);
+                ggml_compute_forward_repeat_back_f32(params, dst);
             } break;
         default:
             {
@@ -8852,10 +9030,11 @@ static void ggml_compute_forward_repeat_back(
 static void ggml_compute_forward_concat_f32(
         const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        const struct ggml_tensor * src1,
         struct ggml_tensor * dst) {
 
+    const struct ggml_tensor * src0 = dst->src[0];
+    const struct ggml_tensor * src1 = dst->src[1];
+
     if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
         return;
     }
@@ -8900,14 +9079,15 @@ static void ggml_compute_forward_concat_f32(
 
 static void ggml_compute_forward_concat(
         const struct ggml_compute_params* params,
-        const struct ggml_tensor* src0,
-        const struct ggml_tensor* src1,
         struct ggml_tensor* dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
     switch (src0->type) {
         case GGML_TYPE_F32:
         case GGML_TYPE_I32:
             {
-                ggml_compute_forward_concat_f32(params, src0, src1, dst);
+                ggml_compute_forward_concat_f32(params, dst);
             } break;
         default:
             {
@@ -8920,8 +9100,10 @@ static void ggml_compute_forward_concat(
 
 static void ggml_compute_forward_abs_f32(
         const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
         struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
     assert(params->ith == 0);
     assert(ggml_are_same_shape(src0, dst));
 
@@ -8944,12 +9126,14 @@ static void ggml_compute_forward_abs_f32(
 
 static void ggml_compute_forward_abs(
        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
         struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
     switch (src0->type) {
         case GGML_TYPE_F32:
             {
-                ggml_compute_forward_abs_f32(params, src0, dst);
+                ggml_compute_forward_abs_f32(params, dst);
             } break;
         default:
             {
@@ -8962,8 +9146,10 @@ static void ggml_compute_forward_abs(
 
 static void ggml_compute_forward_sgn_f32(
         const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
         struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
     assert(params->ith == 0);
     assert(ggml_are_same_shape(src0, dst));
 
@@ -8986,12 +9172,14 @@ static void ggml_compute_forward_sgn_f32(
 
 static void ggml_compute_forward_sgn(
         const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
         struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
     switch (src0->type) {
         case GGML_TYPE_F32:
             {
-                ggml_compute_forward_sgn_f32(params, src0, dst);
+                ggml_compute_forward_sgn_f32(params, dst);
             } break;
         default:
             {
@@ -9004,8 +9192,10 @@ static void ggml_compute_forward_sgn(
 
 static void ggml_compute_forward_neg_f32(
         const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
         struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
     assert(params->ith == 0);
     assert(ggml_are_same_shape(src0, dst));
 
@@ -9028,12 +9218,14 @@ static void ggml_compute_forward_neg_f32(
 
 static void ggml_compute_forward_neg(
         const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
         struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
     switch (src0->type) {
         case GGML_TYPE_F32:
             {
-                ggml_compute_forward_neg_f32(params, src0, dst);
+                ggml_compute_forward_neg_f32(params, dst);
             } break;
         default:
             {
@@ -9046,8 +9238,10 @@ static void ggml_compute_forward_neg(
 
 static void ggml_compute_forward_step_f32(
         const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
         struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
     assert(params->ith == 0);
     assert(ggml_are_same_shape(src0, dst));
 
@@ -9070,12 +9264,14 @@ static void ggml_compute_forward_step_f32(
 
 static void ggml_compute_forward_step(
         const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
         struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
     switch (src0->type) {
         case GGML_TYPE_F32:
             {
-                ggml_compute_forward_step_f32(params, src0, dst);
+                ggml_compute_forward_step_f32(params, dst);
             } break;
         default:
             {
@@ -9088,8 +9284,10 @@ static void ggml_compute_forward_step(
 
 static void ggml_compute_forward_tanh_f32(
         const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
         struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
     assert(params->ith == 0);
     assert(ggml_are_same_shape(src0, dst));
 
@@ -9112,12 +9310,14 @@ static void ggml_compute_forward_tanh_f32(
 
 static void ggml_compute_forward_tanh(
         const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
         struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
     switch (src0->type) {
         case GGML_TYPE_F32:
             {
-                ggml_compute_forward_tanh_f32(params, src0, dst);
+                ggml_compute_forward_tanh_f32(params, dst);
             } break;
         default:
             {
@@ -9130,8 +9330,10 @@ static void ggml_compute_forward_tanh(
 
 static void ggml_compute_forward_elu_f32(
         const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
         struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
     assert(params->ith == 0);
     assert(ggml_are_same_shape(src0, dst));
 
@@ -9154,12 +9356,14 @@ static void ggml_compute_forward_elu_f32(
 
 static void ggml_compute_forward_elu(
         const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
         struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
     switch (src0->type) {
         case GGML_TYPE_F32:
             {
-                ggml_compute_forward_elu_f32(params, src0, dst);
+                ggml_compute_forward_elu_f32(params, dst);
             } break;
         default:
             {
@@ -9172,8 +9376,10 @@ static void ggml_compute_forward_elu(
 
 static void ggml_compute_forward_relu_f32(
         const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
         struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
     assert(params->ith == 0);
     assert(ggml_are_same_shape(src0, dst));
 
@@ -9196,12 +9402,14 @@ static void ggml_compute_forward_relu_f32(
 
 static void ggml_compute_forward_relu(
         const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
         struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
     switch (src0->type) {
         case GGML_TYPE_F32:
             {
-                ggml_compute_forward_relu_f32(params, src0, dst);
+                ggml_compute_forward_relu_f32(params, dst);
             } break;
         default:
             {
@@ -9214,8 +9422,10 @@ static void ggml_compute_forward_relu(
 
 static void ggml_compute_forward_gelu_f32(
         const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
         struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
     GGML_ASSERT(ggml_is_contiguous_except_dim_1(src0));
     GGML_ASSERT(ggml_is_contiguous_except_dim_1(dst));
     GGML_ASSERT(ggml_are_same_shape(src0, dst));
@@ -9255,12 +9465,14 @@ static void ggml_compute_forward_gelu_f32(
 
 static void ggml_compute_forward_gelu(
         const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
         struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
     switch (src0->type) {
         case GGML_TYPE_F32:
             {
-                ggml_compute_forward_gelu_f32(params, src0, dst);
+                ggml_compute_forward_gelu_f32(params, dst);
             } break;
        default:
             {
@@ -9273,8 +9485,10 @@ static void ggml_compute_forward_gelu(
 
 static void ggml_compute_forward_gelu_quick_f32(
         const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
         struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
     GGML_ASSERT(ggml_is_contiguous_except_dim_1(src0));
     GGML_ASSERT(ggml_is_contiguous_except_dim_1(dst));
     GGML_ASSERT(ggml_are_same_shape(src0, dst));
@@ -9314,12 +9528,14 @@ static void ggml_compute_forward_gelu_quick_f32(
 
 static void ggml_compute_forward_gelu_quick(
         const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
         struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
     switch (src0->type) {
         case GGML_TYPE_F32:
             {
-                ggml_compute_forward_gelu_quick_f32(params, src0, dst);
+                ggml_compute_forward_gelu_quick_f32(params, dst);
             } break;
         default:
             {
@@ -9332,8 +9548,10 @@ static void ggml_compute_forward_gelu_quick(
 
 static void ggml_compute_forward_silu_f32(
         const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
         struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
     GGML_ASSERT(ggml_is_contiguous_except_dim_1(src0));
     GGML_ASSERT(ggml_is_contiguous_except_dim_1(dst));
     GGML_ASSERT(ggml_are_same_shape(src0, dst));
@@ -9373,12 +9591,14 @@ static void ggml_compute_forward_silu_f32(
 
 static void ggml_compute_forward_silu(
         const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
         struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
     switch (src0->type) {
         case GGML_TYPE_F32:
             {
-                ggml_compute_forward_silu_f32(params, src0, dst);
+                ggml_compute_forward_silu_f32(params, dst);
             } break;
         default:
             {
@@ -9390,8 +9610,10 @@ static void ggml_compute_forward_silu(
 static void ggml_compute_forward_leaky_relu_f32(
         const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
         struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
     assert(params->ith == 0);
     assert(ggml_are_same_shape(src0, dst));
 
@@ -9417,12 +9639,14 @@ static void ggml_compute_forward_leaky_relu_f32(
 
 static void ggml_compute_forward_leaky_relu(
         const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
         struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
     switch (src0->type) {
         case GGML_TYPE_F32:
             {
-                ggml_compute_forward_leaky_relu_f32(params, src0, dst);
+                ggml_compute_forward_leaky_relu_f32(params, dst);
             } break;
        default:
             {
@@ -9435,9 +9659,11 @@ static void ggml_compute_forward_leaky_relu(
 
 static void ggml_compute_forward_silu_back_f32(
         const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        const struct ggml_tensor * grad,
         struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+    const struct ggml_tensor * grad = dst->src[1];
+
     GGML_ASSERT(ggml_is_contiguous_except_dim_1(grad));
     GGML_ASSERT(ggml_is_contiguous_except_dim_1(src0));
     GGML_ASSERT(ggml_is_contiguous_except_dim_1(dst));
@@ -9480,13 +9706,14 @@ static void ggml_compute_forward_silu_back_f32(
 
 static void ggml_compute_forward_silu_back(
         const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        const struct ggml_tensor * grad,
         struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
     switch (src0->type) {
         case GGML_TYPE_F32:
             {
-                ggml_compute_forward_silu_back_f32(params, src0, grad, dst);
+                ggml_compute_forward_silu_back_f32(params, dst);
             } break;
         default:
             {
@@ -9498,8 +9725,10 @@ static void ggml_compute_forward_silu_back(
 
 static void ggml_compute_forward_hardswish_f32(
         const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
         struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
     assert(params->ith == 0);
     assert(ggml_are_same_shape(src0, dst));
 
@@ -9521,12 +9750,14 @@ static void ggml_compute_forward_hardswish_f32(
 }
 static void ggml_compute_forward_hardswish(
         const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
         struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
     switch (src0->type) {
         case GGML_TYPE_F32:
             {
-                ggml_compute_forward_hardswish_f32(params, src0, dst);
+                ggml_compute_forward_hardswish_f32(params, dst);
             } break;
         default:
             {
@@ -9537,8 +9768,10 @@ static void ggml_compute_forward_hardswish(
 
 static void ggml_compute_forward_hardsigmoid_f32(
         const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
         struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
     assert(params->ith == 0);
     assert(ggml_are_same_shape(src0, dst));
 
@@ -9561,12 +9794,14 @@ static void ggml_compute_forward_hardsigmoid_f32(
 
 static void ggml_compute_forward_hardsigmoid(
         const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
         struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
     switch (src0->type) {
         case GGML_TYPE_F32:
             {
-                ggml_compute_forward_hardsigmoid_f32(params, src0, dst);
+                ggml_compute_forward_hardsigmoid_f32(params, dst);
             } break;
         default:
             {
@@ -9580,8 +9815,10 @@ static void ggml_compute_forward_hardsigmoid(
 
 static void ggml_compute_forward_norm_f32(
         const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
         struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
     GGML_ASSERT(ggml_are_same_shape(src0, dst));
 
     if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
@@ -9633,12 +9870,14 @@ static void ggml_compute_forward_norm_f32(
 
 static void ggml_compute_forward_norm(
         const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
         struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
     switch (src0->type) {
         case GGML_TYPE_F32:
             {
-                ggml_compute_forward_norm_f32(params, src0, dst);
+                ggml_compute_forward_norm_f32(params, dst);
             } break;
         default:
             {
@@ -9651,8 +9890,10 @@ static void ggml_compute_forward_norm(
 
 static void ggml_compute_forward_rms_norm_f32(
         const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
         struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
     GGML_ASSERT(ggml_are_same_shape(src0, dst));
 
     if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
@@ -9701,12 +9942,14 @@ static void ggml_compute_forward_rms_norm_f32(
 
 static void ggml_compute_forward_rms_norm(
        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
         struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
     switch (src0->type) {
         case GGML_TYPE_F32:
             {
-                ggml_compute_forward_rms_norm_f32(params, src0, dst);
+                ggml_compute_forward_rms_norm_f32(params, dst);
             } break;
         default:
             {
@@ -9717,9 +9960,11 @@ static void ggml_compute_forward_rms_norm(
 
 static void ggml_compute_forward_rms_norm_back_f32(
         const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        const struct ggml_tensor * src1,
         struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+    const struct ggml_tensor * src1 = dst->src[1];
+
     GGML_ASSERT(ggml_are_same_shape(src0, dst) && ggml_are_same_shape(src0, src1));
 
     if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
@@ -9874,13 +10119,14 @@ static void ggml_compute_forward_rms_norm_back_f32(
 
 static void ggml_compute_forward_rms_norm_back(
        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        const struct ggml_tensor * src1,
         struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
     switch (src0->type) {
         case GGML_TYPE_F32:
             {
-                ggml_compute_forward_rms_norm_back_f32(params, src0, src1, dst);
+                ggml_compute_forward_rms_norm_back_f32(params, dst);
             } break;
         default:
             {
@@ -9893,8 +10139,10 @@ static void ggml_compute_forward_rms_norm_back(
 
 static void ggml_compute_forward_group_norm_f32(
         const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
         struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
     GGML_ASSERT(ggml_are_same_shape(src0, dst));
 
     if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
@@ -9965,12 +10213,14 @@ static void ggml_compute_forward_group_norm_f32(
 
 static void ggml_compute_forward_group_norm(
         const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
         struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
     switch (src0->type) {
         case GGML_TYPE_F32:
             {
-                ggml_compute_forward_group_norm_f32(params, src0, dst);
+                ggml_compute_forward_group_norm_f32(params, dst);
             } break;
         default:
             {
@@ -10016,9 +10266,11 @@ static bool ggml_compute_forward_mul_mat_use_blas(struct ggml_tensor * dst) {
10016
10266
 
10017
10267
  static void ggml_compute_forward_mul_mat(
10018
10268
  const struct ggml_compute_params * params,
10019
- const struct ggml_tensor * src0,
10020
- const struct ggml_tensor * src1,
10021
10269
  struct ggml_tensor * dst) {
10270
+
10271
+ const struct ggml_tensor * src0 = dst->src[0];
10272
+ const struct ggml_tensor * src1 = dst->src[1];
10273
+
10022
10274
  int64_t t0 = ggml_perf_time_us();
10023
10275
  UNUSED(t0);
10024
10276
 
@@ -10263,10 +10515,11 @@ static void ggml_compute_forward_mul_mat(
10263
10515
 
10264
10516
  static void ggml_compute_forward_mul_mat_id(
10265
10517
  const struct ggml_compute_params * params,
10266
- const struct ggml_tensor * ids,
10267
- const struct ggml_tensor * src1,
10268
10518
  struct ggml_tensor * dst) {
10269
10519
 
10520
+ const struct ggml_tensor * ids = dst->src[0];
10521
+ const struct ggml_tensor * src1 = dst->src[1];
10522
+
10270
10523
  const struct ggml_tensor * src0 = dst->src[2]; // only for GGML_TENSOR_BINARY_OP_LOCALS
10271
10524
 
10272
10525
  GGML_TENSOR_BINARY_OP_LOCALS
@@ -10457,9 +10710,11 @@ static void ggml_compute_forward_mul_mat_id(
10457
10710
 
10458
10711
  static void ggml_compute_forward_out_prod_f32(
10459
10712
  const struct ggml_compute_params * params,
10460
- const struct ggml_tensor * src0,
10461
- const struct ggml_tensor * src1,
10462
10713
  struct ggml_tensor * dst) {
10714
+
10715
+ const struct ggml_tensor * src0 = dst->src[0];
10716
+ const struct ggml_tensor * src1 = dst->src[1];
10717
+
10463
10718
  // int64_t t0 = ggml_perf_time_us();
10464
10719
  // UNUSED(t0);
10465
10720
 
@@ -10649,9 +10904,11 @@ static void ggml_compute_forward_out_prod_f32(
10649
10904
 
10650
10905
  static void ggml_compute_forward_out_prod_q_f32(
10651
10906
  const struct ggml_compute_params * params,
10652
- const struct ggml_tensor * src0,
10653
- const struct ggml_tensor * src1,
10654
10907
  struct ggml_tensor * dst) {
10908
+
10909
+ const struct ggml_tensor * src0 = dst->src[0];
10910
+ const struct ggml_tensor * src1 = dst->src[1];
10911
+
10655
10912
  // int64_t t0 = ggml_perf_time_us();
10656
10913
  // UNUSED(t0);
10657
10914
 
@@ -10762,9 +11019,10 @@ static void ggml_compute_forward_out_prod_q_f32(
10762
11019
 
10763
11020
  static void ggml_compute_forward_out_prod(
10764
11021
  const struct ggml_compute_params * params,
10765
- const struct ggml_tensor * src0,
10766
- const struct ggml_tensor * src1,
10767
11022
  struct ggml_tensor * dst) {
11023
+
11024
+ const struct ggml_tensor * src0 = dst->src[0];
11025
+
10768
11026
  switch (src0->type) {
10769
11027
  case GGML_TYPE_Q4_0:
10770
11028
  case GGML_TYPE_Q4_1:
@@ -10779,17 +11037,19 @@ static void ggml_compute_forward_out_prod(
10779
11037
  case GGML_TYPE_IQ2_XXS:
10780
11038
  case GGML_TYPE_IQ2_XS:
10781
11039
  case GGML_TYPE_IQ3_XXS:
11040
+ case GGML_TYPE_IQ1_S:
11041
+ case GGML_TYPE_IQ4_NL:
10782
11042
  {
10783
- ggml_compute_forward_out_prod_q_f32(params, src0, src1, dst);
11043
+ ggml_compute_forward_out_prod_q_f32(params, dst);
10784
11044
  } break;
10785
11045
  case GGML_TYPE_F16:
10786
11046
  {
10787
11047
  GGML_ASSERT(false); // todo
10788
- // ggml_compute_forward_out_prod_f16_f32(params, src0, src1, dst);
11048
+ // ggml_compute_forward_out_prod_f16_f32(params, dst);
10789
11049
  } break;
10790
11050
  case GGML_TYPE_F32:
10791
11051
  {
10792
- ggml_compute_forward_out_prod_f32(params, src0, src1, dst);
11052
+ ggml_compute_forward_out_prod_f32(params, dst);
10793
11053
  } break;
10794
11054
  default:
10795
11055
  {
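
Alongside the signature cleanup, the new GGML_TYPE_IQ1_S and GGML_TYPE_IQ4_NL quantization types are threaded through the per-op type switches: ops that handle quantized inputs (out_prod above, get_rows below) gain the two cases, while ops that do not support them (set, clamp) list the new cases with the other unsupported types so they reach the aborting default branch. A hedged sketch of how such a type's registration can be inspected, assuming the ggml_internal_get_type_traits accessor that ggml.h exposes for tests:

```c
#include "ggml.h"
#include <stdio.h>

int main(void) {
    // look up the dispatch-table entry for the 4-bit non-linear type
    const ggml_type_traits_t t = ggml_internal_get_type_traits(GGML_TYPE_IQ4_NL);
    printf("%s: block size %d, quantized: %d\n",
           t.type_name, t.blck_size, t.is_quantized ? 1 : 0);
    return 0;
}
```
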
@@ -10802,8 +11062,10 @@ static void ggml_compute_forward_out_prod(

  static void ggml_compute_forward_scale_f32(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * src0,
  struct ggml_tensor * dst) {
+
+ const struct ggml_tensor * src0 = dst->src[0];
+
  GGML_ASSERT(ggml_is_contiguous(src0));
  GGML_ASSERT(ggml_is_contiguous(dst));
  GGML_ASSERT(ggml_are_same_shape(src0, dst));
@@ -10844,12 +11106,14 @@ static void ggml_compute_forward_scale_f32(

  static void ggml_compute_forward_scale(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * src0,
  struct ggml_tensor * dst) {
+
+ const struct ggml_tensor * src0 = dst->src[0];
+
  switch (src0->type) {
  case GGML_TYPE_F32:
  {
- ggml_compute_forward_scale_f32(params, src0, dst);
+ ggml_compute_forward_scale_f32(params, dst);
  } break;
  default:
  {
@@ -10862,9 +11126,11 @@ static void ggml_compute_forward_scale(

  static void ggml_compute_forward_set_f32(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * src0,
- const struct ggml_tensor * src1,
  struct ggml_tensor * dst) {
+
+ const struct ggml_tensor * src0 = dst->src[0];
+ const struct ggml_tensor * src1 = dst->src[1];
+
  GGML_ASSERT(ggml_are_same_shape(src0, dst));
  GGML_ASSERT(ggml_is_contiguous(dst) && ggml_is_contiguous(src0));

@@ -10935,14 +11201,14 @@ static void ggml_compute_forward_set_f32(

  static void ggml_compute_forward_set(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * src0,
- const struct ggml_tensor * src1,
  struct ggml_tensor * dst) {

+ const struct ggml_tensor * src0 = dst->src[0];
+
  switch (src0->type) {
  case GGML_TYPE_F32:
  {
- ggml_compute_forward_set_f32(params, src0, src1, dst);
+ ggml_compute_forward_set_f32(params, dst);
  } break;
  case GGML_TYPE_F16:
  case GGML_TYPE_Q4_0:
@@ -10959,6 +11225,8 @@ static void ggml_compute_forward_set(
  case GGML_TYPE_IQ2_XXS:
  case GGML_TYPE_IQ2_XS:
  case GGML_TYPE_IQ3_XXS:
+ case GGML_TYPE_IQ1_S:
+ case GGML_TYPE_IQ4_NL:
  default:
  {
  GGML_ASSERT(false);
@@ -10970,29 +11238,25 @@ static void ggml_compute_forward_set(

  static void ggml_compute_forward_cpy(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * src0,
  struct ggml_tensor * dst) {
- ggml_compute_forward_dup(params, src0, dst);
+ ggml_compute_forward_dup(params, dst);
  }

  // ggml_compute_forward_cont

  static void ggml_compute_forward_cont(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * src0,
  struct ggml_tensor * dst) {
- ggml_compute_forward_dup(params, src0, dst);
+ ggml_compute_forward_dup(params, dst);
  }

  // ggml_compute_forward_reshape

  static void ggml_compute_forward_reshape(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * src0,
  struct ggml_tensor * dst) {
  // NOP
  UNUSED(params);
- UNUSED(src0);
  UNUSED(dst);
  }

@@ -11000,39 +11264,41 @@ static void ggml_compute_forward_reshape(

  static void ggml_compute_forward_view(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * src0) {
+ const struct ggml_tensor * dst) {
  // NOP
  UNUSED(params);
- UNUSED(src0);
+ UNUSED(dst);
  }

  // ggml_compute_forward_permute

  static void ggml_compute_forward_permute(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * src0) {
+ const struct ggml_tensor * dst) {
  // NOP
  UNUSED(params);
- UNUSED(src0);
+ UNUSED(dst);
  }

  // ggml_compute_forward_transpose

  static void ggml_compute_forward_transpose(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * src0) {
+ const struct ggml_tensor * dst) {
  // NOP
  UNUSED(params);
- UNUSED(src0);
+ UNUSED(dst);
  }

  // ggml_compute_forward_get_rows

  static void ggml_compute_forward_get_rows_q(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * src0,
- const struct ggml_tensor * src1,
  struct ggml_tensor * dst) {
+
+ const struct ggml_tensor * src0 = dst->src[0];
+ const struct ggml_tensor * src1 = dst->src[1];
+
  assert(params->ith == 0);

  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
@@ -11068,9 +11334,11 @@ static void ggml_compute_forward_get_rows_q(

  static void ggml_compute_forward_get_rows_f16(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * src0,
- const struct ggml_tensor * src1,
  struct ggml_tensor * dst) {
+
+ const struct ggml_tensor * src0 = dst->src[0];
+ const struct ggml_tensor * src1 = dst->src[1];
+
  assert(params->ith == 0);

  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
@@ -11103,9 +11371,11 @@ static void ggml_compute_forward_get_rows_f16(

  static void ggml_compute_forward_get_rows_f32(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * src0,
- const struct ggml_tensor * src1,
  struct ggml_tensor * dst) {
+
+ const struct ggml_tensor * src0 = dst->src[0];
+ const struct ggml_tensor * src1 = dst->src[1];
+
  assert(params->ith == 0);

  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
@@ -11138,9 +11408,10 @@ static void ggml_compute_forward_get_rows_f32(

  static void ggml_compute_forward_get_rows(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * src0,
- const struct ggml_tensor * src1,
  struct ggml_tensor * dst) {
+
+ const struct ggml_tensor * src0 = dst->src[0];
+
  switch (src0->type) {
  case GGML_TYPE_Q4_0:
  case GGML_TYPE_Q4_1:
@@ -11156,17 +11427,19 @@ static void ggml_compute_forward_get_rows(
  case GGML_TYPE_IQ2_XXS:
  case GGML_TYPE_IQ2_XS:
  case GGML_TYPE_IQ3_XXS:
+ case GGML_TYPE_IQ1_S:
+ case GGML_TYPE_IQ4_NL:
  {
- ggml_compute_forward_get_rows_q(params, src0, src1, dst);
+ ggml_compute_forward_get_rows_q(params, dst);
  } break;
  case GGML_TYPE_F16:
  {
- ggml_compute_forward_get_rows_f16(params, src0, src1, dst);
+ ggml_compute_forward_get_rows_f16(params, dst);
  } break;
  case GGML_TYPE_F32:
  case GGML_TYPE_I32:
  {
- ggml_compute_forward_get_rows_f32(params, src0, src1, dst);
+ ggml_compute_forward_get_rows_f32(params, dst);
  } break;
  default:
  {
@@ -11197,9 +11470,11 @@ static void ggml_compute_forward_get_rows(

  static void ggml_compute_forward_get_rows_back_f32_f16(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * src0,
- const struct ggml_tensor * src1,
  struct ggml_tensor * dst) {
+
+ const struct ggml_tensor * src0 = dst->src[0];
+ const struct ggml_tensor * src1 = dst->src[1];
+
  GGML_ASSERT(params->ith == 0);
  GGML_ASSERT(ggml_is_contiguous(dst));

@@ -11234,9 +11509,11 @@ static void ggml_compute_forward_get_rows_back_f32_f16(

  static void ggml_compute_forward_get_rows_back_f32(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * src0,
- const struct ggml_tensor * src1,
  struct ggml_tensor * dst) {
+
+ const struct ggml_tensor * src0 = dst->src[0];
+ const struct ggml_tensor * src1 = dst->src[1];
+
  GGML_ASSERT(params->ith == 0);
  GGML_ASSERT(ggml_is_contiguous(dst));

@@ -11271,17 +11548,18 @@ static void ggml_compute_forward_get_rows_back_f32(

  static void ggml_compute_forward_get_rows_back(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * src0,
- const struct ggml_tensor * src1,
  struct ggml_tensor * dst) {
+
+ const struct ggml_tensor * src0 = dst->src[0];
+
  switch (src0->type) {
  case GGML_TYPE_F16:
  {
- ggml_compute_forward_get_rows_back_f32_f16(params, src0, src1, dst);
+ ggml_compute_forward_get_rows_back_f32_f16(params, dst);
  } break;
  case GGML_TYPE_F32:
  {
- ggml_compute_forward_get_rows_back_f32(params, src0, src1, dst);
+ ggml_compute_forward_get_rows_back_f32(params, dst);
  } break;
  default:
  {
@@ -11312,8 +11590,10 @@ static void ggml_compute_forward_get_rows_back(

  static void ggml_compute_forward_diag_f32(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * src0,
  struct ggml_tensor * dst) {
+
+ const struct ggml_tensor * src0 = dst->src[0];
+
  GGML_ASSERT(params->ith == 0);

  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
@@ -11352,12 +11632,14 @@ static void ggml_compute_forward_diag_f32(

  static void ggml_compute_forward_diag(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * src0,
  struct ggml_tensor * dst) {
+
+ const struct ggml_tensor * src0 = dst->src[0];
+
  switch (src0->type) {
  case GGML_TYPE_F32:
  {
- ggml_compute_forward_diag_f32(params, src0, dst);
+ ggml_compute_forward_diag_f32(params, dst);
  } break;
  default:
  {
@@ -11370,10 +11652,11 @@ static void ggml_compute_forward_diag(

  static void ggml_compute_forward_diag_mask_f32(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * src0,
  struct ggml_tensor * dst,
  const float value) {

+ const struct ggml_tensor * src0 = dst->src[0];
+
  const int ith = params->ith;
  const int nth = params->nth;

@@ -11423,12 +11706,14 @@ static void ggml_compute_forward_diag_mask_f32(

  static void ggml_compute_forward_diag_mask_inf(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * src0,
  struct ggml_tensor * dst) {
+
+ const struct ggml_tensor * src0 = dst->src[0];
+
  switch (src0->type) {
  case GGML_TYPE_F32:
  {
- ggml_compute_forward_diag_mask_f32(params, src0, dst, -INFINITY);
+ ggml_compute_forward_diag_mask_f32(params, dst, -INFINITY);
  } break;
  default:
  {
@@ -11439,12 +11724,14 @@ static void ggml_compute_forward_diag_mask_inf(

  static void ggml_compute_forward_diag_mask_zero(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * src0,
  struct ggml_tensor * dst) {
+
+ const struct ggml_tensor * src0 = dst->src[0];
+
  switch (src0->type) {
  case GGML_TYPE_F32:
  {
- ggml_compute_forward_diag_mask_f32(params, src0, dst, 0);
+ ggml_compute_forward_diag_mask_f32(params, dst, 0);
  } break;
  default:
  {
@@ -11457,9 +11744,12 @@ static void ggml_compute_forward_diag_mask_zero(

  static void ggml_compute_forward_soft_max_f32(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * src0,
- const struct ggml_tensor * src1,
  struct ggml_tensor * dst) {
+
+ const struct ggml_tensor * src0 = dst->src[0];
+ const struct ggml_tensor * src1 = dst->src[1];
+ const struct ggml_tensor * src2 = dst->src[2];
+
  assert(ggml_is_contiguous(dst));
  assert(ggml_are_same_shape(src0, dst));

@@ -11467,16 +11757,29 @@ static void ggml_compute_forward_soft_max_f32(
  return;
  }

- float scale = 1.0f;
- memcpy(&scale, (float *) dst->op_params + 0, sizeof(float));
+ float scale = 1.0f;
+ float max_bias = 0.0f;
+
+ memcpy(&scale, (float *) dst->op_params + 0, sizeof(float));
+ memcpy(&max_bias, (float *) dst->op_params + 1, sizeof(float));

  // TODO: handle transposed/permuted matrices

  const int ith = params->ith;
  const int nth = params->nth;

+ GGML_TENSOR_UNARY_OP_LOCALS
+
  const int64_t ne11 = src1 ? src1->ne[1] : 1;

+ // TODO: is this supposed to be ceil instead of floor?
+ // https://huggingface.co/mosaicml/mpt-7b/blob/main/attention.py#L370
+ const uint32_t n_head_kv = ne02;
+ const uint32_t n_head_log2 = 1u << (uint32_t) floor(log2(n_head_kv));
+
+ const float m0 = powf(2.0f, -(max_bias ) / n_head_log2);
+ const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
+
  const int nc = src0->ne[0];
  const int nr = ggml_nrows(src0);

@@ -11489,6 +11792,9 @@ static void ggml_compute_forward_soft_max_f32(

  float * wp = (float *) params->wdata + (nc + CACHE_LINE_SIZE_F32) * ith;

+ // when max_bias <= 0.0f, src2 is not used and we default it to src0 to avoid branching
+ float * pos = src2 ? (float *) src2->data : src0->data;
+
  for (int i1 = ir0; i1 < ir1; i1++) {
  float * sp = (float *)((char *) src0->data + i1*src0->nb[1]);
  float * dp = (float *)((char *) dst->data + i1*dst->nb[1]);
@@ -11502,6 +11808,16 @@ static void ggml_compute_forward_soft_max_f32(
  ggml_vec_acc_f32(nc, wp, mp);
  }

+ // ALiBi bias
+ if (max_bias > 0.0f) {
+ const uint32_t h = (i1/ne01)%ne02; // head
+ const float slope = h < n_head_log2 ? powf(m0, h + 1) : powf(m1, 2*(h - n_head_log2) + 1);
+
+ for (int i = 0; i < nc; i++) {
+ wp[i] = wp[i] + slope*pos[i];
+ }
+ }
+
  #ifndef NDEBUG
  for (int i = 0; i < nc; ++i) {
  //printf("p[%d] = %f\n", i, p[i]);
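
The substantive change in the hunk above: ggml_compute_forward_soft_max_f32 now reads a second op parameter, max_bias, and when it is positive adds the ALiBi positional bias (positions in dst->src[2]) to the scaled logits before the max/exp/normalize steps. The per-head slope follows the usual ALiBi recipe: with n_head_log2 the largest power of two not exceeding the head count, head h gets slope m0^(h+1) when h < n_head_log2 and m1^(2(h-n_head_log2)+1) otherwise. A standalone sketch of just the slope computation, mirroring the constants above (not the full kernel):

```c
#include <math.h>

// ALiBi slope for attention head h out of n_head_kv heads (sketch)
static float alibi_slope(float max_bias, unsigned n_head_kv, unsigned h) {
    const unsigned n_head_log2 = 1u << (unsigned) floor(log2(n_head_kv));
    const float m0 = powf(2.0f, -(max_bias       ) / n_head_log2);
    const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
    return h < n_head_log2 ? powf(m0, h + 1)
                           : powf(m1, 2*(h - n_head_log2) + 1);
}
```

Fusing the bias into soft_max avoids materializing a separate bias tensor for each attention call.
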
@@ -11544,13 +11860,14 @@ static void ggml_compute_forward_soft_max_f32(

  static void ggml_compute_forward_soft_max(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * src0,
- const struct ggml_tensor * src1,
  struct ggml_tensor * dst) {
+
+ const struct ggml_tensor * src0 = dst->src[0];
+
  switch (src0->type) {
  case GGML_TYPE_F32:
  {
- ggml_compute_forward_soft_max_f32(params, src0, src1, dst);
+ ggml_compute_forward_soft_max_f32(params, dst);
  } break;
  default:
  {
@@ -11563,9 +11880,11 @@ static void ggml_compute_forward_soft_max(

  static void ggml_compute_forward_soft_max_back_f32(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * src0,
- const struct ggml_tensor * src1,
  struct ggml_tensor * dst) {
+
+ const struct ggml_tensor * src0 = dst->src[0];
+ const struct ggml_tensor * src1 = dst->src[1];
+
  GGML_ASSERT(ggml_is_contiguous(src0));
  GGML_ASSERT(ggml_is_contiguous(src1));
  GGML_ASSERT(ggml_is_contiguous(dst));
@@ -11640,13 +11959,14 @@ static void ggml_compute_forward_soft_max_back_f32(

  static void ggml_compute_forward_soft_max_back(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * src0,
- const struct ggml_tensor * src1,
  struct ggml_tensor * dst) {
+
+ const struct ggml_tensor * src0 = dst->src[0];
+
  switch (src0->type) {
  case GGML_TYPE_F32:
  {
- ggml_compute_forward_soft_max_back_f32(params, src0, src1, dst);
+ ggml_compute_forward_soft_max_back_f32(params, dst);
  } break;
  default:
  {
@@ -11659,8 +11979,10 @@ static void ggml_compute_forward_soft_max_back(

  static void ggml_compute_forward_alibi_f32(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * src0,
  struct ggml_tensor * dst) {
+
+ const struct ggml_tensor * src0 = dst->src[0];
+
  assert(params->ith == 0);

  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
@@ -11694,22 +12016,20 @@ static void ggml_compute_forward_alibi_f32(
  const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
  const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);

- for (int64_t i = 0; i < ne0; i++) {
- for (int64_t j = 0; j < ne1; j++) {
- for (int64_t k = 0; k < ne2_ne3; k++) {
- float * const src = (float *)((char *) src0->data + i*nb0 + j*nb1 + k*nb2);
- float * pdst = (float *)((char *) dst->data + i*nb0 + j*nb1 + k*nb2);
-
- // TODO: k*nb2 or k*nb3
-
- float m_k;
+ for (int64_t k = 0; k < ne2_ne3; k++) {
+ // TODO: k*nb2 or k*nb3
+ float m_k;

- if (k < n_heads_log2_floor) {
- m_k = powf(m0, k + 1);
- } else {
- m_k = powf(m1, 2 * (k - n_heads_log2_floor) + 1);
- }
+ if (k < n_heads_log2_floor) {
+ m_k = powf(m0, k + 1);
+ } else {
+ m_k = powf(m1, 2 * (k - n_heads_log2_floor) + 1);
+ }

+ for (int64_t i = 0; i < ne0; i++) {
+ for (int64_t j = 0; j < ne1; j++) {
+ float * const src = (float *)((char *) src0->data + i*nb0 + j*nb1 + k*nb2);
+ float * pdst = (float *)((char *) dst->data + i*nb0 + j*nb1 + k*nb2);
  pdst[0] = i * m_k + src[0];
  }
  }
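
The alibi kernels themselves are also reworked in the hunk above: previously the per-head slope m_k (two powf calls) was recomputed in the innermost loop for every element; the head loop over k is now outermost, so the slope is computed once per head. The same loop-invariant hoisting in miniature (hypothetical fixed shapes, purely to show the transformation):

```c
#include <math.h>

enum { NE0 = 8, NE1 = 4, NHEADS = 2 };

// before the change powf() ran NE0*NE1*NHEADS times; now it runs NHEADS times
static void add_bias(float dst[NHEADS][NE1][NE0], float m0) {
    for (int k = 0; k < NHEADS; k++) {
        const float m_k = powf(m0, k + 1); // hoisted: depends only on k
        for (int j = 0; j < NE1; j++) {
            for (int i = 0; i < NE0; i++) {
                dst[k][j][i] += i * m_k;
            }
        }
    }
}
```
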
@@ -11718,8 +12038,10 @@ static void ggml_compute_forward_alibi_f32(

  static void ggml_compute_forward_alibi_f16(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * src0,
  struct ggml_tensor * dst) {
+
+ const struct ggml_tensor * src0 = dst->src[0];
+
  assert(params->ith == 0);

  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
@@ -11754,21 +12076,20 @@ static void ggml_compute_forward_alibi_f16(
  const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
  const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);

- for (int i = 0; i < ne0; i++) {
- for (int j = 0; j < ne1; j++) {
- for (int k = 0; k < ne2_ne3; k++) {
- ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i*nb0 + j*nb1 + k*nb2);
- float * pdst = (float *)((char *) dst->data + i*nb0 + j*nb1 + k*nb2);
-
- // TODO: k*nb2 or k*nb3
+ for (int k = 0; k < ne2_ne3; k++) {
+ // TODO: k*nb2 or k*nb3
+ float m_k;

- float m_k;
+ if (k < n_heads_log2_floor) {
+ m_k = powf(m0, k + 1);
+ } else {
+ m_k = powf(m1, 2 * (k - n_heads_log2_floor) + 1);
+ }

- if (k < n_heads_log2_floor) {
- m_k = powf(m0, k + 1);
- } else {
- m_k = powf(m1, 2 * (k - n_heads_log2_floor) + 1);
- }
+ for (int i = 0; i < ne0; i++) {
+ for (int j = 0; j < ne1; j++) {
+ ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i*nb0 + j*nb1 + k*nb2);
+ float * pdst = (float *)((char *) dst->data + i*nb0 + j*nb1 + k*nb2);

  // we return F32
  pdst[0] = i * m_k + GGML_FP16_TO_FP32(src[0]);
@@ -11779,16 +12100,18 @@ static void ggml_compute_forward_alibi_f16(

  static void ggml_compute_forward_alibi(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * src0,
  struct ggml_tensor * dst) {
+
+ const struct ggml_tensor * src0 = dst->src[0];
+
  switch (src0->type) {
  case GGML_TYPE_F16:
  {
- ggml_compute_forward_alibi_f16(params, src0, dst);
+ ggml_compute_forward_alibi_f16(params, dst);
  } break;
  case GGML_TYPE_F32:
  {
- ggml_compute_forward_alibi_f32(params, src0, dst);
+ ggml_compute_forward_alibi_f32(params, dst);
  } break;
  case GGML_TYPE_Q4_0:
  case GGML_TYPE_Q4_1:
@@ -11804,6 +12127,8 @@ static void ggml_compute_forward_alibi(
  case GGML_TYPE_IQ2_XXS:
  case GGML_TYPE_IQ2_XS:
  case GGML_TYPE_IQ3_XXS:
+ case GGML_TYPE_IQ1_S:
+ case GGML_TYPE_IQ4_NL:
  case GGML_TYPE_Q8_K:
  case GGML_TYPE_I8:
  case GGML_TYPE_I16:
@@ -11819,8 +12144,10 @@ static void ggml_compute_forward_alibi(

  static void ggml_compute_forward_clamp_f32(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * src0,
  struct ggml_tensor * dst) {
+
+ const struct ggml_tensor * src0 = dst->src[0];
+
  assert(params->ith == 0);

  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
@@ -11859,12 +12186,14 @@ static void ggml_compute_forward_clamp_f32(

  static void ggml_compute_forward_clamp(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * src0,
  struct ggml_tensor * dst) {
+
+ const struct ggml_tensor * src0 = dst->src[0];
+
  switch (src0->type) {
  case GGML_TYPE_F32:
  {
- ggml_compute_forward_clamp_f32(params, src0, dst);
+ ggml_compute_forward_clamp_f32(params, dst);
  } break;
  case GGML_TYPE_F16:
  case GGML_TYPE_Q4_0:
@@ -11881,6 +12210,8 @@ static void ggml_compute_forward_clamp(
  case GGML_TYPE_IQ2_XXS:
  case GGML_TYPE_IQ2_XS:
  case GGML_TYPE_IQ3_XXS:
+ case GGML_TYPE_IQ1_S:
+ case GGML_TYPE_IQ4_NL:
  case GGML_TYPE_Q8_K:
  case GGML_TYPE_I8:
  case GGML_TYPE_I16:
@@ -11952,10 +12283,12 @@ GGML_CALL void ggml_rope_yarn_corr_dims(

  static void ggml_compute_forward_rope_f32(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * src0,
- const struct ggml_tensor * src1,
  struct ggml_tensor * dst,
  const bool forward) {
+
+ const struct ggml_tensor * src0 = dst->src[0];
+ const struct ggml_tensor * src1 = dst->src[1];
+
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
  return;
  }
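
Note that ggml_compute_forward_rope_f32/f16 keep their extra forward flag through the refactor: the forward and backward passes share one kernel because rotating a pair of values by -theta exactly undoes a rotation by theta. A minimal sketch of that identity (not the literal RoPE kernel, which additionally handles strides, frequency scaling and the YaRN corrections referenced above):

```c
#include <math.h>
#include <stdbool.h>

// rotate the pair (x0, x1) by theta; forward == false inverts the rotation
// by negating the sine term (cosine is even, so it is unchanged)
static void rotate_pair(float theta, bool forward, float * x0, float * x1) {
    const float s = sinf(theta) * (forward ? 1.0f : -1.0f);
    const float c = cosf(theta);
    const float v0 = *x0;
    const float v1 = *x1;
    *x0 = v0*c - v1*s;
    *x1 = v0*s + v1*c;
}
```
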
@@ -12128,10 +12461,12 @@ static void ggml_compute_forward_rope_f32(

  static void ggml_compute_forward_rope_f16(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * src0,
- const struct ggml_tensor * src1,
  struct ggml_tensor * dst,
  const bool forward) {
+
+ const struct ggml_tensor * src0 = dst->src[0];
+ const struct ggml_tensor * src1 = dst->src[1];
+
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
  return;
  }
@@ -12293,17 +12628,18 @@ static void ggml_compute_forward_rope_f16(

  static void ggml_compute_forward_rope(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * src0,
- const struct ggml_tensor * src1,
  struct ggml_tensor * dst) {
+
+ const struct ggml_tensor * src0 = dst->src[0];
+
  switch (src0->type) {
  case GGML_TYPE_F16:
  {
- ggml_compute_forward_rope_f16(params, src0, src1, dst, true);
+ ggml_compute_forward_rope_f16(params, dst, true);
  } break;
  case GGML_TYPE_F32:
  {
- ggml_compute_forward_rope_f32(params, src0, src1, dst, true);
+ ggml_compute_forward_rope_f32(params, dst, true);
  } break;
  default:
  {
@@ -12316,17 +12652,18 @@ static void ggml_compute_forward_rope(

  static void ggml_compute_forward_rope_back(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * src0,
- const struct ggml_tensor * src1,
  struct ggml_tensor * dst) {
+
+ const struct ggml_tensor * src0 = dst->src[0];
+
  switch (src0->type) {
  case GGML_TYPE_F16:
  {
- ggml_compute_forward_rope_f16(params, src0, src1, dst, false);
+ ggml_compute_forward_rope_f16(params, dst, false);
  } break;
  case GGML_TYPE_F32:
  {
- ggml_compute_forward_rope_f32(params, src0, src1, dst, false);
+ ggml_compute_forward_rope_f32(params, dst, false);
  } break;
  default:
  {
@@ -12339,9 +12676,11 @@ static void ggml_compute_forward_rope_back(

  static void ggml_compute_forward_conv_transpose_1d_f16_f32(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * src0,
- const struct ggml_tensor * src1,
  struct ggml_tensor * dst) {
+
+ const struct ggml_tensor * src0 = dst->src[0];
+ const struct ggml_tensor * src1 = dst->src[1];
+
  GGML_ASSERT(src0->type == GGML_TYPE_F16);
  GGML_ASSERT(src1->type == GGML_TYPE_F32);
  GGML_ASSERT( dst->type == GGML_TYPE_F32);
@@ -12436,9 +12775,11 @@ static void ggml_compute_forward_conv_transpose_1d_f16_f32(

  static void ggml_compute_forward_conv_transpose_1d_f32(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * src0,
- const struct ggml_tensor * src1,
  struct ggml_tensor * dst) {
+
+ const struct ggml_tensor * src0 = dst->src[0];
+ const struct ggml_tensor * src1 = dst->src[1];
+
  GGML_ASSERT(src0->type == GGML_TYPE_F32);
  GGML_ASSERT(src1->type == GGML_TYPE_F32);
  GGML_ASSERT( dst->type == GGML_TYPE_F32);
@@ -12533,17 +12874,18 @@ static void ggml_compute_forward_conv_transpose_1d_f32(

  static void ggml_compute_forward_conv_transpose_1d(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * src0,
- const struct ggml_tensor * src1,
  struct ggml_tensor * dst) {
+
+ const struct ggml_tensor * src0 = dst->src[0];
+
  switch (src0->type) {
  case GGML_TYPE_F16:
  {
- ggml_compute_forward_conv_transpose_1d_f16_f32(params, src0, src1, dst);
+ ggml_compute_forward_conv_transpose_1d_f16_f32(params, dst);
  } break;
  case GGML_TYPE_F32:
  {
- ggml_compute_forward_conv_transpose_1d_f32(params, src0, src1, dst);
+ ggml_compute_forward_conv_transpose_1d_f32(params, dst);
  } break;
  default:
  {
@@ -12557,9 +12899,11 @@ static void ggml_compute_forward_conv_transpose_1d(
  // dst: result [N, OH, OW, IC*KH*KW]
  static void ggml_compute_forward_im2col_f32(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * src0,
- const struct ggml_tensor * src1,
  struct ggml_tensor * dst) {
+
+ const struct ggml_tensor * src0 = dst->src[0];
+ const struct ggml_tensor * src1 = dst->src[1];
+
  GGML_ASSERT(src0->type == GGML_TYPE_F16);
  GGML_ASSERT(src1->type == GGML_TYPE_F32);
  GGML_ASSERT( dst->type == GGML_TYPE_F32);
@@ -12643,9 +12987,11 @@ static void ggml_compute_forward_im2col_f32(
  // dst: result [N, OH, OW, IC*KH*KW]
  static void ggml_compute_forward_im2col_f16(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * src0,
- const struct ggml_tensor * src1,
  struct ggml_tensor * dst) {
+
+ const struct ggml_tensor * src0 = dst->src[0];
+ const struct ggml_tensor * src1 = dst->src[1];
+
  GGML_ASSERT(src0->type == GGML_TYPE_F16);
  GGML_ASSERT(src1->type == GGML_TYPE_F32);
  GGML_ASSERT( dst->type == GGML_TYPE_F16);
@@ -12725,17 +13071,15 @@ static void ggml_compute_forward_im2col_f16(

  static void ggml_compute_forward_im2col(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * src0,
- const struct ggml_tensor * src1,
  struct ggml_tensor * dst) {
  switch (dst->type) {
  case GGML_TYPE_F16:
  {
- ggml_compute_forward_im2col_f16(params, src0, src1, dst);
+ ggml_compute_forward_im2col_f16(params, dst);
  } break;
  case GGML_TYPE_F32:
  {
- ggml_compute_forward_im2col_f32(params, src0, src1, dst);
+ ggml_compute_forward_im2col_f32(params, dst);
  } break;
  default:
  {
@@ -12749,9 +13093,11 @@ static void ggml_compute_forward_im2col(

  static void ggml_compute_forward_conv_transpose_2d(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * src0,
- const struct ggml_tensor * src1,
  struct ggml_tensor * dst) {
+
+ const struct ggml_tensor * src0 = dst->src[0];
+ const struct ggml_tensor * src1 = dst->src[1];
+
  GGML_ASSERT(src0->type == GGML_TYPE_F16);
  GGML_ASSERT(src1->type == GGML_TYPE_F32);
  GGML_ASSERT( dst->type == GGML_TYPE_F32);
@@ -12855,9 +13201,11 @@ static void ggml_compute_forward_conv_transpose_2d(
  static void ggml_compute_forward_pool_1d_sk_p0(
  const struct ggml_compute_params * params,
  const enum ggml_op_pool op,
- const struct ggml_tensor * src,
  const int k,
  struct ggml_tensor * dst) {
+
+ const struct ggml_tensor * src = dst->src[0];
+
  assert(src->type == GGML_TYPE_F32);
  assert(params->ith == 0);

@@ -12906,7 +13254,6 @@ static void ggml_compute_forward_pool_1d_sk_p0(

  static void ggml_compute_forward_pool_1d(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * src0,
  struct ggml_tensor * dst) {

  const int32_t * opts = (const int32_t *)dst->op_params;
@@ -12917,15 +13264,17 @@ static void ggml_compute_forward_pool_1d(
  GGML_ASSERT(p0 == 0); // padding not supported
  GGML_ASSERT(k0 == s0); // only s = k supported

- ggml_compute_forward_pool_1d_sk_p0(params, op, src0, k0, dst);
+ ggml_compute_forward_pool_1d_sk_p0(params, op, k0, dst);
  }

  // ggml_compute_forward_pool_2d

  static void ggml_compute_forward_pool_2d(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * src,
  struct ggml_tensor * dst) {
+
+ const struct ggml_tensor * src = dst->src[0];
+
  GGML_ASSERT(src->type == GGML_TYPE_F32);
  GGML_ASSERT(params->ith == 0);

@@ -12998,9 +13347,10 @@ static void ggml_compute_forward_pool_2d(

  static void ggml_compute_forward_upscale_f32(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * src0,
  struct ggml_tensor * dst) {

+ const struct ggml_tensor * src0 = dst->src[0];
+
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
  return;
  }
@@ -13037,12 +13387,14 @@ static void ggml_compute_forward_upscale_f32(

  static void ggml_compute_forward_upscale(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * src0,
  struct ggml_tensor * dst) {
+
+ const struct ggml_tensor * src0 = dst->src[0];
+
  switch (src0->type) {
  case GGML_TYPE_F32:
  {
- ggml_compute_forward_upscale_f32(params, src0, dst);
+ ggml_compute_forward_upscale_f32(params, dst);
  } break;
  default:
  {
@@ -13055,9 +13407,10 @@ static void ggml_compute_forward_upscale(

  static void ggml_compute_forward_pad_f32(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * src0,
  struct ggml_tensor * dst) {

+ const struct ggml_tensor * src0 = dst->src[0];
+
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
  return;
  }
@@ -13095,12 +13448,14 @@ static void ggml_compute_forward_pad_f32(

  static void ggml_compute_forward_pad(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * src0,
  struct ggml_tensor * dst) {
+
+ const struct ggml_tensor * src0 = dst->src[0];
+
  switch (src0->type) {
  case GGML_TYPE_F32:
  {
- ggml_compute_forward_pad_f32(params, src0, dst);
+ ggml_compute_forward_pad_f32(params, dst);
  } break;
  default:
  {
@@ -13113,9 +13468,10 @@ static void ggml_compute_forward_pad(

  static void ggml_compute_forward_argsort_f32(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * src0,
  struct ggml_tensor * dst) {

+ const struct ggml_tensor * src0 = dst->src[0];
+
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
  return;
  }
@@ -13155,13 +13511,14 @@ static void ggml_compute_forward_argsort_f32(

  static void ggml_compute_forward_argsort(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * src0,
  struct ggml_tensor * dst) {

+ const struct ggml_tensor * src0 = dst->src[0];
+
  switch (src0->type) {
  case GGML_TYPE_F32:
  {
- ggml_compute_forward_argsort_f32(params, src0, dst);
+ ggml_compute_forward_argsort_f32(params, dst);
  } break;
  default:
  {
@@ -13174,11 +13531,13 @@ static void ggml_compute_forward_argsort(

  static void ggml_compute_forward_flash_attn_f32(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * q,
- const struct ggml_tensor * k,
- const struct ggml_tensor * v,
  const bool masked,
  struct ggml_tensor * dst) {
+
+ const struct ggml_tensor * q = dst->src[0];
+ const struct ggml_tensor * k = dst->src[1];
+ const struct ggml_tensor * v = dst->src[2];
+
  int64_t t0 = ggml_perf_time_us();
  UNUSED(t0);

@@ -13364,11 +13723,13 @@ static void ggml_compute_forward_flash_attn_f32(

  static void ggml_compute_forward_flash_attn_f16(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * q,
- const struct ggml_tensor * k,
- const struct ggml_tensor * v,
  const bool masked,
  struct ggml_tensor * dst) {
+
+ const struct ggml_tensor * q = dst->src[0];
+ const struct ggml_tensor * k = dst->src[1];
+ const struct ggml_tensor * v = dst->src[2];
+
  int64_t t0 = ggml_perf_time_us();
  UNUSED(t0);

@@ -13590,19 +13951,19 @@ static void ggml_compute_forward_flash_attn_f16(

  static void ggml_compute_forward_flash_attn(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * q,
- const struct ggml_tensor * k,
- const struct ggml_tensor * v,
  const bool masked,
  struct ggml_tensor * dst) {
+
+ const struct ggml_tensor * q = dst->src[0];
+
  switch (q->type) {
  case GGML_TYPE_F16:
  {
- ggml_compute_forward_flash_attn_f16(params, q, k, v, masked, dst);
+ ggml_compute_forward_flash_attn_f16(params, masked, dst);
  } break;
  case GGML_TYPE_F32:
  {
- ggml_compute_forward_flash_attn_f32(params, q, k, v, masked, dst);
+ ggml_compute_forward_flash_attn_f32(params, masked, dst);
  } break;
  default:
  {
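
Multi-operand ops follow the same convention: flash_attn reads q, k and v from dst->src[0..2], flash_attn_back additionally takes the gradient d in dst->src[3], and flash_ff (next hunk) uses five slots. Dispatchers now fetch only the operand whose type drives the switch. A sketch of how a node's operands can be walked generically — GGML_MAX_SRC is the size of the src array in struct ggml_tensor, and unused slots are NULL:

```c
#include "ggml.h"

// count the operands attached to a graph node (sketch)
static int count_srcs(const struct ggml_tensor * node) {
    int n = 0;
    for (int i = 0; i < GGML_MAX_SRC; i++) {
        if (node->src[i] != NULL) {
            n++;
        }
    }
    return n;
}
```
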
@@ -13615,12 +13976,14 @@ static void ggml_compute_forward_flash_attn(
13615
13976
 
13616
13977
  static void ggml_compute_forward_flash_ff_f16(
13617
13978
  const struct ggml_compute_params * params,
13618
- const struct ggml_tensor * a, // F16
13619
- const struct ggml_tensor * b0, // F16 fc_w
13620
- const struct ggml_tensor * b1, // F32 fc_b
13621
- const struct ggml_tensor * c0, // F16 proj_w
13622
- const struct ggml_tensor * c1, // F32 proj_b
13623
13979
  struct ggml_tensor * dst) {
13980
+
13981
+ const struct ggml_tensor * a = dst->src[0]; // F16
13982
+ const struct ggml_tensor * b0 = dst->src[1]; // F16 fc_w
13983
+ const struct ggml_tensor * b1 = dst->src[2]; // F32 fc_b
13984
+ const struct ggml_tensor * c0 = dst->src[3]; // F16 proj_w
13985
+ const struct ggml_tensor * c1 = dst->src[4]; // F32 proj_b
13986
+
13624
13987
  int64_t t0 = ggml_perf_time_us();
13625
13988
  UNUSED(t0);
13626
13989
 
@@ -13748,16 +14111,14 @@ static void ggml_compute_forward_flash_ff_f16(
13748
14111
 
13749
14112
  static void ggml_compute_forward_flash_ff(
13750
14113
  const struct ggml_compute_params * params,
13751
- const struct ggml_tensor * a,
13752
- const struct ggml_tensor * b0,
13753
- const struct ggml_tensor * b1,
13754
- const struct ggml_tensor * c0,
13755
- const struct ggml_tensor * c1,
13756
14114
  struct ggml_tensor * dst) {
14115
+
14116
+ const struct ggml_tensor * b0 = dst->src[1];
14117
+
13757
14118
  switch (b0->type) {
13758
14119
  case GGML_TYPE_F16:
13759
14120
  {
13760
- ggml_compute_forward_flash_ff_f16(params, a, b0, b1, c0, c1, dst);
14121
+ ggml_compute_forward_flash_ff_f16(params, dst);
13761
14122
  } break;
13762
14123
  case GGML_TYPE_F32:
13763
14124
  {
@@ -13774,12 +14135,14 @@ static void ggml_compute_forward_flash_ff(
13774
14135
 
13775
14136
  static void ggml_compute_forward_flash_attn_back_f32(
13776
14137
  const struct ggml_compute_params * params,
13777
- const struct ggml_tensor * q,
13778
- const struct ggml_tensor * k,
13779
- const struct ggml_tensor * v,
13780
- const struct ggml_tensor * d,
13781
14138
  const bool masked,
13782
14139
  struct ggml_tensor * dst) {
14140
+
14141
+ const struct ggml_tensor * q = dst->src[0];
14142
+ const struct ggml_tensor * k = dst->src[1];
14143
+ const struct ggml_tensor * v = dst->src[2];
14144
+ const struct ggml_tensor * d = dst->src[3];
14145
+
13783
14146
  int64_t t0 = ggml_perf_time_us();
13784
14147
  UNUSED(t0);
13785
14148
 
@@ -14127,16 +14490,15 @@ static void ggml_compute_forward_flash_attn_back_f32(
14127
14490
 
14128
14491
  static void ggml_compute_forward_flash_attn_back(
14129
14492
  const struct ggml_compute_params * params,
14130
- const struct ggml_tensor * q,
14131
- const struct ggml_tensor * k,
14132
- const struct ggml_tensor * v,
14133
- const struct ggml_tensor * d,
14134
14493
  const bool masked,
14135
14494
  struct ggml_tensor * dst) {
14495
+
14496
+ const struct ggml_tensor * q = dst->src[0];
14497
+
14136
14498
  switch (q->type) {
14137
14499
  case GGML_TYPE_F32:
14138
14500
  {
14139
- ggml_compute_forward_flash_attn_back_f32(params, q, k, v, d, masked, dst);
14501
+ ggml_compute_forward_flash_attn_back_f32(params, masked, dst);
14140
14502
  } break;
14141
14503
  default:
14142
14504
  {
@@ -14149,8 +14511,10 @@ static void ggml_compute_forward_flash_attn_back(
14149
14511
 
14150
14512
  static void ggml_compute_forward_win_part_f32(
14151
14513
  const struct ggml_compute_params * params,
14152
- const struct ggml_tensor * src0,
14153
14514
  struct ggml_tensor * dst) {
14515
+
14516
+ const struct ggml_tensor * src0 = dst->src[0];
14517
+
14154
14518
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
14155
14519
  return;
14156
14520
  }
@@ -14193,12 +14557,14 @@ static void ggml_compute_forward_win_part_f32(
14193
14557
 
14194
14558
  static void ggml_compute_forward_win_part(
14195
14559
  const struct ggml_compute_params * params,
14196
- const struct ggml_tensor * src0,
14197
14560
  struct ggml_tensor * dst) {
14561
+
14562
+ const struct ggml_tensor * src0 = dst->src[0];
14563
+
14198
14564
  switch (src0->type) {
14199
14565
  case GGML_TYPE_F32:
14200
14566
  {
14201
- ggml_compute_forward_win_part_f32(params, src0, dst);
14567
+ ggml_compute_forward_win_part_f32(params, dst);
14202
14568
  } break;
14203
14569
  default:
14204
14570
  {
@@ -14211,8 +14577,10 @@ static void ggml_compute_forward_win_part(
14211
14577
 
14212
14578
  static void ggml_compute_forward_win_unpart_f32(
14213
14579
  const struct ggml_compute_params * params,
14214
- const struct ggml_tensor * src0,
14215
14580
  struct ggml_tensor * dst) {
14581
+
14582
+ const struct ggml_tensor * src0 = dst->src[0];
14583
+
14216
14584
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
14217
14585
  return;
14218
14586
  }
@@ -14253,12 +14621,14 @@ static void ggml_compute_forward_win_unpart_f32(
14253
14621
 
14254
14622
  static void ggml_compute_forward_win_unpart(
14255
14623
  const struct ggml_compute_params * params,
14256
- const struct ggml_tensor * src0,
14257
14624
  struct ggml_tensor * dst) {
14625
+
14626
+ const struct ggml_tensor * src0 = dst->src[0];
14627
+
14258
14628
  switch (src0->type) {
14259
14629
  case GGML_TYPE_F32:
14260
14630
  {
14261
- ggml_compute_forward_win_unpart_f32(params, src0, dst);
14631
+ ggml_compute_forward_win_unpart_f32(params, dst);
14262
14632
  } break;
14263
14633
  default:
14264
14634
  {
@@ -14271,58 +14641,58 @@ static void ggml_compute_forward_win_unpart(
14271
14641
 
14272
14642
  static void ggml_compute_forward_unary(
14273
14643
  const struct ggml_compute_params * params,
14274
- const struct ggml_tensor * src0,
14275
14644
  struct ggml_tensor * dst) {
14645
+
14276
14646
  const enum ggml_unary_op op = ggml_get_unary_op(dst);
14277
14647
 
14278
14648
  switch (op) {
14279
14649
  case GGML_UNARY_OP_ABS:
14280
14650
  {
14281
- ggml_compute_forward_abs(params, src0, dst);
14651
+ ggml_compute_forward_abs(params, dst);
14282
14652
  } break;
14283
14653
  case GGML_UNARY_OP_SGN:
14284
14654
  {
14285
- ggml_compute_forward_sgn(params, src0, dst);
14655
+ ggml_compute_forward_sgn(params, dst);
14286
14656
  } break;
14287
14657
  case GGML_UNARY_OP_NEG:
14288
14658
  {
14289
- ggml_compute_forward_neg(params, src0, dst);
14659
+ ggml_compute_forward_neg(params, dst);
14290
14660
  } break;
14291
14661
  case GGML_UNARY_OP_STEP:
14292
14662
  {
14293
- ggml_compute_forward_step(params, src0, dst);
14663
+ ggml_compute_forward_step(params, dst);
14294
14664
  } break;
14295
14665
  case GGML_UNARY_OP_TANH:
14296
14666
  {
14297
- ggml_compute_forward_tanh(params, src0, dst);
14667
+ ggml_compute_forward_tanh(params, dst);
14298
14668
  } break;
14299
14669
  case GGML_UNARY_OP_ELU:
14300
14670
  {
14301
- ggml_compute_forward_elu(params, src0, dst);
14671
+ ggml_compute_forward_elu(params, dst);
14302
14672
  } break;
14303
14673
  case GGML_UNARY_OP_RELU:
14304
14674
  {
14305
- ggml_compute_forward_relu(params, src0, dst);
14675
+ ggml_compute_forward_relu(params, dst);
14306
14676
  } break;
14307
14677
  case GGML_UNARY_OP_GELU:
14308
14678
  {
14309
- ggml_compute_forward_gelu(params, src0, dst);
14679
+ ggml_compute_forward_gelu(params, dst);
14310
14680
  } break;
14311
14681
  case GGML_UNARY_OP_GELU_QUICK:
14312
14682
  {
14313
- ggml_compute_forward_gelu_quick(params, src0, dst);
14683
+ ggml_compute_forward_gelu_quick(params, dst);
14314
14684
  } break;
14315
14685
  case GGML_UNARY_OP_SILU:
14316
14686
  {
14317
- ggml_compute_forward_silu(params, src0, dst);
14687
+ ggml_compute_forward_silu(params, dst);
14318
14688
  } break;
14319
14689
  case GGML_UNARY_OP_HARDSWISH:
14320
14690
  {
14321
- ggml_compute_forward_hardswish(params, src0, dst);
14691
+ ggml_compute_forward_hardswish(params, dst);
14322
14692
  } break;
14323
14693
  case GGML_UNARY_OP_HARDSIGMOID:
14324
14694
  {
14325
- ggml_compute_forward_hardsigmoid(params, src0, dst);
14695
+ ggml_compute_forward_hardsigmoid(params, dst);
14326
14696
  } break;
14327
14697
  default:
14328
14698
  {
@@ -14335,8 +14705,10 @@ static void ggml_compute_forward_unary(
14335
14705
 
14336
14706
  static void ggml_compute_forward_get_rel_pos_f16(
14337
14707
  const struct ggml_compute_params * params,
14338
- const struct ggml_tensor * src0,
14339
14708
  struct ggml_tensor * dst) {
14709
+
14710
+ const struct ggml_tensor * src0 = dst->src[0];
14711
+
14340
14712
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
14341
14713
  return;
14342
14714
  }
@@ -14362,12 +14734,14 @@ static void ggml_compute_forward_get_rel_pos_f16(
14362
14734
 
14363
14735
  static void ggml_compute_forward_get_rel_pos(
14364
14736
  const struct ggml_compute_params * params,
14365
- const struct ggml_tensor * src0,
14366
14737
  struct ggml_tensor * dst) {
14738
+
14739
+ const struct ggml_tensor * src0 = dst->src[0];
14740
+
14367
14741
  switch (src0->type) {
14368
14742
  case GGML_TYPE_F16:
14369
14743
  {
14370
- ggml_compute_forward_get_rel_pos_f16(params, src0, dst);
14744
+ ggml_compute_forward_get_rel_pos_f16(params, dst);
14371
14745
  } break;
14372
14746
  default:
14373
14747
  {
@@ -14380,11 +14754,12 @@ static void ggml_compute_forward_get_rel_pos(
14380
14754
 
14381
14755
  static void ggml_compute_forward_add_rel_pos_f32(
14382
14756
  const struct ggml_compute_params * params,
14383
- const struct ggml_tensor * src0,
14384
- const struct ggml_tensor * src1,
14385
- const struct ggml_tensor * src2,
14386
14757
  struct ggml_tensor * dst) {
14387
14758
 
14759
+ const struct ggml_tensor * src0 = dst->src[0];
14760
+ const struct ggml_tensor * src1 = dst->src[1];
14761
+ const struct ggml_tensor * src2 = dst->src[2];
14762
+
14388
14763
  const bool inplace = (bool) ((int32_t *) dst->op_params)[0];
14389
14764
  if (!inplace && params->type == GGML_TASK_INIT) {
14390
14765
  if (params->ith != 0) {
@@ -14448,14 +14823,14 @@ static void ggml_compute_forward_add_rel_pos_f32(

  static void ggml_compute_forward_add_rel_pos(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * src0,
- const struct ggml_tensor * src1,
- const struct ggml_tensor * src2,
  struct ggml_tensor * dst) {
+
+ const struct ggml_tensor * src0 = dst->src[0];
+
  switch (src0->type) {
  case GGML_TYPE_F32:
  {
- ggml_compute_forward_add_rel_pos_f32(params, src0, src1, src2, dst);
+ ggml_compute_forward_add_rel_pos_f32(params, dst);
  } break;
  default:
  {
@@ -14468,9 +14843,11 @@ static void ggml_compute_forward_add_rel_pos(

  static void ggml_compute_forward_map_unary_f32(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * src0,
  struct ggml_tensor * dst,
  const ggml_unary_op_f32_t fun) {
+
+ const struct ggml_tensor * src0 = dst->src[0];
+
  GGML_ASSERT(ggml_are_same_shape(src0, dst));

  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
@@ -14492,13 +14869,15 @@ static void ggml_compute_forward_map_unary_f32(

  static void ggml_compute_forward_map_unary(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * src0,
  struct ggml_tensor * dst,
  const ggml_unary_op_f32_t fun) {
+
+ const struct ggml_tensor * src0 = dst->src[0];
+
  switch (src0->type) {
  case GGML_TYPE_F32:
  {
- ggml_compute_forward_map_unary_f32(params, src0, dst, fun);
+ ggml_compute_forward_map_unary_f32(params, dst, fun);
  } break;
  default:
  {
@@ -14511,10 +14890,12 @@ static void ggml_compute_forward_map_unary(

  static void ggml_compute_forward_map_binary_f32(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * src0,
- const struct ggml_tensor * src1,
  struct ggml_tensor * dst,
  const ggml_binary_op_f32_t fun) {
+
+ const struct ggml_tensor * src0 = dst->src[0];
+ const struct ggml_tensor * src1 = dst->src[1];
+
  assert(params->ith == 0);
  assert(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));

@@ -14539,14 +14920,15 @@ static void ggml_compute_forward_map_binary_f32(

  static void ggml_compute_forward_map_binary(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * src0,
- const struct ggml_tensor * src1,
  struct ggml_tensor * dst,
  const ggml_binary_op_f32_t fun) {
+
+ const struct ggml_tensor * src0 = dst->src[0];
+
  switch (src0->type) {
  case GGML_TYPE_F32:
  {
- ggml_compute_forward_map_binary_f32(params, src0, src1, dst, fun);
+ ggml_compute_forward_map_binary_f32(params, dst, fun);
  } break;
  default:
  {
@@ -14559,9 +14941,11 @@ static void ggml_compute_forward_map_binary(

  static void ggml_compute_forward_map_custom1_f32(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * a,
  struct ggml_tensor * dst,
  const ggml_custom1_op_f32_t fun) {
+
+ const struct ggml_tensor * a = dst->src[0];
+
  assert(params->ith == 0);

  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
@@ -14575,10 +14959,12 @@ static void ggml_compute_forward_map_custom1_f32(

  static void ggml_compute_forward_map_custom2_f32(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * a,
- const struct ggml_tensor * b,
  struct ggml_tensor * dst,
  const ggml_custom2_op_f32_t fun) {
+
+ const struct ggml_tensor * a = dst->src[0];
+ const struct ggml_tensor * b = dst->src[1];
+
  assert(params->ith == 0);

  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
@@ -14592,11 +14978,13 @@ static void ggml_compute_forward_map_custom2_f32(

  static void ggml_compute_forward_map_custom3_f32(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * a,
- const struct ggml_tensor * b,
- const struct ggml_tensor * c,
  struct ggml_tensor * dst,
  const ggml_custom3_op_f32_t fun) {
+
+ const struct ggml_tensor * a = dst->src[0];
+ const struct ggml_tensor * b = dst->src[1];
+ const struct ggml_tensor * c = dst->src[2];
+
  assert(params->ith == 0);

  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
@@ -14610,8 +14998,10 @@ static void ggml_compute_forward_map_custom3_f32(

  static void ggml_compute_forward_map_custom1(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * a,
  struct ggml_tensor * dst) {
+
+ const struct ggml_tensor * a = dst->src[0];
+
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
  return;
  }
@@ -14625,9 +15015,11 @@ static void ggml_compute_forward_map_custom1(

  static void ggml_compute_forward_map_custom2(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * a,
- const struct ggml_tensor * b,
  struct ggml_tensor * dst) {
+
+ const struct ggml_tensor * a = dst->src[0];
+ const struct ggml_tensor * b = dst->src[1];
+
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
  return;
  }
@@ -14641,10 +15033,12 @@ static void ggml_compute_forward_map_custom2(

  static void ggml_compute_forward_map_custom3(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * a,
- const struct ggml_tensor * b,
- const struct ggml_tensor * c,
  struct ggml_tensor * dst) {
+
+ const struct ggml_tensor * a = dst->src[0];
+ const struct ggml_tensor * b = dst->src[1];
+ const struct ggml_tensor * c = dst->src[2];
+
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
  return;
  }
@@ -14658,9 +15052,11 @@ static void ggml_compute_forward_map_custom3(

  static void ggml_compute_forward_cross_entropy_loss_f32(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * src0,
- const struct ggml_tensor * src1,
  struct ggml_tensor * dst) {
+
+ const struct ggml_tensor * src0 = dst->src[0];
+ const struct ggml_tensor * src1 = dst->src[1];
+
  GGML_ASSERT(ggml_is_contiguous(src0));
  GGML_ASSERT(ggml_is_contiguous(src1));
  GGML_ASSERT(ggml_is_scalar(dst));
@@ -14764,13 +15160,14 @@ static void ggml_compute_forward_cross_entropy_loss_f32(

  static void ggml_compute_forward_cross_entropy_loss(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * src0,
- const struct ggml_tensor * src1,
  struct ggml_tensor * dst) {
+
+ const struct ggml_tensor * src0 = dst->src[0];
+
  switch (src0->type) {
  case GGML_TYPE_F32:
  {
- ggml_compute_forward_cross_entropy_loss_f32(params, src0, src1, dst);
+ ggml_compute_forward_cross_entropy_loss_f32(params, dst);
  } break;
  default:
  {
@@ -14783,10 +15180,12 @@ static void ggml_compute_forward_cross_entropy_loss(

  static void ggml_compute_forward_cross_entropy_loss_back_f32(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * src0,
- const struct ggml_tensor * src1,
- const struct ggml_tensor * opt0,
  struct ggml_tensor * dst) {
+
+ const struct ggml_tensor * src0 = dst->src[0];
+ const struct ggml_tensor * src1 = dst->src[1];
+ const struct ggml_tensor * opt0 = dst->src[2];
+
  GGML_ASSERT(ggml_is_contiguous(dst));
  GGML_ASSERT(ggml_is_contiguous(src0));
  GGML_ASSERT(ggml_is_contiguous(src1));
@@ -14873,14 +15272,14 @@ static void ggml_compute_forward_cross_entropy_loss_back_f32(

  static void ggml_compute_forward_cross_entropy_loss_back(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * src0,
- const struct ggml_tensor * src1,
- const struct ggml_tensor * opt0,
  struct ggml_tensor * dst) {
+
+ const struct ggml_tensor * src0 = dst->src[0];
+
  switch (src0->type) {
  case GGML_TYPE_F32:
  {
- ggml_compute_forward_cross_entropy_loss_back_f32(params, src0, src1, opt0, dst);
+ ggml_compute_forward_cross_entropy_loss_back_f32(params, dst);
  } break;
  default:
  {
@@ -14928,312 +15327,312 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
  switch (tensor->op) {
  case GGML_OP_DUP:
  {
- ggml_compute_forward_dup(params, tensor->src[0], tensor);
+ ggml_compute_forward_dup(params, tensor);
  } break;
  case GGML_OP_ADD:
  {
- ggml_compute_forward_add(params, tensor->src[0], tensor->src[1], tensor);
+ ggml_compute_forward_add(params, tensor);
  } break;
  case GGML_OP_ADD1:
  {
- ggml_compute_forward_add1(params, tensor->src[0], tensor->src[1], tensor);
+ ggml_compute_forward_add1(params, tensor);
  } break;
  case GGML_OP_ACC:
  {
- ggml_compute_forward_acc(params, tensor->src[0], tensor->src[1], tensor);
+ ggml_compute_forward_acc(params, tensor);
  } break;
  case GGML_OP_SUB:
  {
- ggml_compute_forward_sub(params, tensor->src[0], tensor->src[1], tensor);
+ ggml_compute_forward_sub(params, tensor);
  } break;
  case GGML_OP_MUL:
  {
- ggml_compute_forward_mul(params, tensor->src[0], tensor->src[1], tensor);
+ ggml_compute_forward_mul(params, tensor);
  } break;
  case GGML_OP_DIV:
  {
- ggml_compute_forward_div(params, tensor->src[0], tensor->src[1], tensor);
+ ggml_compute_forward_div(params, tensor);
  } break;
  case GGML_OP_SQR:
  {
- ggml_compute_forward_sqr(params, tensor->src[0], tensor);
+ ggml_compute_forward_sqr(params, tensor);
  } break;
  case GGML_OP_SQRT:
  {
- ggml_compute_forward_sqrt(params, tensor->src[0], tensor);
+ ggml_compute_forward_sqrt(params, tensor);
  } break;
  case GGML_OP_LOG:
  {
- ggml_compute_forward_log(params, tensor->src[0], tensor);
+ ggml_compute_forward_log(params, tensor);
  } break;
  case GGML_OP_SUM:
  {
- ggml_compute_forward_sum(params, tensor->src[0], tensor);
+ ggml_compute_forward_sum(params, tensor);
  } break;
  case GGML_OP_SUM_ROWS:
  {
- ggml_compute_forward_sum_rows(params, tensor->src[0], tensor);
+ ggml_compute_forward_sum_rows(params, tensor);
  } break;
  case GGML_OP_MEAN:
  {
- ggml_compute_forward_mean(params, tensor->src[0], tensor);
+ ggml_compute_forward_mean(params, tensor);
  } break;
  case GGML_OP_ARGMAX:
  {
- ggml_compute_forward_argmax(params, tensor->src[0], tensor);
+ ggml_compute_forward_argmax(params, tensor);
  } break;
  case GGML_OP_REPEAT:
  {
- ggml_compute_forward_repeat(params, tensor->src[0], tensor);
+ ggml_compute_forward_repeat(params, tensor);
  } break;
  case GGML_OP_REPEAT_BACK:
  {
- ggml_compute_forward_repeat_back(params, tensor->src[0], tensor);
+ ggml_compute_forward_repeat_back(params, tensor);
  } break;
  case GGML_OP_CONCAT:
  {
- ggml_compute_forward_concat(params, tensor->src[0], tensor->src[1], tensor);
+ ggml_compute_forward_concat(params, tensor);
  } break;
  case GGML_OP_SILU_BACK:
  {
- ggml_compute_forward_silu_back(params, tensor->src[0], tensor->src[1], tensor);
+ ggml_compute_forward_silu_back(params, tensor);
  } break;
  case GGML_OP_NORM:
  {
- ggml_compute_forward_norm(params, tensor->src[0], tensor);
+ ggml_compute_forward_norm(params, tensor);
  } break;
  case GGML_OP_RMS_NORM:
  {
- ggml_compute_forward_rms_norm(params, tensor->src[0], tensor);
+ ggml_compute_forward_rms_norm(params, tensor);
  } break;
  case GGML_OP_RMS_NORM_BACK:
  {
- ggml_compute_forward_rms_norm_back(params, tensor->src[0], tensor->src[1], tensor);
+ ggml_compute_forward_rms_norm_back(params, tensor);
  } break;
  case GGML_OP_GROUP_NORM:
  {
- ggml_compute_forward_group_norm(params, tensor->src[0], tensor);
+ ggml_compute_forward_group_norm(params, tensor);
  } break;
  case GGML_OP_MUL_MAT:
  {
- ggml_compute_forward_mul_mat(params, tensor->src[0], tensor->src[1], tensor);
+ ggml_compute_forward_mul_mat(params, tensor);
  } break;
  case GGML_OP_MUL_MAT_ID:
  {
- ggml_compute_forward_mul_mat_id(params, tensor->src[0], tensor->src[1], tensor);
+ ggml_compute_forward_mul_mat_id(params, tensor);
  } break;
  case GGML_OP_OUT_PROD:
  {
- ggml_compute_forward_out_prod(params, tensor->src[0], tensor->src[1], tensor);
+ ggml_compute_forward_out_prod(params, tensor);
  } break;
  case GGML_OP_SCALE:
  {
- ggml_compute_forward_scale(params, tensor->src[0], tensor);
+ ggml_compute_forward_scale(params, tensor);
  } break;
  case GGML_OP_SET:
  {
- ggml_compute_forward_set(params, tensor->src[0], tensor->src[1], tensor);
+ ggml_compute_forward_set(params, tensor);
  } break;
  case GGML_OP_CPY:
  {
- ggml_compute_forward_cpy(params, tensor->src[0], tensor);
+ ggml_compute_forward_cpy(params, tensor);
  } break;
  case GGML_OP_CONT:
  {
- ggml_compute_forward_cont(params, tensor->src[0], tensor);
+ ggml_compute_forward_cont(params, tensor);
  } break;
  case GGML_OP_RESHAPE:
  {
- ggml_compute_forward_reshape(params, tensor->src[0], tensor);
+ ggml_compute_forward_reshape(params, tensor);
  } break;
  case GGML_OP_VIEW:
  {
- ggml_compute_forward_view(params, tensor->src[0]);
+ ggml_compute_forward_view(params, tensor);
  } break;
  case GGML_OP_PERMUTE:
  {
- ggml_compute_forward_permute(params, tensor->src[0]);
+ ggml_compute_forward_permute(params, tensor);
  } break;
  case GGML_OP_TRANSPOSE:
  {
- ggml_compute_forward_transpose(params, tensor->src[0]);
+ ggml_compute_forward_transpose(params, tensor);
  } break;
  case GGML_OP_GET_ROWS:
  {
- ggml_compute_forward_get_rows(params, tensor->src[0], tensor->src[1], tensor);
+ ggml_compute_forward_get_rows(params, tensor);
  } break;
  case GGML_OP_GET_ROWS_BACK:
  {
- ggml_compute_forward_get_rows_back(params, tensor->src[0], tensor->src[1], tensor);
+ ggml_compute_forward_get_rows_back(params, tensor);
  } break;
  case GGML_OP_DIAG:
  {
- ggml_compute_forward_diag(params, tensor->src[0], tensor);
+ ggml_compute_forward_diag(params, tensor);
  } break;
  case GGML_OP_DIAG_MASK_INF:
  {
- ggml_compute_forward_diag_mask_inf(params, tensor->src[0], tensor);
+ ggml_compute_forward_diag_mask_inf(params, tensor);
  } break;
  case GGML_OP_DIAG_MASK_ZERO:
  {
- ggml_compute_forward_diag_mask_zero(params, tensor->src[0], tensor);
+ ggml_compute_forward_diag_mask_zero(params, tensor);
  } break;
  case GGML_OP_SOFT_MAX:
  {
- ggml_compute_forward_soft_max(params, tensor->src[0], tensor->src[1], tensor);
+ ggml_compute_forward_soft_max(params, tensor);
  } break;
  case GGML_OP_SOFT_MAX_BACK:
  {
- ggml_compute_forward_soft_max_back(params, tensor->src[0], tensor->src[1], tensor);
+ ggml_compute_forward_soft_max_back(params, tensor);
  } break;
  case GGML_OP_ROPE:
  {
- ggml_compute_forward_rope(params, tensor->src[0], tensor->src[1], tensor);
+ ggml_compute_forward_rope(params, tensor);
  } break;
  case GGML_OP_ROPE_BACK:
  {
- ggml_compute_forward_rope_back(params, tensor->src[0], tensor->src[1], tensor);
+ ggml_compute_forward_rope_back(params, tensor);
  } break;
  case GGML_OP_ALIBI:
  {
- ggml_compute_forward_alibi(params, tensor->src[0], tensor);
+ ggml_compute_forward_alibi(params, tensor);
  } break;
  case GGML_OP_CLAMP:
  {
- ggml_compute_forward_clamp(params, tensor->src[0], tensor);
+ ggml_compute_forward_clamp(params, tensor);
  } break;
  case GGML_OP_CONV_TRANSPOSE_1D:
  {
- ggml_compute_forward_conv_transpose_1d(params, tensor->src[0], tensor->src[1], tensor);
+ ggml_compute_forward_conv_transpose_1d(params, tensor);
  } break;
  case GGML_OP_IM2COL:
  {
- ggml_compute_forward_im2col(params, tensor->src[0], tensor->src[1], tensor);
+ ggml_compute_forward_im2col(params, tensor);
  } break;
  case GGML_OP_CONV_TRANSPOSE_2D:
  {
- ggml_compute_forward_conv_transpose_2d(params, tensor->src[0], tensor->src[1], tensor);
+ ggml_compute_forward_conv_transpose_2d(params, tensor);
  } break;
  case GGML_OP_POOL_1D:
  {
- ggml_compute_forward_pool_1d(params, tensor->src[0], tensor);
+ ggml_compute_forward_pool_1d(params, tensor);
  } break;
  case GGML_OP_POOL_2D:
  {
- ggml_compute_forward_pool_2d(params, tensor->src[0], tensor);
+ ggml_compute_forward_pool_2d(params, tensor);
  } break;
  case GGML_OP_UPSCALE:
  {
- ggml_compute_forward_upscale(params, tensor->src[0], tensor);
+ ggml_compute_forward_upscale(params, tensor);
  } break;
  case GGML_OP_PAD:
  {
- ggml_compute_forward_pad(params, tensor->src[0], tensor);
+ ggml_compute_forward_pad(params, tensor);
  } break;
  case GGML_OP_ARGSORT:
  {
- ggml_compute_forward_argsort(params, tensor->src[0], tensor);
+ ggml_compute_forward_argsort(params, tensor);
  } break;
  case GGML_OP_LEAKY_RELU:
  {
- ggml_compute_forward_leaky_relu(params, tensor->src[0], tensor);
+ ggml_compute_forward_leaky_relu(params, tensor);
  } break;
  case GGML_OP_FLASH_ATTN:
  {
  const int32_t t = ggml_get_op_params_i32(tensor, 0);
  GGML_ASSERT(t == 0 || t == 1);
  const bool masked = t != 0;
- ggml_compute_forward_flash_attn(params, tensor->src[0], tensor->src[1], tensor->src[2], masked, tensor);
+ ggml_compute_forward_flash_attn(params, masked, tensor);
  } break;
  case GGML_OP_FLASH_FF:
  {
- ggml_compute_forward_flash_ff(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor->src[3], tensor->src[4], tensor);
+ ggml_compute_forward_flash_ff(params, tensor);
  } break;
  case GGML_OP_FLASH_ATTN_BACK:
  {
  int32_t t = ggml_get_op_params_i32(tensor, 0);
  GGML_ASSERT(t == 0 || t == 1);
  bool masked = t != 0;
- ggml_compute_forward_flash_attn_back(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor->src[3], masked, tensor);
+ ggml_compute_forward_flash_attn_back(params, masked, tensor);
  } break;
  case GGML_OP_WIN_PART:
  {
- ggml_compute_forward_win_part(params, tensor->src[0], tensor);
+ ggml_compute_forward_win_part(params, tensor);
  } break;
  case GGML_OP_WIN_UNPART:
  {
- ggml_compute_forward_win_unpart(params, tensor->src[0], tensor);
+ ggml_compute_forward_win_unpart(params, tensor);
  } break;
  case GGML_OP_UNARY:
  {
- ggml_compute_forward_unary(params, tensor->src[0], tensor);
+ ggml_compute_forward_unary(params, tensor);
  } break;
  case GGML_OP_GET_REL_POS:
  {
- ggml_compute_forward_get_rel_pos(params, tensor->src[0], tensor);
+ ggml_compute_forward_get_rel_pos(params, tensor);
  } break;
  case GGML_OP_ADD_REL_POS:
  {
- ggml_compute_forward_add_rel_pos(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor);
+ ggml_compute_forward_add_rel_pos(params, tensor);
  } break;
  case GGML_OP_MAP_UNARY:
  {
  ggml_unary_op_f32_t fun;
  memcpy(&fun, tensor->op_params, sizeof(fun));
- ggml_compute_forward_map_unary(params, tensor->src[0], tensor, fun);
+ ggml_compute_forward_map_unary(params, tensor, fun);
  }
  break;
  case GGML_OP_MAP_BINARY:
  {
  ggml_binary_op_f32_t fun;
  memcpy(&fun, tensor->op_params, sizeof(fun));
- ggml_compute_forward_map_binary(params, tensor->src[0], tensor->src[1], tensor, fun);
+ ggml_compute_forward_map_binary(params, tensor, fun);
  }
  break;
  case GGML_OP_MAP_CUSTOM1_F32:
  {
  ggml_custom1_op_f32_t fun;
  memcpy(&fun, tensor->op_params, sizeof(fun));
- ggml_compute_forward_map_custom1_f32(params, tensor->src[0], tensor, fun);
+ ggml_compute_forward_map_custom1_f32(params, tensor, fun);
  }
  break;
  case GGML_OP_MAP_CUSTOM2_F32:
  {
  ggml_custom2_op_f32_t fun;
  memcpy(&fun, tensor->op_params, sizeof(fun));
- ggml_compute_forward_map_custom2_f32(params, tensor->src[0], tensor->src[1], tensor, fun);
+ ggml_compute_forward_map_custom2_f32(params, tensor, fun);
  }
  break;
  case GGML_OP_MAP_CUSTOM3_F32:
  {
  ggml_custom3_op_f32_t fun;
  memcpy(&fun, tensor->op_params, sizeof(fun));
- ggml_compute_forward_map_custom3_f32(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor, fun);
+ ggml_compute_forward_map_custom3_f32(params, tensor, fun);
  }
  break;
  case GGML_OP_MAP_CUSTOM1:
  {
- ggml_compute_forward_map_custom1(params, tensor->src[0], tensor);
+ ggml_compute_forward_map_custom1(params, tensor);
  }
  break;
  case GGML_OP_MAP_CUSTOM2:
  {
- ggml_compute_forward_map_custom2(params, tensor->src[0], tensor->src[1], tensor);
+ ggml_compute_forward_map_custom2(params, tensor);
  }
  break;
  case GGML_OP_MAP_CUSTOM3:
  {
- ggml_compute_forward_map_custom3(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor);
+ ggml_compute_forward_map_custom3(params, tensor);
  }
  break;
  case GGML_OP_CROSS_ENTROPY_LOSS:
  {
- ggml_compute_forward_cross_entropy_loss(params, tensor->src[0], tensor->src[1], tensor);
+ ggml_compute_forward_cross_entropy_loss(params, tensor);
  }
  break;
  case GGML_OP_CROSS_ENTROPY_LOSS_BACK:
  {
- ggml_compute_forward_cross_entropy_loss_back(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor);
+ ggml_compute_forward_cross_entropy_loss_back(params, tensor);
  }
  break;
  case GGML_OP_NONE:
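The GGML_OP_MAP_* cases also show why the user callback never needed to be a separate argument: the function pointer is serialized into the node's op_params bytes with memcpy and recovered the same way. A standalone sketch of that round trip (plain C with local stand-ins, not the real ggml structures):

    #include <stdio.h>
    #include <string.h>

    typedef float (*unary_f32_t)(float);

    static float twice(float x) { return 2.0f * x; }

    int main(void) {
        unsigned char op_params[32] = {0};  /* stand-in for tensor->op_params */

        /* store: memcpy avoids casting a function pointer through void *,
           which plain C does not guarantee to be round-trippable */
        unary_f32_t in = twice;
        memcpy(op_params, &in, sizeof(in));

        /* load: mirrors what the GGML_OP_MAP_UNARY case does */
        unary_f32_t out;
        memcpy(&out, op_params, sizeof(out));

        printf("%f\n", (double) out(21.0f));  /* prints 42.0 */
        return 0;
    }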
@@ -16637,27 +17036,47 @@ typedef pthread_t ggml_thread_t;
  #endif

  // Android's libc implementation "bionic" does not support setting affinity
- #if defined(__linux__) && !defined(__BIONIC__)
- static void set_numa_thread_affinity(int thread_n, int n_threads) {
+ #if defined(__gnu_linux__)
+ static void set_numa_thread_affinity(int thread_n) {
  if (!ggml_is_numa()) {
  return;
  }

- // run thread on node_num thread_n / (threads per node)
- const int node_num = thread_n / ((n_threads + g_state.numa.n_nodes - 1) / g_state.numa.n_nodes);
- struct ggml_numa_node * node = &g_state.numa.nodes[node_num];
+ int node_num;
+ int rv;
  size_t setsize = CPU_ALLOC_SIZE(g_state.numa.total_cpus);

+ switch(g_state.numa.numa_strategy) {
+ case GGML_NUMA_STRATEGY_DISTRIBUTE:
+ // spread threads over all nodes, round-robin by thread index
+ node_num = thread_n % g_state.numa.n_nodes;
+ break;
+ case GGML_NUMA_STRATEGY_ISOLATE:
+ // run thread on current_node
+ node_num = g_state.numa.current_node;
+ break;
+ case GGML_NUMA_STRATEGY_NUMACTL:
+ // use the cpuset that numactl gave us
+ rv = pthread_setaffinity_np(pthread_self(), setsize, &g_state.numa.cpuset);
+ if (rv) {
+ fprintf(stderr, "warning: pthread_setaffinity_np() failed: %s\n", strerror(rv));
+ }
+ return;
+ default:
+ return;
+ }
+
+ struct ggml_numa_node * node = &g_state.numa.nodes[node_num];
+
  cpu_set_t * cpus = CPU_ALLOC(g_state.numa.total_cpus);
  CPU_ZERO_S(setsize, cpus);
  for (size_t i = 0; i < node->n_cpus; ++i) {
  CPU_SET_S(node->cpus[i], setsize, cpus);
  }

- int rv = pthread_setaffinity_np(pthread_self(), setsize, cpus);
+ rv = pthread_setaffinity_np(pthread_self(), setsize, cpus);
  if (rv) {
- fprintf(stderr, "warning: pthread_setaffinity_np() failed: %s\n",
- strerror(rv));
+ fprintf(stderr, "warning: pthread_setaffinity_np() failed: %s\n", strerror(rv));
  }

  CPU_FREE(cpus);
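Under GGML_NUMA_STRATEGY_DISTRIBUTE the pinning reduces to thread_n % n_nodes, so consecutive worker threads land on alternating nodes. A throwaway check of the mapping, assuming a two-node machine rather than querying the OS:

    #include <stdio.h>

    int main(void) {
        const int n_nodes = 2;  /* assumed node count */
        for (int thread_n = 0; thread_n < 8; ++thread_n) {
            printf("thread %d -> node %d\n", thread_n, thread_n % n_nodes);
        }
        return 0;               /* nodes alternate 0, 1, 0, 1, ... */
    }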
@@ -16678,8 +17097,7 @@ static void clear_numa_thread_affinity(void) {

  int rv = pthread_setaffinity_np(pthread_self(), setsize, cpus);
  if (rv) {
- fprintf(stderr, "warning: pthread_setaffinity_np() failed: %s\n",
- strerror(rv));
+ fprintf(stderr, "warning: pthread_setaffinity_np() failed: %s\n", strerror(rv));
  }

  CPU_FREE(cpus);
@@ -16687,7 +17105,7 @@ static void clear_numa_thread_affinity(void) {
  #else
  // TODO: Windows etc.
  // (the linux implementation may also work on BSD, someone should test)
- static void set_numa_thread_affinity(int thread_n, int n_threads) { UNUSED(thread_n); UNUSED(n_threads); }
+ static void set_numa_thread_affinity(int thread_n) { UNUSED(thread_n); }
  static void clear_numa_thread_affinity(void) {}
  #endif

@@ -16987,7 +17405,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {

  const int n_threads = state->shared->n_threads;

- set_numa_thread_affinity(state->ith, n_threads);
+ set_numa_thread_affinity(state->ith);

  int node_n = -1;
  int task_phase = GGML_TASK_FINALIZE;
@@ -17793,7 +18211,7 @@ struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context *

  ptr += ggml_nbytes(tensor);

- fprintf(stderr, "%s: loaded leaf %d: '%16s', %9zu bytes\n", __func__, i, tensor->name, ggml_nbytes(tensor));
+ fprintf(stderr, "%s: loaded leaf %u: '%16s', %9zu bytes\n", __func__, i, tensor->name, ggml_nbytes(tensor));
  }
  }

@@ -17896,7 +18314,7 @@ struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context *

  result->nodes[i] = tensor;

- fprintf(stderr, "%s: loaded node %d: '%16s', %9zu bytes\n", __func__, i, tensor->name, ggml_nbytes(tensor));
+ fprintf(stderr, "%s: loaded node %u: '%16s', %9zu bytes\n", __func__, i, tensor->name, ggml_nbytes(tensor));
  }
  }
  }
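Both fprintf changes look like -Wformat fixes: the loop index appears to be an unsigned integer, and printing one with %d is undefined once the value exceeds INT_MAX. A minimal reproduction of the mismatch (hypothetical; build with -Wall to see the warning on the commented line):

    #include <stdio.h>

    int main(void) {
        for (unsigned i = 0; i < 3; ++i) {
            /* printf("leaf %d\n", i);  -- -Wformat: %d expects int, got unsigned */
            printf("leaf %u\n", i);     /* matching conversion, always defined */
        }
        return 0;
    }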
@@ -18521,7 +18939,9 @@ static enum ggml_opt_result linesearch_backtracking(
  (*step) *= width;
  }

- GGML_UNREACHABLE();
+ GGML_ASSERT(false && "line search failed");
+
+ return GGML_LINESEARCH_FAIL;
  }

  static enum ggml_opt_result ggml_opt_lbfgs(
@@ -18789,7 +19209,9 @@ static enum ggml_opt_result ggml_opt_lbfgs(
  step[0] = 1.0;
  }

- GGML_UNREACHABLE();
+ GGML_ASSERT(false && "lbfgs failed");
+
+ return GGML_OPT_DID_NOT_CONVERGE;
  }

  struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type) {
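Both optimizer loops get the same treatment: the unreachable marker becomes an assertion plus an explicit error return, so a build whose assertions compile out still leaves the function with defined behavior instead of running off the end of a non-void function. The pattern in isolation, with standard assert standing in for GGML_ASSERT:

    #include <assert.h>
    #include <stdio.h>

    enum result { RESULT_OK, RESULT_FAIL };

    static enum result search(int max_iters) {
        for (int i = 0; i < max_iters; ++i) {
            if (i == 3) {
                return RESULT_OK;   /* the normal exit path */
            }
        }
        assert(!"search failed");   /* loud in debug builds */
        return RESULT_FAIL;         /* still defined when NDEBUG removes the assert */
    }

    int main(void) {
        printf("%d\n", (int) search(10));  /* 0: found on iteration 3 */
        return 0;
    }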
@@ -19037,8 +19459,9 @@ void ggml_quantize_init(enum ggml_type type) {
  ggml_critical_section_start();

  switch (type) {
- case GGML_TYPE_IQ2_XXS: iq2xs_init_impl(256); break;
- case GGML_TYPE_IQ2_XS: iq2xs_init_impl(512); break;
+ case GGML_TYPE_IQ2_XXS:
+ case GGML_TYPE_IQ2_XS:
+ case GGML_TYPE_IQ1_S: iq2xs_init_impl(type); break;
  case GGML_TYPE_IQ3_XXS: iq3xs_init_impl(256); break;
  default: // nothing
  break;
@@ -19050,8 +19473,10 @@ void ggml_quantize_init(enum ggml_type type) {
  void ggml_quantize_free(void) {
  ggml_critical_section_start();

- iq2xs_free_impl(256);
- iq2xs_free_impl(512);
+ iq2xs_free_impl(GGML_TYPE_IQ2_XXS);
+ iq2xs_free_impl(GGML_TYPE_IQ2_XS);
+ iq2xs_free_impl(GGML_TYPE_IQ1_S);
+ iq3xs_free_impl(256);

  ggml_critical_section_end();
  }
@@ -19186,7 +19611,8 @@ size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t *
  bool ggml_quantize_requires_imatrix(enum ggml_type type) {
  return
  type == GGML_TYPE_IQ2_XXS ||
- type == GGML_TYPE_IQ2_XS;
+ type == GGML_TYPE_IQ2_XS ||
+ type == GGML_TYPE_IQ1_S;
  }

  size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start,
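ggml_quantize_requires_imatrix() is the caller-side gate for these data-dependent formats, and IQ1_S now joins the list. A sketch of how a conversion loop might consult it, with local stand-ins for the real ggml types and functions:

    #include <stdbool.h>
    #include <stddef.h>
    #include <stdio.h>

    /* toy stand-ins for ggml_type / ggml_quantize_requires_imatrix */
    enum toy_type { TOY_IQ2_XXS, TOY_IQ2_XS, TOY_IQ1_S, TOY_Q8_0 };

    static bool toy_requires_imatrix(enum toy_type t) {
        return t == TOY_IQ2_XXS ||
               t == TOY_IQ2_XS  ||
               t == TOY_IQ1_S;        /* mirrors the updated list */
    }

    int main(void) {
        const float * imatrix = NULL; /* no importance data loaded */

        if (toy_requires_imatrix(TOY_IQ1_S) && imatrix == NULL) {
            fprintf(stderr, "iq1_s needs an importance matrix, refusing to quantize\n");
            return 1;
        }
        /* ... a real caller would invoke ggml_quantize_chunk() here ... */
        return 0;
    }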
@@ -19311,6 +19737,24 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i
  result = quantize_iq3_xxs(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
  GGML_ASSERT(result == row_size * nrows);
  } break;
+ case GGML_TYPE_IQ1_S:
+ {
+ GGML_ASSERT(start % QK_K == 0);
+ GGML_ASSERT(start % n_per_row == 0);
+ size_t start_row = start / n_per_row;
+ size_t row_size = ggml_row_size(type, n_per_row);
+ result = quantize_iq1_s(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
+ GGML_ASSERT(result == row_size * nrows);
+ } break;
+ case GGML_TYPE_IQ4_NL:
+ {
+ GGML_ASSERT(start % QK4_NL == 0);
+ GGML_ASSERT(start % n_per_row == 0);
+ size_t start_row = start / n_per_row;
+ size_t row_size = ggml_row_size(type, n_per_row);
+ result = quantize_iq4_nl(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
+ GGML_ASSERT(result == row_size * nrows);
+ } break;
  case GGML_TYPE_F16:
  {
  size_t elemsize = sizeof(ggml_fp16_t);
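The two new cases share the same bookkeeping as the other block-quantized types: start is an element offset, so the destination row is start / n_per_row and the byte offset into dst is that row count times the per-row byte size. A standalone check of the arithmetic, with made-up sizes in place of ggml_row_size():

    #include <assert.h>
    #include <stddef.h>
    #include <stdio.h>

    int main(void) {
        const size_t n_per_row = 256;   /* assumed elements per row */
        const size_t row_size  = 34;    /* assumed quantized bytes per row */
        const size_t start     = 512;   /* element offset: two full rows in */

        assert(start % n_per_row == 0); /* same precondition the new cases assert */

        const size_t start_row = start / n_per_row;
        printf("row %zu, byte offset %zu\n", start_row, start_row * row_size);
        return 0;                       /* row 2, byte offset 68 */
    }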