llama_cpp 0.12.6 → 0.13.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -23,6 +23,9 @@
23
23
  #include <limits.h>
24
24
  #include <stdarg.h>
25
25
  #include <signal.h>
26
+ #if defined(__gnu_linux__)
27
+ #include <syscall.h>
28
+ #endif
26
29
 
27
30
  #ifdef GGML_USE_METAL
28
31
  #include <unistd.h>
@@ -270,6 +273,8 @@ inline static void * ggml_calloc(size_t num, size_t size) {
270
273
  #include <Accelerate/Accelerate.h>
271
274
  #if defined(GGML_USE_CLBLAST) // allow usage of CLBlast alongside Accelerate functions
272
275
  #include "ggml-opencl.h"
276
+ #elif defined(GGML_USE_VULKAN)
277
+ #include "ggml-vulkan.h"
273
278
  #endif
274
279
  #elif defined(GGML_USE_OPENBLAS)
275
280
  #if defined(GGML_BLAS_USE_MKL)
@@ -318,7 +323,7 @@ float ggml_table_f32_f16[1 << 16];
318
323
  // note: do not use these inside ggml.c
319
324
  // these are meant to be used via the ggml.h API
320
325
  float ggml_fp16_to_fp32(ggml_fp16_t x) {
321
- return (float) GGML_FP16_TO_FP32(x);
326
+ return GGML_FP16_TO_FP32(x);
322
327
  }
323
328
 
324
329
  ggml_fp16_t ggml_fp32_to_fp16(float x) {
@@ -350,6 +355,10 @@ void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int n) {
350
355
  }
351
356
  }
352
357
 
358
+ bool ggml_guid_matches(ggml_guid_t guid_a, ggml_guid_t guid_b) {
359
+ return memcmp(guid_a, guid_b, sizeof(ggml_guid)) == 0;
360
+ }
361
+
353
362
  //
354
363
  // timing
355
364
  //
@@ -673,6 +682,74 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
673
682
  .vec_dot_type = GGML_TYPE_Q8_K,
674
683
  .nrows = 1,
675
684
  },
685
+ [GGML_TYPE_IQ3_S] = {
686
+ .type_name = "iq3_s",
687
+ .blck_size = QK_K,
688
+ .type_size = sizeof(block_iq3_s),
689
+ .is_quantized = true,
690
+ .to_float = (ggml_to_float_t) dequantize_row_iq3_s,
691
+ .from_float = quantize_row_iq3_s,
692
+ .from_float_reference = (ggml_from_float_t)quantize_row_iq3_s_reference,
693
+ .vec_dot = ggml_vec_dot_iq3_s_q8_K,
694
+ .vec_dot_type = GGML_TYPE_Q8_K,
695
+ .nrows = 1,
696
+ },
697
+ [GGML_TYPE_IQ2_S] = {
698
+ .type_name = "iq2_s",
699
+ .blck_size = QK_K,
700
+ .type_size = sizeof(block_iq2_s),
701
+ .is_quantized = true,
702
+ .to_float = (ggml_to_float_t) dequantize_row_iq2_s,
703
+ .from_float = quantize_row_iq2_s,
704
+ .from_float_reference = (ggml_from_float_t)quantize_row_iq2_s_reference,
705
+ .vec_dot = ggml_vec_dot_iq2_s_q8_K,
706
+ .vec_dot_type = GGML_TYPE_Q8_K,
707
+ .nrows = 1,
708
+ },
709
+ [GGML_TYPE_IQ1_S] = {
710
+ .type_name = "iq1_s",
711
+ .blck_size = QK_K,
712
+ .type_size = sizeof(block_iq1_s),
713
+ .is_quantized = true,
714
+ .to_float = (ggml_to_float_t) dequantize_row_iq1_s,
715
+ .from_float = NULL,
716
+ .from_float_reference = NULL,
717
+ .vec_dot = ggml_vec_dot_iq1_s_q8_K,
718
+ .vec_dot_type = GGML_TYPE_Q8_K,
719
+ .nrows = 1,
720
+ },
721
+ [GGML_TYPE_IQ4_NL] = {
722
+ .type_name = "iq4_nl",
723
+ .blck_size = QK4_NL,
724
+ .type_size = sizeof(block_iq4_nl),
725
+ .is_quantized = true,
726
+ .to_float = (ggml_to_float_t) dequantize_row_iq4_nl,
727
+ .from_float = quantize_row_iq4_nl,
728
+ .from_float_reference = (ggml_from_float_t)quantize_row_iq4_nl_reference,
729
+ .vec_dot = ggml_vec_dot_iq4_nl_q8_0,
730
+ .vec_dot_type = GGML_TYPE_Q8_0,
731
+ .nrows = 1,
732
+ },
733
+ [GGML_TYPE_IQ4_XS] = {
734
+ .type_name = "iq4_xs",
735
+ #if QK_K == 64
736
+ .blck_size = QK4_NL,
737
+ #else
738
+ .blck_size = QK_K,
739
+ #endif
740
+ .type_size = sizeof(block_iq4_xs),
741
+ .is_quantized = true,
742
+ .to_float = (ggml_to_float_t) dequantize_row_iq4_xs,
743
+ .from_float = quantize_row_iq4_xs,
744
+ .from_float_reference = (ggml_from_float_t)quantize_row_iq4_xs_reference,
745
+ .vec_dot = ggml_vec_dot_iq4_xs_q8_K,
746
+ #if QK_K == 64
747
+ .vec_dot_type = GGML_TYPE_Q8_0,
748
+ #else
749
+ .vec_dot_type = GGML_TYPE_Q8_K,
750
+ #endif
751
+ .nrows = 1,
752
+ },
676
753
  [GGML_TYPE_Q8_K] = {
677
754
  .type_name = "q8_K",
678
755
  .blck_size = QK_K,
@@ -769,7 +846,7 @@ inline static float vaddvq_f32(float32x4_t v) {
769
846
  #define GGML_F16x8 float16x8_t
770
847
  #define GGML_F16x8_ZERO vdupq_n_f16(0.0f)
771
848
  #define GGML_F16x8_SET1(x) vdupq_n_f16(x)
772
- #define GGML_F16x8_LOAD vld1q_f16
849
+ #define GGML_F16x8_LOAD(x) vld1q_f16((const __fp16 *)(x))
773
850
  #define GGML_F16x8_STORE vst1q_f16
774
851
  #define GGML_F16x8_FMA(a, b, c) vfmaq_f16(a, b, c)
775
852
  #define GGML_F16x8_ADD vaddq_f16
@@ -812,7 +889,7 @@ inline static float vaddvq_f32(float32x4_t v) {
812
889
  #define GGML_F32Cx4 float32x4_t
813
890
  #define GGML_F32Cx4_ZERO vdupq_n_f32(0.0f)
814
891
  #define GGML_F32Cx4_SET1(x) vdupq_n_f32(x)
815
- #define GGML_F32Cx4_LOAD(x) vcvt_f32_f16(vld1_f16(x))
892
+ #define GGML_F32Cx4_LOAD(x) vcvt_f32_f16(vld1_f16((const __fp16 *)(x)))
816
893
  #define GGML_F32Cx4_STORE(x, y) vst1_f16(x, vcvt_f16_f32(y))
817
894
  #define GGML_F32Cx4_FMA(a, b, c) vfmaq_f32(a, b, c)
818
895
  #define GGML_F32Cx4_ADD vaddq_f32
@@ -868,7 +945,7 @@ do { \
868
945
  const __m128 t0 = _mm_add_ps(_mm256_castps256_ps128(x[0]), \
869
946
  _mm256_extractf128_ps(x[0], 1)); \
870
947
  const __m128 t1 = _mm_hadd_ps(t0, t0); \
871
- res = _mm_cvtss_f32(_mm_hadd_ps(t1, t1)); \
948
+ res = (ggml_float) _mm_cvtss_f32(_mm_hadd_ps(t1, t1)); \
872
949
  } while (0)
873
950
  // TODO: is this optimal ?
874
951
 
@@ -1149,7 +1226,7 @@ inline static void __wasm_f16x4_store(ggml_fp16_t * p, v128_t x) {
1149
1226
  x[i] = _mm_add_ps(x[i], x[offset+i]); \
1150
1227
  } \
1151
1228
  const __m128 t0 = _mm_hadd_ps(x[0], x[0]); \
1152
- res = _mm_cvtss_f32(_mm_hadd_ps(t0, t0)); \
1229
+ res = (ggml_float) _mm_cvtss_f32(_mm_hadd_ps(t0, t0)); \
1153
1230
  }
1154
1231
  // TODO: is this optimal ?
1155
1232
 
@@ -1531,9 +1608,15 @@ inline static void ggml_vec_gelu_f16(const int n, ggml_fp16_t * y, const ggml_fp
1531
1608
  inline static void ggml_vec_gelu_f32(const int n, float * y, const float * x) {
1532
1609
  uint16_t t;
1533
1610
  for (int i = 0; i < n; ++i) {
1534
- ggml_fp16_t fp16 = GGML_FP32_TO_FP16(x[i]);
1535
- memcpy(&t, &fp16, sizeof(uint16_t));
1536
- y[i] = GGML_FP16_TO_FP32(ggml_table_gelu_f16[t]);
1611
+ if (x[i] <= -10.0f) {
1612
+ y[i] = 0.0f;
1613
+ } else if (x[i] >= 10.0f) {
1614
+ y[i] = x[i];
1615
+ } else {
1616
+ ggml_fp16_t fp16 = GGML_FP32_TO_FP16(x[i]);
1617
+ memcpy(&t, &fp16, sizeof(uint16_t));
1618
+ y[i] = GGML_FP16_TO_FP32(ggml_table_gelu_f16[t]);
1619
+ }
1537
1620
  }
1538
1621
  }
1539
1622
  #else
@@ -1954,9 +2037,16 @@ struct ggml_numa_node {
1954
2037
  };
1955
2038
 
1956
2039
  struct ggml_numa_nodes {
2040
+ enum ggml_numa_strategy numa_strategy;
1957
2041
  struct ggml_numa_node nodes[GGML_NUMA_MAX_NODES];
1958
2042
  uint32_t n_nodes;
1959
2043
  uint32_t total_cpus; // hardware threads on system
2044
+ uint32_t current_node; // node on which main process is executing
2045
+ #if defined(__gnu_linux__)
2046
+ cpu_set_t cpuset; // cpuset from numactl
2047
+ #else
2048
+ uint32_t cpuset; // no NUMA support outside of Linux at this time. Use a portable datatype
2049
+ #endif
1960
2050
  };
1961
2051
 
1962
2052
  //
@@ -1990,18 +2080,40 @@ inline static void ggml_critical_section_end(void) {
1990
2080
  atomic_fetch_sub(&g_state_barrier, 1);
1991
2081
  }
1992
2082
 
1993
- void ggml_numa_init(void) {
2083
+ #if defined(__gnu_linux__)
2084
+ static cpu_set_t ggml_get_numa_affinity(void) {
2085
+ cpu_set_t cpuset;
2086
+ pthread_t thread;
2087
+ thread = pthread_self();
2088
+ CPU_ZERO(&cpuset);
2089
+ pthread_getaffinity_np(thread, sizeof(cpu_set_t), &cpuset);
2090
+ return cpuset;
2091
+ }
2092
+ #else
2093
+ static uint32_t ggml_get_numa_affinity(void) {
2094
+ return 0; // no NUMA support
2095
+ }
2096
+ #endif
2097
+
2098
+ void ggml_numa_init(enum ggml_numa_strategy numa_flag) {
1994
2099
  if (g_state.numa.n_nodes > 0) {
1995
2100
  fprintf(stderr, "ggml_numa_init: NUMA already initialized\n");
1996
2101
 
1997
2102
  return;
1998
2103
  }
1999
2104
 
2000
- #ifdef __linux__
2105
+ #if defined(__gnu_linux__)
2001
2106
  struct stat st;
2002
2107
  char path[256];
2003
2108
  int rv;
2004
2109
 
2110
+ // set numa scheme
2111
+ g_state.numa.numa_strategy = numa_flag;
2112
+
2113
+ GGML_PRINT_DEBUG("numa strategy %u\n",g_state.numa.numa_strategy);
2114
+
2115
+ g_state.numa.cpuset = ggml_get_numa_affinity();
2116
+
2005
2117
  // enumerate nodes
2006
2118
  while (g_state.numa.n_nodes < GGML_NUMA_MAX_NODES) {
2007
2119
  rv = snprintf(path, sizeof(path), "/sys/devices/system/node/node%u", g_state.numa.n_nodes);
@@ -2020,11 +2132,23 @@ void ggml_numa_init(void) {
2020
2132
 
2021
2133
  GGML_PRINT_DEBUG("found %u numa nodes, %u CPUs\n", g_state.numa.n_nodes, g_state.numa.total_cpus);
2022
2134
 
2023
- if (g_state.numa.n_nodes < 1 || g_state.numa.total_cpus < 1) {
2135
+ // figure out which node we're on
2136
+ uint current_cpu;
2137
+ int getcpu_ret = 0;
2138
+ #if __GLIBC__ > 2 || (__GLIBC__ == 2 && __GLIBC_MINOR__ > 28)
2139
+ getcpu_ret = getcpu(&current_cpu, &g_state.numa.current_node);
2140
+ #else
2141
+ // old glibc doesn't have a wrapper for this call. Fall back on direct syscall
2142
+ getcpu_ret = syscall(SYS_getcpu,&current_cpu,&g_state.numa.current_node);
2143
+ #endif
2144
+
2145
+ if (g_state.numa.n_nodes < 1 || g_state.numa.total_cpus < 1 || getcpu_ret != 0) {
2024
2146
  g_state.numa.n_nodes = 0;
2025
2147
  return;
2026
2148
  }
2027
2149
 
2150
+ GGML_PRINT_DEBUG("found our process on numa node %u, CPU %u\n", g_state.numa.current_node, current_cpu);
2151
+
2028
2152
  for (uint32_t n = 0; n < g_state.numa.n_nodes; ++n) {
2029
2153
  struct ggml_numa_node * node = &g_state.numa.nodes[n];
2030
2154
  GGML_PRINT_DEBUG("CPUs on node %u:", n);
@@ -2051,6 +2175,7 @@ void ggml_numa_init(void) {
2051
2175
  }
2052
2176
  }
2053
2177
  #else
2178
+ GGML_UNUSED(numa_flag);
2054
2179
  // TODO
2055
2180
  #endif
2056
2181
  }
@@ -2231,6 +2356,11 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) {
2231
2356
  case GGML_FTYPE_MOSTLY_IQ2_XXS: wtype = GGML_TYPE_IQ2_XXS; break;
2232
2357
  case GGML_FTYPE_MOSTLY_IQ2_XS: wtype = GGML_TYPE_IQ2_XS; break;
2233
2358
  case GGML_FTYPE_MOSTLY_IQ3_XXS: wtype = GGML_TYPE_IQ3_XXS; break;
2359
+ case GGML_FTYPE_MOSTLY_IQ1_S: wtype = GGML_TYPE_IQ1_S; break;
2360
+ case GGML_FTYPE_MOSTLY_IQ4_NL: wtype = GGML_TYPE_IQ4_NL; break;
2361
+ case GGML_FTYPE_MOSTLY_IQ4_XS: wtype = GGML_TYPE_IQ4_XS; break;
2362
+ case GGML_FTYPE_MOSTLY_IQ3_S: wtype = GGML_TYPE_IQ3_S; break;
2363
+ case GGML_FTYPE_MOSTLY_IQ2_S: wtype = GGML_TYPE_IQ2_S; break;
2234
2364
  case GGML_FTYPE_UNKNOWN: wtype = GGML_TYPE_COUNT; break;
2235
2365
  case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16: wtype = GGML_TYPE_COUNT; break;
2236
2366
  }
@@ -2635,7 +2765,7 @@ static struct ggml_tensor * ggml_new_tensor_impl(
2635
2765
  }
2636
2766
  }
2637
2767
 
2638
- struct ggml_object * const obj_new = ggml_new_object(ctx, GGML_OBJECT_TENSOR, GGML_TENSOR_SIZE + obj_alloc_size);
2768
+ struct ggml_object * const obj_new = ggml_new_object(ctx, GGML_OBJECT_TYPE_TENSOR, GGML_TENSOR_SIZE + obj_alloc_size);
2639
2769
 
2640
2770
  // TODO: for recoverable errors, we would need to free the data allocated from the scratch buffer here
2641
2771
 
@@ -2643,7 +2773,7 @@ static struct ggml_tensor * ggml_new_tensor_impl(
2643
2773
 
2644
2774
  *result = (struct ggml_tensor) {
2645
2775
  /*.type =*/ type,
2646
- /*.backend =*/ GGML_BACKEND_CPU,
2776
+ /*.backend =*/ GGML_BACKEND_TYPE_CPU,
2647
2777
  /*.buffer =*/ NULL,
2648
2778
  /*.ne =*/ { 1, 1, 1, 1 },
2649
2779
  /*.nb =*/ { 0, 0, 0, 0 },
@@ -3184,7 +3314,7 @@ const char * ggml_get_name(const struct ggml_tensor * tensor) {
3184
3314
  }
3185
3315
 
3186
3316
  struct ggml_tensor * ggml_set_name(struct ggml_tensor * tensor, const char * name) {
3187
- strncpy(tensor->name, name, sizeof(tensor->name));
3317
+ strncpy(tensor->name, name, sizeof(tensor->name) - 1);
3188
3318
  tensor->name[sizeof(tensor->name) - 1] = '\0';
3189
3319
  return tensor;
3190
3320
  }
@@ -3216,7 +3346,7 @@ struct ggml_tensor * ggml_get_first_tensor(const struct ggml_context * ctx) {
3216
3346
  char * const mem_buffer = ctx->mem_buffer;
3217
3347
 
3218
3348
  while (obj != NULL) {
3219
- if (obj->type == GGML_OBJECT_TENSOR) {
3349
+ if (obj->type == GGML_OBJECT_TYPE_TENSOR) {
3220
3350
  return (struct ggml_tensor *)(mem_buffer + obj->offs);
3221
3351
  }
3222
3352
 
@@ -3233,7 +3363,7 @@ struct ggml_tensor * ggml_get_next_tensor(const struct ggml_context * ctx, struc
3233
3363
  char * const mem_buffer = ctx->mem_buffer;
3234
3364
 
3235
3365
  while (obj != NULL) {
3236
- if (obj->type == GGML_OBJECT_TENSOR) {
3366
+ if (obj->type == GGML_OBJECT_TYPE_TENSOR) {
3237
3367
  return (struct ggml_tensor *)(mem_buffer + obj->offs);
3238
3368
  }
3239
3369
 
@@ -3249,7 +3379,7 @@ struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * nam
3249
3379
  char * const mem_buffer = ctx->mem_buffer;
3250
3380
 
3251
3381
  while (obj != NULL) {
3252
- if (obj->type == GGML_OBJECT_TENSOR) {
3382
+ if (obj->type == GGML_OBJECT_TYPE_TENSOR) {
3253
3383
  struct ggml_tensor * cur = (struct ggml_tensor *)(mem_buffer + obj->offs);
3254
3384
  if (strcmp(cur->name, name) == 0) {
3255
3385
  return cur;
@@ -5060,16 +5190,28 @@ static struct ggml_tensor * ggml_soft_max_impl(
5060
5190
  struct ggml_context * ctx,
5061
5191
  struct ggml_tensor * a,
5062
5192
  struct ggml_tensor * mask,
5193
+ struct ggml_tensor * pos,
5063
5194
  float scale,
5195
+ float max_bias,
5064
5196
  bool inplace) {
5065
5197
  GGML_ASSERT(ggml_is_contiguous(a));
5198
+
5066
5199
  if (mask) {
5067
5200
  GGML_ASSERT(ggml_is_contiguous(mask));
5068
- GGML_ASSERT(mask->ne[2] == 1);
5069
- GGML_ASSERT(mask->ne[3] == 1);
5201
+ GGML_ASSERT(ggml_is_matrix(mask));
5070
5202
  GGML_ASSERT(ggml_can_repeat_rows(mask, a));
5071
5203
  }
5072
5204
 
5205
+ if (pos) {
5206
+ GGML_ASSERT(ggml_is_vector(pos));
5207
+ GGML_ASSERT(pos->type == GGML_TYPE_F32);
5208
+ GGML_ASSERT(pos->ne[0] == a->ne[0]);
5209
+ }
5210
+
5211
+ if (max_bias > 0.0f) {
5212
+ GGML_ASSERT(pos);
5213
+ }
5214
+
5073
5215
  bool is_node = false;
5074
5216
 
5075
5217
  if (a->grad) {
@@ -5078,13 +5220,14 @@ static struct ggml_tensor * ggml_soft_max_impl(
5078
5220
 
5079
5221
  struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
5080
5222
 
5081
- float params[] = { scale };
5223
+ float params[] = { scale, max_bias };
5082
5224
  ggml_set_op_params(result, params, sizeof(params));
5083
5225
 
5084
5226
  result->op = GGML_OP_SOFT_MAX;
5085
5227
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5086
5228
  result->src[0] = a;
5087
5229
  result->src[1] = mask;
5230
+ result->src[2] = pos;
5088
5231
 
5089
5232
  return result;
5090
5233
  }
@@ -5092,21 +5235,23 @@ static struct ggml_tensor * ggml_soft_max_impl(
5092
5235
  struct ggml_tensor * ggml_soft_max(
5093
5236
  struct ggml_context * ctx,
5094
5237
  struct ggml_tensor * a) {
5095
- return ggml_soft_max_impl(ctx, a, NULL, 1.0f, false);
5238
+ return ggml_soft_max_impl(ctx, a, NULL, NULL, 1.0f, 0.0f, false);
5096
5239
  }
5097
5240
 
5098
5241
  struct ggml_tensor * ggml_soft_max_inplace(
5099
5242
  struct ggml_context * ctx,
5100
5243
  struct ggml_tensor * a) {
5101
- return ggml_soft_max_impl(ctx, a, NULL, 1.0f, true);
5244
+ return ggml_soft_max_impl(ctx, a, NULL, NULL, 1.0f, 0.0f, true);
5102
5245
  }
5103
5246
 
5104
5247
  struct ggml_tensor * ggml_soft_max_ext(
5105
5248
  struct ggml_context * ctx,
5106
5249
  struct ggml_tensor * a,
5107
5250
  struct ggml_tensor * mask,
5108
- float scale) {
5109
- return ggml_soft_max_impl(ctx, a, mask, scale, false);
5251
+ struct ggml_tensor * pos,
5252
+ float scale,
5253
+ float max_bias) {
5254
+ return ggml_soft_max_impl(ctx, a, mask, pos, scale, max_bias, false);
5110
5255
  }
5111
5256
 
5112
5257
  // ggml_soft_max_back
@@ -5556,7 +5701,9 @@ struct ggml_tensor * ggml_conv_2d(
5556
5701
  ggml_reshape_2d(ctx, im2col, im2col->ne[0], im2col->ne[3] * im2col->ne[2] * im2col->ne[1]), // [N, OH, OW, IC * KH * KW] => [N*OH*OW, IC * KH * KW]
5557
5702
  ggml_reshape_2d(ctx, a, (a->ne[0] * a->ne[1] * a->ne[2]), a->ne[3])); // [OC,IC, KH, KW] => [OC, IC * KH * KW]
5558
5703
 
5559
- result = ggml_reshape_4d(ctx, result, im2col->ne[1], im2col->ne[2], a->ne[3], im2col->ne[3]); // [N, OC, OH, OW]
5704
+ result = ggml_reshape_4d(ctx, result, im2col->ne[1], im2col->ne[2], im2col->ne[3], a->ne[3]); // [OC, N, OH, OW]
5705
+ result = ggml_cont(ctx, ggml_permute(ctx, result, 0, 1, 3, 2)); // [N, OC, OH, OW]
5706
+
5560
5707
 
5561
5708
  return result;
5562
5709
  }
@@ -5639,11 +5786,13 @@ struct ggml_tensor * ggml_pool_1d(
5639
5786
  is_node = true;
5640
5787
  }
5641
5788
 
5642
- const int64_t ne[2] = {
5789
+ const int64_t ne[4] = {
5643
5790
  ggml_calc_pool_output_size(a->ne[0], k0, s0, p0),
5644
5791
  a->ne[1],
5792
+ a->ne[2],
5793
+ a->ne[3],
5645
5794
  };
5646
- struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne);
5795
+ struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
5647
5796
 
5648
5797
  int32_t params[] = { op, k0, s0, p0 };
5649
5798
  ggml_set_op_params(result, params, sizeof(params));
@@ -5776,7 +5925,7 @@ struct ggml_tensor * ggml_top_k(
5776
5925
  int k) {
5777
5926
  GGML_ASSERT(a->ne[0] >= k);
5778
5927
 
5779
- struct ggml_tensor * result = ggml_argsort(ctx, a, GGML_SORT_DESC);
5928
+ struct ggml_tensor * result = ggml_argsort(ctx, a, GGML_SORT_ORDER_DESC);
5780
5929
 
5781
5930
  result = ggml_view_4d(ctx, result,
5782
5931
  k, result->ne[1], result->ne[2], result->ne[3],
@@ -6562,13 +6711,15 @@ void ggml_set_param(
6562
6711
 
6563
6712
  static void ggml_compute_forward_dup_same_cont(
6564
6713
  const struct ggml_compute_params * params,
6565
- const struct ggml_tensor * src0,
6566
6714
  struct ggml_tensor * dst) {
6715
+
6716
+ const struct ggml_tensor * src0 = dst->src[0];
6717
+
6567
6718
  GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0));
6568
6719
  GGML_ASSERT(ggml_is_contiguous(dst) && ggml_is_contiguous(src0));
6569
6720
  GGML_ASSERT(src0->type == dst->type);
6570
6721
 
6571
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
6722
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
6572
6723
  return;
6573
6724
  }
6574
6725
 
@@ -6594,11 +6745,13 @@ static void ggml_compute_forward_dup_same_cont(
6594
6745
  }
6595
6746
  static void ggml_compute_forward_dup_f16(
6596
6747
  const struct ggml_compute_params * params,
6597
- const struct ggml_tensor * src0,
6598
6748
  struct ggml_tensor * dst) {
6749
+
6750
+ const struct ggml_tensor * src0 = dst->src[0];
6751
+
6599
6752
  GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0));
6600
6753
 
6601
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
6754
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
6602
6755
  return;
6603
6756
  }
6604
6757
 
@@ -6608,7 +6761,7 @@ static void ggml_compute_forward_dup_f16(
6608
6761
  const int nth = params->nth; // number of threads
6609
6762
 
6610
6763
  if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst) && src0->type == dst->type) {
6611
- ggml_compute_forward_dup_same_cont(params, src0, dst);
6764
+ ggml_compute_forward_dup_same_cont(params, dst);
6612
6765
  return;
6613
6766
  }
6614
6767
 
@@ -6865,11 +7018,13 @@ static void ggml_compute_forward_dup_f16(
6865
7018
 
6866
7019
  static void ggml_compute_forward_dup_f32(
6867
7020
  const struct ggml_compute_params * params,
6868
- const struct ggml_tensor * src0,
6869
7021
  struct ggml_tensor * dst) {
7022
+
7023
+ const struct ggml_tensor * src0 = dst->src[0];
7024
+
6870
7025
  GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0));
6871
7026
 
6872
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
7027
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
6873
7028
  return;
6874
7029
  }
6875
7030
 
@@ -6879,7 +7034,7 @@ static void ggml_compute_forward_dup_f32(
6879
7034
  const int nth = params->nth; // number of threads
6880
7035
 
6881
7036
  if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst) && src0->type == dst->type) {
6882
- ggml_compute_forward_dup_same_cont(params, src0, dst);
7037
+ ggml_compute_forward_dup_same_cont(params, dst);
6883
7038
  return;
6884
7039
  }
6885
7040
 
@@ -7115,17 +7270,19 @@ static void ggml_compute_forward_dup_f32(
7115
7270
  // A simplified version of ggml_compute_forward_dup that doesn't do float upcasting, and just plain old memcpy.
7116
7271
  static void ggml_compute_forward_dup_bytes(
7117
7272
  const struct ggml_compute_params * params,
7118
- const struct ggml_tensor * src0,
7119
7273
  struct ggml_tensor * dst) {
7274
+
7275
+ const struct ggml_tensor * src0 = dst->src[0];
7276
+
7120
7277
  GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0));
7121
7278
  GGML_ASSERT(src0->type == dst->type);
7122
7279
 
7123
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
7280
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
7124
7281
  return;
7125
7282
  }
7126
7283
 
7127
7284
  if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst)) {
7128
- ggml_compute_forward_dup_same_cont(params, src0, dst);
7285
+ ggml_compute_forward_dup_same_cont(params, dst);
7129
7286
  return;
7130
7287
  }
7131
7288
 
@@ -7264,21 +7421,23 @@ static void ggml_compute_forward_dup_bytes(
7264
7421
 
7265
7422
  static void ggml_compute_forward_dup(
7266
7423
  const struct ggml_compute_params * params,
7267
- const struct ggml_tensor * src0,
7268
7424
  struct ggml_tensor * dst) {
7425
+
7426
+ const struct ggml_tensor * src0 = dst->src[0];
7427
+
7269
7428
  if (src0->type == dst->type) {
7270
- ggml_compute_forward_dup_bytes(params, src0, dst);
7429
+ ggml_compute_forward_dup_bytes(params, dst);
7271
7430
  return;
7272
7431
  }
7273
7432
 
7274
7433
  switch (src0->type) {
7275
7434
  case GGML_TYPE_F16:
7276
7435
  {
7277
- ggml_compute_forward_dup_f16(params, src0, dst);
7436
+ ggml_compute_forward_dup_f16(params, dst);
7278
7437
  } break;
7279
7438
  case GGML_TYPE_F32:
7280
7439
  {
7281
- ggml_compute_forward_dup_f32(params, src0, dst);
7440
+ ggml_compute_forward_dup_f32(params, dst);
7282
7441
  } break;
7283
7442
  default:
7284
7443
  {
@@ -7291,12 +7450,14 @@ static void ggml_compute_forward_dup(
7291
7450
 
7292
7451
  static void ggml_compute_forward_add_f32(
7293
7452
  const struct ggml_compute_params * params,
7294
- const struct ggml_tensor * src0,
7295
- const struct ggml_tensor * src1,
7296
7453
  struct ggml_tensor * dst) {
7454
+
7455
+ const struct ggml_tensor * src0 = dst->src[0];
7456
+ const struct ggml_tensor * src1 = dst->src[1];
7457
+
7297
7458
  GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst));
7298
7459
 
7299
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
7460
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
7300
7461
  return;
7301
7462
  }
7302
7463
 
@@ -7304,7 +7465,7 @@ static void ggml_compute_forward_add_f32(
7304
7465
  const int nth = params->nth;
7305
7466
 
7306
7467
  #ifdef GGML_USE_CLBLAST
7307
- if (src1->backend == GGML_BACKEND_GPU) {
7468
+ if (src1->backend == GGML_BACKEND_TYPE_GPU) {
7308
7469
  // TODO: OpenCL kernel support full broadcast
7309
7470
  GGML_ASSERT(ggml_can_repeat_rows(src1, src0));
7310
7471
  if (ith == 0) {
@@ -7379,12 +7540,14 @@ static void ggml_compute_forward_add_f32(
7379
7540
 
7380
7541
  static void ggml_compute_forward_add_f16_f32(
7381
7542
  const struct ggml_compute_params * params,
7382
- const struct ggml_tensor * src0,
7383
- const struct ggml_tensor * src1,
7384
7543
  struct ggml_tensor * dst) {
7544
+
7545
+ const struct ggml_tensor * src0 = dst->src[0];
7546
+ const struct ggml_tensor * src1 = dst->src[1];
7547
+
7385
7548
  GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));
7386
7549
 
7387
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
7550
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
7388
7551
  return;
7389
7552
  }
7390
7553
 
@@ -7456,12 +7619,14 @@ static void ggml_compute_forward_add_f16_f32(
7456
7619
 
7457
7620
  static void ggml_compute_forward_add_f16_f16(
7458
7621
  const struct ggml_compute_params * params,
7459
- const struct ggml_tensor * src0,
7460
- const struct ggml_tensor * src1,
7461
7622
  struct ggml_tensor * dst) {
7623
+
7624
+ const struct ggml_tensor * src0 = dst->src[0];
7625
+ const struct ggml_tensor * src1 = dst->src[1];
7626
+
7462
7627
  GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));
7463
7628
 
7464
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
7629
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
7465
7630
  return;
7466
7631
  }
7467
7632
 
@@ -7510,12 +7675,14 @@ static void ggml_compute_forward_add_f16_f16(
7510
7675
 
7511
7676
  static void ggml_compute_forward_add_q_f32(
7512
7677
  const struct ggml_compute_params * params,
7513
- const struct ggml_tensor * src0,
7514
- const struct ggml_tensor * src1,
7515
7678
  struct ggml_tensor * dst) {
7679
+
7680
+ const struct ggml_tensor * src0 = dst->src[0];
7681
+ const struct ggml_tensor * src1 = dst->src[1];
7682
+
7516
7683
  GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));
7517
7684
 
7518
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
7685
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
7519
7686
  return;
7520
7687
  }
7521
7688
 
@@ -7588,14 +7755,16 @@ static void ggml_compute_forward_add_q_f32(
7588
7755
 
7589
7756
  static void ggml_compute_forward_add(
7590
7757
  const struct ggml_compute_params * params,
7591
- const struct ggml_tensor * src0,
7592
- const struct ggml_tensor * src1,
7593
7758
  struct ggml_tensor * dst) {
7759
+
7760
+ const struct ggml_tensor * src0 = dst->src[0];
7761
+ const struct ggml_tensor * src1 = dst->src[1];
7762
+
7594
7763
  switch (src0->type) {
7595
7764
  case GGML_TYPE_F32:
7596
7765
  {
7597
7766
  if (src1->type == GGML_TYPE_F32) {
7598
- ggml_compute_forward_add_f32(params, src0, src1, dst);
7767
+ ggml_compute_forward_add_f32(params, dst);
7599
7768
  }
7600
7769
  else {
7601
7770
  GGML_ASSERT(false);
@@ -7604,10 +7773,10 @@ static void ggml_compute_forward_add(
7604
7773
  case GGML_TYPE_F16:
7605
7774
  {
7606
7775
  if (src1->type == GGML_TYPE_F16) {
7607
- ggml_compute_forward_add_f16_f16(params, src0, src1, dst);
7776
+ ggml_compute_forward_add_f16_f16(params, dst);
7608
7777
  }
7609
7778
  else if (src1->type == GGML_TYPE_F32) {
7610
- ggml_compute_forward_add_f16_f32(params, src0, src1, dst);
7779
+ ggml_compute_forward_add_f16_f32(params, dst);
7611
7780
  }
7612
7781
  else {
7613
7782
  GGML_ASSERT(false);
@@ -7626,8 +7795,13 @@ static void ggml_compute_forward_add(
7626
7795
  case GGML_TYPE_IQ2_XXS:
7627
7796
  case GGML_TYPE_IQ2_XS:
7628
7797
  case GGML_TYPE_IQ3_XXS:
7798
+ case GGML_TYPE_IQ1_S:
7799
+ case GGML_TYPE_IQ4_NL:
7800
+ case GGML_TYPE_IQ4_XS:
7801
+ case GGML_TYPE_IQ3_S:
7802
+ case GGML_TYPE_IQ2_S:
7629
7803
  {
7630
- ggml_compute_forward_add_q_f32(params, src0, src1, dst);
7804
+ ggml_compute_forward_add_q_f32(params, dst);
7631
7805
  } break;
7632
7806
  default:
7633
7807
  {
@@ -7640,13 +7814,15 @@ static void ggml_compute_forward_add(
7640
7814
 
7641
7815
  static void ggml_compute_forward_add1_f32(
7642
7816
  const struct ggml_compute_params * params,
7643
- const struct ggml_tensor * src0,
7644
- const struct ggml_tensor * src1,
7645
7817
  struct ggml_tensor * dst) {
7818
+
7819
+ const struct ggml_tensor * src0 = dst->src[0];
7820
+ const struct ggml_tensor * src1 = dst->src[1];
7821
+
7646
7822
  GGML_ASSERT(ggml_are_same_shape(src0, dst));
7647
7823
  GGML_ASSERT(ggml_is_scalar(src1));
7648
7824
 
7649
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
7825
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
7650
7826
  return;
7651
7827
  }
7652
7828
 
@@ -7692,13 +7868,15 @@ static void ggml_compute_forward_add1_f32(
7692
7868
 
7693
7869
  static void ggml_compute_forward_add1_f16_f32(
7694
7870
  const struct ggml_compute_params * params,
7695
- const struct ggml_tensor * src0,
7696
- const struct ggml_tensor * src1,
7697
7871
  struct ggml_tensor * dst) {
7872
+
7873
+ const struct ggml_tensor * src0 = dst->src[0];
7874
+ const struct ggml_tensor * src1 = dst->src[1];
7875
+
7698
7876
  GGML_ASSERT(ggml_are_same_shape(src0, dst));
7699
7877
  GGML_ASSERT(ggml_is_scalar(src1));
7700
7878
 
7701
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
7879
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
7702
7880
  return;
7703
7881
  }
7704
7882
 
@@ -7742,13 +7920,15 @@ static void ggml_compute_forward_add1_f16_f32(
7742
7920
 
7743
7921
  static void ggml_compute_forward_add1_f16_f16(
7744
7922
  const struct ggml_compute_params * params,
7745
- const struct ggml_tensor * src0,
7746
- const struct ggml_tensor * src1,
7747
7923
  struct ggml_tensor * dst) {
7924
+
7925
+ const struct ggml_tensor * src0 = dst->src[0];
7926
+ const struct ggml_tensor * src1 = dst->src[1];
7927
+
7748
7928
  GGML_ASSERT(ggml_are_same_shape(src0, dst));
7749
7929
  GGML_ASSERT(ggml_is_scalar(src1));
7750
7930
 
7751
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
7931
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
7752
7932
  return;
7753
7933
  }
7754
7934
 
@@ -7792,13 +7972,15 @@ static void ggml_compute_forward_add1_f16_f16(
7792
7972
 
7793
7973
  static void ggml_compute_forward_add1_q_f32(
7794
7974
  const struct ggml_compute_params * params,
7795
- const struct ggml_tensor * src0,
7796
- const struct ggml_tensor * src1,
7797
7975
  struct ggml_tensor * dst) {
7976
+
7977
+ const struct ggml_tensor * src0 = dst->src[0];
7978
+ const struct ggml_tensor * src1 = dst->src[1];
7979
+
7798
7980
  GGML_ASSERT(ggml_are_same_shape(src0, dst));
7799
7981
  GGML_ASSERT(ggml_is_scalar(src1));
7800
7982
 
7801
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
7983
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
7802
7984
  return;
7803
7985
  }
7804
7986
 
@@ -7859,21 +8041,23 @@ static void ggml_compute_forward_add1_q_f32(
7859
8041
 
7860
8042
  static void ggml_compute_forward_add1(
7861
8043
  const struct ggml_compute_params * params,
7862
- const struct ggml_tensor * src0,
7863
- const struct ggml_tensor * src1,
7864
8044
  struct ggml_tensor * dst) {
8045
+
8046
+ const struct ggml_tensor * src0 = dst->src[0];
8047
+ const struct ggml_tensor * src1 = dst->src[1];
8048
+
7865
8049
  switch (src0->type) {
7866
8050
  case GGML_TYPE_F32:
7867
8051
  {
7868
- ggml_compute_forward_add1_f32(params, src0, src1, dst);
8052
+ ggml_compute_forward_add1_f32(params, dst);
7869
8053
  } break;
7870
8054
  case GGML_TYPE_F16:
7871
8055
  {
7872
8056
  if (src1->type == GGML_TYPE_F16) {
7873
- ggml_compute_forward_add1_f16_f16(params, src0, src1, dst);
8057
+ ggml_compute_forward_add1_f16_f16(params, dst);
7874
8058
  }
7875
8059
  else if (src1->type == GGML_TYPE_F32) {
7876
- ggml_compute_forward_add1_f16_f32(params, src0, src1, dst);
8060
+ ggml_compute_forward_add1_f16_f32(params, dst);
7877
8061
  }
7878
8062
  else {
7879
8063
  GGML_ASSERT(false);
@@ -7893,8 +8077,13 @@ static void ggml_compute_forward_add1(
7893
8077
  case GGML_TYPE_IQ2_XXS:
7894
8078
  case GGML_TYPE_IQ2_XS:
7895
8079
  case GGML_TYPE_IQ3_XXS:
8080
+ case GGML_TYPE_IQ1_S:
8081
+ case GGML_TYPE_IQ4_NL:
8082
+ case GGML_TYPE_IQ4_XS:
8083
+ case GGML_TYPE_IQ3_S:
8084
+ case GGML_TYPE_IQ2_S:
7896
8085
  {
7897
- ggml_compute_forward_add1_q_f32(params, src0, src1, dst);
8086
+ ggml_compute_forward_add1_q_f32(params, dst);
7898
8087
  } break;
7899
8088
  default:
7900
8089
  {
@@ -7907,9 +8096,11 @@ static void ggml_compute_forward_add1(
7907
8096
 
7908
8097
  static void ggml_compute_forward_acc_f32(
7909
8098
  const struct ggml_compute_params * params,
7910
- const struct ggml_tensor * src0,
7911
- const struct ggml_tensor * src1,
7912
8099
  struct ggml_tensor * dst) {
8100
+
8101
+ const struct ggml_tensor * src0 = dst->src[0];
8102
+ const struct ggml_tensor * src1 = dst->src[1];
8103
+
7913
8104
  GGML_ASSERT(ggml_are_same_shape(src0, dst));
7914
8105
  GGML_ASSERT(ggml_is_contiguous(dst) && ggml_is_contiguous(src0));
7915
8106
 
@@ -7921,7 +8112,7 @@ static void ggml_compute_forward_acc_f32(
7921
8112
  size_t offset = ((int32_t *) dst->op_params)[3];
7922
8113
  bool inplace = (bool) ((int32_t *) dst->op_params)[4];
7923
8114
 
7924
- if (!inplace && (params->type == GGML_TASK_INIT)) {
8115
+ if (!inplace && (params->type == GGML_TASK_TYPE_INIT)) {
7925
8116
  if (params->ith != 0) {
7926
8117
  return;
7927
8118
  }
@@ -7933,7 +8124,7 @@ static void ggml_compute_forward_acc_f32(
7933
8124
  ggml_nbytes(dst));
7934
8125
  }
7935
8126
 
7936
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
8127
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
7937
8128
  return;
7938
8129
  }
7939
8130
 
@@ -7989,14 +8180,14 @@ static void ggml_compute_forward_acc_f32(
7989
8180
 
7990
8181
  static void ggml_compute_forward_acc(
7991
8182
  const struct ggml_compute_params * params,
7992
- const struct ggml_tensor * src0,
7993
- const struct ggml_tensor * src1,
7994
8183
  struct ggml_tensor * dst) {
7995
8184
 
8185
+ const struct ggml_tensor * src0 = dst->src[0];
8186
+
7996
8187
  switch (src0->type) {
7997
8188
  case GGML_TYPE_F32:
7998
8189
  {
7999
- ggml_compute_forward_acc_f32(params, src0, src1, dst);
8190
+ ggml_compute_forward_acc_f32(params, dst);
8000
8191
  } break;
8001
8192
  case GGML_TYPE_F16:
8002
8193
  case GGML_TYPE_Q4_0:
@@ -8013,6 +8204,11 @@ static void ggml_compute_forward_acc(
8013
8204
  case GGML_TYPE_IQ2_XXS:
8014
8205
  case GGML_TYPE_IQ2_XS:
8015
8206
  case GGML_TYPE_IQ3_XXS:
8207
+ case GGML_TYPE_IQ1_S:
8208
+ case GGML_TYPE_IQ4_NL:
8209
+ case GGML_TYPE_IQ4_XS:
8210
+ case GGML_TYPE_IQ3_S:
8211
+ case GGML_TYPE_IQ2_S:
8016
8212
  default:
8017
8213
  {
8018
8214
  GGML_ASSERT(false);
@@ -8024,13 +8220,15 @@ static void ggml_compute_forward_acc(
8024
8220
 
8025
8221
  static void ggml_compute_forward_sub_f32(
8026
8222
  const struct ggml_compute_params * params,
8027
- const struct ggml_tensor * src0,
8028
- const struct ggml_tensor * src1,
8029
8223
  struct ggml_tensor * dst) {
8224
+
8225
+ const struct ggml_tensor * src0 = dst->src[0];
8226
+ const struct ggml_tensor * src1 = dst->src[1];
8227
+
8030
8228
  assert(params->ith == 0);
8031
8229
  assert(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));
8032
8230
 
8033
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
8231
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
8034
8232
  return;
8035
8233
  }
8036
8234
 
@@ -8084,13 +8282,14 @@ static void ggml_compute_forward_sub_f32(
8084
8282
 
8085
8283
  static void ggml_compute_forward_sub(
8086
8284
  const struct ggml_compute_params * params,
8087
- const struct ggml_tensor * src0,
8088
- const struct ggml_tensor * src1,
8089
8285
  struct ggml_tensor * dst) {
8286
+
8287
+ const struct ggml_tensor * src0 = dst->src[0];
8288
+
8090
8289
  switch (src0->type) {
8091
8290
  case GGML_TYPE_F32:
8092
8291
  {
8093
- ggml_compute_forward_sub_f32(params, src0, src1, dst);
8292
+ ggml_compute_forward_sub_f32(params, dst);
8094
8293
  } break;
8095
8294
  default:
8096
8295
  {
@@ -8103,19 +8302,21 @@ static void ggml_compute_forward_sub(
8103
8302
 
8104
8303
  static void ggml_compute_forward_mul_f32(
8105
8304
  const struct ggml_compute_params * params,
8106
- const struct ggml_tensor * src0,
8107
- const struct ggml_tensor * src1,
8108
8305
  struct ggml_tensor * dst) {
8306
+
8307
+ const struct ggml_tensor * src0 = dst->src[0];
8308
+ const struct ggml_tensor * src1 = dst->src[1];
8309
+
8109
8310
  GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst));
8110
8311
 
8111
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
8312
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
8112
8313
  return;
8113
8314
  }
8114
8315
  const int ith = params->ith;
8115
8316
  const int nth = params->nth;
8116
8317
 
8117
8318
  #if defined(GGML_USE_CLBLAST)
8118
- if (src1->backend == GGML_BACKEND_GPU) {
8319
+ if (src1->backend == GGML_BACKEND_TYPE_GPU) {
8119
8320
  // TODO: OpenCL kernel support full broadcast
8120
8321
  GGML_ASSERT(ggml_can_repeat_rows(src1, src0));
8121
8322
  if (ith == 0) {
@@ -8186,15 +8387,17 @@ static void ggml_compute_forward_mul_f32(
8186
8387
 
8187
8388
  static void ggml_compute_forward_mul(
8188
8389
  const struct ggml_compute_params * params,
8189
- const struct ggml_tensor * src0,
8190
- const struct ggml_tensor * src1,
8191
8390
  struct ggml_tensor * dst) {
8391
+
8392
+ const struct ggml_tensor * src0 = dst->src[0];
8393
+ const struct ggml_tensor * src1 = dst->src[1];
8394
+
8192
8395
  GGML_ASSERT(src1->type == GGML_TYPE_F32 && "only f32 src1 supported for now");
8193
8396
 
8194
8397
  switch (src0->type) {
8195
8398
  case GGML_TYPE_F32:
8196
8399
  {
8197
- ggml_compute_forward_mul_f32(params, src0, src1, dst);
8400
+ ggml_compute_forward_mul_f32(params, dst);
8198
8401
  } break;
8199
8402
  default:
8200
8403
  {
@@ -8207,12 +8410,14 @@ static void ggml_compute_forward_mul(
8207
8410
 
8208
8411
  static void ggml_compute_forward_div_f32(
8209
8412
  const struct ggml_compute_params * params,
8210
- const struct ggml_tensor * src0,
8211
- const struct ggml_tensor * src1,
8212
8413
  struct ggml_tensor * dst) {
8414
+
8415
+ const struct ggml_tensor * src0 = dst->src[0];
8416
+ const struct ggml_tensor * src1 = dst->src[1];
8417
+
8213
8418
  GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst));
8214
8419
 
8215
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
8420
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
8216
8421
  return;
8217
8422
  }
8218
8423
 
@@ -8280,13 +8485,14 @@ static void ggml_compute_forward_div_f32(
8280
8485
 
8281
8486
  static void ggml_compute_forward_div(
8282
8487
  const struct ggml_compute_params * params,
8283
- const struct ggml_tensor * src0,
8284
- const struct ggml_tensor * src1,
8285
8488
  struct ggml_tensor * dst) {
8489
+
8490
+ const struct ggml_tensor * src0 = dst->src[0];
8491
+
8286
8492
  switch (src0->type) {
8287
8493
  case GGML_TYPE_F32:
8288
8494
  {
8289
- ggml_compute_forward_div_f32(params, src0, src1, dst);
8495
+ ggml_compute_forward_div_f32(params, dst);
8290
8496
  } break;
8291
8497
  default:
8292
8498
  {
@@ -8299,12 +8505,14 @@ static void ggml_compute_forward_div(
8299
8505
 
8300
8506
  static void ggml_compute_forward_sqr_f32(
8301
8507
  const struct ggml_compute_params * params,
8302
- const struct ggml_tensor * src0,
8303
8508
  struct ggml_tensor * dst) {
8509
+
8510
+ const struct ggml_tensor * src0 = dst->src[0];
8511
+
8304
8512
  assert(params->ith == 0);
8305
8513
  assert(ggml_are_same_shape(src0, dst));
8306
8514
 
8307
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
8515
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
8308
8516
  return;
8309
8517
  }
8310
8518
 
@@ -8323,12 +8531,14 @@ static void ggml_compute_forward_sqr_f32(
8323
8531
 
8324
8532
  static void ggml_compute_forward_sqr(
8325
8533
  const struct ggml_compute_params * params,
8326
- const struct ggml_tensor * src0,
8327
8534
  struct ggml_tensor * dst) {
8535
+
8536
+ const struct ggml_tensor * src0 = dst->src[0];
8537
+
8328
8538
  switch (src0->type) {
8329
8539
  case GGML_TYPE_F32:
8330
8540
  {
8331
- ggml_compute_forward_sqr_f32(params, src0, dst);
8541
+ ggml_compute_forward_sqr_f32(params, dst);
8332
8542
  } break;
8333
8543
  default:
8334
8544
  {
@@ -8341,12 +8551,14 @@ static void ggml_compute_forward_sqr(
8341
8551
 
8342
8552
  static void ggml_compute_forward_sqrt_f32(
8343
8553
  const struct ggml_compute_params * params,
8344
- const struct ggml_tensor * src0,
8345
8554
  struct ggml_tensor * dst) {
8555
+
8556
+ const struct ggml_tensor * src0 = dst->src[0];
8557
+
8346
8558
  assert(params->ith == 0);
8347
8559
  assert(ggml_are_same_shape(src0, dst));
8348
8560
 
8349
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
8561
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
8350
8562
  return;
8351
8563
  }
8352
8564
 
@@ -8365,12 +8577,14 @@ static void ggml_compute_forward_sqrt_f32(
8365
8577
 
8366
8578
  static void ggml_compute_forward_sqrt(
8367
8579
  const struct ggml_compute_params * params,
8368
- const struct ggml_tensor * src0,
8369
8580
  struct ggml_tensor * dst) {
8581
+
8582
+ const struct ggml_tensor * src0 = dst->src[0];
8583
+
8370
8584
  switch (src0->type) {
8371
8585
  case GGML_TYPE_F32:
8372
8586
  {
8373
- ggml_compute_forward_sqrt_f32(params, src0, dst);
8587
+ ggml_compute_forward_sqrt_f32(params, dst);
8374
8588
  } break;
8375
8589
  default:
8376
8590
  {
@@ -8383,12 +8597,14 @@ static void ggml_compute_forward_sqrt(
8383
8597
 
8384
8598
  static void ggml_compute_forward_log_f32(
8385
8599
  const struct ggml_compute_params * params,
8386
- const struct ggml_tensor * src0,
8387
8600
  struct ggml_tensor * dst) {
8601
+
8602
+ const struct ggml_tensor * src0 = dst->src[0];
8603
+
8388
8604
  GGML_ASSERT(params->ith == 0);
8389
8605
  GGML_ASSERT(ggml_are_same_shape(src0, dst));
8390
8606
 
8391
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
8607
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
8392
8608
  return;
8393
8609
  }
8394
8610
 
@@ -8407,12 +8623,14 @@ static void ggml_compute_forward_log_f32(
8407
8623
 
8408
8624
  static void ggml_compute_forward_log(
8409
8625
  const struct ggml_compute_params * params,
8410
- const struct ggml_tensor * src0,
8411
8626
  struct ggml_tensor * dst) {
8627
+
8628
+ const struct ggml_tensor * src0 = dst->src[0];
8629
+
8412
8630
  switch (src0->type) {
8413
8631
  case GGML_TYPE_F32:
8414
8632
  {
8415
- ggml_compute_forward_log_f32(params, src0, dst);
8633
+ ggml_compute_forward_log_f32(params, dst);
8416
8634
  } break;
8417
8635
  default:
8418
8636
  {
@@ -8425,12 +8643,14 @@ static void ggml_compute_forward_log(
8425
8643
 
8426
8644
  static void ggml_compute_forward_sum_f32(
8427
8645
  const struct ggml_compute_params * params,
8428
- const struct ggml_tensor * src0,
8429
8646
  struct ggml_tensor * dst) {
8647
+
8648
+ const struct ggml_tensor * src0 = dst->src[0];
8649
+
8430
8650
  assert(params->ith == 0);
8431
8651
  assert(ggml_is_scalar(dst));
8432
8652
 
8433
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
8653
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
8434
8654
  return;
8435
8655
  }
8436
8656
 
@@ -8458,12 +8678,14 @@ static void ggml_compute_forward_sum_f32(
8458
8678
 
8459
8679
  static void ggml_compute_forward_sum_f16(
8460
8680
  const struct ggml_compute_params * params,
8461
- const struct ggml_tensor * src0,
8462
8681
  struct ggml_tensor * dst) {
8682
+
8683
+ const struct ggml_tensor * src0 = dst->src[0];
8684
+
8463
8685
  assert(params->ith == 0);
8464
8686
  assert(ggml_is_scalar(dst));
8465
8687
 
8466
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
8688
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
8467
8689
  return;
8468
8690
  }
8469
8691
 
@@ -8490,16 +8712,18 @@ static void ggml_compute_forward_sum_f16(
8490
8712
 
8491
8713
  static void ggml_compute_forward_sum(
8492
8714
  const struct ggml_compute_params * params,
8493
- const struct ggml_tensor * src0,
8494
8715
  struct ggml_tensor * dst) {
8716
+
8717
+ const struct ggml_tensor * src0 = dst->src[0];
8718
+
8495
8719
  switch (src0->type) {
8496
8720
  case GGML_TYPE_F32:
8497
8721
  {
8498
- ggml_compute_forward_sum_f32(params, src0, dst);
8722
+ ggml_compute_forward_sum_f32(params, dst);
8499
8723
  } break;
8500
8724
  case GGML_TYPE_F16:
8501
8725
  {
8502
- ggml_compute_forward_sum_f16(params, src0, dst);
8726
+ ggml_compute_forward_sum_f16(params, dst);
8503
8727
  } break;
8504
8728
  default:
8505
8729
  {
@@ -8512,11 +8736,13 @@ static void ggml_compute_forward_sum(
8512
8736
 
8513
8737
  static void ggml_compute_forward_sum_rows_f32(
8514
8738
  const struct ggml_compute_params * params,
8515
- const struct ggml_tensor * src0,
8516
8739
  struct ggml_tensor * dst) {
8740
+
8741
+ const struct ggml_tensor * src0 = dst->src[0];
8742
+
8517
8743
  GGML_ASSERT(params->ith == 0);
8518
8744
 
8519
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
8745
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
8520
8746
  return;
8521
8747
  }
8522
8748
 
@@ -8545,12 +8771,14 @@ static void ggml_compute_forward_sum_rows_f32(
8545
8771
 
8546
8772
  static void ggml_compute_forward_sum_rows(
8547
8773
  const struct ggml_compute_params * params,
8548
- const struct ggml_tensor * src0,
8549
8774
  struct ggml_tensor * dst) {
8775
+
8776
+ const struct ggml_tensor * src0 = dst->src[0];
8777
+
8550
8778
  switch (src0->type) {
8551
8779
  case GGML_TYPE_F32:
8552
8780
  {
8553
- ggml_compute_forward_sum_rows_f32(params, src0, dst);
8781
+ ggml_compute_forward_sum_rows_f32(params, dst);
8554
8782
  } break;
8555
8783
  default:
8556
8784
  {
@@ -8563,11 +8791,13 @@ static void ggml_compute_forward_sum_rows(
8563
8791
 
8564
8792
  static void ggml_compute_forward_mean_f32(
8565
8793
  const struct ggml_compute_params * params,
8566
- const struct ggml_tensor * src0,
8567
8794
  struct ggml_tensor * dst) {
8795
+
8796
+ const struct ggml_tensor * src0 = dst->src[0];
8797
+
8568
8798
  assert(params->ith == 0);
8569
8799
 
8570
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
8800
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
8571
8801
  return;
8572
8802
  }
8573
8803
 
@@ -8600,12 +8830,14 @@ static void ggml_compute_forward_mean_f32(
8600
8830
 
8601
8831
  static void ggml_compute_forward_mean(
8602
8832
  const struct ggml_compute_params * params,
8603
- const struct ggml_tensor * src0,
8604
8833
  struct ggml_tensor * dst) {
8834
+
8835
+ const struct ggml_tensor * src0 = dst->src[0];
8836
+
8605
8837
  switch (src0->type) {
8606
8838
  case GGML_TYPE_F32:
8607
8839
  {
8608
- ggml_compute_forward_mean_f32(params, src0, dst);
8840
+ ggml_compute_forward_mean_f32(params, dst);
8609
8841
  } break;
8610
8842
  default:
8611
8843
  {
@@ -8618,11 +8850,13 @@ static void ggml_compute_forward_mean(
8618
8850
 
8619
8851
  static void ggml_compute_forward_argmax_f32(
8620
8852
  const struct ggml_compute_params * params,
8621
- const struct ggml_tensor * src0,
8622
8853
  struct ggml_tensor * dst) {
8854
+
8855
+ const struct ggml_tensor * src0 = dst->src[0];
8856
+
8623
8857
  assert(params->ith == 0);
8624
8858
 
8625
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
8859
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
8626
8860
  return;
8627
8861
  }
8628
8862
 
@@ -8646,12 +8880,14 @@ static void ggml_compute_forward_argmax_f32(
8646
8880
 
8647
8881
  static void ggml_compute_forward_argmax(
8648
8882
  const struct ggml_compute_params * params,
8649
- const struct ggml_tensor * src0,
8650
8883
  struct ggml_tensor * dst) {
8884
+
8885
+ const struct ggml_tensor * src0 = dst->src[0];
8886
+
8651
8887
  switch (src0->type) {
8652
8888
  case GGML_TYPE_F32:
8653
8889
  {
8654
- ggml_compute_forward_argmax_f32(params, src0, dst);
8890
+ ggml_compute_forward_argmax_f32(params, dst);
8655
8891
  } break;
8656
8892
  default:
8657
8893
  {
@@ -8664,12 +8900,14 @@ static void ggml_compute_forward_argmax(
8664
8900
 
8665
8901
  static void ggml_compute_forward_repeat_f32(
8666
8902
  const struct ggml_compute_params * params,
8667
- const struct ggml_tensor * src0,
8668
8903
  struct ggml_tensor * dst) {
8904
+
8905
+ const struct ggml_tensor * src0 = dst->src[0];
8906
+
8669
8907
  GGML_ASSERT(params->ith == 0);
8670
8908
  GGML_ASSERT(ggml_can_repeat(src0, dst));
8671
8909
 
8672
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
8910
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
8673
8911
  return;
8674
8912
  }
8675
8913
 
@@ -8707,12 +8945,14 @@ static void ggml_compute_forward_repeat_f32(
8707
8945
 
8708
8946
  static void ggml_compute_forward_repeat_f16(
8709
8947
  const struct ggml_compute_params * params,
8710
- const struct ggml_tensor * src0,
8711
8948
  struct ggml_tensor * dst) {
8949
+
8950
+ const struct ggml_tensor * src0 = dst->src[0];
8951
+
8712
8952
  GGML_ASSERT(params->ith == 0);
8713
8953
  GGML_ASSERT(ggml_can_repeat(src0, dst));
8714
8954
 
8715
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
8955
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
8716
8956
  return;
8717
8957
  }
8718
8958
 
@@ -8753,18 +8993,20 @@ static void ggml_compute_forward_repeat_f16(
8753
8993
 
8754
8994
  static void ggml_compute_forward_repeat(
8755
8995
  const struct ggml_compute_params * params,
8756
- const struct ggml_tensor * src0,
8757
8996
  struct ggml_tensor * dst) {
8997
+
8998
+ const struct ggml_tensor * src0 = dst->src[0];
8999
+
8758
9000
  switch (src0->type) {
8759
9001
  case GGML_TYPE_F16:
8760
9002
  case GGML_TYPE_I16:
8761
9003
  {
8762
- ggml_compute_forward_repeat_f16(params, src0, dst);
9004
+ ggml_compute_forward_repeat_f16(params, dst);
8763
9005
  } break;
8764
9006
  case GGML_TYPE_F32:
8765
9007
  case GGML_TYPE_I32:
8766
9008
  {
8767
- ggml_compute_forward_repeat_f32(params, src0, dst);
9009
+ ggml_compute_forward_repeat_f32(params, dst);
8768
9010
  } break;
8769
9011
  default:
8770
9012
  {
@@ -8777,12 +9019,14 @@ static void ggml_compute_forward_repeat(
8777
9019
 
8778
9020
  static void ggml_compute_forward_repeat_back_f32(
8779
9021
  const struct ggml_compute_params * params,
8780
- const struct ggml_tensor * src0,
8781
9022
  struct ggml_tensor * dst) {
9023
+
9024
+ const struct ggml_tensor * src0 = dst->src[0];
9025
+
8782
9026
  GGML_ASSERT(params->ith == 0);
8783
9027
  GGML_ASSERT(ggml_can_repeat(dst, src0));
8784
9028
 
8785
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
9029
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
8786
9030
  return;
8787
9031
  }
8788
9032
 
@@ -8834,12 +9078,14 @@ static void ggml_compute_forward_repeat_back_f32(
8834
9078
 
8835
9079
  static void ggml_compute_forward_repeat_back(
8836
9080
  const struct ggml_compute_params * params,
8837
- const struct ggml_tensor * src0,
8838
9081
  struct ggml_tensor * dst) {
9082
+
9083
+ const struct ggml_tensor * src0 = dst->src[0];
9084
+
8839
9085
  switch (src0->type) {
8840
9086
  case GGML_TYPE_F32:
8841
9087
  {
8842
- ggml_compute_forward_repeat_back_f32(params, src0, dst);
9088
+ ggml_compute_forward_repeat_back_f32(params, dst);
8843
9089
  } break;
8844
9090
  default:
8845
9091
  {
@@ -8852,11 +9098,12 @@ static void ggml_compute_forward_repeat_back(
8852
9098
 
8853
9099
  static void ggml_compute_forward_concat_f32(
8854
9100
  const struct ggml_compute_params * params,
8855
- const struct ggml_tensor * src0,
8856
- const struct ggml_tensor * src1,
8857
9101
  struct ggml_tensor * dst) {
8858
9102
 
8859
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
9103
+ const struct ggml_tensor * src0 = dst->src[0];
9104
+ const struct ggml_tensor * src1 = dst->src[1];
9105
+
9106
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
8860
9107
  return;
8861
9108
  }
8862
9109
 
@@ -8900,14 +9147,15 @@ static void ggml_compute_forward_concat_f32(
8900
9147
 
8901
9148
  static void ggml_compute_forward_concat(
8902
9149
  const struct ggml_compute_params* params,
8903
- const struct ggml_tensor* src0,
8904
- const struct ggml_tensor* src1,
8905
9150
  struct ggml_tensor* dst) {
9151
+
9152
+ const struct ggml_tensor * src0 = dst->src[0];
9153
+
8906
9154
  switch (src0->type) {
8907
9155
  case GGML_TYPE_F32:
8908
9156
  case GGML_TYPE_I32:
8909
9157
  {
8910
- ggml_compute_forward_concat_f32(params, src0, src1, dst);
9158
+ ggml_compute_forward_concat_f32(params, dst);
8911
9159
  } break;
8912
9160
  default:
8913
9161
  {
@@ -8920,12 +9168,14 @@ static void ggml_compute_forward_concat(
8920
9168
 
8921
9169
  static void ggml_compute_forward_abs_f32(
8922
9170
  const struct ggml_compute_params * params,
8923
- const struct ggml_tensor * src0,
8924
9171
  struct ggml_tensor * dst) {
9172
+
9173
+ const struct ggml_tensor * src0 = dst->src[0];
9174
+
8925
9175
  assert(params->ith == 0);
8926
9176
  assert(ggml_are_same_shape(src0, dst));
8927
9177
 
8928
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
9178
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
8929
9179
  return;
8930
9180
  }
8931
9181
 
@@ -8944,12 +9194,14 @@ static void ggml_compute_forward_abs_f32(
8944
9194
 
8945
9195
  static void ggml_compute_forward_abs(
8946
9196
  const struct ggml_compute_params * params,
8947
- const struct ggml_tensor * src0,
8948
9197
  struct ggml_tensor * dst) {
9198
+
9199
+ const struct ggml_tensor * src0 = dst->src[0];
9200
+
8949
9201
  switch (src0->type) {
8950
9202
  case GGML_TYPE_F32:
8951
9203
  {
8952
- ggml_compute_forward_abs_f32(params, src0, dst);
9204
+ ggml_compute_forward_abs_f32(params, dst);
8953
9205
  } break;
8954
9206
  default:
8955
9207
  {
@@ -8962,12 +9214,14 @@ static void ggml_compute_forward_abs(
8962
9214
 
8963
9215
  static void ggml_compute_forward_sgn_f32(
8964
9216
  const struct ggml_compute_params * params,
8965
- const struct ggml_tensor * src0,
8966
9217
  struct ggml_tensor * dst) {
9218
+
9219
+ const struct ggml_tensor * src0 = dst->src[0];
9220
+
8967
9221
  assert(params->ith == 0);
8968
9222
  assert(ggml_are_same_shape(src0, dst));
8969
9223
 
8970
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
9224
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
8971
9225
  return;
8972
9226
  }
8973
9227
 
@@ -8986,12 +9240,14 @@ static void ggml_compute_forward_sgn_f32(
8986
9240
 
8987
9241
  static void ggml_compute_forward_sgn(
8988
9242
  const struct ggml_compute_params * params,
8989
- const struct ggml_tensor * src0,
8990
9243
  struct ggml_tensor * dst) {
9244
+
9245
+ const struct ggml_tensor * src0 = dst->src[0];
9246
+
8991
9247
  switch (src0->type) {
8992
9248
  case GGML_TYPE_F32:
8993
9249
  {
8994
- ggml_compute_forward_sgn_f32(params, src0, dst);
9250
+ ggml_compute_forward_sgn_f32(params, dst);
8995
9251
  } break;
8996
9252
  default:
8997
9253
  {
@@ -9004,12 +9260,14 @@ static void ggml_compute_forward_sgn(
9004
9260
 
9005
9261
  static void ggml_compute_forward_neg_f32(
9006
9262
  const struct ggml_compute_params * params,
9007
- const struct ggml_tensor * src0,
9008
9263
  struct ggml_tensor * dst) {
9264
+
9265
+ const struct ggml_tensor * src0 = dst->src[0];
9266
+
9009
9267
  assert(params->ith == 0);
9010
9268
  assert(ggml_are_same_shape(src0, dst));
9011
9269
 
9012
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
9270
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
9013
9271
  return;
9014
9272
  }
9015
9273
 
@@ -9028,12 +9286,14 @@ static void ggml_compute_forward_neg_f32(
9028
9286
 
9029
9287
  static void ggml_compute_forward_neg(
9030
9288
  const struct ggml_compute_params * params,
9031
- const struct ggml_tensor * src0,
9032
9289
  struct ggml_tensor * dst) {
9290
+
9291
+ const struct ggml_tensor * src0 = dst->src[0];
9292
+
9033
9293
  switch (src0->type) {
9034
9294
  case GGML_TYPE_F32:
9035
9295
  {
9036
- ggml_compute_forward_neg_f32(params, src0, dst);
9296
+ ggml_compute_forward_neg_f32(params, dst);
9037
9297
  } break;
9038
9298
  default:
9039
9299
  {
@@ -9046,12 +9306,14 @@ static void ggml_compute_forward_neg(
9046
9306
 
9047
9307
  static void ggml_compute_forward_step_f32(
9048
9308
  const struct ggml_compute_params * params,
9049
- const struct ggml_tensor * src0,
9050
9309
  struct ggml_tensor * dst) {
9310
+
9311
+ const struct ggml_tensor * src0 = dst->src[0];
9312
+
9051
9313
  assert(params->ith == 0);
9052
9314
  assert(ggml_are_same_shape(src0, dst));
9053
9315
 
9054
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
9316
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
9055
9317
  return;
9056
9318
  }
9057
9319
 
@@ -9070,12 +9332,14 @@ static void ggml_compute_forward_step_f32(
9070
9332
 
9071
9333
  static void ggml_compute_forward_step(
9072
9334
  const struct ggml_compute_params * params,
9073
- const struct ggml_tensor * src0,
9074
9335
  struct ggml_tensor * dst) {
9336
+
9337
+ const struct ggml_tensor * src0 = dst->src[0];
9338
+
9075
9339
  switch (src0->type) {
9076
9340
  case GGML_TYPE_F32:
9077
9341
  {
9078
- ggml_compute_forward_step_f32(params, src0, dst);
9342
+ ggml_compute_forward_step_f32(params, dst);
9079
9343
  } break;
9080
9344
  default:
9081
9345
  {
@@ -9088,12 +9352,14 @@ static void ggml_compute_forward_step(
9088
9352
 
9089
9353
  static void ggml_compute_forward_tanh_f32(
9090
9354
  const struct ggml_compute_params * params,
9091
- const struct ggml_tensor * src0,
9092
9355
  struct ggml_tensor * dst) {
9356
+
9357
+ const struct ggml_tensor * src0 = dst->src[0];
9358
+
9093
9359
  assert(params->ith == 0);
9094
9360
  assert(ggml_are_same_shape(src0, dst));
9095
9361
 
9096
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
9362
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
9097
9363
  return;
9098
9364
  }
9099
9365
 
@@ -9112,12 +9378,14 @@ static void ggml_compute_forward_tanh_f32(
9112
9378
 
9113
9379
  static void ggml_compute_forward_tanh(
9114
9380
  const struct ggml_compute_params * params,
9115
- const struct ggml_tensor * src0,
9116
9381
  struct ggml_tensor * dst) {
9382
+
9383
+ const struct ggml_tensor * src0 = dst->src[0];
9384
+
9117
9385
  switch (src0->type) {
9118
9386
  case GGML_TYPE_F32:
9119
9387
  {
9120
- ggml_compute_forward_tanh_f32(params, src0, dst);
9388
+ ggml_compute_forward_tanh_f32(params, dst);
9121
9389
  } break;
9122
9390
  default:
9123
9391
  {
@@ -9130,12 +9398,14 @@ static void ggml_compute_forward_tanh(
9130
9398
 
9131
9399
  static void ggml_compute_forward_elu_f32(
9132
9400
  const struct ggml_compute_params * params,
9133
- const struct ggml_tensor * src0,
9134
9401
  struct ggml_tensor * dst) {
9402
+
9403
+ const struct ggml_tensor * src0 = dst->src[0];
9404
+
9135
9405
  assert(params->ith == 0);
9136
9406
  assert(ggml_are_same_shape(src0, dst));
9137
9407
 
9138
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
9408
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
9139
9409
  return;
9140
9410
  }
9141
9411
 
@@ -9154,12 +9424,14 @@ static void ggml_compute_forward_elu_f32(
9154
9424
 
9155
9425
  static void ggml_compute_forward_elu(
9156
9426
  const struct ggml_compute_params * params,
9157
- const struct ggml_tensor * src0,
9158
9427
  struct ggml_tensor * dst) {
9428
+
9429
+ const struct ggml_tensor * src0 = dst->src[0];
9430
+
9159
9431
  switch (src0->type) {
9160
9432
  case GGML_TYPE_F32:
9161
9433
  {
9162
- ggml_compute_forward_elu_f32(params, src0, dst);
9434
+ ggml_compute_forward_elu_f32(params, dst);
9163
9435
  } break;
9164
9436
  default:
9165
9437
  {
@@ -9172,12 +9444,14 @@ static void ggml_compute_forward_elu(
9172
9444
 
9173
9445
  static void ggml_compute_forward_relu_f32(
9174
9446
  const struct ggml_compute_params * params,
9175
- const struct ggml_tensor * src0,
9176
9447
  struct ggml_tensor * dst) {
9448
+
9449
+ const struct ggml_tensor * src0 = dst->src[0];
9450
+
9177
9451
  assert(params->ith == 0);
9178
9452
  assert(ggml_are_same_shape(src0, dst));
9179
9453
 
9180
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
9454
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
9181
9455
  return;
9182
9456
  }
9183
9457
 
@@ -9196,12 +9470,14 @@ static void ggml_compute_forward_relu_f32(
9196
9470
 
9197
9471
  static void ggml_compute_forward_relu(
9198
9472
  const struct ggml_compute_params * params,
9199
- const struct ggml_tensor * src0,
9200
9473
  struct ggml_tensor * dst) {
9474
+
9475
+ const struct ggml_tensor * src0 = dst->src[0];
9476
+
9201
9477
  switch (src0->type) {
9202
9478
  case GGML_TYPE_F32:
9203
9479
  {
9204
- ggml_compute_forward_relu_f32(params, src0, dst);
9480
+ ggml_compute_forward_relu_f32(params, dst);
9205
9481
  } break;
9206
9482
  default:
9207
9483
  {
@@ -9214,13 +9490,15 @@ static void ggml_compute_forward_relu(
9214
9490
 
9215
9491
  static void ggml_compute_forward_gelu_f32(
9216
9492
  const struct ggml_compute_params * params,
9217
- const struct ggml_tensor * src0,
9218
9493
  struct ggml_tensor * dst) {
9219
- GGML_ASSERT(ggml_is_contiguous_except_dim_1(src0));
9494
+
9495
+ const struct ggml_tensor * src0 = dst->src[0];
9496
+
9497
+ GGML_ASSERT(ggml_is_contiguous_except_dim_1(src0));
9220
9498
  GGML_ASSERT(ggml_is_contiguous_except_dim_1(dst));
9221
9499
  GGML_ASSERT(ggml_are_same_shape(src0, dst));
9222
9500
 
9223
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
9501
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
9224
9502
  return;
9225
9503
  }
9226
9504
 
@@ -9255,12 +9533,14 @@ static void ggml_compute_forward_gelu_f32(
9255
9533
 
9256
9534
  static void ggml_compute_forward_gelu(
9257
9535
  const struct ggml_compute_params * params,
9258
- const struct ggml_tensor * src0,
9259
9536
  struct ggml_tensor * dst) {
9537
+
9538
+ const struct ggml_tensor * src0 = dst->src[0];
9539
+
9260
9540
  switch (src0->type) {
9261
9541
  case GGML_TYPE_F32:
9262
9542
  {
9263
- ggml_compute_forward_gelu_f32(params, src0, dst);
9543
+ ggml_compute_forward_gelu_f32(params, dst);
9264
9544
  } break;
9265
9545
  default:
9266
9546
  {
@@ -9273,13 +9553,15 @@ static void ggml_compute_forward_gelu(
9273
9553
 
9274
9554
  static void ggml_compute_forward_gelu_quick_f32(
9275
9555
  const struct ggml_compute_params * params,
9276
- const struct ggml_tensor * src0,
9277
9556
  struct ggml_tensor * dst) {
9557
+
9558
+ const struct ggml_tensor * src0 = dst->src[0];
9559
+
9278
9560
  GGML_ASSERT(ggml_is_contiguous_except_dim_1(src0));
9279
9561
  GGML_ASSERT(ggml_is_contiguous_except_dim_1(dst));
9280
9562
  GGML_ASSERT(ggml_are_same_shape(src0, dst));
9281
9563
 
9282
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
9564
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
9283
9565
  return;
9284
9566
  }
9285
9567
 
@@ -9314,12 +9596,14 @@ static void ggml_compute_forward_gelu_quick_f32(
9314
9596
 
9315
9597
  static void ggml_compute_forward_gelu_quick(
9316
9598
  const struct ggml_compute_params * params,
9317
- const struct ggml_tensor * src0,
9318
9599
  struct ggml_tensor * dst) {
9600
+
9601
+ const struct ggml_tensor * src0 = dst->src[0];
9602
+
9319
9603
  switch (src0->type) {
9320
9604
  case GGML_TYPE_F32:
9321
9605
  {
9322
- ggml_compute_forward_gelu_quick_f32(params, src0, dst);
9606
+ ggml_compute_forward_gelu_quick_f32(params, dst);
9323
9607
  } break;
9324
9608
  default:
9325
9609
  {
@@ -9332,13 +9616,15 @@ static void ggml_compute_forward_gelu_quick(
9332
9616
 
9333
9617
  static void ggml_compute_forward_silu_f32(
9334
9618
  const struct ggml_compute_params * params,
9335
- const struct ggml_tensor * src0,
9336
9619
  struct ggml_tensor * dst) {
9620
+
9621
+ const struct ggml_tensor * src0 = dst->src[0];
9622
+
9337
9623
  GGML_ASSERT(ggml_is_contiguous_except_dim_1(src0));
9338
9624
  GGML_ASSERT(ggml_is_contiguous_except_dim_1(dst));
9339
9625
  GGML_ASSERT(ggml_are_same_shape(src0, dst));
9340
9626
 
9341
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
9627
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
9342
9628
  return;
9343
9629
  }
9344
9630
 
@@ -9373,12 +9659,14 @@ static void ggml_compute_forward_silu_f32(
9373
9659
 
9374
9660
  static void ggml_compute_forward_silu(
9375
9661
  const struct ggml_compute_params * params,
9376
- const struct ggml_tensor * src0,
9377
9662
  struct ggml_tensor * dst) {
9663
+
9664
+ const struct ggml_tensor * src0 = dst->src[0];
9665
+
9378
9666
  switch (src0->type) {
9379
9667
  case GGML_TYPE_F32:
9380
9668
  {
9381
- ggml_compute_forward_silu_f32(params, src0, dst);
9669
+ ggml_compute_forward_silu_f32(params, dst);
9382
9670
  } break;
9383
9671
  default:
9384
9672
  {
@@ -9390,12 +9678,14 @@ static void ggml_compute_forward_silu(
9390
9678
 
9391
9679
  static void ggml_compute_forward_leaky_relu_f32(
9392
9680
  const struct ggml_compute_params * params,
9393
- const struct ggml_tensor * src0,
9394
9681
  struct ggml_tensor * dst) {
9682
+
9683
+ const struct ggml_tensor * src0 = dst->src[0];
9684
+
9395
9685
  assert(params->ith == 0);
9396
9686
  assert(ggml_are_same_shape(src0, dst));
9397
9687
 
9398
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
9688
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
9399
9689
  return;
9400
9690
  }
9401
9691
 
@@ -9417,12 +9707,14 @@ static void ggml_compute_forward_leaky_relu_f32(
9417
9707
 
9418
9708
  static void ggml_compute_forward_leaky_relu(
9419
9709
  const struct ggml_compute_params * params,
9420
- const struct ggml_tensor * src0,
9421
9710
  struct ggml_tensor * dst) {
9711
+
9712
+ const struct ggml_tensor * src0 = dst->src[0];
9713
+
9422
9714
  switch (src0->type) {
9423
9715
  case GGML_TYPE_F32:
9424
9716
  {
9425
- ggml_compute_forward_leaky_relu_f32(params, src0, dst);
9717
+ ggml_compute_forward_leaky_relu_f32(params, dst);
9426
9718
  } break;
9427
9719
  default:
9428
9720
  {
@@ -9435,16 +9727,18 @@ static void ggml_compute_forward_leaky_relu(
9435
9727
 
9436
9728
  static void ggml_compute_forward_silu_back_f32(
9437
9729
  const struct ggml_compute_params * params,
9438
- const struct ggml_tensor * src0,
9439
- const struct ggml_tensor * grad,
9440
9730
  struct ggml_tensor * dst) {
9731
+
9732
+ const struct ggml_tensor * src0 = dst->src[0];
9733
+ const struct ggml_tensor * grad = dst->src[1];
9734
+
9441
9735
  GGML_ASSERT(ggml_is_contiguous_except_dim_1(grad));
9442
9736
  GGML_ASSERT(ggml_is_contiguous_except_dim_1(src0));
9443
9737
  GGML_ASSERT(ggml_is_contiguous_except_dim_1(dst));
9444
9738
  GGML_ASSERT(ggml_are_same_shape(src0, dst));
9445
9739
  GGML_ASSERT(ggml_are_same_shape(src0, grad));
9446
9740
 
9447
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
9741
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
9448
9742
  return;
9449
9743
  }
9450
9744
 
@@ -9480,13 +9774,14 @@ static void ggml_compute_forward_silu_back_f32(
9480
9774
 
9481
9775
  static void ggml_compute_forward_silu_back(
9482
9776
  const struct ggml_compute_params * params,
9483
- const struct ggml_tensor * src0,
9484
- const struct ggml_tensor * grad,
9485
9777
  struct ggml_tensor * dst) {
9778
+
9779
+ const struct ggml_tensor * src0 = dst->src[0];
9780
+
9486
9781
  switch (src0->type) {
9487
9782
  case GGML_TYPE_F32:
9488
9783
  {
9489
- ggml_compute_forward_silu_back_f32(params, src0, grad, dst);
9784
+ ggml_compute_forward_silu_back_f32(params, dst);
9490
9785
  } break;
9491
9786
  default:
9492
9787
  {
@@ -9498,12 +9793,14 @@ static void ggml_compute_forward_silu_back(
9498
9793
 
9499
9794
  static void ggml_compute_forward_hardswish_f32(
9500
9795
  const struct ggml_compute_params * params,
9501
- const struct ggml_tensor * src0,
9502
9796
  struct ggml_tensor * dst) {
9797
+
9798
+ const struct ggml_tensor * src0 = dst->src[0];
9799
+
9503
9800
  assert(params->ith == 0);
9504
9801
  assert(ggml_are_same_shape(src0, dst));
9505
9802
 
9506
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
9803
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
9507
9804
  return;
9508
9805
  }
9509
9806
 
@@ -9521,12 +9818,14 @@ static void ggml_compute_forward_hardswish_f32(
9521
9818
  }
9522
9819
  static void ggml_compute_forward_hardswish(
9523
9820
  const struct ggml_compute_params * params,
9524
- const struct ggml_tensor * src0,
9525
9821
  struct ggml_tensor * dst) {
9822
+
9823
+ const struct ggml_tensor * src0 = dst->src[0];
9824
+
9526
9825
  switch (src0->type) {
9527
9826
  case GGML_TYPE_F32:
9528
9827
  {
9529
- ggml_compute_forward_hardswish_f32(params, src0, dst);
9828
+ ggml_compute_forward_hardswish_f32(params, dst);
9530
9829
  } break;
9531
9830
  default:
9532
9831
  {
@@ -9537,12 +9836,14 @@ static void ggml_compute_forward_hardswish(
9537
9836
 
9538
9837
  static void ggml_compute_forward_hardsigmoid_f32(
9539
9838
  const struct ggml_compute_params * params,
9540
- const struct ggml_tensor * src0,
9541
9839
  struct ggml_tensor * dst) {
9840
+
9841
+ const struct ggml_tensor * src0 = dst->src[0];
9842
+
9542
9843
  assert(params->ith == 0);
9543
9844
  assert(ggml_are_same_shape(src0, dst));
9544
9845
 
9545
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
9846
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
9546
9847
  return;
9547
9848
  }
9548
9849
 
@@ -9561,12 +9862,14 @@ static void ggml_compute_forward_hardsigmoid_f32(
9561
9862
 
9562
9863
  static void ggml_compute_forward_hardsigmoid(
9563
9864
  const struct ggml_compute_params * params,
9564
- const struct ggml_tensor * src0,
9565
9865
  struct ggml_tensor * dst) {
9866
+
9867
+ const struct ggml_tensor * src0 = dst->src[0];
9868
+
9566
9869
  switch (src0->type) {
9567
9870
  case GGML_TYPE_F32:
9568
9871
  {
9569
- ggml_compute_forward_hardsigmoid_f32(params, src0, dst);
9872
+ ggml_compute_forward_hardsigmoid_f32(params, dst);
9570
9873
  } break;
9571
9874
  default:
9572
9875
  {
@@ -9580,11 +9883,13 @@ static void ggml_compute_forward_hardsigmoid(
9580
9883
 
9581
9884
  static void ggml_compute_forward_norm_f32(
9582
9885
  const struct ggml_compute_params * params,
9583
- const struct ggml_tensor * src0,
9584
9886
  struct ggml_tensor * dst) {
9887
+
9888
+ const struct ggml_tensor * src0 = dst->src[0];
9889
+
9585
9890
  GGML_ASSERT(ggml_are_same_shape(src0, dst));
9586
9891
 
9587
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
9892
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
9588
9893
  return;
9589
9894
  }
9590
9895
 
@@ -9633,12 +9938,14 @@ static void ggml_compute_forward_norm_f32(
9633
9938
 
9634
9939
  static void ggml_compute_forward_norm(
9635
9940
  const struct ggml_compute_params * params,
9636
- const struct ggml_tensor * src0,
9637
9941
  struct ggml_tensor * dst) {
9942
+
9943
+ const struct ggml_tensor * src0 = dst->src[0];
9944
+
9638
9945
  switch (src0->type) {
9639
9946
  case GGML_TYPE_F32:
9640
9947
  {
9641
- ggml_compute_forward_norm_f32(params, src0, dst);
9948
+ ggml_compute_forward_norm_f32(params, dst);
9642
9949
  } break;
9643
9950
  default:
9644
9951
  {
@@ -9651,11 +9958,13 @@ static void ggml_compute_forward_norm(
9651
9958
 
9652
9959
  static void ggml_compute_forward_rms_norm_f32(
9653
9960
  const struct ggml_compute_params * params,
9654
- const struct ggml_tensor * src0,
9655
9961
  struct ggml_tensor * dst) {
9962
+
9963
+ const struct ggml_tensor * src0 = dst->src[0];
9964
+
9656
9965
  GGML_ASSERT(ggml_are_same_shape(src0, dst));
9657
9966
 
9658
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
9967
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
9659
9968
  return;
9660
9969
  }
9661
9970
 
@@ -9701,12 +10010,14 @@ static void ggml_compute_forward_rms_norm_f32(
9701
10010
 
9702
10011
  static void ggml_compute_forward_rms_norm(
9703
10012
  const struct ggml_compute_params * params,
9704
- const struct ggml_tensor * src0,
9705
10013
  struct ggml_tensor * dst) {
10014
+
10015
+ const struct ggml_tensor * src0 = dst->src[0];
10016
+
9706
10017
  switch (src0->type) {
9707
10018
  case GGML_TYPE_F32:
9708
10019
  {
9709
- ggml_compute_forward_rms_norm_f32(params, src0, dst);
10020
+ ggml_compute_forward_rms_norm_f32(params, dst);
9710
10021
  } break;
9711
10022
  default:
9712
10023
  {
@@ -9717,12 +10028,14 @@ static void ggml_compute_forward_rms_norm(
9717
10028
 
9718
10029
  static void ggml_compute_forward_rms_norm_back_f32(
9719
10030
  const struct ggml_compute_params * params,
9720
- const struct ggml_tensor * src0,
9721
- const struct ggml_tensor * src1,
9722
10031
  struct ggml_tensor * dst) {
10032
+
10033
+ const struct ggml_tensor * src0 = dst->src[0];
10034
+ const struct ggml_tensor * src1 = dst->src[1];
10035
+
9723
10036
  GGML_ASSERT(ggml_are_same_shape(src0, dst) && ggml_are_same_shape(src0, src1));
9724
10037
 
9725
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
10038
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
9726
10039
  return;
9727
10040
  }
9728
10041
 
@@ -9874,13 +10187,14 @@ static void ggml_compute_forward_rms_norm_back_f32(
9874
10187
 
9875
10188
  static void ggml_compute_forward_rms_norm_back(
9876
10189
  const struct ggml_compute_params * params,
9877
- const struct ggml_tensor * src0,
9878
- const struct ggml_tensor * src1,
9879
10190
  struct ggml_tensor * dst) {
10191
+
10192
+ const struct ggml_tensor * src0 = dst->src[0];
10193
+
9880
10194
  switch (src0->type) {
9881
10195
  case GGML_TYPE_F32:
9882
10196
  {
9883
- ggml_compute_forward_rms_norm_back_f32(params, src0, src1, dst);
10197
+ ggml_compute_forward_rms_norm_back_f32(params, dst);
9884
10198
  } break;
9885
10199
  default:
9886
10200
  {
@@ -9893,11 +10207,13 @@ static void ggml_compute_forward_rms_norm_back(
9893
10207
 
9894
10208
  static void ggml_compute_forward_group_norm_f32(
9895
10209
  const struct ggml_compute_params * params,
9896
- const struct ggml_tensor * src0,
9897
10210
  struct ggml_tensor * dst) {
10211
+
10212
+ const struct ggml_tensor * src0 = dst->src[0];
10213
+
9898
10214
  GGML_ASSERT(ggml_are_same_shape(src0, dst));
9899
10215
 
9900
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
10216
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
9901
10217
  return;
9902
10218
  }
9903
10219
 
@@ -9965,12 +10281,14 @@ static void ggml_compute_forward_group_norm_f32(
9965
10281
 
9966
10282
  static void ggml_compute_forward_group_norm(
9967
10283
  const struct ggml_compute_params * params,
9968
- const struct ggml_tensor * src0,
9969
10284
  struct ggml_tensor * dst) {
10285
+
10286
+ const struct ggml_tensor * src0 = dst->src[0];
10287
+
9970
10288
  switch (src0->type) {
9971
10289
  case GGML_TYPE_F32:
9972
10290
  {
9973
- ggml_compute_forward_group_norm_f32(params, src0, dst);
10291
+ ggml_compute_forward_group_norm_f32(params, dst);
9974
10292
  } break;
9975
10293
  default:
9976
10294
  {
@@ -10016,9 +10334,11 @@ static bool ggml_compute_forward_mul_mat_use_blas(struct ggml_tensor * dst) {
10016
10334
 
10017
10335
  static void ggml_compute_forward_mul_mat(
10018
10336
  const struct ggml_compute_params * params,
10019
- const struct ggml_tensor * src0,
10020
- const struct ggml_tensor * src1,
10021
10337
  struct ggml_tensor * dst) {
10338
+
10339
+ const struct ggml_tensor * src0 = dst->src[0];
10340
+ const struct ggml_tensor * src1 = dst->src[1];
10341
+
10022
10342
  int64_t t0 = ggml_perf_time_us();
10023
10343
  UNUSED(t0);
10024
10344
 
@@ -10060,7 +10380,7 @@ static void ggml_compute_forward_mul_mat(
10060
10380
 
10061
10381
  #if defined(GGML_USE_CLBLAST)
10062
10382
  if (ggml_cl_can_mul_mat(src0, src1, dst)) {
10063
- if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) {
10383
+ if (params->ith == 0 && params->type == GGML_TASK_TYPE_COMPUTE) {
10064
10384
  ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize);
10065
10385
  }
10066
10386
  return;
@@ -10073,7 +10393,7 @@ static void ggml_compute_forward_mul_mat(
10073
10393
  const size_t desired_wsize = ne13*ne12*ne_plane*sizeof(float);
10074
10394
  UNUSED(desired_wsize);
10075
10395
 
10076
- if (params->type == GGML_TASK_INIT) {
10396
+ if (params->type == GGML_TASK_TYPE_INIT) {
10077
10397
  if (type != GGML_TYPE_F32) {
10078
10398
  assert(params->wsize >= desired_wsize);
10079
10399
  // parallelize by src0 rows
@@ -10096,7 +10416,7 @@ static void ggml_compute_forward_mul_mat(
10096
10416
  return;
10097
10417
  }
10098
10418
 
10099
- if (params->type == GGML_TASK_FINALIZE) {
10419
+ if (params->type == GGML_TASK_TYPE_FINALIZE) {
10100
10420
  return;
10101
10421
  }
10102
10422
 
@@ -10134,7 +10454,7 @@ static void ggml_compute_forward_mul_mat(
10134
10454
  }
10135
10455
  #endif
10136
10456
 
10137
- if (params->type == GGML_TASK_INIT) {
10457
+ if (params->type == GGML_TASK_TYPE_INIT) {
10138
10458
  if (ith != 0) {
10139
10459
  return;
10140
10460
  }
@@ -10158,7 +10478,7 @@ static void ggml_compute_forward_mul_mat(
10158
10478
  return;
10159
10479
  }
10160
10480
 
10161
- if (params->type == GGML_TASK_FINALIZE) {
10481
+ if (params->type == GGML_TASK_TYPE_FINALIZE) {
10162
10482
  return;
10163
10483
  }
10164
10484
 
@@ -10263,10 +10583,11 @@ static void ggml_compute_forward_mul_mat(
10263
10583
 
10264
10584
  static void ggml_compute_forward_mul_mat_id(
10265
10585
  const struct ggml_compute_params * params,
10266
- const struct ggml_tensor * ids,
10267
- const struct ggml_tensor * src1,
10268
10586
  struct ggml_tensor * dst) {
10269
10587
 
10588
+ const struct ggml_tensor * ids = dst->src[0];
10589
+ const struct ggml_tensor * src1 = dst->src[1];
10590
+
10270
10591
  const struct ggml_tensor * src0 = dst->src[2]; // only for GGML_TENSOR_BINARY_OP_LOCALS
10271
10592
 
10272
10593
  GGML_TENSOR_BINARY_OP_LOCALS
@@ -10314,7 +10635,7 @@ static void ggml_compute_forward_mul_mat_id(
10314
10635
 
10315
10636
  #define MMID_MATRIX_ROW(row_id, i1) matrix_rows[(row_id)*ne11 + (i1)]
10316
10637
 
10317
- if (params->type == GGML_TASK_INIT) {
10638
+ if (params->type == GGML_TASK_TYPE_INIT) {
10318
10639
  if (ith != 0) {
10319
10640
  return;
10320
10641
  }
@@ -10351,7 +10672,7 @@ static void ggml_compute_forward_mul_mat_id(
10351
10672
  return;
10352
10673
  }
10353
10674
 
10354
- if (params->type == GGML_TASK_FINALIZE) {
10675
+ if (params->type == GGML_TASK_TYPE_FINALIZE) {
10355
10676
  return;
10356
10677
  }
10357
10678
 
@@ -10457,9 +10778,11 @@ static void ggml_compute_forward_mul_mat_id(
10457
10778
 
10458
10779
  static void ggml_compute_forward_out_prod_f32(
10459
10780
  const struct ggml_compute_params * params,
10460
- const struct ggml_tensor * src0,
10461
- const struct ggml_tensor * src1,
10462
10781
  struct ggml_tensor * dst) {
10782
+
10783
+ const struct ggml_tensor * src0 = dst->src[0];
10784
+ const struct ggml_tensor * src1 = dst->src[1];
10785
+
10463
10786
  // int64_t t0 = ggml_perf_time_us();
10464
10787
  // UNUSED(t0);
10465
10788
 
@@ -10497,7 +10820,7 @@ static void ggml_compute_forward_out_prod_f32(
10497
10820
  (ggml_is_contiguous(src1) || ggml_is_transposed(src1));
10498
10821
  #endif
10499
10822
 
10500
- if (params->type == GGML_TASK_INIT) {
10823
+ if (params->type == GGML_TASK_TYPE_INIT) {
10501
10824
  #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) // gemm beta will zero dst
10502
10825
  if (use_blas) {
10503
10826
  return;
@@ -10510,7 +10833,7 @@ static void ggml_compute_forward_out_prod_f32(
10510
10833
  return;
10511
10834
  }
10512
10835
 
10513
- if (params->type == GGML_TASK_FINALIZE) {
10836
+ if (params->type == GGML_TASK_TYPE_FINALIZE) {
10514
10837
  return;
10515
10838
  }
10516
10839
 
@@ -10649,9 +10972,11 @@ static void ggml_compute_forward_out_prod_f32(
10649
10972
 
10650
10973
  static void ggml_compute_forward_out_prod_q_f32(
10651
10974
  const struct ggml_compute_params * params,
10652
- const struct ggml_tensor * src0,
10653
- const struct ggml_tensor * src1,
10654
10975
  struct ggml_tensor * dst) {
10976
+
10977
+ const struct ggml_tensor * src0 = dst->src[0];
10978
+ const struct ggml_tensor * src1 = dst->src[1];
10979
+
10655
10980
  // int64_t t0 = ggml_perf_time_us();
10656
10981
  // UNUSED(t0);
10657
10982
 
@@ -10688,7 +11013,7 @@ static void ggml_compute_forward_out_prod_q_f32(
10688
11013
  // TODO: #if defined(GGML_USE_CUBLAS) ggml_cuda_out_prod
10689
11014
  // TODO: #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
10690
11015
 
10691
- if (params->type == GGML_TASK_INIT) {
11016
+ if (params->type == GGML_TASK_TYPE_INIT) {
10692
11017
  if (ith != 0) {
10693
11018
  return;
10694
11019
  }
@@ -10696,7 +11021,7 @@ static void ggml_compute_forward_out_prod_q_f32(
10696
11021
  return;
10697
11022
  }
10698
11023
 
10699
- if (params->type == GGML_TASK_FINALIZE) {
11024
+ if (params->type == GGML_TASK_TYPE_FINALIZE) {
10700
11025
  return;
10701
11026
  }
10702
11027
 
@@ -10762,9 +11087,10 @@ static void ggml_compute_forward_out_prod_q_f32(
10762
11087
 
10763
11088
  static void ggml_compute_forward_out_prod(
10764
11089
  const struct ggml_compute_params * params,
10765
- const struct ggml_tensor * src0,
10766
- const struct ggml_tensor * src1,
10767
11090
  struct ggml_tensor * dst) {
11091
+
11092
+ const struct ggml_tensor * src0 = dst->src[0];
11093
+
10768
11094
  switch (src0->type) {
10769
11095
  case GGML_TYPE_Q4_0:
10770
11096
  case GGML_TYPE_Q4_1:
@@ -10779,17 +11105,22 @@ static void ggml_compute_forward_out_prod(
10779
11105
  case GGML_TYPE_IQ2_XXS:
10780
11106
  case GGML_TYPE_IQ2_XS:
10781
11107
  case GGML_TYPE_IQ3_XXS:
11108
+ case GGML_TYPE_IQ1_S:
11109
+ case GGML_TYPE_IQ4_NL:
11110
+ case GGML_TYPE_IQ4_XS:
11111
+ case GGML_TYPE_IQ3_S:
11112
+ case GGML_TYPE_IQ2_S:
10782
11113
  {
10783
- ggml_compute_forward_out_prod_q_f32(params, src0, src1, dst);
11114
+ ggml_compute_forward_out_prod_q_f32(params, dst);
10784
11115
  } break;
10785
11116
  case GGML_TYPE_F16:
10786
11117
  {
10787
11118
  GGML_ASSERT(false); // todo
10788
- // ggml_compute_forward_out_prod_f16_f32(params, src0, src1, dst);
11119
+ // ggml_compute_forward_out_prod_f16_f32(params, dst);
10789
11120
  } break;
10790
11121
  case GGML_TYPE_F32:
10791
11122
  {
10792
- ggml_compute_forward_out_prod_f32(params, src0, src1, dst);
11123
+ ggml_compute_forward_out_prod_f32(params, dst);
10793
11124
  } break;
10794
11125
  default:
10795
11126
  {
@@ -10802,13 +11133,15 @@ static void ggml_compute_forward_out_prod(
10802
11133
 
10803
11134
  static void ggml_compute_forward_scale_f32(
10804
11135
  const struct ggml_compute_params * params,
10805
- const struct ggml_tensor * src0,
10806
11136
  struct ggml_tensor * dst) {
11137
+
11138
+ const struct ggml_tensor * src0 = dst->src[0];
11139
+
10807
11140
  GGML_ASSERT(ggml_is_contiguous(src0));
10808
11141
  GGML_ASSERT(ggml_is_contiguous(dst));
10809
11142
  GGML_ASSERT(ggml_are_same_shape(src0, dst));
10810
11143
 
10811
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
11144
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
10812
11145
  return;
10813
11146
  }
10814
11147
 
@@ -10844,12 +11177,14 @@ static void ggml_compute_forward_scale_f32(
10844
11177
 
10845
11178
  static void ggml_compute_forward_scale(
10846
11179
  const struct ggml_compute_params * params,
10847
- const struct ggml_tensor * src0,
10848
11180
  struct ggml_tensor * dst) {
11181
+
11182
+ const struct ggml_tensor * src0 = dst->src[0];
11183
+
10849
11184
  switch (src0->type) {
10850
11185
  case GGML_TYPE_F32:
10851
11186
  {
10852
- ggml_compute_forward_scale_f32(params, src0, dst);
11187
+ ggml_compute_forward_scale_f32(params, dst);
10853
11188
  } break;
10854
11189
  default:
10855
11190
  {
@@ -10862,9 +11197,11 @@ static void ggml_compute_forward_scale(
10862
11197
 
10863
11198
  static void ggml_compute_forward_set_f32(
10864
11199
  const struct ggml_compute_params * params,
10865
- const struct ggml_tensor * src0,
10866
- const struct ggml_tensor * src1,
10867
11200
  struct ggml_tensor * dst) {
11201
+
11202
+ const struct ggml_tensor * src0 = dst->src[0];
11203
+ const struct ggml_tensor * src1 = dst->src[1];
11204
+
10868
11205
  GGML_ASSERT(ggml_are_same_shape(src0, dst));
10869
11206
  GGML_ASSERT(ggml_is_contiguous(dst) && ggml_is_contiguous(src0));
10870
11207
 
@@ -10876,7 +11213,7 @@ static void ggml_compute_forward_set_f32(
10876
11213
  size_t offset = ((int32_t *) dst->op_params)[3];
10877
11214
  bool inplace = (bool) ((int32_t *) dst->op_params)[4];
10878
11215
 
10879
- if (!inplace && (params->type == GGML_TASK_INIT)) {
11216
+ if (!inplace && (params->type == GGML_TASK_TYPE_INIT)) {
10880
11217
  if (params->ith != 0) {
10881
11218
  return;
10882
11219
  }
@@ -10888,7 +11225,7 @@ static void ggml_compute_forward_set_f32(
10888
11225
  ggml_nbytes(dst));
10889
11226
  }
10890
11227
 
10891
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
11228
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
10892
11229
  return;
10893
11230
  }
10894
11231
 
@@ -10935,14 +11272,14 @@ static void ggml_compute_forward_set_f32(
10935
11272
 
10936
11273
  static void ggml_compute_forward_set(
10937
11274
  const struct ggml_compute_params * params,
10938
- const struct ggml_tensor * src0,
10939
- const struct ggml_tensor * src1,
10940
11275
  struct ggml_tensor * dst) {
10941
11276
 
11277
+ const struct ggml_tensor * src0 = dst->src[0];
11278
+
10942
11279
  switch (src0->type) {
10943
11280
  case GGML_TYPE_F32:
10944
11281
  {
10945
- ggml_compute_forward_set_f32(params, src0, src1, dst);
11282
+ ggml_compute_forward_set_f32(params, dst);
10946
11283
  } break;
10947
11284
  case GGML_TYPE_F16:
10948
11285
  case GGML_TYPE_Q4_0:
@@ -10959,6 +11296,11 @@ static void ggml_compute_forward_set(
10959
11296
  case GGML_TYPE_IQ2_XXS:
10960
11297
  case GGML_TYPE_IQ2_XS:
10961
11298
  case GGML_TYPE_IQ3_XXS:
11299
+ case GGML_TYPE_IQ1_S:
11300
+ case GGML_TYPE_IQ4_NL:
11301
+ case GGML_TYPE_IQ4_XS:
11302
+ case GGML_TYPE_IQ3_S:
11303
+ case GGML_TYPE_IQ2_S:
10962
11304
  default:
10963
11305
  {
10964
11306
  GGML_ASSERT(false);
@@ -10970,29 +11312,25 @@ static void ggml_compute_forward_set(
10970
11312
 
10971
11313
  static void ggml_compute_forward_cpy(
10972
11314
  const struct ggml_compute_params * params,
10973
- const struct ggml_tensor * src0,
10974
11315
  struct ggml_tensor * dst) {
10975
- ggml_compute_forward_dup(params, src0, dst);
11316
+ ggml_compute_forward_dup(params, dst);
10976
11317
  }
10977
11318
 
10978
11319
  // ggml_compute_forward_cont
10979
11320
 
10980
11321
  static void ggml_compute_forward_cont(
10981
11322
  const struct ggml_compute_params * params,
10982
- const struct ggml_tensor * src0,
10983
11323
  struct ggml_tensor * dst) {
10984
- ggml_compute_forward_dup(params, src0, dst);
11324
+ ggml_compute_forward_dup(params, dst);
10985
11325
  }
10986
11326
 
10987
11327
  // ggml_compute_forward_reshape
10988
11328
 
10989
11329
  static void ggml_compute_forward_reshape(
10990
11330
  const struct ggml_compute_params * params,
10991
- const struct ggml_tensor * src0,
10992
11331
  struct ggml_tensor * dst) {
10993
11332
  // NOP
10994
11333
  UNUSED(params);
10995
- UNUSED(src0);
10996
11334
  UNUSED(dst);
10997
11335
  }
10998
11336
 
@@ -11000,42 +11338,44 @@ static void ggml_compute_forward_reshape(
11000
11338
 
11001
11339
  static void ggml_compute_forward_view(
11002
11340
  const struct ggml_compute_params * params,
11003
- const struct ggml_tensor * src0) {
11341
+ const struct ggml_tensor * dst) {
11004
11342
  // NOP
11005
11343
  UNUSED(params);
11006
- UNUSED(src0);
11344
+ UNUSED(dst);
11007
11345
  }
11008
11346
 
11009
11347
  // ggml_compute_forward_permute
11010
11348
 
11011
11349
  static void ggml_compute_forward_permute(
11012
11350
  const struct ggml_compute_params * params,
11013
- const struct ggml_tensor * src0) {
11351
+ const struct ggml_tensor * dst) {
11014
11352
  // NOP
11015
11353
  UNUSED(params);
11016
- UNUSED(src0);
11354
+ UNUSED(dst);
11017
11355
  }
11018
11356
 
11019
11357
  // ggml_compute_forward_transpose
11020
11358
 
11021
11359
  static void ggml_compute_forward_transpose(
11022
11360
  const struct ggml_compute_params * params,
11023
- const struct ggml_tensor * src0) {
11361
+ const struct ggml_tensor * dst) {
11024
11362
  // NOP
11025
11363
  UNUSED(params);
11026
- UNUSED(src0);
11364
+ UNUSED(dst);
11027
11365
  }
11028
11366
 
11029
11367
  // ggml_compute_forward_get_rows
11030
11368
 
11031
11369
  static void ggml_compute_forward_get_rows_q(
11032
11370
  const struct ggml_compute_params * params,
11033
- const struct ggml_tensor * src0,
11034
- const struct ggml_tensor * src1,
11035
11371
  struct ggml_tensor * dst) {
11372
+
11373
+ const struct ggml_tensor * src0 = dst->src[0];
11374
+ const struct ggml_tensor * src1 = dst->src[1];
11375
+
11036
11376
  assert(params->ith == 0);
11037
11377
 
11038
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
11378
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
11039
11379
  return;
11040
11380
  }
11041
11381
 
@@ -11068,12 +11408,14 @@ static void ggml_compute_forward_get_rows_q(
11068
11408
 
11069
11409
  static void ggml_compute_forward_get_rows_f16(
11070
11410
  const struct ggml_compute_params * params,
11071
- const struct ggml_tensor * src0,
11072
- const struct ggml_tensor * src1,
11073
11411
  struct ggml_tensor * dst) {
11412
+
11413
+ const struct ggml_tensor * src0 = dst->src[0];
11414
+ const struct ggml_tensor * src1 = dst->src[1];
11415
+
11074
11416
  assert(params->ith == 0);
11075
11417
 
11076
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
11418
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
11077
11419
  return;
11078
11420
  }
11079
11421
 
@@ -11103,12 +11445,14 @@ static void ggml_compute_forward_get_rows_f16(
11103
11445
 
11104
11446
  static void ggml_compute_forward_get_rows_f32(
11105
11447
  const struct ggml_compute_params * params,
11106
- const struct ggml_tensor * src0,
11107
- const struct ggml_tensor * src1,
11108
11448
  struct ggml_tensor * dst) {
11449
+
11450
+ const struct ggml_tensor * src0 = dst->src[0];
11451
+ const struct ggml_tensor * src1 = dst->src[1];
11452
+
11109
11453
  assert(params->ith == 0);
11110
11454
 
11111
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
11455
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
11112
11456
  return;
11113
11457
  }
11114
11458
 
@@ -11138,9 +11482,10 @@ static void ggml_compute_forward_get_rows_f32(
11138
11482
 
11139
11483
  static void ggml_compute_forward_get_rows(
11140
11484
  const struct ggml_compute_params * params,
11141
- const struct ggml_tensor * src0,
11142
- const struct ggml_tensor * src1,
11143
11485
  struct ggml_tensor * dst) {
11486
+
11487
+ const struct ggml_tensor * src0 = dst->src[0];
11488
+
11144
11489
  switch (src0->type) {
11145
11490
  case GGML_TYPE_Q4_0:
11146
11491
  case GGML_TYPE_Q4_1:
@@ -11156,17 +11501,22 @@ static void ggml_compute_forward_get_rows(
11156
11501
  case GGML_TYPE_IQ2_XXS:
11157
11502
  case GGML_TYPE_IQ2_XS:
11158
11503
  case GGML_TYPE_IQ3_XXS:
11504
+ case GGML_TYPE_IQ1_S:
11505
+ case GGML_TYPE_IQ4_NL:
11506
+ case GGML_TYPE_IQ4_XS:
11507
+ case GGML_TYPE_IQ3_S:
11508
+ case GGML_TYPE_IQ2_S:
11159
11509
  {
11160
- ggml_compute_forward_get_rows_q(params, src0, src1, dst);
11510
+ ggml_compute_forward_get_rows_q(params, dst);
11161
11511
  } break;
11162
11512
  case GGML_TYPE_F16:
11163
11513
  {
11164
- ggml_compute_forward_get_rows_f16(params, src0, src1, dst);
11514
+ ggml_compute_forward_get_rows_f16(params, dst);
11165
11515
  } break;
11166
11516
  case GGML_TYPE_F32:
11167
11517
  case GGML_TYPE_I32:
11168
11518
  {
11169
- ggml_compute_forward_get_rows_f32(params, src0, src1, dst);
11519
+ ggml_compute_forward_get_rows_f32(params, dst);
11170
11520
  } break;
11171
11521
  default:
11172
11522
  {
@@ -11197,22 +11547,24 @@ static void ggml_compute_forward_get_rows(
11197
11547
 
11198
11548
  static void ggml_compute_forward_get_rows_back_f32_f16(
11199
11549
  const struct ggml_compute_params * params,
11200
- const struct ggml_tensor * src0,
11201
- const struct ggml_tensor * src1,
11202
11550
  struct ggml_tensor * dst) {
11551
+
11552
+ const struct ggml_tensor * src0 = dst->src[0];
11553
+ const struct ggml_tensor * src1 = dst->src[1];
11554
+
11203
11555
  GGML_ASSERT(params->ith == 0);
11204
11556
  GGML_ASSERT(ggml_is_contiguous(dst));
11205
11557
 
11206
11558
  // ggml_compute_forward_dup_same_cont(params, opt0, dst);
11207
11559
 
11208
- if (params->type == GGML_TASK_INIT) {
11560
+ if (params->type == GGML_TASK_TYPE_INIT) {
11209
11561
  if (params->ith != 0) {
11210
11562
  return;
11211
11563
  }
11212
11564
  memset(dst->data, 0, ggml_nbytes(dst));
11213
11565
  }
11214
11566
 
11215
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
11567
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
11216
11568
  return;
11217
11569
  }
11218
11570
 
@@ -11234,22 +11586,24 @@ static void ggml_compute_forward_get_rows_back_f32_f16(
11234
11586
 
11235
11587
  static void ggml_compute_forward_get_rows_back_f32(
11236
11588
  const struct ggml_compute_params * params,
11237
- const struct ggml_tensor * src0,
11238
- const struct ggml_tensor * src1,
11239
11589
  struct ggml_tensor * dst) {
11590
+
11591
+ const struct ggml_tensor * src0 = dst->src[0];
11592
+ const struct ggml_tensor * src1 = dst->src[1];
11593
+
11240
11594
  GGML_ASSERT(params->ith == 0);
11241
11595
  GGML_ASSERT(ggml_is_contiguous(dst));
11242
11596
 
11243
11597
  // ggml_compute_forward_dup_same_cont(params, opt0, dst);
11244
11598
 
11245
- if (params->type == GGML_TASK_INIT) {
11599
+ if (params->type == GGML_TASK_TYPE_INIT) {
11246
11600
  if (params->ith != 0) {
11247
11601
  return;
11248
11602
  }
11249
11603
  memset(dst->data, 0, ggml_nbytes(dst));
11250
11604
  }
11251
11605
 
11252
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
11606
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
11253
11607
  return;
11254
11608
  }
11255
11609
 
@@ -11271,17 +11625,18 @@ static void ggml_compute_forward_get_rows_back_f32(
11271
11625
 
11272
11626
  static void ggml_compute_forward_get_rows_back(
11273
11627
  const struct ggml_compute_params * params,
11274
- const struct ggml_tensor * src0,
11275
- const struct ggml_tensor * src1,
11276
11628
  struct ggml_tensor * dst) {
11629
+
11630
+ const struct ggml_tensor * src0 = dst->src[0];
11631
+
11277
11632
  switch (src0->type) {
11278
11633
  case GGML_TYPE_F16:
11279
11634
  {
11280
- ggml_compute_forward_get_rows_back_f32_f16(params, src0, src1, dst);
11635
+ ggml_compute_forward_get_rows_back_f32_f16(params, dst);
11281
11636
  } break;
11282
11637
  case GGML_TYPE_F32:
11283
11638
  {
11284
- ggml_compute_forward_get_rows_back_f32(params, src0, src1, dst);
11639
+ ggml_compute_forward_get_rows_back_f32(params, dst);
11285
11640
  } break;
11286
11641
  default:
11287
11642
  {
@@ -11312,11 +11667,13 @@ static void ggml_compute_forward_get_rows_back(
11312
11667
 
11313
11668
  static void ggml_compute_forward_diag_f32(
11314
11669
  const struct ggml_compute_params * params,
11315
- const struct ggml_tensor * src0,
11316
11670
  struct ggml_tensor * dst) {
11671
+
11672
+ const struct ggml_tensor * src0 = dst->src[0];
11673
+
11317
11674
  GGML_ASSERT(params->ith == 0);
11318
11675
 
11319
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
11676
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
11320
11677
  return;
11321
11678
  }
11322
11679
 
@@ -11352,12 +11709,14 @@ static void ggml_compute_forward_diag_f32(
11352
11709
 
11353
11710
  static void ggml_compute_forward_diag(
11354
11711
  const struct ggml_compute_params * params,
11355
- const struct ggml_tensor * src0,
11356
11712
  struct ggml_tensor * dst) {
11713
+
11714
+ const struct ggml_tensor * src0 = dst->src[0];
11715
+
11357
11716
  switch (src0->type) {
11358
11717
  case GGML_TYPE_F32:
11359
11718
  {
11360
- ggml_compute_forward_diag_f32(params, src0, dst);
11719
+ ggml_compute_forward_diag_f32(params, dst);
11361
11720
  } break;
11362
11721
  default:
11363
11722
  {
@@ -11370,10 +11729,11 @@ static void ggml_compute_forward_diag(
11370
11729
 
11371
11730
  static void ggml_compute_forward_diag_mask_f32(
11372
11731
  const struct ggml_compute_params * params,
11373
- const struct ggml_tensor * src0,
11374
11732
  struct ggml_tensor * dst,
11375
11733
  const float value) {
11376
11734
 
11735
+ const struct ggml_tensor * src0 = dst->src[0];
11736
+
11377
11737
  const int ith = params->ith;
11378
11738
  const int nth = params->nth;
11379
11739
 
@@ -11382,7 +11742,7 @@ static void ggml_compute_forward_diag_mask_f32(
11382
11742
 
11383
11743
  GGML_ASSERT(n_past >= 0);
11384
11744
 
11385
- if (!inplace && (params->type == GGML_TASK_INIT)) {
11745
+ if (!inplace && (params->type == GGML_TASK_TYPE_INIT)) {
11386
11746
  if (ith != 0) {
11387
11747
  return;
11388
11748
  }
@@ -11396,7 +11756,7 @@ static void ggml_compute_forward_diag_mask_f32(
11396
11756
  ggml_nbytes(dst));
11397
11757
  }
11398
11758
 
11399
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
11759
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
11400
11760
  return;
11401
11761
  }
11402
11762
 
@@ -11423,12 +11783,14 @@ static void ggml_compute_forward_diag_mask_f32(
11423
11783
 
11424
11784
  static void ggml_compute_forward_diag_mask_inf(
11425
11785
  const struct ggml_compute_params * params,
11426
- const struct ggml_tensor * src0,
11427
11786
  struct ggml_tensor * dst) {
11787
+
11788
+ const struct ggml_tensor * src0 = dst->src[0];
11789
+
11428
11790
  switch (src0->type) {
11429
11791
  case GGML_TYPE_F32:
11430
11792
  {
11431
- ggml_compute_forward_diag_mask_f32(params, src0, dst, -INFINITY);
11793
+ ggml_compute_forward_diag_mask_f32(params, dst, -INFINITY);
11432
11794
  } break;
11433
11795
  default:
11434
11796
  {
@@ -11439,12 +11801,14 @@ static void ggml_compute_forward_diag_mask_inf(
11439
11801
 
11440
11802
  static void ggml_compute_forward_diag_mask_zero(
11441
11803
  const struct ggml_compute_params * params,
11442
- const struct ggml_tensor * src0,
11443
11804
  struct ggml_tensor * dst) {
11805
+
11806
+ const struct ggml_tensor * src0 = dst->src[0];
11807
+
11444
11808
  switch (src0->type) {
11445
11809
  case GGML_TYPE_F32:
11446
11810
  {
11447
- ggml_compute_forward_diag_mask_f32(params, src0, dst, 0);
11811
+ ggml_compute_forward_diag_mask_f32(params, dst, 0);
11448
11812
  } break;
11449
11813
  default:
11450
11814
  {
@@ -11457,26 +11821,42 @@ static void ggml_compute_forward_diag_mask_zero(
11457
11821
 
11458
11822
  static void ggml_compute_forward_soft_max_f32(
11459
11823
  const struct ggml_compute_params * params,
11460
- const struct ggml_tensor * src0,
11461
- const struct ggml_tensor * src1,
11462
11824
  struct ggml_tensor * dst) {
11825
+
11826
+ const struct ggml_tensor * src0 = dst->src[0];
11827
+ const struct ggml_tensor * src1 = dst->src[1];
11828
+ const struct ggml_tensor * src2 = dst->src[2];
11829
+
11463
11830
  assert(ggml_is_contiguous(dst));
11464
11831
  assert(ggml_are_same_shape(src0, dst));
11465
11832
 
11466
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
11833
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
11467
11834
  return;
11468
11835
  }
11469
11836
 
11470
- float scale = 1.0f;
11471
- memcpy(&scale, (float *) dst->op_params + 0, sizeof(float));
11837
+ float scale = 1.0f;
11838
+ float max_bias = 0.0f;
11839
+
11840
+ memcpy(&scale, (float *) dst->op_params + 0, sizeof(float));
11841
+ memcpy(&max_bias, (float *) dst->op_params + 1, sizeof(float));
11472
11842
 
11473
11843
  // TODO: handle transposed/permuted matrices
11474
11844
 
11475
11845
  const int ith = params->ith;
11476
11846
  const int nth = params->nth;
11477
11847
 
11848
+ GGML_TENSOR_UNARY_OP_LOCALS
11849
+
11478
11850
  const int64_t ne11 = src1 ? src1->ne[1] : 1;
11479
11851
 
11852
+ // TODO: is this supposed to be ceil instead of floor?
11853
+ // https://huggingface.co/mosaicml/mpt-7b/blob/main/attention.py#L370
11854
+ const uint32_t n_head_kv = ne02;
11855
+ const uint32_t n_head_log2 = 1u << (uint32_t) floor(log2(n_head_kv));
11856
+
11857
+ const float m0 = powf(2.0f, -(max_bias ) / n_head_log2);
11858
+ const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
11859
+
11480
11860
  const int nc = src0->ne[0];
11481
11861
  const int nr = ggml_nrows(src0);
11482
11862
 
@@ -11489,6 +11869,9 @@ static void ggml_compute_forward_soft_max_f32(
11489
11869
 
11490
11870
  float * wp = (float *) params->wdata + (nc + CACHE_LINE_SIZE_F32) * ith;
11491
11871
 
11872
+ // when max_bias <= 0.0f, src2 is not used and we default it to src0 to avoid branching
11873
+ float * pos = src2 ? (float *) src2->data : src0->data;
11874
+
11492
11875
  for (int i1 = ir0; i1 < ir1; i1++) {
11493
11876
  float * sp = (float *)((char *) src0->data + i1*src0->nb[1]);
11494
11877
  float * dp = (float *)((char *) dst->data + i1*dst->nb[1]);
@@ -11502,6 +11885,16 @@ static void ggml_compute_forward_soft_max_f32(
11502
11885
  ggml_vec_acc_f32(nc, wp, mp);
11503
11886
  }
11504
11887
 
11888
+ // ALiBi bias
11889
+ if (max_bias > 0.0f) {
11890
+ const uint32_t h = (i1/ne01)%ne02; // head
11891
+ const float slope = h < n_head_log2 ? powf(m0, h + 1) : powf(m1, 2*(h - n_head_log2) + 1);
11892
+
11893
+ for (int i = 0; i < nc; i++) {
11894
+ wp[i] = wp[i] + slope*pos[i];
11895
+ }
11896
+ }
11897
+
11505
11898
  #ifndef NDEBUG
11506
11899
  for (int i = 0; i < nc; ++i) {
11507
11900
  //printf("p[%d] = %f\n", i, p[i]);
@@ -11544,13 +11937,14 @@ static void ggml_compute_forward_soft_max_f32(
11544
11937
 
11545
11938
  static void ggml_compute_forward_soft_max(
11546
11939
  const struct ggml_compute_params * params,
11547
- const struct ggml_tensor * src0,
11548
- const struct ggml_tensor * src1,
11549
11940
  struct ggml_tensor * dst) {
11941
+
11942
+ const struct ggml_tensor * src0 = dst->src[0];
11943
+
11550
11944
  switch (src0->type) {
11551
11945
  case GGML_TYPE_F32:
11552
11946
  {
11553
- ggml_compute_forward_soft_max_f32(params, src0, src1, dst);
11947
+ ggml_compute_forward_soft_max_f32(params, dst);
11554
11948
  } break;
11555
11949
  default:
11556
11950
  {
@@ -11563,16 +11957,18 @@ static void ggml_compute_forward_soft_max(
11563
11957
 
11564
11958
  static void ggml_compute_forward_soft_max_back_f32(
11565
11959
  const struct ggml_compute_params * params,
11566
- const struct ggml_tensor * src0,
11567
- const struct ggml_tensor * src1,
11568
11960
  struct ggml_tensor * dst) {
11961
+
11962
+ const struct ggml_tensor * src0 = dst->src[0];
11963
+ const struct ggml_tensor * src1 = dst->src[1];
11964
+
11569
11965
  GGML_ASSERT(ggml_is_contiguous(src0));
11570
11966
  GGML_ASSERT(ggml_is_contiguous(src1));
11571
11967
  GGML_ASSERT(ggml_is_contiguous(dst));
11572
11968
  GGML_ASSERT(ggml_are_same_shape(src0, dst));
11573
11969
  GGML_ASSERT(ggml_are_same_shape(src1, dst));
11574
11970
 
11575
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
11971
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
11576
11972
  return;
11577
11973
  }
11578
11974
 
@@ -11640,13 +12036,14 @@ static void ggml_compute_forward_soft_max_back_f32(
11640
12036
 
11641
12037
  static void ggml_compute_forward_soft_max_back(
11642
12038
  const struct ggml_compute_params * params,
11643
- const struct ggml_tensor * src0,
11644
- const struct ggml_tensor * src1,
11645
12039
  struct ggml_tensor * dst) {
12040
+
12041
+ const struct ggml_tensor * src0 = dst->src[0];
12042
+
11646
12043
  switch (src0->type) {
11647
12044
  case GGML_TYPE_F32:
11648
12045
  {
11649
- ggml_compute_forward_soft_max_back_f32(params, src0, src1, dst);
12046
+ ggml_compute_forward_soft_max_back_f32(params, dst);
11650
12047
  } break;
11651
12048
  default:
11652
12049
  {
@@ -11659,11 +12056,13 @@ static void ggml_compute_forward_soft_max_back(
11659
12056
 
11660
12057
  static void ggml_compute_forward_alibi_f32(
11661
12058
  const struct ggml_compute_params * params,
11662
- const struct ggml_tensor * src0,
11663
12059
  struct ggml_tensor * dst) {
12060
+
12061
+ const struct ggml_tensor * src0 = dst->src[0];
12062
+
11664
12063
  assert(params->ith == 0);
11665
12064
 
11666
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
12065
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
11667
12066
  return;
11668
12067
  }
11669
12068
 
@@ -11694,22 +12093,20 @@ static void ggml_compute_forward_alibi_f32(
11694
12093
  const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
11695
12094
  const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);
11696
12095
 
11697
- for (int64_t i = 0; i < ne0; i++) {
11698
- for (int64_t j = 0; j < ne1; j++) {
11699
- for (int64_t k = 0; k < ne2_ne3; k++) {
11700
- float * const src = (float *)((char *) src0->data + i*nb0 + j*nb1 + k*nb2);
11701
- float * pdst = (float *)((char *) dst->data + i*nb0 + j*nb1 + k*nb2);
11702
-
11703
- // TODO: k*nb2 or k*nb3
12096
+ for (int64_t k = 0; k < ne2_ne3; k++) {
12097
+ // TODO: k*nb2 or k*nb3
12098
+ float m_k;
11704
12099
 
11705
- float m_k;
11706
-
11707
- if (k < n_heads_log2_floor) {
11708
- m_k = powf(m0, k + 1);
11709
- } else {
11710
- m_k = powf(m1, 2 * (k - n_heads_log2_floor) + 1);
11711
- }
12100
+ if (k < n_heads_log2_floor) {
12101
+ m_k = powf(m0, k + 1);
12102
+ } else {
12103
+ m_k = powf(m1, 2 * (k - n_heads_log2_floor) + 1);
12104
+ }
11712
12105
 
12106
+ for (int64_t i = 0; i < ne0; i++) {
12107
+ for (int64_t j = 0; j < ne1; j++) {
12108
+ float * const src = (float *)((char *) src0->data + i*nb0 + j*nb1 + k*nb2);
12109
+ float * pdst = (float *)((char *) dst->data + i*nb0 + j*nb1 + k*nb2);
11713
12110
  pdst[0] = i * m_k + src[0];
11714
12111
  }
11715
12112
  }
@@ -11718,11 +12115,13 @@ static void ggml_compute_forward_alibi_f32(
11718
12115
 
11719
12116
  static void ggml_compute_forward_alibi_f16(
11720
12117
  const struct ggml_compute_params * params,
11721
- const struct ggml_tensor * src0,
11722
12118
  struct ggml_tensor * dst) {
12119
+
12120
+ const struct ggml_tensor * src0 = dst->src[0];
12121
+
11723
12122
  assert(params->ith == 0);
11724
12123
 
11725
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
12124
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
11726
12125
  return;
11727
12126
  }
11728
12127
 
@@ -11754,21 +12153,20 @@ static void ggml_compute_forward_alibi_f16(
11754
12153
  const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
11755
12154
  const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);
11756
12155
 
11757
- for (int i = 0; i < ne0; i++) {
11758
- for (int j = 0; j < ne1; j++) {
11759
- for (int k = 0; k < ne2_ne3; k++) {
11760
- ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i*nb0 + j*nb1 + k*nb2);
11761
- float * pdst = (float *)((char *) dst->data + i*nb0 + j*nb1 + k*nb2);
11762
-
11763
- // TODO: k*nb2 or k*nb3
12156
+ for (int k = 0; k < ne2_ne3; k++) {
12157
+ // TODO: k*nb2 or k*nb3
12158
+ float m_k;
11764
12159
 
11765
- float m_k;
12160
+ if (k < n_heads_log2_floor) {
12161
+ m_k = powf(m0, k + 1);
12162
+ } else {
12163
+ m_k = powf(m1, 2 * (k - n_heads_log2_floor) + 1);
12164
+ }
11766
12165
 
11767
- if (k < n_heads_log2_floor) {
11768
- m_k = powf(m0, k + 1);
11769
- } else {
11770
- m_k = powf(m1, 2 * (k - n_heads_log2_floor) + 1);
11771
- }
12166
+ for (int i = 0; i < ne0; i++) {
12167
+ for (int j = 0; j < ne1; j++) {
12168
+ ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i*nb0 + j*nb1 + k*nb2);
12169
+ float * pdst = (float *)((char *) dst->data + i*nb0 + j*nb1 + k*nb2);
11772
12170
 
11773
12171
  // we return F32
11774
12172
  pdst[0] = i * m_k + GGML_FP16_TO_FP32(src[0]);
@@ -11779,16 +12177,18 @@ static void ggml_compute_forward_alibi_f16(
11779
12177
 
11780
12178
  static void ggml_compute_forward_alibi(
11781
12179
  const struct ggml_compute_params * params,
11782
- const struct ggml_tensor * src0,
11783
12180
  struct ggml_tensor * dst) {
12181
+
12182
+ const struct ggml_tensor * src0 = dst->src[0];
12183
+
11784
12184
  switch (src0->type) {
11785
12185
  case GGML_TYPE_F16:
11786
12186
  {
11787
- ggml_compute_forward_alibi_f16(params, src0, dst);
12187
+ ggml_compute_forward_alibi_f16(params, dst);
11788
12188
  } break;
11789
12189
  case GGML_TYPE_F32:
11790
12190
  {
11791
- ggml_compute_forward_alibi_f32(params, src0, dst);
12191
+ ggml_compute_forward_alibi_f32(params, dst);
11792
12192
  } break;
11793
12193
  case GGML_TYPE_Q4_0:
11794
12194
  case GGML_TYPE_Q4_1:
@@ -11804,6 +12204,11 @@ static void ggml_compute_forward_alibi(
11804
12204
  case GGML_TYPE_IQ2_XXS:
11805
12205
  case GGML_TYPE_IQ2_XS:
11806
12206
  case GGML_TYPE_IQ3_XXS:
12207
+ case GGML_TYPE_IQ1_S:
12208
+ case GGML_TYPE_IQ4_NL:
12209
+ case GGML_TYPE_IQ4_XS:
12210
+ case GGML_TYPE_IQ3_S:
12211
+ case GGML_TYPE_IQ2_S:
11807
12212
  case GGML_TYPE_Q8_K:
11808
12213
  case GGML_TYPE_I8:
11809
12214
  case GGML_TYPE_I16:
@@ -11819,11 +12224,13 @@ static void ggml_compute_forward_alibi(
11819
12224
 
11820
12225
  static void ggml_compute_forward_clamp_f32(
11821
12226
  const struct ggml_compute_params * params,
11822
- const struct ggml_tensor * src0,
11823
12227
  struct ggml_tensor * dst) {
12228
+
12229
+ const struct ggml_tensor * src0 = dst->src[0];
12230
+
11824
12231
  assert(params->ith == 0);
11825
12232
 
11826
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
12233
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
11827
12234
  return;
11828
12235
  }
11829
12236
 
@@ -11859,12 +12266,14 @@ static void ggml_compute_forward_clamp_f32(
11859
12266
 
11860
12267
  static void ggml_compute_forward_clamp(
11861
12268
  const struct ggml_compute_params * params,
11862
- const struct ggml_tensor * src0,
11863
12269
  struct ggml_tensor * dst) {
12270
+
12271
+ const struct ggml_tensor * src0 = dst->src[0];
12272
+
11864
12273
  switch (src0->type) {
11865
12274
  case GGML_TYPE_F32:
11866
12275
  {
11867
- ggml_compute_forward_clamp_f32(params, src0, dst);
12276
+ ggml_compute_forward_clamp_f32(params, dst);
11868
12277
  } break;
11869
12278
  case GGML_TYPE_F16:
11870
12279
  case GGML_TYPE_Q4_0:
@@ -11881,6 +12290,11 @@ static void ggml_compute_forward_clamp(
11881
12290
  case GGML_TYPE_IQ2_XXS:
11882
12291
  case GGML_TYPE_IQ2_XS:
11883
12292
  case GGML_TYPE_IQ3_XXS:
12293
+ case GGML_TYPE_IQ1_S:
12294
+ case GGML_TYPE_IQ4_NL:
12295
+ case GGML_TYPE_IQ4_XS:
12296
+ case GGML_TYPE_IQ3_S:
12297
+ case GGML_TYPE_IQ2_S:
11884
12298
  case GGML_TYPE_Q8_K:
11885
12299
  case GGML_TYPE_I8:
11886
12300
  case GGML_TYPE_I16:
@@ -11952,11 +12366,13 @@ GGML_CALL void ggml_rope_yarn_corr_dims(
11952
12366
 
11953
12367
  static void ggml_compute_forward_rope_f32(
11954
12368
  const struct ggml_compute_params * params,
11955
- const struct ggml_tensor * src0,
11956
- const struct ggml_tensor * src1,
11957
12369
  struct ggml_tensor * dst,
11958
12370
  const bool forward) {
11959
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
12371
+
12372
+ const struct ggml_tensor * src0 = dst->src[0];
12373
+ const struct ggml_tensor * src1 = dst->src[1];
12374
+
12375
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
11960
12376
  return;
11961
12377
  }
11962
12378
 
@@ -12128,11 +12544,13 @@ static void ggml_compute_forward_rope_f32(
12128
12544
 
12129
12545
  static void ggml_compute_forward_rope_f16(
12130
12546
  const struct ggml_compute_params * params,
12131
- const struct ggml_tensor * src0,
12132
- const struct ggml_tensor * src1,
12133
12547
  struct ggml_tensor * dst,
12134
12548
  const bool forward) {
12135
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
12549
+
12550
+ const struct ggml_tensor * src0 = dst->src[0];
12551
+ const struct ggml_tensor * src1 = dst->src[1];
12552
+
12553
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
12136
12554
  return;
12137
12555
  }
12138
12556
 
@@ -12293,17 +12711,18 @@ static void ggml_compute_forward_rope_f16(
12293
12711
 
12294
12712
  static void ggml_compute_forward_rope(
12295
12713
  const struct ggml_compute_params * params,
12296
- const struct ggml_tensor * src0,
12297
- const struct ggml_tensor * src1,
12298
12714
  struct ggml_tensor * dst) {
12715
+
12716
+ const struct ggml_tensor * src0 = dst->src[0];
12717
+
12299
12718
  switch (src0->type) {
12300
12719
  case GGML_TYPE_F16:
12301
12720
  {
12302
- ggml_compute_forward_rope_f16(params, src0, src1, dst, true);
12721
+ ggml_compute_forward_rope_f16(params, dst, true);
12303
12722
  } break;
12304
12723
  case GGML_TYPE_F32:
12305
12724
  {
12306
- ggml_compute_forward_rope_f32(params, src0, src1, dst, true);
12725
+ ggml_compute_forward_rope_f32(params, dst, true);
12307
12726
  } break;
12308
12727
  default:
12309
12728
  {
@@ -12316,17 +12735,18 @@ static void ggml_compute_forward_rope(
12316
12735
 
12317
12736
  static void ggml_compute_forward_rope_back(
12318
12737
  const struct ggml_compute_params * params,
12319
- const struct ggml_tensor * src0,
12320
- const struct ggml_tensor * src1,
12321
12738
  struct ggml_tensor * dst) {
12739
+
12740
+ const struct ggml_tensor * src0 = dst->src[0];
12741
+
12322
12742
  switch (src0->type) {
12323
12743
  case GGML_TYPE_F16:
12324
12744
  {
12325
- ggml_compute_forward_rope_f16(params, src0, src1, dst, false);
12745
+ ggml_compute_forward_rope_f16(params, dst, false);
12326
12746
  } break;
12327
12747
  case GGML_TYPE_F32:
12328
12748
  {
12329
- ggml_compute_forward_rope_f32(params, src0, src1, dst, false);
12749
+ ggml_compute_forward_rope_f32(params, dst, false);
12330
12750
  } break;
12331
12751
  default:
12332
12752
  {
@@ -12339,9 +12759,11 @@ static void ggml_compute_forward_rope_back(
12339
12759
 
12340
12760
  static void ggml_compute_forward_conv_transpose_1d_f16_f32(
12341
12761
  const struct ggml_compute_params * params,
12342
- const struct ggml_tensor * src0,
12343
- const struct ggml_tensor * src1,
12344
12762
  struct ggml_tensor * dst) {
12763
+
12764
+ const struct ggml_tensor * src0 = dst->src[0];
12765
+ const struct ggml_tensor * src1 = dst->src[1];
12766
+
12345
12767
  GGML_ASSERT(src0->type == GGML_TYPE_F16);
12346
12768
  GGML_ASSERT(src1->type == GGML_TYPE_F32);
12347
12769
  GGML_ASSERT( dst->type == GGML_TYPE_F32);
@@ -12359,7 +12781,7 @@ static void ggml_compute_forward_conv_transpose_1d_f16_f32(
12359
12781
  GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
12360
12782
  GGML_ASSERT(nb10 == sizeof(float));
12361
12783
 
12362
- if (params->type == GGML_TASK_INIT) {
12784
+ if (params->type == GGML_TASK_TYPE_INIT) {
12363
12785
  if (ith != 0) {
12364
12786
  return;
12365
12787
  }
@@ -12399,7 +12821,7 @@ static void ggml_compute_forward_conv_transpose_1d_f16_f32(
12399
12821
  return;
12400
12822
  }
12401
12823
 
12402
- if (params->type == GGML_TASK_FINALIZE) {
12824
+ if (params->type == GGML_TASK_TYPE_FINALIZE) {
12403
12825
  return;
12404
12826
  }
12405
12827
 
@@ -12436,9 +12858,11 @@ static void ggml_compute_forward_conv_transpose_1d_f16_f32(
12436
12858
 
12437
12859
  static void ggml_compute_forward_conv_transpose_1d_f32(
12438
12860
  const struct ggml_compute_params * params,
12439
- const struct ggml_tensor * src0,
12440
- const struct ggml_tensor * src1,
12441
12861
  struct ggml_tensor * dst) {
12862
+
12863
+ const struct ggml_tensor * src0 = dst->src[0];
12864
+ const struct ggml_tensor * src1 = dst->src[1];
12865
+
12442
12866
  GGML_ASSERT(src0->type == GGML_TYPE_F32);
12443
12867
  GGML_ASSERT(src1->type == GGML_TYPE_F32);
12444
12868
  GGML_ASSERT( dst->type == GGML_TYPE_F32);
@@ -12456,7 +12880,7 @@ static void ggml_compute_forward_conv_transpose_1d_f32(
12456
12880
  GGML_ASSERT(nb00 == sizeof(float));
12457
12881
  GGML_ASSERT(nb10 == sizeof(float));
12458
12882
 
12459
- if (params->type == GGML_TASK_INIT) {
12883
+ if (params->type == GGML_TASK_TYPE_INIT) {
12460
12884
  if (ith != 0) {
12461
12885
  return;
12462
12886
  }
@@ -12496,7 +12920,7 @@ static void ggml_compute_forward_conv_transpose_1d_f32(
12496
12920
  return;
12497
12921
  }
12498
12922
 
12499
- if (params->type == GGML_TASK_FINALIZE) {
12923
+ if (params->type == GGML_TASK_TYPE_FINALIZE) {
12500
12924
  return;
12501
12925
  }
12502
12926
 
@@ -12533,17 +12957,18 @@ static void ggml_compute_forward_conv_transpose_1d_f32(
12533
12957
 
12534
12958
  static void ggml_compute_forward_conv_transpose_1d(
12535
12959
  const struct ggml_compute_params * params,
12536
- const struct ggml_tensor * src0,
12537
- const struct ggml_tensor * src1,
12538
12960
  struct ggml_tensor * dst) {
12961
+
12962
+ const struct ggml_tensor * src0 = dst->src[0];
12963
+
12539
12964
  switch (src0->type) {
12540
12965
  case GGML_TYPE_F16:
12541
12966
  {
12542
- ggml_compute_forward_conv_transpose_1d_f16_f32(params, src0, src1, dst);
12967
+ ggml_compute_forward_conv_transpose_1d_f16_f32(params, dst);
12543
12968
  } break;
12544
12969
  case GGML_TYPE_F32:
12545
12970
  {
12546
- ggml_compute_forward_conv_transpose_1d_f32(params, src0, src1, dst);
12971
+ ggml_compute_forward_conv_transpose_1d_f32(params, dst);
12547
12972
  } break;
12548
12973
  default:
12549
12974
  {
@@ -12557,9 +12982,11 @@ static void ggml_compute_forward_conv_transpose_1d(
12557
12982
  // dst: result [N, OH, OW, IC*KH*KW]
12558
12983
  static void ggml_compute_forward_im2col_f32(
12559
12984
  const struct ggml_compute_params * params,
12560
- const struct ggml_tensor * src0,
12561
- const struct ggml_tensor * src1,
12562
12985
  struct ggml_tensor * dst) {
12986
+
12987
+ const struct ggml_tensor * src0 = dst->src[0];
12988
+ const struct ggml_tensor * src1 = dst->src[1];
12989
+
12563
12990
  GGML_ASSERT(src0->type == GGML_TYPE_F16);
12564
12991
  GGML_ASSERT(src1->type == GGML_TYPE_F32);
12565
12992
  GGML_ASSERT( dst->type == GGML_TYPE_F32);
@@ -12597,11 +13024,11 @@ static void ggml_compute_forward_im2col_f32(
12597
13024
  GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
12598
13025
  GGML_ASSERT(nb10 == sizeof(float));
12599
13026
 
12600
- if (params->type == GGML_TASK_INIT) {
13027
+ if (params->type == GGML_TASK_TYPE_INIT) {
12601
13028
  return;
12602
13029
  }
12603
13030
 
12604
- if (params->type == GGML_TASK_FINALIZE) {
13031
+ if (params->type == GGML_TASK_TYPE_FINALIZE) {
12605
13032
  return;
12606
13033
  }
12607
13034
 
@@ -12643,9 +13070,11 @@ static void ggml_compute_forward_im2col_f32(
12643
13070
  // dst: result [N, OH, OW, IC*KH*KW]
12644
13071
  static void ggml_compute_forward_im2col_f16(
12645
13072
  const struct ggml_compute_params * params,
12646
- const struct ggml_tensor * src0,
12647
- const struct ggml_tensor * src1,
12648
13073
  struct ggml_tensor * dst) {
13074
+
13075
+ const struct ggml_tensor * src0 = dst->src[0];
13076
+ const struct ggml_tensor * src1 = dst->src[1];
13077
+
12649
13078
  GGML_ASSERT(src0->type == GGML_TYPE_F16);
12650
13079
  GGML_ASSERT(src1->type == GGML_TYPE_F32);
12651
13080
  GGML_ASSERT( dst->type == GGML_TYPE_F16);
@@ -12683,11 +13112,11 @@ static void ggml_compute_forward_im2col_f16(
12683
13112
  GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
12684
13113
  GGML_ASSERT(nb10 == sizeof(float));
12685
13114
 
12686
- if (params->type == GGML_TASK_INIT) {
13115
+ if (params->type == GGML_TASK_TYPE_INIT) {
12687
13116
  return;
12688
13117
  }
12689
13118
 
12690
- if (params->type == GGML_TASK_FINALIZE) {
13119
+ if (params->type == GGML_TASK_TYPE_FINALIZE) {
12691
13120
  return;
12692
13121
  }
12693
13122
 
@@ -12725,17 +13154,15 @@ static void ggml_compute_forward_im2col_f16(
12725
13154
 
12726
13155
  static void ggml_compute_forward_im2col(
12727
13156
  const struct ggml_compute_params * params,
12728
- const struct ggml_tensor * src0,
12729
- const struct ggml_tensor * src1,
12730
13157
  struct ggml_tensor * dst) {
12731
13158
  switch (dst->type) {
12732
13159
  case GGML_TYPE_F16:
12733
13160
  {
12734
- ggml_compute_forward_im2col_f16(params, src0, src1, dst);
13161
+ ggml_compute_forward_im2col_f16(params, dst);
12735
13162
  } break;
12736
13163
  case GGML_TYPE_F32:
12737
13164
  {
12738
- ggml_compute_forward_im2col_f32(params, src0, src1, dst);
13165
+ ggml_compute_forward_im2col_f32(params, dst);
12739
13166
  } break;
12740
13167
  default:
12741
13168
  {
@@ -12749,9 +13176,11 @@ static void ggml_compute_forward_im2col(
12749
13176
 
12750
13177
  static void ggml_compute_forward_conv_transpose_2d(
12751
13178
  const struct ggml_compute_params * params,
12752
- const struct ggml_tensor * src0,
12753
- const struct ggml_tensor * src1,
12754
13179
  struct ggml_tensor * dst) {
13180
+
13181
+ const struct ggml_tensor * src0 = dst->src[0];
13182
+ const struct ggml_tensor * src1 = dst->src[1];
13183
+
12755
13184
  GGML_ASSERT(src0->type == GGML_TYPE_F16);
12756
13185
  GGML_ASSERT(src1->type == GGML_TYPE_F32);
12757
13186
  GGML_ASSERT( dst->type == GGML_TYPE_F32);
@@ -12769,7 +13198,7 @@ static void ggml_compute_forward_conv_transpose_2d(
12769
13198
  GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
12770
13199
  GGML_ASSERT(nb10 == sizeof(float));
12771
13200
 
12772
- if (params->type == GGML_TASK_INIT) {
13201
+ if (params->type == GGML_TASK_TYPE_INIT) {
12773
13202
  if (ith != 0) {
12774
13203
  return;
12775
13204
  }
@@ -12811,7 +13240,7 @@ static void ggml_compute_forward_conv_transpose_2d(
12811
13240
  return;
12812
13241
  }
12813
13242
 
12814
- if (params->type == GGML_TASK_FINALIZE) {
13243
+ if (params->type == GGML_TASK_TYPE_FINALIZE) {
12815
13244
  return;
12816
13245
  }
12817
13246
 
@@ -12855,13 +13284,15 @@ static void ggml_compute_forward_conv_transpose_2d(
12855
13284
  static void ggml_compute_forward_pool_1d_sk_p0(
12856
13285
  const struct ggml_compute_params * params,
12857
13286
  const enum ggml_op_pool op,
12858
- const struct ggml_tensor * src,
12859
13287
  const int k,
12860
13288
  struct ggml_tensor * dst) {
13289
+
13290
+ const struct ggml_tensor * src = dst->src[0];
13291
+
12861
13292
  assert(src->type == GGML_TYPE_F32);
12862
13293
  assert(params->ith == 0);
12863
13294
 
12864
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
13295
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
12865
13296
  return;
12866
13297
  }
12867
13298
 
@@ -12906,7 +13337,6 @@ static void ggml_compute_forward_pool_1d_sk_p0(
12906
13337
 
12907
13338
  static void ggml_compute_forward_pool_1d(
12908
13339
  const struct ggml_compute_params * params,
12909
- const struct ggml_tensor * src0,
12910
13340
  struct ggml_tensor * dst) {
12911
13341
 
12912
13342
  const int32_t * opts = (const int32_t *)dst->op_params;
@@ -12917,19 +13347,21 @@ static void ggml_compute_forward_pool_1d(
12917
13347
  GGML_ASSERT(p0 == 0); // padding not supported
12918
13348
  GGML_ASSERT(k0 == s0); // only s = k supported
12919
13349
 
12920
- ggml_compute_forward_pool_1d_sk_p0(params, op, src0, k0, dst);
13350
+ ggml_compute_forward_pool_1d_sk_p0(params, op, k0, dst);
12921
13351
  }
12922
13352
 
12923
13353
  // ggml_compute_forward_pool_2d
12924
13354
 
12925
13355
  static void ggml_compute_forward_pool_2d(
12926
13356
  const struct ggml_compute_params * params,
12927
- const struct ggml_tensor * src,
12928
13357
  struct ggml_tensor * dst) {
13358
+
13359
+ const struct ggml_tensor * src = dst->src[0];
13360
+
12929
13361
  GGML_ASSERT(src->type == GGML_TYPE_F32);
12930
13362
  GGML_ASSERT(params->ith == 0);
12931
13363
 
12932
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
13364
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
12933
13365
  return;
12934
13366
  }
12935
13367
 
@@ -12998,10 +13430,11 @@ static void ggml_compute_forward_pool_2d(
12998
13430
 
12999
13431
  static void ggml_compute_forward_upscale_f32(
13000
13432
  const struct ggml_compute_params * params,
13001
- const struct ggml_tensor * src0,
13002
13433
  struct ggml_tensor * dst) {
13003
13434
 
13004
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
13435
+ const struct ggml_tensor * src0 = dst->src[0];
13436
+
13437
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
13005
13438
  return;
13006
13439
  }
13007
13440
 
@@ -13037,12 +13470,14 @@ static void ggml_compute_forward_upscale_f32(
13037
13470
 
13038
13471
  static void ggml_compute_forward_upscale(
13039
13472
  const struct ggml_compute_params * params,
13040
- const struct ggml_tensor * src0,
13041
13473
  struct ggml_tensor * dst) {
13474
+
13475
+ const struct ggml_tensor * src0 = dst->src[0];
13476
+
13042
13477
  switch (src0->type) {
13043
13478
  case GGML_TYPE_F32:
13044
13479
  {
13045
- ggml_compute_forward_upscale_f32(params, src0, dst);
13480
+ ggml_compute_forward_upscale_f32(params, dst);
13046
13481
  } break;
13047
13482
  default:
13048
13483
  {
@@ -13055,10 +13490,11 @@ static void ggml_compute_forward_upscale(
13055
13490
 
13056
13491
  static void ggml_compute_forward_pad_f32(
13057
13492
  const struct ggml_compute_params * params,
13058
- const struct ggml_tensor * src0,
13059
13493
  struct ggml_tensor * dst) {
13060
13494
 
13061
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
13495
+ const struct ggml_tensor * src0 = dst->src[0];
13496
+
13497
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
13062
13498
  return;
13063
13499
  }
13064
13500
 
@@ -13095,12 +13531,14 @@ static void ggml_compute_forward_pad_f32(
13095
13531
 
13096
13532
  static void ggml_compute_forward_pad(
13097
13533
  const struct ggml_compute_params * params,
13098
- const struct ggml_tensor * src0,
13099
13534
  struct ggml_tensor * dst) {
13535
+
13536
+ const struct ggml_tensor * src0 = dst->src[0];
13537
+
13100
13538
  switch (src0->type) {
13101
13539
  case GGML_TYPE_F32:
13102
13540
  {
13103
- ggml_compute_forward_pad_f32(params, src0, dst);
13541
+ ggml_compute_forward_pad_f32(params, dst);
13104
13542
  } break;
13105
13543
  default:
13106
13544
  {
@@ -13113,10 +13551,11 @@ static void ggml_compute_forward_pad(
13113
13551
 
13114
13552
  static void ggml_compute_forward_argsort_f32(
13115
13553
  const struct ggml_compute_params * params,
13116
- const struct ggml_tensor * src0,
13117
13554
  struct ggml_tensor * dst) {
13118
13555
 
13119
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
13556
+ const struct ggml_tensor * src0 = dst->src[0];
13557
+
13558
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
13120
13559
  return;
13121
13560
  }
13122
13561
 
@@ -13142,8 +13581,8 @@ static void ggml_compute_forward_argsort_f32(
13142
13581
  // C doesn't have a functional sort, so we do a bubble sort instead
13143
13582
  for (int64_t j = 0; j < ne0; j++) {
13144
13583
  for (int64_t k = j + 1; k < ne0; k++) {
13145
- if ((order == GGML_SORT_ASC && src_data[dst_data[j]] > src_data[dst_data[k]]) ||
13146
- (order == GGML_SORT_DESC && src_data[dst_data[j]] < src_data[dst_data[k]])) {
13584
+ if ((order == GGML_SORT_ORDER_ASC && src_data[dst_data[j]] > src_data[dst_data[k]]) ||
13585
+ (order == GGML_SORT_ORDER_DESC && src_data[dst_data[j]] < src_data[dst_data[k]])) {
13147
13586
  int32_t tmp = dst_data[j];
13148
13587
  dst_data[j] = dst_data[k];
13149
13588
  dst_data[k] = tmp;
@@ -13155,13 +13594,14 @@ static void ggml_compute_forward_argsort_f32(
13155
13594
 
13156
13595
  static void ggml_compute_forward_argsort(
13157
13596
  const struct ggml_compute_params * params,
13158
- const struct ggml_tensor * src0,
13159
13597
  struct ggml_tensor * dst) {
13160
13598
 
13599
+ const struct ggml_tensor * src0 = dst->src[0];
13600
+
13161
13601
  switch (src0->type) {
13162
13602
  case GGML_TYPE_F32:
13163
13603
  {
13164
- ggml_compute_forward_argsort_f32(params, src0, dst);
13604
+ ggml_compute_forward_argsort_f32(params, dst);
13165
13605
  } break;
13166
13606
  default:
13167
13607
  {
@@ -13174,11 +13614,13 @@ static void ggml_compute_forward_argsort(
13174
13614
 
13175
13615
  static void ggml_compute_forward_flash_attn_f32(
13176
13616
  const struct ggml_compute_params * params,
13177
- const struct ggml_tensor * q,
13178
- const struct ggml_tensor * k,
13179
- const struct ggml_tensor * v,
13180
13617
  const bool masked,
13181
13618
  struct ggml_tensor * dst) {
13619
+
13620
+ const struct ggml_tensor * q = dst->src[0];
13621
+ const struct ggml_tensor * k = dst->src[1];
13622
+ const struct ggml_tensor * v = dst->src[2];
13623
+
13182
13624
  int64_t t0 = ggml_perf_time_us();
13183
13625
  UNUSED(t0);
13184
13626
 
@@ -13223,11 +13665,11 @@ static void ggml_compute_forward_flash_attn_f32(
13223
13665
  GGML_ASSERT(nb1 <= nb2);
13224
13666
  GGML_ASSERT(nb2 <= nb3);
13225
13667
 
13226
- if (params->type == GGML_TASK_INIT) {
13668
+ if (params->type == GGML_TASK_TYPE_INIT) {
13227
13669
  return;
13228
13670
  }
13229
13671
 
13230
- if (params->type == GGML_TASK_FINALIZE) {
13672
+ if (params->type == GGML_TASK_TYPE_FINALIZE) {
13231
13673
  return;
13232
13674
  }
13233
13675
 
@@ -13364,11 +13806,13 @@ static void ggml_compute_forward_flash_attn_f32(
13364
13806
 
13365
13807
  static void ggml_compute_forward_flash_attn_f16(
13366
13808
  const struct ggml_compute_params * params,
13367
- const struct ggml_tensor * q,
13368
- const struct ggml_tensor * k,
13369
- const struct ggml_tensor * v,
13370
13809
  const bool masked,
13371
13810
  struct ggml_tensor * dst) {
13811
+
13812
+ const struct ggml_tensor * q = dst->src[0];
13813
+ const struct ggml_tensor * k = dst->src[1];
13814
+ const struct ggml_tensor * v = dst->src[2];
13815
+
13372
13816
  int64_t t0 = ggml_perf_time_us();
13373
13817
  UNUSED(t0);
13374
13818
 
@@ -13413,11 +13857,11 @@ static void ggml_compute_forward_flash_attn_f16(
13413
13857
  GGML_ASSERT(nb1 <= nb2);
13414
13858
  GGML_ASSERT(nb2 <= nb3);
13415
13859
 
13416
- if (params->type == GGML_TASK_INIT) {
13860
+ if (params->type == GGML_TASK_TYPE_INIT) {
13417
13861
  return;
13418
13862
  }
13419
13863
 
13420
- if (params->type == GGML_TASK_FINALIZE) {
13864
+ if (params->type == GGML_TASK_TYPE_FINALIZE) {
13421
13865
  return;
13422
13866
  }
13423
13867
 
@@ -13590,19 +14034,19 @@ static void ggml_compute_forward_flash_attn_f16(
13590
14034
 
13591
14035
  static void ggml_compute_forward_flash_attn(
13592
14036
  const struct ggml_compute_params * params,
13593
- const struct ggml_tensor * q,
13594
- const struct ggml_tensor * k,
13595
- const struct ggml_tensor * v,
13596
14037
  const bool masked,
13597
14038
  struct ggml_tensor * dst) {
14039
+
14040
+ const struct ggml_tensor * q = dst->src[0];
14041
+
13598
14042
  switch (q->type) {
13599
14043
  case GGML_TYPE_F16:
13600
14044
  {
13601
- ggml_compute_forward_flash_attn_f16(params, q, k, v, masked, dst);
14045
+ ggml_compute_forward_flash_attn_f16(params, masked, dst);
13602
14046
  } break;
13603
14047
  case GGML_TYPE_F32:
13604
14048
  {
13605
- ggml_compute_forward_flash_attn_f32(params, q, k, v, masked, dst);
14049
+ ggml_compute_forward_flash_attn_f32(params, masked, dst);
13606
14050
  } break;
13607
14051
  default:
13608
14052
  {
@@ -13615,12 +14059,14 @@ static void ggml_compute_forward_flash_attn(
13615
14059
 
13616
14060
  static void ggml_compute_forward_flash_ff_f16(
13617
14061
  const struct ggml_compute_params * params,
13618
- const struct ggml_tensor * a, // F16
13619
- const struct ggml_tensor * b0, // F16 fc_w
13620
- const struct ggml_tensor * b1, // F32 fc_b
13621
- const struct ggml_tensor * c0, // F16 proj_w
13622
- const struct ggml_tensor * c1, // F32 proj_b
13623
14062
  struct ggml_tensor * dst) {
14063
+
14064
+ const struct ggml_tensor * a = dst->src[0]; // F16
14065
+ const struct ggml_tensor * b0 = dst->src[1]; // F16 fc_w
14066
+ const struct ggml_tensor * b1 = dst->src[2]; // F32 fc_b
14067
+ const struct ggml_tensor * c0 = dst->src[3]; // F16 proj_w
14068
+ const struct ggml_tensor * c1 = dst->src[4]; // F32 proj_b
14069
+
13624
14070
  int64_t t0 = ggml_perf_time_us();
13625
14071
  UNUSED(t0);
13626
14072
 
@@ -13670,11 +14116,11 @@ static void ggml_compute_forward_flash_ff_f16(
13670
14116
  GGML_ASSERT(nb1 <= nb2);
13671
14117
  GGML_ASSERT(nb2 <= nb3);
13672
14118
 
13673
- if (params->type == GGML_TASK_INIT) {
14119
+ if (params->type == GGML_TASK_TYPE_INIT) {
13674
14120
  return;
13675
14121
  }
13676
14122
 
13677
- if (params->type == GGML_TASK_FINALIZE) {
14123
+ if (params->type == GGML_TASK_TYPE_FINALIZE) {
13678
14124
  return;
13679
14125
  }
13680
14126
 
@@ -13748,16 +14194,14 @@ static void ggml_compute_forward_flash_ff_f16(
13748
14194
 
13749
14195
  static void ggml_compute_forward_flash_ff(
13750
14196
  const struct ggml_compute_params * params,
13751
- const struct ggml_tensor * a,
13752
- const struct ggml_tensor * b0,
13753
- const struct ggml_tensor * b1,
13754
- const struct ggml_tensor * c0,
13755
- const struct ggml_tensor * c1,
13756
14197
  struct ggml_tensor * dst) {
14198
+
14199
+ const struct ggml_tensor * b0 = dst->src[1];
14200
+
13757
14201
  switch (b0->type) {
13758
14202
  case GGML_TYPE_F16:
13759
14203
  {
13760
- ggml_compute_forward_flash_ff_f16(params, a, b0, b1, c0, c1, dst);
14204
+ ggml_compute_forward_flash_ff_f16(params, dst);
13761
14205
  } break;
13762
14206
  case GGML_TYPE_F32:
13763
14207
  {
@@ -13774,12 +14218,14 @@ static void ggml_compute_forward_flash_ff(
13774
14218
 
13775
14219
  static void ggml_compute_forward_flash_attn_back_f32(
13776
14220
  const struct ggml_compute_params * params,
13777
- const struct ggml_tensor * q,
13778
- const struct ggml_tensor * k,
13779
- const struct ggml_tensor * v,
13780
- const struct ggml_tensor * d,
13781
14221
  const bool masked,
13782
14222
  struct ggml_tensor * dst) {
14223
+
14224
+ const struct ggml_tensor * q = dst->src[0];
14225
+ const struct ggml_tensor * k = dst->src[1];
14226
+ const struct ggml_tensor * v = dst->src[2];
14227
+ const struct ggml_tensor * d = dst->src[3];
14228
+
13783
14229
  int64_t t0 = ggml_perf_time_us();
13784
14230
  UNUSED(t0);
13785
14231
 
@@ -13829,14 +14275,14 @@ static void ggml_compute_forward_flash_attn_back_f32(
13829
14275
  GGML_ASSERT(nb1 <= nb2);
13830
14276
  GGML_ASSERT(nb2 <= nb3);
13831
14277
 
13832
- if (params->type == GGML_TASK_INIT) {
14278
+ if (params->type == GGML_TASK_TYPE_INIT) {
13833
14279
  if (ith == 0) {
13834
14280
  memset(dst->data, 0, nb0*ne0*ne1*ne2*ne3);
13835
14281
  }
13836
14282
  return;
13837
14283
  }
13838
14284
 
13839
- if (params->type == GGML_TASK_FINALIZE) {
14285
+ if (params->type == GGML_TASK_TYPE_FINALIZE) {
13840
14286
  return;
13841
14287
  }
13842
14288
 
@@ -14127,16 +14573,15 @@ static void ggml_compute_forward_flash_attn_back_f32(
14127
14573
 
14128
14574
  static void ggml_compute_forward_flash_attn_back(
14129
14575
  const struct ggml_compute_params * params,
14130
- const struct ggml_tensor * q,
14131
- const struct ggml_tensor * k,
14132
- const struct ggml_tensor * v,
14133
- const struct ggml_tensor * d,
14134
14576
  const bool masked,
14135
14577
  struct ggml_tensor * dst) {
14578
+
14579
+ const struct ggml_tensor * q = dst->src[0];
14580
+
14136
14581
  switch (q->type) {
14137
14582
  case GGML_TYPE_F32:
14138
14583
  {
14139
- ggml_compute_forward_flash_attn_back_f32(params, q, k, v, d, masked, dst);
14584
+ ggml_compute_forward_flash_attn_back_f32(params, masked, dst);
14140
14585
  } break;
14141
14586
  default:
14142
14587
  {
@@ -14149,9 +14594,11 @@ static void ggml_compute_forward_flash_attn_back(
14149
14594
 
14150
14595
  static void ggml_compute_forward_win_part_f32(
14151
14596
  const struct ggml_compute_params * params,
14152
- const struct ggml_tensor * src0,
14153
14597
  struct ggml_tensor * dst) {
14154
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
14598
+
14599
+ const struct ggml_tensor * src0 = dst->src[0];
14600
+
14601
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
14155
14602
  return;
14156
14603
  }
14157
14604
 
@@ -14193,12 +14640,14 @@ static void ggml_compute_forward_win_part_f32(
14193
14640
 
14194
14641
  static void ggml_compute_forward_win_part(
14195
14642
  const struct ggml_compute_params * params,
14196
- const struct ggml_tensor * src0,
14197
14643
  struct ggml_tensor * dst) {
14644
+
14645
+ const struct ggml_tensor * src0 = dst->src[0];
14646
+
14198
14647
  switch (src0->type) {
14199
14648
  case GGML_TYPE_F32:
14200
14649
  {
14201
- ggml_compute_forward_win_part_f32(params, src0, dst);
14650
+ ggml_compute_forward_win_part_f32(params, dst);
14202
14651
  } break;
14203
14652
  default:
14204
14653
  {
@@ -14211,9 +14660,11 @@ static void ggml_compute_forward_win_part(
14211
14660
 
14212
14661
  static void ggml_compute_forward_win_unpart_f32(
14213
14662
  const struct ggml_compute_params * params,
14214
- const struct ggml_tensor * src0,
14215
14663
  struct ggml_tensor * dst) {
14216
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
14664
+
14665
+ const struct ggml_tensor * src0 = dst->src[0];
14666
+
14667
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
14217
14668
  return;
14218
14669
  }
14219
14670
 
@@ -14253,12 +14704,14 @@ static void ggml_compute_forward_win_unpart_f32(
14253
14704
 
14254
14705
  static void ggml_compute_forward_win_unpart(
14255
14706
  const struct ggml_compute_params * params,
14256
- const struct ggml_tensor * src0,
14257
14707
  struct ggml_tensor * dst) {
14708
+
14709
+ const struct ggml_tensor * src0 = dst->src[0];
14710
+
14258
14711
  switch (src0->type) {
14259
14712
  case GGML_TYPE_F32:
14260
14713
  {
14261
- ggml_compute_forward_win_unpart_f32(params, src0, dst);
14714
+ ggml_compute_forward_win_unpart_f32(params, dst);
14262
14715
  } break;
14263
14716
  default:
14264
14717
  {
@@ -14271,58 +14724,58 @@ static void ggml_compute_forward_win_unpart(
14271
14724
 
14272
14725
  static void ggml_compute_forward_unary(
14273
14726
  const struct ggml_compute_params * params,
14274
- const struct ggml_tensor * src0,
14275
14727
  struct ggml_tensor * dst) {
14728
+
14276
14729
  const enum ggml_unary_op op = ggml_get_unary_op(dst);
14277
14730
 
14278
14731
  switch (op) {
14279
14732
  case GGML_UNARY_OP_ABS:
14280
14733
  {
14281
- ggml_compute_forward_abs(params, src0, dst);
14734
+ ggml_compute_forward_abs(params, dst);
14282
14735
  } break;
14283
14736
  case GGML_UNARY_OP_SGN:
14284
14737
  {
14285
- ggml_compute_forward_sgn(params, src0, dst);
14738
+ ggml_compute_forward_sgn(params, dst);
14286
14739
  } break;
14287
14740
  case GGML_UNARY_OP_NEG:
14288
14741
  {
14289
- ggml_compute_forward_neg(params, src0, dst);
14742
+ ggml_compute_forward_neg(params, dst);
14290
14743
  } break;
14291
14744
  case GGML_UNARY_OP_STEP:
14292
14745
  {
14293
- ggml_compute_forward_step(params, src0, dst);
14746
+ ggml_compute_forward_step(params, dst);
14294
14747
  } break;
14295
14748
  case GGML_UNARY_OP_TANH:
14296
14749
  {
14297
- ggml_compute_forward_tanh(params, src0, dst);
14750
+ ggml_compute_forward_tanh(params, dst);
14298
14751
  } break;
14299
14752
  case GGML_UNARY_OP_ELU:
14300
14753
  {
14301
- ggml_compute_forward_elu(params, src0, dst);
14754
+ ggml_compute_forward_elu(params, dst);
14302
14755
  } break;
14303
14756
  case GGML_UNARY_OP_RELU:
14304
14757
  {
14305
- ggml_compute_forward_relu(params, src0, dst);
14758
+ ggml_compute_forward_relu(params, dst);
14306
14759
  } break;
14307
14760
  case GGML_UNARY_OP_GELU:
14308
14761
  {
14309
- ggml_compute_forward_gelu(params, src0, dst);
14762
+ ggml_compute_forward_gelu(params, dst);
14310
14763
  } break;
14311
14764
  case GGML_UNARY_OP_GELU_QUICK:
14312
14765
  {
14313
- ggml_compute_forward_gelu_quick(params, src0, dst);
14766
+ ggml_compute_forward_gelu_quick(params, dst);
14314
14767
  } break;
14315
14768
  case GGML_UNARY_OP_SILU:
14316
14769
  {
14317
- ggml_compute_forward_silu(params, src0, dst);
14770
+ ggml_compute_forward_silu(params, dst);
14318
14771
  } break;
14319
14772
  case GGML_UNARY_OP_HARDSWISH:
14320
14773
  {
14321
- ggml_compute_forward_hardswish(params, src0, dst);
14774
+ ggml_compute_forward_hardswish(params, dst);
14322
14775
  } break;
14323
14776
  case GGML_UNARY_OP_HARDSIGMOID:
14324
14777
  {
14325
- ggml_compute_forward_hardsigmoid(params, src0, dst);
14778
+ ggml_compute_forward_hardsigmoid(params, dst);
14326
14779
  } break;
14327
14780
  default:
14328
14781
  {
@@ -14335,9 +14788,11 @@ static void ggml_compute_forward_unary(
14335
14788
 
14336
14789
  static void ggml_compute_forward_get_rel_pos_f16(
14337
14790
  const struct ggml_compute_params * params,
14338
- const struct ggml_tensor * src0,
14339
14791
  struct ggml_tensor * dst) {
14340
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
14792
+
14793
+ const struct ggml_tensor * src0 = dst->src[0];
14794
+
14795
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
14341
14796
  return;
14342
14797
  }
14343
14798
 
@@ -14362,12 +14817,14 @@ static void ggml_compute_forward_get_rel_pos_f16(
14362
14817
 
14363
14818
  static void ggml_compute_forward_get_rel_pos(
14364
14819
  const struct ggml_compute_params * params,
14365
- const struct ggml_tensor * src0,
14366
14820
  struct ggml_tensor * dst) {
14821
+
14822
+ const struct ggml_tensor * src0 = dst->src[0];
14823
+
14367
14824
  switch (src0->type) {
14368
14825
  case GGML_TYPE_F16:
14369
14826
  {
14370
- ggml_compute_forward_get_rel_pos_f16(params, src0, dst);
14827
+ ggml_compute_forward_get_rel_pos_f16(params, dst);
14371
14828
  } break;
14372
14829
  default:
14373
14830
  {
@@ -14380,20 +14837,21 @@ static void ggml_compute_forward_get_rel_pos(
14380
14837
 
14381
14838
  static void ggml_compute_forward_add_rel_pos_f32(
14382
14839
  const struct ggml_compute_params * params,
14383
- const struct ggml_tensor * src0,
14384
- const struct ggml_tensor * src1,
14385
- const struct ggml_tensor * src2,
14386
14840
  struct ggml_tensor * dst) {
14387
14841
 
14842
+ const struct ggml_tensor * src0 = dst->src[0];
14843
+ const struct ggml_tensor * src1 = dst->src[1];
14844
+ const struct ggml_tensor * src2 = dst->src[2];
14845
+
14388
14846
  const bool inplace = (bool) ((int32_t *) dst->op_params)[0];
14389
- if (!inplace && params->type == GGML_TASK_INIT) {
14847
+ if (!inplace && params->type == GGML_TASK_TYPE_INIT) {
14390
14848
  if (params->ith != 0) {
14391
14849
  return;
14392
14850
  }
14393
14851
  memcpy((char *) dst->data, (char *) src0->data, ggml_nbytes(dst));
14394
14852
  return;
14395
14853
  }
14396
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
14854
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
14397
14855
  return;
14398
14856
  }
14399
14857
 
@@ -14448,14 +14906,14 @@ static void ggml_compute_forward_add_rel_pos_f32(
14448
14906
 
14449
14907
  static void ggml_compute_forward_add_rel_pos(
14450
14908
  const struct ggml_compute_params * params,
14451
- const struct ggml_tensor * src0,
14452
- const struct ggml_tensor * src1,
14453
- const struct ggml_tensor * src2,
14454
14909
  struct ggml_tensor * dst) {
14910
+
14911
+ const struct ggml_tensor * src0 = dst->src[0];
14912
+
14455
14913
  switch (src0->type) {
14456
14914
  case GGML_TYPE_F32:
14457
14915
  {
14458
- ggml_compute_forward_add_rel_pos_f32(params, src0, src1, src2, dst);
14916
+ ggml_compute_forward_add_rel_pos_f32(params, dst);
14459
14917
  } break;
14460
14918
  default:
14461
14919
  {
@@ -14468,12 +14926,14 @@ static void ggml_compute_forward_add_rel_pos(
14468
14926
 
14469
14927
  static void ggml_compute_forward_map_unary_f32(
14470
14928
  const struct ggml_compute_params * params,
14471
- const struct ggml_tensor * src0,
14472
14929
  struct ggml_tensor * dst,
14473
14930
  const ggml_unary_op_f32_t fun) {
14931
+
14932
+ const struct ggml_tensor * src0 = dst->src[0];
14933
+
14474
14934
  GGML_ASSERT(ggml_are_same_shape(src0, dst));
14475
14935
 
14476
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
14936
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
14477
14937
  return;
14478
14938
  }
14479
14939
 
@@ -14492,13 +14952,15 @@ static void ggml_compute_forward_map_unary_f32(
14492
14952
 
14493
14953
  static void ggml_compute_forward_map_unary(
14494
14954
  const struct ggml_compute_params * params,
14495
- const struct ggml_tensor * src0,
14496
14955
  struct ggml_tensor * dst,
14497
14956
  const ggml_unary_op_f32_t fun) {
14957
+
14958
+ const struct ggml_tensor * src0 = dst->src[0];
14959
+
14498
14960
  switch (src0->type) {
14499
14961
  case GGML_TYPE_F32:
14500
14962
  {
14501
- ggml_compute_forward_map_unary_f32(params, src0, dst, fun);
14963
+ ggml_compute_forward_map_unary_f32(params, dst, fun);
14502
14964
  } break;
14503
14965
  default:
14504
14966
  {
@@ -14511,14 +14973,16 @@ static void ggml_compute_forward_map_unary(
14511
14973
 
14512
14974
  static void ggml_compute_forward_map_binary_f32(
14513
14975
  const struct ggml_compute_params * params,
14514
- const struct ggml_tensor * src0,
14515
- const struct ggml_tensor * src1,
14516
14976
  struct ggml_tensor * dst,
14517
14977
  const ggml_binary_op_f32_t fun) {
14978
+
14979
+ const struct ggml_tensor * src0 = dst->src[0];
14980
+ const struct ggml_tensor * src1 = dst->src[1];
14981
+
14518
14982
  assert(params->ith == 0);
14519
14983
  assert(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));
14520
14984
 
14521
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
14985
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
14522
14986
  return;
14523
14987
  }
14524
14988
 
@@ -14539,14 +15003,15 @@ static void ggml_compute_forward_map_binary_f32(
14539
15003
 
14540
15004
  static void ggml_compute_forward_map_binary(
14541
15005
  const struct ggml_compute_params * params,
14542
- const struct ggml_tensor * src0,
14543
- const struct ggml_tensor * src1,
14544
15006
  struct ggml_tensor * dst,
14545
15007
  const ggml_binary_op_f32_t fun) {
15008
+
15009
+ const struct ggml_tensor * src0 = dst->src[0];
15010
+
14546
15011
  switch (src0->type) {
14547
15012
  case GGML_TYPE_F32:
14548
15013
  {
14549
- ggml_compute_forward_map_binary_f32(params, src0, src1, dst, fun);
15014
+ ggml_compute_forward_map_binary_f32(params, dst, fun);
14550
15015
  } break;
14551
15016
  default:
14552
15017
  {
@@ -14559,12 +15024,14 @@ static void ggml_compute_forward_map_binary(
14559
15024
 
14560
15025
  static void ggml_compute_forward_map_custom1_f32(
14561
15026
  const struct ggml_compute_params * params,
14562
- const struct ggml_tensor * a,
14563
15027
  struct ggml_tensor * dst,
14564
15028
  const ggml_custom1_op_f32_t fun) {
15029
+
15030
+ const struct ggml_tensor * a = dst->src[0];
15031
+
14565
15032
  assert(params->ith == 0);
14566
15033
 
14567
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
15034
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
14568
15035
  return;
14569
15036
  }
14570
15037
 
@@ -14575,13 +15042,15 @@ static void ggml_compute_forward_map_custom1_f32(
14575
15042
 
14576
15043
  static void ggml_compute_forward_map_custom2_f32(
14577
15044
  const struct ggml_compute_params * params,
14578
- const struct ggml_tensor * a,
14579
- const struct ggml_tensor * b,
14580
15045
  struct ggml_tensor * dst,
14581
15046
  const ggml_custom2_op_f32_t fun) {
15047
+
15048
+ const struct ggml_tensor * a = dst->src[0];
15049
+ const struct ggml_tensor * b = dst->src[1];
15050
+
14582
15051
  assert(params->ith == 0);
14583
15052
 
14584
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
15053
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
14585
15054
  return;
14586
15055
  }
14587
15056
 
@@ -14592,14 +15061,16 @@ static void ggml_compute_forward_map_custom2_f32(
14592
15061
 
14593
15062
  static void ggml_compute_forward_map_custom3_f32(
14594
15063
  const struct ggml_compute_params * params,
14595
- const struct ggml_tensor * a,
14596
- const struct ggml_tensor * b,
14597
- const struct ggml_tensor * c,
14598
15064
  struct ggml_tensor * dst,
14599
15065
  const ggml_custom3_op_f32_t fun) {
15066
+
15067
+ const struct ggml_tensor * a = dst->src[0];
15068
+ const struct ggml_tensor * b = dst->src[1];
15069
+ const struct ggml_tensor * c = dst->src[1];
15070
+
14600
15071
  assert(params->ith == 0);
14601
15072
 
14602
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
15073
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
14603
15074
  return;
14604
15075
  }
14605
15076
 
@@ -14610,57 +15081,68 @@ static void ggml_compute_forward_map_custom3_f32(
14610
15081
 
14611
15082
  static void ggml_compute_forward_map_custom1( // calls the one-operand callback stored in dst->op_params
14612
15083
  const struct ggml_compute_params * params,
14613
- const struct ggml_tensor * a,
14614
15084
  struct ggml_tensor * dst) {
14615
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
15085
+
15086
+ const struct ggml_tensor * a = dst->src[0]; // operand is now taken from dst->src[] instead of a parameter
15087
+
15088
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { // INIT and FINALIZE phases do nothing for this op
14616
15089
  return;
14617
15090
  }
14618
15091
 
14619
- struct ggml_map_custom1_op_params * p = (struct ggml_map_custom1_op_params *) dst->op_params;
15092
+ struct ggml_map_custom1_op_params p;
15093
+ memcpy(&p, dst->op_params, sizeof(p)); // copy the params out instead of reading them through a cast pointer; NOTE(review): presumably for alignment/aliasing safety - confirm
14620
15094
 
14621
- p->fun(dst, a, params->ith, params->nth, p->userdata);
15095
+ p.fun(dst, a, params->ith, params->nth, p.userdata); // forward dst, the operand, ith/nth and userdata to the stored callback
14622
15096
  }
14623
15097
 
14624
15098
  // ggml_compute_forward_map_custom2
14625
15099
 
14626
15100
  static void ggml_compute_forward_map_custom2( // calls the two-operand callback stored in dst->op_params
14627
15101
  const struct ggml_compute_params * params,
14628
- const struct ggml_tensor * a,
14629
- const struct ggml_tensor * b,
14630
15102
  struct ggml_tensor * dst) {
14631
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
15103
+
15104
+ const struct ggml_tensor * a = dst->src[0]; // operands are now taken from dst->src[] instead of parameters
15105
+ const struct ggml_tensor * b = dst->src[1];
15106
+
15107
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { // INIT and FINALIZE phases do nothing for this op
14632
15108
  return;
14633
15109
  }
14634
15110
 
14635
- struct ggml_map_custom2_op_params * p = (struct ggml_map_custom2_op_params *) dst->op_params;
15111
+ struct ggml_map_custom2_op_params p;
15112
+ memcpy(&p, dst->op_params, sizeof(p)); // copy the params out instead of reading them through a cast pointer; NOTE(review): presumably for alignment/aliasing safety - confirm
14636
15113
 
14637
- p->fun(dst, a, b, params->ith, params->nth, p->userdata);
15114
+ p.fun(dst, a, b, params->ith, params->nth, p.userdata); // forward dst, both operands, ith/nth and userdata to the stored callback
14638
15115
  }
14639
15116
 
14640
15117
  // ggml_compute_forward_map_custom3
14641
15118
 
14642
15119
  static void ggml_compute_forward_map_custom3( // calls the three-operand callback stored in dst->op_params
14643
15120
  const struct ggml_compute_params * params,
14644
- const struct ggml_tensor * a,
14645
- const struct ggml_tensor * b,
14646
- const struct ggml_tensor * c,
14647
15121
  struct ggml_tensor * dst) {
14648
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
15122
+
15123
+ const struct ggml_tensor * a = dst->src[0]; // operands are now taken from dst->src[0..2] instead of parameters
15124
+ const struct ggml_tensor * b = dst->src[1];
15125
+ const struct ggml_tensor * c = dst->src[2];
15126
+
15127
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { // INIT and FINALIZE phases do nothing for this op
14649
15128
  return;
14650
15129
  }
14651
15130
 
14652
- struct ggml_map_custom3_op_params * p = (struct ggml_map_custom3_op_params *) dst->op_params;
15131
+ struct ggml_map_custom3_op_params p;
15132
+ memcpy(&p, dst->op_params, sizeof(p)); // copy the params out instead of reading them through a cast pointer; NOTE(review): presumably for alignment/aliasing safety - confirm
14653
15133
 
14654
- p->fun(dst, a, b, c, params->ith, params->nth, p->userdata);
15134
+ p.fun(dst, a, b, c, params->ith, params->nth, p.userdata); // forward dst, all three operands, ith/nth and userdata to the stored callback
14655
15135
  }
14656
15136
 
14657
15137
  // ggml_compute_forward_cross_entropy_loss
14658
15138
 
14659
15139
  static void ggml_compute_forward_cross_entropy_loss_f32(
14660
15140
  const struct ggml_compute_params * params,
14661
- const struct ggml_tensor * src0,
14662
- const struct ggml_tensor * src1,
14663
15141
  struct ggml_tensor * dst) {
15142
+
15143
+ const struct ggml_tensor * src0 = dst->src[0];
15144
+ const struct ggml_tensor * src1 = dst->src[1];
15145
+
14664
15146
  GGML_ASSERT(ggml_is_contiguous(src0));
14665
15147
  GGML_ASSERT(ggml_is_contiguous(src1));
14666
15148
  GGML_ASSERT(ggml_is_scalar(dst));
@@ -14677,14 +15159,14 @@ static void ggml_compute_forward_cross_entropy_loss_f32(
14677
15159
 
14678
15160
  GGML_ASSERT(params->wsize >= sizeof(float) * (nth + nth * nc));
14679
15161
 
14680
- if (params->type == GGML_TASK_INIT) {
15162
+ if (params->type == GGML_TASK_TYPE_INIT) {
14681
15163
  if (ith == 0) {
14682
15164
  memset(sums, 0, sizeof(float) * (nth + nth * nc));
14683
15165
  }
14684
15166
  return;
14685
15167
  }
14686
15168
 
14687
- if (params->type == GGML_TASK_FINALIZE) {
15169
+ if (params->type == GGML_TASK_TYPE_FINALIZE) {
14688
15170
  if (ith == 0) {
14689
15171
  float * dp = (float *) dst->data;
14690
15172
  ggml_vec_sum_f32(nth, dp, sums);
@@ -14764,13 +15246,14 @@ static void ggml_compute_forward_cross_entropy_loss_f32(
14764
15246
 
14765
15247
  static void ggml_compute_forward_cross_entropy_loss(
14766
15248
  const struct ggml_compute_params * params,
14767
- const struct ggml_tensor * src0,
14768
- const struct ggml_tensor * src1,
14769
15249
  struct ggml_tensor * dst) {
15250
+
15251
+ const struct ggml_tensor * src0 = dst->src[0];
15252
+
14770
15253
  switch (src0->type) {
14771
15254
  case GGML_TYPE_F32:
14772
15255
  {
14773
- ggml_compute_forward_cross_entropy_loss_f32(params, src0, src1, dst);
15256
+ ggml_compute_forward_cross_entropy_loss_f32(params, dst);
14774
15257
  } break;
14775
15258
  default:
14776
15259
  {
@@ -14783,10 +15266,12 @@ static void ggml_compute_forward_cross_entropy_loss(
14783
15266
 
14784
15267
  static void ggml_compute_forward_cross_entropy_loss_back_f32(
14785
15268
  const struct ggml_compute_params * params,
14786
- const struct ggml_tensor * src0,
14787
- const struct ggml_tensor * src1,
14788
- const struct ggml_tensor * opt0,
14789
15269
  struct ggml_tensor * dst) {
15270
+
15271
+ const struct ggml_tensor * src0 = dst->src[0];
15272
+ const struct ggml_tensor * src1 = dst->src[1];
15273
+ const struct ggml_tensor * opt0 = dst->src[2];
15274
+
14790
15275
  GGML_ASSERT(ggml_is_contiguous(dst));
14791
15276
  GGML_ASSERT(ggml_is_contiguous(src0));
14792
15277
  GGML_ASSERT(ggml_is_contiguous(src1));
@@ -14796,7 +15281,7 @@ static void ggml_compute_forward_cross_entropy_loss_back_f32(
14796
15281
  const int64_t ith = params->ith;
14797
15282
  const int64_t nth = params->nth;
14798
15283
 
14799
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
15284
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
14800
15285
  return;
14801
15286
  }
14802
15287
 
@@ -14873,14 +15358,14 @@ static void ggml_compute_forward_cross_entropy_loss_back_f32(
14873
15358
 
14874
15359
  static void ggml_compute_forward_cross_entropy_loss_back(
14875
15360
  const struct ggml_compute_params * params,
14876
- const struct ggml_tensor * src0,
14877
- const struct ggml_tensor * src1,
14878
- const struct ggml_tensor * opt0,
14879
15361
  struct ggml_tensor * dst) {
15362
+
15363
+ const struct ggml_tensor * src0 = dst->src[0];
15364
+
14880
15365
  switch (src0->type) {
14881
15366
  case GGML_TYPE_F32:
14882
15367
  {
14883
- ggml_compute_forward_cross_entropy_loss_back_f32(params, src0, src1, opt0, dst);
15368
+ ggml_compute_forward_cross_entropy_loss_back_f32(params, dst);
14884
15369
  } break;
14885
15370
  default:
14886
15371
  {
@@ -14903,8 +15388,8 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
14903
15388
  if (skip_cpu) {
14904
15389
  return;
14905
15390
  }
14906
- GGML_ASSERT(tensor->src[0] == NULL || tensor->src[0]->backend == GGML_BACKEND_CPU);
14907
- GGML_ASSERT(tensor->src[1] == NULL || tensor->src[1]->backend == GGML_BACKEND_CPU);
15391
+ GGML_ASSERT(tensor->src[0] == NULL || tensor->src[0]->backend == GGML_BACKEND_TYPE_CPU);
15392
+ GGML_ASSERT(tensor->src[1] == NULL || tensor->src[1]->backend == GGML_BACKEND_TYPE_CPU);
14908
15393
  #elif defined(GGML_USE_VULKAN)
14909
15394
  const bool skip_cpu = ggml_vk_compute_forward_cpu_assist(params, tensor);
14910
15395
  #ifdef GGML_VULKAN_CHECK_RESULTS
@@ -14915,8 +15400,8 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
14915
15400
  if (skip_cpu) {
14916
15401
  return;
14917
15402
  }
14918
- GGML_ASSERT(tensor->src[0] == NULL || tensor->src[0]->backend == GGML_BACKEND_CPU);
14919
- GGML_ASSERT(tensor->src[1] == NULL || tensor->src[1]->backend == GGML_BACKEND_CPU);
15403
+ GGML_ASSERT(tensor->src[0] == NULL || tensor->src[0]->backend == GGML_BACKEND_TYPE_CPU);
15404
+ GGML_ASSERT(tensor->src[1] == NULL || tensor->src[1]->backend == GGML_BACKEND_TYPE_CPU);
14920
15405
  #endif // GGML_USE_CUBLAS
14921
15406
 
14922
15407
  #ifdef GGML_USE_SYCL
@@ -14928,312 +15413,312 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
14928
15413
  switch (tensor->op) {
14929
15414
  case GGML_OP_DUP:
14930
15415
  {
14931
- ggml_compute_forward_dup(params, tensor->src[0], tensor);
15416
+ ggml_compute_forward_dup(params, tensor);
14932
15417
  } break;
14933
15418
  case GGML_OP_ADD:
14934
15419
  {
14935
- ggml_compute_forward_add(params, tensor->src[0], tensor->src[1], tensor);
15420
+ ggml_compute_forward_add(params, tensor);
14936
15421
  } break;
14937
15422
  case GGML_OP_ADD1:
14938
15423
  {
14939
- ggml_compute_forward_add1(params, tensor->src[0], tensor->src[1], tensor);
15424
+ ggml_compute_forward_add1(params, tensor);
14940
15425
  } break;
14941
15426
  case GGML_OP_ACC:
14942
15427
  {
14943
- ggml_compute_forward_acc(params, tensor->src[0], tensor->src[1], tensor);
15428
+ ggml_compute_forward_acc(params, tensor);
14944
15429
  } break;
14945
15430
  case GGML_OP_SUB:
14946
15431
  {
14947
- ggml_compute_forward_sub(params, tensor->src[0], tensor->src[1], tensor);
15432
+ ggml_compute_forward_sub(params, tensor);
14948
15433
  } break;
14949
15434
  case GGML_OP_MUL:
14950
15435
  {
14951
- ggml_compute_forward_mul(params, tensor->src[0], tensor->src[1], tensor);
15436
+ ggml_compute_forward_mul(params, tensor);
14952
15437
  } break;
14953
15438
  case GGML_OP_DIV:
14954
15439
  {
14955
- ggml_compute_forward_div(params, tensor->src[0], tensor->src[1], tensor);
15440
+ ggml_compute_forward_div(params, tensor);
14956
15441
  } break;
14957
15442
  case GGML_OP_SQR:
14958
15443
  {
14959
- ggml_compute_forward_sqr(params, tensor->src[0], tensor);
15444
+ ggml_compute_forward_sqr(params, tensor);
14960
15445
  } break;
14961
15446
  case GGML_OP_SQRT:
14962
15447
  {
14963
- ggml_compute_forward_sqrt(params, tensor->src[0], tensor);
15448
+ ggml_compute_forward_sqrt(params, tensor);
14964
15449
  } break;
14965
15450
  case GGML_OP_LOG:
14966
15451
  {
14967
- ggml_compute_forward_log(params, tensor->src[0], tensor);
15452
+ ggml_compute_forward_log(params, tensor);
14968
15453
  } break;
14969
15454
  case GGML_OP_SUM:
14970
15455
  {
14971
- ggml_compute_forward_sum(params, tensor->src[0], tensor);
15456
+ ggml_compute_forward_sum(params, tensor);
14972
15457
  } break;
14973
15458
  case GGML_OP_SUM_ROWS:
14974
15459
  {
14975
- ggml_compute_forward_sum_rows(params, tensor->src[0], tensor);
15460
+ ggml_compute_forward_sum_rows(params, tensor);
14976
15461
  } break;
14977
15462
  case GGML_OP_MEAN:
14978
15463
  {
14979
- ggml_compute_forward_mean(params, tensor->src[0], tensor);
15464
+ ggml_compute_forward_mean(params, tensor);
14980
15465
  } break;
14981
15466
  case GGML_OP_ARGMAX:
14982
15467
  {
14983
- ggml_compute_forward_argmax(params, tensor->src[0], tensor);
15468
+ ggml_compute_forward_argmax(params, tensor);
14984
15469
  } break;
14985
15470
  case GGML_OP_REPEAT:
14986
15471
  {
14987
- ggml_compute_forward_repeat(params, tensor->src[0], tensor);
15472
+ ggml_compute_forward_repeat(params, tensor);
14988
15473
  } break;
14989
15474
  case GGML_OP_REPEAT_BACK:
14990
15475
  {
14991
- ggml_compute_forward_repeat_back(params, tensor->src[0], tensor);
15476
+ ggml_compute_forward_repeat_back(params, tensor);
14992
15477
  } break;
14993
15478
  case GGML_OP_CONCAT:
14994
15479
  {
14995
- ggml_compute_forward_concat(params, tensor->src[0], tensor->src[1], tensor);
15480
+ ggml_compute_forward_concat(params, tensor);
14996
15481
  } break;
14997
15482
  case GGML_OP_SILU_BACK:
14998
15483
  {
14999
- ggml_compute_forward_silu_back(params, tensor->src[0], tensor->src[1], tensor);
15484
+ ggml_compute_forward_silu_back(params, tensor);
15000
15485
  } break;
15001
15486
  case GGML_OP_NORM:
15002
15487
  {
15003
- ggml_compute_forward_norm(params, tensor->src[0], tensor);
15488
+ ggml_compute_forward_norm(params, tensor);
15004
15489
  } break;
15005
15490
  case GGML_OP_RMS_NORM:
15006
15491
  {
15007
- ggml_compute_forward_rms_norm(params, tensor->src[0], tensor);
15492
+ ggml_compute_forward_rms_norm(params, tensor);
15008
15493
  } break;
15009
15494
  case GGML_OP_RMS_NORM_BACK:
15010
15495
  {
15011
- ggml_compute_forward_rms_norm_back(params, tensor->src[0], tensor->src[1], tensor);
15496
+ ggml_compute_forward_rms_norm_back(params, tensor);
15012
15497
  } break;
15013
15498
  case GGML_OP_GROUP_NORM:
15014
15499
  {
15015
- ggml_compute_forward_group_norm(params, tensor->src[0], tensor);
15500
+ ggml_compute_forward_group_norm(params, tensor);
15016
15501
  } break;
15017
15502
  case GGML_OP_MUL_MAT:
15018
15503
  {
15019
- ggml_compute_forward_mul_mat(params, tensor->src[0], tensor->src[1], tensor);
15504
+ ggml_compute_forward_mul_mat(params, tensor);
15020
15505
  } break;
15021
15506
  case GGML_OP_MUL_MAT_ID:
15022
15507
  {
15023
- ggml_compute_forward_mul_mat_id(params, tensor->src[0], tensor->src[1], tensor);
15508
+ ggml_compute_forward_mul_mat_id(params, tensor);
15024
15509
  } break;
15025
15510
  case GGML_OP_OUT_PROD:
15026
15511
  {
15027
- ggml_compute_forward_out_prod(params, tensor->src[0], tensor->src[1], tensor);
15512
+ ggml_compute_forward_out_prod(params, tensor);
15028
15513
  } break;
15029
15514
  case GGML_OP_SCALE:
15030
15515
  {
15031
- ggml_compute_forward_scale(params, tensor->src[0], tensor);
15516
+ ggml_compute_forward_scale(params, tensor);
15032
15517
  } break;
15033
15518
  case GGML_OP_SET:
15034
15519
  {
15035
- ggml_compute_forward_set(params, tensor->src[0], tensor->src[1], tensor);
15520
+ ggml_compute_forward_set(params, tensor);
15036
15521
  } break;
15037
15522
  case GGML_OP_CPY:
15038
15523
  {
15039
- ggml_compute_forward_cpy(params, tensor->src[0], tensor);
15524
+ ggml_compute_forward_cpy(params, tensor);
15040
15525
  } break;
15041
15526
  case GGML_OP_CONT:
15042
15527
  {
15043
- ggml_compute_forward_cont(params, tensor->src[0], tensor);
15528
+ ggml_compute_forward_cont(params, tensor);
15044
15529
  } break;
15045
15530
  case GGML_OP_RESHAPE:
15046
15531
  {
15047
- ggml_compute_forward_reshape(params, tensor->src[0], tensor);
15532
+ ggml_compute_forward_reshape(params, tensor);
15048
15533
  } break;
15049
15534
  case GGML_OP_VIEW:
15050
15535
  {
15051
- ggml_compute_forward_view(params, tensor->src[0]);
15536
+ ggml_compute_forward_view(params, tensor);
15052
15537
  } break;
15053
15538
  case GGML_OP_PERMUTE:
15054
15539
  {
15055
- ggml_compute_forward_permute(params, tensor->src[0]);
15540
+ ggml_compute_forward_permute(params, tensor);
15056
15541
  } break;
15057
15542
  case GGML_OP_TRANSPOSE:
15058
15543
  {
15059
- ggml_compute_forward_transpose(params, tensor->src[0]);
15544
+ ggml_compute_forward_transpose(params, tensor);
15060
15545
  } break;
15061
15546
  case GGML_OP_GET_ROWS:
15062
15547
  {
15063
- ggml_compute_forward_get_rows(params, tensor->src[0], tensor->src[1], tensor);
15548
+ ggml_compute_forward_get_rows(params, tensor);
15064
15549
  } break;
15065
15550
  case GGML_OP_GET_ROWS_BACK:
15066
15551
  {
15067
- ggml_compute_forward_get_rows_back(params, tensor->src[0], tensor->src[1], tensor);
15552
+ ggml_compute_forward_get_rows_back(params, tensor);
15068
15553
  } break;
15069
15554
  case GGML_OP_DIAG:
15070
15555
  {
15071
- ggml_compute_forward_diag(params, tensor->src[0], tensor);
15556
+ ggml_compute_forward_diag(params, tensor);
15072
15557
  } break;
15073
15558
  case GGML_OP_DIAG_MASK_INF:
15074
15559
  {
15075
- ggml_compute_forward_diag_mask_inf(params, tensor->src[0], tensor);
15560
+ ggml_compute_forward_diag_mask_inf(params, tensor);
15076
15561
  } break;
15077
15562
  case GGML_OP_DIAG_MASK_ZERO:
15078
15563
  {
15079
- ggml_compute_forward_diag_mask_zero(params, tensor->src[0], tensor);
15564
+ ggml_compute_forward_diag_mask_zero(params, tensor);
15080
15565
  } break;
15081
15566
  case GGML_OP_SOFT_MAX:
15082
15567
  {
15083
- ggml_compute_forward_soft_max(params, tensor->src[0], tensor->src[1], tensor);
15568
+ ggml_compute_forward_soft_max(params, tensor);
15084
15569
  } break;
15085
15570
  case GGML_OP_SOFT_MAX_BACK:
15086
15571
  {
15087
- ggml_compute_forward_soft_max_back(params, tensor->src[0], tensor->src[1], tensor);
15572
+ ggml_compute_forward_soft_max_back(params, tensor);
15088
15573
  } break;
15089
15574
  case GGML_OP_ROPE:
15090
15575
  {
15091
- ggml_compute_forward_rope(params, tensor->src[0], tensor->src[1], tensor);
15576
+ ggml_compute_forward_rope(params, tensor);
15092
15577
  } break;
15093
15578
  case GGML_OP_ROPE_BACK:
15094
15579
  {
15095
- ggml_compute_forward_rope_back(params, tensor->src[0], tensor->src[1], tensor);
15580
+ ggml_compute_forward_rope_back(params, tensor);
15096
15581
  } break;
15097
15582
  case GGML_OP_ALIBI:
15098
15583
  {
15099
- ggml_compute_forward_alibi(params, tensor->src[0], tensor);
15584
+ ggml_compute_forward_alibi(params, tensor);
15100
15585
  } break;
15101
15586
  case GGML_OP_CLAMP:
15102
15587
  {
15103
- ggml_compute_forward_clamp(params, tensor->src[0], tensor);
15588
+ ggml_compute_forward_clamp(params, tensor);
15104
15589
  } break;
15105
15590
  case GGML_OP_CONV_TRANSPOSE_1D:
15106
15591
  {
15107
- ggml_compute_forward_conv_transpose_1d(params, tensor->src[0], tensor->src[1], tensor);
15592
+ ggml_compute_forward_conv_transpose_1d(params, tensor);
15108
15593
  } break;
15109
15594
  case GGML_OP_IM2COL:
15110
15595
  {
15111
- ggml_compute_forward_im2col(params, tensor->src[0], tensor->src[1], tensor);
15596
+ ggml_compute_forward_im2col(params, tensor);
15112
15597
  } break;
15113
15598
  case GGML_OP_CONV_TRANSPOSE_2D:
15114
15599
  {
15115
- ggml_compute_forward_conv_transpose_2d(params, tensor->src[0], tensor->src[1], tensor);
15600
+ ggml_compute_forward_conv_transpose_2d(params, tensor);
15116
15601
  } break;
15117
15602
  case GGML_OP_POOL_1D:
15118
15603
  {
15119
- ggml_compute_forward_pool_1d(params, tensor->src[0], tensor);
15604
+ ggml_compute_forward_pool_1d(params, tensor);
15120
15605
  } break;
15121
15606
  case GGML_OP_POOL_2D:
15122
15607
  {
15123
- ggml_compute_forward_pool_2d(params, tensor->src[0], tensor);
15608
+ ggml_compute_forward_pool_2d(params, tensor);
15124
15609
  } break;
15125
15610
  case GGML_OP_UPSCALE:
15126
15611
  {
15127
- ggml_compute_forward_upscale(params, tensor->src[0], tensor);
15612
+ ggml_compute_forward_upscale(params, tensor);
15128
15613
  } break;
15129
15614
  case GGML_OP_PAD:
15130
15615
  {
15131
- ggml_compute_forward_pad(params, tensor->src[0], tensor);
15616
+ ggml_compute_forward_pad(params, tensor);
15132
15617
  } break;
15133
15618
  case GGML_OP_ARGSORT:
15134
15619
  {
15135
- ggml_compute_forward_argsort(params, tensor->src[0], tensor);
15620
+ ggml_compute_forward_argsort(params, tensor);
15136
15621
  } break;
15137
15622
  case GGML_OP_LEAKY_RELU:
15138
15623
  {
15139
- ggml_compute_forward_leaky_relu(params, tensor->src[0], tensor);
15624
+ ggml_compute_forward_leaky_relu(params, tensor);
15140
15625
  } break;
15141
15626
  case GGML_OP_FLASH_ATTN:
15142
15627
  {
15143
15628
  const int32_t t = ggml_get_op_params_i32(tensor, 0);
15144
15629
  GGML_ASSERT(t == 0 || t == 1);
15145
15630
  const bool masked = t != 0;
15146
- ggml_compute_forward_flash_attn(params, tensor->src[0], tensor->src[1], tensor->src[2], masked, tensor);
15631
+ ggml_compute_forward_flash_attn(params, masked, tensor);
15147
15632
  } break;
15148
15633
  case GGML_OP_FLASH_FF:
15149
15634
  {
15150
- ggml_compute_forward_flash_ff(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor->src[3], tensor->src[4], tensor);
15635
+ ggml_compute_forward_flash_ff(params, tensor);
15151
15636
  } break;
15152
15637
  case GGML_OP_FLASH_ATTN_BACK:
15153
15638
  {
15154
15639
  int32_t t = ggml_get_op_params_i32(tensor, 0);
15155
15640
  GGML_ASSERT(t == 0 || t == 1);
15156
15641
  bool masked = t != 0;
15157
- ggml_compute_forward_flash_attn_back(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor->src[3], masked, tensor);
15642
+ ggml_compute_forward_flash_attn_back(params, masked, tensor);
15158
15643
  } break;
15159
15644
  case GGML_OP_WIN_PART:
15160
15645
  {
15161
- ggml_compute_forward_win_part(params, tensor->src[0], tensor);
15646
+ ggml_compute_forward_win_part(params, tensor);
15162
15647
  } break;
15163
15648
  case GGML_OP_WIN_UNPART:
15164
15649
  {
15165
- ggml_compute_forward_win_unpart(params, tensor->src[0], tensor);
15650
+ ggml_compute_forward_win_unpart(params, tensor);
15166
15651
  } break;
15167
15652
  case GGML_OP_UNARY:
15168
15653
  {
15169
- ggml_compute_forward_unary(params, tensor->src[0], tensor);
15654
+ ggml_compute_forward_unary(params, tensor);
15170
15655
  } break;
15171
15656
  case GGML_OP_GET_REL_POS:
15172
15657
  {
15173
- ggml_compute_forward_get_rel_pos(params, tensor->src[0], tensor);
15658
+ ggml_compute_forward_get_rel_pos(params, tensor);
15174
15659
  } break;
15175
15660
  case GGML_OP_ADD_REL_POS:
15176
15661
  {
15177
- ggml_compute_forward_add_rel_pos(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor);
15662
+ ggml_compute_forward_add_rel_pos(params, tensor);
15178
15663
  } break;
15179
15664
  case GGML_OP_MAP_UNARY:
15180
15665
  {
15181
15666
  ggml_unary_op_f32_t fun;
15182
15667
  memcpy(&fun, tensor->op_params, sizeof(fun));
15183
- ggml_compute_forward_map_unary(params, tensor->src[0], tensor, fun);
15668
+ ggml_compute_forward_map_unary(params, tensor, fun);
15184
15669
  }
15185
15670
  break;
15186
15671
  case GGML_OP_MAP_BINARY:
15187
15672
  {
15188
15673
  ggml_binary_op_f32_t fun;
15189
15674
  memcpy(&fun, tensor->op_params, sizeof(fun));
15190
- ggml_compute_forward_map_binary(params, tensor->src[0], tensor->src[1], tensor, fun);
15675
+ ggml_compute_forward_map_binary(params, tensor, fun);
15191
15676
  }
15192
15677
  break;
15193
15678
  case GGML_OP_MAP_CUSTOM1_F32:
15194
15679
  {
15195
15680
  ggml_custom1_op_f32_t fun;
15196
15681
  memcpy(&fun, tensor->op_params, sizeof(fun));
15197
- ggml_compute_forward_map_custom1_f32(params, tensor->src[0], tensor, fun);
15682
+ ggml_compute_forward_map_custom1_f32(params, tensor, fun);
15198
15683
  }
15199
15684
  break;
15200
15685
  case GGML_OP_MAP_CUSTOM2_F32:
15201
15686
  {
15202
15687
  ggml_custom2_op_f32_t fun;
15203
15688
  memcpy(&fun, tensor->op_params, sizeof(fun));
15204
- ggml_compute_forward_map_custom2_f32(params, tensor->src[0], tensor->src[1], tensor, fun);
15689
+ ggml_compute_forward_map_custom2_f32(params, tensor, fun);
15205
15690
  }
15206
15691
  break;
15207
15692
  case GGML_OP_MAP_CUSTOM3_F32:
15208
15693
  {
15209
15694
  ggml_custom3_op_f32_t fun;
15210
15695
  memcpy(&fun, tensor->op_params, sizeof(fun));
15211
- ggml_compute_forward_map_custom3_f32(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor, fun);
15696
+ ggml_compute_forward_map_custom3_f32(params, tensor, fun);
15212
15697
  }
15213
15698
  break;
15214
15699
  case GGML_OP_MAP_CUSTOM1:
15215
15700
  {
15216
- ggml_compute_forward_map_custom1(params, tensor->src[0], tensor);
15701
+ ggml_compute_forward_map_custom1(params, tensor);
15217
15702
  }
15218
15703
  break;
15219
15704
  case GGML_OP_MAP_CUSTOM2:
15220
15705
  {
15221
- ggml_compute_forward_map_custom2(params, tensor->src[0], tensor->src[1], tensor);
15706
+ ggml_compute_forward_map_custom2(params, tensor);
15222
15707
  }
15223
15708
  break;
15224
15709
  case GGML_OP_MAP_CUSTOM3:
15225
15710
  {
15226
- ggml_compute_forward_map_custom3(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor);
15711
+ ggml_compute_forward_map_custom3(params, tensor);
15227
15712
  }
15228
15713
  break;
15229
15714
  case GGML_OP_CROSS_ENTROPY_LOSS:
15230
15715
  {
15231
- ggml_compute_forward_cross_entropy_loss(params, tensor->src[0], tensor->src[1], tensor);
15716
+ ggml_compute_forward_cross_entropy_loss(params, tensor);
15232
15717
  }
15233
15718
  break;
15234
15719
  case GGML_OP_CROSS_ENTROPY_LOSS_BACK:
15235
15720
  {
15236
- ggml_compute_forward_cross_entropy_loss_back(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor);
15721
+ ggml_compute_forward_cross_entropy_loss_back(params, tensor);
15237
15722
  }
15238
15723
  break;
15239
15724
  case GGML_OP_NONE:
@@ -16462,7 +16947,7 @@ size_t ggml_graph_overhead(void) {
16462
16947
 
16463
16948
  struct ggml_cgraph * ggml_new_graph_custom(struct ggml_context * ctx, size_t size, bool grads) {
16464
16949
  const size_t obj_size = ggml_graph_nbytes(size, grads);
16465
- struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_GRAPH, obj_size);
16950
+ struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_TYPE_GRAPH, obj_size);
16466
16951
  struct ggml_cgraph * cgraph = (struct ggml_cgraph *) ((char *) ctx->mem_buffer + obj->offs);
16467
16952
 
16468
16953
  struct ggml_tensor ** data_start = (struct ggml_tensor **) (cgraph + 1);
@@ -16637,27 +17122,47 @@ typedef pthread_t ggml_thread_t;
16637
17122
  #endif
16638
17123
 
16639
17124
  // Android's libc implementation "bionic" does not support setting affinity
16640
- #if defined(__linux__) && !defined(__BIONIC__)
16641
- static void set_numa_thread_affinity(int thread_n, int n_threads) {
17125
+ #if defined(__gnu_linux__)
17126
+ static void set_numa_thread_affinity(int thread_n) {
16642
17127
  if (!ggml_is_numa()) {
16643
17128
  return;
16644
17129
  }
16645
17130
 
16646
- // run thread on node_num thread_n / (threads per node)
16647
- const int node_num = thread_n / ((n_threads + g_state.numa.n_nodes - 1) / g_state.numa.n_nodes);
16648
- struct ggml_numa_node * node = &g_state.numa.nodes[node_num];
17131
+ int node_num;
17132
+ int rv;
16649
17133
  size_t setsize = CPU_ALLOC_SIZE(g_state.numa.total_cpus);
16650
17134
 
17135
+ switch(g_state.numa.numa_strategy) {
17136
+ case GGML_NUMA_STRATEGY_DISTRIBUTE:
17137
+ // run thread on node_num thread_n / (threads per node)
17138
+ node_num = thread_n % g_state.numa.n_nodes;
17139
+ break;
17140
+ case GGML_NUMA_STRATEGY_ISOLATE:
17141
+ // run thread on current_node
17142
+ node_num = g_state.numa.current_node;
17143
+ break;
17144
+ case GGML_NUMA_STRATEGY_NUMACTL:
17145
+ // use the cpuset that numactl gave us
17146
+ rv = pthread_setaffinity_np(pthread_self(), setsize, &g_state.numa.cpuset);
17147
+ if (rv) {
17148
+ fprintf(stderr, "warning: pthread_setaffinity_np() failed: %s\n",strerror(rv));
17149
+ }
17150
+ return;
17151
+ default:
17152
+ return;
17153
+ }
17154
+
17155
+ struct ggml_numa_node * node = &g_state.numa.nodes[node_num];
17156
+
16651
17157
  cpu_set_t * cpus = CPU_ALLOC(g_state.numa.total_cpus);
16652
17158
  CPU_ZERO_S(setsize, cpus);
16653
17159
  for (size_t i = 0; i < node->n_cpus; ++i) {
16654
17160
  CPU_SET_S(node->cpus[i], setsize, cpus);
16655
17161
  }
16656
17162
 
16657
- int rv = pthread_setaffinity_np(pthread_self(), setsize, cpus);
17163
+ rv = pthread_setaffinity_np(pthread_self(), setsize, cpus);
16658
17164
  if (rv) {
16659
- fprintf(stderr, "warning: pthread_setaffinity_np() failed: %s\n",
16660
- strerror(rv));
17165
+ fprintf(stderr, "warning: pthread_setaffinity_np() failed: %s\n", strerror(rv));
16661
17166
  }
16662
17167
 
16663
17168
  CPU_FREE(cpus);
@@ -16678,8 +17183,7 @@ static void clear_numa_thread_affinity(void) {
16678
17183
 
16679
17184
  int rv = pthread_setaffinity_np(pthread_self(), setsize, cpus);
16680
17185
  if (rv) {
16681
- fprintf(stderr, "warning: pthread_setaffinity_np() failed: %s\n",
16682
- strerror(rv));
17186
+ fprintf(stderr, "warning: pthread_setaffinity_np() failed: %s\n", strerror(rv));
16683
17187
  }
16684
17188
 
16685
17189
  CPU_FREE(cpus);
@@ -16687,7 +17191,7 @@ static void clear_numa_thread_affinity(void) {
16687
17191
  #else
16688
17192
  // TODO: Windows etc.
16689
17193
  // (the linux implementation may also work on BSD, someone should test)
16690
- static void set_numa_thread_affinity(int thread_n, int n_threads) { UNUSED(thread_n); UNUSED(n_threads); }
17194
+ static void set_numa_thread_affinity(int thread_n) { UNUSED(thread_n); }
16691
17195
  static void clear_numa_thread_affinity(void) {}
16692
17196
  #endif
16693
17197
 
@@ -16893,29 +17397,32 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
16893
17397
  } break;
16894
17398
  case GGML_OP_MAP_CUSTOM1:
16895
17399
  {
16896
- struct ggml_map_custom1_op_params * p = (struct ggml_map_custom1_op_params *) node->op_params;
16897
- if (p->n_tasks == GGML_N_TASKS_MAX) {
17400
+ struct ggml_map_custom1_op_params p;
17401
+ memcpy(&p, node->op_params, sizeof(p));
17402
+ if (p.n_tasks == GGML_N_TASKS_MAX) {
16898
17403
  n_tasks = n_threads;
16899
17404
  } else {
16900
- n_tasks = MIN(p->n_tasks, n_threads);
17405
+ n_tasks = MIN(p.n_tasks, n_threads);
16901
17406
  }
16902
17407
  } break;
16903
17408
  case GGML_OP_MAP_CUSTOM2:
16904
17409
  {
16905
- struct ggml_map_custom2_op_params * p = (struct ggml_map_custom2_op_params *) node->op_params;
16906
- if (p->n_tasks == GGML_N_TASKS_MAX) {
17410
+ struct ggml_map_custom2_op_params p;
17411
+ memcpy(&p, node->op_params, sizeof(p));
17412
+ if (p.n_tasks == GGML_N_TASKS_MAX) {
16907
17413
  n_tasks = n_threads;
16908
17414
  } else {
16909
- n_tasks = MIN(p->n_tasks, n_threads);
17415
+ n_tasks = MIN(p.n_tasks, n_threads);
16910
17416
  }
16911
17417
  } break;
16912
17418
  case GGML_OP_MAP_CUSTOM3:
16913
17419
  {
16914
- struct ggml_map_custom3_op_params * p = (struct ggml_map_custom3_op_params *) node->op_params;
16915
- if (p->n_tasks == GGML_N_TASKS_MAX) {
17420
+ struct ggml_map_custom3_op_params p;
17421
+ memcpy(&p, node->op_params, sizeof(p));
17422
+ if (p.n_tasks == GGML_N_TASKS_MAX) {
16916
17423
  n_tasks = n_threads;
16917
17424
  } else {
16918
- n_tasks = MIN(p->n_tasks, n_threads);
17425
+ n_tasks = MIN(p.n_tasks, n_threads);
16919
17426
  }
16920
17427
  } break;
16921
17428
  case GGML_OP_CROSS_ENTROPY_LOSS:
@@ -16987,10 +17494,10 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
16987
17494
 
16988
17495
  const int n_threads = state->shared->n_threads;
16989
17496
 
16990
- set_numa_thread_affinity(state->ith, n_threads);
17497
+ set_numa_thread_affinity(state->ith);
16991
17498
 
16992
17499
  int node_n = -1;
16993
- int task_phase = GGML_TASK_FINALIZE;
17500
+ int task_phase = GGML_TASK_TYPE_FINALIZE;
16994
17501
 
16995
17502
  while (true) {
16996
17503
  if (cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) {
@@ -17002,7 +17509,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
17002
17509
  // all other threads are finished and spinning
17003
17510
  // do finalize and init here so we don't have synchronize again
17004
17511
  struct ggml_compute_params params = {
17005
- /*.type =*/ GGML_TASK_FINALIZE,
17512
+ /*.type =*/ GGML_TASK_TYPE_FINALIZE,
17006
17513
  /*.ith =*/ 0,
17007
17514
  /*.nth =*/ 0,
17008
17515
  /*.wsize =*/ cplan->work_size,
@@ -17033,17 +17540,17 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
17033
17540
  if (n_tasks == 1) {
17034
17541
  /* INIT */
17035
17542
  if (GGML_OP_HAS_INIT[node->op]) {
17036
- params.type = GGML_TASK_INIT;
17543
+ params.type = GGML_TASK_TYPE_INIT;
17037
17544
  ggml_compute_forward(&params, node);
17038
17545
  }
17039
17546
 
17040
17547
  // TODO: maybe push node_n to the atomic but if other threads see n_tasks is 1,
17041
17548
  // they do something more efficient than spinning (?)
17042
- params.type = GGML_TASK_COMPUTE;
17549
+ params.type = GGML_TASK_TYPE_COMPUTE;
17043
17550
  ggml_compute_forward(&params, node);
17044
17551
 
17045
17552
  if (GGML_OP_HAS_FINALIZE[node->op]) {
17046
- params.type = GGML_TASK_FINALIZE;
17553
+ params.type = GGML_TASK_TYPE_FINALIZE;
17047
17554
  ggml_compute_forward(&params, node);
17048
17555
  }
17049
17556
 
@@ -17057,7 +17564,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
17057
17564
  }
17058
17565
  }
17059
17566
 
17060
- task_phase = GGML_TASK_INIT;
17567
+ task_phase = GGML_TASK_TYPE_INIT;
17061
17568
  atomic_store(&state->shared->n_active, n_threads);
17062
17569
  atomic_store(&state->shared->node_n, node_n);
17063
17570
  atomic_store(&state->shared->node_task, task_phase);
@@ -17074,7 +17581,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
17074
17581
  const int n_tasks = ggml_get_n_tasks(node, n_threads);
17075
17582
 
17076
17583
  struct ggml_compute_params params = {
17077
- /*.type =*/ GGML_TASK_INIT,
17584
+ /*.type =*/ GGML_TASK_TYPE_INIT,
17078
17585
  /*.ith =*/ state->ith,
17079
17586
  /*.nth =*/ n_tasks,
17080
17587
  /*.wsize =*/ cplan->work_size,
@@ -17088,7 +17595,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
17088
17595
  }
17089
17596
 
17090
17597
  if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) {
17091
- task_phase = GGML_TASK_COMPUTE;
17598
+ task_phase = GGML_TASK_TYPE_COMPUTE;
17092
17599
  atomic_store(&state->shared->n_active, n_threads);
17093
17600
  atomic_store(&state->shared->node_task, task_phase);
17094
17601
  }
@@ -17103,12 +17610,12 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
17103
17610
  }
17104
17611
 
17105
17612
  if (state->ith < n_tasks) {
17106
- params.type = GGML_TASK_COMPUTE;
17613
+ params.type = GGML_TASK_TYPE_COMPUTE;
17107
17614
  ggml_compute_forward(&params, node);
17108
17615
  }
17109
17616
 
17110
17617
  if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) {
17111
- task_phase = GGML_TASK_FINALIZE;
17618
+ task_phase = GGML_TASK_TYPE_FINALIZE;
17112
17619
  atomic_store(&state->shared->n_active, n_threads);
17113
17620
  atomic_store(&state->shared->node_task, task_phase);
17114
17621
  }
@@ -17344,7 +17851,7 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
17344
17851
  /*.n_threads =*/ n_threads,
17345
17852
  /*.n_active =*/ n_threads,
17346
17853
  /*.node_n =*/ -1,
17347
- /*.node_task =*/ GGML_TASK_FINALIZE,
17854
+ /*.node_task =*/ GGML_TASK_TYPE_FINALIZE,
17348
17855
  /*.abort_callback =*/ NULL,
17349
17856
  /*.abort_callback_data =*/ NULL,
17350
17857
  };
@@ -17412,7 +17919,7 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
17412
17919
  void ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads) {
17413
17920
  struct ggml_cplan cplan = ggml_graph_plan(cgraph, n_threads);
17414
17921
 
17415
- struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_WORK_BUFFER, cplan.work_size);
17922
+ struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_TYPE_WORK_BUFFER, cplan.work_size);
17416
17923
 
17417
17924
  cplan.work_data = (uint8_t *)ctx->mem_buffer + obj->offs;
17418
17925
 
@@ -17793,7 +18300,7 @@ struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context *
17793
18300
 
17794
18301
  ptr += ggml_nbytes(tensor);
17795
18302
 
17796
- fprintf(stderr, "%s: loaded leaf %d: '%16s', %9zu bytes\n", __func__, i, tensor->name, ggml_nbytes(tensor));
18303
+ fprintf(stderr, "%s: loaded leaf %u: '%16s', %9zu bytes\n", __func__, i, tensor->name, ggml_nbytes(tensor));
17797
18304
  }
17798
18305
  }
17799
18306
 
@@ -17896,7 +18403,7 @@ struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context *
17896
18403
 
17897
18404
  result->nodes[i] = tensor;
17898
18405
 
17899
- fprintf(stderr, "%s: loaded node %d: '%16s', %9zu bytes\n", __func__, i, tensor->name, ggml_nbytes(tensor));
18406
+ fprintf(stderr, "%s: loaded node %u: '%16s', %9zu bytes\n", __func__, i, tensor->name, ggml_nbytes(tensor));
17900
18407
  }
17901
18408
  }
17902
18409
  }
@@ -18220,7 +18727,7 @@ static enum ggml_opt_result ggml_opt_adam(
18220
18727
  float * pf = params.past > 0 ? opt->adam.pf->data : NULL; // past function values
18221
18728
 
18222
18729
  struct ggml_cplan cplan = ggml_graph_plan(gb, params.n_threads);
18223
- struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_WORK_BUFFER, cplan.work_size);
18730
+ struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_TYPE_WORK_BUFFER, cplan.work_size);
18224
18731
  cplan.work_data = (uint8_t *)ctx->mem_buffer + obj->offs;
18225
18732
 
18226
18733
  bool cancel = false;
@@ -18232,7 +18739,7 @@ static enum ggml_opt_result ggml_opt_adam(
18232
18739
  if (callback) {
18233
18740
  callback(callback_data, accum_step, &sched, &cancel);
18234
18741
  if (cancel) {
18235
- return GGML_OPT_CANCEL;
18742
+ return GGML_OPT_RESULT_CANCEL;
18236
18743
  }
18237
18744
  }
18238
18745
  // ggml_graph_reset (gf);
@@ -18323,7 +18830,7 @@ static enum ggml_opt_result ggml_opt_adam(
18323
18830
  if (callback) {
18324
18831
  callback(callback_data, accum_step, &sched, &cancel);
18325
18832
  if (cancel) {
18326
- return GGML_OPT_CANCEL;;
18833
+ return GGML_OPT_RESULT_CANCEL;;
18327
18834
  }
18328
18835
  }
18329
18836
  // ggml_graph_reset (gf);
@@ -18340,7 +18847,7 @@ static enum ggml_opt_result ggml_opt_adam(
18340
18847
  if (fabsf(fx - fx_prev[0])/fx < params.adam.eps_f) {
18341
18848
  GGML_PRINT_DEBUG("converged\n");
18342
18849
 
18343
- return GGML_OPT_OK;
18850
+ return GGML_OPT_RESULT_OK;
18344
18851
  }
18345
18852
 
18346
18853
  // delta-based convergence test
@@ -18350,7 +18857,7 @@ static enum ggml_opt_result ggml_opt_adam(
18350
18857
  const float rate = (pf[(iter0 + t)%params.past] - fx)/fx;
18351
18858
 
18352
18859
  if (fabsf(rate) < params.delta) {
18353
- return GGML_OPT_OK;
18860
+ return GGML_OPT_RESULT_OK;
18354
18861
  }
18355
18862
  }
18356
18863
 
@@ -18366,7 +18873,7 @@ static enum ggml_opt_result ggml_opt_adam(
18366
18873
  ++n_no_improvement[0];
18367
18874
 
18368
18875
  if (n_no_improvement[0] >= params.max_no_improvement) {
18369
- return GGML_OPT_OK;
18876
+ return GGML_OPT_RESULT_OK;
18370
18877
  }
18371
18878
  }
18372
18879
  }
@@ -18384,7 +18891,7 @@ static enum ggml_opt_result ggml_opt_adam(
18384
18891
  }
18385
18892
  }
18386
18893
 
18387
- return GGML_OPT_DID_NOT_CONVERGE;
18894
+ return GGML_OPT_RESULT_DID_NOT_CONVERGE;
18388
18895
  }
18389
18896
 
18390
18897
  //
@@ -18465,7 +18972,7 @@ static enum ggml_opt_result linesearch_backtracking(
18465
18972
  float sched = 0;
18466
18973
  callback(callback_data, accum_step, &sched, cancel);
18467
18974
  if (*cancel) {
18468
- return GGML_OPT_CANCEL;
18975
+ return GGML_OPT_RESULT_CANCEL;
18469
18976
  }
18470
18977
  }
18471
18978
  // ggml_graph_reset (gf);
@@ -18521,7 +19028,9 @@ static enum ggml_opt_result linesearch_backtracking(
18521
19028
  (*step) *= width;
18522
19029
  }
18523
19030
 
18524
- GGML_UNREACHABLE();
19031
+ GGML_ASSERT(false && "line search failed");
19032
+
19033
+ return GGML_LINESEARCH_FAIL;
18525
19034
  }
18526
19035
 
18527
19036
  static enum ggml_opt_result ggml_opt_lbfgs(
@@ -18536,7 +19045,7 @@ static enum ggml_opt_result ggml_opt_lbfgs(
18536
19045
  if (params.lbfgs.linesearch == GGML_LINESEARCH_BACKTRACKING_WOLFE ||
18537
19046
  params.lbfgs.linesearch == GGML_LINESEARCH_BACKTRACKING_STRONG_WOLFE) {
18538
19047
  if (params.lbfgs.wolfe <= params.lbfgs.ftol || 1.f <= params.lbfgs.wolfe) {
18539
- return GGML_OPT_INVALID_WOLFE;
19048
+ return GGML_OPT_RESULT_INVALID_WOLFE;
18540
19049
  }
18541
19050
  }
18542
19051
 
@@ -18565,7 +19074,7 @@ static enum ggml_opt_result ggml_opt_lbfgs(
18565
19074
  }
18566
19075
 
18567
19076
  struct ggml_cplan cplan = ggml_graph_plan(gb, params.n_threads);
18568
- struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_WORK_BUFFER, cplan.work_size);
19077
+ struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_TYPE_WORK_BUFFER, cplan.work_size);
18569
19078
  cplan.work_data = (uint8_t *)ctx->mem_buffer + obj->offs;
18570
19079
 
18571
19080
  float * x = opt->lbfgs.x->data; // current parameters
@@ -18606,7 +19115,7 @@ static enum ggml_opt_result ggml_opt_lbfgs(
18606
19115
  float sched = 0;
18607
19116
  callback(callback_data, accum_step, &sched, &cancel);
18608
19117
  if (cancel) {
18609
- return GGML_OPT_CANCEL;
19118
+ return GGML_OPT_RESULT_CANCEL;
18610
19119
  }
18611
19120
  }
18612
19121
  // ggml_graph_reset (gf);
@@ -18634,7 +19143,7 @@ static enum ggml_opt_result ggml_opt_lbfgs(
18634
19143
 
18635
19144
  // already optimized
18636
19145
  if (gnorm/xnorm <= params.lbfgs.eps) {
18637
- return GGML_OPT_OK;
19146
+ return GGML_OPT_RESULT_OK;
18638
19147
  }
18639
19148
 
18640
19149
  if (opt->just_initialized) {
@@ -18679,7 +19188,7 @@ static enum ggml_opt_result ggml_opt_lbfgs(
18679
19188
  // way to test and don't want to break something with so many changes lined up
18680
19189
  ls = linesearch_backtracking(&params, nx, x, &fx, g, d, step, xp, f, gb, &cplan, np, ps, &cancel, callback, callback_data);
18681
19190
  if (cancel) {
18682
- return GGML_OPT_CANCEL;
19191
+ return GGML_OPT_RESULT_CANCEL;
18683
19192
  }
18684
19193
 
18685
19194
  if (ls < 0) {
@@ -18702,7 +19211,7 @@ static enum ggml_opt_result ggml_opt_lbfgs(
18702
19211
  }
18703
19212
  if (gnorm/xnorm <= params.lbfgs.eps) {
18704
19213
  // converged
18705
- return GGML_OPT_OK;
19214
+ return GGML_OPT_RESULT_OK;
18706
19215
  }
18707
19216
 
18708
19217
  // delta-based convergence test
@@ -18712,7 +19221,7 @@ static enum ggml_opt_result ggml_opt_lbfgs(
18712
19221
  const float rate = (pf[k[0]%params.past] - fx)/fx;
18713
19222
 
18714
19223
  if (fabsf(rate) < params.delta) {
18715
- return GGML_OPT_OK;
19224
+ return GGML_OPT_RESULT_OK;
18716
19225
  }
18717
19226
  }
18718
19227
 
@@ -18728,14 +19237,14 @@ static enum ggml_opt_result ggml_opt_lbfgs(
18728
19237
  n_no_improvement[0]++;
18729
19238
 
18730
19239
  if (n_no_improvement[0] >= params.max_no_improvement) {
18731
- return GGML_OPT_OK;
19240
+ return GGML_OPT_RESULT_OK;
18732
19241
  }
18733
19242
  }
18734
19243
  }
18735
19244
 
18736
19245
  if (params.lbfgs.n_iter != 0 && params.lbfgs.n_iter < it + 1) {
18737
19246
  // reached the maximum number of iterations
18738
- return GGML_OPT_DID_NOT_CONVERGE;
19247
+ return GGML_OPT_RESULT_DID_NOT_CONVERGE;
18739
19248
  }
18740
19249
 
18741
19250
  // update vectors s and y:
@@ -18789,17 +19298,19 @@ static enum ggml_opt_result ggml_opt_lbfgs(
18789
19298
  step[0] = 1.0;
18790
19299
  }
18791
19300
 
18792
- GGML_UNREACHABLE();
19301
+ GGML_ASSERT(false && "lbfgs failed");
19302
+
19303
+ return GGML_OPT_RESULT_DID_NOT_CONVERGE;
18793
19304
  }
18794
19305
 
18795
19306
  struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type) {
18796
19307
  struct ggml_opt_params result;
18797
19308
 
18798
19309
  switch (type) {
18799
- case GGML_OPT_ADAM:
19310
+ case GGML_OPT_TYPE_ADAM:
18800
19311
  {
18801
19312
  result = (struct ggml_opt_params) {
18802
- .type = GGML_OPT_ADAM,
19313
+ .type = GGML_OPT_TYPE_ADAM,
18803
19314
  .graph_size = GGML_DEFAULT_GRAPH_SIZE,
18804
19315
  .n_threads = 1, // FIXME: GGML_DEFAULT_N_THREADS ?
18805
19316
  .past = 0,
@@ -18827,10 +19338,10 @@ struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type) {
18827
19338
  },
18828
19339
  };
18829
19340
  } break;
18830
- case GGML_OPT_LBFGS:
19341
+ case GGML_OPT_TYPE_LBFGS:
18831
19342
  {
18832
19343
  result = (struct ggml_opt_params) {
18833
- .type = GGML_OPT_LBFGS,
19344
+ .type = GGML_OPT_TYPE_LBFGS,
18834
19345
  .graph_size = GGML_DEFAULT_GRAPH_SIZE,
18835
19346
  .n_threads = 1,
18836
19347
  .past = 0,
@@ -18875,12 +19386,12 @@ GGML_API void ggml_opt_init(
18875
19386
  opt->just_initialized = true;
18876
19387
  if (opt->ctx == NULL) {
18877
19388
  struct ggml_init_params ctx_opt_params;
18878
- if (opt->params.type == GGML_OPT_ADAM) {
19389
+ if (opt->params.type == GGML_OPT_TYPE_ADAM) {
18879
19390
  ctx_opt_params.mem_size = GGML_MEM_ALIGN*3 + ggml_tensor_overhead()*3 + ggml_type_size(GGML_TYPE_F32)*nx*3;
18880
19391
  if (opt->params.past > 0) {
18881
19392
  ctx_opt_params.mem_size += GGML_MEM_ALIGN + ggml_tensor_overhead() + ggml_type_size(GGML_TYPE_F32)*opt->params.past;
18882
19393
  }
18883
- } else if (opt->params.type == GGML_OPT_LBFGS) {
19394
+ } else if (opt->params.type == GGML_OPT_TYPE_LBFGS) {
18884
19395
  ctx_opt_params.mem_size = GGML_MEM_ALIGN*9 + ggml_tensor_overhead()*9 + ggml_type_size(GGML_TYPE_F32)*(nx*5 + opt->params.lbfgs.m*2 + nx*opt->params.lbfgs.m*2);
18885
19396
  if (opt->params.past > 0) {
18886
19397
  ctx_opt_params.mem_size += GGML_MEM_ALIGN + ggml_tensor_overhead() + ggml_type_size(GGML_TYPE_F32)*opt->params.past;
@@ -18892,7 +19403,7 @@ GGML_API void ggml_opt_init(
18892
19403
  opt->ctx = ggml_init(ctx_opt_params);
18893
19404
  }
18894
19405
  switch (opt->params.type) {
18895
- case GGML_OPT_ADAM:
19406
+ case GGML_OPT_TYPE_ADAM:
18896
19407
  {
18897
19408
  opt->adam.g = ggml_new_tensor_1d(opt->ctx, GGML_TYPE_F32, nx);
18898
19409
  opt->adam.m = ggml_new_tensor_1d(opt->ctx, GGML_TYPE_F32, nx);
@@ -18906,7 +19417,7 @@ GGML_API void ggml_opt_init(
18906
19417
  ggml_set_zero(opt->adam.pf);
18907
19418
  }
18908
19419
  } break;
18909
- case GGML_OPT_LBFGS:
19420
+ case GGML_OPT_TYPE_LBFGS:
18910
19421
  {
18911
19422
  opt->lbfgs.x = ggml_new_tensor_1d(opt->ctx, GGML_TYPE_F32, nx);
18912
19423
  opt->lbfgs.xp = ggml_new_tensor_1d(opt->ctx, GGML_TYPE_F32, nx);
@@ -18950,13 +19461,13 @@ enum ggml_opt_result ggml_opt(
18950
19461
 
18951
19462
  ctx = ggml_init(params_ctx);
18952
19463
  if (ctx == NULL) {
18953
- return GGML_OPT_NO_CONTEXT;
19464
+ return GGML_OPT_RESULT_NO_CONTEXT;
18954
19465
  }
18955
19466
 
18956
19467
  free_ctx = true;
18957
19468
  }
18958
19469
 
18959
- enum ggml_opt_result result = GGML_OPT_OK;
19470
+ enum ggml_opt_result result = GGML_OPT_RESULT_OK;
18960
19471
 
18961
19472
  struct ggml_opt_context * opt = (struct ggml_opt_context *) alloca(sizeof(struct ggml_opt_context));
18962
19473
 
@@ -18995,14 +19506,14 @@ enum ggml_opt_result ggml_opt_resume_g(
18995
19506
  void * callback_data) {
18996
19507
 
18997
19508
  // build forward + backward compute graphs
18998
- enum ggml_opt_result result = GGML_OPT_OK;
19509
+ enum ggml_opt_result result = GGML_OPT_RESULT_OK;
18999
19510
 
19000
19511
  switch (opt->params.type) {
19001
- case GGML_OPT_ADAM:
19512
+ case GGML_OPT_TYPE_ADAM:
19002
19513
  {
19003
19514
  result = ggml_opt_adam(ctx, opt, opt->params, f, gf, gb, callback, callback_data);
19004
19515
  } break;
19005
- case GGML_OPT_LBFGS:
19516
+ case GGML_OPT_TYPE_LBFGS:
19006
19517
  {
19007
19518
  result = ggml_opt_lbfgs(ctx, opt, opt->params, f, gf, gb, callback, callback_data);
19008
19519
  } break;
@@ -19037,9 +19548,12 @@ void ggml_quantize_init(enum ggml_type type) {
19037
19548
  ggml_critical_section_start();
19038
19549
 
19039
19550
  switch (type) {
19040
- case GGML_TYPE_IQ2_XXS: iq2xs_init_impl(256); break;
19041
- case GGML_TYPE_IQ2_XS: iq2xs_init_impl(512); break;
19551
+ case GGML_TYPE_IQ2_XXS:
19552
+ case GGML_TYPE_IQ2_XS:
19553
+ case GGML_TYPE_IQ2_S:
19554
+ case GGML_TYPE_IQ1_S: iq2xs_init_impl(type); break;
19042
19555
  case GGML_TYPE_IQ3_XXS: iq3xs_init_impl(256); break;
19556
+ case GGML_TYPE_IQ3_S: iq3xs_init_impl(512); break;
19043
19557
  default: // nothing
19044
19558
  break;
19045
19559
  }
@@ -19050,8 +19564,10 @@ void ggml_quantize_init(enum ggml_type type) {
19050
19564
  void ggml_quantize_free(void) {
19051
19565
  ggml_critical_section_start();
19052
19566
 
19053
- iq2xs_free_impl(256);
19054
- iq2xs_free_impl(512);
19567
+ iq2xs_free_impl(GGML_TYPE_IQ2_XXS);
19568
+ iq2xs_free_impl(GGML_TYPE_IQ2_XS);
19569
+ iq2xs_free_impl(GGML_TYPE_IQ1_S);
19570
+ iq3xs_free_impl(256);
19055
19571
 
19056
19572
  ggml_critical_section_end();
19057
19573
  }
@@ -19186,7 +19702,8 @@ size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t *
19186
19702
  bool ggml_quantize_requires_imatrix(enum ggml_type type) {
19187
19703
  return
19188
19704
  type == GGML_TYPE_IQ2_XXS ||
19189
- type == GGML_TYPE_IQ2_XS;
19705
+ type == GGML_TYPE_IQ2_XS ||
19706
+ type == GGML_TYPE_IQ1_S;
19190
19707
  }
19191
19708
 
19192
19709
  size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start,
@@ -19311,6 +19828,56 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i
19311
19828
  result = quantize_iq3_xxs(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
19312
19829
  GGML_ASSERT(result == row_size * nrows);
19313
19830
  } break;
19831
+ case GGML_TYPE_IQ3_S:
19832
+ {
19833
+ GGML_ASSERT(start % QK_K == 0);
19834
+ GGML_ASSERT(start % n_per_row == 0);
19835
+ size_t start_row = start / n_per_row;
19836
+ size_t row_size = ggml_row_size(type, n_per_row);
19837
+ result = quantize_iq3_s(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
19838
+ GGML_ASSERT(result == row_size * nrows);
19839
+ } break;
19840
+ case GGML_TYPE_IQ2_S:
19841
+ {
19842
+ GGML_ASSERT(start % QK_K == 0);
19843
+ GGML_ASSERT(start % n_per_row == 0);
19844
+ size_t start_row = start / n_per_row;
19845
+ size_t row_size = ggml_row_size(type, n_per_row);
19846
+ result = quantize_iq2_s(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
19847
+ GGML_ASSERT(result == row_size * nrows);
19848
+ } break;
19849
+ case GGML_TYPE_IQ1_S:
19850
+ {
19851
+ GGML_ASSERT(start % QK_K == 0);
19852
+ GGML_ASSERT(start % n_per_row == 0);
19853
+ size_t start_row = start / n_per_row;
19854
+ size_t row_size = ggml_row_size(type, n_per_row);
19855
+ result = quantize_iq1_s(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
19856
+ GGML_ASSERT(result == row_size * nrows);
19857
+ } break;
19858
+ case GGML_TYPE_IQ4_NL:
19859
+ #if QK_K == 64
19860
+ case GGML_TYPE_IQ4_XS:
19861
+ #endif
19862
+ {
19863
+ GGML_ASSERT(start % QK4_NL == 0);
19864
+ GGML_ASSERT(start % n_per_row == 0);
19865
+ size_t start_row = start / n_per_row;
19866
+ size_t row_size = ggml_row_size(type, n_per_row);
19867
+ result = quantize_iq4_nl(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
19868
+ GGML_ASSERT(result == row_size * nrows);
19869
+ } break;
19870
+ #if QK_K != 64
19871
+ case GGML_TYPE_IQ4_XS:
19872
+ {
19873
+ GGML_ASSERT(start % QK_K == 0);
19874
+ GGML_ASSERT(start % n_per_row == 0);
19875
+ size_t start_row = start / n_per_row;
19876
+ size_t row_size = ggml_row_size(type, n_per_row);
19877
+ result = quantize_iq4_xs(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
19878
+ GGML_ASSERT(result == row_size * nrows);
19879
+ } break;
19880
+ #endif
19314
19881
  case GGML_TYPE_F16:
19315
19882
  {
19316
19883
  size_t elemsize = sizeof(ggml_fp16_t);