llama_cpp 0.12.5 → 0.12.7

This is the published diff between the two package versions as they appear in the public registry; all of the changes below are in the vendored ggml.c.
@@ -23,6 +23,9 @@
 #include <limits.h>
 #include <stdarg.h>
 #include <signal.h>
+#if defined(__gnu_linux__)
+#include <syscall.h>
+#endif
 
 #ifdef GGML_USE_METAL
 #include <unistd.h>
@@ -270,6 +273,8 @@ inline static void * ggml_calloc(size_t num, size_t size) {
 #include <Accelerate/Accelerate.h>
 #if defined(GGML_USE_CLBLAST) // allow usage of CLBlast alongside Accelerate functions
 #include "ggml-opencl.h"
+#elif defined(GGML_USE_VULKAN)
+#include "ggml-vulkan.h"
 #endif
 #elif defined(GGML_USE_OPENBLAS)
 #if defined(GGML_BLAS_USE_MKL)
@@ -318,7 +323,7 @@ float ggml_table_f32_f16[1 << 16];
 // note: do not use these inside ggml.c
 // these are meant to be used via the ggml.h API
 float ggml_fp16_to_fp32(ggml_fp16_t x) {
-    return (float) GGML_FP16_TO_FP32(x);
+    return GGML_FP16_TO_FP32(x);
 }
 
 ggml_fp16_t ggml_fp32_to_fp16(float x) {
@@ -428,8 +433,8 @@ int64_t ggml_cycles_per_ms(void) {
 
 static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float);
 
-static void ggml_vec_dot_f32(const int n, float * restrict s, const float * restrict x, const float * restrict y);
-static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t * restrict x, ggml_fp16_t * restrict y);
+static void ggml_vec_dot_f32(int n, float * restrict s, size_t bs, const float * restrict x, size_t bx, const float * restrict y, size_t by, int nrc);
+static void ggml_vec_dot_f16(int n, float * restrict s, size_t bs, ggml_fp16_t * restrict x, size_t bx, ggml_fp16_t * restrict y, size_t by, int nrc);
 
 static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
     [GGML_TYPE_I8] = {
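The two prototypes above are the heart of this release: every vec_dot kernel now takes a result stride (bs), per-operand strides (bx, by), and a row count (nrc), so a single call can produce dot products for several rows. A minimal reference sketch of the contract — not ggml's optimized code, and the stride units (elements for s, bytes for x and y) are my reading of how the callers use them:

    #include <assert.h>
    #include <stddef.h>

    // Reference semantics of the new 8-argument vec_dot: compute nrc dot
    // products; row i of x starts i*bx bytes after x, likewise by for y,
    // and result i lands at s[i*bs]. nrc == 1 with zero strides reproduces
    // the old 4-argument behaviour.
    static void vec_dot_f32_ref(int n, float * s, size_t bs,
                                const float * x, size_t bx,
                                const float * y, size_t by, int nrc) {
        assert(nrc >= 1);
        for (int i = 0; i < nrc; ++i) {
            const float * xi = (const float *)((const char *)x + (size_t)i*bx);
            const float * yi = (const float *)((const char *)y + (size_t)i*by);
            float sum = 0.0f;
            for (int k = 0; k < n; ++k) {
                sum += xi[k]*yi[k];
            }
            s[i*bs] = sum;
        }
    }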
@@ -457,6 +462,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .is_quantized = false,
         .vec_dot = (ggml_vec_dot_t) ggml_vec_dot_f32,
         .vec_dot_type = GGML_TYPE_F32,
+        .nrows = 1,
     },
     [GGML_TYPE_F16] = {
         .type_name = "f16",
@@ -468,6 +474,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .from_float_reference = (ggml_from_float_t) ggml_fp32_to_fp16_row,
         .vec_dot = (ggml_vec_dot_t) ggml_vec_dot_f16,
         .vec_dot_type = GGML_TYPE_F16,
+        .nrows = 1,
     },
     [GGML_TYPE_Q4_0] = {
         .type_name = "q4_0",
@@ -479,6 +486,11 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .from_float_reference = (ggml_from_float_t) quantize_row_q4_0_reference,
         .vec_dot = ggml_vec_dot_q4_0_q8_0,
         .vec_dot_type = GGML_TYPE_Q8_0,
+#if defined (__ARM_FEATURE_MATMUL_INT8)
+        .nrows = 2,
+#else
+        .nrows = 1,
+#endif
     },
     [GGML_TYPE_Q4_1] = {
         .type_name = "q4_1",
@@ -490,6 +502,11 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .from_float_reference = (ggml_from_float_t) quantize_row_q4_1_reference,
         .vec_dot = ggml_vec_dot_q4_1_q8_1,
         .vec_dot_type = GGML_TYPE_Q8_1,
+#if defined (__ARM_FEATURE_MATMUL_INT8)
+        .nrows = 2,
+#else
+        .nrows = 1,
+#endif
     },
     [4] = { // GGML_TYPE_Q4_2
         .type_name = "DEPRECATED",
@@ -501,6 +518,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .from_float_reference = NULL,
         .vec_dot = NULL,
         .vec_dot_type = GGML_TYPE_COUNT,
+        .nrows = 1,
     },
     [5] = { // GGML_TYPE_Q4_3
         .type_name = "DEPRECATED",
@@ -512,6 +530,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .from_float_reference = NULL,
         .vec_dot = NULL,
         .vec_dot_type = GGML_TYPE_COUNT,
+        .nrows = 1,
     },
     [GGML_TYPE_Q5_0] = {
         .type_name = "q5_0",
@@ -523,6 +542,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .from_float_reference = (ggml_from_float_t) quantize_row_q5_0_reference,
         .vec_dot = ggml_vec_dot_q5_0_q8_0,
         .vec_dot_type = GGML_TYPE_Q8_0,
+        .nrows = 1,
     },
     [GGML_TYPE_Q5_1] = {
         .type_name = "q5_1",
@@ -534,6 +554,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .from_float_reference = (ggml_from_float_t) quantize_row_q5_1_reference,
         .vec_dot = ggml_vec_dot_q5_1_q8_1,
         .vec_dot_type = GGML_TYPE_Q8_1,
+        .nrows = 1,
     },
     [GGML_TYPE_Q8_0] = {
         .type_name = "q8_0",
@@ -545,6 +566,11 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .from_float_reference = (ggml_from_float_t) quantize_row_q8_0_reference,
         .vec_dot = ggml_vec_dot_q8_0_q8_0,
         .vec_dot_type = GGML_TYPE_Q8_0,
+#if defined (__ARM_FEATURE_MATMUL_INT8)
+        .nrows = 2,
+#else
+        .nrows = 1,
+#endif
     },
     [GGML_TYPE_Q8_1] = {
         .type_name = "q8_1",
@@ -554,6 +580,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .from_float = quantize_row_q8_1,
         .from_float_reference = (ggml_from_float_t) quantize_row_q8_1_reference,
         .vec_dot_type = GGML_TYPE_Q8_1,
+        .nrows = 1,
     },
     [GGML_TYPE_Q2_K] = {
         .type_name = "q2_K",
@@ -565,6 +592,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .from_float_reference = (ggml_from_float_t) quantize_row_q2_K_reference,
         .vec_dot = ggml_vec_dot_q2_K_q8_K,
         .vec_dot_type = GGML_TYPE_Q8_K,
+        .nrows = 1,
     },
     [GGML_TYPE_Q3_K] = {
         .type_name = "q3_K",
@@ -576,6 +604,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .from_float_reference = (ggml_from_float_t) quantize_row_q3_K_reference,
         .vec_dot = ggml_vec_dot_q3_K_q8_K,
         .vec_dot_type = GGML_TYPE_Q8_K,
+        .nrows = 1,
     },
     [GGML_TYPE_Q4_K] = {
         .type_name = "q4_K",
@@ -587,6 +616,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .from_float_reference = (ggml_from_float_t) quantize_row_q4_K_reference,
         .vec_dot = ggml_vec_dot_q4_K_q8_K,
         .vec_dot_type = GGML_TYPE_Q8_K,
+        .nrows = 1,
     },
     [GGML_TYPE_Q5_K] = {
         .type_name = "q5_K",
@@ -598,6 +628,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .from_float_reference = (ggml_from_float_t) quantize_row_q5_K_reference,
         .vec_dot = ggml_vec_dot_q5_K_q8_K,
         .vec_dot_type = GGML_TYPE_Q8_K,
+        .nrows = 1,
     },
     [GGML_TYPE_Q6_K] = {
         .type_name = "q6_K",
@@ -609,6 +640,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .from_float_reference = (ggml_from_float_t) quantize_row_q6_K_reference,
         .vec_dot = ggml_vec_dot_q6_K_q8_K,
         .vec_dot_type = GGML_TYPE_Q8_K,
+        .nrows = 1,
     },
     [GGML_TYPE_IQ2_XXS] = {
         .type_name = "iq2_xxs",
@@ -620,6 +652,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .from_float_reference = NULL,
         .vec_dot = ggml_vec_dot_iq2_xxs_q8_K,
         .vec_dot_type = GGML_TYPE_Q8_K,
+        .nrows = 1,
     },
     [GGML_TYPE_IQ2_XS] = {
         .type_name = "iq2_xs",
@@ -631,6 +664,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .from_float_reference = NULL,
         .vec_dot = ggml_vec_dot_iq2_xs_q8_K,
         .vec_dot_type = GGML_TYPE_Q8_K,
+        .nrows = 1,
     },
     [GGML_TYPE_IQ3_XXS] = {
         .type_name = "iq3_xxs",
@@ -642,6 +676,31 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .from_float_reference = (ggml_from_float_t)quantize_row_iq3_xxs_reference,
         .vec_dot = ggml_vec_dot_iq3_xxs_q8_K,
         .vec_dot_type = GGML_TYPE_Q8_K,
+        .nrows = 1,
+    },
+    [GGML_TYPE_IQ1_S] = {
+        .type_name = "iq1_s",
+        .blck_size = QK_K,
+        .type_size = sizeof(block_iq1_s),
+        .is_quantized = true,
+        .to_float = (ggml_to_float_t) dequantize_row_iq1_s,
+        .from_float = NULL,
+        .from_float_reference = NULL,
+        .vec_dot = ggml_vec_dot_iq1_s_q8_K,
+        .vec_dot_type = GGML_TYPE_Q8_K,
+        .nrows = 1,
+    },
+    [GGML_TYPE_IQ4_NL] = {
+        .type_name = "iq4_nl",
+        .blck_size = QK4_NL,
+        .type_size = sizeof(block_iq4_nl),
+        .is_quantized = true,
+        .to_float = (ggml_to_float_t) dequantize_row_iq4_nl,
+        .from_float = quantize_row_iq4_nl,
+        .from_float_reference = (ggml_from_float_t)quantize_row_iq4_nl_reference,
+        .vec_dot = ggml_vec_dot_iq4_nl_q8_0,
+        .vec_dot_type = GGML_TYPE_Q8_0,
+        .nrows = 1,
     },
     [GGML_TYPE_Q8_K] = {
         .type_name = "q8_K",
@@ -739,7 +798,7 @@ inline static float vaddvq_f32(float32x4_t v) {
     #define GGML_F16x8 float16x8_t
     #define GGML_F16x8_ZERO vdupq_n_f16(0.0f)
     #define GGML_F16x8_SET1(x) vdupq_n_f16(x)
-    #define GGML_F16x8_LOAD vld1q_f16
+    #define GGML_F16x8_LOAD(x) vld1q_f16((const __fp16 *)(x))
     #define GGML_F16x8_STORE vst1q_f16
     #define GGML_F16x8_FMA(a, b, c) vfmaq_f16(a, b, c)
     #define GGML_F16x8_ADD vaddq_f16
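The cast added to GGML_F16x8_LOAD (and to GGML_F32Cx4_LOAD in the next hunk) exists because ggml keeps fp16 values in a plain 16-bit integer type, while vld1q_f16 wants a pointer to the hardware half-float type. The same pattern in isolation — AArch64 only, and the bit-compatibility of the integer storage with __fp16 is the assumption being relied on:

    #include <arm_neon.h>
    #include <stdint.h>

    typedef uint16_t ggml_fp16_t;  // fp16 bits kept in an integer type

    static inline float16x8_t load_f16x8(const ggml_fp16_t * p) {
        // reinterpret the integer storage as half floats for the NEON load
        return vld1q_f16((const __fp16 *)p);
    }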
@@ -782,7 +841,7 @@ inline static float vaddvq_f32(float32x4_t v) {
     #define GGML_F32Cx4 float32x4_t
     #define GGML_F32Cx4_ZERO vdupq_n_f32(0.0f)
     #define GGML_F32Cx4_SET1(x) vdupq_n_f32(x)
-    #define GGML_F32Cx4_LOAD(x) vcvt_f32_f16(vld1_f16(x))
+    #define GGML_F32Cx4_LOAD(x) vcvt_f32_f16(vld1_f16((const __fp16 *)(x)))
     #define GGML_F32Cx4_STORE(x, y) vst1_f16(x, vcvt_f16_f32(y))
     #define GGML_F32Cx4_FMA(a, b, c) vfmaq_f32(a, b, c)
     #define GGML_F32Cx4_ADD vaddq_f32
@@ -838,7 +897,7 @@ do { \
     const __m128 t0 = _mm_add_ps(_mm256_castps256_ps128(x[0]), \
                                  _mm256_extractf128_ps(x[0], 1)); \
     const __m128 t1 = _mm_hadd_ps(t0, t0); \
-    res = _mm_cvtss_f32(_mm_hadd_ps(t1, t1)); \
+    res = (ggml_float) _mm_cvtss_f32(_mm_hadd_ps(t1, t1)); \
 } while (0)
 // TODO: is this optimal ?
 
@@ -1119,7 +1178,7 @@ inline static void __wasm_f16x4_store(ggml_fp16_t * p, v128_t x) {
         x[i] = _mm_add_ps(x[i], x[offset+i]); \
     } \
     const __m128 t0 = _mm_hadd_ps(x[0], x[0]); \
-    res = _mm_cvtss_f32(_mm_hadd_ps(t0, t0)); \
+    res = (ggml_float) _mm_cvtss_f32(_mm_hadd_ps(t0, t0)); \
 }
 // TODO: is this optimal ?
 
@@ -1212,7 +1271,13 @@ inline static void ggml_vec_neg_f32 (const int n, float * y, const float * x)
 inline static void ggml_vec_mul_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]*y[i]; }
 inline static void ggml_vec_div_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]/y[i]; }
 
-static void ggml_vec_dot_f32(const int n, float * restrict s, const float * restrict x, const float * restrict y) {
+static void ggml_vec_dot_f32(int n, float * restrict s, size_t bs, const float * restrict x, size_t bx, const float * restrict y, size_t by, int nrc) {
+    assert(nrc == 1);
+    UNUSED(nrc);
+    UNUSED(bx);
+    UNUSED(by);
+    UNUSED(bs);
+
 #ifdef GGML_SIMD
     float sumf = 0.0f;
     const int np = (n & ~(GGML_F32_STEP - 1));
@@ -1249,7 +1314,13 @@ static void ggml_vec_dot_f32(const int n, float * restrict s, const float * rest
     *s = sumf;
 }
 
-static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t * restrict x, ggml_fp16_t * restrict y) {
+static void ggml_vec_dot_f16(int n, float * restrict s, size_t bs, ggml_fp16_t * restrict x, size_t bx, ggml_fp16_t * restrict y, size_t by, int nrc) {
+    assert(nrc == 1);
+    UNUSED(nrc);
+    UNUSED(bx);
+    UNUSED(by);
+    UNUSED(bs);
+
     ggml_float sumf = 0.0;
 
 #if defined(GGML_SIMD)
@@ -1455,7 +1526,7 @@ inline static void ggml_vec_scale_f32(const int n, float * y, const float v) {
 #endif
 }
 
-inline static void ggml_vec_norm_f32 (const int n, float * s, const float * x) { ggml_vec_dot_f32(n, s, x, x); *s = sqrtf(*s); }
+inline static void ggml_vec_norm_f32 (const int n, float * s, const float * x) { ggml_vec_dot_f32(n, s, 0, x, 0, x, 0, 1); *s = sqrtf(*s); }
 inline static void ggml_vec_sqr_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i]*x[i]; }
 inline static void ggml_vec_sqrt_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = sqrtf(x[i]); }
 inline static void ggml_vec_log_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = logf(x[i]); }
@@ -1912,9 +1983,16 @@ struct ggml_numa_node {
 };
 
 struct ggml_numa_nodes {
+    enum ggml_numa_strategy numa_strategy;
     struct ggml_numa_node nodes[GGML_NUMA_MAX_NODES];
     uint32_t n_nodes;
    uint32_t total_cpus; // hardware threads on system
+    uint32_t current_node; // node on which main process is execting
+#if defined(__gnu_linux__)
+    cpu_set_t cpuset; // cpuset from numactl
+#else
+    uint32_t cpuset; // no NUMA support outside of Linux at this time. Use a portable datatype
+#endif
 };
 
 //
@@ -1948,18 +2026,40 @@ inline static void ggml_critical_section_end(void) {
     atomic_fetch_sub(&g_state_barrier, 1);
 }
 
-void ggml_numa_init(void) {
+#if defined(__gnu_linux__)
+static cpu_set_t ggml_get_numa_affinity(void) {
+    cpu_set_t cpuset;
+    pthread_t thread;
+    thread = pthread_self();
+    CPU_ZERO(&cpuset);
+    pthread_getaffinity_np(thread, sizeof(cpu_set_t), &cpuset);
+    return cpuset;
+}
+#else
+static uint32_t ggml_get_numa_affinity(void) {
+    return 0; // no NUMA support
+}
+#endif
+
+void ggml_numa_init(enum ggml_numa_strategy numa_flag) {
     if (g_state.numa.n_nodes > 0) {
         fprintf(stderr, "ggml_numa_init: NUMA already initialized\n");
 
         return;
     }
 
-#ifdef __linux__
+#if defined(__gnu_linux__)
     struct stat st;
     char path[256];
     int rv;
 
+    // set numa scheme
+    g_state.numa.numa_strategy = numa_flag;
+
+    GGML_PRINT_DEBUG("numa strategy %u\n",g_state.numa.numa_strategy);
+
+    g_state.numa.cpuset = ggml_get_numa_affinity();
+
     // enumerate nodes
     while (g_state.numa.n_nodes < GGML_NUMA_MAX_NODES) {
         rv = snprintf(path, sizeof(path), "/sys/devices/system/node/node%u", g_state.numa.n_nodes);
@@ -1978,11 +2078,23 @@ void ggml_numa_init(void) {
 
     GGML_PRINT_DEBUG("found %u numa nodes, %u CPUs\n", g_state.numa.n_nodes, g_state.numa.total_cpus);
 
-    if (g_state.numa.n_nodes < 1 || g_state.numa.total_cpus < 1) {
+    // figure out which node we're on
+    uint current_cpu;
+    int getcpu_ret = 0;
+#if __GLIBC__ > 2 || (__GLIBC__ == 2 && __GLIBC_MINOR__ > 28)
+    getcpu_ret = getcpu(&current_cpu, &g_state.numa.current_node);
+#else
+    // old glibc doesn't have a wrapper for this call. Fall back on direct syscall
+    getcpu_ret = syscall(SYS_getcpu,&current_cpu,&g_state.numa.current_node);
+#endif
+
+    if (g_state.numa.n_nodes < 1 || g_state.numa.total_cpus < 1 || getcpu_ret != 0) {
         g_state.numa.n_nodes = 0;
         return;
     }
 
+    GGML_PRINT_DEBUG("found our process on numa node %u, CPU %u\n", g_state.numa.current_node, current_cpu);
+
     for (uint32_t n = 0; n < g_state.numa.n_nodes; ++n) {
         struct ggml_numa_node * node = &g_state.numa.nodes[n];
         GGML_PRINT_DEBUG("CPUs on node %u:", n);
@@ -2009,6 +2121,7 @@ void ggml_numa_init(void) {
         }
     }
 #else
+    GGML_UNUSED(numa_flag);
     // TODO
 #endif
 }
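The glibc version check above exists because glibc only gained a getcpu() wrapper in 2.29 (hence `__GLIBC_MINOR__ > 28`); older systems must issue the raw syscall. The same probe as a standalone, Linux-only sketch:

    #define _GNU_SOURCE
    #include <sched.h>         // getcpu() on glibc >= 2.29
    #include <stdio.h>
    #include <unistd.h>
    #include <sys/syscall.h>

    int main(void) {
        unsigned int cpu = 0, node = 0;
    #if defined(__GLIBC__) && (__GLIBC__ > 2 || (__GLIBC__ == 2 && __GLIBC_MINOR__ > 28))
        int rc = getcpu(&cpu, &node);
    #else
        int rc = syscall(SYS_getcpu, &cpu, &node, NULL);
    #endif
        if (rc == 0) printf("running on cpu %u, numa node %u\n", cpu, node);
        return 0;
    }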
@@ -2189,6 +2302,8 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) {
         case GGML_FTYPE_MOSTLY_IQ2_XXS: wtype = GGML_TYPE_IQ2_XXS; break;
         case GGML_FTYPE_MOSTLY_IQ2_XS: wtype = GGML_TYPE_IQ2_XS; break;
         case GGML_FTYPE_MOSTLY_IQ3_XXS: wtype = GGML_TYPE_IQ3_XXS; break;
+        case GGML_FTYPE_MOSTLY_IQ1_S: wtype = GGML_TYPE_IQ1_S; break;
+        case GGML_FTYPE_MOSTLY_IQ4_NL: wtype = GGML_TYPE_IQ4_NL; break;
         case GGML_FTYPE_UNKNOWN: wtype = GGML_TYPE_COUNT; break;
         case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16: wtype = GGML_TYPE_COUNT; break;
     }
@@ -2607,7 +2722,7 @@ static struct ggml_tensor * ggml_new_tensor_impl(
         /*.nb =*/ { 0, 0, 0, 0 },
         /*.op =*/ GGML_OP_NONE,
         /*.op_params =*/ { 0 },
-        /*.is_param =*/ false,
+        /*.flags =*/ 0,
         /*.grad =*/ NULL,
         /*.src =*/ { NULL },
         /*.perf_runs =*/ 0,
@@ -3142,7 +3257,7 @@ const char * ggml_get_name(const struct ggml_tensor * tensor) {
 }
 
 struct ggml_tensor * ggml_set_name(struct ggml_tensor * tensor, const char * name) {
-    strncpy(tensor->name, name, sizeof(tensor->name));
+    strncpy(tensor->name, name, sizeof(tensor->name) - 1);
     tensor->name[sizeof(tensor->name) - 1] = '\0';
     return tensor;
 }
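About the strncpy change: the next line already forces a terminator, so the old code was safe at runtime; bounding the copy to sizeof - 1 is the conventional belt-and-braces form and is likely aimed at GCC's -Wstringop-truncation diagnostics. The pattern in isolation:

    #include <stdio.h>
    #include <string.h>

    // Copy at most size-1 bytes and always terminate. With a plain
    // 'sizeof(buf)' bound, strncpy leaves the array unterminated whenever
    // the source fills it exactly.
    int main(void) {
        char name[8];
        const char * src = "a_very_long_tensor_name";
        strncpy(name, src, sizeof(name) - 1);
        name[sizeof(name) - 1] = '\0';
        printf("%s\n", name);   // prints the truncated, terminated copy
        return 0;
    }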
@@ -5018,16 +5133,28 @@ static struct ggml_tensor * ggml_soft_max_impl(
         struct ggml_context * ctx,
         struct ggml_tensor * a,
         struct ggml_tensor * mask,
+        struct ggml_tensor * pos,
         float scale,
+        float max_bias,
         bool inplace) {
     GGML_ASSERT(ggml_is_contiguous(a));
+
     if (mask) {
         GGML_ASSERT(ggml_is_contiguous(mask));
-        GGML_ASSERT(mask->ne[2] == 1);
-        GGML_ASSERT(mask->ne[3] == 1);
+        GGML_ASSERT(ggml_is_matrix(mask));
         GGML_ASSERT(ggml_can_repeat_rows(mask, a));
     }
 
+    if (pos) {
+        GGML_ASSERT(ggml_is_vector(pos));
+        GGML_ASSERT(pos->type == GGML_TYPE_F32);
+        GGML_ASSERT(pos->ne[0] == a->ne[0]);
+    }
+
+    if (max_bias > 0.0f) {
+        GGML_ASSERT(pos);
+    }
+
     bool is_node = false;
 
     if (a->grad) {
@@ -5036,13 +5163,14 @@ static struct ggml_tensor * ggml_soft_max_impl(
 
     struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
 
-    float params[] = { scale };
+    float params[] = { scale, max_bias };
     ggml_set_op_params(result, params, sizeof(params));
 
     result->op = GGML_OP_SOFT_MAX;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
     result->src[0] = a;
     result->src[1] = mask;
+    result->src[2] = pos;
 
     return result;
 }
@@ -5050,21 +5178,23 @@ static struct ggml_tensor * ggml_soft_max_impl(
 struct ggml_tensor * ggml_soft_max(
         struct ggml_context * ctx,
         struct ggml_tensor * a) {
-    return ggml_soft_max_impl(ctx, a, NULL, 1.0f, false);
+    return ggml_soft_max_impl(ctx, a, NULL, NULL, 1.0f, 0.0f, false);
 }
 
 struct ggml_tensor * ggml_soft_max_inplace(
         struct ggml_context * ctx,
         struct ggml_tensor * a) {
-    return ggml_soft_max_impl(ctx, a, NULL, 1.0f, true);
+    return ggml_soft_max_impl(ctx, a, NULL, NULL, 1.0f, 0.0f, true);
 }
 
 struct ggml_tensor * ggml_soft_max_ext(
         struct ggml_context * ctx,
         struct ggml_tensor * a,
         struct ggml_tensor * mask,
-        float scale) {
-    return ggml_soft_max_impl(ctx, a, mask, scale, false);
+        struct ggml_tensor * pos,
+        float scale,
+        float max_bias) {
+    return ggml_soft_max_impl(ctx, a, mask, pos, scale, max_bias, false);
 }
 
 // ggml_soft_max_back
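ggml_soft_max_ext() therefore grows from (ctx, a, mask, scale) to (ctx, a, mask, pos, scale, max_bias): pos is an F32 vector of token positions and max_bias scales an ALiBi-style positional penalty folded into the fused softmax; passing NULL and 0.0f recovers the old behaviour. A hedged caller-side sketch — only the ggml_soft_max_ext signature comes from this diff, the surrounding attention names are assumptions:

    #include <math.h>
    #include "ggml.h"

    // Fused masked softmax over attention scores, with optional ALiBi bias.
    static struct ggml_tensor * attn_softmax(struct ggml_context * ctx,
                                             struct ggml_tensor * kq,      // scores
                                             struct ggml_tensor * kq_mask, // may be NULL
                                             struct ggml_tensor * kq_pos,  // F32 positions, may be NULL
                                             int head_dim, float max_bias) {
        // max_bias == 0.0f disables the positional term (then kq_pos may be NULL)
        return ggml_soft_max_ext(ctx, kq, kq_mask, kq_pos,
                                 1.0f/sqrtf((float)head_dim), max_bias);
    }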
@@ -5514,7 +5644,9 @@ struct ggml_tensor * ggml_conv_2d(
             ggml_reshape_2d(ctx, im2col, im2col->ne[0], im2col->ne[3] * im2col->ne[2] * im2col->ne[1]), // [N, OH, OW, IC * KH * KW] => [N*OH*OW, IC * KH * KW]
             ggml_reshape_2d(ctx, a, (a->ne[0] * a->ne[1] * a->ne[2]), a->ne[3])); // [OC,IC, KH, KW] => [OC, IC * KH * KW]
 
-    result = ggml_reshape_4d(ctx, result, im2col->ne[1], im2col->ne[2], a->ne[3], im2col->ne[3]); // [N, OC, OH, OW]
+    result = ggml_reshape_4d(ctx, result, im2col->ne[1], im2col->ne[2], im2col->ne[3], a->ne[3]); // [OC, N, OH, OW]
+    result = ggml_cont(ctx, ggml_permute(ctx, result, 0, 1, 3, 2)); // [N, OC, OH, OW]
+
 
     return result;
 }
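The conv_2d change is a layout bug fix: the old single reshape labelled its result [N, OC, OH, OW] but the data actually had the batch and output-channel axes the other way around. The new code reshapes to the true [OC, N, OH, OW] order and swaps those two axes back. In isolation:

    #include "ggml.h"

    // After the im2col matmul the data is [OC, N, OH, OW]; swapping axes 2
    // and 3 (ggml counts dimensions from the fastest-varying one) and forcing
    // a contiguous copy yields the documented [N, OC, OH, OW] layout.
    static struct ggml_tensor * to_batch_major(struct ggml_context * ctx,
                                               struct ggml_tensor * t) {
        return ggml_cont(ctx, ggml_permute(ctx, t, 0, 1, 3, 2));
    }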
@@ -6509,7 +6641,7 @@ struct ggml_tensor * ggml_cross_entropy_loss_back(
 void ggml_set_param(
         struct ggml_context * ctx,
         struct ggml_tensor * tensor) {
-    tensor->is_param = true;
+    tensor->flags |= GGML_TENSOR_FLAG_PARAM;
 
     GGML_ASSERT(tensor->grad == NULL);
     tensor->grad = ggml_dup_tensor(ctx, tensor);
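Replacing the is_param bool with a flags bitfield makes room for more per-tensor markers without another struct change (the companion ggml.h, not shown in this diff, defines GGML_TENSOR_FLAG_PARAM). The migration pattern as a standalone toy:

    #include <stdint.h>
    #include <stdio.h>

    // The bool -> bitfield migration used here (flag value assumed; in ggml
    // it is the GGML_TENSOR_FLAG_PARAM enum member).
    #define TENSOR_FLAG_PARAM (1 << 0)

    struct toy_tensor { int32_t flags; };

    int main(void) {
        struct toy_tensor t = {0};
        t.flags |= TENSOR_FLAG_PARAM;   // was: t.is_param = true;
        printf("is param: %d\n", (t.flags & TENSOR_FLAG_PARAM) != 0);
        return 0;
    }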
@@ -6520,8 +6652,10 @@ void ggml_set_param(
 
 static void ggml_compute_forward_dup_same_cont(
         const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
         struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
     GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0));
     GGML_ASSERT(ggml_is_contiguous(dst) && ggml_is_contiguous(src0));
     GGML_ASSERT(src0->type == dst->type);
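Everything from here to the end of the diff is one mechanical refactor: the compute_forward_* helpers stop taking explicit src0/src1 arguments and read their operands from dst->src[], so every op can be dispatched through a uniform (params, dst) signature. The shape of the change, reduced to a toy:

    // Operands now travel inside the destination node, so helpers need only
    // (params, dst) instead of per-op argument lists.
    struct node { const struct node * src[2]; /* ... */ };

    static void forward_op(const void * params, struct node * dst) {
        const struct node * src0 = dst->src[0];   // was a separate argument
        const struct node * src1 = dst->src[1];   // ditto, for binary ops
        (void)params; (void)src0; (void)src1;
        // ... actual kernel ...
    }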
@@ -6552,8 +6686,10 @@ static void ggml_compute_forward_dup_same_cont(
6552
6686
  }
6553
6687
  static void ggml_compute_forward_dup_f16(
6554
6688
  const struct ggml_compute_params * params,
6555
- const struct ggml_tensor * src0,
6556
6689
  struct ggml_tensor * dst) {
6690
+
6691
+ const struct ggml_tensor * src0 = dst->src[0];
6692
+
6557
6693
  GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0));
6558
6694
 
6559
6695
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
@@ -6566,7 +6702,7 @@ static void ggml_compute_forward_dup_f16(
6566
6702
  const int nth = params->nth; // number of threads
6567
6703
 
6568
6704
  if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst) && src0->type == dst->type) {
6569
- ggml_compute_forward_dup_same_cont(params, src0, dst);
6705
+ ggml_compute_forward_dup_same_cont(params, dst);
6570
6706
  return;
6571
6707
  }
6572
6708
 
@@ -6823,8 +6959,10 @@ static void ggml_compute_forward_dup_f16(
6823
6959
 
6824
6960
  static void ggml_compute_forward_dup_f32(
6825
6961
  const struct ggml_compute_params * params,
6826
- const struct ggml_tensor * src0,
6827
6962
  struct ggml_tensor * dst) {
6963
+
6964
+ const struct ggml_tensor * src0 = dst->src[0];
6965
+
6828
6966
  GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0));
6829
6967
 
6830
6968
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
@@ -6837,7 +6975,7 @@ static void ggml_compute_forward_dup_f32(
6837
6975
  const int nth = params->nth; // number of threads
6838
6976
 
6839
6977
  if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst) && src0->type == dst->type) {
6840
- ggml_compute_forward_dup_same_cont(params, src0, dst);
6978
+ ggml_compute_forward_dup_same_cont(params, dst);
6841
6979
  return;
6842
6980
  }
6843
6981
 
@@ -7073,8 +7211,10 @@ static void ggml_compute_forward_dup_f32(
7073
7211
  // A simplified version of ggml_compute_forward_dup that doesn't do float upcasting, and just plain old memcpy.
7074
7212
  static void ggml_compute_forward_dup_bytes(
7075
7213
  const struct ggml_compute_params * params,
7076
- const struct ggml_tensor * src0,
7077
7214
  struct ggml_tensor * dst) {
7215
+
7216
+ const struct ggml_tensor * src0 = dst->src[0];
7217
+
7078
7218
  GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0));
7079
7219
  GGML_ASSERT(src0->type == dst->type);
7080
7220
 
@@ -7083,7 +7223,7 @@ static void ggml_compute_forward_dup_bytes(
7083
7223
  }
7084
7224
 
7085
7225
  if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst)) {
7086
- ggml_compute_forward_dup_same_cont(params, src0, dst);
7226
+ ggml_compute_forward_dup_same_cont(params, dst);
7087
7227
  return;
7088
7228
  }
7089
7229
 
@@ -7222,21 +7362,23 @@ static void ggml_compute_forward_dup_bytes(
7222
7362
 
7223
7363
  static void ggml_compute_forward_dup(
7224
7364
  const struct ggml_compute_params * params,
7225
- const struct ggml_tensor * src0,
7226
7365
  struct ggml_tensor * dst) {
7366
+
7367
+ const struct ggml_tensor * src0 = dst->src[0];
7368
+
7227
7369
  if (src0->type == dst->type) {
7228
- ggml_compute_forward_dup_bytes(params, src0, dst);
7370
+ ggml_compute_forward_dup_bytes(params, dst);
7229
7371
  return;
7230
7372
  }
7231
7373
 
7232
7374
  switch (src0->type) {
7233
7375
  case GGML_TYPE_F16:
7234
7376
  {
7235
- ggml_compute_forward_dup_f16(params, src0, dst);
7377
+ ggml_compute_forward_dup_f16(params, dst);
7236
7378
  } break;
7237
7379
  case GGML_TYPE_F32:
7238
7380
  {
7239
- ggml_compute_forward_dup_f32(params, src0, dst);
7381
+ ggml_compute_forward_dup_f32(params, dst);
7240
7382
  } break;
7241
7383
  default:
7242
7384
  {
@@ -7249,9 +7391,11 @@ static void ggml_compute_forward_dup(
7249
7391
 
7250
7392
  static void ggml_compute_forward_add_f32(
7251
7393
  const struct ggml_compute_params * params,
7252
- const struct ggml_tensor * src0,
7253
- const struct ggml_tensor * src1,
7254
7394
  struct ggml_tensor * dst) {
7395
+
7396
+ const struct ggml_tensor * src0 = dst->src[0];
7397
+ const struct ggml_tensor * src1 = dst->src[1];
7398
+
7255
7399
  GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst));
7256
7400
 
7257
7401
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
@@ -7337,9 +7481,11 @@ static void ggml_compute_forward_add_f32(
7337
7481
 
7338
7482
  static void ggml_compute_forward_add_f16_f32(
7339
7483
  const struct ggml_compute_params * params,
7340
- const struct ggml_tensor * src0,
7341
- const struct ggml_tensor * src1,
7342
7484
  struct ggml_tensor * dst) {
7485
+
7486
+ const struct ggml_tensor * src0 = dst->src[0];
7487
+ const struct ggml_tensor * src1 = dst->src[1];
7488
+
7343
7489
  GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));
7344
7490
 
7345
7491
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
@@ -7414,9 +7560,11 @@ static void ggml_compute_forward_add_f16_f32(
7414
7560
 
7415
7561
  static void ggml_compute_forward_add_f16_f16(
7416
7562
  const struct ggml_compute_params * params,
7417
- const struct ggml_tensor * src0,
7418
- const struct ggml_tensor * src1,
7419
7563
  struct ggml_tensor * dst) {
7564
+
7565
+ const struct ggml_tensor * src0 = dst->src[0];
7566
+ const struct ggml_tensor * src1 = dst->src[1];
7567
+
7420
7568
  GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));
7421
7569
 
7422
7570
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
@@ -7468,9 +7616,11 @@ static void ggml_compute_forward_add_f16_f16(
7468
7616
 
7469
7617
  static void ggml_compute_forward_add_q_f32(
7470
7618
  const struct ggml_compute_params * params,
7471
- const struct ggml_tensor * src0,
7472
- const struct ggml_tensor * src1,
7473
7619
  struct ggml_tensor * dst) {
7620
+
7621
+ const struct ggml_tensor * src0 = dst->src[0];
7622
+ const struct ggml_tensor * src1 = dst->src[1];
7623
+
7474
7624
  GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));
7475
7625
 
7476
7626
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
@@ -7546,14 +7696,16 @@ static void ggml_compute_forward_add_q_f32(
7546
7696
 
7547
7697
  static void ggml_compute_forward_add(
7548
7698
  const struct ggml_compute_params * params,
7549
- const struct ggml_tensor * src0,
7550
- const struct ggml_tensor * src1,
7551
7699
  struct ggml_tensor * dst) {
7700
+
7701
+ const struct ggml_tensor * src0 = dst->src[0];
7702
+ const struct ggml_tensor * src1 = dst->src[1];
7703
+
7552
7704
  switch (src0->type) {
7553
7705
  case GGML_TYPE_F32:
7554
7706
  {
7555
7707
  if (src1->type == GGML_TYPE_F32) {
7556
- ggml_compute_forward_add_f32(params, src0, src1, dst);
7708
+ ggml_compute_forward_add_f32(params, dst);
7557
7709
  }
7558
7710
  else {
7559
7711
  GGML_ASSERT(false);
@@ -7562,10 +7714,10 @@ static void ggml_compute_forward_add(
7562
7714
  case GGML_TYPE_F16:
7563
7715
  {
7564
7716
  if (src1->type == GGML_TYPE_F16) {
7565
- ggml_compute_forward_add_f16_f16(params, src0, src1, dst);
7717
+ ggml_compute_forward_add_f16_f16(params, dst);
7566
7718
  }
7567
7719
  else if (src1->type == GGML_TYPE_F32) {
7568
- ggml_compute_forward_add_f16_f32(params, src0, src1, dst);
7720
+ ggml_compute_forward_add_f16_f32(params, dst);
7569
7721
  }
7570
7722
  else {
7571
7723
  GGML_ASSERT(false);
@@ -7584,8 +7736,10 @@ static void ggml_compute_forward_add(
7584
7736
  case GGML_TYPE_IQ2_XXS:
7585
7737
  case GGML_TYPE_IQ2_XS:
7586
7738
  case GGML_TYPE_IQ3_XXS:
7739
+ case GGML_TYPE_IQ1_S:
7740
+ case GGML_TYPE_IQ4_NL:
7587
7741
  {
7588
- ggml_compute_forward_add_q_f32(params, src0, src1, dst);
7742
+ ggml_compute_forward_add_q_f32(params, dst);
7589
7743
  } break;
7590
7744
  default:
7591
7745
  {
@@ -7598,9 +7752,11 @@ static void ggml_compute_forward_add(
7598
7752
 
7599
7753
  static void ggml_compute_forward_add1_f32(
7600
7754
  const struct ggml_compute_params * params,
7601
- const struct ggml_tensor * src0,
7602
- const struct ggml_tensor * src1,
7603
7755
  struct ggml_tensor * dst) {
7756
+
7757
+ const struct ggml_tensor * src0 = dst->src[0];
7758
+ const struct ggml_tensor * src1 = dst->src[1];
7759
+
7604
7760
  GGML_ASSERT(ggml_are_same_shape(src0, dst));
7605
7761
  GGML_ASSERT(ggml_is_scalar(src1));
7606
7762
 
@@ -7650,9 +7806,11 @@ static void ggml_compute_forward_add1_f32(
7650
7806
 
7651
7807
  static void ggml_compute_forward_add1_f16_f32(
7652
7808
  const struct ggml_compute_params * params,
7653
- const struct ggml_tensor * src0,
7654
- const struct ggml_tensor * src1,
7655
7809
  struct ggml_tensor * dst) {
7810
+
7811
+ const struct ggml_tensor * src0 = dst->src[0];
7812
+ const struct ggml_tensor * src1 = dst->src[1];
7813
+
7656
7814
  GGML_ASSERT(ggml_are_same_shape(src0, dst));
7657
7815
  GGML_ASSERT(ggml_is_scalar(src1));
7658
7816
 
@@ -7700,9 +7858,11 @@ static void ggml_compute_forward_add1_f16_f32(
7700
7858
 
7701
7859
  static void ggml_compute_forward_add1_f16_f16(
7702
7860
  const struct ggml_compute_params * params,
7703
- const struct ggml_tensor * src0,
7704
- const struct ggml_tensor * src1,
7705
7861
  struct ggml_tensor * dst) {
7862
+
7863
+ const struct ggml_tensor * src0 = dst->src[0];
7864
+ const struct ggml_tensor * src1 = dst->src[1];
7865
+
7706
7866
  GGML_ASSERT(ggml_are_same_shape(src0, dst));
7707
7867
  GGML_ASSERT(ggml_is_scalar(src1));
7708
7868
 
@@ -7750,9 +7910,11 @@ static void ggml_compute_forward_add1_f16_f16(
7750
7910
 
7751
7911
  static void ggml_compute_forward_add1_q_f32(
7752
7912
  const struct ggml_compute_params * params,
7753
- const struct ggml_tensor * src0,
7754
- const struct ggml_tensor * src1,
7755
7913
  struct ggml_tensor * dst) {
7914
+
7915
+ const struct ggml_tensor * src0 = dst->src[0];
7916
+ const struct ggml_tensor * src1 = dst->src[1];
7917
+
7756
7918
  GGML_ASSERT(ggml_are_same_shape(src0, dst));
7757
7919
  GGML_ASSERT(ggml_is_scalar(src1));
7758
7920
 
@@ -7817,21 +7979,23 @@ static void ggml_compute_forward_add1_q_f32(
7817
7979
 
7818
7980
  static void ggml_compute_forward_add1(
7819
7981
  const struct ggml_compute_params * params,
7820
- const struct ggml_tensor * src0,
7821
- const struct ggml_tensor * src1,
7822
7982
  struct ggml_tensor * dst) {
7983
+
7984
+ const struct ggml_tensor * src0 = dst->src[0];
7985
+ const struct ggml_tensor * src1 = dst->src[1];
7986
+
7823
7987
  switch (src0->type) {
7824
7988
  case GGML_TYPE_F32:
7825
7989
  {
7826
- ggml_compute_forward_add1_f32(params, src0, src1, dst);
7990
+ ggml_compute_forward_add1_f32(params, dst);
7827
7991
  } break;
7828
7992
  case GGML_TYPE_F16:
7829
7993
  {
7830
7994
  if (src1->type == GGML_TYPE_F16) {
7831
- ggml_compute_forward_add1_f16_f16(params, src0, src1, dst);
7995
+ ggml_compute_forward_add1_f16_f16(params, dst);
7832
7996
  }
7833
7997
  else if (src1->type == GGML_TYPE_F32) {
7834
- ggml_compute_forward_add1_f16_f32(params, src0, src1, dst);
7998
+ ggml_compute_forward_add1_f16_f32(params, dst);
7835
7999
  }
7836
8000
  else {
7837
8001
  GGML_ASSERT(false);
@@ -7851,8 +8015,10 @@ static void ggml_compute_forward_add1(
7851
8015
  case GGML_TYPE_IQ2_XXS:
7852
8016
  case GGML_TYPE_IQ2_XS:
7853
8017
  case GGML_TYPE_IQ3_XXS:
8018
+ case GGML_TYPE_IQ1_S:
8019
+ case GGML_TYPE_IQ4_NL:
7854
8020
  {
7855
- ggml_compute_forward_add1_q_f32(params, src0, src1, dst);
8021
+ ggml_compute_forward_add1_q_f32(params, dst);
7856
8022
  } break;
7857
8023
  default:
7858
8024
  {
@@ -7865,9 +8031,11 @@ static void ggml_compute_forward_add1(
7865
8031
 
7866
8032
  static void ggml_compute_forward_acc_f32(
7867
8033
  const struct ggml_compute_params * params,
7868
- const struct ggml_tensor * src0,
7869
- const struct ggml_tensor * src1,
7870
8034
  struct ggml_tensor * dst) {
8035
+
8036
+ const struct ggml_tensor * src0 = dst->src[0];
8037
+ const struct ggml_tensor * src1 = dst->src[1];
8038
+
7871
8039
  GGML_ASSERT(ggml_are_same_shape(src0, dst));
7872
8040
  GGML_ASSERT(ggml_is_contiguous(dst) && ggml_is_contiguous(src0));
7873
8041
 
@@ -7947,14 +8115,14 @@ static void ggml_compute_forward_acc_f32(
7947
8115
 
7948
8116
  static void ggml_compute_forward_acc(
7949
8117
  const struct ggml_compute_params * params,
7950
- const struct ggml_tensor * src0,
7951
- const struct ggml_tensor * src1,
7952
8118
  struct ggml_tensor * dst) {
7953
8119
 
8120
+ const struct ggml_tensor * src0 = dst->src[0];
8121
+
7954
8122
  switch (src0->type) {
7955
8123
  case GGML_TYPE_F32:
7956
8124
  {
7957
- ggml_compute_forward_acc_f32(params, src0, src1, dst);
8125
+ ggml_compute_forward_acc_f32(params, dst);
7958
8126
  } break;
7959
8127
  case GGML_TYPE_F16:
7960
8128
  case GGML_TYPE_Q4_0:
@@ -7971,6 +8139,8 @@ static void ggml_compute_forward_acc(
7971
8139
  case GGML_TYPE_IQ2_XXS:
7972
8140
  case GGML_TYPE_IQ2_XS:
7973
8141
  case GGML_TYPE_IQ3_XXS:
8142
+ case GGML_TYPE_IQ1_S:
8143
+ case GGML_TYPE_IQ4_NL:
7974
8144
  default:
7975
8145
  {
7976
8146
  GGML_ASSERT(false);
@@ -7982,9 +8152,11 @@ static void ggml_compute_forward_acc(
7982
8152
 
7983
8153
  static void ggml_compute_forward_sub_f32(
7984
8154
  const struct ggml_compute_params * params,
7985
- const struct ggml_tensor * src0,
7986
- const struct ggml_tensor * src1,
7987
8155
  struct ggml_tensor * dst) {
8156
+
8157
+ const struct ggml_tensor * src0 = dst->src[0];
8158
+ const struct ggml_tensor * src1 = dst->src[1];
8159
+
7988
8160
  assert(params->ith == 0);
7989
8161
  assert(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));
7990
8162
 
@@ -8042,13 +8214,14 @@ static void ggml_compute_forward_sub_f32(
8042
8214
 
8043
8215
  static void ggml_compute_forward_sub(
8044
8216
  const struct ggml_compute_params * params,
8045
- const struct ggml_tensor * src0,
8046
- const struct ggml_tensor * src1,
8047
8217
  struct ggml_tensor * dst) {
8218
+
8219
+ const struct ggml_tensor * src0 = dst->src[0];
8220
+
8048
8221
  switch (src0->type) {
8049
8222
  case GGML_TYPE_F32:
8050
8223
  {
8051
- ggml_compute_forward_sub_f32(params, src0, src1, dst);
8224
+ ggml_compute_forward_sub_f32(params, dst);
8052
8225
  } break;
8053
8226
  default:
8054
8227
  {
@@ -8061,9 +8234,11 @@ static void ggml_compute_forward_sub(
8061
8234
 
8062
8235
  static void ggml_compute_forward_mul_f32(
8063
8236
  const struct ggml_compute_params * params,
8064
- const struct ggml_tensor * src0,
8065
- const struct ggml_tensor * src1,
8066
8237
  struct ggml_tensor * dst) {
8238
+
8239
+ const struct ggml_tensor * src0 = dst->src[0];
8240
+ const struct ggml_tensor * src1 = dst->src[1];
8241
+
8067
8242
  GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst));
8068
8243
 
8069
8244
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
@@ -8144,15 +8319,17 @@ static void ggml_compute_forward_mul_f32(
8144
8319
 
8145
8320
  static void ggml_compute_forward_mul(
8146
8321
  const struct ggml_compute_params * params,
8147
- const struct ggml_tensor * src0,
8148
- const struct ggml_tensor * src1,
8149
8322
  struct ggml_tensor * dst) {
8323
+
8324
+ const struct ggml_tensor * src0 = dst->src[0];
8325
+ const struct ggml_tensor * src1 = dst->src[1];
8326
+
8150
8327
  GGML_ASSERT(src1->type == GGML_TYPE_F32 && "only f32 src1 supported for now");
8151
8328
 
8152
8329
  switch (src0->type) {
8153
8330
  case GGML_TYPE_F32:
8154
8331
  {
8155
- ggml_compute_forward_mul_f32(params, src0, src1, dst);
8332
+ ggml_compute_forward_mul_f32(params, dst);
8156
8333
  } break;
8157
8334
  default:
8158
8335
  {
@@ -8165,9 +8342,11 @@ static void ggml_compute_forward_mul(
8165
8342
 
8166
8343
  static void ggml_compute_forward_div_f32(
8167
8344
  const struct ggml_compute_params * params,
8168
- const struct ggml_tensor * src0,
8169
- const struct ggml_tensor * src1,
8170
8345
  struct ggml_tensor * dst) {
8346
+
8347
+ const struct ggml_tensor * src0 = dst->src[0];
8348
+ const struct ggml_tensor * src1 = dst->src[1];
8349
+
8171
8350
  GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst));
8172
8351
 
8173
8352
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
@@ -8238,13 +8417,14 @@ static void ggml_compute_forward_div_f32(
8238
8417
 
8239
8418
  static void ggml_compute_forward_div(
8240
8419
  const struct ggml_compute_params * params,
8241
- const struct ggml_tensor * src0,
8242
- const struct ggml_tensor * src1,
8243
8420
  struct ggml_tensor * dst) {
8421
+
8422
+ const struct ggml_tensor * src0 = dst->src[0];
8423
+
8244
8424
  switch (src0->type) {
8245
8425
  case GGML_TYPE_F32:
8246
8426
  {
8247
- ggml_compute_forward_div_f32(params, src0, src1, dst);
8427
+ ggml_compute_forward_div_f32(params, dst);
8248
8428
  } break;
8249
8429
  default:
8250
8430
  {
@@ -8257,8 +8437,10 @@ static void ggml_compute_forward_div(
8257
8437
 
8258
8438
  static void ggml_compute_forward_sqr_f32(
8259
8439
  const struct ggml_compute_params * params,
8260
- const struct ggml_tensor * src0,
8261
8440
  struct ggml_tensor * dst) {
8441
+
8442
+ const struct ggml_tensor * src0 = dst->src[0];
8443
+
8262
8444
  assert(params->ith == 0);
8263
8445
  assert(ggml_are_same_shape(src0, dst));
8264
8446
 
@@ -8281,12 +8463,14 @@ static void ggml_compute_forward_sqr_f32(
8281
8463
 
8282
8464
  static void ggml_compute_forward_sqr(
8283
8465
  const struct ggml_compute_params * params,
8284
- const struct ggml_tensor * src0,
8285
8466
  struct ggml_tensor * dst) {
8467
+
8468
+ const struct ggml_tensor * src0 = dst->src[0];
8469
+
8286
8470
  switch (src0->type) {
8287
8471
  case GGML_TYPE_F32:
8288
8472
  {
8289
- ggml_compute_forward_sqr_f32(params, src0, dst);
8473
+ ggml_compute_forward_sqr_f32(params, dst);
8290
8474
  } break;
8291
8475
  default:
8292
8476
  {
@@ -8299,8 +8483,10 @@ static void ggml_compute_forward_sqr(
8299
8483
 
8300
8484
  static void ggml_compute_forward_sqrt_f32(
8301
8485
  const struct ggml_compute_params * params,
8302
- const struct ggml_tensor * src0,
8303
8486
  struct ggml_tensor * dst) {
8487
+
8488
+ const struct ggml_tensor * src0 = dst->src[0];
8489
+
8304
8490
  assert(params->ith == 0);
8305
8491
  assert(ggml_are_same_shape(src0, dst));
8306
8492
 
@@ -8323,12 +8509,14 @@ static void ggml_compute_forward_sqrt_f32(
8323
8509
 
8324
8510
  static void ggml_compute_forward_sqrt(
8325
8511
  const struct ggml_compute_params * params,
8326
- const struct ggml_tensor * src0,
8327
8512
  struct ggml_tensor * dst) {
8513
+
8514
+ const struct ggml_tensor * src0 = dst->src[0];
8515
+
8328
8516
  switch (src0->type) {
8329
8517
  case GGML_TYPE_F32:
8330
8518
  {
8331
- ggml_compute_forward_sqrt_f32(params, src0, dst);
8519
+ ggml_compute_forward_sqrt_f32(params, dst);
8332
8520
  } break;
8333
8521
  default:
8334
8522
  {
@@ -8341,8 +8529,10 @@ static void ggml_compute_forward_sqrt(
8341
8529
 
8342
8530
  static void ggml_compute_forward_log_f32(
8343
8531
  const struct ggml_compute_params * params,
8344
- const struct ggml_tensor * src0,
8345
8532
  struct ggml_tensor * dst) {
8533
+
8534
+ const struct ggml_tensor * src0 = dst->src[0];
8535
+
8346
8536
  GGML_ASSERT(params->ith == 0);
8347
8537
  GGML_ASSERT(ggml_are_same_shape(src0, dst));
8348
8538
 
@@ -8365,12 +8555,14 @@ static void ggml_compute_forward_log_f32(
8365
8555
 
8366
8556
  static void ggml_compute_forward_log(
8367
8557
  const struct ggml_compute_params * params,
8368
- const struct ggml_tensor * src0,
8369
8558
  struct ggml_tensor * dst) {
8559
+
8560
+ const struct ggml_tensor * src0 = dst->src[0];
8561
+
8370
8562
  switch (src0->type) {
8371
8563
  case GGML_TYPE_F32:
8372
8564
  {
8373
- ggml_compute_forward_log_f32(params, src0, dst);
8565
+ ggml_compute_forward_log_f32(params, dst);
8374
8566
  } break;
8375
8567
  default:
8376
8568
  {
@@ -8383,8 +8575,10 @@ static void ggml_compute_forward_log(
8383
8575
 
8384
8576
  static void ggml_compute_forward_sum_f32(
8385
8577
  const struct ggml_compute_params * params,
8386
- const struct ggml_tensor * src0,
8387
8578
  struct ggml_tensor * dst) {
8579
+
8580
+ const struct ggml_tensor * src0 = dst->src[0];
8581
+
8388
8582
  assert(params->ith == 0);
8389
8583
  assert(ggml_is_scalar(dst));
8390
8584
 
@@ -8416,8 +8610,10 @@ static void ggml_compute_forward_sum_f32(
8416
8610
 
8417
8611
  static void ggml_compute_forward_sum_f16(
8418
8612
  const struct ggml_compute_params * params,
8419
- const struct ggml_tensor * src0,
8420
8613
  struct ggml_tensor * dst) {
8614
+
8615
+ const struct ggml_tensor * src0 = dst->src[0];
8616
+
8421
8617
  assert(params->ith == 0);
8422
8618
  assert(ggml_is_scalar(dst));
8423
8619
 
@@ -8448,16 +8644,18 @@ static void ggml_compute_forward_sum_f16(
8448
8644
 
8449
8645
  static void ggml_compute_forward_sum(
8450
8646
  const struct ggml_compute_params * params,
8451
- const struct ggml_tensor * src0,
8452
8647
  struct ggml_tensor * dst) {
8648
+
8649
+ const struct ggml_tensor * src0 = dst->src[0];
8650
+
8453
8651
  switch (src0->type) {
8454
8652
  case GGML_TYPE_F32:
8455
8653
  {
8456
- ggml_compute_forward_sum_f32(params, src0, dst);
8654
+ ggml_compute_forward_sum_f32(params, dst);
8457
8655
  } break;
8458
8656
  case GGML_TYPE_F16:
8459
8657
  {
8460
- ggml_compute_forward_sum_f16(params, src0, dst);
8658
+ ggml_compute_forward_sum_f16(params, dst);
8461
8659
  } break;
8462
8660
  default:
8463
8661
  {
@@ -8470,8 +8668,10 @@ static void ggml_compute_forward_sum(
8470
8668
 
8471
8669
  static void ggml_compute_forward_sum_rows_f32(
8472
8670
  const struct ggml_compute_params * params,
8473
- const struct ggml_tensor * src0,
8474
8671
  struct ggml_tensor * dst) {
8672
+
8673
+ const struct ggml_tensor * src0 = dst->src[0];
8674
+
8475
8675
  GGML_ASSERT(params->ith == 0);
8476
8676
 
8477
8677
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
@@ -8503,12 +8703,14 @@ static void ggml_compute_forward_sum_rows_f32(
8503
8703
 
8504
8704
  static void ggml_compute_forward_sum_rows(
8505
8705
  const struct ggml_compute_params * params,
8506
- const struct ggml_tensor * src0,
8507
8706
  struct ggml_tensor * dst) {
8707
+
8708
+ const struct ggml_tensor * src0 = dst->src[0];
8709
+
8508
8710
  switch (src0->type) {
8509
8711
  case GGML_TYPE_F32:
8510
8712
  {
8511
- ggml_compute_forward_sum_rows_f32(params, src0, dst);
8713
+ ggml_compute_forward_sum_rows_f32(params, dst);
8512
8714
  } break;
8513
8715
  default:
8514
8716
  {
@@ -8521,8 +8723,10 @@ static void ggml_compute_forward_sum_rows(
8521
8723
 
8522
8724
  static void ggml_compute_forward_mean_f32(
8523
8725
  const struct ggml_compute_params * params,
8524
- const struct ggml_tensor * src0,
8525
8726
  struct ggml_tensor * dst) {
8727
+
8728
+ const struct ggml_tensor * src0 = dst->src[0];
8729
+
8526
8730
  assert(params->ith == 0);
8527
8731
 
8528
8732
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
@@ -8558,12 +8762,14 @@ static void ggml_compute_forward_mean_f32(
8558
8762
 
8559
8763
  static void ggml_compute_forward_mean(
8560
8764
  const struct ggml_compute_params * params,
8561
- const struct ggml_tensor * src0,
8562
8765
  struct ggml_tensor * dst) {
8766
+
8767
+ const struct ggml_tensor * src0 = dst->src[0];
8768
+
8563
8769
  switch (src0->type) {
8564
8770
  case GGML_TYPE_F32:
8565
8771
  {
8566
- ggml_compute_forward_mean_f32(params, src0, dst);
8772
+ ggml_compute_forward_mean_f32(params, dst);
8567
8773
  } break;
8568
8774
  default:
8569
8775
  {
@@ -8576,8 +8782,10 @@ static void ggml_compute_forward_mean(
8576
8782
 
8577
8783
  static void ggml_compute_forward_argmax_f32(
8578
8784
  const struct ggml_compute_params * params,
8579
- const struct ggml_tensor * src0,
8580
8785
  struct ggml_tensor * dst) {
8786
+
8787
+ const struct ggml_tensor * src0 = dst->src[0];
8788
+
8581
8789
  assert(params->ith == 0);
8582
8790
 
8583
8791
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
@@ -8604,12 +8812,14 @@ static void ggml_compute_forward_argmax_f32(
8604
8812
 
8605
8813
  static void ggml_compute_forward_argmax(
8606
8814
  const struct ggml_compute_params * params,
8607
- const struct ggml_tensor * src0,
8608
8815
  struct ggml_tensor * dst) {
8816
+
8817
+ const struct ggml_tensor * src0 = dst->src[0];
8818
+
8609
8819
  switch (src0->type) {
8610
8820
  case GGML_TYPE_F32:
8611
8821
  {
8612
- ggml_compute_forward_argmax_f32(params, src0, dst);
8822
+ ggml_compute_forward_argmax_f32(params, dst);
8613
8823
  } break;
8614
8824
  default:
8615
8825
  {
@@ -8622,8 +8832,10 @@ static void ggml_compute_forward_argmax(
8622
8832
 
8623
8833
  static void ggml_compute_forward_repeat_f32(
8624
8834
  const struct ggml_compute_params * params,
8625
- const struct ggml_tensor * src0,
8626
8835
  struct ggml_tensor * dst) {
8836
+
8837
+ const struct ggml_tensor * src0 = dst->src[0];
8838
+
8627
8839
  GGML_ASSERT(params->ith == 0);
8628
8840
  GGML_ASSERT(ggml_can_repeat(src0, dst));
8629
8841
 
@@ -8665,8 +8877,10 @@ static void ggml_compute_forward_repeat_f32(
8665
8877
 
8666
8878
  static void ggml_compute_forward_repeat_f16(
8667
8879
  const struct ggml_compute_params * params,
8668
- const struct ggml_tensor * src0,
8669
8880
  struct ggml_tensor * dst) {
8881
+
8882
+ const struct ggml_tensor * src0 = dst->src[0];
8883
+
8670
8884
  GGML_ASSERT(params->ith == 0);
8671
8885
  GGML_ASSERT(ggml_can_repeat(src0, dst));
8672
8886
 
@@ -8711,18 +8925,20 @@ static void ggml_compute_forward_repeat_f16(
8711
8925
 
8712
8926
  static void ggml_compute_forward_repeat(
8713
8927
  const struct ggml_compute_params * params,
8714
- const struct ggml_tensor * src0,
8715
8928
  struct ggml_tensor * dst) {
8929
+
8930
+ const struct ggml_tensor * src0 = dst->src[0];
8931
+
8716
8932
  switch (src0->type) {
8717
8933
  case GGML_TYPE_F16:
8718
8934
  case GGML_TYPE_I16:
8719
8935
  {
8720
- ggml_compute_forward_repeat_f16(params, src0, dst);
8936
+ ggml_compute_forward_repeat_f16(params, dst);
8721
8937
  } break;
8722
8938
  case GGML_TYPE_F32:
8723
8939
  case GGML_TYPE_I32:
8724
8940
  {
8725
- ggml_compute_forward_repeat_f32(params, src0, dst);
8941
+ ggml_compute_forward_repeat_f32(params, dst);
8726
8942
  } break;
8727
8943
  default:
8728
8944
  {
@@ -8735,8 +8951,10 @@ static void ggml_compute_forward_repeat(
8735
8951
 
8736
8952
  static void ggml_compute_forward_repeat_back_f32(
8737
8953
  const struct ggml_compute_params * params,
8738
- const struct ggml_tensor * src0,
8739
8954
  struct ggml_tensor * dst) {
8955
+
8956
+ const struct ggml_tensor * src0 = dst->src[0];
8957
+
8740
8958
  GGML_ASSERT(params->ith == 0);
8741
8959
  GGML_ASSERT(ggml_can_repeat(dst, src0));
8742
8960
 
@@ -8792,12 +9010,14 @@ static void ggml_compute_forward_repeat_back_f32(
8792
9010
 
8793
9011
  static void ggml_compute_forward_repeat_back(
8794
9012
  const struct ggml_compute_params * params,
8795
- const struct ggml_tensor * src0,
8796
9013
  struct ggml_tensor * dst) {
9014
+
9015
+ const struct ggml_tensor * src0 = dst->src[0];
9016
+
8797
9017
  switch (src0->type) {
8798
9018
  case GGML_TYPE_F32:
8799
9019
  {
8800
- ggml_compute_forward_repeat_back_f32(params, src0, dst);
9020
+ ggml_compute_forward_repeat_back_f32(params, dst);
8801
9021
  } break;
8802
9022
  default:
8803
9023
  {
@@ -8810,10 +9030,11 @@ static void ggml_compute_forward_repeat_back(
8810
9030
 
8811
9031
  static void ggml_compute_forward_concat_f32(
8812
9032
  const struct ggml_compute_params * params,
8813
- const struct ggml_tensor * src0,
8814
- const struct ggml_tensor * src1,
8815
9033
  struct ggml_tensor * dst) {
8816
9034
 
9035
+ const struct ggml_tensor * src0 = dst->src[0];
9036
+ const struct ggml_tensor * src1 = dst->src[1];
9037
+
8817
9038
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
8818
9039
  return;
8819
9040
  }
@@ -8858,14 +9079,15 @@ static void ggml_compute_forward_concat_f32(
8858
9079
 
8859
9080
  static void ggml_compute_forward_concat(
8860
9081
  const struct ggml_compute_params* params,
8861
- const struct ggml_tensor* src0,
8862
- const struct ggml_tensor* src1,
8863
9082
  struct ggml_tensor* dst) {
9083
+
9084
+ const struct ggml_tensor * src0 = dst->src[0];
9085
+
8864
9086
  switch (src0->type) {
8865
9087
  case GGML_TYPE_F32:
8866
9088
  case GGML_TYPE_I32:
8867
9089
  {
8868
- ggml_compute_forward_concat_f32(params, src0, src1, dst);
9090
+ ggml_compute_forward_concat_f32(params, dst);
8869
9091
  } break;
8870
9092
  default:
8871
9093
  {
@@ -8878,8 +9100,10 @@ static void ggml_compute_forward_concat(
8878
9100
 
8879
9101
  static void ggml_compute_forward_abs_f32(
8880
9102
  const struct ggml_compute_params * params,
8881
- const struct ggml_tensor * src0,
8882
9103
  struct ggml_tensor * dst) {
9104
+
9105
+ const struct ggml_tensor * src0 = dst->src[0];
9106
+
8883
9107
  assert(params->ith == 0);
8884
9108
  assert(ggml_are_same_shape(src0, dst));
8885
9109
 
@@ -8902,12 +9126,14 @@ static void ggml_compute_forward_abs_f32(
8902
9126
 
8903
9127
  static void ggml_compute_forward_abs(
8904
9128
  const struct ggml_compute_params * params,
8905
- const struct ggml_tensor * src0,
8906
9129
  struct ggml_tensor * dst) {
9130
+
9131
+ const struct ggml_tensor * src0 = dst->src[0];
9132
+
8907
9133
  switch (src0->type) {
8908
9134
  case GGML_TYPE_F32:
8909
9135
  {
8910
- ggml_compute_forward_abs_f32(params, src0, dst);
9136
+ ggml_compute_forward_abs_f32(params, dst);
8911
9137
  } break;
8912
9138
  default:
8913
9139
  {
@@ -8920,8 +9146,10 @@ static void ggml_compute_forward_abs(
8920
9146
 
8921
9147
  static void ggml_compute_forward_sgn_f32(
8922
9148
  const struct ggml_compute_params * params,
8923
- const struct ggml_tensor * src0,
8924
9149
  struct ggml_tensor * dst) {
9150
+
9151
+ const struct ggml_tensor * src0 = dst->src[0];
9152
+
8925
9153
  assert(params->ith == 0);
8926
9154
  assert(ggml_are_same_shape(src0, dst));
8927
9155
 
@@ -8944,12 +9172,14 @@ static void ggml_compute_forward_sgn_f32(
8944
9172
 
8945
9173
  static void ggml_compute_forward_sgn(
8946
9174
  const struct ggml_compute_params * params,
8947
- const struct ggml_tensor * src0,
8948
9175
  struct ggml_tensor * dst) {
9176
+
9177
+ const struct ggml_tensor * src0 = dst->src[0];
9178
+
8949
9179
  switch (src0->type) {
8950
9180
  case GGML_TYPE_F32:
8951
9181
  {
8952
- ggml_compute_forward_sgn_f32(params, src0, dst);
9182
+ ggml_compute_forward_sgn_f32(params, dst);
8953
9183
  } break;
8954
9184
  default:
8955
9185
  {
@@ -8962,8 +9192,10 @@ static void ggml_compute_forward_sgn(
8962
9192
 
8963
9193
  static void ggml_compute_forward_neg_f32(
8964
9194
  const struct ggml_compute_params * params,
8965
- const struct ggml_tensor * src0,
8966
9195
  struct ggml_tensor * dst) {
9196
+
9197
+ const struct ggml_tensor * src0 = dst->src[0];
9198
+
8967
9199
  assert(params->ith == 0);
8968
9200
  assert(ggml_are_same_shape(src0, dst));
8969
9201
 
@@ -8986,12 +9218,14 @@ static void ggml_compute_forward_neg_f32(
 
  static void ggml_compute_forward_neg(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * src0,
  struct ggml_tensor * dst) {
+
+ const struct ggml_tensor * src0 = dst->src[0];
+
  switch (src0->type) {
  case GGML_TYPE_F32:
  {
- ggml_compute_forward_neg_f32(params, src0, dst);
+ ggml_compute_forward_neg_f32(params, dst);
  } break;
  default:
  {
@@ -9004,8 +9238,10 @@ static void ggml_compute_forward_neg(
 
  static void ggml_compute_forward_step_f32(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * src0,
  struct ggml_tensor * dst) {
+
+ const struct ggml_tensor * src0 = dst->src[0];
+
  assert(params->ith == 0);
  assert(ggml_are_same_shape(src0, dst));
 
@@ -9028,12 +9264,14 @@ static void ggml_compute_forward_step_f32(
 
  static void ggml_compute_forward_step(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * src0,
  struct ggml_tensor * dst) {
+
+ const struct ggml_tensor * src0 = dst->src[0];
+
  switch (src0->type) {
  case GGML_TYPE_F32:
  {
- ggml_compute_forward_step_f32(params, src0, dst);
+ ggml_compute_forward_step_f32(params, dst);
  } break;
  default:
  {
@@ -9046,8 +9284,10 @@ static void ggml_compute_forward_step(
 
  static void ggml_compute_forward_tanh_f32(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * src0,
  struct ggml_tensor * dst) {
+
+ const struct ggml_tensor * src0 = dst->src[0];
+
  assert(params->ith == 0);
  assert(ggml_are_same_shape(src0, dst));
 
@@ -9070,12 +9310,14 @@ static void ggml_compute_forward_tanh_f32(
 
  static void ggml_compute_forward_tanh(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * src0,
  struct ggml_tensor * dst) {
+
+ const struct ggml_tensor * src0 = dst->src[0];
+
  switch (src0->type) {
  case GGML_TYPE_F32:
  {
- ggml_compute_forward_tanh_f32(params, src0, dst);
+ ggml_compute_forward_tanh_f32(params, dst);
  } break;
  default:
  {
@@ -9088,8 +9330,10 @@ static void ggml_compute_forward_tanh(
 
  static void ggml_compute_forward_elu_f32(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * src0,
  struct ggml_tensor * dst) {
+
+ const struct ggml_tensor * src0 = dst->src[0];
+
  assert(params->ith == 0);
  assert(ggml_are_same_shape(src0, dst));
 
@@ -9112,12 +9356,14 @@ static void ggml_compute_forward_elu_f32(
 
  static void ggml_compute_forward_elu(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * src0,
  struct ggml_tensor * dst) {
+
+ const struct ggml_tensor * src0 = dst->src[0];
+
  switch (src0->type) {
  case GGML_TYPE_F32:
  {
- ggml_compute_forward_elu_f32(params, src0, dst);
+ ggml_compute_forward_elu_f32(params, dst);
  } break;
  default:
  {
@@ -9130,8 +9376,10 @@ static void ggml_compute_forward_elu(
 
  static void ggml_compute_forward_relu_f32(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * src0,
  struct ggml_tensor * dst) {
+
+ const struct ggml_tensor * src0 = dst->src[0];
+
  assert(params->ith == 0);
  assert(ggml_are_same_shape(src0, dst));
 
@@ -9154,12 +9402,14 @@ static void ggml_compute_forward_relu_f32(
 
  static void ggml_compute_forward_relu(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * src0,
  struct ggml_tensor * dst) {
+
+ const struct ggml_tensor * src0 = dst->src[0];
+
  switch (src0->type) {
  case GGML_TYPE_F32:
  {
- ggml_compute_forward_relu_f32(params, src0, dst);
+ ggml_compute_forward_relu_f32(params, dst);
  } break;
  default:
  {
@@ -9172,8 +9422,10 @@ static void ggml_compute_forward_relu(
 
  static void ggml_compute_forward_gelu_f32(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * src0,
  struct ggml_tensor * dst) {
+
+ const struct ggml_tensor * src0 = dst->src[0];
+
  GGML_ASSERT(ggml_is_contiguous_except_dim_1(src0));
  GGML_ASSERT(ggml_is_contiguous_except_dim_1(dst));
  GGML_ASSERT(ggml_are_same_shape(src0, dst));
@@ -9213,12 +9465,14 @@ static void ggml_compute_forward_gelu_f32(
 
  static void ggml_compute_forward_gelu(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * src0,
  struct ggml_tensor * dst) {
+
+ const struct ggml_tensor * src0 = dst->src[0];
+
  switch (src0->type) {
  case GGML_TYPE_F32:
  {
- ggml_compute_forward_gelu_f32(params, src0, dst);
+ ggml_compute_forward_gelu_f32(params, dst);
  } break;
  default:
  {
@@ -9231,8 +9485,10 @@ static void ggml_compute_forward_gelu(
 
  static void ggml_compute_forward_gelu_quick_f32(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * src0,
  struct ggml_tensor * dst) {
+
+ const struct ggml_tensor * src0 = dst->src[0];
+
  GGML_ASSERT(ggml_is_contiguous_except_dim_1(src0));
  GGML_ASSERT(ggml_is_contiguous_except_dim_1(dst));
  GGML_ASSERT(ggml_are_same_shape(src0, dst));
@@ -9272,12 +9528,14 @@ static void ggml_compute_forward_gelu_quick_f32(
 
  static void ggml_compute_forward_gelu_quick(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * src0,
  struct ggml_tensor * dst) {
+
+ const struct ggml_tensor * src0 = dst->src[0];
+
  switch (src0->type) {
  case GGML_TYPE_F32:
  {
- ggml_compute_forward_gelu_quick_f32(params, src0, dst);
+ ggml_compute_forward_gelu_quick_f32(params, dst);
  } break;
  default:
  {
@@ -9290,8 +9548,10 @@ static void ggml_compute_forward_gelu_quick(
 
  static void ggml_compute_forward_silu_f32(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * src0,
  struct ggml_tensor * dst) {
+
+ const struct ggml_tensor * src0 = dst->src[0];
+
  GGML_ASSERT(ggml_is_contiguous_except_dim_1(src0));
  GGML_ASSERT(ggml_is_contiguous_except_dim_1(dst));
  GGML_ASSERT(ggml_are_same_shape(src0, dst));
@@ -9331,12 +9591,14 @@ static void ggml_compute_forward_silu_f32(
 
  static void ggml_compute_forward_silu(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * src0,
  struct ggml_tensor * dst) {
+
+ const struct ggml_tensor * src0 = dst->src[0];
+
  switch (src0->type) {
  case GGML_TYPE_F32:
  {
- ggml_compute_forward_silu_f32(params, src0, dst);
+ ggml_compute_forward_silu_f32(params, dst);
  } break;
  default:
  {
@@ -9348,8 +9610,10 @@ static void ggml_compute_forward_silu(
 
  static void ggml_compute_forward_leaky_relu_f32(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * src0,
  struct ggml_tensor * dst) {
+
+ const struct ggml_tensor * src0 = dst->src[0];
+
  assert(params->ith == 0);
  assert(ggml_are_same_shape(src0, dst));
 
@@ -9375,12 +9639,14 @@ static void ggml_compute_forward_leaky_relu_f32(
 
  static void ggml_compute_forward_leaky_relu(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * src0,
  struct ggml_tensor * dst) {
+
+ const struct ggml_tensor * src0 = dst->src[0];
+
  switch (src0->type) {
  case GGML_TYPE_F32:
  {
- ggml_compute_forward_leaky_relu_f32(params, src0, dst);
+ ggml_compute_forward_leaky_relu_f32(params, dst);
  } break;
  default:
  {
@@ -9393,9 +9659,11 @@ static void ggml_compute_forward_leaky_relu(
 
  static void ggml_compute_forward_silu_back_f32(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * src0,
- const struct ggml_tensor * grad,
  struct ggml_tensor * dst) {
+
+ const struct ggml_tensor * src0 = dst->src[0];
+ const struct ggml_tensor * grad = dst->src[1];
+
  GGML_ASSERT(ggml_is_contiguous_except_dim_1(grad));
  GGML_ASSERT(ggml_is_contiguous_except_dim_1(src0));
  GGML_ASSERT(ggml_is_contiguous_except_dim_1(dst));
@@ -9438,13 +9706,14 @@ static void ggml_compute_forward_silu_back_f32(
 
  static void ggml_compute_forward_silu_back(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * src0,
- const struct ggml_tensor * grad,
  struct ggml_tensor * dst) {
+
+ const struct ggml_tensor * src0 = dst->src[0];
+
  switch (src0->type) {
  case GGML_TYPE_F32:
  {
- ggml_compute_forward_silu_back_f32(params, src0, grad, dst);
+ ggml_compute_forward_silu_back_f32(params, dst);
  } break;
  default:
  {
@@ -9456,8 +9725,10 @@ static void ggml_compute_forward_silu_back(
 
  static void ggml_compute_forward_hardswish_f32(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * src0,
  struct ggml_tensor * dst) {
+
+ const struct ggml_tensor * src0 = dst->src[0];
+
  assert(params->ith == 0);
  assert(ggml_are_same_shape(src0, dst));
 
@@ -9479,12 +9750,14 @@ static void ggml_compute_forward_hardswish_f32(
  }
  static void ggml_compute_forward_hardswish(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * src0,
  struct ggml_tensor * dst) {
+
+ const struct ggml_tensor * src0 = dst->src[0];
+
  switch (src0->type) {
  case GGML_TYPE_F32:
  {
- ggml_compute_forward_hardswish_f32(params, src0, dst);
+ ggml_compute_forward_hardswish_f32(params, dst);
  } break;
  default:
  {
@@ -9495,8 +9768,10 @@ static void ggml_compute_forward_hardswish(
 
  static void ggml_compute_forward_hardsigmoid_f32(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * src0,
  struct ggml_tensor * dst) {
+
+ const struct ggml_tensor * src0 = dst->src[0];
+
  assert(params->ith == 0);
  assert(ggml_are_same_shape(src0, dst));
 
@@ -9519,12 +9794,14 @@ static void ggml_compute_forward_hardsigmoid_f32(
 
  static void ggml_compute_forward_hardsigmoid(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * src0,
  struct ggml_tensor * dst) {
+
+ const struct ggml_tensor * src0 = dst->src[0];
+
  switch (src0->type) {
  case GGML_TYPE_F32:
  {
- ggml_compute_forward_hardsigmoid_f32(params, src0, dst);
+ ggml_compute_forward_hardsigmoid_f32(params, dst);
  } break;
  default:
  {
@@ -9538,8 +9815,10 @@ static void ggml_compute_forward_hardsigmoid(
 
  static void ggml_compute_forward_norm_f32(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * src0,
  struct ggml_tensor * dst) {
+
+ const struct ggml_tensor * src0 = dst->src[0];
+
  GGML_ASSERT(ggml_are_same_shape(src0, dst));
 
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
@@ -9591,12 +9870,14 @@ static void ggml_compute_forward_norm_f32(
 
  static void ggml_compute_forward_norm(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * src0,
  struct ggml_tensor * dst) {
+
+ const struct ggml_tensor * src0 = dst->src[0];
+
  switch (src0->type) {
  case GGML_TYPE_F32:
  {
- ggml_compute_forward_norm_f32(params, src0, dst);
+ ggml_compute_forward_norm_f32(params, dst);
  } break;
  default:
  {
@@ -9609,8 +9890,10 @@ static void ggml_compute_forward_norm(
 
  static void ggml_compute_forward_rms_norm_f32(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * src0,
  struct ggml_tensor * dst) {
+
+ const struct ggml_tensor * src0 = dst->src[0];
+
  GGML_ASSERT(ggml_are_same_shape(src0, dst));
 
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
@@ -9659,12 +9942,14 @@ static void ggml_compute_forward_rms_norm_f32(
 
  static void ggml_compute_forward_rms_norm(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * src0,
  struct ggml_tensor * dst) {
+
+ const struct ggml_tensor * src0 = dst->src[0];
+
  switch (src0->type) {
  case GGML_TYPE_F32:
  {
- ggml_compute_forward_rms_norm_f32(params, src0, dst);
+ ggml_compute_forward_rms_norm_f32(params, dst);
  } break;
  default:
  {
@@ -9675,9 +9960,11 @@ static void ggml_compute_forward_rms_norm(
 
  static void ggml_compute_forward_rms_norm_back_f32(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * src0,
- const struct ggml_tensor * src1,
  struct ggml_tensor * dst) {
+
+ const struct ggml_tensor * src0 = dst->src[0];
+ const struct ggml_tensor * src1 = dst->src[1];
+
  GGML_ASSERT(ggml_are_same_shape(src0, dst) && ggml_are_same_shape(src0, src1));
 
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
@@ -9832,13 +10119,14 @@ static void ggml_compute_forward_rms_norm_back_f32(
 
  static void ggml_compute_forward_rms_norm_back(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * src0,
- const struct ggml_tensor * src1,
  struct ggml_tensor * dst) {
+
+ const struct ggml_tensor * src0 = dst->src[0];
+
  switch (src0->type) {
  case GGML_TYPE_F32:
  {
- ggml_compute_forward_rms_norm_back_f32(params, src0, src1, dst);
+ ggml_compute_forward_rms_norm_back_f32(params, dst);
  } break;
  default:
  {
@@ -9851,8 +10139,10 @@ static void ggml_compute_forward_rms_norm_back(
 
  static void ggml_compute_forward_group_norm_f32(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * src0,
  struct ggml_tensor * dst) {
+
+ const struct ggml_tensor * src0 = dst->src[0];
+
  GGML_ASSERT(ggml_are_same_shape(src0, dst));
 
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
@@ -9923,12 +10213,14 @@ static void ggml_compute_forward_group_norm_f32(
 
  static void ggml_compute_forward_group_norm(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * src0,
  struct ggml_tensor * dst) {
+
+ const struct ggml_tensor * src0 = dst->src[0];
+
  switch (src0->type) {
  case GGML_TYPE_F32:
  {
- ggml_compute_forward_group_norm_f32(params, src0, dst);
+ ggml_compute_forward_group_norm_f32(params, dst);
  } break;
  default:
  {
@@ -9974,9 +10266,11 @@ static bool ggml_compute_forward_mul_mat_use_blas(struct ggml_tensor * dst) {
 
  static void ggml_compute_forward_mul_mat(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * src0,
- const struct ggml_tensor * src1,
  struct ggml_tensor * dst) {
+
+ const struct ggml_tensor * src0 = dst->src[0];
+ const struct ggml_tensor * src1 = dst->src[1];
+
  int64_t t0 = ggml_perf_time_us();
  UNUSED(t0);
 
@@ -9992,6 +10286,7 @@ static void ggml_compute_forward_mul_mat(
  ggml_vec_dot_t const vec_dot = type_traits[type].vec_dot;
  enum ggml_type const vec_dot_type = type_traits[type].vec_dot_type;
  ggml_from_float_t const from_float_to_vec_dot = type_traits[vec_dot_type].from_float;
+ int64_t const vec_dot_num_rows = type_traits[type].nrows;
 
  GGML_ASSERT(ne0 == ne01);
  GGML_ASSERT(ne1 == ne11);
@@ -10159,12 +10454,23 @@ static void ggml_compute_forward_mul_mat(
  const int64_t blck_0 = 16;
  const int64_t blck_1 = 16;
 
+ // dot kernels can handle 1 row and col at a time, but mmla kernels can process 2 rows and cols
+ int64_t nrc = vec_dot_num_rows;
+ // TODO: currently the mmla kernels support only even numbered rows/cols.
+ // this check can be removed once they are extended to support odd numbered rows/cols too
+ if ((nr0 % 2 != 0) || (ne11 % 2 != 0)) {
+ nrc = 1;
+ }
+
+ const size_t src1_col_stride = src1_cont || src1->type != vec_dot_type ? row_size : nb11;
+
  // attempt to reduce false-sharing (does not seem to make a difference)
- float tmp[16];
+ // 16 * 2, accounting for mmla kernels
+ float tmp[32];
 
  for (int64_t iir1 = ir110; iir1 < ir111; iir1 += blck_1) {
  for (int64_t iir0 = ir010; iir0 < ir011; iir0 += blck_0) {
- for (int64_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir111; ++ir1) {
+ for (int64_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir111; ir1 += nrc) {
  const int64_t i13 = (ir1/(ne12*ne1));
  const int64_t i12 = (ir1 - i13*ne12*ne1)/ne1;
  const int64_t i11 = (ir1 - i13*ne12*ne1 - i12*ne1);
@@ -10187,17 +10493,19 @@ static void ggml_compute_forward_mul_mat(
  (src1_cont || src1->type != vec_dot_type
  ? (i11 + i12*ne11 + i13*ne12*ne11)*row_size
  : (i11*nb11 + i12*nb12 + i13*nb13));
-
  float * dst_col = (float *) ((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3));
 
  //for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) {
  // vec_dot(ne00, &dst_col[ir0], src0_row + ir0*nb01, src1_col);
  //}
 
- for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) {
- vec_dot(ne00, &tmp[ir0 - iir0], src0_row + ir0*nb01, src1_col);
+ for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ir0 += nrc) {
+ vec_dot(ne00, &tmp[ir0 - iir0], (nrc>1 ? 16 : 0), src0_row + ir0*nb01, (nrc>1 ? nb01 : 0), src1_col, (nrc>1 ? src1_col_stride : 0), nrc);
+ }
+
+ for (int cn = 0; cn < nrc; ++cn) {
+ memcpy(&dst_col[iir0 + cn*nb1/nb0], tmp + (cn*16), (MIN(iir0 + blck_0, ir011) - iir0)*sizeof(float));
  }
- memcpy(&dst_col[iir0], tmp, (MIN(iir0 + blck_0, ir011) - iir0)*sizeof(float));
  }
  }
  }
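Together with the prototype change near the top of the file, these mul_mat hunks are the heart of this release's performance work: ggml_vec_dot_* now takes result/operand strides plus a row count, vec_dot(n, s, bs, x, bx, y, by, nrc), so the ARM int8 matrix-multiply (mmla) kernels can produce a small tile of dot products per call instead of a single value. A scalar reference sketch of what I read the extended contract to be from the call sites above; the nrc x nrc tile interpretation is an assumption drawn from the tmp[32] layout and the cn*nb1/nb0 copy-out, with bs an element stride and bx/by byte strides:

    #include <stddef.h>

    // reference semantics (unoptimized) of the extended vec_dot interface:
    // computes an nrc x nrc tile of dot products between consecutive rows
    // of x (stride bx bytes) and consecutive columns of y (stride by bytes)
    static void vec_dot_f32_ref(int n, float * s, size_t bs,
                                const float * x, size_t bx,
                                const float * y, size_t by, int nrc) {
        for (int c = 0; c < nrc; ++c) {          // src1 columns
            for (int r = 0; r < nrc; ++r) {      // src0 rows
                const float * xr = (const float *)((const char *) x + r*bx);
                const float * yc = (const float *)((const char *) y + c*by);
                float sum = 0.0f;
                for (int i = 0; i < n; ++i) {
                    sum += xr[i]*yc[i];
                }
                s[c*bs + r] = sum;               // result stride in floats
            }
        }
    }

For nrc == 1 the strides are irrelevant and this degenerates to the old single-dot behaviour, which is why the remaining callers in this diff simply pass zero strides and a row count of 1.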
@@ -10207,10 +10515,11 @@ static void ggml_compute_forward_mul_mat(
 
  static void ggml_compute_forward_mul_mat_id(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * ids,
- const struct ggml_tensor * src1,
  struct ggml_tensor * dst) {
 
+ const struct ggml_tensor * ids = dst->src[0];
+ const struct ggml_tensor * src1 = dst->src[1];
+
  const struct ggml_tensor * src0 = dst->src[2]; // only for GGML_TENSOR_BINARY_OP_LOCALS
 
  GGML_TENSOR_BINARY_OP_LOCALS
@@ -10386,7 +10695,7 @@ static void ggml_compute_forward_mul_mat_id(
  //}
 
  for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) {
- vec_dot(ne00, &tmp[ir0 - iir0], src0_row + ir0*nb01, src1_col);
+ vec_dot(ne00, &tmp[ir0 - iir0], 0, src0_row + ir0*nb01, 0, src1_col, 0, 1);
  }
  memcpy(&dst_col[iir0], tmp, (MIN(iir0 + blck_0, ir011) - iir0)*sizeof(float));
  }
@@ -10401,9 +10710,11 @@ static void ggml_compute_forward_mul_mat_id(
 
  static void ggml_compute_forward_out_prod_f32(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * src0,
- const struct ggml_tensor * src1,
  struct ggml_tensor * dst) {
+
+ const struct ggml_tensor * src0 = dst->src[0];
+ const struct ggml_tensor * src1 = dst->src[1];
+
  // int64_t t0 = ggml_perf_time_us();
  // UNUSED(t0);
 
@@ -10593,9 +10904,11 @@ static void ggml_compute_forward_out_prod_f32(
 
  static void ggml_compute_forward_out_prod_q_f32(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * src0,
- const struct ggml_tensor * src1,
  struct ggml_tensor * dst) {
+
+ const struct ggml_tensor * src0 = dst->src[0];
+ const struct ggml_tensor * src1 = dst->src[1];
+
  // int64_t t0 = ggml_perf_time_us();
  // UNUSED(t0);
 
@@ -10706,9 +11019,10 @@ static void ggml_compute_forward_out_prod_q_f32(
 
  static void ggml_compute_forward_out_prod(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * src0,
- const struct ggml_tensor * src1,
  struct ggml_tensor * dst) {
+
+ const struct ggml_tensor * src0 = dst->src[0];
+
  switch (src0->type) {
  case GGML_TYPE_Q4_0:
  case GGML_TYPE_Q4_1:
@@ -10723,17 +11037,19 @@ static void ggml_compute_forward_out_prod(
  case GGML_TYPE_IQ2_XXS:
  case GGML_TYPE_IQ2_XS:
  case GGML_TYPE_IQ3_XXS:
+ case GGML_TYPE_IQ1_S:
+ case GGML_TYPE_IQ4_NL:
  {
- ggml_compute_forward_out_prod_q_f32(params, src0, src1, dst);
+ ggml_compute_forward_out_prod_q_f32(params, dst);
  } break;
  case GGML_TYPE_F16:
  {
  GGML_ASSERT(false); // todo
- // ggml_compute_forward_out_prod_f16_f32(params, src0, src1, dst);
+ // ggml_compute_forward_out_prod_f16_f32(params, dst);
  } break;
  case GGML_TYPE_F32:
  {
- ggml_compute_forward_out_prod_f32(params, src0, src1, dst);
+ ggml_compute_forward_out_prod_f32(params, dst);
  } break;
  default:
  {
@@ -10746,8 +11062,10 @@ static void ggml_compute_forward_out_prod(
 
  static void ggml_compute_forward_scale_f32(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * src0,
  struct ggml_tensor * dst) {
+
+ const struct ggml_tensor * src0 = dst->src[0];
+
  GGML_ASSERT(ggml_is_contiguous(src0));
  GGML_ASSERT(ggml_is_contiguous(dst));
  GGML_ASSERT(ggml_are_same_shape(src0, dst));
@@ -10788,12 +11106,14 @@ static void ggml_compute_forward_scale_f32(
 
  static void ggml_compute_forward_scale(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * src0,
  struct ggml_tensor * dst) {
+
+ const struct ggml_tensor * src0 = dst->src[0];
+
  switch (src0->type) {
  case GGML_TYPE_F32:
  {
- ggml_compute_forward_scale_f32(params, src0, dst);
+ ggml_compute_forward_scale_f32(params, dst);
  } break;
  default:
  {
@@ -10806,9 +11126,11 @@ static void ggml_compute_forward_scale(
 
  static void ggml_compute_forward_set_f32(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * src0,
- const struct ggml_tensor * src1,
  struct ggml_tensor * dst) {
+
+ const struct ggml_tensor * src0 = dst->src[0];
+ const struct ggml_tensor * src1 = dst->src[1];
+
  GGML_ASSERT(ggml_are_same_shape(src0, dst));
  GGML_ASSERT(ggml_is_contiguous(dst) && ggml_is_contiguous(src0));
 
@@ -10879,14 +11201,14 @@ static void ggml_compute_forward_set_f32(
 
  static void ggml_compute_forward_set(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * src0,
- const struct ggml_tensor * src1,
  struct ggml_tensor * dst) {
 
+ const struct ggml_tensor * src0 = dst->src[0];
+
  switch (src0->type) {
  case GGML_TYPE_F32:
  {
- ggml_compute_forward_set_f32(params, src0, src1, dst);
+ ggml_compute_forward_set_f32(params, dst);
  } break;
  case GGML_TYPE_F16:
  case GGML_TYPE_Q4_0:
@@ -10903,6 +11225,8 @@ static void ggml_compute_forward_set(
  case GGML_TYPE_IQ2_XXS:
  case GGML_TYPE_IQ2_XS:
  case GGML_TYPE_IQ3_XXS:
+ case GGML_TYPE_IQ1_S:
+ case GGML_TYPE_IQ4_NL:
  default:
  {
  GGML_ASSERT(false);
@@ -10914,29 +11238,25 @@ static void ggml_compute_forward_set(
 
  static void ggml_compute_forward_cpy(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * src0,
  struct ggml_tensor * dst) {
- ggml_compute_forward_dup(params, src0, dst);
+ ggml_compute_forward_dup(params, dst);
  }
 
  // ggml_compute_forward_cont
 
  static void ggml_compute_forward_cont(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * src0,
  struct ggml_tensor * dst) {
- ggml_compute_forward_dup(params, src0, dst);
+ ggml_compute_forward_dup(params, dst);
  }
 
  // ggml_compute_forward_reshape
 
  static void ggml_compute_forward_reshape(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * src0,
  struct ggml_tensor * dst) {
  // NOP
  UNUSED(params);
- UNUSED(src0);
  UNUSED(dst);
  }
 
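cpy and cont remain one-line forwards to ggml_compute_forward_dup, while reshape (like view, permute, and transpose in the next hunk) stays a no-op: these operations only reinterpret the ne[]/nb[] shape and stride metadata of a tensor that aliases its source's data buffer, so there is nothing to compute at forward time. After the signature change the stub reduces to the form below (taken from the diff, with explanatory comments added):

    static void ggml_compute_forward_reshape(
            const struct ggml_compute_params * params,
            struct ggml_tensor * dst) {
        // NOP: dst shares its source's data; only shape/stride
        // metadata differ, so the forward pass has no work to do
        UNUSED(params);
        UNUSED(dst);
    }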
@@ -10944,39 +11264,41 @@ static void ggml_compute_forward_reshape(
 
  static void ggml_compute_forward_view(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * src0) {
+ const struct ggml_tensor * dst) {
  // NOP
  UNUSED(params);
- UNUSED(src0);
+ UNUSED(dst);
  }
 
  // ggml_compute_forward_permute
 
  static void ggml_compute_forward_permute(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * src0) {
+ const struct ggml_tensor * dst) {
  // NOP
  UNUSED(params);
- UNUSED(src0);
+ UNUSED(dst);
  }
 
  // ggml_compute_forward_transpose
 
  static void ggml_compute_forward_transpose(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * src0) {
+ const struct ggml_tensor * dst) {
  // NOP
  UNUSED(params);
- UNUSED(src0);
+ UNUSED(dst);
  }
 
  // ggml_compute_forward_get_rows
 
  static void ggml_compute_forward_get_rows_q(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * src0,
- const struct ggml_tensor * src1,
  struct ggml_tensor * dst) {
+
+ const struct ggml_tensor * src0 = dst->src[0];
+ const struct ggml_tensor * src1 = dst->src[1];
+
  assert(params->ith == 0);
 
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
@@ -11012,9 +11334,11 @@ static void ggml_compute_forward_get_rows_q(
 
  static void ggml_compute_forward_get_rows_f16(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * src0,
- const struct ggml_tensor * src1,
  struct ggml_tensor * dst) {
+
+ const struct ggml_tensor * src0 = dst->src[0];
+ const struct ggml_tensor * src1 = dst->src[1];
+
  assert(params->ith == 0);
 
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
@@ -11047,9 +11371,11 @@ static void ggml_compute_forward_get_rows_f16(
 
  static void ggml_compute_forward_get_rows_f32(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * src0,
- const struct ggml_tensor * src1,
  struct ggml_tensor * dst) {
+
+ const struct ggml_tensor * src0 = dst->src[0];
+ const struct ggml_tensor * src1 = dst->src[1];
+
  assert(params->ith == 0);
 
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
@@ -11082,9 +11408,10 @@ static void ggml_compute_forward_get_rows_f32(
 
  static void ggml_compute_forward_get_rows(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * src0,
- const struct ggml_tensor * src1,
  struct ggml_tensor * dst) {
+
+ const struct ggml_tensor * src0 = dst->src[0];
+
  switch (src0->type) {
  case GGML_TYPE_Q4_0:
  case GGML_TYPE_Q4_1:
@@ -11100,17 +11427,19 @@ static void ggml_compute_forward_get_rows(
  case GGML_TYPE_IQ2_XXS:
  case GGML_TYPE_IQ2_XS:
  case GGML_TYPE_IQ3_XXS:
+ case GGML_TYPE_IQ1_S:
+ case GGML_TYPE_IQ4_NL:
  {
- ggml_compute_forward_get_rows_q(params, src0, src1, dst);
+ ggml_compute_forward_get_rows_q(params, dst);
  } break;
  case GGML_TYPE_F16:
  {
- ggml_compute_forward_get_rows_f16(params, src0, src1, dst);
+ ggml_compute_forward_get_rows_f16(params, dst);
  } break;
  case GGML_TYPE_F32:
  case GGML_TYPE_I32:
  {
- ggml_compute_forward_get_rows_f32(params, src0, src1, dst);
+ ggml_compute_forward_get_rows_f32(params, dst);
  } break;
  default:
  {
@@ -11141,9 +11470,11 @@ static void ggml_compute_forward_get_rows(
 
  static void ggml_compute_forward_get_rows_back_f32_f16(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * src0,
- const struct ggml_tensor * src1,
  struct ggml_tensor * dst) {
+
+ const struct ggml_tensor * src0 = dst->src[0];
+ const struct ggml_tensor * src1 = dst->src[1];
+
  GGML_ASSERT(params->ith == 0);
  GGML_ASSERT(ggml_is_contiguous(dst));
 
@@ -11178,9 +11509,11 @@ static void ggml_compute_forward_get_rows_back_f32_f16(
 
  static void ggml_compute_forward_get_rows_back_f32(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * src0,
- const struct ggml_tensor * src1,
  struct ggml_tensor * dst) {
+
+ const struct ggml_tensor * src0 = dst->src[0];
+ const struct ggml_tensor * src1 = dst->src[1];
+
  GGML_ASSERT(params->ith == 0);
  GGML_ASSERT(ggml_is_contiguous(dst));
 
@@ -11215,17 +11548,18 @@ static void ggml_compute_forward_get_rows_back_f32(
 
  static void ggml_compute_forward_get_rows_back(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * src0,
- const struct ggml_tensor * src1,
  struct ggml_tensor * dst) {
+
+ const struct ggml_tensor * src0 = dst->src[0];
+
  switch (src0->type) {
  case GGML_TYPE_F16:
  {
- ggml_compute_forward_get_rows_back_f32_f16(params, src0, src1, dst);
+ ggml_compute_forward_get_rows_back_f32_f16(params, dst);
  } break;
  case GGML_TYPE_F32:
  {
- ggml_compute_forward_get_rows_back_f32(params, src0, src1, dst);
+ ggml_compute_forward_get_rows_back_f32(params, dst);
  } break;
  default:
  {
@@ -11256,8 +11590,10 @@ static void ggml_compute_forward_get_rows_back(
 
  static void ggml_compute_forward_diag_f32(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * src0,
  struct ggml_tensor * dst) {
+
+ const struct ggml_tensor * src0 = dst->src[0];
+
  GGML_ASSERT(params->ith == 0);
 
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
@@ -11296,12 +11632,14 @@ static void ggml_compute_forward_diag_f32(
 
  static void ggml_compute_forward_diag(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * src0,
  struct ggml_tensor * dst) {
+
+ const struct ggml_tensor * src0 = dst->src[0];
+
  switch (src0->type) {
  case GGML_TYPE_F32:
  {
- ggml_compute_forward_diag_f32(params, src0, dst);
+ ggml_compute_forward_diag_f32(params, dst);
  } break;
  default:
  {
@@ -11314,10 +11652,11 @@ static void ggml_compute_forward_diag(
 
  static void ggml_compute_forward_diag_mask_f32(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * src0,
  struct ggml_tensor * dst,
  const float value) {
 
+ const struct ggml_tensor * src0 = dst->src[0];
+
  const int ith = params->ith;
  const int nth = params->nth;
 
@@ -11367,12 +11706,14 @@ static void ggml_compute_forward_diag_mask_f32(
 
  static void ggml_compute_forward_diag_mask_inf(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * src0,
  struct ggml_tensor * dst) {
+
+ const struct ggml_tensor * src0 = dst->src[0];
+
  switch (src0->type) {
  case GGML_TYPE_F32:
  {
- ggml_compute_forward_diag_mask_f32(params, src0, dst, -INFINITY);
+ ggml_compute_forward_diag_mask_f32(params, dst, -INFINITY);
  } break;
  default:
  {
@@ -11383,12 +11724,14 @@ static void ggml_compute_forward_diag_mask_inf(
 
  static void ggml_compute_forward_diag_mask_zero(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * src0,
  struct ggml_tensor * dst) {
+
+ const struct ggml_tensor * src0 = dst->src[0];
+
  switch (src0->type) {
  case GGML_TYPE_F32:
  {
- ggml_compute_forward_diag_mask_f32(params, src0, dst, 0);
+ ggml_compute_forward_diag_mask_f32(params, dst, 0);
  } break;
  default:
  {
@@ -11401,9 +11744,12 @@ static void ggml_compute_forward_diag_mask_zero(
 
  static void ggml_compute_forward_soft_max_f32(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * src0,
- const struct ggml_tensor * src1,
  struct ggml_tensor * dst) {
+
+ const struct ggml_tensor * src0 = dst->src[0];
+ const struct ggml_tensor * src1 = dst->src[1];
+ const struct ggml_tensor * src2 = dst->src[2];
+
  assert(ggml_is_contiguous(dst));
  assert(ggml_are_same_shape(src0, dst));
 
@@ -11411,16 +11757,29 @@ static void ggml_compute_forward_soft_max_f32(
  return;
  }
 
- float scale = 1.0f;
- memcpy(&scale, (float *) dst->op_params + 0, sizeof(float));
+ float scale = 1.0f;
+ float max_bias = 0.0f;
+
+ memcpy(&scale, (float *) dst->op_params + 0, sizeof(float));
+ memcpy(&max_bias, (float *) dst->op_params + 1, sizeof(float));
 
  // TODO: handle transposed/permuted matrices
 
  const int ith = params->ith;
  const int nth = params->nth;
 
+ GGML_TENSOR_UNARY_OP_LOCALS
+
  const int64_t ne11 = src1 ? src1->ne[1] : 1;
 
+ // TODO: is this supposed to be ceil instead of floor?
+ // https://huggingface.co/mosaicml/mpt-7b/blob/main/attention.py#L370
+ const uint32_t n_head_kv = ne02;
+ const uint32_t n_head_log2 = 1u << (uint32_t) floor(log2(n_head_kv));
+
+ const float m0 = powf(2.0f, -(max_bias ) / n_head_log2);
+ const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
+
  const int nc = src0->ne[0];
  const int nr = ggml_nrows(src0);
@@ -11433,6 +11792,9 @@ static void ggml_compute_forward_soft_max_f32(
 
  float * wp = (float *) params->wdata + (nc + CACHE_LINE_SIZE_F32) * ith;
 
+ // when max_bias <= 0.0f, src2 is not used and we default it to src0 to avoid branching
+ float * pos = src2 ? (float *) src2->data : src0->data;
+
  for (int i1 = ir0; i1 < ir1; i1++) {
  float * sp = (float *)((char *) src0->data + i1*src0->nb[1]);
  float * dp = (float *)((char *) dst->data + i1*dst->nb[1]);
@@ -11446,6 +11808,16 @@ static void ggml_compute_forward_soft_max_f32(
  ggml_vec_acc_f32(nc, wp, mp);
  }
 
+ // ALiBi bias
+ if (max_bias > 0.0f) {
+ const uint32_t h = (i1/ne01)%ne02; // head
+ const float slope = h < n_head_log2 ? powf(m0, h + 1) : powf(m1, 2*(h - n_head_log2) + 1);
+
+ for (int i = 0; i < nc; i++) {
+ wp[i] = wp[i] + slope*pos[i];
+ }
+ }
+
 #ifndef NDEBUG
  for (int i = 0; i < nc; ++i) {
  //printf("p[%d] = %f\n", i, p[i]);
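With max_bias read from op_params[1], soft_max can now apply the ALiBi positional bias itself: before the usual max-subtract, exponentiate, and normalize sweep, each logit gets slope*pos[i] added, where the slope depends only on the head index. A standalone sketch of the slope schedule used above, mirroring the m0/m1 construction in the hunk (0-based head index h):

    #include <math.h>

    // per-head ALiBi slope: geometric in h, switching to a second,
    // denser series once h passes the largest power of two <= n_head_kv
    static float alibi_slope(unsigned h, unsigned n_head_kv, float max_bias) {
        const unsigned n_head_log2 = 1u << (unsigned) floor(log2(n_head_kv));
        const float m0 = powf(2.0f, -(max_bias       ) / n_head_log2);
        const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
        return h < n_head_log2 ? powf(m0, h + 1)
                               : powf(m1, 2*(h - n_head_log2) + 1);
    }

For the common case max_bias = 8 and a power-of-two head count n this yields the slopes 2^(-8/n), 2^(-16/n), ... from the ALiBi paper; when max_bias is 0 the bias path is skipped entirely, and pos falls back to src0->data purely to avoid a branch, as the comment in the hunk notes.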
@@ -11488,13 +11860,14 @@ static void ggml_compute_forward_soft_max_f32(
 
  static void ggml_compute_forward_soft_max(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * src0,
- const struct ggml_tensor * src1,
  struct ggml_tensor * dst) {
+
+ const struct ggml_tensor * src0 = dst->src[0];
+
  switch (src0->type) {
  case GGML_TYPE_F32:
  {
- ggml_compute_forward_soft_max_f32(params, src0, src1, dst);
+ ggml_compute_forward_soft_max_f32(params, dst);
  } break;
  default:
  {
@@ -11507,9 +11880,11 @@ static void ggml_compute_forward_soft_max(
 
  static void ggml_compute_forward_soft_max_back_f32(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * src0,
- const struct ggml_tensor * src1,
  struct ggml_tensor * dst) {
+
+ const struct ggml_tensor * src0 = dst->src[0];
+ const struct ggml_tensor * src1 = dst->src[1];
+
  GGML_ASSERT(ggml_is_contiguous(src0));
  GGML_ASSERT(ggml_is_contiguous(src1));
  GGML_ASSERT(ggml_is_contiguous(dst));
@@ -11568,7 +11943,7 @@ static void ggml_compute_forward_soft_max_back_f32(
 
  // linear runtime, no additional memory
  float dot_y_dy = 0;
- ggml_vec_dot_f32 (nc, &dot_y_dy, y, dy);
+ ggml_vec_dot_f32 (nc, &dot_y_dy, 0, y, 0, dy, 0, 1);
  ggml_vec_cpy_f32 (nc, dx, dy);
  ggml_vec_acc1_f32(nc, dx, -dot_y_dy);
  ggml_vec_mul_f32 (nc, dx, dx, y);
@@ -11584,13 +11959,14 @@ static void ggml_compute_forward_soft_max_back_f32(
 
  static void ggml_compute_forward_soft_max_back(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * src0,
- const struct ggml_tensor * src1,
  struct ggml_tensor * dst) {
+
+ const struct ggml_tensor * src0 = dst->src[0];
+
  switch (src0->type) {
  case GGML_TYPE_F32:
  {
- ggml_compute_forward_soft_max_back_f32(params, src0, src1, dst);
+ ggml_compute_forward_soft_max_back_f32(params, dst);
  } break;
  default:
  {
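The three vector ops in soft_max_back_f32 are worth unpacking: for y = softmax(x) and upstream gradient dy, the backward pass in closed form is

    dx_k = y_k * ( dy_k - \sum_j y_j * dy_j )

that is, dx = y composed elementwise with (dy minus the scalar <y, dy>), which is exactly the vec_cpy, vec_acc1 with the negated dot, then vec_mul sequence; hence the comment "linear runtime, no additional memory", since the dot product is the only reduction. Under the extended interface that single reduction is requested with zero strides and a row count of 1:

    // one dot product under the new vec_dot interface: strides unused
    float dot_y_dy = 0;
    ggml_vec_dot_f32(nc, &dot_y_dy, 0, y, 0, dy, 0, 1);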
@@ -11603,8 +11979,10 @@ static void ggml_compute_forward_soft_max_back(
 
  static void ggml_compute_forward_alibi_f32(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * src0,
  struct ggml_tensor * dst) {
+
+ const struct ggml_tensor * src0 = dst->src[0];
+
  assert(params->ith == 0);
 
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
@@ -11638,22 +12016,20 @@ static void ggml_compute_forward_alibi_f32(
  const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
  const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);
 
- for (int64_t i = 0; i < ne0; i++) {
- for (int64_t j = 0; j < ne1; j++) {
- for (int64_t k = 0; k < ne2_ne3; k++) {
- float * const src = (float *)((char *) src0->data + i*nb0 + j*nb1 + k*nb2);
- float * pdst = (float *)((char *) dst->data + i*nb0 + j*nb1 + k*nb2);
-
- // TODO: k*nb2 or k*nb3
-
- float m_k;
+ for (int64_t k = 0; k < ne2_ne3; k++) {
+ // TODO: k*nb2 or k*nb3
+ float m_k;
 
- if (k < n_heads_log2_floor) {
- m_k = powf(m0, k + 1);
- } else {
- m_k = powf(m1, 2 * (k - n_heads_log2_floor) + 1);
- }
+ if (k < n_heads_log2_floor) {
+ m_k = powf(m0, k + 1);
+ } else {
+ m_k = powf(m1, 2 * (k - n_heads_log2_floor) + 1);
+ }
 
+ for (int64_t i = 0; i < ne0; i++) {
+ for (int64_t j = 0; j < ne1; j++) {
+ float * const src = (float *)((char *) src0->data + i*nb0 + j*nb1 + k*nb2);
+ float * pdst = (float *)((char *) dst->data + i*nb0 + j*nb1 + k*nb2);
  pdst[0] = i * m_k + src[0];
  }
  }
@@ -11662,8 +12038,10 @@ static void ggml_compute_forward_alibi_f32(
 
  static void ggml_compute_forward_alibi_f16(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * src0,
  struct ggml_tensor * dst) {
+
+ const struct ggml_tensor * src0 = dst->src[0];
+
  assert(params->ith == 0);
 
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
@@ -11698,21 +12076,20 @@ static void ggml_compute_forward_alibi_f16(
  const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
  const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);
 
- for (int i = 0; i < ne0; i++) {
- for (int j = 0; j < ne1; j++) {
- for (int k = 0; k < ne2_ne3; k++) {
- ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i*nb0 + j*nb1 + k*nb2);
- float * pdst = (float *)((char *) dst->data + i*nb0 + j*nb1 + k*nb2);
-
- // TODO: k*nb2 or k*nb3
+ for (int k = 0; k < ne2_ne3; k++) {
+ // TODO: k*nb2 or k*nb3
+ float m_k;
 
- float m_k;
+ if (k < n_heads_log2_floor) {
+ m_k = powf(m0, k + 1);
+ } else {
+ m_k = powf(m1, 2 * (k - n_heads_log2_floor) + 1);
+ }
 
- if (k < n_heads_log2_floor) {
- m_k = powf(m0, k + 1);
- } else {
- m_k = powf(m1, 2 * (k - n_heads_log2_floor) + 1);
- }
+ for (int i = 0; i < ne0; i++) {
+ for (int j = 0; j < ne1; j++) {
+ ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i*nb0 + j*nb1 + k*nb2);
+ float * pdst = (float *)((char *) dst->data + i*nb0 + j*nb1 + k*nb2);
 
  // we return F32
  pdst[0] = i * m_k + GGML_FP16_TO_FP32(src[0]);
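The alibi_f32 and alibi_f16 rewrites are the same pure loop transformation: the head loop over k moves outermost so that m_k, which depends only on k, is computed once per head instead of once per element, cutting the powf calls from ne0*ne1*ne2_ne3 down to ne2_ne3 with identical results. A compact sketch of the hoisted shape, assuming a contiguous layout for brevity (the real kernels index through the nb0/nb1/nb2 byte strides):

    #include <stdint.h>

    // dst[i,j,k] = i * m_k + src[i,j,k], with m_k hoisted out of i/j
    static void alibi_apply_f32(float * dst, const float * src,
                                int64_t ne0, int64_t ne1, int64_t ne2_ne3,
                                float (*slope)(int64_t)) {
        for (int64_t k = 0; k < ne2_ne3; k++) {
            const float m_k = slope(k);   // loop-invariant per head
            for (int64_t j = 0; j < ne1; j++) {
                for (int64_t i = 0; i < ne0; i++) {
                    const int64_t idx = i + j*ne0 + k*ne0*ne1;
                    dst[idx] = i * m_k + src[idx];
                }
            }
        }
    }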
@@ -11723,16 +12100,18 @@ static void ggml_compute_forward_alibi_f16(
 
  static void ggml_compute_forward_alibi(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * src0,
  struct ggml_tensor * dst) {
+
+ const struct ggml_tensor * src0 = dst->src[0];
+
  switch (src0->type) {
  case GGML_TYPE_F16:
  {
- ggml_compute_forward_alibi_f16(params, src0, dst);
+ ggml_compute_forward_alibi_f16(params, dst);
  } break;
  case GGML_TYPE_F32:
  {
- ggml_compute_forward_alibi_f32(params, src0, dst);
+ ggml_compute_forward_alibi_f32(params, dst);
  } break;
  case GGML_TYPE_Q4_0:
  case GGML_TYPE_Q4_1:
@@ -11748,6 +12127,8 @@ static void ggml_compute_forward_alibi(
  case GGML_TYPE_IQ2_XXS:
  case GGML_TYPE_IQ2_XS:
  case GGML_TYPE_IQ3_XXS:
+ case GGML_TYPE_IQ1_S:
+ case GGML_TYPE_IQ4_NL:
  case GGML_TYPE_Q8_K:
  case GGML_TYPE_I8:
  case GGML_TYPE_I16:
@@ -11763,8 +12144,10 @@ static void ggml_compute_forward_alibi(
 
  static void ggml_compute_forward_clamp_f32(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * src0,
  struct ggml_tensor * dst) {
+
+ const struct ggml_tensor * src0 = dst->src[0];
+
  assert(params->ith == 0);
 
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
@@ -11803,12 +12186,14 @@ static void ggml_compute_forward_clamp_f32(
 
  static void ggml_compute_forward_clamp(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * src0,
  struct ggml_tensor * dst) {
+
+ const struct ggml_tensor * src0 = dst->src[0];
+
  switch (src0->type) {
  case GGML_TYPE_F32:
  {
- ggml_compute_forward_clamp_f32(params, src0, dst);
+ ggml_compute_forward_clamp_f32(params, dst);
  } break;
  case GGML_TYPE_F16:
  case GGML_TYPE_Q4_0:
@@ -11825,6 +12210,8 @@ static void ggml_compute_forward_clamp(
  case GGML_TYPE_IQ2_XXS:
  case GGML_TYPE_IQ2_XS:
  case GGML_TYPE_IQ3_XXS:
+ case GGML_TYPE_IQ1_S:
+ case GGML_TYPE_IQ4_NL:
  case GGML_TYPE_Q8_K:
  case GGML_TYPE_I8:
  case GGML_TYPE_I16:
@@ -11896,10 +12283,12 @@ GGML_CALL void ggml_rope_yarn_corr_dims(
 
  static void ggml_compute_forward_rope_f32(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * src0,
- const struct ggml_tensor * src1,
  struct ggml_tensor * dst,
  const bool forward) {
+
+ const struct ggml_tensor * src0 = dst->src[0];
+ const struct ggml_tensor * src1 = dst->src[1];
+
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
  return;
  }
@@ -12072,10 +12461,12 @@ static void ggml_compute_forward_rope_f32(
 
  static void ggml_compute_forward_rope_f16(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * src0,
- const struct ggml_tensor * src1,
  struct ggml_tensor * dst,
  const bool forward) {
+
+ const struct ggml_tensor * src0 = dst->src[0];
+ const struct ggml_tensor * src1 = dst->src[1];
+
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
  return;
  }
@@ -12237,17 +12628,18 @@ static void ggml_compute_forward_rope_f16(
 
  static void ggml_compute_forward_rope(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * src0,
- const struct ggml_tensor * src1,
  struct ggml_tensor * dst) {
+
+ const struct ggml_tensor * src0 = dst->src[0];
+
  switch (src0->type) {
  case GGML_TYPE_F16:
  {
- ggml_compute_forward_rope_f16(params, src0, src1, dst, true);
+ ggml_compute_forward_rope_f16(params, dst, true);
  } break;
  case GGML_TYPE_F32:
  {
- ggml_compute_forward_rope_f32(params, src0, src1, dst, true);
+ ggml_compute_forward_rope_f32(params, dst, true);
  } break;
  default:
  {
@@ -12260,17 +12652,18 @@ static void ggml_compute_forward_rope(
 
  static void ggml_compute_forward_rope_back(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * src0,
- const struct ggml_tensor * src1,
  struct ggml_tensor * dst) {
+
+ const struct ggml_tensor * src0 = dst->src[0];
+
  switch (src0->type) {
  case GGML_TYPE_F16:
  {
- ggml_compute_forward_rope_f16(params, src0, src1, dst, false);
+ ggml_compute_forward_rope_f16(params, dst, false);
  } break;
  case GGML_TYPE_F32:
  {
- ggml_compute_forward_rope_f32(params, src0, src1, dst, false);
+ ggml_compute_forward_rope_f32(params, dst, false);
  } break;
  default:
  {
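rope and rope_back continue to share the f16/f32 kernels; the only difference is the trailing forward flag (true above, false here). As I understand the kernels, the backward rotation is simply the forward rotation with the sine negated, since the inverse of a rotation matrix is its transpose, so one kernel serves both directions. The per-pair math, sketched (this is the standard RoPE 2-D rotation, not code lifted from the diff):

    #include <math.h>

    // rotate one (x0, x1) pair by theta; forward == 0 applies the
    // inverse rotation by flipping the sign of sin(theta)
    static void rope_pair(float * x0, float * x1, float theta, int forward) {
        const float c = cosf(theta);
        const float s = sinf(theta) * (forward ? 1.0f : -1.0f);
        const float a = *x0;
        const float b = *x1;
        *x0 = a*c - b*s;
        *x1 = a*s + b*c;
    }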
@@ -12283,9 +12676,11 @@ static void ggml_compute_forward_rope_back(
 
  static void ggml_compute_forward_conv_transpose_1d_f16_f32(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * src0,
- const struct ggml_tensor * src1,
  struct ggml_tensor * dst) {
+
+ const struct ggml_tensor * src0 = dst->src[0];
+ const struct ggml_tensor * src1 = dst->src[1];
+
  GGML_ASSERT(src0->type == GGML_TYPE_F16);
  GGML_ASSERT(src1->type == GGML_TYPE_F32);
  GGML_ASSERT( dst->type == GGML_TYPE_F32);
@@ -12369,9 +12764,9 @@ static void ggml_compute_forward_conv_transpose_1d_f16_f32(
  const int i1n = i10*ne11;
  for (int i00 = 0; i00 < ne00; i00++) {
  float v = 0;
- ggml_vec_dot_f16(ne02, &v,
- (ggml_fp16_t *) wdata_src + i1n,
- (ggml_fp16_t *) wdata_kernel + i00*ne02);
+ ggml_vec_dot_f16(ne02, &v, 0,
+ (ggml_fp16_t *) wdata_src + i1n, 0,
+ (ggml_fp16_t *) wdata_kernel + i00*ne02, 0, 1);
  dst_data[i10*s0 + i00] += v;
  }
  }
@@ -12380,9 +12775,11 @@ static void ggml_compute_forward_conv_transpose_1d_f16_f32(
 
  static void ggml_compute_forward_conv_transpose_1d_f32(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * src0,
- const struct ggml_tensor * src1,
  struct ggml_tensor * dst) {
+
+ const struct ggml_tensor * src0 = dst->src[0];
+ const struct ggml_tensor * src1 = dst->src[1];
+
  GGML_ASSERT(src0->type == GGML_TYPE_F32);
  GGML_ASSERT(src1->type == GGML_TYPE_F32);
  GGML_ASSERT( dst->type == GGML_TYPE_F32);
@@ -12466,9 +12863,9 @@ static void ggml_compute_forward_conv_transpose_1d_f32(
  const int i1n = i10*ne11;
  for (int i00 = 0; i00 < ne00; i00++) {
  float v = 0;
- ggml_vec_dot_f32(ne02, &v,
- wdata_src + i1n,
- wdata_kernel + i00*ne02);
+ ggml_vec_dot_f32(ne02, &v, 0,
+ wdata_src + i1n, 0,
+ wdata_kernel + i00*ne02, 0, 1);
  dst_data[i10*s0 + i00] += v;
  }
  }
@@ -12477,17 +12874,18 @@ static void ggml_compute_forward_conv_transpose_1d_f32(
 
  static void ggml_compute_forward_conv_transpose_1d(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * src0,
- const struct ggml_tensor * src1,
  struct ggml_tensor * dst) {
+
+ const struct ggml_tensor * src0 = dst->src[0];
+
  switch (src0->type) {
  case GGML_TYPE_F16:
  {
- ggml_compute_forward_conv_transpose_1d_f16_f32(params, src0, src1, dst);
+ ggml_compute_forward_conv_transpose_1d_f16_f32(params, dst);
  } break;
  case GGML_TYPE_F32:
  {
- ggml_compute_forward_conv_transpose_1d_f32(params, src0, src1, dst);
+ ggml_compute_forward_conv_transpose_1d_f32(params, dst);
  } break;
  default:
  {
@@ -12501,9 +12899,11 @@ static void ggml_compute_forward_conv_transpose_1d(
  // dst: result [N, OH, OW, IC*KH*KW]
  static void ggml_compute_forward_im2col_f32(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * src0,
- const struct ggml_tensor * src1,
  struct ggml_tensor * dst) {
+
+ const struct ggml_tensor * src0 = dst->src[0];
+ const struct ggml_tensor * src1 = dst->src[1];
+
  GGML_ASSERT(src0->type == GGML_TYPE_F16);
  GGML_ASSERT(src1->type == GGML_TYPE_F32);
  GGML_ASSERT( dst->type == GGML_TYPE_F32);
@@ -12587,9 +12987,11 @@ static void ggml_compute_forward_im2col_f32(
  // dst: result [N, OH, OW, IC*KH*KW]
  static void ggml_compute_forward_im2col_f16(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * src0,
- const struct ggml_tensor * src1,
  struct ggml_tensor * dst) {
+
+ const struct ggml_tensor * src0 = dst->src[0];
+ const struct ggml_tensor * src1 = dst->src[1];
+
  GGML_ASSERT(src0->type == GGML_TYPE_F16);
  GGML_ASSERT(src1->type == GGML_TYPE_F32);
  GGML_ASSERT( dst->type == GGML_TYPE_F16);
@@ -12669,17 +13071,15 @@ static void ggml_compute_forward_im2col_f16(
 
  static void ggml_compute_forward_im2col(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * src0,
- const struct ggml_tensor * src1,
  struct ggml_tensor * dst) {
  switch (dst->type) {
  case GGML_TYPE_F16:
  {
- ggml_compute_forward_im2col_f16(params, src0, src1, dst);
+ ggml_compute_forward_im2col_f16(params, dst);
  } break;
  case GGML_TYPE_F32:
  {
- ggml_compute_forward_im2col_f32(params, src0, src1, dst);
+ ggml_compute_forward_im2col_f32(params, dst);
  } break;
  default:
  {
@@ -12693,9 +13093,11 @@ static void ggml_compute_forward_im2col(
 
  static void ggml_compute_forward_conv_transpose_2d(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * src0,
- const struct ggml_tensor * src1,
  struct ggml_tensor * dst) {
+
+ const struct ggml_tensor * src0 = dst->src[0];
+ const struct ggml_tensor * src1 = dst->src[1];
+
  GGML_ASSERT(src0->type == GGML_TYPE_F16);
  GGML_ASSERT(src1->type == GGML_TYPE_F32);
  GGML_ASSERT( dst->type == GGML_TYPE_F32);
@@ -12783,9 +13185,9 @@ static void ggml_compute_forward_conv_transpose_2d(
  for (int i01 = 0; i01 < ne01; i01++) {
  for (int i00 = 0; i00 < ne00; i00++) {
  float v = 0;
- ggml_vec_dot_f16(ne03, &v,
- wdata_src + i1n,
- wdata_kernel + i01*ne00*ne03 + i00*ne03);
+ ggml_vec_dot_f16(ne03, &v, 0,
+ wdata_src + i1n, 0,
+ wdata_kernel + i01*ne00*ne03 + i00*ne03, 0, 1);
  dst_data[(i11*stride + i01)*ne0 + i10*stride + i00] += v;
  }
  }
@@ -12799,9 +13201,11 @@ static void ggml_compute_forward_conv_transpose_2d(
  static void ggml_compute_forward_pool_1d_sk_p0(
  const struct ggml_compute_params * params,
  const enum ggml_op_pool op,
- const struct ggml_tensor * src,
  const int k,
  struct ggml_tensor * dst) {
+
+ const struct ggml_tensor * src = dst->src[0];
+
  assert(src->type == GGML_TYPE_F32);
  assert(params->ith == 0);
 
@@ -12850,7 +13254,6 @@ static void ggml_compute_forward_pool_1d_sk_p0(
 
  static void ggml_compute_forward_pool_1d(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * src0,
  struct ggml_tensor * dst) {
 
  const int32_t * opts = (const int32_t *)dst->op_params;
@@ -12861,15 +13264,17 @@ static void ggml_compute_forward_pool_1d(
  GGML_ASSERT(p0 == 0); // padding not supported
  GGML_ASSERT(k0 == s0); // only s = k supported
 
- ggml_compute_forward_pool_1d_sk_p0(params, op, src0, k0, dst);
+ ggml_compute_forward_pool_1d_sk_p0(params, op, k0, dst);
  }
 
  // ggml_compute_forward_pool_2d
 
  static void ggml_compute_forward_pool_2d(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * src,
  struct ggml_tensor * dst) {
+
+ const struct ggml_tensor * src = dst->src[0];
+
  GGML_ASSERT(src->type == GGML_TYPE_F32);
  GGML_ASSERT(params->ith == 0);
 
@@ -12942,9 +13347,10 @@ static void ggml_compute_forward_pool_2d(
12942
13347
 
12943
13348
  static void ggml_compute_forward_upscale_f32(
12944
13349
  const struct ggml_compute_params * params,
12945
- const struct ggml_tensor * src0,
12946
13350
  struct ggml_tensor * dst) {
12947
13351
 
13352
+ const struct ggml_tensor * src0 = dst->src[0];
13353
+
12948
13354
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
12949
13355
  return;
12950
13356
  }
@@ -12981,12 +13387,14 @@ static void ggml_compute_forward_upscale_f32(

  static void ggml_compute_forward_upscale(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * src0,
  struct ggml_tensor * dst) {
+
+ const struct ggml_tensor * src0 = dst->src[0];
+
  switch (src0->type) {
  case GGML_TYPE_F32:
  {
- ggml_compute_forward_upscale_f32(params, src0, dst);
+ ggml_compute_forward_upscale_f32(params, dst);
  } break;
  default:
  {
@@ -12999,9 +13407,10 @@ static void ggml_compute_forward_upscale(

  static void ggml_compute_forward_pad_f32(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * src0,
  struct ggml_tensor * dst) {

+ const struct ggml_tensor * src0 = dst->src[0];
+
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
  return;
  }
@@ -13039,12 +13448,14 @@ static void ggml_compute_forward_pad_f32(

  static void ggml_compute_forward_pad(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * src0,
  struct ggml_tensor * dst) {
+
+ const struct ggml_tensor * src0 = dst->src[0];
+
  switch (src0->type) {
  case GGML_TYPE_F32:
  {
- ggml_compute_forward_pad_f32(params, src0, dst);
+ ggml_compute_forward_pad_f32(params, dst);
  } break;
  default:
  {
@@ -13057,9 +13468,10 @@ static void ggml_compute_forward_pad(

  static void ggml_compute_forward_argsort_f32(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * src0,
  struct ggml_tensor * dst) {

+ const struct ggml_tensor * src0 = dst->src[0];
+
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
  return;
  }
@@ -13099,13 +13511,14 @@ static void ggml_compute_forward_argsort_f32(

  static void ggml_compute_forward_argsort(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * src0,
  struct ggml_tensor * dst) {

+ const struct ggml_tensor * src0 = dst->src[0];
+
  switch (src0->type) {
  case GGML_TYPE_F32:
  {
- ggml_compute_forward_argsort_f32(params, src0, dst);
+ ggml_compute_forward_argsort_f32(params, dst);
  } break;
  default:
  {
@@ -13118,11 +13531,13 @@ static void ggml_compute_forward_argsort(

  static void ggml_compute_forward_flash_attn_f32(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * q,
- const struct ggml_tensor * k,
- const struct ggml_tensor * v,
  const bool masked,
  struct ggml_tensor * dst) {
+
+ const struct ggml_tensor * q = dst->src[0];
+ const struct ggml_tensor * k = dst->src[1];
+ const struct ggml_tensor * v = dst->src[2];
+
  int64_t t0 = ggml_perf_time_us();
  UNUSED(t0);

@@ -13214,9 +13629,9 @@ static void ggml_compute_forward_flash_attn_f32(
  const int i1 = ik1;

  ggml_vec_dot_f32(neq0,
- S + i1,
- (float *) ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)),
- (float *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3)));
+ S + i1, 0,
+ (float *) ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)), 0,
+ (float *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3)), 0, 1);
  }

  // scale
@@ -13299,20 +13714,22 @@ static void ggml_compute_forward_flash_attn_f32(
  const int iv3 = iq3;

  ggml_vec_dot_f32(masked_begin,
- (float *) ((char *) dst->data + (ic*nb0 + i1*nb1 + i2*nb2 + i3*nb3)),
- (float *) ((char *) v->data + ( ic*nbv1 + iv2*nbv2 + iv3*nbv3)),
- S);
+ (float *) ((char *) dst->data + (ic*nb0 + i1*nb1 + i2*nb2 + i3*nb3)), 0,
+ (float *) ((char *) v->data + ( ic*nbv1 + iv2*nbv2 + iv3*nbv3)), 0,
+ S, 0, 1);
  }
  }
  }

  static void ggml_compute_forward_flash_attn_f16(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * q,
- const struct ggml_tensor * k,
- const struct ggml_tensor * v,
  const bool masked,
  struct ggml_tensor * dst) {
+
+ const struct ggml_tensor * q = dst->src[0];
+ const struct ggml_tensor * k = dst->src[1];
+ const struct ggml_tensor * v = dst->src[2];
+
  int64_t t0 = ggml_perf_time_us();
  UNUSED(t0);

@@ -13404,9 +13821,9 @@ static void ggml_compute_forward_flash_attn_f16(
  const int i1 = ik1;

  ggml_vec_dot_f16(neq0,
- S + i1,
- (ggml_fp16_t *) ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)),
- (ggml_fp16_t *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3)));
+ S + i1, 0,
+ (ggml_fp16_t *) ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)), 0,
+ (ggml_fp16_t *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3)), 0, 1);
  }
  } else {
  for (int64_t ic = 0; ic < nek1; ic += GGML_VEC_DOT_UNROLL) {
@@ -13508,9 +13925,9 @@ static void ggml_compute_forward_flash_attn_f16(
  const int iv3 = iq3;

  ggml_vec_dot_f16(nev0,
- (float *) ((char *) dst->data + (ic*nb0 + i1*nb1 + i2*nb2 + i3*nb3)),
- (ggml_fp16_t *) ((char *) v->data + ( ic*nbv1 + iv2*nbv2 + iv3*nbv3)),
- S16);
+ (float *) ((char *) dst->data + (ic*nb0 + i1*nb1 + i2*nb2 + i3*nb3)), 0,
+ (ggml_fp16_t *) ((char *) v->data + ( ic*nbv1 + iv2*nbv2 + iv3*nbv3)), 0,
+ S16, 0, 1);
  }
  } else {
  for (int64_t ic = 0; ic < nev1; ic += GGML_VEC_DOT_UNROLL) {
@@ -13534,19 +13951,19 @@ static void ggml_compute_forward_flash_attn_f16(

  static void ggml_compute_forward_flash_attn(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * q,
- const struct ggml_tensor * k,
- const struct ggml_tensor * v,
  const bool masked,
  struct ggml_tensor * dst) {
+
+ const struct ggml_tensor * q = dst->src[0];
+
  switch (q->type) {
  case GGML_TYPE_F16:
  {
- ggml_compute_forward_flash_attn_f16(params, q, k, v, masked, dst);
+ ggml_compute_forward_flash_attn_f16(params, masked, dst);
  } break;
  case GGML_TYPE_F32:
  {
- ggml_compute_forward_flash_attn_f32(params, q, k, v, masked, dst);
+ ggml_compute_forward_flash_attn_f32(params, masked, dst);
  } break;
  default:
  {
@@ -13559,12 +13976,14 @@ static void ggml_compute_forward_flash_attn(

  static void ggml_compute_forward_flash_ff_f16(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * a, // F16
- const struct ggml_tensor * b0, // F16 fc_w
- const struct ggml_tensor * b1, // F32 fc_b
- const struct ggml_tensor * c0, // F16 proj_w
- const struct ggml_tensor * c1, // F32 proj_b
  struct ggml_tensor * dst) {
+
+ const struct ggml_tensor * a = dst->src[0]; // F16
+ const struct ggml_tensor * b0 = dst->src[1]; // F16 fc_w
+ const struct ggml_tensor * b1 = dst->src[2]; // F32 fc_b
+ const struct ggml_tensor * c0 = dst->src[3]; // F16 proj_w
+ const struct ggml_tensor * c1 = dst->src[4]; // F32 proj_b
+
  int64_t t0 = ggml_perf_time_us();
  UNUSED(t0);

@@ -13652,9 +14071,9 @@ static void ggml_compute_forward_flash_ff_f16(
  const int i1 = ib01;

  ggml_vec_dot_f16(nea0,
- S + i1,
- (ggml_fp16_t *) ((char *) b0->data + (ib01*nbb01 + ib02*nbb02 + ib03*nbb03)),
- (ggml_fp16_t *) ((char *) a->data + ( ia1*nba1 + ia2*nba2 + ia3*nba3)));
+ S + i1, 0,
+ (ggml_fp16_t *) ((char *) b0->data + (ib01*nbb01 + ib02*nbb02 + ib03*nbb03)), 0,
+ (ggml_fp16_t *) ((char *) a->data + ( ia1*nba1 + ia2*nba2 + ia3*nba3)), 0, 1);
  }

  ggml_vec_add_f32(neb01, S, S, (float *) b1->data);
@@ -13677,9 +14096,9 @@ static void ggml_compute_forward_flash_ff_f16(
  for (int64_t ic = 0; ic < nec01; ++ic) {

  ggml_vec_dot_f16(neb01,
- (float *) ((char *) dst->data + (ic*nb0 + i1*nb1 + i2*nb2 + i3*nb3)),
- (ggml_fp16_t *) ((char *) c0->data + ( ic*nbc01 + i2*nbc02 + i3*nbc03)),
- S16);
+ (float *) ((char *) dst->data + (ic*nb0 + i1*nb1 + i2*nb2 + i3*nb3)), 0,
+ (ggml_fp16_t *) ((char *) c0->data + ( ic*nbc01 + i2*nbc02 + i3*nbc03)), 0,
+ S16, 0, 1);
  }

  ggml_vec_add_f32(nec01,
@@ -13692,16 +14111,14 @@ static void ggml_compute_forward_flash_ff_f16(

  static void ggml_compute_forward_flash_ff(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * a,
- const struct ggml_tensor * b0,
- const struct ggml_tensor * b1,
- const struct ggml_tensor * c0,
- const struct ggml_tensor * c1,
  struct ggml_tensor * dst) {
+
+ const struct ggml_tensor * b0 = dst->src[1];
+
  switch (b0->type) {
  case GGML_TYPE_F16:
  {
- ggml_compute_forward_flash_ff_f16(params, a, b0, b1, c0, c1, dst);
+ ggml_compute_forward_flash_ff_f16(params, dst);
  } break;
  case GGML_TYPE_F32:
  {
@@ -13718,12 +14135,14 @@ static void ggml_compute_forward_flash_ff(

  static void ggml_compute_forward_flash_attn_back_f32(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * q,
- const struct ggml_tensor * k,
- const struct ggml_tensor * v,
- const struct ggml_tensor * d,
  const bool masked,
  struct ggml_tensor * dst) {
+
+ const struct ggml_tensor * q = dst->src[0];
+ const struct ggml_tensor * k = dst->src[1];
+ const struct ggml_tensor * v = dst->src[2];
+ const struct ggml_tensor * d = dst->src[3];
+
  int64_t t0 = ggml_perf_time_us();
  UNUSED(t0);

@@ -13866,9 +14285,9 @@ static void ggml_compute_forward_flash_attn_back_f32(
  const int i1 = ik1;

  ggml_vec_dot_f32(neq0,
- S + i1,
- (float *) ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)),
- (float *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3)));
+ S + i1, 0,
+ (float *) ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)), 0,
+ (float *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3)), 0, 1);
  }

  // scale
@@ -14013,7 +14432,7 @@ static void ggml_compute_forward_flash_attn_back_f32(

  // S = SM * (S - dot(SM, S))
  float dot_SM_gradSM = 0;
- ggml_vec_dot_f32 (masked_begin, &dot_SM_gradSM, SM, S);
+ ggml_vec_dot_f32 (masked_begin, &dot_SM_gradSM, 0, SM, 0, S, 0, 1);
  ggml_vec_acc1_f32(M, S, -dot_SM_gradSM);
  ggml_vec_mul_f32 (masked_begin, S, S, SM);

@@ -14071,16 +14490,15 @@ static void ggml_compute_forward_flash_attn_back_f32(

  static void ggml_compute_forward_flash_attn_back(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * q,
- const struct ggml_tensor * k,
- const struct ggml_tensor * v,
- const struct ggml_tensor * d,
  const bool masked,
  struct ggml_tensor * dst) {
+
+ const struct ggml_tensor * q = dst->src[0];
+
  switch (q->type) {
  case GGML_TYPE_F32:
  {
- ggml_compute_forward_flash_attn_back_f32(params, q, k, v, d, masked, dst);
+ ggml_compute_forward_flash_attn_back_f32(params, masked, dst);
  } break;
  default:
  {
@@ -14093,8 +14511,10 @@ static void ggml_compute_forward_flash_attn_back(

  static void ggml_compute_forward_win_part_f32(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * src0,
  struct ggml_tensor * dst) {
+
+ const struct ggml_tensor * src0 = dst->src[0];
+
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
  return;
  }
@@ -14137,12 +14557,14 @@ static void ggml_compute_forward_win_part_f32(

  static void ggml_compute_forward_win_part(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * src0,
  struct ggml_tensor * dst) {
+
+ const struct ggml_tensor * src0 = dst->src[0];
+
  switch (src0->type) {
  case GGML_TYPE_F32:
  {
- ggml_compute_forward_win_part_f32(params, src0, dst);
+ ggml_compute_forward_win_part_f32(params, dst);
  } break;
  default:
  {
@@ -14155,8 +14577,10 @@ static void ggml_compute_forward_win_part(

  static void ggml_compute_forward_win_unpart_f32(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * src0,
  struct ggml_tensor * dst) {
+
+ const struct ggml_tensor * src0 = dst->src[0];
+
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
  return;
  }
@@ -14197,12 +14621,14 @@ static void ggml_compute_forward_win_unpart_f32(

  static void ggml_compute_forward_win_unpart(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * src0,
  struct ggml_tensor * dst) {
+
+ const struct ggml_tensor * src0 = dst->src[0];
+
  switch (src0->type) {
  case GGML_TYPE_F32:
  {
- ggml_compute_forward_win_unpart_f32(params, src0, dst);
+ ggml_compute_forward_win_unpart_f32(params, dst);
  } break;
  default:
  {
@@ -14215,58 +14641,58 @@ static void ggml_compute_forward_win_unpart(

  static void ggml_compute_forward_unary(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * src0,
  struct ggml_tensor * dst) {
+
  const enum ggml_unary_op op = ggml_get_unary_op(dst);

  switch (op) {
  case GGML_UNARY_OP_ABS:
  {
- ggml_compute_forward_abs(params, src0, dst);
+ ggml_compute_forward_abs(params, dst);
  } break;
  case GGML_UNARY_OP_SGN:
  {
- ggml_compute_forward_sgn(params, src0, dst);
+ ggml_compute_forward_sgn(params, dst);
  } break;
  case GGML_UNARY_OP_NEG:
  {
- ggml_compute_forward_neg(params, src0, dst);
+ ggml_compute_forward_neg(params, dst);
  } break;
  case GGML_UNARY_OP_STEP:
  {
- ggml_compute_forward_step(params, src0, dst);
+ ggml_compute_forward_step(params, dst);
  } break;
  case GGML_UNARY_OP_TANH:
  {
- ggml_compute_forward_tanh(params, src0, dst);
+ ggml_compute_forward_tanh(params, dst);
  } break;
  case GGML_UNARY_OP_ELU:
  {
- ggml_compute_forward_elu(params, src0, dst);
+ ggml_compute_forward_elu(params, dst);
  } break;
  case GGML_UNARY_OP_RELU:
  {
- ggml_compute_forward_relu(params, src0, dst);
+ ggml_compute_forward_relu(params, dst);
  } break;
  case GGML_UNARY_OP_GELU:
  {
- ggml_compute_forward_gelu(params, src0, dst);
+ ggml_compute_forward_gelu(params, dst);
  } break;
  case GGML_UNARY_OP_GELU_QUICK:
  {
- ggml_compute_forward_gelu_quick(params, src0, dst);
+ ggml_compute_forward_gelu_quick(params, dst);
  } break;
  case GGML_UNARY_OP_SILU:
  {
- ggml_compute_forward_silu(params, src0, dst);
+ ggml_compute_forward_silu(params, dst);
  } break;
  case GGML_UNARY_OP_HARDSWISH:
  {
- ggml_compute_forward_hardswish(params, src0, dst);
+ ggml_compute_forward_hardswish(params, dst);
  } break;
  case GGML_UNARY_OP_HARDSIGMOID:
  {
- ggml_compute_forward_hardsigmoid(params, src0, dst);
+ ggml_compute_forward_hardsigmoid(params, dst);
  } break;
  default:
  {
@@ -14279,8 +14705,10 @@ static void ggml_compute_forward_unary(

  static void ggml_compute_forward_get_rel_pos_f16(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * src0,
  struct ggml_tensor * dst) {
+
+ const struct ggml_tensor * src0 = dst->src[0];
+
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
  return;
  }
@@ -14306,12 +14734,14 @@ static void ggml_compute_forward_get_rel_pos_f16(

  static void ggml_compute_forward_get_rel_pos(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * src0,
  struct ggml_tensor * dst) {
+
+ const struct ggml_tensor * src0 = dst->src[0];
+
  switch (src0->type) {
  case GGML_TYPE_F16:
  {
- ggml_compute_forward_get_rel_pos_f16(params, src0, dst);
+ ggml_compute_forward_get_rel_pos_f16(params, dst);
  } break;
  default:
  {
@@ -14324,11 +14754,12 @@ static void ggml_compute_forward_get_rel_pos(

  static void ggml_compute_forward_add_rel_pos_f32(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * src0,
- const struct ggml_tensor * src1,
- const struct ggml_tensor * src2,
  struct ggml_tensor * dst) {

+ const struct ggml_tensor * src0 = dst->src[0];
+ const struct ggml_tensor * src1 = dst->src[1];
+ const struct ggml_tensor * src2 = dst->src[2];
+
  const bool inplace = (bool) ((int32_t *) dst->op_params)[0];
  if (!inplace && params->type == GGML_TASK_INIT) {
  if (params->ith != 0) {
@@ -14392,14 +14823,14 @@ static void ggml_compute_forward_add_rel_pos_f32(

  static void ggml_compute_forward_add_rel_pos(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * src0,
- const struct ggml_tensor * src1,
- const struct ggml_tensor * src2,
  struct ggml_tensor * dst) {
+
+ const struct ggml_tensor * src0 = dst->src[0];
+
  switch (src0->type) {
  case GGML_TYPE_F32:
  {
- ggml_compute_forward_add_rel_pos_f32(params, src0, src1, src2, dst);
+ ggml_compute_forward_add_rel_pos_f32(params, dst);
  } break;
  default:
  {
@@ -14412,9 +14843,11 @@ static void ggml_compute_forward_add_rel_pos(

  static void ggml_compute_forward_map_unary_f32(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * src0,
  struct ggml_tensor * dst,
  const ggml_unary_op_f32_t fun) {
+
+ const struct ggml_tensor * src0 = dst->src[0];
+
  GGML_ASSERT(ggml_are_same_shape(src0, dst));

  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
@@ -14436,13 +14869,15 @@ static void ggml_compute_forward_map_unary_f32(

  static void ggml_compute_forward_map_unary(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * src0,
  struct ggml_tensor * dst,
  const ggml_unary_op_f32_t fun) {
+
+ const struct ggml_tensor * src0 = dst->src[0];
+
  switch (src0->type) {
  case GGML_TYPE_F32:
  {
- ggml_compute_forward_map_unary_f32(params, src0, dst, fun);
+ ggml_compute_forward_map_unary_f32(params, dst, fun);
  } break;
  default:
  {
@@ -14455,10 +14890,12 @@ static void ggml_compute_forward_map_unary(

  static void ggml_compute_forward_map_binary_f32(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * src0,
- const struct ggml_tensor * src1,
  struct ggml_tensor * dst,
  const ggml_binary_op_f32_t fun) {
+
+ const struct ggml_tensor * src0 = dst->src[0];
+ const struct ggml_tensor * src1 = dst->src[1];
+
  assert(params->ith == 0);
  assert(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));

@@ -14483,14 +14920,15 @@ static void ggml_compute_forward_map_binary_f32(

  static void ggml_compute_forward_map_binary(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * src0,
- const struct ggml_tensor * src1,
  struct ggml_tensor * dst,
  const ggml_binary_op_f32_t fun) {
+
+ const struct ggml_tensor * src0 = dst->src[0];
+
  switch (src0->type) {
  case GGML_TYPE_F32:
  {
- ggml_compute_forward_map_binary_f32(params, src0, src1, dst, fun);
+ ggml_compute_forward_map_binary_f32(params, dst, fun);
  } break;
  default:
  {
@@ -14503,9 +14941,11 @@ static void ggml_compute_forward_map_binary(

  static void ggml_compute_forward_map_custom1_f32(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * a,
  struct ggml_tensor * dst,
  const ggml_custom1_op_f32_t fun) {
+
+ const struct ggml_tensor * a = dst->src[0];
+
  assert(params->ith == 0);

  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
@@ -14519,10 +14959,12 @@ static void ggml_compute_forward_map_custom1_f32(

  static void ggml_compute_forward_map_custom2_f32(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * a,
- const struct ggml_tensor * b,
  struct ggml_tensor * dst,
  const ggml_custom2_op_f32_t fun) {
+
+ const struct ggml_tensor * a = dst->src[0];
+ const struct ggml_tensor * b = dst->src[1];
+
  assert(params->ith == 0);

  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
@@ -14536,11 +14978,13 @@ static void ggml_compute_forward_map_custom2_f32(

  static void ggml_compute_forward_map_custom3_f32(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * a,
- const struct ggml_tensor * b,
- const struct ggml_tensor * c,
  struct ggml_tensor * dst,
  const ggml_custom3_op_f32_t fun) {
+
+ const struct ggml_tensor * a = dst->src[0];
+ const struct ggml_tensor * b = dst->src[1];
+ const struct ggml_tensor * c = dst->src[2]; // third operand lives in src[2]; src[1] would silently alias b
+
  assert(params->ith == 0);

  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
@@ -14554,8 +14998,10 @@ static void ggml_compute_forward_map_custom3_f32(

  static void ggml_compute_forward_map_custom1(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * a,
  struct ggml_tensor * dst) {
+
+ const struct ggml_tensor * a = dst->src[0];
+
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
  return;
  }
@@ -14569,9 +15015,11 @@ static void ggml_compute_forward_map_custom1(

  static void ggml_compute_forward_map_custom2(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * a,
- const struct ggml_tensor * b,
  struct ggml_tensor * dst) {
+
+ const struct ggml_tensor * a = dst->src[0];
+ const struct ggml_tensor * b = dst->src[1];
+
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
  return;
  }
@@ -14585,10 +15033,12 @@ static void ggml_compute_forward_map_custom2(

  static void ggml_compute_forward_map_custom3(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * a,
- const struct ggml_tensor * b,
- const struct ggml_tensor * c,
  struct ggml_tensor * dst) {
+
+ const struct ggml_tensor * a = dst->src[0];
+ const struct ggml_tensor * b = dst->src[1];
+ const struct ggml_tensor * c = dst->src[2];
+
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
  return;
  }
@@ -14602,9 +15052,11 @@ static void ggml_compute_forward_map_custom3(

  static void ggml_compute_forward_cross_entropy_loss_f32(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * src0,
- const struct ggml_tensor * src1,
  struct ggml_tensor * dst) {
+
+ const struct ggml_tensor * src0 = dst->src[0];
+ const struct ggml_tensor * src1 = dst->src[1];
+
  GGML_ASSERT(ggml_is_contiguous(src0));
  GGML_ASSERT(ggml_is_contiguous(src1));
  GGML_ASSERT(ggml_is_scalar(dst));
@@ -14708,13 +15160,14 @@ static void ggml_compute_forward_cross_entropy_loss_f32(

  static void ggml_compute_forward_cross_entropy_loss(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * src0,
- const struct ggml_tensor * src1,
  struct ggml_tensor * dst) {
+
+ const struct ggml_tensor * src0 = dst->src[0];
+
  switch (src0->type) {
  case GGML_TYPE_F32:
  {
- ggml_compute_forward_cross_entropy_loss_f32(params, src0, src1, dst);
+ ggml_compute_forward_cross_entropy_loss_f32(params, dst);
  } break;
  default:
  {
@@ -14727,10 +15180,12 @@ static void ggml_compute_forward_cross_entropy_loss(

  static void ggml_compute_forward_cross_entropy_loss_back_f32(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * src0,
- const struct ggml_tensor * src1,
- const struct ggml_tensor * opt0,
  struct ggml_tensor * dst) {
+
+ const struct ggml_tensor * src0 = dst->src[0];
+ const struct ggml_tensor * src1 = dst->src[1];
+ const struct ggml_tensor * opt0 = dst->src[2];
+
  GGML_ASSERT(ggml_is_contiguous(dst));
  GGML_ASSERT(ggml_is_contiguous(src0));
  GGML_ASSERT(ggml_is_contiguous(src1));
@@ -14817,14 +15272,14 @@ static void ggml_compute_forward_cross_entropy_loss_back_f32(

  static void ggml_compute_forward_cross_entropy_loss_back(
  const struct ggml_compute_params * params,
- const struct ggml_tensor * src0,
- const struct ggml_tensor * src1,
- const struct ggml_tensor * opt0,
  struct ggml_tensor * dst) {
+
+ const struct ggml_tensor * src0 = dst->src[0];
+
  switch (src0->type) {
  case GGML_TYPE_F32:
  {
- ggml_compute_forward_cross_entropy_loss_back_f32(params, src0, src1, opt0, dst);
+ ggml_compute_forward_cross_entropy_loss_back_f32(params, dst);
  } break;
  default:
  {
@@ -14872,312 +15327,312 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
  switch (tensor->op) {
  case GGML_OP_DUP:
  {
- ggml_compute_forward_dup(params, tensor->src[0], tensor);
+ ggml_compute_forward_dup(params, tensor);
  } break;
  case GGML_OP_ADD:
  {
- ggml_compute_forward_add(params, tensor->src[0], tensor->src[1], tensor);
+ ggml_compute_forward_add(params, tensor);
  } break;
  case GGML_OP_ADD1:
  {
- ggml_compute_forward_add1(params, tensor->src[0], tensor->src[1], tensor);
+ ggml_compute_forward_add1(params, tensor);
  } break;
  case GGML_OP_ACC:
  {
- ggml_compute_forward_acc(params, tensor->src[0], tensor->src[1], tensor);
+ ggml_compute_forward_acc(params, tensor);
  } break;
  case GGML_OP_SUB:
  {
- ggml_compute_forward_sub(params, tensor->src[0], tensor->src[1], tensor);
+ ggml_compute_forward_sub(params, tensor);
  } break;
  case GGML_OP_MUL:
  {
- ggml_compute_forward_mul(params, tensor->src[0], tensor->src[1], tensor);
+ ggml_compute_forward_mul(params, tensor);
  } break;
  case GGML_OP_DIV:
  {
- ggml_compute_forward_div(params, tensor->src[0], tensor->src[1], tensor);
+ ggml_compute_forward_div(params, tensor);
  } break;
  case GGML_OP_SQR:
  {
- ggml_compute_forward_sqr(params, tensor->src[0], tensor);
+ ggml_compute_forward_sqr(params, tensor);
  } break;
  case GGML_OP_SQRT:
  {
- ggml_compute_forward_sqrt(params, tensor->src[0], tensor);
+ ggml_compute_forward_sqrt(params, tensor);
  } break;
  case GGML_OP_LOG:
  {
- ggml_compute_forward_log(params, tensor->src[0], tensor);
+ ggml_compute_forward_log(params, tensor);
  } break;
  case GGML_OP_SUM:
  {
- ggml_compute_forward_sum(params, tensor->src[0], tensor);
+ ggml_compute_forward_sum(params, tensor);
  } break;
  case GGML_OP_SUM_ROWS:
  {
- ggml_compute_forward_sum_rows(params, tensor->src[0], tensor);
+ ggml_compute_forward_sum_rows(params, tensor);
  } break;
  case GGML_OP_MEAN:
  {
- ggml_compute_forward_mean(params, tensor->src[0], tensor);
+ ggml_compute_forward_mean(params, tensor);
  } break;
  case GGML_OP_ARGMAX:
  {
- ggml_compute_forward_argmax(params, tensor->src[0], tensor);
+ ggml_compute_forward_argmax(params, tensor);
  } break;
  case GGML_OP_REPEAT:
  {
- ggml_compute_forward_repeat(params, tensor->src[0], tensor);
+ ggml_compute_forward_repeat(params, tensor);
  } break;
  case GGML_OP_REPEAT_BACK:
  {
- ggml_compute_forward_repeat_back(params, tensor->src[0], tensor);
+ ggml_compute_forward_repeat_back(params, tensor);
  } break;
  case GGML_OP_CONCAT:
  {
- ggml_compute_forward_concat(params, tensor->src[0], tensor->src[1], tensor);
+ ggml_compute_forward_concat(params, tensor);
  } break;
  case GGML_OP_SILU_BACK:
  {
- ggml_compute_forward_silu_back(params, tensor->src[0], tensor->src[1], tensor);
+ ggml_compute_forward_silu_back(params, tensor);
  } break;
  case GGML_OP_NORM:
  {
- ggml_compute_forward_norm(params, tensor->src[0], tensor);
+ ggml_compute_forward_norm(params, tensor);
  } break;
  case GGML_OP_RMS_NORM:
  {
- ggml_compute_forward_rms_norm(params, tensor->src[0], tensor);
+ ggml_compute_forward_rms_norm(params, tensor);
  } break;
  case GGML_OP_RMS_NORM_BACK:
  {
- ggml_compute_forward_rms_norm_back(params, tensor->src[0], tensor->src[1], tensor);
+ ggml_compute_forward_rms_norm_back(params, tensor);
  } break;
  case GGML_OP_GROUP_NORM:
  {
- ggml_compute_forward_group_norm(params, tensor->src[0], tensor);
+ ggml_compute_forward_group_norm(params, tensor);
  } break;
  case GGML_OP_MUL_MAT:
  {
- ggml_compute_forward_mul_mat(params, tensor->src[0], tensor->src[1], tensor);
+ ggml_compute_forward_mul_mat(params, tensor);
  } break;
  case GGML_OP_MUL_MAT_ID:
  {
- ggml_compute_forward_mul_mat_id(params, tensor->src[0], tensor->src[1], tensor);
+ ggml_compute_forward_mul_mat_id(params, tensor);
  } break;
  case GGML_OP_OUT_PROD:
  {
- ggml_compute_forward_out_prod(params, tensor->src[0], tensor->src[1], tensor);
+ ggml_compute_forward_out_prod(params, tensor);
  } break;
  case GGML_OP_SCALE:
  {
- ggml_compute_forward_scale(params, tensor->src[0], tensor);
+ ggml_compute_forward_scale(params, tensor);
  } break;
  case GGML_OP_SET:
  {
- ggml_compute_forward_set(params, tensor->src[0], tensor->src[1], tensor);
+ ggml_compute_forward_set(params, tensor);
  } break;
  case GGML_OP_CPY:
  {
- ggml_compute_forward_cpy(params, tensor->src[0], tensor);
+ ggml_compute_forward_cpy(params, tensor);
  } break;
  case GGML_OP_CONT:
  {
- ggml_compute_forward_cont(params, tensor->src[0], tensor);
+ ggml_compute_forward_cont(params, tensor);
  } break;
  case GGML_OP_RESHAPE:
  {
- ggml_compute_forward_reshape(params, tensor->src[0], tensor);
+ ggml_compute_forward_reshape(params, tensor);
  } break;
  case GGML_OP_VIEW:
  {
- ggml_compute_forward_view(params, tensor->src[0]);
+ ggml_compute_forward_view(params, tensor);
  } break;
  case GGML_OP_PERMUTE:
  {
- ggml_compute_forward_permute(params, tensor->src[0]);
+ ggml_compute_forward_permute(params, tensor);
  } break;
  case GGML_OP_TRANSPOSE:
  {
- ggml_compute_forward_transpose(params, tensor->src[0]);
+ ggml_compute_forward_transpose(params, tensor);
  } break;
  case GGML_OP_GET_ROWS:
  {
- ggml_compute_forward_get_rows(params, tensor->src[0], tensor->src[1], tensor);
+ ggml_compute_forward_get_rows(params, tensor);
  } break;
  case GGML_OP_GET_ROWS_BACK:
  {
- ggml_compute_forward_get_rows_back(params, tensor->src[0], tensor->src[1], tensor);
+ ggml_compute_forward_get_rows_back(params, tensor);
  } break;
  case GGML_OP_DIAG:
  {
- ggml_compute_forward_diag(params, tensor->src[0], tensor);
+ ggml_compute_forward_diag(params, tensor);
  } break;
  case GGML_OP_DIAG_MASK_INF:
  {
- ggml_compute_forward_diag_mask_inf(params, tensor->src[0], tensor);
+ ggml_compute_forward_diag_mask_inf(params, tensor);
  } break;
  case GGML_OP_DIAG_MASK_ZERO:
  {
- ggml_compute_forward_diag_mask_zero(params, tensor->src[0], tensor);
+ ggml_compute_forward_diag_mask_zero(params, tensor);
  } break;
  case GGML_OP_SOFT_MAX:
  {
- ggml_compute_forward_soft_max(params, tensor->src[0], tensor->src[1], tensor);
+ ggml_compute_forward_soft_max(params, tensor);
  } break;
  case GGML_OP_SOFT_MAX_BACK:
  {
- ggml_compute_forward_soft_max_back(params, tensor->src[0], tensor->src[1], tensor);
+ ggml_compute_forward_soft_max_back(params, tensor);
  } break;
  case GGML_OP_ROPE:
  {
- ggml_compute_forward_rope(params, tensor->src[0], tensor->src[1], tensor);
+ ggml_compute_forward_rope(params, tensor);
  } break;
  case GGML_OP_ROPE_BACK:
  {
- ggml_compute_forward_rope_back(params, tensor->src[0], tensor->src[1], tensor);
+ ggml_compute_forward_rope_back(params, tensor);
  } break;
  case GGML_OP_ALIBI:
  {
- ggml_compute_forward_alibi(params, tensor->src[0], tensor);
+ ggml_compute_forward_alibi(params, tensor);
  } break;
  case GGML_OP_CLAMP:
  {
- ggml_compute_forward_clamp(params, tensor->src[0], tensor);
+ ggml_compute_forward_clamp(params, tensor);
  } break;
  case GGML_OP_CONV_TRANSPOSE_1D:
  {
- ggml_compute_forward_conv_transpose_1d(params, tensor->src[0], tensor->src[1], tensor);
+ ggml_compute_forward_conv_transpose_1d(params, tensor);
  } break;
  case GGML_OP_IM2COL:
  {
- ggml_compute_forward_im2col(params, tensor->src[0], tensor->src[1], tensor);
+ ggml_compute_forward_im2col(params, tensor);
  } break;
  case GGML_OP_CONV_TRANSPOSE_2D:
  {
- ggml_compute_forward_conv_transpose_2d(params, tensor->src[0], tensor->src[1], tensor);
+ ggml_compute_forward_conv_transpose_2d(params, tensor);
  } break;
  case GGML_OP_POOL_1D:
  {
- ggml_compute_forward_pool_1d(params, tensor->src[0], tensor);
+ ggml_compute_forward_pool_1d(params, tensor);
  } break;
  case GGML_OP_POOL_2D:
  {
- ggml_compute_forward_pool_2d(params, tensor->src[0], tensor);
+ ggml_compute_forward_pool_2d(params, tensor);
  } break;
  case GGML_OP_UPSCALE:
  {
- ggml_compute_forward_upscale(params, tensor->src[0], tensor);
+ ggml_compute_forward_upscale(params, tensor);
  } break;
  case GGML_OP_PAD:
  {
- ggml_compute_forward_pad(params, tensor->src[0], tensor);
+ ggml_compute_forward_pad(params, tensor);
  } break;
  case GGML_OP_ARGSORT:
  {
- ggml_compute_forward_argsort(params, tensor->src[0], tensor);
+ ggml_compute_forward_argsort(params, tensor);
  } break;
  case GGML_OP_LEAKY_RELU:
  {
- ggml_compute_forward_leaky_relu(params, tensor->src[0], tensor);
+ ggml_compute_forward_leaky_relu(params, tensor);
  } break;
  case GGML_OP_FLASH_ATTN:
  {
  const int32_t t = ggml_get_op_params_i32(tensor, 0);
  GGML_ASSERT(t == 0 || t == 1);
  const bool masked = t != 0;
- ggml_compute_forward_flash_attn(params, tensor->src[0], tensor->src[1], tensor->src[2], masked, tensor);
+ ggml_compute_forward_flash_attn(params, masked, tensor);
  } break;
  case GGML_OP_FLASH_FF:
  {
- ggml_compute_forward_flash_ff(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor->src[3], tensor->src[4], tensor);
+ ggml_compute_forward_flash_ff(params, tensor);
  } break;
  case GGML_OP_FLASH_ATTN_BACK:
  {
  int32_t t = ggml_get_op_params_i32(tensor, 0);
  GGML_ASSERT(t == 0 || t == 1);
  bool masked = t != 0;
- ggml_compute_forward_flash_attn_back(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor->src[3], masked, tensor);
+ ggml_compute_forward_flash_attn_back(params, masked, tensor);
  } break;
  case GGML_OP_WIN_PART:
  {
- ggml_compute_forward_win_part(params, tensor->src[0], tensor);
+ ggml_compute_forward_win_part(params, tensor);
  } break;
  case GGML_OP_WIN_UNPART:
  {
- ggml_compute_forward_win_unpart(params, tensor->src[0], tensor);
+ ggml_compute_forward_win_unpart(params, tensor);
  } break;
  case GGML_OP_UNARY:
  {
- ggml_compute_forward_unary(params, tensor->src[0], tensor);
+ ggml_compute_forward_unary(params, tensor);
  } break;
  case GGML_OP_GET_REL_POS:
  {
- ggml_compute_forward_get_rel_pos(params, tensor->src[0], tensor);
+ ggml_compute_forward_get_rel_pos(params, tensor);
  } break;
  case GGML_OP_ADD_REL_POS:
  {
- ggml_compute_forward_add_rel_pos(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor);
+ ggml_compute_forward_add_rel_pos(params, tensor);
  } break;
  case GGML_OP_MAP_UNARY:
  {
  ggml_unary_op_f32_t fun;
  memcpy(&fun, tensor->op_params, sizeof(fun));
- ggml_compute_forward_map_unary(params, tensor->src[0], tensor, fun);
+ ggml_compute_forward_map_unary(params, tensor, fun);
  }
  break;
  case GGML_OP_MAP_BINARY:
  {
  ggml_binary_op_f32_t fun;
  memcpy(&fun, tensor->op_params, sizeof(fun));
- ggml_compute_forward_map_binary(params, tensor->src[0], tensor->src[1], tensor, fun);
+ ggml_compute_forward_map_binary(params, tensor, fun);
  }
  break;
  case GGML_OP_MAP_CUSTOM1_F32:
  {
  ggml_custom1_op_f32_t fun;
  memcpy(&fun, tensor->op_params, sizeof(fun));
- ggml_compute_forward_map_custom1_f32(params, tensor->src[0], tensor, fun);
+ ggml_compute_forward_map_custom1_f32(params, tensor, fun);
  }
  break;
  case GGML_OP_MAP_CUSTOM2_F32:
  {
  ggml_custom2_op_f32_t fun;
  memcpy(&fun, tensor->op_params, sizeof(fun));
- ggml_compute_forward_map_custom2_f32(params, tensor->src[0], tensor->src[1], tensor, fun);
+ ggml_compute_forward_map_custom2_f32(params, tensor, fun);
  }
  break;
  case GGML_OP_MAP_CUSTOM3_F32:
  {
  ggml_custom3_op_f32_t fun;
  memcpy(&fun, tensor->op_params, sizeof(fun));
- ggml_compute_forward_map_custom3_f32(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor, fun);
+ ggml_compute_forward_map_custom3_f32(params, tensor, fun);
  }
  break;
  case GGML_OP_MAP_CUSTOM1:
  {
- ggml_compute_forward_map_custom1(params, tensor->src[0], tensor);
+ ggml_compute_forward_map_custom1(params, tensor);
  }
  break;
  case GGML_OP_MAP_CUSTOM2:
  {
- ggml_compute_forward_map_custom2(params, tensor->src[0], tensor->src[1], tensor);
+ ggml_compute_forward_map_custom2(params, tensor);
  }
  break;
  case GGML_OP_MAP_CUSTOM3:
  {
- ggml_compute_forward_map_custom3(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor);
+ ggml_compute_forward_map_custom3(params, tensor);
  }
  break;
  case GGML_OP_CROSS_ENTROPY_LOSS:
  {
- ggml_compute_forward_cross_entropy_loss(params, tensor->src[0], tensor->src[1], tensor);
+ ggml_compute_forward_cross_entropy_loss(params, tensor);
  }
  break;
  case GGML_OP_CROSS_ENTROPY_LOSS_BACK:
  {
- ggml_compute_forward_cross_entropy_loss_back(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor);
+ ggml_compute_forward_cross_entropy_loss_back(params, tensor);
  }
  break;
  case GGML_OP_NONE:
@@ -15311,7 +15766,7 @@ static struct ggml_tensor * ggml_recompute_graph_node(
  return NULL;
  }

- if (node->is_param) {
+ if (node->flags & GGML_TENSOR_FLAG_PARAM) {
  return node;
  }

@@ -15345,7 +15800,7 @@ static struct ggml_tensor * ggml_recompute_graph_node(

  clone->op = node->op;
  clone->grad = node->grad;
- clone->is_param = node->is_param;
+ clone->flags = node->flags;
  clone->extra = node->extra;
  for (int k = 0; k < GGML_MAX_DIMS; ++k) {
  clone->nb[k] = node->nb[k];
@@ -16377,7 +16832,7 @@ void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph *
  for (int i = 0; i < gf->n_nodes; i++) {
  struct ggml_tensor * node = gf->nodes[i];

- if (node->is_param) {
+ if (node->flags & GGML_TENSOR_FLAG_PARAM) {
  GGML_PRINT_DEBUG("%s: found root node %p\n", __func__, (void *) node);
  ggml_build_forward_expand(gb, node->grad);
  }
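The boolean is_param field is folded into a general flags bitmask throughout this release. The enumerators themselves live in ggml.h rather than in this diff; a sketch of the idea, with assumed bit values for illustration only:

    // Hypothetical sketch of the bitmask replacing the old bool is_param;
    // the actual enumerators and values are defined in ggml.h of this release.
    enum ggml_tensor_flag {
        GGML_TENSOR_FLAG_INPUT  = 1, // assumed value
        GGML_TENSOR_FLAG_OUTPUT = 2, // assumed value
        GGML_TENSOR_FLAG_PARAM  = 4, // assumed value
    };

    // Marking and testing follow the usual bit-twiddling idiom:
    //   node->flags |= GGML_TENSOR_FLAG_PARAM;           // mark as parameter
    //   if (node->flags & GGML_TENSOR_FLAG_PARAM) { ... } // test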
@@ -16581,27 +17036,47 @@ typedef pthread_t ggml_thread_t;
  #endif

  // Android's libc implementation "bionic" does not support setting affinity
- #if defined(__linux__) && !defined(__BIONIC__)
- static void set_numa_thread_affinity(int thread_n, int n_threads) {
+ #if defined(__gnu_linux__)
+ static void set_numa_thread_affinity(int thread_n) {
  if (!ggml_is_numa()) {
  return;
  }

- // run thread on node_num thread_n / (threads per node)
- const int node_num = thread_n / ((n_threads + g_state.numa.n_nodes - 1) / g_state.numa.n_nodes);
- struct ggml_numa_node * node = &g_state.numa.nodes[node_num];
+ int node_num;
+ int rv;
  size_t setsize = CPU_ALLOC_SIZE(g_state.numa.total_cpus);

+ switch(g_state.numa.numa_strategy) {
+ case GGML_NUMA_STRATEGY_DISTRIBUTE:
+ // run thread on node_num thread_n / (threads per node)
+ node_num = thread_n % g_state.numa.n_nodes;
+ break;
+ case GGML_NUMA_STRATEGY_ISOLATE:
+ // run thread on current_node
+ node_num = g_state.numa.current_node;
+ break;
+ case GGML_NUMA_STRATEGY_NUMACTL:
+ // use the cpuset that numactl gave us
+ rv = pthread_setaffinity_np(pthread_self(), setsize, &g_state.numa.cpuset);
+ if (rv) {
+ fprintf(stderr, "warning: pthread_setaffinity_np() failed: %s\n",strerror(rv));
+ }
+ return;
+ default:
+ return;
+ }
+
+ struct ggml_numa_node * node = &g_state.numa.nodes[node_num];
+
  cpu_set_t * cpus = CPU_ALLOC(g_state.numa.total_cpus);
  CPU_ZERO_S(setsize, cpus);
  for (size_t i = 0; i < node->n_cpus; ++i) {
  CPU_SET_S(node->cpus[i], setsize, cpus);
  }

- int rv = pthread_setaffinity_np(pthread_self(), setsize, cpus);
+ rv = pthread_setaffinity_np(pthread_self(), setsize, cpus);
  if (rv) {
- fprintf(stderr, "warning: pthread_setaffinity_np() failed: %s\n",
- strerror(rv));
+ fprintf(stderr, "warning: pthread_setaffinity_np() failed: %s\n", strerror(rv));
  }

  CPU_FREE(cpus);
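The new numa_strategy selects how worker threads are pinned: DISTRIBUTE spreads threads round-robin across nodes, ISOLATE keeps them on the current node, and NUMACTL defers to the cpuset inherited from numactl. A standalone sketch of the round-robin mapping, separate from the package code:

    // With 4 NUMA nodes, threads 0..7 land on nodes 0,1,2,3,0,1,2,3 --
    // the same modular mapping used in the diff above.
    static int distribute_node(int thread_n, int n_nodes) {
        return thread_n % n_nodes;
    }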
@@ -16622,8 +17097,7 @@ static void clear_numa_thread_affinity(void) {

  int rv = pthread_setaffinity_np(pthread_self(), setsize, cpus);
  if (rv) {
- fprintf(stderr, "warning: pthread_setaffinity_np() failed: %s\n",
- strerror(rv));
+ fprintf(stderr, "warning: pthread_setaffinity_np() failed: %s\n", strerror(rv));
  }

  CPU_FREE(cpus);
@@ -16631,7 +17105,7 @@ static void clear_numa_thread_affinity(void) {
  #else
  // TODO: Windows etc.
  // (the linux implementation may also work on BSD, someone should test)
- static void set_numa_thread_affinity(int thread_n, int n_threads) { UNUSED(thread_n); UNUSED(n_threads); }
+ static void set_numa_thread_affinity(int thread_n) { UNUSED(thread_n); }
  static void clear_numa_thread_affinity(void) {}
  #endif

@@ -16649,7 +17123,7 @@ struct ggml_compute_state_shared {
  atomic_int node_n; // active graph node
  atomic_int node_task; // active graph node task phase

- bool (*abort_callback)(void * data); // abort ggml_graph_compute when true
+ ggml_abort_callback abort_callback; // abort ggml_graph_compute when true
  void * abort_callback_data;
  };
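The raw function pointer is replaced by the named ggml_abort_callback typedef, which by the removed line has the shape bool (*)(void *). A minimal sketch of a callback that cancels graph compute after a deadline (the field names above are the package's; everything below is an illustrative assumption):

    #include <stdbool.h>
    #include <time.h>

    struct deadline { time_t end; };

    // Returning true asks ggml_graph_compute to abort.
    static bool abort_after_deadline(void * data) {
        const struct deadline * d = data;
        return time(NULL) >= d->end;
    }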

@@ -16931,7 +17405,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {

  const int n_threads = state->shared->n_threads;

- set_numa_thread_affinity(state->ith, n_threads);
+ set_numa_thread_affinity(state->ith);

  int node_n = -1;
  int task_phase = GGML_TASK_FINALIZE;
@@ -17737,7 +18211,7 @@ struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context *

  ptr += ggml_nbytes(tensor);

- fprintf(stderr, "%s: loaded leaf %d: '%16s', %9zu bytes\n", __func__, i, tensor->name, ggml_nbytes(tensor));
+ fprintf(stderr, "%s: loaded leaf %u: '%16s', %9zu bytes\n", __func__, i, tensor->name, ggml_nbytes(tensor));
  }
  }

@@ -17840,7 +18314,7 @@ struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context *

  result->nodes[i] = tensor;

- fprintf(stderr, "%s: loaded node %d: '%16s', %9zu bytes\n", __func__, i, tensor->name, ggml_nbytes(tensor));
+ fprintf(stderr, "%s: loaded node %u: '%16s', %9zu bytes\n", __func__, i, tensor->name, ggml_nbytes(tensor));
  }
  }
  }
@@ -17862,7 +18336,7 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) {
  GGML_PRINT(" - %3d: [ %5" PRId64 ", %5" PRId64 ", %5" PRId64 "] %16s %s (%3d) cpu = %7.3f / %7.3f ms, wall = %7.3f / %7.3f ms\n",
  i,
  node->ne[0], node->ne[1], node->ne[2],
- ggml_op_name(node->op), node->is_param ? "x" : node->grad ? "g" : " ", node->perf_runs,
+ ggml_op_name(node->op), (node->flags & GGML_TENSOR_FLAG_PARAM) ? "x" : node->grad ? "g" : " ", node->perf_runs,
  (double) node->perf_cycles / (double) ggml_cycles_per_ms(),
  (double) node->perf_cycles / (double) ggml_cycles_per_ms() / (double) node->perf_runs,
  (double) node->perf_time_us / 1000.0,
@@ -17955,7 +18429,7 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph
  continue;
  }

- if (node->is_param) {
+ if (node->flags & GGML_TENSOR_FLAG_PARAM) {
  snprintf(color, sizeof(color), "yellow");
  } else if (node->grad) {
  if (ggml_graph_find(gf, node)) {
@@ -18129,7 +18603,7 @@ static enum ggml_opt_result ggml_opt_adam(
  int np = 0;
  int64_t nx = 0;
  for (int i = 0; i < gf->n_nodes; ++i) {
- if (gf->nodes[i]->is_param) {
+ if (gf->nodes[i]->flags & GGML_TENSOR_FLAG_PARAM) {
  GGML_PRINT_DEBUG("found param %d: grad->op = %d\n", np, gf->nodes[i]->grad->op);

  GGML_ASSERT(np < GGML_MAX_PARAMS);
@@ -18382,7 +18856,7 @@ static enum ggml_opt_result linesearch_backtracking(
  }

  // compute the initial gradient in the search direction
- ggml_vec_dot_f32(nx, &dginit, g, d);
+ ggml_vec_dot_f32(nx, &dginit, 0, g, 0, d, 0, 1);

  // make sure that d points to a descent direction
  if (0 < dginit) {
@@ -18432,7 +18906,7 @@ static enum ggml_opt_result linesearch_backtracking(
  return count;
  }

- ggml_vec_dot_f32(nx, &dg, g, d);
+ ggml_vec_dot_f32(nx, &dg, 0, g, 0, d, 0, 1);

  // check the Wolfe condition
  if (dg < params->lbfgs.wolfe * dginit) {
@@ -18465,7 +18939,9 @@ static enum ggml_opt_result linesearch_backtracking(
  (*step) *= width;
  }

- GGML_UNREACHABLE();
+ GGML_ASSERT(false && "line search failed");
+
+ return GGML_LINESEARCH_FAIL;
  }

  static enum ggml_opt_result ggml_opt_lbfgs(
@@ -18492,7 +18968,7 @@ static enum ggml_opt_result ggml_opt_lbfgs(
  int np = 0;
  int nx = 0;
  for (int i = 0; i < gf->n_nodes; ++i) {
- if (gf->nodes[i]->is_param) {
+ if (gf->nodes[i]->flags & GGML_TENSOR_FLAG_PARAM) {
  GGML_PRINT_DEBUG("found param %d: grad->op = %d\n", np, gf->nodes[i]->grad->op);

  GGML_ASSERT(np < GGML_MAX_PARAMS);
@@ -18693,8 +19169,8 @@ static enum ggml_opt_result ggml_opt_lbfgs(
  // ys = y^t \cdot s -> 1 / \rho.
  // yy = y^t \cdot y.
  //
- ggml_vec_dot_f32(nx, &ys, &lm_y[end[0]*nx], &lm_s[end[0]*nx]);
- ggml_vec_dot_f32(nx, &yy, &lm_y[end[0]*nx], &lm_y[end[0]*nx]);
+ ggml_vec_dot_f32(nx, &ys, 0, &lm_y[end[0]*nx], 0, &lm_s[end[0]*nx], 0, 1);
+ ggml_vec_dot_f32(nx, &yy, 0, &lm_y[end[0]*nx], 0, &lm_y[end[0]*nx], 0, 1);

  lm_ys[end[0]] = ys;

@@ -18713,7 +19189,7 @@ static enum ggml_opt_result ggml_opt_lbfgs(
  for (int i = 0; i < bound; ++i) {
  j[0] = (j[0] + m - 1) % m;
  // \alpha_{j} = \rho_{j} s^{t}_{j} \cdot q_{k+1}
- ggml_vec_dot_f32(nx, &lm_alpha[j[0]], &lm_s[j[0]*nx], d);
+ ggml_vec_dot_f32(nx, &lm_alpha[j[0]], 0, &lm_s[j[0]*nx], 0, d, 0, 1);
  lm_alpha[j[0]] /= lm_ys[j[0]];
  // q_{i} = q_{i+1} - \alpha_{i} y_{i}
  ggml_vec_mad_f32(nx, d, &lm_y[j[0]*nx], -lm_alpha[j[0]]);
@@ -18723,7 +19199,7 @@ static enum ggml_opt_result ggml_opt_lbfgs(

  for (int i = 0; i < bound; ++i) {
  // \beta_{j} = \rho_{j} y^t_{j} \cdot \gamma_{i}
- ggml_vec_dot_f32(nx, &beta, &lm_y[j[0]*nx], d);
+ ggml_vec_dot_f32(nx, &beta, 0, &lm_y[j[0]*nx], 0, d, 0, 1);
  beta /= lm_ys[j[0]];
  // \gamma_{i+1} = \gamma_{i} + (\alpha_{j} - \beta_{j}) s_{j}
  ggml_vec_mad_f32(nx, d, &lm_s[j[0]*nx], lm_alpha[j[0]] - beta);
@@ -18733,7 +19209,9 @@ static enum ggml_opt_result ggml_opt_lbfgs(
  step[0] = 1.0;
  }

- GGML_UNREACHABLE();
+ GGML_ASSERT(false && "lbfgs failed");
+
+ return GGML_OPT_DID_NOT_CONVERGE;
  }

  struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type) {
@@ -18967,12 +19445,23 @@ enum ggml_opt_result ggml_opt_resume_g(
18967
19445
 
18968
19446
  ////////////////////////////////////////////////////////////////////////////////
18969
19447
 
19448
+ void ggml_set_input(struct ggml_tensor * tensor) {
19449
+ tensor->flags |= GGML_TENSOR_FLAG_INPUT;
19450
+ }
19451
+
19452
+ void ggml_set_output(struct ggml_tensor * tensor) {
19453
+ tensor->flags |= GGML_TENSOR_FLAG_OUTPUT;
19454
+ }
19455
+
19456
+ ////////////////////////////////////////////////////////////////////////////////
19457
+
18970
19458
  void ggml_quantize_init(enum ggml_type type) {
18971
19459
  ggml_critical_section_start();
18972
19460
 
18973
19461
  switch (type) {
18974
- case GGML_TYPE_IQ2_XXS: iq2xs_init_impl(256); break;
18975
- case GGML_TYPE_IQ2_XS: iq2xs_init_impl(512); break;
19462
+ case GGML_TYPE_IQ2_XXS:
19463
+ case GGML_TYPE_IQ2_XS:
19464
+ case GGML_TYPE_IQ1_S: iq2xs_init_impl(type); break;
18976
19465
  case GGML_TYPE_IQ3_XXS: iq3xs_init_impl(256); break;
18977
19466
  default: // nothing
18978
19467
  break;
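
Note: this hunk lands two separate additions: `ggml_set_input`/`ggml_set_output` expose the new tensor flag bits to callers, and the IQ table setup is now keyed on the `ggml_type` enum (which folds in the new IQ1_S) instead of raw grid sizes. A plausible usage sketch for the flag setters, assuming an existing context `ctx`, weight tensor `w`, and size `n_embd`:

    // mark graph endpoints: inputs are written by the caller before compute,
    // outputs must survive until the caller reads them back
    struct ggml_tensor * inp = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
    ggml_set_input(inp);

    struct ggml_tensor * out = ggml_mul_mat(ctx, w, inp);
    ggml_set_output(out);
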
@@ -18984,8 +19473,10 @@ void ggml_quantize_init(enum ggml_type type) {
18984
19473
  void ggml_quantize_free(void) {
18985
19474
  ggml_critical_section_start();
18986
19475
 
18987
- iq2xs_free_impl(256);
18988
- iq2xs_free_impl(512);
19476
+ iq2xs_free_impl(GGML_TYPE_IQ2_XXS);
19477
+ iq2xs_free_impl(GGML_TYPE_IQ2_XS);
19478
+ iq2xs_free_impl(GGML_TYPE_IQ1_S);
19479
+ iq3xs_free_impl(256);
18989
19480
 
18990
19481
  ggml_critical_section_end();
18991
19482
  }
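
Note: `ggml_quantize_free` mirrors the init above, now releasing tables per type (including IQ1_S) and also freeing the IQ3 tables, which the previous version never released. Expected pairing, sketched (the middle step is a placeholder):

    ggml_quantize_init(GGML_TYPE_IQ1_S);  // one-time table build, internally
                                          // serialized by the critical section
    /* ... ggml_quantize_chunk(...) calls ... */
    ggml_quantize_free();                 // drop all IQ lookup tables at shutdown
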
@@ -19120,7 +19611,8 @@ size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t *
19120
19611
  bool ggml_quantize_requires_imatrix(enum ggml_type type) {
19121
19612
  return
19122
19613
  type == GGML_TYPE_IQ2_XXS ||
19123
- type == GGML_TYPE_IQ2_XS;
19614
+ type == GGML_TYPE_IQ2_XS ||
19615
+ type == GGML_TYPE_IQ1_S;
19124
19616
  }
19125
19617
 
19126
19618
  size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start,
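
Note: IQ1_S joins the types whose quantizer cannot run without an importance matrix, so callers should gate on this before dispatching. A hedged guard sketch (the helper and its caller are illustrative, not library code):

    #include <stdio.h>
    #include "ggml.h"

    // refuse to quantize IQ2_XXS / IQ2_XS / IQ1_S without activation statistics
    static int check_imatrix(enum ggml_type type, const float * imatrix) {
        if (ggml_quantize_requires_imatrix(type) && imatrix == NULL) {
            fprintf(stderr, "type %s needs an importance matrix\n",
                    ggml_type_name(type));
            return 0;
        }
        return 1;
    }
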
@@ -19245,6 +19737,24 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i
19245
19737
  result = quantize_iq3_xxs(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
19246
19738
  GGML_ASSERT(result == row_size * nrows);
19247
19739
  } break;
19740
+ case GGML_TYPE_IQ1_S:
19741
+ {
19742
+ GGML_ASSERT(start % QK_K == 0);
19743
+ GGML_ASSERT(start % n_per_row == 0);
19744
+ size_t start_row = start / n_per_row;
19745
+ size_t row_size = ggml_row_size(type, n_per_row);
19746
+ result = quantize_iq1_s(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
19747
+ GGML_ASSERT(result == row_size * nrows);
19748
+ } break;
19749
+ case GGML_TYPE_IQ4_NL:
19750
+ {
19751
+ GGML_ASSERT(start % QK4_NL == 0);
19752
+ GGML_ASSERT(start % n_per_row == 0);
19753
+ size_t start_row = start / n_per_row;
19754
+ size_t row_size = ggml_row_size(type, n_per_row);
19755
+ result = quantize_iq4_nl(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
19756
+ GGML_ASSERT(result == row_size * nrows);
19757
+ } break;
19248
19758
  case GGML_TYPE_F16:
19249
19759
  {
19250
19760
  size_t elemsize = sizeof(ggml_fp16_t);
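
Note: both new cases reuse the chunking arithmetic of the existing IQ paths: `start` is an element offset that must fall on a row boundary, so the destination row is `start / n_per_row` and the byte offset is that row index times the quantized row size. For instance, with `n_per_row = 4096` and `start = 8192`:

    size_t start_row = start / n_per_row;               // 8192 / 4096 = row 2
    size_t row_size  = ggml_row_size(type, n_per_row);  // bytes per quantized row
    char * out       = (char *) dst + start_row * row_size;
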
@@ -20611,4 +21121,12 @@ int ggml_cpu_has_vsx(void) {
20611
21121
  #endif
20612
21122
  }
20613
21123
 
21124
+ int ggml_cpu_has_matmul_int8(void) {
21125
+ #if defined(__ARM_FEATURE_MATMUL_INT8)
21126
+ return 1;
21127
+ #else
21128
+ return 0;
21129
+ #endif
21130
+ }
21131
+
20614
21132
  ////////////////////////////////////////////////////////////////////////////////
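
Note: the new probe reports whether this build targets Arm's int8 matrix-multiply extension (the `__ARM_FEATURE_MATMUL_INT8` predefine), in the same style as the other compile-time capability checks. A small query sketch:

    #include <stdio.h>
    #include "ggml.h"

    int main(void) {
        // capabilities baked into this ggml build at compile time
        printf("NEON    = %d\n", ggml_cpu_has_neon());
        printf("INT8 MM = %d\n", ggml_cpu_has_matmul_int8());
        return 0;
    }
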