llama_cpp 0.15.4 → 0.16.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (161)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +16 -0
  3. data/ext/llama_cpp/extconf.rb +3 -2
  4. data/ext/llama_cpp/llama_cpp.cpp +17 -3
  5. data/lib/llama_cpp/version.rb +2 -2
  6. data/sig/llama_cpp.rbs +15 -1
  7. data/vendor/tmp/llama.cpp/Makefile +166 -82
  8. data/vendor/tmp/llama.cpp/ggml-alloc.c +82 -26
  9. data/vendor/tmp/llama.cpp/ggml-backend-impl.h +20 -8
  10. data/vendor/tmp/llama.cpp/ggml-backend.c +183 -69
  11. data/vendor/tmp/llama.cpp/ggml-backend.h +4 -4
  12. data/vendor/tmp/llama.cpp/ggml-blas.cpp +363 -0
  13. data/vendor/tmp/llama.cpp/ggml-blas.h +23 -0
  14. data/vendor/tmp/llama.cpp/ggml-common.h +6 -0
  15. data/vendor/tmp/llama.cpp/ggml-cuda/acc.cu +47 -0
  16. data/vendor/tmp/llama.cpp/ggml-cuda/arange.cu +34 -0
  17. data/vendor/tmp/llama.cpp/ggml-cuda/argsort.cu +104 -0
  18. data/vendor/tmp/llama.cpp/ggml-cuda/binbcast.cu +280 -0
  19. data/vendor/tmp/llama.cpp/ggml-cuda/clamp.cu +34 -0
  20. data/vendor/tmp/llama.cpp/ggml-cuda/concat.cu +196 -0
  21. data/vendor/tmp/llama.cpp/ggml-cuda/convert.cu +686 -0
  22. data/vendor/tmp/llama.cpp/ggml-cuda/cpy.cu +490 -0
  23. data/vendor/tmp/llama.cpp/ggml-cuda/diagmask.cu +40 -0
  24. data/vendor/tmp/llama.cpp/ggml-cuda/dmmv.cu +674 -0
  25. data/vendor/tmp/llama.cpp/ggml-cuda/fattn-tile-f16.cu +319 -0
  26. data/vendor/tmp/llama.cpp/ggml-cuda/fattn-tile-f32.cu +312 -0
  27. data/vendor/tmp/llama.cpp/ggml-cuda/fattn.cu +345 -0
  28. data/vendor/tmp/llama.cpp/ggml-cuda/getrows.cu +178 -0
  29. data/vendor/tmp/llama.cpp/ggml-cuda/im2col.cu +104 -0
  30. data/vendor/tmp/llama.cpp/ggml-cuda/mmq.cu +88 -0
  31. data/vendor/tmp/llama.cpp/ggml-cuda/mmvq.cu +419 -0
  32. data/vendor/tmp/llama.cpp/ggml-cuda/norm.cu +221 -0
  33. data/vendor/tmp/llama.cpp/ggml-cuda/pad.cu +49 -0
  34. data/vendor/tmp/llama.cpp/ggml-cuda/pool2d.cu +94 -0
  35. data/vendor/tmp/llama.cpp/ggml-cuda/quantize.cu +112 -0
  36. data/vendor/tmp/llama.cpp/ggml-cuda/rope.cu +271 -0
  37. data/vendor/tmp/llama.cpp/ggml-cuda/scale.cu +31 -0
  38. data/vendor/tmp/llama.cpp/ggml-cuda/softmax.cu +206 -0
  39. data/vendor/tmp/llama.cpp/ggml-cuda/sumrows.cu +40 -0
  40. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu +5 -0
  41. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu +5 -0
  42. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu +5 -0
  43. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu +5 -0
  44. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu +5 -0
  45. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu +5 -0
  46. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu +5 -0
  47. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu +5 -0
  48. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu +5 -0
  49. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu +5 -0
  50. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu +5 -0
  51. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu +5 -0
  52. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu +5 -0
  53. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu +5 -0
  54. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu +5 -0
  55. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu +5 -0
  56. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu +5 -0
  57. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu +5 -0
  58. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu +5 -0
  59. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu +5 -0
  60. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu +5 -0
  61. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu +5 -0
  62. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu +5 -0
  63. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu +5 -0
  64. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu +5 -0
  65. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu +5 -0
  66. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu +5 -0
  67. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu +5 -0
  68. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu +5 -0
  69. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu +5 -0
  70. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu +5 -0
  71. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu +5 -0
  72. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu +5 -0
  73. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu +5 -0
  74. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu +5 -0
  75. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu +5 -0
  76. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu +5 -0
  77. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu +5 -0
  78. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu +5 -0
  79. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu +5 -0
  80. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu +5 -0
  81. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu +5 -0
  82. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu +5 -0
  83. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu +5 -0
  84. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu +5 -0
  85. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu +5 -0
  86. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu +5 -0
  87. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu +5 -0
  88. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu +5 -0
  89. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu +5 -0
  90. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu +5 -0
  91. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu +5 -0
  92. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu +5 -0
  93. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu +5 -0
  94. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu +5 -0
  95. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu +5 -0
  96. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu +5 -0
  97. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu +5 -0
  98. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu +5 -0
  99. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu +5 -0
  100. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu +5 -0
  101. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu +5 -0
  102. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu +5 -0
  103. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu +5 -0
  104. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu +5 -0
  105. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu +5 -0
  106. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu +5 -0
  107. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu +5 -0
  108. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu +5 -0
  109. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu +5 -0
  110. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu +5 -0
  111. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu +5 -0
  112. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu +5 -0
  113. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu +5 -0
  114. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu +5 -0
  115. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu +5 -0
  116. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu +5 -0
  117. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu +5 -0
  118. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu +5 -0
  119. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu +5 -0
  120. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu +5 -0
  121. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu +5 -0
  122. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu +5 -0
  123. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu +5 -0
  124. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu +5 -0
  125. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu +5 -0
  126. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb16.cu +10 -0
  127. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb32.cu +9 -0
  128. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb16.cu +10 -0
  129. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb32.cu +10 -0
  130. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb8.cu +8 -0
  131. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q2_k.cu +5 -0
  132. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q3_k.cu +5 -0
  133. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_0.cu +5 -0
  134. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_1.cu +5 -0
  135. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_k.cu +5 -0
  136. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_0.cu +5 -0
  137. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_1.cu +5 -0
  138. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_k.cu +5 -0
  139. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q6_k.cu +5 -0
  140. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q8_0.cu +5 -0
  141. data/vendor/tmp/llama.cpp/ggml-cuda/tsembd.cu +47 -0
  142. data/vendor/tmp/llama.cpp/ggml-cuda/unary.cu +286 -0
  143. data/vendor/tmp/llama.cpp/ggml-cuda/upscale.cu +51 -0
  144. data/vendor/tmp/llama.cpp/ggml-cuda.cu +103 -135
  145. data/vendor/tmp/llama.cpp/ggml-kompute.cpp +29 -13
  146. data/vendor/tmp/llama.cpp/ggml-metal.h +1 -1
  147. data/vendor/tmp/llama.cpp/ggml-metal.m +45 -33
  148. data/vendor/tmp/llama.cpp/ggml-metal.metal +83 -59
  149. data/vendor/tmp/llama.cpp/ggml-rpc.cpp +15 -14
  150. data/vendor/tmp/llama.cpp/ggml-sycl.cpp +26 -90
  151. data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +74522 -14913
  152. data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +631 -471
  153. data/vendor/tmp/llama.cpp/ggml.c +278 -603
  154. data/vendor/tmp/llama.cpp/ggml.h +9 -28
  155. data/vendor/tmp/llama.cpp/llama.cpp +345 -473
  156. data/vendor/tmp/llama.cpp/llama.h +21 -43
  157. metadata +134 -7
  158. data/vendor/tmp/llama.cpp/ggml-mpi.c +0 -216
  159. data/vendor/tmp/llama.cpp/ggml-mpi.h +0 -39
  160. data/vendor/tmp/llama.cpp/ggml-opencl.cpp +0 -2305
  161. data/vendor/tmp/llama.cpp/ggml-opencl.h +0 -36
@@ -5,6 +5,7 @@
5
5
  #include "ggml-quants.h"
6
6
  #include "ggml.h"
7
7
 
8
+
8
9
  #if defined(_MSC_VER) || defined(__MINGW32__)
9
10
  #include <malloc.h> // using malloc.h with MSC/MINGW
10
11
  #elif !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__)
@@ -28,6 +29,10 @@
28
29
  #include <syscall.h>
29
30
  #endif
30
31
 
32
+ #ifdef GGML_USE_OPENMP
33
+ #include <omp.h>
34
+ #endif
35
+
31
36
  #ifdef GGML_USE_METAL
32
37
  #include <unistd.h>
33
38
  #endif
@@ -292,17 +297,6 @@ inline static void * ggml_calloc(size_t num, size_t size) {
292
297
 
293
298
  #if defined(GGML_USE_ACCELERATE)
294
299
  #include <Accelerate/Accelerate.h>
295
- #if defined(GGML_USE_CLBLAST) // allow usage of CLBlast alongside Accelerate functions
296
- #include "ggml-opencl.h"
297
- #endif
298
- #elif defined(GGML_USE_OPENBLAS)
299
- #if defined(GGML_BLAS_USE_MKL)
300
- #include <mkl.h>
301
- #else
302
- #include <cblas.h>
303
- #endif
304
- #elif defined(GGML_USE_CLBLAST)
305
- #include "ggml-opencl.h"
306
300
  #endif
307
301
 
308
302
  // floating point type used to accumulate sums
@@ -1756,7 +1750,7 @@ struct ggml_compute_state_shared {
1756
1750
  int64_t perf_node_start_cycles;
1757
1751
  int64_t perf_node_start_time_us;
1758
1752
 
1759
- const int n_threads;
1753
+ int n_threads;
1760
1754
 
1761
1755
  // synchronization primitives
1762
1756
  atomic_int n_active; // num active threads
@@ -2267,6 +2261,11 @@ inline static float ggml_silu_f32(float x) {
2267
2261
  return x/(1.0f + expf(-x));
2268
2262
  }
2269
2263
 
2264
+ #if __FINITE_MATH_ONLY__
2265
+ #error "some routines in ggml.c require non-finite math arithmetics -- pass -fno-finite-math-only to the compiler to fix"
2266
+ #error "ref: https://github.com/ggerganov/llama.cpp/pull/7154#issuecomment-2143844461"
2267
+ #endif
2268
+
2270
2269
  #if defined(__ARM_NEON) && defined(__aarch64__)
2271
2270
 
2272
2271
  // adapted from arm limited optimized routine
@@ -3207,35 +3206,42 @@ GGML_CALL bool ggml_is_transposed(const struct ggml_tensor * tensor) {
3207
3206
  return tensor->nb[0] > tensor->nb[1];
3208
3207
  }
3209
3208
 
3210
- GGML_CALL bool ggml_is_contiguous(const struct ggml_tensor * tensor) {
3211
- static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
3209
+ static bool ggml_is_contiguous_n(const struct ggml_tensor * tensor, int n) {
3210
+ size_t next_nb = ggml_type_size(tensor->type);
3211
+ if (tensor->ne[0] != ggml_blck_size(tensor->type) && tensor->nb[0] != next_nb) {
3212
+ return false;
3213
+ }
3214
+ next_nb *= tensor->ne[0]/ggml_blck_size(tensor->type);
3215
+ for (int i = 1; i < GGML_MAX_DIMS; i++) {
3216
+ if (tensor->ne[i] != 1) {
3217
+ if (i > n) {
3218
+ if (tensor->nb[i] != next_nb) {
3219
+ return false;
3220
+ }
3221
+ next_nb *= tensor->ne[i];
3222
+ } else {
3223
+ // this dimension does not need to be contiguous
3224
+ next_nb = tensor->ne[i]*tensor->nb[i];
3225
+ }
3226
+ }
3227
+ }
3228
+ return true;
3229
+ }
3212
3230
 
3213
- return
3214
- tensor->nb[0] == ggml_type_size(tensor->type) &&
3215
- tensor->nb[1] == (tensor->nb[0]*tensor->ne[0])/ggml_blck_size(tensor->type) &&
3216
- tensor->nb[2] == tensor->nb[1]*tensor->ne[1] &&
3217
- tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
3231
+ GGML_CALL bool ggml_is_contiguous(const struct ggml_tensor * tensor) {
3232
+ return ggml_is_contiguous_0(tensor);
3218
3233
  }
3219
3234
 
3220
3235
  GGML_CALL bool ggml_is_contiguous_0(const struct ggml_tensor * tensor) {
3221
- return ggml_is_contiguous(tensor);
3236
+ return ggml_is_contiguous_n(tensor, 0);
3222
3237
  }
3223
3238
 
3224
3239
  GGML_CALL bool ggml_is_contiguous_1(const struct ggml_tensor * tensor) {
3225
- static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
3226
-
3227
- return
3228
- tensor->nb[0] == ggml_type_size(tensor->type) &&
3229
- tensor->nb[2] == tensor->nb[1]*tensor->ne[1] &&
3230
- tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
3240
+ return ggml_is_contiguous_n(tensor, 1);
3231
3241
  }
3232
3242
 
3233
3243
  GGML_CALL bool ggml_is_contiguous_2(const struct ggml_tensor * tensor) {
3234
- static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
3235
-
3236
- return
3237
- tensor->nb[0] == ggml_type_size(tensor->type) &&
3238
- tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
3244
+ return ggml_is_contiguous_n(tensor, 2);
3239
3245
  }
3240
3246
 
3241
3247
  GGML_CALL bool ggml_is_permuted(const struct ggml_tensor * tensor) {
@@ -3267,20 +3273,20 @@ bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor
3267
3273
  static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
3268
3274
 
3269
3275
  return
3270
- (t0->ne[0] == t1->ne[0] ) &&
3271
- (t0->ne[1] == t1->ne[1] ) &&
3272
- (t0->ne[2] == t1->ne[2] ) &&
3273
- (t0->ne[3] == t1->ne[3] );
3276
+ (t0->ne[0] == t1->ne[0]) &&
3277
+ (t0->ne[1] == t1->ne[1]) &&
3278
+ (t0->ne[2] == t1->ne[2]) &&
3279
+ (t0->ne[3] == t1->ne[3]);
3274
3280
  }
3275
3281
 
3276
3282
  bool ggml_are_same_stride(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
3277
3283
  static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
3278
3284
 
3279
3285
  return
3280
- (t0->nb[0] == t1->nb[0] ) &&
3281
- (t0->nb[1] == t1->nb[1] ) &&
3282
- (t0->nb[2] == t1->nb[2] ) &&
3283
- (t0->nb[3] == t1->nb[3] );
3286
+ (t0->nb[0] == t1->nb[0]) &&
3287
+ (t0->nb[1] == t1->nb[1]) &&
3288
+ (t0->nb[2] == t1->nb[2]) &&
3289
+ (t0->nb[3] == t1->nb[3]);
3284
3290
  }
3285
3291
 
3286
3292
  // check if t1 can be represented as a repeatition of t0
@@ -3370,10 +3376,6 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
3370
3376
  GGML_PRINT_DEBUG("%s: g_state initialized in %f ms\n", __func__, (t_end - t_start)/1000.0f);
3371
3377
  }
3372
3378
 
3373
- #if defined(GGML_USE_CLBLAST)
3374
- ggml_cl_init();
3375
- #endif
3376
-
3377
3379
  ggml_setup_op_has_task_pass();
3378
3380
 
3379
3381
  is_first_call = false;
@@ -4077,32 +4079,26 @@ float ggml_get_f32_1d(const struct ggml_tensor * tensor, int i) {
4077
4079
  switch (tensor->type) {
4078
4080
  case GGML_TYPE_I8:
4079
4081
  {
4080
- GGML_ASSERT(tensor->nb[0] == sizeof(int8_t));
4081
4082
  return ((int8_t *)(tensor->data))[i];
4082
4083
  }
4083
4084
  case GGML_TYPE_I16:
4084
4085
  {
4085
- GGML_ASSERT(tensor->nb[0] == sizeof(int16_t));
4086
4086
  return ((int16_t *)(tensor->data))[i];
4087
4087
  }
4088
4088
  case GGML_TYPE_I32:
4089
4089
  {
4090
- GGML_ASSERT(tensor->nb[0] == sizeof(int32_t));
4091
4090
  return ((int32_t *)(tensor->data))[i];
4092
4091
  }
4093
4092
  case GGML_TYPE_F16:
4094
4093
  {
4095
- GGML_ASSERT(tensor->nb[0] == sizeof(ggml_fp16_t));
4096
4094
  return GGML_FP16_TO_FP32(((ggml_fp16_t *)(tensor->data))[i]);
4097
4095
  }
4098
4096
  case GGML_TYPE_BF16:
4099
4097
  {
4100
- GGML_ASSERT(tensor->nb[0] == sizeof(ggml_bf16_t));
4101
4098
  return GGML_BF16_TO_FP32(((ggml_bf16_t *)(tensor->data))[i]);
4102
4099
  }
4103
4100
  case GGML_TYPE_F32:
4104
4101
  {
4105
- GGML_ASSERT(tensor->nb[0] == sizeof(float));
4106
4102
  return ((float *)(tensor->data))[i];
4107
4103
  }
4108
4104
  default:
@@ -4124,32 +4120,26 @@ void ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value) {
4124
4120
  switch (tensor->type) {
4125
4121
  case GGML_TYPE_I8:
4126
4122
  {
4127
- GGML_ASSERT(tensor->nb[0] == sizeof(int8_t));
4128
4123
  ((int8_t *)(tensor->data))[i] = value;
4129
4124
  } break;
4130
4125
  case GGML_TYPE_I16:
4131
4126
  {
4132
- GGML_ASSERT(tensor->nb[0] == sizeof(int16_t));
4133
4127
  ((int16_t *)(tensor->data))[i] = value;
4134
4128
  } break;
4135
4129
  case GGML_TYPE_I32:
4136
4130
  {
4137
- GGML_ASSERT(tensor->nb[0] == sizeof(int32_t));
4138
4131
  ((int32_t *)(tensor->data))[i] = value;
4139
4132
  } break;
4140
4133
  case GGML_TYPE_F16:
4141
4134
  {
4142
- GGML_ASSERT(tensor->nb[0] == sizeof(ggml_fp16_t));
4143
4135
  ((ggml_fp16_t *)(tensor->data))[i] = GGML_FP32_TO_FP16(value);
4144
4136
  } break;
4145
4137
  case GGML_TYPE_BF16:
4146
4138
  {
4147
- GGML_ASSERT(tensor->nb[0] == sizeof(ggml_bf16_t));
4148
4139
  ((ggml_bf16_t *)(tensor->data))[i] = GGML_FP32_TO_BF16(value);
4149
4140
  } break;
4150
4141
  case GGML_TYPE_F32:
4151
4142
  {
4152
- GGML_ASSERT(tensor->nb[0] == sizeof(float));
4153
4143
  ((float *)(tensor->data))[i] = value;
4154
4144
  } break;
4155
4145
  default:
@@ -6249,16 +6239,13 @@ static struct ggml_tensor * ggml_rope_impl(
6249
6239
  struct ggml_tensor * c,
6250
6240
  int n_dims,
6251
6241
  int mode,
6252
- int n_ctx,
6253
- int n_orig_ctx,
6242
+ int n_ctx_orig,
6254
6243
  float freq_base,
6255
6244
  float freq_scale,
6256
6245
  float ext_factor,
6257
6246
  float attn_factor,
6258
6247
  float beta_fast,
6259
6248
  float beta_slow,
6260
- float xpos_base,
6261
- bool xpos_down,
6262
6249
  bool inplace) {
6263
6250
  GGML_ASSERT((mode & 1) == 0 && "mode & 1 == 1 is no longer supported");
6264
6251
 
@@ -6279,15 +6266,13 @@ static struct ggml_tensor * ggml_rope_impl(
6279
6266
 
6280
6267
  struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
6281
6268
 
6282
- int32_t params[13] = { /*n_past*/ 0, n_dims, mode, n_ctx, n_orig_ctx };
6269
+ int32_t params[11] = { /*n_past*/ 0, n_dims, mode, /*n_ctx*/ 0, n_ctx_orig };
6283
6270
  memcpy(params + 5, &freq_base, sizeof(float));
6284
6271
  memcpy(params + 6, &freq_scale, sizeof(float));
6285
6272
  memcpy(params + 7, &ext_factor, sizeof(float));
6286
6273
  memcpy(params + 8, &attn_factor, sizeof(float));
6287
6274
  memcpy(params + 9, &beta_fast, sizeof(float));
6288
6275
  memcpy(params + 10, &beta_slow, sizeof(float));
6289
- memcpy(params + 11, &xpos_base, sizeof(float));
6290
- memcpy(params + 12, &xpos_down, sizeof(bool));
6291
6276
  ggml_set_op_params(result, params, sizeof(params));
6292
6277
 
6293
6278
  result->op = GGML_OP_ROPE;
@@ -6304,10 +6289,9 @@ struct ggml_tensor * ggml_rope(
6304
6289
  struct ggml_tensor * a,
6305
6290
  struct ggml_tensor * b,
6306
6291
  int n_dims,
6307
- int mode,
6308
- int n_ctx) {
6292
+ int mode) {
6309
6293
  return ggml_rope_impl(
6310
- ctx, a, b, NULL, n_dims, mode, n_ctx, 0, 10000.0f, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, false, false
6294
+ ctx, a, b, NULL, n_dims, mode, 0, 10000.0f, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f, false
6311
6295
  );
6312
6296
  }
6313
6297
 
@@ -6316,10 +6300,9 @@ struct ggml_tensor * ggml_rope_inplace(
6316
6300
  struct ggml_tensor * a,
6317
6301
  struct ggml_tensor * b,
6318
6302
  int n_dims,
6319
- int mode,
6320
- int n_ctx) {
6303
+ int mode) {
6321
6304
  return ggml_rope_impl(
6322
- ctx, a, b, NULL, n_dims, mode, n_ctx, 0, 10000.0f, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, false, true
6305
+ ctx, a, b, NULL, n_dims, mode, 0, 10000.0f, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f, true
6323
6306
  );
6324
6307
  }
6325
6308
 
@@ -6330,8 +6313,7 @@ struct ggml_tensor * ggml_rope_ext(
6330
6313
  struct ggml_tensor * c,
6331
6314
  int n_dims,
6332
6315
  int mode,
6333
- int n_ctx,
6334
- int n_orig_ctx,
6316
+ int n_ctx_orig,
6335
6317
  float freq_base,
6336
6318
  float freq_scale,
6337
6319
  float ext_factor,
@@ -6339,8 +6321,8 @@ struct ggml_tensor * ggml_rope_ext(
6339
6321
  float beta_fast,
6340
6322
  float beta_slow) {
6341
6323
  return ggml_rope_impl(
6342
- ctx, a, b, c, n_dims, mode, n_ctx, n_orig_ctx, freq_base, freq_scale,
6343
- ext_factor, attn_factor, beta_fast, beta_slow, 0.0f, false, false
6324
+ ctx, a, b, c, n_dims, mode, n_ctx_orig, freq_base, freq_scale,
6325
+ ext_factor, attn_factor, beta_fast, beta_slow, false
6344
6326
  );
6345
6327
  }
6346
6328
 
@@ -6351,8 +6333,7 @@ struct ggml_tensor * ggml_rope_ext_inplace(
6351
6333
  struct ggml_tensor * c,
6352
6334
  int n_dims,
6353
6335
  int mode,
6354
- int n_ctx,
6355
- int n_orig_ctx,
6336
+ int n_ctx_orig,
6356
6337
  float freq_base,
6357
6338
  float freq_scale,
6358
6339
  float ext_factor,
@@ -6360,8 +6341,8 @@ struct ggml_tensor * ggml_rope_ext_inplace(
6360
6341
  float beta_fast,
6361
6342
  float beta_slow) {
6362
6343
  return ggml_rope_impl(
6363
- ctx, a, b, c, n_dims, mode, n_ctx, n_orig_ctx, freq_base, freq_scale,
6364
- ext_factor, attn_factor, beta_fast, beta_slow, 0.0f, false, true
6344
+ ctx, a, b, c, n_dims, mode, n_ctx_orig, freq_base, freq_scale,
6345
+ ext_factor, attn_factor, beta_fast, beta_slow, true
6365
6346
  );
6366
6347
  }
6367
6348
 
@@ -6371,8 +6352,7 @@ struct ggml_tensor * ggml_rope_custom(
6371
6352
  struct ggml_tensor * b,
6372
6353
  int n_dims,
6373
6354
  int mode,
6374
- int n_ctx,
6375
- int n_orig_ctx,
6355
+ int n_ctx_orig,
6376
6356
  float freq_base,
6377
6357
  float freq_scale,
6378
6358
  float ext_factor,
@@ -6380,8 +6360,8 @@ struct ggml_tensor * ggml_rope_custom(
6380
6360
  float beta_fast,
6381
6361
  float beta_slow) {
6382
6362
  return ggml_rope_impl(
6383
- ctx, a, b, NULL, n_dims, mode, n_ctx, n_orig_ctx, freq_base, freq_scale,
6384
- ext_factor, attn_factor, beta_fast, beta_slow, 0.0f, false, false
6363
+ ctx, a, b, NULL, n_dims, mode, n_ctx_orig, freq_base, freq_scale,
6364
+ ext_factor, attn_factor, beta_fast, beta_slow, false
6385
6365
  );
6386
6366
  }
6387
6367
 
@@ -6391,8 +6371,7 @@ struct ggml_tensor * ggml_rope_custom_inplace(
6391
6371
  struct ggml_tensor * b,
6392
6372
  int n_dims,
6393
6373
  int mode,
6394
- int n_ctx,
6395
- int n_orig_ctx,
6374
+ int n_ctx_orig,
6396
6375
  float freq_base,
6397
6376
  float freq_scale,
6398
6377
  float ext_factor,
@@ -6400,21 +6379,11 @@ struct ggml_tensor * ggml_rope_custom_inplace(
6400
6379
  float beta_fast,
6401
6380
  float beta_slow) {
6402
6381
  return ggml_rope_impl(
6403
- ctx, a, b, NULL, n_dims, mode, n_ctx, n_orig_ctx, freq_base, freq_scale,
6404
- ext_factor, attn_factor, beta_fast, beta_slow, 0.0f, false, true
6382
+ ctx, a, b, NULL, n_dims, mode, n_ctx_orig, freq_base, freq_scale,
6383
+ ext_factor, attn_factor, beta_fast, beta_slow, true
6405
6384
  );
6406
6385
  }
6407
6386
 
6408
- struct ggml_tensor * ggml_rope_xpos_inplace(
6409
- struct ggml_context * ctx,
6410
- struct ggml_tensor * a,
6411
- struct ggml_tensor * b,
6412
- int n_dims,
6413
- float base,
6414
- bool down) {
6415
- return ggml_rope_impl(ctx, a, b, NULL, n_dims, 0, 0, 0, 10000.0f, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f, base, down, true);
6416
- }
6417
-
6418
6387
  // ggml_rope_back
6419
6388
 
6420
6389
  struct ggml_tensor * ggml_rope_back(
@@ -6424,16 +6393,13 @@ struct ggml_tensor * ggml_rope_back(
6424
6393
  struct ggml_tensor * c,
6425
6394
  int n_dims,
6426
6395
  int mode,
6427
- int n_ctx,
6428
- int n_orig_ctx,
6396
+ int n_ctx_orig,
6429
6397
  float freq_base,
6430
6398
  float freq_scale,
6431
6399
  float ext_factor,
6432
6400
  float attn_factor,
6433
6401
  float beta_fast,
6434
- float beta_slow,
6435
- float xpos_base,
6436
- bool xpos_down) {
6402
+ float beta_slow) {
6437
6403
  GGML_ASSERT(ggml_is_vector(b));
6438
6404
  GGML_ASSERT(b->type == GGML_TYPE_I32);
6439
6405
  GGML_ASSERT(a->ne[2] == b->ne[0]);
@@ -6449,15 +6415,13 @@ struct ggml_tensor * ggml_rope_back(
6449
6415
 
6450
6416
  struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
6451
6417
 
6452
- int32_t params[13] = { /*n_past*/ 0, n_dims, mode, n_ctx, n_orig_ctx };
6418
+ int32_t params[11] = { /*n_past*/ 0, n_dims, mode, /*n_ctx*/ 0, n_ctx_orig };
6453
6419
  memcpy(params + 5, &freq_base, sizeof(float));
6454
6420
  memcpy(params + 6, &freq_scale, sizeof(float));
6455
6421
  memcpy(params + 7, &ext_factor, sizeof(float));
6456
6422
  memcpy(params + 8, &attn_factor, sizeof(float));
6457
6423
  memcpy(params + 9, &beta_fast, sizeof(float));
6458
6424
  memcpy(params + 10, &beta_slow, sizeof(float));
6459
- memcpy(params + 11, &xpos_base, sizeof(float));
6460
- memcpy(params + 12, &xpos_down, sizeof(bool));
6461
6425
  ggml_set_op_params(result, params, sizeof(params));
6462
6426
 
6463
6427
  result->op = GGML_OP_ROPE_BACK;
@@ -7368,13 +7332,15 @@ struct ggml_tensor * ggml_add_rel_pos_inplace(
7368
7332
  return ggml_add_rel_pos_impl(ctx, a, pw, ph, true);
7369
7333
  }
7370
7334
 
7371
- // gmml_unary
7335
+ // ggml_unary
7372
7336
 
7373
7337
  static struct ggml_tensor * ggml_unary_impl(
7374
7338
  struct ggml_context * ctx,
7375
7339
  struct ggml_tensor * a,
7376
7340
  enum ggml_unary_op op,
7377
7341
  bool inplace) {
7342
+ GGML_ASSERT(ggml_is_contiguous_1(a));
7343
+
7378
7344
  bool is_node = false;
7379
7345
 
7380
7346
  if (!inplace && (a->grad)) {
@@ -9043,17 +9009,6 @@ static void ggml_compute_forward_add_f32(
9043
9009
  const int ith = params->ith;
9044
9010
  const int nth = params->nth;
9045
9011
 
9046
- #ifdef GGML_USE_CLBLAST
9047
- if (src1->backend == GGML_BACKEND_TYPE_GPU) {
9048
- // TODO: OpenCL kernel support full broadcast
9049
- GGML_ASSERT(ggml_can_repeat_rows(src1, src0));
9050
- if (ith == 0) {
9051
- ggml_cl_add(src0, src1, dst);
9052
- }
9053
- return;
9054
- }
9055
- #endif
9056
-
9057
9012
  const int nr = ggml_nrows(src0);
9058
9013
 
9059
9014
  GGML_TENSOR_BINARY_OP_LOCALS
@@ -10161,17 +10116,6 @@ static void ggml_compute_forward_mul_f32(
10161
10116
  const int ith = params->ith;
10162
10117
  const int nth = params->nth;
10163
10118
 
10164
- #if defined(GGML_USE_CLBLAST)
10165
- if (src1->backend == GGML_BACKEND_TYPE_GPU) {
10166
- // TODO: OpenCL kernel support full broadcast
10167
- GGML_ASSERT(ggml_can_repeat_rows(src1, src0));
10168
- if (ith == 0) {
10169
- ggml_cl_mul(src0, src1, dst);
10170
- }
10171
- return;
10172
- }
10173
- #endif
10174
-
10175
10119
  const int64_t nr = ggml_nrows(src0);
10176
10120
 
10177
10121
  GGML_TENSOR_BINARY_OP_LOCALS
@@ -11061,6 +11005,8 @@ static void ggml_compute_forward_abs_f32(
11061
11005
  const struct ggml_tensor * src0 = dst->src[0];
11062
11006
 
11063
11007
  assert(params->ith == 0);
11008
+ assert(ggml_is_contiguous_1(src0));
11009
+ assert(ggml_is_contiguous_1(dst));
11064
11010
  assert(ggml_are_same_shape(src0, dst));
11065
11011
 
11066
11012
  if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
@@ -11070,9 +11016,6 @@ static void ggml_compute_forward_abs_f32(
11070
11016
  const int n = ggml_nrows(src0);
11071
11017
  const int nc = src0->ne[0];
11072
11018
 
11073
- assert(dst->nb[0] == sizeof(float));
11074
- assert(src0->nb[0] == sizeof(float));
11075
-
11076
11019
  for (int i = 0; i < n; i++) {
11077
11020
  ggml_vec_abs_f32(nc,
11078
11021
  (float *) ((char *) dst->data + i*( dst->nb[1])),
@@ -11107,6 +11050,8 @@ static void ggml_compute_forward_sgn_f32(
11107
11050
  const struct ggml_tensor * src0 = dst->src[0];
11108
11051
 
11109
11052
  assert(params->ith == 0);
11053
+ assert(ggml_is_contiguous_1(src0));
11054
+ assert(ggml_is_contiguous_1(dst));
11110
11055
  assert(ggml_are_same_shape(src0, dst));
11111
11056
 
11112
11057
  if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
@@ -11116,9 +11061,6 @@ static void ggml_compute_forward_sgn_f32(
11116
11061
  const int n = ggml_nrows(src0);
11117
11062
  const int nc = src0->ne[0];
11118
11063
 
11119
- assert(dst->nb[0] == sizeof(float));
11120
- assert(src0->nb[0] == sizeof(float));
11121
-
11122
11064
  for (int i = 0; i < n; i++) {
11123
11065
  ggml_vec_sgn_f32(nc,
11124
11066
  (float *) ((char *) dst->data + i*( dst->nb[1])),
@@ -11153,6 +11095,8 @@ static void ggml_compute_forward_neg_f32(
11153
11095
  const struct ggml_tensor * src0 = dst->src[0];
11154
11096
 
11155
11097
  assert(params->ith == 0);
11098
+ assert(ggml_is_contiguous_1(src0));
11099
+ assert(ggml_is_contiguous_1(dst));
11156
11100
  assert(ggml_are_same_shape(src0, dst));
11157
11101
 
11158
11102
  if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
@@ -11162,9 +11106,6 @@ static void ggml_compute_forward_neg_f32(
11162
11106
  const int n = ggml_nrows(src0);
11163
11107
  const int nc = src0->ne[0];
11164
11108
 
11165
- assert(dst->nb[0] == sizeof(float));
11166
- assert(src0->nb[0] == sizeof(float));
11167
-
11168
11109
  for (int i = 0; i < n; i++) {
11169
11110
  ggml_vec_neg_f32(nc,
11170
11111
  (float *) ((char *) dst->data + i*( dst->nb[1])),
@@ -11199,6 +11140,8 @@ static void ggml_compute_forward_step_f32(
11199
11140
  const struct ggml_tensor * src0 = dst->src[0];
11200
11141
 
11201
11142
  assert(params->ith == 0);
11143
+ assert(ggml_is_contiguous_1(src0));
11144
+ assert(ggml_is_contiguous_1(dst));
11202
11145
  assert(ggml_are_same_shape(src0, dst));
11203
11146
 
11204
11147
  if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
@@ -11208,9 +11151,6 @@ static void ggml_compute_forward_step_f32(
11208
11151
  const int n = ggml_nrows(src0);
11209
11152
  const int nc = src0->ne[0];
11210
11153
 
11211
- assert(dst->nb[0] == sizeof(float));
11212
- assert(src0->nb[0] == sizeof(float));
11213
-
11214
11154
  for (int i = 0; i < n; i++) {
11215
11155
  ggml_vec_step_f32(nc,
11216
11156
  (float *) ((char *) dst->data + i*( dst->nb[1])),
@@ -11245,6 +11185,8 @@ static void ggml_compute_forward_tanh_f32(
11245
11185
  const struct ggml_tensor * src0 = dst->src[0];
11246
11186
 
11247
11187
  assert(params->ith == 0);
11188
+ assert(ggml_is_contiguous_1(src0));
11189
+ assert(ggml_is_contiguous_1(dst));
11248
11190
  assert(ggml_are_same_shape(src0, dst));
11249
11191
 
11250
11192
  if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
@@ -11254,9 +11196,6 @@ static void ggml_compute_forward_tanh_f32(
11254
11196
  const int n = ggml_nrows(src0);
11255
11197
  const int nc = src0->ne[0];
11256
11198
 
11257
- assert(dst->nb[0] == sizeof(float));
11258
- assert(src0->nb[0] == sizeof(float));
11259
-
11260
11199
  for (int i = 0; i < n; i++) {
11261
11200
  ggml_vec_tanh_f32(nc,
11262
11201
  (float *) ((char *) dst->data + i*( dst->nb[1])),
@@ -11291,6 +11230,8 @@ static void ggml_compute_forward_elu_f32(
11291
11230
  const struct ggml_tensor * src0 = dst->src[0];
11292
11231
 
11293
11232
  assert(params->ith == 0);
11233
+ assert(ggml_is_contiguous_1(src0));
11234
+ assert(ggml_is_contiguous_1(dst));
11294
11235
  assert(ggml_are_same_shape(src0, dst));
11295
11236
 
11296
11237
  if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
@@ -11300,9 +11241,6 @@ static void ggml_compute_forward_elu_f32(
11300
11241
  const int n = ggml_nrows(src0);
11301
11242
  const int nc = src0->ne[0];
11302
11243
 
11303
- assert(dst->nb[0] == sizeof(float));
11304
- assert(src0->nb[0] == sizeof(float));
11305
-
11306
11244
  for (int i = 0; i < n; i++) {
11307
11245
  ggml_vec_elu_f32(nc,
11308
11246
  (float *) ((char *) dst->data + i*( dst->nb[1])),
@@ -11337,6 +11275,8 @@ static void ggml_compute_forward_relu_f32(
11337
11275
  const struct ggml_tensor * src0 = dst->src[0];
11338
11276
 
11339
11277
  assert(params->ith == 0);
11278
+ assert(ggml_is_contiguous_1(src0));
11279
+ assert(ggml_is_contiguous_1(dst));
11340
11280
  assert(ggml_are_same_shape(src0, dst));
11341
11281
 
11342
11282
  if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
@@ -11346,9 +11286,6 @@ static void ggml_compute_forward_relu_f32(
11346
11286
  const int n = ggml_nrows(src0);
11347
11287
  const int nc = src0->ne[0];
11348
11288
 
11349
- assert(dst->nb[0] == sizeof(float));
11350
- assert(src0->nb[0] == sizeof(float));
11351
-
11352
11289
  for (int i = 0; i < n; i++) {
11353
11290
  ggml_vec_relu_f32(nc,
11354
11291
  (float *) ((char *) dst->data + i*( dst->nb[1])),
@@ -11383,6 +11320,8 @@ static void ggml_compute_forward_sigmoid_f32(
11383
11320
  const struct ggml_tensor * src0 = dst->src[0];
11384
11321
 
11385
11322
  assert(params->ith == 0);
11323
+ assert(ggml_is_contiguous_1(src0));
11324
+ assert(ggml_is_contiguous_1(dst));
11386
11325
  assert(ggml_are_same_shape(src0, dst));
11387
11326
 
11388
11327
  if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
@@ -11392,9 +11331,6 @@ static void ggml_compute_forward_sigmoid_f32(
11392
11331
  const int n = ggml_nrows(src0);
11393
11332
  const int nc = src0->ne[0];
11394
11333
 
11395
- assert(dst->nb[0] == sizeof(float));
11396
- assert(src0->nb[0] == sizeof(float));
11397
-
11398
11334
  for (int i = 0; i < n; i++) {
11399
11335
  ggml_vec_sigmoid_f32(nc,
11400
11336
  (float *) ((char *) dst->data + i*( dst->nb[1])),
@@ -11428,9 +11364,9 @@ static void ggml_compute_forward_gelu_f32(
11428
11364
 
11429
11365
  const struct ggml_tensor * src0 = dst->src[0];
11430
11366
 
11431
- GGML_ASSERT(ggml_is_contiguous_1(src0));
11432
- GGML_ASSERT(ggml_is_contiguous_1(dst));
11433
- GGML_ASSERT(ggml_are_same_shape(src0, dst));
11367
+ assert(ggml_is_contiguous_1(src0));
11368
+ assert(ggml_is_contiguous_1(dst));
11369
+ assert(ggml_are_same_shape(src0, dst));
11434
11370
 
11435
11371
  if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
11436
11372
  return;
@@ -11491,9 +11427,9 @@ static void ggml_compute_forward_gelu_quick_f32(
11491
11427
 
11492
11428
  const struct ggml_tensor * src0 = dst->src[0];
11493
11429
 
11494
- GGML_ASSERT(ggml_is_contiguous_1(src0));
11495
- GGML_ASSERT(ggml_is_contiguous_1(dst));
11496
- GGML_ASSERT(ggml_are_same_shape(src0, dst));
11430
+ assert(ggml_is_contiguous_1(src0));
11431
+ assert(ggml_is_contiguous_1(dst));
11432
+ assert(ggml_are_same_shape(src0, dst));
11497
11433
 
11498
11434
  if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
11499
11435
  return;
@@ -11554,9 +11490,9 @@ static void ggml_compute_forward_silu_f32(
11554
11490
 
11555
11491
  const struct ggml_tensor * src0 = dst->src[0];
11556
11492
 
11557
- GGML_ASSERT(ggml_is_contiguous_1(src0));
11558
- GGML_ASSERT(ggml_is_contiguous_1(dst));
11559
- GGML_ASSERT(ggml_are_same_shape(src0, dst));
11493
+ assert(ggml_is_contiguous_1(src0));
11494
+ assert(ggml_is_contiguous_1(dst));
11495
+ assert(ggml_are_same_shape(src0, dst));
11560
11496
 
11561
11497
  if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
11562
11498
  return;
@@ -11617,6 +11553,8 @@ static void ggml_compute_forward_leaky_relu_f32(
11617
11553
  const struct ggml_tensor * src0 = dst->src[0];
11618
11554
 
11619
11555
  assert(params->ith == 0);
11556
+ assert(ggml_is_contiguous_1(src0));
11557
+ assert(ggml_is_contiguous_1(dst));
11620
11558
  assert(ggml_are_same_shape(src0, dst));
11621
11559
 
11622
11560
  if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
@@ -11666,11 +11604,11 @@ static void ggml_compute_forward_silu_back_f32(
11666
11604
  const struct ggml_tensor * src0 = dst->src[0];
11667
11605
  const struct ggml_tensor * grad = dst->src[1];
11668
11606
 
11669
- GGML_ASSERT(ggml_is_contiguous_1(grad));
11670
- GGML_ASSERT(ggml_is_contiguous_1(src0));
11671
- GGML_ASSERT(ggml_is_contiguous_1(dst));
11672
- GGML_ASSERT(ggml_are_same_shape(src0, dst));
11673
- GGML_ASSERT(ggml_are_same_shape(src0, grad));
11607
+ assert(ggml_is_contiguous_1(grad));
11608
+ assert(ggml_is_contiguous_1(src0));
11609
+ assert(ggml_is_contiguous_1(dst));
11610
+ assert(ggml_are_same_shape(src0, dst));
11611
+ assert(ggml_are_same_shape(src0, grad));
11674
11612
 
11675
11613
  if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
11676
11614
  return;
@@ -11732,6 +11670,8 @@ static void ggml_compute_forward_hardswish_f32(
11732
11670
  const struct ggml_tensor * src0 = dst->src[0];
11733
11671
 
11734
11672
  assert(params->ith == 0);
11673
+ assert(ggml_is_contiguous_1(src0));
11674
+ assert(ggml_is_contiguous_1(dst));
11735
11675
  assert(ggml_are_same_shape(src0, dst));
11736
11676
 
11737
11677
  if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
@@ -11741,9 +11681,6 @@ static void ggml_compute_forward_hardswish_f32(
11741
11681
  const int n = ggml_nrows(src0);
11742
11682
  const int nc = src0->ne[0];
11743
11683
 
11744
- assert(dst->nb[0] == sizeof(float));
11745
- assert(src0->nb[0] == sizeof(float));
11746
-
11747
11684
  for (int i = 0; i < n; i++) {
11748
11685
  ggml_vec_hardswish_f32(nc,
11749
11686
  (float *) ((char *) dst->data + i*( dst->nb[1])),
@@ -11775,6 +11712,8 @@ static void ggml_compute_forward_hardsigmoid_f32(
11775
11712
  const struct ggml_tensor * src0 = dst->src[0];
11776
11713
 
11777
11714
  assert(params->ith == 0);
11715
+ assert(ggml_is_contiguous_1(src0));
11716
+ assert(ggml_is_contiguous_1(dst));
11778
11717
  assert(ggml_are_same_shape(src0, dst));
11779
11718
 
11780
11719
  if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
@@ -11784,9 +11723,6 @@ static void ggml_compute_forward_hardsigmoid_f32(
11784
11723
  const int n = ggml_nrows(src0);
11785
11724
  const int nc = src0->ne[0];
11786
11725
 
11787
- assert(dst->nb[0] == sizeof(float));
11788
- assert(src0->nb[0] == sizeof(float));
11789
-
11790
11726
  for (int i = 0; i < n; i++) {
11791
11727
  ggml_vec_hardsigmoid_f32(nc,
11792
11728
  (float *) ((char *) dst->data + i*( dst->nb[1])),
@@ -12237,39 +12173,6 @@ static void ggml_compute_forward_group_norm(
12237
12173
 
12238
12174
  // ggml_compute_forward_mul_mat
12239
12175
 
12240
- #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
12241
- // helper function to determine if it is better to use BLAS or not
12242
- // for large matrices, BLAS is faster
12243
- static bool ggml_compute_forward_mul_mat_use_blas(struct ggml_tensor * dst) {
12244
- const struct ggml_tensor * src0 = dst->src[0];
12245
- const struct ggml_tensor * src1 = dst->src[1];
12246
-
12247
- //const int64_t ne00 = src0->ne[0];
12248
- //const int64_t ne01 = src0->ne[1];
12249
-
12250
- const int64_t ne10 = src1->ne[0];
12251
-
12252
- const int64_t ne0 = dst->ne[0];
12253
- const int64_t ne1 = dst->ne[1];
12254
-
12255
- // NOTE: with GGML_OP_MUL_MAT_ID we don't want to go through the BLAS branch because it will dequantize (to_float)
12256
- // all the experts for each batch element and the processing would become incredibly slow
12257
- // TODO: find the optimal values for these
12258
- if (dst->op != GGML_OP_MUL_MAT_ID &&
12259
- ggml_is_contiguous(src0) &&
12260
- ggml_is_contiguous(src1) &&
12261
- //src0->type == GGML_TYPE_F32 &&
12262
- src1->type == GGML_TYPE_F32 &&
12263
- (ne0 >= 32 && ne1 >= 32 && ne10 >= 32)) {
12264
-
12265
- /*printf("BLAS: %d %d %d %d %d\n", ne0, ne1, ne10, ne00, ne01);*/
12266
- return true;
12267
- }
12268
-
12269
- return false;
12270
- }
12271
- #endif
12272
-
12273
12176
  static void ggml_compute_forward_mul_mat_one_chunk(
12274
12177
  const struct ggml_compute_params * params,
12275
12178
  struct ggml_tensor * dst,
@@ -12407,82 +12310,6 @@ static void ggml_compute_forward_mul_mat(
12407
12310
  // nb01 >= nb00 - src0 is not transposed
12408
12311
  // compute by src0 rows
12409
12312
 
12410
- #if defined(GGML_USE_CLBLAST)
12411
- if (ggml_cl_can_mul_mat(src0, src1, dst)) {
12412
- if (params->ith == 0 && params->type == GGML_TASK_TYPE_COMPUTE) {
12413
- ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize);
12414
- }
12415
- return;
12416
- }
12417
- #endif
12418
-
12419
- #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
12420
- if (ggml_compute_forward_mul_mat_use_blas(dst)) {
12421
- const int64_t ne_plane = ne01*ne00;
12422
- const size_t desired_wsize = ne13*ne12*ne_plane*sizeof(float);
12423
- UNUSED(desired_wsize);
12424
-
12425
- if (params->type == GGML_TASK_TYPE_INIT) {
12426
- if (type != GGML_TYPE_F32) {
12427
- assert(params->wsize >= desired_wsize);
12428
- // parallelize by src0 rows
12429
- for (int64_t i13 = 0; i13 < ne13; i13++) {
12430
- for (int64_t i12 = 0; i12 < ne12; i12++) {
12431
- // broadcast src0 into src1 across 2nd,3rd dimension
12432
- const int64_t i03 = i13/r3;
12433
- const int64_t i02 = i12/r2;
12434
-
12435
- const void * x = (char *) src0->data + i02*nb02 + i03*nb03;
12436
- float * const wdata = (float *) params->wdata + i13*ne12*ne_plane + i12*ne_plane;
12437
- ggml_to_float_t const to_float = type_traits[type].to_float;
12438
-
12439
- for (int64_t i01 = ith; i01 < ne01; i01 += nth) {
12440
- to_float((const char *) x + i01*nb01, wdata + i01*ne00, ne00);
12441
- }
12442
- }
12443
- }
12444
- }
12445
- return;
12446
- }
12447
-
12448
- if (params->type == GGML_TASK_TYPE_FINALIZE) {
12449
- return;
12450
- }
12451
-
12452
- // perform sgemm, parallelization controlled by blas lib
12453
- if (ith != 0) {
12454
- return;
12455
- }
12456
-
12457
- //const int64_t tgemm0 = ggml_perf_time_us();
12458
- for (int64_t i13 = 0; i13 < ne13; i13++) {
12459
- for (int64_t i12 = 0; i12 < ne12; i12++) {
12460
- const int64_t i03 = i13/r3;
12461
- const int64_t i02 = i12/r2;
12462
-
12463
- const void * x = (char *) src0->data + i02*nb02 + i03*nb03;
12464
- const float * y = (float *) ((char *) src1->data + i12*nb12 + i13*nb13);
12465
- float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
12466
-
12467
- if (type != GGML_TYPE_F32) {
12468
- x = (float *) params->wdata + i13*ne12*ne_plane + i12*ne_plane;
12469
- }
12470
-
12471
- cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
12472
- ne1, ne01, ne10,
12473
- 1.0f, y, ne10,
12474
- x, ne00,
12475
- 0.0f, d, ne01);
12476
- }
12477
- }
12478
- //printf("cblas_sgemm = %.3f ms, %lld flops\n", (ggml_perf_time_us() - tgemm0)/1000.0, ne13*ne12*ne1*ne01*ne10*2);
12479
-
12480
- //printf("CBLAS = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3);
12481
-
12482
- return;
12483
- }
12484
- #endif
12485
-
12486
12313
  #if GGML_USE_LLAMAFILE
12487
12314
  const bool src1_cont = ggml_is_contiguous(src1);
12488
12315
 
@@ -12863,21 +12690,7 @@ static void ggml_compute_forward_out_prod_f32(
12863
12690
  // nb01 >= nb00 - src0 is not transposed
12864
12691
  // compute by src0 rows
12865
12692
 
12866
- // TODO: #if defined(GGML_USE_CLBLAST)
12867
-
12868
- #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
12869
- bool use_blas = ggml_is_matrix(src0) &&
12870
- ggml_is_matrix(src1) &&
12871
- ggml_is_contiguous(src0) &&
12872
- (ggml_is_contiguous(src1) || ggml_is_transposed(src1));
12873
- #endif
12874
-
12875
12693
  if (params->type == GGML_TASK_TYPE_INIT) {
12876
- #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) // gemm beta will zero dst
12877
- if (use_blas) {
12878
- return;
12879
- }
12880
- #endif
12881
12694
  if (ith != 0) {
12882
12695
  return;
12883
12696
  }
@@ -12889,50 +12702,6 @@ static void ggml_compute_forward_out_prod_f32(
12889
12702
  return;
12890
12703
  }
12891
12704
 
12892
- #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
12893
- if (use_blas) {
12894
- if (params->ith != 0) { // All threads other than the first do no work.
12895
- return;
12896
- }
12897
- // Arguments to ggml_compute_forward_out_prod (expressed as major,minor)
12898
- // src0: (k,n)
12899
- // src1: (k,m)
12900
- // dst: (m,n)
12901
- //
12902
- // Arguments to sgemm (see https://github.com/Reference-LAPACK/lapack/blob/master/BLAS/SRC/sgemm.f)
12903
- // Also expressed as (major,minor)
12904
- // a: (m,k): so src1 transposed
12905
- // b: (k,n): so src0
12906
- // c: (m,n)
12907
- //
12908
- // However, if ggml_is_transposed(src1) is true, then
12909
- // src1->data already contains a transposed version, so sgemm mustn't
12910
- // transpose it further.
12911
-
12912
- int n = src0->ne[0];
12913
- int k = src0->ne[1];
12914
- int m = src1->ne[0];
12915
-
12916
- int transposeA, lda;
12917
-
12918
- if (!ggml_is_transposed(src1)) {
12919
- transposeA = CblasTrans;
12920
- lda = m;
12921
- } else {
12922
- transposeA = CblasNoTrans;
12923
- lda = k;
12924
- }
12925
-
12926
- float * a = (float *) ((char *) src1->data);
12927
- float * b = (float *) ((char *) src0->data);
12928
- float * c = (float *) ((char *) dst->data);
12929
-
12930
- cblas_sgemm(CblasRowMajor, transposeA, CblasNoTrans, m, n, k, 1.0, a, lda, b, n, 0.0, c, n);
12931
-
12932
- return;
12933
- }
12934
- #endif
12935
-
12936
12705
  // dst[:,:,:,:] = 0
12937
12706
  // for i2,i3:
12938
12707
  // for i1:
@@ -13062,8 +12831,6 @@ static void ggml_compute_forward_out_prod_q_f32(
13062
12831
  // nb01 >= nb00 - src0 is not transposed
13063
12832
  // compute by src0 rows
13064
12833
 
13065
- // TODO: #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
13066
-
13067
12834
  if (params->type == GGML_TASK_TYPE_INIT) {
13068
12835
  if (ith != 0) {
13069
12836
  return;
@@ -13460,6 +13227,8 @@ static void ggml_compute_forward_get_rows_q(
13460
13227
  const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10);
13461
13228
  const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
13462
13229
 
13230
+ assert(i01 >= 0 && i01 < ne01);
13231
+
13463
13232
  dequantize_row_q(
13464
13233
  (const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03),
13465
13234
  (float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc);
@@ -13503,6 +13272,8 @@ static void ggml_compute_forward_get_rows_f16(
13503
13272
  const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10);
13504
13273
  const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
13505
13274
 
13275
+ assert(i01 >= 0 && i01 < ne01);
13276
+
13506
13277
  ggml_fp16_to_fp32_row(
13507
13278
  (const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03),
13508
13279
  (float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc);
@@ -13546,7 +13317,9 @@ static void ggml_compute_forward_get_rows_bf16(
13546
13317
  const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10);
13547
13318
  const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
13548
13319
 
13549
- ggml_bf16_to_fp32_row(
13320
+ assert(i01 >= 0 && i01 < ne01);
13321
+
13322
+ ggml_bf16_to_fp32_row(
13550
13323
  (const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03),
13551
13324
  (float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc);
13552
13325
  }
@@ -13589,6 +13362,8 @@ static void ggml_compute_forward_get_rows_f32(
13589
13362
  const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10);
13590
13363
  const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
13591
13364
 
13365
+ assert(i01 >= 0 && i01 < ne01);
13366
+
13592
13367
  ggml_vec_cpy_f32(nc,
13593
13368
  (float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3),
13594
13369
  (float *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03));
@@ -14259,8 +14034,7 @@ static float rope_yarn_ramp(const float low, const float high, const int i0) {
14259
14034
  // MIT licensed. Copyright (c) 2023 Jeffrey Quesnelle and Bowen Peng.
14260
14035
  static void rope_yarn(
14261
14036
  float theta_extrap, float freq_scale, float corr_dims[2], int64_t i0, float ext_factor, float mscale,
14262
- float * cos_theta, float * sin_theta
14263
- ) {
14037
+ float * cos_theta, float * sin_theta) {
14264
14038
  // Get n-d rotational scaling corrected for extrapolation
14265
14039
  float theta_interp = freq_scale * theta_extrap;
14266
14040
  float theta = theta_interp;
@@ -14277,18 +14051,19 @@ static void rope_yarn(
14277
14051
 
14278
14052
  // Apparently solving `n_rot = 2pi * x * base^((2 * max_pos_emb) / n_dims)` for x, we get
14279
14053
  // `corr_dim(n_rot) = n_dims * log(max_pos_emb / (n_rot * 2pi)) / (2 * log(base))`
14280
- static float ggml_rope_yarn_corr_dim(int n_dims, int n_orig_ctx, float n_rot, float base) {
14281
- return n_dims * logf(n_orig_ctx / (n_rot * 2 * (float)M_PI)) / (2 * logf(base));
14054
+ static float ggml_rope_yarn_corr_dim(int n_dims, int n_ctx_orig, float n_rot, float base) {
14055
+ return n_dims * logf(n_ctx_orig / (n_rot * 2 * (float)M_PI)) / (2 * logf(base));
14282
14056
  }
14283
14057
 
14284
14058
  static void ggml_rope_cache_init(
14285
- float theta_base, float freq_scale, float corr_dims[2], int64_t ne0, float ext_factor, float mscale,
14286
- float * cache, float sin_sign, float theta_scale
14287
- ) {
14059
+ float theta_base, float freq_scale, const float * freq_factors, float corr_dims[2], int64_t ne0, float ext_factor, float mscale,
14060
+ float * cache, float sin_sign, float theta_scale) {
14061
+ // ref: https://github.com/jquesnelle/yarn/blob/master/scaled_rope/LlamaYaRNScaledRotaryEmbedding.py
14288
14062
  float theta = theta_base;
14289
14063
  for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
14064
+ const float ff = freq_factors ? freq_factors[i0/2] : 1.0f;
14290
14065
  rope_yarn(
14291
- theta, freq_scale, corr_dims, i0, ext_factor, mscale, &cache[i0 + 0], &cache[i0 + 1]
14066
+ theta/ff, freq_scale, corr_dims, i0, ext_factor, mscale, &cache[i0 + 0], &cache[i0 + 1]
14292
14067
  );
14293
14068
  cache[i0 + 1] *= sin_sign;
14294
14069
 
@@ -14297,11 +14072,11 @@ static void ggml_rope_cache_init(
14297
14072
  }
14298
14073
 
14299
14074
  GGML_CALL void ggml_rope_yarn_corr_dims(
14300
- int n_dims, int n_orig_ctx, float freq_base, float beta_fast, float beta_slow, float dims[2]
14075
+ int n_dims, int n_ctx_orig, float freq_base, float beta_fast, float beta_slow, float dims[2]
14301
14076
  ) {
14302
14077
  // start and end correction dims
14303
- float start = floorf(ggml_rope_yarn_corr_dim(n_dims, n_orig_ctx, beta_fast, freq_base));
14304
- float end = ceilf(ggml_rope_yarn_corr_dim(n_dims, n_orig_ctx, beta_slow, freq_base));
14078
+ float start = floorf(ggml_rope_yarn_corr_dim(n_dims, n_ctx_orig, beta_fast, freq_base));
14079
+ float end = ceilf(ggml_rope_yarn_corr_dim(n_dims, n_ctx_orig, beta_slow, freq_base));
14305
14080
  dims[0] = MAX(0, start);
14306
14081
  dims[1] = MIN(n_dims - 1, end);
14307
14082
  }
@@ -14321,15 +14096,11 @@ static void ggml_compute_forward_rope_f32(
14321
14096
 
14322
14097
  float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
14323
14098
 
14324
- // these two only relevant for xPos RoPE:
14325
- float xpos_base;
14326
- bool xpos_down;
14327
-
14328
14099
  //const int n_past = ((int32_t *) dst->op_params)[0];
14329
14100
  const int n_dims = ((int32_t *) dst->op_params)[1];
14330
14101
  const int mode = ((int32_t *) dst->op_params)[2];
14331
- const int n_ctx = ((int32_t *) dst->op_params)[3];
14332
- const int n_orig_ctx = ((int32_t *) dst->op_params)[4];
14102
+ //const int n_ctx = ((int32_t *) dst->op_params)[3];
14103
+ const int n_ctx_orig = ((int32_t *) dst->op_params)[4];
14333
14104
 
14334
14105
  memcpy(&freq_base, (int32_t *) dst->op_params + 5, sizeof(float));
14335
14106
  memcpy(&freq_scale, (int32_t *) dst->op_params + 6, sizeof(float));
@@ -14337,8 +14108,6 @@ static void ggml_compute_forward_rope_f32(
14337
14108
  memcpy(&attn_factor, (int32_t *) dst->op_params + 8, sizeof(float));
14338
14109
  memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float));
14339
14110
  memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float));
14340
- memcpy(&xpos_base, (int32_t *) dst->op_params + 11, sizeof(float));
14341
- memcpy(&xpos_down, (int32_t *) dst->op_params + 12, sizeof(bool));
14342
14111
 
14343
14112
  GGML_TENSOR_UNARY_OP_LOCALS
14344
14113
 
@@ -14368,20 +14137,15 @@ static void ggml_compute_forward_rope_f32(
14368
14137
  const float theta_scale = powf(freq_base, -2.0f/n_dims);
14369
14138
 
14370
14139
  float corr_dims[2];
14371
- ggml_rope_yarn_corr_dims(n_dims, n_orig_ctx, freq_base, beta_fast, beta_slow, corr_dims);
14140
+ ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims);
14372
14141
 
14373
14142
  const bool is_neox = mode & 2;
14374
- const bool is_glm = mode & 4;
14375
14143
 
14376
14144
  const float * freq_factors = NULL;
14377
- if (is_neox) {
14378
- if (src2 != NULL) {
14379
- GGML_ASSERT(src2->type == GGML_TYPE_F32);
14380
- GGML_ASSERT(src2->ne[0] >= n_dims / 2);
14381
- freq_factors = (const float *) src2->data;
14382
- }
14383
- } else {
14384
- GGML_ASSERT(src2 == NULL && "TODO: freq_factors not implemented for !is_neox");
14145
+ if (src2 != NULL) {
14146
+ GGML_ASSERT(src2->type == GGML_TYPE_F32);
14147
+ GGML_ASSERT(src2->ne[0] >= n_dims / 2);
14148
+ freq_factors = (const float *) src2->data;
14385
14149
  }
14386
14150
 
14387
14151
  // backward process uses inverse rotation by cos and sin.
@@ -14396,94 +14160,50 @@ static void ggml_compute_forward_rope_f32(
14396
14160
  const int64_t p = pos[i2];
14397
14161
 
14398
14162
  float * cache = (float *) params->wdata + (ne0 + CACHE_LINE_SIZE_F32)*ith;
14399
- if (!is_glm && !is_neox) { // TODO: cache sin/cos for glm, neox
14400
- ggml_rope_cache_init(p, freq_scale, corr_dims, ne0, ext_factor, attn_factor, cache, sin_sign, theta_scale);
14401
- }
14163
+ ggml_rope_cache_init(p, freq_scale, freq_factors, corr_dims, ne0, ext_factor, attn_factor, cache, sin_sign, theta_scale);
14402
14164
 
14403
14165
  for (int64_t i1 = 0; i1 < ne1; i1++) {
14404
14166
  if (ir++ < ir0) continue;
14405
14167
  if (ir > ir1) break;
14406
14168
 
14407
- float theta_base = (float)p;
14408
-
14409
- if (is_glm) {
14410
- theta_base = MIN(p, n_ctx - 2);
14411
- float block_theta = MAX(p - (n_ctx - 2), 0);
14412
- for (int64_t i0 = 0; i0 < ne0 / 4; i0++) {
14413
- const float cos_theta = cosf(theta_base);
14414
- const float sin_theta = sinf(theta_base) * sin_sign;
14415
- const float cos_block_theta = cosf(block_theta);
14416
- const float sin_block_theta = sinf(block_theta) * sin_sign;
14417
-
14418
- theta_base *= theta_scale;
14419
- block_theta *= theta_scale;
14420
-
14421
- const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
14422
- float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
14423
-
14424
- const float x0 = src[0];
14425
- const float x1 = src[n_dims/2];
14426
- const float x2 = src[n_dims];
14427
- const float x3 = src[n_dims/2*3];
14428
-
14429
- dst_data[0] = x0*cos_theta - x1*sin_theta;
14430
- dst_data[n_dims/2] = x0*sin_theta + x1*cos_theta;
14431
- dst_data[n_dims] = x2*cos_block_theta - x3*sin_block_theta;
14432
- dst_data[n_dims/2*3] = x2*sin_block_theta + x3*cos_block_theta;
14433
- }
14434
- } else if (!is_neox) {
14435
- for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
14169
+ if (!is_neox) {
14170
+ for (int64_t i0 = 0; i0 < n_dims; i0 += 2) {
14436
14171
  const float cos_theta = cache[i0 + 0];
14437
14172
  const float sin_theta = cache[i0 + 1];
14438
14173
 
14439
- // zeta scaling for xPos only:
14440
- float zeta = xpos_base != 0.0f ? powf((i0 + 0.4f * ne0) / (1.4f * ne0), p / xpos_base) : 1.0f;
14441
- if (xpos_down) zeta = 1.0f / zeta;
14442
-
14443
14174
  const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
14444
14175
  float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
14445
14176
 
14446
14177
  const float x0 = src[0];
14447
14178
  const float x1 = src[1];
14448
14179
 
14449
- dst_data[0] = x0*cos_theta*zeta - x1*sin_theta*zeta;
14450
- dst_data[1] = x0*sin_theta*zeta + x1*cos_theta*zeta;
14180
+ dst_data[0] = x0*cos_theta - x1*sin_theta;
14181
+ dst_data[1] = x0*sin_theta + x1*cos_theta;
14451
14182
  }
14452
14183
  } else {
14453
- // ref: https://github.com/jquesnelle/yarn/blob/master/scaled_rope/LlamaYaRNScaledRotaryEmbedding.py
14454
- for (int64_t ic = 0; ic < ne0; ic += 2) {
14455
- if (ic < n_dims) {
14456
- const int64_t i0 = ic/2;
14457
-
14458
- const float freq_factor = freq_factors ? freq_factors[i0] : 1.0f;
14459
-
14460
- float cos_theta, sin_theta;
14461
- rope_yarn(
14462
- theta_base/freq_factor, freq_scale, corr_dims, ic, ext_factor, attn_factor,
14463
- &cos_theta, &sin_theta
14464
- );
14184
+ for (int64_t i0 = 0; i0 < n_dims; i0 += 2) {
14185
+ const int64_t ic = i0/2;
14465
14186
 
14466
- sin_theta *= sin_sign;
14467
- theta_base *= theta_scale;
14187
+ const float cos_theta = cache[i0 + 0];
14188
+ const float sin_theta = cache[i0 + 1];
14468
14189
 
14469
- const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
14470
- float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
14190
+ const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + ic*nb00);
14191
+ float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + ic*nb0);
14471
14192
 
14472
- const float x0 = src[0];
14473
- const float x1 = src[n_dims/2];
14193
+ const float x0 = src[0];
14194
+ const float x1 = src[n_dims/2];
14474
14195
 
14475
- dst_data[0] = x0*cos_theta - x1*sin_theta;
14476
- dst_data[n_dims/2] = x0*sin_theta + x1*cos_theta;
14477
- } else {
14478
- const int64_t i0 = ic;
14196
+ dst_data[0] = x0*cos_theta - x1*sin_theta;
14197
+ dst_data[n_dims/2] = x0*sin_theta + x1*cos_theta;
14198
+ }
14199
+ }
14479
14200
 
14480
- const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
14481
- float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
14201
+ for (int64_t i0 = n_dims; i0 < ne0; i0 += 2) {
14202
+ const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
14203
+ float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
14482
14204
 
14483
- dst_data[0] = src[0];
14484
- dst_data[1] = src[1];
14485
- }
14486
- }
14205
+ dst_data[0] = src[0];
14206
+ dst_data[1] = src[1];
14487
14207
  }
14488
14208
  }
14489
14209
  }
@@ -14509,8 +14229,8 @@ static void ggml_compute_forward_rope_f16(
14509
14229
  //const int n_past = ((int32_t *) dst->op_params)[0];
14510
14230
  const int n_dims = ((int32_t *) dst->op_params)[1];
14511
14231
  const int mode = ((int32_t *) dst->op_params)[2];
14512
- const int n_ctx = ((int32_t *) dst->op_params)[3];
14513
- const int n_orig_ctx = ((int32_t *) dst->op_params)[4];
14232
+ //const int n_ctx = ((int32_t *) dst->op_params)[3];
14233
+ const int n_ctx_orig = ((int32_t *) dst->op_params)[4];
14514
14234
  memcpy(&freq_base, (int32_t *) dst->op_params + 5, sizeof(float));
14515
14235
  memcpy(&freq_scale, (int32_t *) dst->op_params + 6, sizeof(float));
14516
14236
  memcpy(&ext_factor, (int32_t *) dst->op_params + 7, sizeof(float));
@@ -14546,20 +14266,15 @@ static void ggml_compute_forward_rope_f16(
14546
14266
  const float theta_scale = powf(freq_base, -2.0f/n_dims);
14547
14267
 
14548
14268
  float corr_dims[2];
14549
- ggml_rope_yarn_corr_dims(n_dims, n_orig_ctx, freq_base, beta_fast, beta_slow, corr_dims);
14269
+ ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims);
14550
14270
 
14551
14271
  const bool is_neox = mode & 2;
14552
- const bool is_glm = mode & 4;
14553
14272
 
14554
14273
  const float * freq_factors = NULL;
14555
- if (is_neox) {
14556
- if (src2 != NULL) {
14557
- GGML_ASSERT(src2->type == GGML_TYPE_F32);
14558
- GGML_ASSERT(src2->ne[0] >= n_dims / 2);
14559
- freq_factors = (const float *) src2->data;
14560
- }
14561
- } else {
14562
- GGML_ASSERT(src2 == NULL && "TODO: freq_factors not implemented for !is_neox");
14274
+ if (src2 != NULL) {
14275
+ GGML_ASSERT(src2->type == GGML_TYPE_F32);
14276
+ GGML_ASSERT(src2->ne[0] >= n_dims / 2);
14277
+ freq_factors = (const float *) src2->data;
14563
14278
  }
14564
14279
 
14565
14280
  // backward process uses inverse rotation by cos and sin.
@@ -14574,43 +14289,14 @@ static void ggml_compute_forward_rope_f16(
14574
14289
  const int64_t p = pos[i2];
14575
14290
 
14576
14291
  float * cache = (float *) params->wdata + (ne0 + CACHE_LINE_SIZE_F32)*ith;
14577
- if (!is_glm && !is_neox) { // TODO: cache sin/cos for glm, neox
14578
- ggml_rope_cache_init(p, freq_scale, corr_dims, ne0, ext_factor, attn_factor, cache, sin_sign, theta_scale);
14579
- }
14292
+ ggml_rope_cache_init(p, freq_scale, freq_factors, corr_dims, ne0, ext_factor, attn_factor, cache, sin_sign, theta_scale);
14580
14293
 
14581
14294
  for (int64_t i1 = 0; i1 < ne1; i1++) {
14582
14295
  if (ir++ < ir0) continue;
14583
14296
  if (ir > ir1) break;
14584
14297
 
14585
- float theta_base = (float)p;
14586
-
14587
- if (is_glm) {
14588
- theta_base = MIN(p, n_ctx - 2);
14589
- float block_theta = MAX(p - (n_ctx - 2), 0);
14590
- for (int64_t i0 = 0; i0 < ne0 / 4; i0++) {
14591
- const float cos_theta = cosf(theta_base);
14592
- const float sin_theta = sinf(theta_base) * sin_sign;
14593
- const float cos_block_theta = cosf(block_theta);
14594
- const float sin_block_theta = sinf(block_theta) * sin_sign;
14595
-
14596
- theta_base *= theta_scale;
14597
- block_theta *= theta_scale;
14598
-
14599
- const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
14600
- ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
14601
-
14602
- const float x0 = GGML_FP16_TO_FP32(src[0]);
14603
- const float x1 = GGML_FP16_TO_FP32(src[n_dims/2]);
14604
- const float x2 = GGML_FP16_TO_FP32(src[n_dims]);
14605
- const float x3 = GGML_FP16_TO_FP32(src[n_dims/2*3]);
14606
-
14607
- dst_data[0] = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta);
14608
- dst_data[n_dims/2] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
14609
- dst_data[n_dims] = GGML_FP32_TO_FP16(x2*cos_block_theta - x3*sin_block_theta);
14610
- dst_data[n_dims/2*3] = GGML_FP32_TO_FP16(x2*sin_block_theta + x3*cos_block_theta);
14611
- }
14612
- } else if (!is_neox) {
14613
- for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
14298
+ if (!is_neox) {
14299
+ for (int64_t i0 = 0; i0 < n_dims; i0 += 2) {
14614
14300
  const float cos_theta = cache[i0 + 0];
14615
14301
  const float sin_theta = cache[i0 + 1];
14616
14302
 
@@ -14624,40 +14310,29 @@ static void ggml_compute_forward_rope_f16(
14624
14310
  dst_data[1] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
14625
14311
  }
14626
14312
  } else {
14627
- // ref: https://github.com/jquesnelle/yarn/blob/master/scaled_rope/LlamaYaRNScaledRotaryEmbedding.py
14628
- for (int64_t ic = 0; ic < ne0; ic += 2) {
14629
- if (ic < n_dims) {
14630
- const int64_t i0 = ic/2;
14313
+ for (int64_t i0 = 0; i0 < n_dims; i0 += 2) {
14314
+ const int64_t ic = i0/2;
14631
14315
 
14632
- const float freq_factor = freq_factors ? freq_factors[i0] : 1.0f;
14633
-
14634
- float cos_theta, sin_theta;
14635
- rope_yarn(
14636
- theta_base/freq_factor, freq_scale, corr_dims, ic, ext_factor, attn_factor,
14637
- &cos_theta, &sin_theta
14638
- );
14639
-
14640
- sin_theta *= sin_sign;
14641
- theta_base *= theta_scale;
14316
+ const float cos_theta = cache[i0 + 0];
14317
+ const float sin_theta = cache[i0 + 1];
14642
14318
 
14643
- const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
14644
- ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
14319
+ const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + ic*nb00);
14320
+ ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + ic*nb0);
14645
14321
 
14646
- const float x0 = GGML_FP16_TO_FP32(src[0]);
14647
- const float x1 = GGML_FP16_TO_FP32(src[n_dims/2]);
14322
+ const float x0 = GGML_FP16_TO_FP32(src[0]);
14323
+ const float x1 = GGML_FP16_TO_FP32(src[n_dims/2]);
14648
14324
 
14649
- dst_data[0] = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta);
14650
- dst_data[n_dims/2] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
14651
- } else {
14652
- const int64_t i0 = ic;
14325
+ dst_data[0] = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta);
14326
+ dst_data[n_dims/2] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
14327
+ }
14328
+ }
14653
14329
 
14654
- const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
14655
- ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
14330
+ for (int64_t i0 = n_dims; i0 < ne0; i0 += 2) {
14331
+ const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
14332
+ ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
14656
14333
 
14657
- dst_data[0] = src[0];
14658
- dst_data[1] = src[1];
14659
- }
14660
- }
14334
+ dst_data[0] = src[0];
14335
+ dst_data[1] = src[1];
14661
14336
  }
14662
14337
  }
14663
14338
  }
@@ -16844,7 +16519,10 @@ static void ggml_compute_forward_map_unary_f32(
16844
16519
 
16845
16520
  const struct ggml_tensor * src0 = dst->src[0];
16846
16521
 
16847
- GGML_ASSERT(ggml_are_same_shape(src0, dst));
16522
+ assert(params->ith == 0);
16523
+ assert(ggml_is_contiguous_1(src0));
16524
+ assert(ggml_is_contiguous_1(dst));
16525
+ assert(ggml_are_same_shape(src0, dst));
16848
16526
 
16849
16527
  if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
16850
16528
  return;
@@ -16853,9 +16531,6 @@ static void ggml_compute_forward_map_unary_f32(
16853
16531
  const int n = ggml_nrows(src0);
16854
16532
  const int nc = src0->ne[0];
16855
16533
 
16856
- assert( dst->nb[0] == sizeof(float));
16857
- assert(src0->nb[0] == sizeof(float));
16858
-
16859
16534
  for (int i = 0; i < n; i++) {
16860
16535
  fun(nc,
16861
16536
  (float *) ((char *) dst->data + i*( dst->nb[1])),
@@ -16893,6 +16568,9 @@ static void ggml_compute_forward_map_binary_f32(
16893
16568
  const struct ggml_tensor * src1 = dst->src[1];
16894
16569
 
16895
16570
  assert(params->ith == 0);
16571
+ assert(ggml_is_contiguous_1(src0));
16572
+ assert(ggml_is_contiguous_1(src1));
16573
+ assert(ggml_is_contiguous_1(dst));
16896
16574
  assert(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));
16897
16575
 
16898
16576
  if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
@@ -16902,10 +16580,6 @@ static void ggml_compute_forward_map_binary_f32(
16902
16580
  const int n = ggml_nrows(src0);
16903
16581
  const int nc = src0->ne[0];
16904
16582
 
16905
- assert( dst->nb[0] == sizeof(float));
16906
- assert(src0->nb[0] == sizeof(float));
16907
- assert(src1->nb[0] == sizeof(float));
16908
-
16909
16583
  for (int i = 0; i < n; i++) {
16910
16584
  fun(nc,
16911
16585
  (float *) ((char *) dst->data + i*( dst->nb[1])),
@@ -18359,9 +18033,9 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
18359
18033
  //const int n_past = ((int32_t *) tensor->op_params)[0];
18360
18034
  const int n_dims = ((int32_t *) tensor->op_params)[1];
18361
18035
  const int mode = ((int32_t *) tensor->op_params)[2];
18362
- const int n_ctx = ((int32_t *) tensor->op_params)[3];
18363
- const int n_orig_ctx = ((int32_t *) tensor->op_params)[4];
18364
- float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow, xpos_base, xpos_down;
18036
+ //const int n_ctx = ((int32_t *) tensor->op_params)[3];
18037
+ const int n_ctx_orig = ((int32_t *) tensor->op_params)[4];
18038
+ float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
18365
18039
 
18366
18040
  memcpy(&freq_base, (int32_t *) tensor->op_params + 5, sizeof(float));
18367
18041
  memcpy(&freq_scale, (int32_t *) tensor->op_params + 6, sizeof(float));
@@ -18369,8 +18043,6 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
18369
18043
  memcpy(&attn_factor, (int32_t *) tensor->op_params + 8, sizeof(float));
18370
18044
  memcpy(&beta_fast, (int32_t *) tensor->op_params + 9, sizeof(float));
18371
18045
  memcpy(&beta_slow, (int32_t *) tensor->op_params + 10, sizeof(float));
18372
- memcpy(&xpos_base, (int32_t *) tensor->op_params + 11, sizeof(float));
18373
- memcpy(&xpos_down, (int32_t *) tensor->op_params + 12, sizeof(bool));
18374
18046
 
18375
18047
  src0->grad = ggml_add_or_set(ctx,
18376
18048
  src0->grad,
@@ -18380,16 +18052,13 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
18380
18052
  src2,
18381
18053
  n_dims,
18382
18054
  mode,
18383
- n_ctx,
18384
- n_orig_ctx,
18055
+ n_ctx_orig,
18385
18056
  freq_base,
18386
18057
  freq_scale,
18387
18058
  ext_factor,
18388
18059
  attn_factor,
18389
18060
  beta_fast,
18390
- beta_slow,
18391
- xpos_base,
18392
- xpos_down),
18061
+ beta_slow),
18393
18062
  zero_table);
18394
18063
  }
18395
18064
  } break;
@@ -18399,9 +18068,9 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
18399
18068
  //const int n_past = ((int32_t *) tensor->op_params)[0];
18400
18069
  const int n_dims = ((int32_t *) tensor->op_params)[1];
18401
18070
  const int mode = ((int32_t *) tensor->op_params)[2];
18402
- const int n_ctx = ((int32_t *) tensor->op_params)[3];
18403
- const int n_orig_ctx = ((int32_t *) tensor->op_params)[4];
18404
- float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow, xpos_base, xpos_down;
18071
+ //const int n_ctx = ((int32_t *) tensor->op_params)[3];
18072
+ const int n_ctx_orig = ((int32_t *) tensor->op_params)[4];
18073
+ float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
18405
18074
 
18406
18075
  memcpy(&freq_base, (int32_t *) tensor->op_params + 5, sizeof(float));
18407
18076
  memcpy(&freq_scale, (int32_t *) tensor->op_params + 6, sizeof(float));
@@ -18409,8 +18078,6 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
18409
18078
  memcpy(&attn_factor, (int32_t *) tensor->op_params + 8, sizeof(float));
18410
18079
  memcpy(&beta_fast, (int32_t *) tensor->op_params + 9, sizeof(float));
18411
18080
  memcpy(&beta_slow, (int32_t *) tensor->op_params + 10, sizeof(float));
18412
- memcpy(&xpos_base, (int32_t *) tensor->op_params + 11, sizeof(float));
18413
- memcpy(&xpos_down, (int32_t *) tensor->op_params + 12, sizeof(bool));
18414
18081
 
18415
18082
  src0->grad = ggml_add_or_set(ctx,
18416
18083
  src0->grad,
@@ -18420,16 +18087,13 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
18420
18087
  src2,
18421
18088
  n_dims,
18422
18089
  mode,
18423
- n_ctx,
18424
- n_orig_ctx,
18090
+ n_ctx_orig,
18425
18091
  freq_base,
18426
18092
  freq_scale,
18427
18093
  ext_factor,
18428
18094
  attn_factor,
18429
18095
  beta_fast,
18430
18096
  beta_slow,
18431
- xpos_base,
18432
- xpos_down,
18433
18097
  false),
18434
18098
  zero_table);
18435
18099
  }
@@ -19073,6 +18737,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads, int n_cur_
19073
18737
  switch (node->op) {
19074
18738
  case GGML_OP_CPY:
19075
18739
  case GGML_OP_DUP:
18740
+ case GGML_OP_CONT:
19076
18741
  case GGML_OP_ADD:
19077
18742
  case GGML_OP_ADD1:
19078
18743
  case GGML_OP_ACC:
@@ -19157,7 +18822,6 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads, int n_cur_
19157
18822
  } break;
19158
18823
  case GGML_OP_SCALE:
19159
18824
  case GGML_OP_SET:
19160
- case GGML_OP_CONT:
19161
18825
  case GGML_OP_RESHAPE:
19162
18826
  case GGML_OP_VIEW:
19163
18827
  case GGML_OP_PERMUTE:
@@ -19317,8 +18981,11 @@ static void ggml_graph_compute_thread_sync_node(int * node_n, struct ggml_comput
19317
18981
  sched_yield();
19318
18982
  }
19319
18983
 
19320
- * node_n = atomic_load(&state->shared->node_n);
19321
- if (* node_n != last_node_n) break;
18984
+ *node_n = atomic_load(&state->shared->node_n);
18985
+ if (*node_n != last_node_n) {
18986
+ break;
18987
+ }
18988
+
19322
18989
  #if defined(__SSE3__)
19323
18990
  // Tell the processor we're spinning. It's a processor hint for spinlocks.
19324
18991
  _mm_pause();
@@ -19328,15 +18995,18 @@ static void ggml_graph_compute_thread_sync_node(int * node_n, struct ggml_comput
19328
18995
 
19329
18996
  static void ggml_graph_compute_thread_sync_task(int * task_phase, struct ggml_compute_state * state, const bool do_yield) {
19330
18997
  // wait for other threads to finish
19331
- const int last_task_phase = * task_phase;
18998
+ const int last_task_phase = *task_phase;
19332
18999
 
19333
19000
  while (true) {
19334
19001
  if (do_yield) {
19335
19002
  sched_yield();
19336
19003
  }
19337
19004
 
19338
- * task_phase = atomic_load(&state->shared->node_task);
19339
- if (* task_phase != last_task_phase) break;
19005
+ *task_phase = atomic_load(&state->shared->node_task);
19006
+ if (*task_phase != last_task_phase) {
19007
+ break;
19008
+ }
19009
+
19340
19010
  #if defined(__SSE3__)
19341
19011
  // Tell the processor we're spinning. It's a processor hint for spinlocks.
19342
19012
  _mm_pause();
@@ -19536,22 +19206,6 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
19536
19206
  {
19537
19207
  const enum ggml_type vec_dot_type = type_traits[node->src[0]->type].vec_dot_type;
19538
19208
 
19539
- #if defined(GGML_USE_CLBLAST)
19540
- if (ggml_cl_can_mul_mat(node->src[0], node->src[1], node)) {
19541
- cur = ggml_cl_mul_mat_get_wsize(node->src[0], node->src[1], node);
19542
- } else
19543
- #endif
19544
- #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
19545
- if (ggml_compute_forward_mul_mat_use_blas(node)) {
19546
- if (node->src[0]->type != GGML_TYPE_F32) {
19547
- // here we need memory for fully dequantized matrix from src0
19548
- // take into account that src0 can be broadcasted into src1[2,3]
19549
- cur = ggml_type_size(GGML_TYPE_F32)
19550
- * node->src[0]->ne[0]*node->src[0]->ne[1]
19551
- * node->src[1]->ne[2]*node->src[1]->ne[3];
19552
- }
19553
- } else
19554
- #endif
19555
19209
  if (node->src[1]->type != vec_dot_type) {
19556
19210
  cur = ggml_row_size(vec_dot_type, ggml_nelements(node->src[1]));
19557
19211
  }
@@ -19670,6 +19324,59 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
19670
19324
  return cplan;
19671
19325
  }
19672
19326
 
19327
+ static enum ggml_status ggml_graph_compute_parallel(struct ggml_compute_state * workers, int n_threads) {
19328
+ enum ggml_status compute_status = GGML_STATUS_SUCCESS;
19329
+
19330
+ #ifdef GGML_USE_OPENMP
19331
+ if (n_threads > 1) {
19332
+ #pragma omp parallel num_threads(n_threads)
19333
+ {
19334
+ #pragma omp single
19335
+ {
19336
+ // update the number of threads from the actual number of threads that we got from OpenMP
19337
+ n_threads = omp_get_num_threads();
19338
+ workers[0].shared->n_threads = n_threads;
19339
+ workers[0].shared->n_active = n_threads;
19340
+ }
19341
+ ggml_graph_compute_thread(&workers[omp_get_thread_num()]);
19342
+ }
19343
+ } else {
19344
+ ggml_graph_compute_thread(&workers[0]);
19345
+ }
19346
+ #else
19347
+ // create thread pool
19348
+ if (n_threads > 1) {
19349
+ for (int j = 1; j < n_threads; ++j) {
19350
+ const int rc = ggml_thread_create(&workers[j].thrd, NULL, ggml_graph_compute_thread, &workers[j]);
19351
+ GGML_ASSERT(rc == 0);
19352
+ UNUSED(rc);
19353
+ }
19354
+ }
19355
+
19356
+ // this is a work thread too
19357
+ ggml_graph_compute_thread(&workers[0]);
19358
+
19359
+ // join or kill thread pool
19360
+ if (n_threads > 1) {
19361
+ for (int j = 1; j < n_threads; j++) {
19362
+ const int rc = ggml_thread_join(workers[j].thrd, NULL);
19363
+ GGML_ASSERT(rc == 0);
19364
+ UNUSED(rc);
19365
+ }
19366
+ }
19367
+ #endif
19368
+ // don't leave affinity set on the main thread
19369
+ clear_numa_thread_affinity();
19370
+
19371
+ for (int j = 0; j < n_threads; j++) {
19372
+ if (workers[j].ec != GGML_STATUS_SUCCESS) {
19373
+ compute_status = workers[j].ec;
19374
+ break;
19375
+ }
19376
+ }
19377
+ return compute_status;
19378
+ }
19379
+
19673
19380
  enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
19674
19381
  {
19675
19382
  GGML_ASSERT(cplan);
@@ -19680,7 +19387,11 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
19680
19387
  }
19681
19388
  }
19682
19389
 
19683
- const int n_threads = cplan->n_threads;
19390
+ int n_threads = cplan->n_threads;
19391
+
19392
+ #if defined(GGML_USE_OPENMP)
19393
+ n_threads = MIN(n_threads, omp_get_max_threads());
19394
+ #endif
19684
19395
 
19685
19396
  struct ggml_compute_state_shared state_shared = {
19686
19397
  /*.cgraph =*/ cgraph,
@@ -19696,47 +19407,20 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
19696
19407
  /*.current_chunk; =*/ 0,
19697
19408
  };
19698
19409
  struct ggml_compute_state * workers = alloca(sizeof(struct ggml_compute_state)*n_threads);
19699
-
19700
- // create thread pool
19701
- if (n_threads > 1) {
19702
- for (int j = 1; j < n_threads; ++j) {
19703
- workers[j] = (struct ggml_compute_state) {
19704
- .thrd = 0,
19705
- .ith = j,
19706
- .shared = &state_shared,
19707
- .ec = GGML_STATUS_SUCCESS,
19708
- };
19709
-
19710
- const int rc = ggml_thread_create(&workers[j].thrd, NULL, ggml_graph_compute_thread, &workers[j]);
19711
- GGML_ASSERT(rc == 0);
19712
- UNUSED(rc);
19713
- }
19714
- }
19715
-
19716
- workers[0].ith = 0;
19717
- workers[0].shared = &state_shared;
19718
- workers[0].ec = GGML_STATUS_SUCCESS;
19719
-
19720
19410
  const int64_t perf_start_cycles = ggml_perf_cycles();
19721
19411
  const int64_t perf_start_time_us = ggml_perf_time_us();
19722
19412
 
19723
- // this is a work thread too
19724
- ggml_graph_compute_thread(&workers[0]);
19725
- enum ggml_status compute_status = workers[0].ec;
19726
-
19727
- // don't leave affinity set on the main thread
19728
- clear_numa_thread_affinity();
19729
-
19730
- // join or kill thread pool
19731
- if (n_threads > 1) {
19732
- for (int j = 1; j < n_threads; j++) {
19733
- const int rc = ggml_thread_join(workers[j].thrd, NULL);
19734
- GGML_ASSERT(rc == 0);
19735
- if (workers[j].ec != GGML_STATUS_SUCCESS)
19736
- compute_status = workers[j].ec;
19737
- }
19413
+ for (int j = 0; j < n_threads; ++j) {
19414
+ workers[j] = (struct ggml_compute_state) {
19415
+ .thrd = 0,
19416
+ .ith = j,
19417
+ .shared = &state_shared,
19418
+ .ec = GGML_STATUS_SUCCESS,
19419
+ };
19738
19420
  }
19739
19421
 
19422
+ enum ggml_status compute_status = ggml_graph_compute_parallel(workers, n_threads);
19423
+
19740
19424
  // performance stats (graph)
19741
19425
  {
19742
19426
  int64_t perf_cycles_cur = ggml_perf_cycles() - perf_start_cycles;
@@ -22819,7 +22503,7 @@ int ggml_cpu_has_wasm_simd(void) {
22819
22503
  }
22820
22504
 
22821
22505
  int ggml_cpu_has_blas(void) {
22822
- #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUDA) || defined(GGML_USE_VULKAN) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_SYCL)
22506
+ #if defined(GGML_USE_BLAS) || defined(GGML_USE_CUDA) || defined(GGML_USE_VULKAN) || defined(GGML_USE_SYCL)
22823
22507
  return 1;
22824
22508
  #else
22825
22509
  return 0;
@@ -22834,14 +22518,6 @@ int ggml_cpu_has_cuda(void) {
22834
22518
  #endif
22835
22519
  }
22836
22520
 
22837
- int ggml_cpu_has_clblast(void) {
22838
- #if defined(GGML_USE_CLBLAST)
22839
- return 1;
22840
- #else
22841
- return 0;
22842
- #endif
22843
- }
22844
-
22845
22521
  int ggml_cpu_has_vulkan(void) {
22846
22522
  #if defined(GGML_USE_VULKAN)
22847
22523
  return 1;
@@ -22875,8 +22551,7 @@ int ggml_cpu_has_rpc(void) {
22875
22551
  }
22876
22552
 
22877
22553
  int ggml_cpu_has_gpublas(void) {
22878
- return ggml_cpu_has_cuda() || ggml_cpu_has_clblast() || ggml_cpu_has_vulkan() || ggml_cpu_has_kompute() ||
22879
- ggml_cpu_has_sycl();
22554
+ return ggml_cpu_has_cuda() || ggml_cpu_has_vulkan() || ggml_cpu_has_kompute() || ggml_cpu_has_sycl();
22880
22555
  }
22881
22556
 
22882
22557
  int ggml_cpu_has_sse3(void) {