llama_cpp 0.15.4 → 0.16.1

Files changed (161)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +16 -0
  3. data/ext/llama_cpp/extconf.rb +3 -2
  4. data/ext/llama_cpp/llama_cpp.cpp +17 -3
  5. data/lib/llama_cpp/version.rb +2 -2
  6. data/sig/llama_cpp.rbs +15 -1
  7. data/vendor/tmp/llama.cpp/Makefile +166 -82
  8. data/vendor/tmp/llama.cpp/ggml-alloc.c +82 -26
  9. data/vendor/tmp/llama.cpp/ggml-backend-impl.h +20 -8
  10. data/vendor/tmp/llama.cpp/ggml-backend.c +183 -69
  11. data/vendor/tmp/llama.cpp/ggml-backend.h +4 -4
  12. data/vendor/tmp/llama.cpp/ggml-blas.cpp +363 -0
  13. data/vendor/tmp/llama.cpp/ggml-blas.h +23 -0
  14. data/vendor/tmp/llama.cpp/ggml-common.h +6 -0
  15. data/vendor/tmp/llama.cpp/ggml-cuda/acc.cu +47 -0
  16. data/vendor/tmp/llama.cpp/ggml-cuda/arange.cu +34 -0
  17. data/vendor/tmp/llama.cpp/ggml-cuda/argsort.cu +104 -0
  18. data/vendor/tmp/llama.cpp/ggml-cuda/binbcast.cu +280 -0
  19. data/vendor/tmp/llama.cpp/ggml-cuda/clamp.cu +34 -0
  20. data/vendor/tmp/llama.cpp/ggml-cuda/concat.cu +196 -0
  21. data/vendor/tmp/llama.cpp/ggml-cuda/convert.cu +686 -0
  22. data/vendor/tmp/llama.cpp/ggml-cuda/cpy.cu +490 -0
  23. data/vendor/tmp/llama.cpp/ggml-cuda/diagmask.cu +40 -0
  24. data/vendor/tmp/llama.cpp/ggml-cuda/dmmv.cu +674 -0
  25. data/vendor/tmp/llama.cpp/ggml-cuda/fattn-tile-f16.cu +319 -0
  26. data/vendor/tmp/llama.cpp/ggml-cuda/fattn-tile-f32.cu +312 -0
  27. data/vendor/tmp/llama.cpp/ggml-cuda/fattn.cu +345 -0
  28. data/vendor/tmp/llama.cpp/ggml-cuda/getrows.cu +178 -0
  29. data/vendor/tmp/llama.cpp/ggml-cuda/im2col.cu +104 -0
  30. data/vendor/tmp/llama.cpp/ggml-cuda/mmq.cu +88 -0
  31. data/vendor/tmp/llama.cpp/ggml-cuda/mmvq.cu +419 -0
  32. data/vendor/tmp/llama.cpp/ggml-cuda/norm.cu +221 -0
  33. data/vendor/tmp/llama.cpp/ggml-cuda/pad.cu +49 -0
  34. data/vendor/tmp/llama.cpp/ggml-cuda/pool2d.cu +94 -0
  35. data/vendor/tmp/llama.cpp/ggml-cuda/quantize.cu +112 -0
  36. data/vendor/tmp/llama.cpp/ggml-cuda/rope.cu +271 -0
  37. data/vendor/tmp/llama.cpp/ggml-cuda/scale.cu +31 -0
  38. data/vendor/tmp/llama.cpp/ggml-cuda/softmax.cu +206 -0
  39. data/vendor/tmp/llama.cpp/ggml-cuda/sumrows.cu +40 -0
  40. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu +5 -0
  41. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu +5 -0
  42. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu +5 -0
  43. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu +5 -0
  44. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu +5 -0
  45. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu +5 -0
  46. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu +5 -0
  47. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu +5 -0
  48. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu +5 -0
  49. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu +5 -0
  50. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu +5 -0
  51. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu +5 -0
  52. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu +5 -0
  53. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu +5 -0
  54. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu +5 -0
  55. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu +5 -0
  56. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu +5 -0
  57. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu +5 -0
  58. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu +5 -0
  59. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu +5 -0
  60. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu +5 -0
  61. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu +5 -0
  62. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu +5 -0
  63. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu +5 -0
  64. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu +5 -0
  65. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu +5 -0
  66. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu +5 -0
  67. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu +5 -0
  68. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu +5 -0
  69. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu +5 -0
  70. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu +5 -0
  71. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu +5 -0
  72. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu +5 -0
  73. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu +5 -0
  74. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu +5 -0
  75. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu +5 -0
  76. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu +5 -0
  77. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu +5 -0
  78. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu +5 -0
  79. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu +5 -0
  80. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu +5 -0
  81. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu +5 -0
  82. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu +5 -0
  83. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu +5 -0
  84. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu +5 -0
  85. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu +5 -0
  86. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu +5 -0
  87. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu +5 -0
  88. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu +5 -0
  89. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu +5 -0
  90. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu +5 -0
  91. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu +5 -0
  92. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu +5 -0
  93. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu +5 -0
  94. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu +5 -0
  95. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu +5 -0
  96. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu +5 -0
  97. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu +5 -0
  98. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu +5 -0
  99. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu +5 -0
  100. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu +5 -0
  101. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu +5 -0
  102. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu +5 -0
  103. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu +5 -0
  104. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu +5 -0
  105. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu +5 -0
  106. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu +5 -0
  107. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu +5 -0
  108. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu +5 -0
  109. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu +5 -0
  110. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu +5 -0
  111. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu +5 -0
  112. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu +5 -0
  113. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu +5 -0
  114. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu +5 -0
  115. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu +5 -0
  116. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu +5 -0
  117. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu +5 -0
  118. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu +5 -0
  119. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu +5 -0
  120. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu +5 -0
  121. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu +5 -0
  122. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu +5 -0
  123. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu +5 -0
  124. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu +5 -0
  125. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu +5 -0
  126. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb16.cu +10 -0
  127. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb32.cu +9 -0
  128. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb16.cu +10 -0
  129. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb32.cu +10 -0
  130. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb8.cu +8 -0
  131. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q2_k.cu +5 -0
  132. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q3_k.cu +5 -0
  133. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_0.cu +5 -0
  134. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_1.cu +5 -0
  135. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_k.cu +5 -0
  136. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_0.cu +5 -0
  137. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_1.cu +5 -0
  138. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_k.cu +5 -0
  139. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q6_k.cu +5 -0
  140. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q8_0.cu +5 -0
  141. data/vendor/tmp/llama.cpp/ggml-cuda/tsembd.cu +47 -0
  142. data/vendor/tmp/llama.cpp/ggml-cuda/unary.cu +286 -0
  143. data/vendor/tmp/llama.cpp/ggml-cuda/upscale.cu +51 -0
  144. data/vendor/tmp/llama.cpp/ggml-cuda.cu +103 -135
  145. data/vendor/tmp/llama.cpp/ggml-kompute.cpp +29 -13
  146. data/vendor/tmp/llama.cpp/ggml-metal.h +1 -1
  147. data/vendor/tmp/llama.cpp/ggml-metal.m +45 -33
  148. data/vendor/tmp/llama.cpp/ggml-metal.metal +83 -59
  149. data/vendor/tmp/llama.cpp/ggml-rpc.cpp +15 -14
  150. data/vendor/tmp/llama.cpp/ggml-sycl.cpp +26 -90
  151. data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +74522 -14913
  152. data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +631 -471
  153. data/vendor/tmp/llama.cpp/ggml.c +278 -603
  154. data/vendor/tmp/llama.cpp/ggml.h +9 -28
  155. data/vendor/tmp/llama.cpp/llama.cpp +345 -473
  156. data/vendor/tmp/llama.cpp/llama.h +21 -43
  157. metadata +134 -7
  158. data/vendor/tmp/llama.cpp/ggml-mpi.c +0 -216
  159. data/vendor/tmp/llama.cpp/ggml-mpi.h +0 -39
  160. data/vendor/tmp/llama.cpp/ggml-opencl.cpp +0 -2305
  161. data/vendor/tmp/llama.cpp/ggml-opencl.h +0 -36
data/vendor/tmp/llama.cpp/ggml.c

@@ -5,6 +5,7 @@
  #include "ggml-quants.h"
  #include "ggml.h"

+
  #if defined(_MSC_VER) || defined(__MINGW32__)
  #include <malloc.h> // using malloc.h with MSC/MINGW
  #elif !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__)
@@ -28,6 +29,10 @@
  #include <syscall.h>
  #endif

+ #ifdef GGML_USE_OPENMP
+ #include <omp.h>
+ #endif
+
  #ifdef GGML_USE_METAL
  #include <unistd.h>
  #endif
@@ -292,17 +297,6 @@ inline static void * ggml_calloc(size_t num, size_t size) {

  #if defined(GGML_USE_ACCELERATE)
  #include <Accelerate/Accelerate.h>
- #if defined(GGML_USE_CLBLAST) // allow usage of CLBlast alongside Accelerate functions
- #include "ggml-opencl.h"
- #endif
- #elif defined(GGML_USE_OPENBLAS)
- #if defined(GGML_BLAS_USE_MKL)
- #include <mkl.h>
- #else
- #include <cblas.h>
- #endif
- #elif defined(GGML_USE_CLBLAST)
- #include "ggml-opencl.h"
  #endif

  // floating point type used to accumulate sums
@@ -1756,7 +1750,7 @@ struct ggml_compute_state_shared {
  int64_t perf_node_start_cycles;
  int64_t perf_node_start_time_us;

- const int n_threads;
+ int n_threads;

  // synchronization primitives
  atomic_int n_active; // num active threads
@@ -2267,6 +2261,11 @@ inline static float ggml_silu_f32(float x) {
  return x/(1.0f + expf(-x));
  }

+ #if __FINITE_MATH_ONLY__
+ #error "some routines in ggml.c require non-finite math arithmetics -- pass -fno-finite-math-only to the compiler to fix"
+ #error "ref: https://github.com/ggerganov/llama.cpp/pull/7154#issuecomment-2143844461"
+ #endif
+
  #if defined(__ARM_NEON) && defined(__aarch64__)

  // adapted from arm limited optimized routine
@@ -3207,35 +3206,42 @@ GGML_CALL bool ggml_is_transposed(const struct ggml_tensor * tensor) {
  return tensor->nb[0] > tensor->nb[1];
  }

- GGML_CALL bool ggml_is_contiguous(const struct ggml_tensor * tensor) {
- static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
+ static bool ggml_is_contiguous_n(const struct ggml_tensor * tensor, int n) {
+ size_t next_nb = ggml_type_size(tensor->type);
+ if (tensor->ne[0] != ggml_blck_size(tensor->type) && tensor->nb[0] != next_nb) {
+ return false;
+ }
+ next_nb *= tensor->ne[0]/ggml_blck_size(tensor->type);
+ for (int i = 1; i < GGML_MAX_DIMS; i++) {
+ if (tensor->ne[i] != 1) {
+ if (i > n) {
+ if (tensor->nb[i] != next_nb) {
+ return false;
+ }
+ next_nb *= tensor->ne[i];
+ } else {
+ // this dimension does not need to be contiguous
+ next_nb = tensor->ne[i]*tensor->nb[i];
+ }
+ }
+ }
+ return true;
+ }

- return
- tensor->nb[0] == ggml_type_size(tensor->type) &&
- tensor->nb[1] == (tensor->nb[0]*tensor->ne[0])/ggml_blck_size(tensor->type) &&
- tensor->nb[2] == tensor->nb[1]*tensor->ne[1] &&
- tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
+ GGML_CALL bool ggml_is_contiguous(const struct ggml_tensor * tensor) {
+ return ggml_is_contiguous_0(tensor);
  }

  GGML_CALL bool ggml_is_contiguous_0(const struct ggml_tensor * tensor) {
- return ggml_is_contiguous(tensor);
+ return ggml_is_contiguous_n(tensor, 0);
  }

  GGML_CALL bool ggml_is_contiguous_1(const struct ggml_tensor * tensor) {
- static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
-
- return
- tensor->nb[0] == ggml_type_size(tensor->type) &&
- tensor->nb[2] == tensor->nb[1]*tensor->ne[1] &&
- tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
+ return ggml_is_contiguous_n(tensor, 1);
  }

  GGML_CALL bool ggml_is_contiguous_2(const struct ggml_tensor * tensor) {
- static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
-
- return
- tensor->nb[0] == ggml_type_size(tensor->type) &&
- tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
+ return ggml_is_contiguous_n(tensor, 2);
  }

  GGML_CALL bool ggml_is_permuted(const struct ggml_tensor * tensor) {
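
Note on the hunk above (editorial sketch, not part of the gem): ggml_is_contiguous_n(tensor, n) generalizes the old hard-coded checks. Dimensions 1..n may keep arbitrary, for example padded, strides, while every dimension above n must follow the previous one densely. A minimal standalone C sketch of the same stride walk for an F32 tensor (type size 4 bytes, block size 1), using made-up shapes:

// Standalone illustration only -- not code from ggml.c.
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

#define MAX_DIMS 4

static bool is_contiguous_n_f32(const long ne[MAX_DIMS], const size_t nb[MAX_DIMS], int n) {
    size_t next_nb = sizeof(float);              // expected byte stride of dim 0
    if (nb[0] != next_nb) return false;
    next_nb *= ne[0];
    for (int i = 1; i < MAX_DIMS; i++) {
        if (ne[i] == 1) continue;                // singleton dims never constrain
        if (i > n) {
            if (nb[i] != next_nb) return false;  // dims above n must be dense
            next_nb *= ne[i];
        } else {
            next_nb = ne[i]*nb[i];               // dims up to n may be padded
        }
    }
    return true;
}

int main(void) {
    // 8x4x2 tensor whose 8-float rows are padded from 32 to 40 bytes
    const long   ne[MAX_DIMS] = {8, 4, 2, 1};
    const size_t nb[MAX_DIMS] = {4, 40, 160, 320};
    printf("%d %d\n", is_contiguous_n_f32(ne, nb, 0),   // 0: padded rows break full contiguity
                      is_contiguous_n_f32(ne, nb, 1));  // 1: dims above 1 still follow densely
    return 0;
}

This is also why the unary kernels later in the diff can relax their checks from nb[0] == sizeof(float) to ggml_is_contiguous_1: each row must be dense, but the row-to-row stride may be padded.
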
@@ -3267,20 +3273,20 @@ bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor
  static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");

  return
- (t0->ne[0] == t1->ne[0] ) &&
- (t0->ne[1] == t1->ne[1] ) &&
- (t0->ne[2] == t1->ne[2] ) &&
- (t0->ne[3] == t1->ne[3] );
+ (t0->ne[0] == t1->ne[0]) &&
+ (t0->ne[1] == t1->ne[1]) &&
+ (t0->ne[2] == t1->ne[2]) &&
+ (t0->ne[3] == t1->ne[3]);
  }

  bool ggml_are_same_stride(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
  static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");

  return
- (t0->nb[0] == t1->nb[0] ) &&
- (t0->nb[1] == t1->nb[1] ) &&
- (t0->nb[2] == t1->nb[2] ) &&
- (t0->nb[3] == t1->nb[3] );
+ (t0->nb[0] == t1->nb[0]) &&
+ (t0->nb[1] == t1->nb[1]) &&
+ (t0->nb[2] == t1->nb[2]) &&
+ (t0->nb[3] == t1->nb[3]);
  }

  // check if t1 can be represented as a repeatition of t0
@@ -3370,10 +3376,6 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
  GGML_PRINT_DEBUG("%s: g_state initialized in %f ms\n", __func__, (t_end - t_start)/1000.0f);
  }

- #if defined(GGML_USE_CLBLAST)
- ggml_cl_init();
- #endif
-
  ggml_setup_op_has_task_pass();

  is_first_call = false;
@@ -4077,32 +4079,26 @@ float ggml_get_f32_1d(const struct ggml_tensor * tensor, int i) {
  switch (tensor->type) {
  case GGML_TYPE_I8:
  {
- GGML_ASSERT(tensor->nb[0] == sizeof(int8_t));
  return ((int8_t *)(tensor->data))[i];
  }
  case GGML_TYPE_I16:
  {
- GGML_ASSERT(tensor->nb[0] == sizeof(int16_t));
  return ((int16_t *)(tensor->data))[i];
  }
  case GGML_TYPE_I32:
  {
- GGML_ASSERT(tensor->nb[0] == sizeof(int32_t));
  return ((int32_t *)(tensor->data))[i];
  }
  case GGML_TYPE_F16:
  {
- GGML_ASSERT(tensor->nb[0] == sizeof(ggml_fp16_t));
  return GGML_FP16_TO_FP32(((ggml_fp16_t *)(tensor->data))[i]);
  }
  case GGML_TYPE_BF16:
  {
- GGML_ASSERT(tensor->nb[0] == sizeof(ggml_bf16_t));
  return GGML_BF16_TO_FP32(((ggml_bf16_t *)(tensor->data))[i]);
  }
  case GGML_TYPE_F32:
  {
- GGML_ASSERT(tensor->nb[0] == sizeof(float));
  return ((float *)(tensor->data))[i];
  }
  default:
@@ -4124,32 +4120,26 @@ void ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value) {
  switch (tensor->type) {
  case GGML_TYPE_I8:
  {
- GGML_ASSERT(tensor->nb[0] == sizeof(int8_t));
  ((int8_t *)(tensor->data))[i] = value;
  } break;
  case GGML_TYPE_I16:
  {
- GGML_ASSERT(tensor->nb[0] == sizeof(int16_t));
  ((int16_t *)(tensor->data))[i] = value;
  } break;
  case GGML_TYPE_I32:
  {
- GGML_ASSERT(tensor->nb[0] == sizeof(int32_t));
  ((int32_t *)(tensor->data))[i] = value;
  } break;
  case GGML_TYPE_F16:
  {
- GGML_ASSERT(tensor->nb[0] == sizeof(ggml_fp16_t));
  ((ggml_fp16_t *)(tensor->data))[i] = GGML_FP32_TO_FP16(value);
  } break;
  case GGML_TYPE_BF16:
  {
- GGML_ASSERT(tensor->nb[0] == sizeof(ggml_bf16_t));
  ((ggml_bf16_t *)(tensor->data))[i] = GGML_FP32_TO_BF16(value);
  } break;
  case GGML_TYPE_F32:
  {
- GGML_ASSERT(tensor->nb[0] == sizeof(float));
  ((float *)(tensor->data))[i] = value;
  } break;
  default:
@@ -6249,16 +6239,13 @@ static struct ggml_tensor * ggml_rope_impl(
  struct ggml_tensor * c,
  int n_dims,
  int mode,
- int n_ctx,
- int n_orig_ctx,
+ int n_ctx_orig,
  float freq_base,
  float freq_scale,
  float ext_factor,
  float attn_factor,
  float beta_fast,
  float beta_slow,
- float xpos_base,
- bool xpos_down,
  bool inplace) {
  GGML_ASSERT((mode & 1) == 0 && "mode & 1 == 1 is no longer supported");

@@ -6279,15 +6266,13 @@ static struct ggml_tensor * ggml_rope_impl(

  struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);

- int32_t params[13] = { /*n_past*/ 0, n_dims, mode, n_ctx, n_orig_ctx };
+ int32_t params[11] = { /*n_past*/ 0, n_dims, mode, /*n_ctx*/ 0, n_ctx_orig };
  memcpy(params + 5, &freq_base, sizeof(float));
  memcpy(params + 6, &freq_scale, sizeof(float));
  memcpy(params + 7, &ext_factor, sizeof(float));
  memcpy(params + 8, &attn_factor, sizeof(float));
  memcpy(params + 9, &beta_fast, sizeof(float));
  memcpy(params + 10, &beta_slow, sizeof(float));
- memcpy(params + 11, &xpos_base, sizeof(float));
- memcpy(params + 12, &xpos_down, sizeof(bool));
  ggml_set_op_params(result, params, sizeof(params));

  result->op = GGML_OP_ROPE;
@@ -6304,10 +6289,9 @@ struct ggml_tensor * ggml_rope(
  struct ggml_tensor * a,
  struct ggml_tensor * b,
  int n_dims,
- int mode,
- int n_ctx) {
+ int mode) {
  return ggml_rope_impl(
- ctx, a, b, NULL, n_dims, mode, n_ctx, 0, 10000.0f, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, false, false
+ ctx, a, b, NULL, n_dims, mode, 0, 10000.0f, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f, false
  );
  }

@@ -6316,10 +6300,9 @@ struct ggml_tensor * ggml_rope_inplace(
  struct ggml_tensor * a,
  struct ggml_tensor * b,
  int n_dims,
- int mode,
- int n_ctx) {
+ int mode) {
  return ggml_rope_impl(
- ctx, a, b, NULL, n_dims, mode, n_ctx, 0, 10000.0f, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, false, true
+ ctx, a, b, NULL, n_dims, mode, 0, 10000.0f, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f, true
  );
  }

@@ -6330,8 +6313,7 @@ struct ggml_tensor * ggml_rope_ext(
  struct ggml_tensor * c,
  int n_dims,
  int mode,
- int n_ctx,
- int n_orig_ctx,
+ int n_ctx_orig,
  float freq_base,
  float freq_scale,
  float ext_factor,
@@ -6339,8 +6321,8 @@
  float beta_fast,
  float beta_slow) {
  return ggml_rope_impl(
- ctx, a, b, c, n_dims, mode, n_ctx, n_orig_ctx, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow, 0.0f, false, false
+ ctx, a, b, c, n_dims, mode, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow, false
  );
  }

@@ -6351,8 +6333,7 @@ struct ggml_tensor * ggml_rope_ext_inplace(
  struct ggml_tensor * c,
  int n_dims,
  int mode,
- int n_ctx,
- int n_orig_ctx,
+ int n_ctx_orig,
  float freq_base,
  float freq_scale,
  float ext_factor,
@@ -6360,8 +6341,8 @@
  float beta_fast,
  float beta_slow) {
  return ggml_rope_impl(
- ctx, a, b, c, n_dims, mode, n_ctx, n_orig_ctx, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow, 0.0f, false, true
+ ctx, a, b, c, n_dims, mode, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow, true
  );
  }

@@ -6371,8 +6352,7 @@ struct ggml_tensor * ggml_rope_custom(
  struct ggml_tensor * b,
  int n_dims,
  int mode,
- int n_ctx,
- int n_orig_ctx,
+ int n_ctx_orig,
  float freq_base,
  float freq_scale,
  float ext_factor,
@@ -6380,8 +6360,8 @@
  float beta_fast,
  float beta_slow) {
  return ggml_rope_impl(
- ctx, a, b, NULL, n_dims, mode, n_ctx, n_orig_ctx, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow, 0.0f, false, false
+ ctx, a, b, NULL, n_dims, mode, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow, false
  );
  }

@@ -6391,8 +6371,7 @@ struct ggml_tensor * ggml_rope_custom_inplace(
  struct ggml_tensor * b,
  int n_dims,
  int mode,
- int n_ctx,
- int n_orig_ctx,
+ int n_ctx_orig,
  float freq_base,
  float freq_scale,
  float ext_factor,
@@ -6400,21 +6379,11 @@
  float beta_fast,
  float beta_slow) {
  return ggml_rope_impl(
- ctx, a, b, NULL, n_dims, mode, n_ctx, n_orig_ctx, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow, 0.0f, false, true
+ ctx, a, b, NULL, n_dims, mode, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow, true
  );
  }

- struct ggml_tensor * ggml_rope_xpos_inplace(
- struct ggml_context * ctx,
- struct ggml_tensor * a,
- struct ggml_tensor * b,
- int n_dims,
- float base,
- bool down) {
- return ggml_rope_impl(ctx, a, b, NULL, n_dims, 0, 0, 0, 10000.0f, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f, base, down, true);
- }
-
  // ggml_rope_back

  struct ggml_tensor * ggml_rope_back(
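
Call-site note (editorial sketch, not part of the gem): the hunks above drop the unused n_ctx argument and the xPos parameters from the public RoPE builders. A hedged sketch of how a caller adapts; the helper name and the default hyper-parameter values below are assumptions, not taken from this diff:

// Sketch only: building a RoPE node against the simplified signature.
#include "ggml.h"

static struct ggml_tensor * apply_rope(struct ggml_context * ctx,
                                       struct ggml_tensor  * cur,   // activations to rotate
                                       struct ggml_tensor  * pos,   // GGML_TYPE_I32 positions, one per token
                                       int n_rot, int n_ctx_orig) {
    // before this update: ggml_rope_ext(ctx, cur, pos, NULL, n_rot, mode, n_ctx, n_ctx_orig, ...)
    return ggml_rope_ext(ctx, cur, pos, /*freq_factors*/ NULL,
                         n_rot, /*mode*/ 0, n_ctx_orig,
                         /*freq_base*/ 10000.0f, /*freq_scale*/ 1.0f,
                         /*ext_factor*/ 0.0f, /*attn_factor*/ 1.0f,
                         /*beta_fast*/ 32.0f, /*beta_slow*/ 1.0f);
}

The basic helper loses n_ctx as well: ggml_rope(ctx, cur, pos, n_rot, mode) now takes five arguments instead of six.
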
@@ -6424,16 +6393,13 @@ struct ggml_tensor * ggml_rope_back(
  struct ggml_tensor * c,
  int n_dims,
  int mode,
- int n_ctx,
- int n_orig_ctx,
+ int n_ctx_orig,
  float freq_base,
  float freq_scale,
  float ext_factor,
  float attn_factor,
  float beta_fast,
- float beta_slow,
- float xpos_base,
- bool xpos_down) {
+ float beta_slow) {
  GGML_ASSERT(ggml_is_vector(b));
  GGML_ASSERT(b->type == GGML_TYPE_I32);
  GGML_ASSERT(a->ne[2] == b->ne[0]);
@@ -6449,15 +6415,13 @@ struct ggml_tensor * ggml_rope_back(

  struct ggml_tensor * result = ggml_dup_tensor(ctx, a);

- int32_t params[13] = { /*n_past*/ 0, n_dims, mode, n_ctx, n_orig_ctx };
+ int32_t params[11] = { /*n_past*/ 0, n_dims, mode, /*n_ctx*/ 0, n_ctx_orig };
  memcpy(params + 5, &freq_base, sizeof(float));
  memcpy(params + 6, &freq_scale, sizeof(float));
  memcpy(params + 7, &ext_factor, sizeof(float));
  memcpy(params + 8, &attn_factor, sizeof(float));
  memcpy(params + 9, &beta_fast, sizeof(float));
  memcpy(params + 10, &beta_slow, sizeof(float));
- memcpy(params + 11, &xpos_base, sizeof(float));
- memcpy(params + 12, &xpos_down, sizeof(bool));
  ggml_set_op_params(result, params, sizeof(params));

  result->op = GGML_OP_ROPE_BACK;
@@ -7368,13 +7332,15 @@ struct ggml_tensor * ggml_add_rel_pos_inplace(
  return ggml_add_rel_pos_impl(ctx, a, pw, ph, true);
  }

- // gmml_unary
+ // ggml_unary

  static struct ggml_tensor * ggml_unary_impl(
  struct ggml_context * ctx,
  struct ggml_tensor * a,
  enum ggml_unary_op op,
  bool inplace) {
+ GGML_ASSERT(ggml_is_contiguous_1(a));
+
  bool is_node = false;

  if (!inplace && (a->grad)) {
@@ -9043,17 +9009,6 @@ static void ggml_compute_forward_add_f32(
  const int ith = params->ith;
  const int nth = params->nth;

- #ifdef GGML_USE_CLBLAST
- if (src1->backend == GGML_BACKEND_TYPE_GPU) {
- // TODO: OpenCL kernel support full broadcast
- GGML_ASSERT(ggml_can_repeat_rows(src1, src0));
- if (ith == 0) {
- ggml_cl_add(src0, src1, dst);
- }
- return;
- }
- #endif
-
  const int nr = ggml_nrows(src0);

  GGML_TENSOR_BINARY_OP_LOCALS
@@ -10161,17 +10116,6 @@ static void ggml_compute_forward_mul_f32(
  const int ith = params->ith;
  const int nth = params->nth;

- #if defined(GGML_USE_CLBLAST)
- if (src1->backend == GGML_BACKEND_TYPE_GPU) {
- // TODO: OpenCL kernel support full broadcast
- GGML_ASSERT(ggml_can_repeat_rows(src1, src0));
- if (ith == 0) {
- ggml_cl_mul(src0, src1, dst);
- }
- return;
- }
- #endif
-
  const int64_t nr = ggml_nrows(src0);

  GGML_TENSOR_BINARY_OP_LOCALS
@@ -11061,6 +11005,8 @@ static void ggml_compute_forward_abs_f32(
  const struct ggml_tensor * src0 = dst->src[0];

  assert(params->ith == 0);
+ assert(ggml_is_contiguous_1(src0));
+ assert(ggml_is_contiguous_1(dst));
  assert(ggml_are_same_shape(src0, dst));

  if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
@@ -11070,9 +11016,6 @@
  const int n = ggml_nrows(src0);
  const int nc = src0->ne[0];

- assert(dst->nb[0] == sizeof(float));
- assert(src0->nb[0] == sizeof(float));
-
  for (int i = 0; i < n; i++) {
  ggml_vec_abs_f32(nc,
  (float *) ((char *) dst->data + i*( dst->nb[1])),
@@ -11107,6 +11050,8 @@ static void ggml_compute_forward_sgn_f32(
  const struct ggml_tensor * src0 = dst->src[0];

  assert(params->ith == 0);
+ assert(ggml_is_contiguous_1(src0));
+ assert(ggml_is_contiguous_1(dst));
  assert(ggml_are_same_shape(src0, dst));

  if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
@@ -11116,9 +11061,6 @@
  const int n = ggml_nrows(src0);
  const int nc = src0->ne[0];

- assert(dst->nb[0] == sizeof(float));
- assert(src0->nb[0] == sizeof(float));
-
  for (int i = 0; i < n; i++) {
  ggml_vec_sgn_f32(nc,
  (float *) ((char *) dst->data + i*( dst->nb[1])),
@@ -11153,6 +11095,8 @@ static void ggml_compute_forward_neg_f32(
  const struct ggml_tensor * src0 = dst->src[0];

  assert(params->ith == 0);
+ assert(ggml_is_contiguous_1(src0));
+ assert(ggml_is_contiguous_1(dst));
  assert(ggml_are_same_shape(src0, dst));

  if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
@@ -11162,9 +11106,6 @@
  const int n = ggml_nrows(src0);
  const int nc = src0->ne[0];

- assert(dst->nb[0] == sizeof(float));
- assert(src0->nb[0] == sizeof(float));
-
  for (int i = 0; i < n; i++) {
  ggml_vec_neg_f32(nc,
  (float *) ((char *) dst->data + i*( dst->nb[1])),
@@ -11199,6 +11140,8 @@ static void ggml_compute_forward_step_f32(
  const struct ggml_tensor * src0 = dst->src[0];

  assert(params->ith == 0);
+ assert(ggml_is_contiguous_1(src0));
+ assert(ggml_is_contiguous_1(dst));
  assert(ggml_are_same_shape(src0, dst));

  if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
@@ -11208,9 +11151,6 @@
  const int n = ggml_nrows(src0);
  const int nc = src0->ne[0];

- assert(dst->nb[0] == sizeof(float));
- assert(src0->nb[0] == sizeof(float));
-
  for (int i = 0; i < n; i++) {
  ggml_vec_step_f32(nc,
  (float *) ((char *) dst->data + i*( dst->nb[1])),
@@ -11245,6 +11185,8 @@ static void ggml_compute_forward_tanh_f32(
  const struct ggml_tensor * src0 = dst->src[0];

  assert(params->ith == 0);
+ assert(ggml_is_contiguous_1(src0));
+ assert(ggml_is_contiguous_1(dst));
  assert(ggml_are_same_shape(src0, dst));

  if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
@@ -11254,9 +11196,6 @@
  const int n = ggml_nrows(src0);
  const int nc = src0->ne[0];

- assert(dst->nb[0] == sizeof(float));
- assert(src0->nb[0] == sizeof(float));
-
  for (int i = 0; i < n; i++) {
  ggml_vec_tanh_f32(nc,
  (float *) ((char *) dst->data + i*( dst->nb[1])),
@@ -11291,6 +11230,8 @@ static void ggml_compute_forward_elu_f32(
  const struct ggml_tensor * src0 = dst->src[0];

  assert(params->ith == 0);
+ assert(ggml_is_contiguous_1(src0));
+ assert(ggml_is_contiguous_1(dst));
  assert(ggml_are_same_shape(src0, dst));

  if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
@@ -11300,9 +11241,6 @@
  const int n = ggml_nrows(src0);
  const int nc = src0->ne[0];

- assert(dst->nb[0] == sizeof(float));
- assert(src0->nb[0] == sizeof(float));
-
  for (int i = 0; i < n; i++) {
  ggml_vec_elu_f32(nc,
  (float *) ((char *) dst->data + i*( dst->nb[1])),
@@ -11337,6 +11275,8 @@ static void ggml_compute_forward_relu_f32(
  const struct ggml_tensor * src0 = dst->src[0];

  assert(params->ith == 0);
+ assert(ggml_is_contiguous_1(src0));
+ assert(ggml_is_contiguous_1(dst));
  assert(ggml_are_same_shape(src0, dst));

  if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
@@ -11346,9 +11286,6 @@
  const int n = ggml_nrows(src0);
  const int nc = src0->ne[0];

- assert(dst->nb[0] == sizeof(float));
- assert(src0->nb[0] == sizeof(float));
-
  for (int i = 0; i < n; i++) {
  ggml_vec_relu_f32(nc,
  (float *) ((char *) dst->data + i*( dst->nb[1])),
@@ -11383,6 +11320,8 @@ static void ggml_compute_forward_sigmoid_f32(
  const struct ggml_tensor * src0 = dst->src[0];

  assert(params->ith == 0);
+ assert(ggml_is_contiguous_1(src0));
+ assert(ggml_is_contiguous_1(dst));
  assert(ggml_are_same_shape(src0, dst));

  if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
@@ -11392,9 +11331,6 @@
  const int n = ggml_nrows(src0);
  const int nc = src0->ne[0];

- assert(dst->nb[0] == sizeof(float));
- assert(src0->nb[0] == sizeof(float));
-
  for (int i = 0; i < n; i++) {
  ggml_vec_sigmoid_f32(nc,
  (float *) ((char *) dst->data + i*( dst->nb[1])),
@@ -11428,9 +11364,9 @@ static void ggml_compute_forward_gelu_f32(

  const struct ggml_tensor * src0 = dst->src[0];

- GGML_ASSERT(ggml_is_contiguous_1(src0));
- GGML_ASSERT(ggml_is_contiguous_1(dst));
- GGML_ASSERT(ggml_are_same_shape(src0, dst));
+ assert(ggml_is_contiguous_1(src0));
+ assert(ggml_is_contiguous_1(dst));
+ assert(ggml_are_same_shape(src0, dst));

  if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
  return;
@@ -11491,9 +11427,9 @@ static void ggml_compute_forward_gelu_quick_f32(

  const struct ggml_tensor * src0 = dst->src[0];

- GGML_ASSERT(ggml_is_contiguous_1(src0));
- GGML_ASSERT(ggml_is_contiguous_1(dst));
- GGML_ASSERT(ggml_are_same_shape(src0, dst));
+ assert(ggml_is_contiguous_1(src0));
+ assert(ggml_is_contiguous_1(dst));
+ assert(ggml_are_same_shape(src0, dst));

  if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
  return;
@@ -11554,9 +11490,9 @@ static void ggml_compute_forward_silu_f32(

  const struct ggml_tensor * src0 = dst->src[0];

- GGML_ASSERT(ggml_is_contiguous_1(src0));
- GGML_ASSERT(ggml_is_contiguous_1(dst));
- GGML_ASSERT(ggml_are_same_shape(src0, dst));
+ assert(ggml_is_contiguous_1(src0));
+ assert(ggml_is_contiguous_1(dst));
+ assert(ggml_are_same_shape(src0, dst));

  if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
  return;
@@ -11617,6 +11553,8 @@ static void ggml_compute_forward_leaky_relu_f32(
  const struct ggml_tensor * src0 = dst->src[0];

  assert(params->ith == 0);
+ assert(ggml_is_contiguous_1(src0));
+ assert(ggml_is_contiguous_1(dst));
  assert(ggml_are_same_shape(src0, dst));

  if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
@@ -11666,11 +11604,11 @@ static void ggml_compute_forward_silu_back_f32(
  const struct ggml_tensor * src0 = dst->src[0];
  const struct ggml_tensor * grad = dst->src[1];

- GGML_ASSERT(ggml_is_contiguous_1(grad));
- GGML_ASSERT(ggml_is_contiguous_1(src0));
- GGML_ASSERT(ggml_is_contiguous_1(dst));
- GGML_ASSERT(ggml_are_same_shape(src0, dst));
- GGML_ASSERT(ggml_are_same_shape(src0, grad));
+ assert(ggml_is_contiguous_1(grad));
+ assert(ggml_is_contiguous_1(src0));
+ assert(ggml_is_contiguous_1(dst));
+ assert(ggml_are_same_shape(src0, dst));
+ assert(ggml_are_same_shape(src0, grad));

  if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
  return;
@@ -11732,6 +11670,8 @@ static void ggml_compute_forward_hardswish_f32(
  const struct ggml_tensor * src0 = dst->src[0];

  assert(params->ith == 0);
+ assert(ggml_is_contiguous_1(src0));
+ assert(ggml_is_contiguous_1(dst));
  assert(ggml_are_same_shape(src0, dst));

  if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
@@ -11741,9 +11681,6 @@
  const int n = ggml_nrows(src0);
  const int nc = src0->ne[0];

- assert(dst->nb[0] == sizeof(float));
- assert(src0->nb[0] == sizeof(float));
-
  for (int i = 0; i < n; i++) {
  ggml_vec_hardswish_f32(nc,
  (float *) ((char *) dst->data + i*( dst->nb[1])),
@@ -11775,6 +11712,8 @@ static void ggml_compute_forward_hardsigmoid_f32(
  const struct ggml_tensor * src0 = dst->src[0];

  assert(params->ith == 0);
+ assert(ggml_is_contiguous_1(src0));
+ assert(ggml_is_contiguous_1(dst));
  assert(ggml_are_same_shape(src0, dst));

  if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
@@ -11784,9 +11723,6 @@
  const int n = ggml_nrows(src0);
  const int nc = src0->ne[0];

- assert(dst->nb[0] == sizeof(float));
- assert(src0->nb[0] == sizeof(float));
-
  for (int i = 0; i < n; i++) {
  ggml_vec_hardsigmoid_f32(nc,
  (float *) ((char *) dst->data + i*( dst->nb[1])),
@@ -12237,39 +12173,6 @@ static void ggml_compute_forward_group_norm(

  // ggml_compute_forward_mul_mat

- #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
- // helper function to determine if it is better to use BLAS or not
- // for large matrices, BLAS is faster
- static bool ggml_compute_forward_mul_mat_use_blas(struct ggml_tensor * dst) {
- const struct ggml_tensor * src0 = dst->src[0];
- const struct ggml_tensor * src1 = dst->src[1];
-
- //const int64_t ne00 = src0->ne[0];
- //const int64_t ne01 = src0->ne[1];
-
- const int64_t ne10 = src1->ne[0];
-
- const int64_t ne0 = dst->ne[0];
- const int64_t ne1 = dst->ne[1];
-
- // NOTE: with GGML_OP_MUL_MAT_ID we don't want to go through the BLAS branch because it will dequantize (to_float)
- // all the experts for each batch element and the processing would become incredibly slow
- // TODO: find the optimal values for these
- if (dst->op != GGML_OP_MUL_MAT_ID &&
- ggml_is_contiguous(src0) &&
- ggml_is_contiguous(src1) &&
- //src0->type == GGML_TYPE_F32 &&
- src1->type == GGML_TYPE_F32 &&
- (ne0 >= 32 && ne1 >= 32 && ne10 >= 32)) {
-
- /*printf("BLAS: %d %d %d %d %d\n", ne0, ne1, ne10, ne00, ne01);*/
- return true;
- }
-
- return false;
- }
- #endif
-
  static void ggml_compute_forward_mul_mat_one_chunk(
  const struct ggml_compute_params * params,
  struct ggml_tensor * dst,
@@ -12407,82 +12310,6 @@ static void ggml_compute_forward_mul_mat(
  // nb01 >= nb00 - src0 is not transposed
  // compute by src0 rows

- #if defined(GGML_USE_CLBLAST)
- if (ggml_cl_can_mul_mat(src0, src1, dst)) {
- if (params->ith == 0 && params->type == GGML_TASK_TYPE_COMPUTE) {
- ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize);
- }
- return;
- }
- #endif
-
- #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
- if (ggml_compute_forward_mul_mat_use_blas(dst)) {
- const int64_t ne_plane = ne01*ne00;
- const size_t desired_wsize = ne13*ne12*ne_plane*sizeof(float);
- UNUSED(desired_wsize);
-
- if (params->type == GGML_TASK_TYPE_INIT) {
- if (type != GGML_TYPE_F32) {
- assert(params->wsize >= desired_wsize);
- // parallelize by src0 rows
- for (int64_t i13 = 0; i13 < ne13; i13++) {
- for (int64_t i12 = 0; i12 < ne12; i12++) {
- // broadcast src0 into src1 across 2nd,3rd dimension
- const int64_t i03 = i13/r3;
- const int64_t i02 = i12/r2;
-
- const void * x = (char *) src0->data + i02*nb02 + i03*nb03;
- float * const wdata = (float *) params->wdata + i13*ne12*ne_plane + i12*ne_plane;
- ggml_to_float_t const to_float = type_traits[type].to_float;
-
- for (int64_t i01 = ith; i01 < ne01; i01 += nth) {
- to_float((const char *) x + i01*nb01, wdata + i01*ne00, ne00);
- }
- }
- }
- }
- return;
- }
-
- if (params->type == GGML_TASK_TYPE_FINALIZE) {
- return;
- }
-
- // perform sgemm, parallelization controlled by blas lib
- if (ith != 0) {
- return;
- }
-
- //const int64_t tgemm0 = ggml_perf_time_us();
- for (int64_t i13 = 0; i13 < ne13; i13++) {
- for (int64_t i12 = 0; i12 < ne12; i12++) {
- const int64_t i03 = i13/r3;
- const int64_t i02 = i12/r2;
-
- const void * x = (char *) src0->data + i02*nb02 + i03*nb03;
- const float * y = (float *) ((char *) src1->data + i12*nb12 + i13*nb13);
- float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
-
- if (type != GGML_TYPE_F32) {
- x = (float *) params->wdata + i13*ne12*ne_plane + i12*ne_plane;
- }
-
- cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
- ne1, ne01, ne10,
- 1.0f, y, ne10,
- x, ne00,
- 0.0f, d, ne01);
- }
- }
- //printf("cblas_sgemm = %.3f ms, %lld flops\n", (ggml_perf_time_us() - tgemm0)/1000.0, ne13*ne12*ne1*ne01*ne10*2);
-
- //printf("CBLAS = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3);
-
- return;
- }
- #endif
-
  #if GGML_USE_LLAMAFILE
  const bool src1_cont = ggml_is_contiguous(src1);

@@ -12863,21 +12690,7 @@ static void ggml_compute_forward_out_prod_f32(
  // nb01 >= nb00 - src0 is not transposed
  // compute by src0 rows

- // TODO: #if defined(GGML_USE_CLBLAST)
-
- #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
- bool use_blas = ggml_is_matrix(src0) &&
- ggml_is_matrix(src1) &&
- ggml_is_contiguous(src0) &&
- (ggml_is_contiguous(src1) || ggml_is_transposed(src1));
- #endif
-
  if (params->type == GGML_TASK_TYPE_INIT) {
- #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) // gemm beta will zero dst
- if (use_blas) {
- return;
- }
- #endif
  if (ith != 0) {
  return;
  }
@@ -12889,50 +12702,6 @@
  return;
  }

- #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
- if (use_blas) {
- if (params->ith != 0) { // All threads other than the first do no work.
- return;
- }
- // Arguments to ggml_compute_forward_out_prod (expressed as major,minor)
- // src0: (k,n)
- // src1: (k,m)
- // dst: (m,n)
- //
- // Arguments to sgemm (see https://github.com/Reference-LAPACK/lapack/blob/master/BLAS/SRC/sgemm.f)
- // Also expressed as (major,minor)
- // a: (m,k): so src1 transposed
- // b: (k,n): so src0
- // c: (m,n)
- //
- // However, if ggml_is_transposed(src1) is true, then
- // src1->data already contains a transposed version, so sgemm mustn't
- // transpose it further.
-
- int n = src0->ne[0];
- int k = src0->ne[1];
- int m = src1->ne[0];
-
- int transposeA, lda;
-
- if (!ggml_is_transposed(src1)) {
- transposeA = CblasTrans;
- lda = m;
- } else {
- transposeA = CblasNoTrans;
- lda = k;
- }
-
- float * a = (float *) ((char *) src1->data);
- float * b = (float *) ((char *) src0->data);
- float * c = (float *) ((char *) dst->data);
-
- cblas_sgemm(CblasRowMajor, transposeA, CblasNoTrans, m, n, k, 1.0, a, lda, b, n, 0.0, c, n);
-
- return;
- }
- #endif
-
  // dst[:,:,:,:] = 0
  // for i2,i3:
  // for i1:
@@ -13062,8 +12831,6 @@ static void ggml_compute_forward_out_prod_q_f32(
  // nb01 >= nb00 - src0 is not transposed
  // compute by src0 rows

- // TODO: #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
-
  if (params->type == GGML_TASK_TYPE_INIT) {
  if (ith != 0) {
  return;
@@ -13460,6 +13227,8 @@ static void ggml_compute_forward_get_rows_q(
  const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10);
  const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);

+ assert(i01 >= 0 && i01 < ne01);
+
  dequantize_row_q(
  (const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03),
  (float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc);
@@ -13503,6 +13272,8 @@ static void ggml_compute_forward_get_rows_f16(
  const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10);
  const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);

+ assert(i01 >= 0 && i01 < ne01);
+
  ggml_fp16_to_fp32_row(
  (const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03),
  (float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc);
@@ -13546,7 +13317,9 @@ static void ggml_compute_forward_get_rows_bf16(
  const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10);
  const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);

- ggml_bf16_to_fp32_row(
+ assert(i01 >= 0 && i01 < ne01);
+
+ ggml_bf16_to_fp32_row(
  (const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03),
  (float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc);
  }
@@ -13589,6 +13362,8 @@ static void ggml_compute_forward_get_rows_f32(
  const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10);
  const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);

+ assert(i01 >= 0 && i01 < ne01);
+
  ggml_vec_cpy_f32(nc,
  (float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3),
  (float *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03));
@@ -14259,8 +14034,7 @@ static float rope_yarn_ramp(const float low, const float high, const int i0) {
  // MIT licensed. Copyright (c) 2023 Jeffrey Quesnelle and Bowen Peng.
  static void rope_yarn(
  float theta_extrap, float freq_scale, float corr_dims[2], int64_t i0, float ext_factor, float mscale,
- float * cos_theta, float * sin_theta
- ) {
+ float * cos_theta, float * sin_theta) {
  // Get n-d rotational scaling corrected for extrapolation
  float theta_interp = freq_scale * theta_extrap;
  float theta = theta_interp;
@@ -14277,18 +14051,19 @@

  // Apparently solving `n_rot = 2pi * x * base^((2 * max_pos_emb) / n_dims)` for x, we get
  // `corr_dim(n_rot) = n_dims * log(max_pos_emb / (n_rot * 2pi)) / (2 * log(base))`
- static float ggml_rope_yarn_corr_dim(int n_dims, int n_orig_ctx, float n_rot, float base) {
- return n_dims * logf(n_orig_ctx / (n_rot * 2 * (float)M_PI)) / (2 * logf(base));
+ static float ggml_rope_yarn_corr_dim(int n_dims, int n_ctx_orig, float n_rot, float base) {
+ return n_dims * logf(n_ctx_orig / (n_rot * 2 * (float)M_PI)) / (2 * logf(base));
  }

  static void ggml_rope_cache_init(
- float theta_base, float freq_scale, float corr_dims[2], int64_t ne0, float ext_factor, float mscale,
- float * cache, float sin_sign, float theta_scale
- ) {
+ float theta_base, float freq_scale, const float * freq_factors, float corr_dims[2], int64_t ne0, float ext_factor, float mscale,
+ float * cache, float sin_sign, float theta_scale) {
+ // ref: https://github.com/jquesnelle/yarn/blob/master/scaled_rope/LlamaYaRNScaledRotaryEmbedding.py
  float theta = theta_base;
  for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
+ const float ff = freq_factors ? freq_factors[i0/2] : 1.0f;
  rope_yarn(
- theta, freq_scale, corr_dims, i0, ext_factor, mscale, &cache[i0 + 0], &cache[i0 + 1]
+ theta/ff, freq_scale, corr_dims, i0, ext_factor, mscale, &cache[i0 + 0], &cache[i0 + 1]
  );
  cache[i0 + 1] *= sin_sign;

@@ -14297,11 +14072,11 @@
  }

  GGML_CALL void ggml_rope_yarn_corr_dims(
- int n_dims, int n_orig_ctx, float freq_base, float beta_fast, float beta_slow, float dims[2]
+ int n_dims, int n_ctx_orig, float freq_base, float beta_fast, float beta_slow, float dims[2]
  ) {
  // start and end correction dims
- float start = floorf(ggml_rope_yarn_corr_dim(n_dims, n_orig_ctx, beta_fast, freq_base));
- float end = ceilf(ggml_rope_yarn_corr_dim(n_dims, n_orig_ctx, beta_slow, freq_base));
+ float start = floorf(ggml_rope_yarn_corr_dim(n_dims, n_ctx_orig, beta_fast, freq_base));
+ float end = ceilf(ggml_rope_yarn_corr_dim(n_dims, n_ctx_orig, beta_slow, freq_base));
  dims[0] = MAX(0, start);
  dims[1] = MIN(n_dims - 1, end);
  }
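
Aside (a paraphrase, not text from the diff): reading ggml_rope_cache_init above together with theta_scale = powf(freq_base, -2.0f/n_dims) from ggml_compute_forward_rope_f32, the extrapolated angle handed to rope_yarn for the rotated pair at index i0 = 2d is, in effect,

    \theta_d(p) = \frac{p}{\mathrm{ff}_d} \cdot \mathrm{freq\_base}^{-2d/n_{\mathrm{dims}}}, \qquad \mathrm{ff}_d = \texttt{freq\_factors}[d] \text{ (or } 1 \text{ when freq\_factors is NULL)},

after which rope_yarn blends it with the freq_scale-interpolated angle according to corr_dims.
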
@@ -14321,15 +14096,11 @@ static void ggml_compute_forward_rope_f32(

  float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;

- // these two only relevant for xPos RoPE:
- float xpos_base;
- bool xpos_down;
-
  //const int n_past = ((int32_t *) dst->op_params)[0];
  const int n_dims = ((int32_t *) dst->op_params)[1];
  const int mode = ((int32_t *) dst->op_params)[2];
- const int n_ctx = ((int32_t *) dst->op_params)[3];
- const int n_orig_ctx = ((int32_t *) dst->op_params)[4];
+ //const int n_ctx = ((int32_t *) dst->op_params)[3];
+ const int n_ctx_orig = ((int32_t *) dst->op_params)[4];

  memcpy(&freq_base, (int32_t *) dst->op_params + 5, sizeof(float));
  memcpy(&freq_scale, (int32_t *) dst->op_params + 6, sizeof(float));
@@ -14337,8 +14108,6 @@ static void ggml_compute_forward_rope_f32(
  memcpy(&attn_factor, (int32_t *) dst->op_params + 8, sizeof(float));
  memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float));
  memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float));
- memcpy(&xpos_base, (int32_t *) dst->op_params + 11, sizeof(float));
- memcpy(&xpos_down, (int32_t *) dst->op_params + 12, sizeof(bool));

  GGML_TENSOR_UNARY_OP_LOCALS

@@ -14368,20 +14137,15 @@ static void ggml_compute_forward_rope_f32(
  const float theta_scale = powf(freq_base, -2.0f/n_dims);

  float corr_dims[2];
- ggml_rope_yarn_corr_dims(n_dims, n_orig_ctx, freq_base, beta_fast, beta_slow, corr_dims);
+ ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims);

  const bool is_neox = mode & 2;
- const bool is_glm = mode & 4;

  const float * freq_factors = NULL;
- if (is_neox) {
- if (src2 != NULL) {
- GGML_ASSERT(src2->type == GGML_TYPE_F32);
- GGML_ASSERT(src2->ne[0] >= n_dims / 2);
- freq_factors = (const float *) src2->data;
- }
- } else {
- GGML_ASSERT(src2 == NULL && "TODO: freq_factors not implemented for !is_neox");
+ if (src2 != NULL) {
+ GGML_ASSERT(src2->type == GGML_TYPE_F32);
+ GGML_ASSERT(src2->ne[0] >= n_dims / 2);
+ freq_factors = (const float *) src2->data;
  }

  // backward process uses inverse rotation by cos and sin.
@@ -14396,94 +14160,50 @@ static void ggml_compute_forward_rope_f32(
  const int64_t p = pos[i2];

  float * cache = (float *) params->wdata + (ne0 + CACHE_LINE_SIZE_F32)*ith;
- if (!is_glm && !is_neox) { // TODO: cache sin/cos for glm, neox
- ggml_rope_cache_init(p, freq_scale, corr_dims, ne0, ext_factor, attn_factor, cache, sin_sign, theta_scale);
- }
+ ggml_rope_cache_init(p, freq_scale, freq_factors, corr_dims, ne0, ext_factor, attn_factor, cache, sin_sign, theta_scale);

  for (int64_t i1 = 0; i1 < ne1; i1++) {
  if (ir++ < ir0) continue;
  if (ir > ir1) break;

- float theta_base = (float)p;
-
- if (is_glm) {
- theta_base = MIN(p, n_ctx - 2);
- float block_theta = MAX(p - (n_ctx - 2), 0);
- for (int64_t i0 = 0; i0 < ne0 / 4; i0++) {
- const float cos_theta = cosf(theta_base);
- const float sin_theta = sinf(theta_base) * sin_sign;
- const float cos_block_theta = cosf(block_theta);
- const float sin_block_theta = sinf(block_theta) * sin_sign;
-
- theta_base *= theta_scale;
- block_theta *= theta_scale;
-
- const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
- float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
-
- const float x0 = src[0];
- const float x1 = src[n_dims/2];
- const float x2 = src[n_dims];
- const float x3 = src[n_dims/2*3];
-
- dst_data[0] = x0*cos_theta - x1*sin_theta;
- dst_data[n_dims/2] = x0*sin_theta + x1*cos_theta;
- dst_data[n_dims] = x2*cos_block_theta - x3*sin_block_theta;
- dst_data[n_dims/2*3] = x2*sin_block_theta + x3*cos_block_theta;
- }
- } else if (!is_neox) {
- for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
+ if (!is_neox) {
+ for (int64_t i0 = 0; i0 < n_dims; i0 += 2) {
  const float cos_theta = cache[i0 + 0];
  const float sin_theta = cache[i0 + 1];

- // zeta scaling for xPos only:
- float zeta = xpos_base != 0.0f ? powf((i0 + 0.4f * ne0) / (1.4f * ne0), p / xpos_base) : 1.0f;
- if (xpos_down) zeta = 1.0f / zeta;
-
  const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
  float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);

  const float x0 = src[0];
  const float x1 = src[1];

- dst_data[0] = x0*cos_theta*zeta - x1*sin_theta*zeta;
- dst_data[1] = x0*sin_theta*zeta + x1*cos_theta*zeta;
+ dst_data[0] = x0*cos_theta - x1*sin_theta;
+ dst_data[1] = x0*sin_theta + x1*cos_theta;
  }
  } else {
- // ref: https://github.com/jquesnelle/yarn/blob/master/scaled_rope/LlamaYaRNScaledRotaryEmbedding.py
- for (int64_t ic = 0; ic < ne0; ic += 2) {
- if (ic < n_dims) {
- const int64_t i0 = ic/2;
-
- const float freq_factor = freq_factors ? freq_factors[i0] : 1.0f;
-
- float cos_theta, sin_theta;
- rope_yarn(
- theta_base/freq_factor, freq_scale, corr_dims, ic, ext_factor, attn_factor,
- &cos_theta, &sin_theta
- );
+ for (int64_t i0 = 0; i0 < n_dims; i0 += 2) {
+ const int64_t ic = i0/2;

- sin_theta *= sin_sign;
- theta_base *= theta_scale;
+ const float cos_theta = cache[i0 + 0];
+ const float sin_theta = cache[i0 + 1];

- const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
- float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
+ const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + ic*nb00);
+ float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + ic*nb0);

- const float x0 = src[0];
- const float x1 = src[n_dims/2];
+ const float x0 = src[0];
+ const float x1 = src[n_dims/2];

- dst_data[0] = x0*cos_theta - x1*sin_theta;
- dst_data[n_dims/2] = x0*sin_theta + x1*cos_theta;
- } else {
- const int64_t i0 = ic;
+ dst_data[0] = x0*cos_theta - x1*sin_theta;
+ dst_data[n_dims/2] = x0*sin_theta + x1*cos_theta;
+ }
+ }

- const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
- float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
+ for (int64_t i0 = n_dims; i0 < ne0; i0 += 2) {
+ const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
+ float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);

- dst_data[0] = src[0];
- dst_data[1] = src[1];
- }
- }
+ dst_data[0] = src[0];
+ dst_data[1] = src[1];
  }
  }
  }
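Note: after this hunk both RoPE layouts read their sin/cos values from the shared cache filled by ggml_rope_cache_init. The non-NeoX path rotates adjacent pairs (x[i0], x[i0+1]); the NeoX path pairs element ic with element ic + n_dims/2; dimensions at or beyond n_dims are now copied through unchanged. A minimal standalone sketch of the pairing (illustrative only, not the kernel itself):

// Rotate one (x0, x1) pair by the cached angle.
static void rotate_pair(float * x0, float * x1, float cos_theta, float sin_theta) {
    const float a = *x0, b = *x1;
    *x0 = a*cos_theta - b*sin_theta;
    *x1 = a*sin_theta + b*cos_theta;
}
// normal layout: pairs are (row[i0], row[i0 + 1])        for i0 = 0, 2, ..., n_dims - 2
// NeoX layout:   pairs are (row[ic], row[ic + n_dims/2]) for ic = 0, 1, ..., n_dims/2 - 1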
@@ -14509,8 +14229,8 @@ static void ggml_compute_forward_rope_f16(
  //const int n_past = ((int32_t *) dst->op_params)[0];
  const int n_dims = ((int32_t *) dst->op_params)[1];
  const int mode = ((int32_t *) dst->op_params)[2];
- const int n_ctx = ((int32_t *) dst->op_params)[3];
- const int n_orig_ctx = ((int32_t *) dst->op_params)[4];
+ //const int n_ctx = ((int32_t *) dst->op_params)[3];
+ const int n_ctx_orig = ((int32_t *) dst->op_params)[4];
  memcpy(&freq_base, (int32_t *) dst->op_params + 5, sizeof(float));
  memcpy(&freq_scale, (int32_t *) dst->op_params + 6, sizeof(float));
  memcpy(&ext_factor, (int32_t *) dst->op_params + 7, sizeof(float));
@@ -14546,20 +14266,15 @@ static void ggml_compute_forward_rope_f16(
  const float theta_scale = powf(freq_base, -2.0f/n_dims);

  float corr_dims[2];
- ggml_rope_yarn_corr_dims(n_dims, n_orig_ctx, freq_base, beta_fast, beta_slow, corr_dims);
+ ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims);

  const bool is_neox = mode & 2;
- const bool is_glm = mode & 4;

  const float * freq_factors = NULL;
- if (is_neox) {
- if (src2 != NULL) {
- GGML_ASSERT(src2->type == GGML_TYPE_F32);
- GGML_ASSERT(src2->ne[0] >= n_dims / 2);
- freq_factors = (const float *) src2->data;
- }
- } else {
- GGML_ASSERT(src2 == NULL && "TODO: freq_factors not implemented for !is_neox");
+ if (src2 != NULL) {
+ GGML_ASSERT(src2->type == GGML_TYPE_F32);
+ GGML_ASSERT(src2->ne[0] >= n_dims / 2);
+ freq_factors = (const float *) src2->data;
  }

  // backward process uses inverse rotation by cos and sin.
@@ -14574,43 +14289,14 @@ static void ggml_compute_forward_rope_f16(
  const int64_t p = pos[i2];

  float * cache = (float *) params->wdata + (ne0 + CACHE_LINE_SIZE_F32)*ith;
- if (!is_glm && !is_neox) { // TODO: cache sin/cos for glm, neox
- ggml_rope_cache_init(p, freq_scale, corr_dims, ne0, ext_factor, attn_factor, cache, sin_sign, theta_scale);
- }
+ ggml_rope_cache_init(p, freq_scale, freq_factors, corr_dims, ne0, ext_factor, attn_factor, cache, sin_sign, theta_scale);

  for (int64_t i1 = 0; i1 < ne1; i1++) {
  if (ir++ < ir0) continue;
  if (ir > ir1) break;

- float theta_base = (float)p;
-
- if (is_glm) {
- theta_base = MIN(p, n_ctx - 2);
- float block_theta = MAX(p - (n_ctx - 2), 0);
- for (int64_t i0 = 0; i0 < ne0 / 4; i0++) {
- const float cos_theta = cosf(theta_base);
- const float sin_theta = sinf(theta_base) * sin_sign;
- const float cos_block_theta = cosf(block_theta);
- const float sin_block_theta = sinf(block_theta) * sin_sign;
-
- theta_base *= theta_scale;
- block_theta *= theta_scale;
-
- const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
- ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
-
- const float x0 = GGML_FP16_TO_FP32(src[0]);
- const float x1 = GGML_FP16_TO_FP32(src[n_dims/2]);
- const float x2 = GGML_FP16_TO_FP32(src[n_dims]);
- const float x3 = GGML_FP16_TO_FP32(src[n_dims/2*3]);
-
- dst_data[0] = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta);
- dst_data[n_dims/2] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
- dst_data[n_dims] = GGML_FP32_TO_FP16(x2*cos_block_theta - x3*sin_block_theta);
- dst_data[n_dims/2*3] = GGML_FP32_TO_FP16(x2*sin_block_theta + x3*cos_block_theta);
- }
- } else if (!is_neox) {
- for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
+ if (!is_neox) {
+ for (int64_t i0 = 0; i0 < n_dims; i0 += 2) {
  const float cos_theta = cache[i0 + 0];
  const float sin_theta = cache[i0 + 1];

@@ -14624,40 +14310,29 @@ static void ggml_compute_forward_rope_f16(
  dst_data[1] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
  }
  } else {
- // ref: https://github.com/jquesnelle/yarn/blob/master/scaled_rope/LlamaYaRNScaledRotaryEmbedding.py
- for (int64_t ic = 0; ic < ne0; ic += 2) {
- if (ic < n_dims) {
- const int64_t i0 = ic/2;
+ for (int64_t i0 = 0; i0 < n_dims; i0 += 2) {
+ const int64_t ic = i0/2;

- const float freq_factor = freq_factors ? freq_factors[i0] : 1.0f;
-
- float cos_theta, sin_theta;
- rope_yarn(
- theta_base/freq_factor, freq_scale, corr_dims, ic, ext_factor, attn_factor,
- &cos_theta, &sin_theta
- );
-
- sin_theta *= sin_sign;
- theta_base *= theta_scale;
+ const float cos_theta = cache[i0 + 0];
+ const float sin_theta = cache[i0 + 1];

- const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
- ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
+ const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + ic*nb00);
+ ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + ic*nb0);

- const float x0 = GGML_FP16_TO_FP32(src[0]);
- const float x1 = GGML_FP16_TO_FP32(src[n_dims/2]);
+ const float x0 = GGML_FP16_TO_FP32(src[0]);
+ const float x1 = GGML_FP16_TO_FP32(src[n_dims/2]);

- dst_data[0] = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta);
- dst_data[n_dims/2] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
- } else {
- const int64_t i0 = ic;
+ dst_data[0] = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta);
+ dst_data[n_dims/2] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
+ }
+ }

- const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
- ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
+ for (int64_t i0 = n_dims; i0 < ne0; i0 += 2) {
+ const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
+ ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);

- dst_data[0] = src[0];
- dst_data[1] = src[1];
- }
- }
+ dst_data[0] = src[0];
+ dst_data[1] = src[1];
  }
  }
  }
@@ -16844,7 +16519,10 @@ static void ggml_compute_forward_map_unary_f32(

  const struct ggml_tensor * src0 = dst->src[0];

- GGML_ASSERT(ggml_are_same_shape(src0, dst));
+ assert(params->ith == 0);
+ assert(ggml_is_contiguous_1(src0));
+ assert(ggml_is_contiguous_1(dst));
+ assert(ggml_are_same_shape(src0, dst));

  if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
  return;
@@ -16853,9 +16531,6 @@ static void ggml_compute_forward_map_unary_f32(
  const int n = ggml_nrows(src0);
  const int nc = src0->ne[0];

- assert( dst->nb[0] == sizeof(float));
- assert(src0->nb[0] == sizeof(float));
-
  for (int i = 0; i < n; i++) {
  fun(nc,
  (float *) ((char *) dst->data + i*( dst->nb[1])),
@@ -16893,6 +16568,9 @@ static void ggml_compute_forward_map_binary_f32(
  const struct ggml_tensor * src1 = dst->src[1];

  assert(params->ith == 0);
+ assert(ggml_is_contiguous_1(src0));
+ assert(ggml_is_contiguous_1(src1));
+ assert(ggml_is_contiguous_1(dst));
  assert(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));

  if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
@@ -16902,10 +16580,6 @@ static void ggml_compute_forward_map_binary_f32(
  const int n = ggml_nrows(src0);
  const int nc = src0->ne[0];

- assert( dst->nb[0] == sizeof(float));
- assert(src0->nb[0] == sizeof(float));
- assert(src1->nb[0] == sizeof(float));
-
  for (int i = 0; i < n; i++) {
  fun(nc,
  (float *) ((char *) dst->data + i*( dst->nb[1])),
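Note: the custom map operators now assert ggml_is_contiguous_1 on their operands instead of only checking the element stride, because each row is handed to the user callback as a dense float array. A hedged sketch of the access pattern those assertions protect (a standalone illustration, not library code):

#include <stddef.h>

// Valid only when each row is densely packed floats, i.e. nb[0] == sizeof(float).
static void apply_rowwise(float * dst, const float * src,
                          int n_rows, int n_cols,
                          size_t dst_row_stride, size_t src_row_stride,
                          void (*fun)(int n, float * y, const float * x)) {
    for (int i = 0; i < n_rows; i++) {
        fun(n_cols,
            (float *)       ((char *)       dst + i*dst_row_stride),
            (const float *) ((const char *) src + i*src_row_stride));
    }
}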
@@ -18359,9 +18033,9 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
  //const int n_past = ((int32_t *) tensor->op_params)[0];
  const int n_dims = ((int32_t *) tensor->op_params)[1];
  const int mode = ((int32_t *) tensor->op_params)[2];
- const int n_ctx = ((int32_t *) tensor->op_params)[3];
- const int n_orig_ctx = ((int32_t *) tensor->op_params)[4];
- float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow, xpos_base, xpos_down;
+ //const int n_ctx = ((int32_t *) tensor->op_params)[3];
+ const int n_ctx_orig = ((int32_t *) tensor->op_params)[4];
+ float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;

  memcpy(&freq_base, (int32_t *) tensor->op_params + 5, sizeof(float));
  memcpy(&freq_scale, (int32_t *) tensor->op_params + 6, sizeof(float));
@@ -18369,8 +18043,6 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
  memcpy(&attn_factor, (int32_t *) tensor->op_params + 8, sizeof(float));
  memcpy(&beta_fast, (int32_t *) tensor->op_params + 9, sizeof(float));
  memcpy(&beta_slow, (int32_t *) tensor->op_params + 10, sizeof(float));
- memcpy(&xpos_base, (int32_t *) tensor->op_params + 11, sizeof(float));
- memcpy(&xpos_down, (int32_t *) tensor->op_params + 12, sizeof(bool));

  src0->grad = ggml_add_or_set(ctx,
  src0->grad,
@@ -18380,16 +18052,13 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
  src2,
  n_dims,
  mode,
- n_ctx,
- n_orig_ctx,
+ n_ctx_orig,
  freq_base,
  freq_scale,
  ext_factor,
  attn_factor,
  beta_fast,
- beta_slow,
- xpos_base,
- xpos_down),
+ beta_slow),
  zero_table);
  }
  } break;
@@ -18399,9 +18068,9 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
  //const int n_past = ((int32_t *) tensor->op_params)[0];
  const int n_dims = ((int32_t *) tensor->op_params)[1];
  const int mode = ((int32_t *) tensor->op_params)[2];
- const int n_ctx = ((int32_t *) tensor->op_params)[3];
- const int n_orig_ctx = ((int32_t *) tensor->op_params)[4];
- float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow, xpos_base, xpos_down;
+ //const int n_ctx = ((int32_t *) tensor->op_params)[3];
+ const int n_ctx_orig = ((int32_t *) tensor->op_params)[4];
+ float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;

  memcpy(&freq_base, (int32_t *) tensor->op_params + 5, sizeof(float));
  memcpy(&freq_scale, (int32_t *) tensor->op_params + 6, sizeof(float));
@@ -18409,8 +18078,6 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
  memcpy(&attn_factor, (int32_t *) tensor->op_params + 8, sizeof(float));
  memcpy(&beta_fast, (int32_t *) tensor->op_params + 9, sizeof(float));
  memcpy(&beta_slow, (int32_t *) tensor->op_params + 10, sizeof(float));
- memcpy(&xpos_base, (int32_t *) tensor->op_params + 11, sizeof(float));
- memcpy(&xpos_down, (int32_t *) tensor->op_params + 12, sizeof(bool));

  src0->grad = ggml_add_or_set(ctx,
  src0->grad,
@@ -18420,16 +18087,13 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
  src2,
  n_dims,
  mode,
- n_ctx,
- n_orig_ctx,
+ n_ctx_orig,
  freq_base,
  freq_scale,
  ext_factor,
  attn_factor,
  beta_fast,
  beta_slow,
- xpos_base,
- xpos_down,
  false),
  zero_table);
  }
@@ -19073,6 +18737,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads, int n_cur_
  switch (node->op) {
  case GGML_OP_CPY:
  case GGML_OP_DUP:
+ case GGML_OP_CONT:
  case GGML_OP_ADD:
  case GGML_OP_ADD1:
  case GGML_OP_ACC:
@@ -19157,7 +18822,6 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads, int n_cur_
  } break;
  case GGML_OP_SCALE:
  case GGML_OP_SET:
- case GGML_OP_CONT:
  case GGML_OP_RESHAPE:
  case GGML_OP_VIEW:
  case GGML_OP_PERMUTE:
@@ -19317,8 +18981,11 @@ static void ggml_graph_compute_thread_sync_node(int * node_n, struct ggml_comput
  sched_yield();
  }

- * node_n = atomic_load(&state->shared->node_n);
- if (* node_n != last_node_n) break;
+ *node_n = atomic_load(&state->shared->node_n);
+ if (*node_n != last_node_n) {
+ break;
+ }
+
  #if defined(__SSE3__)
  // Tell the processor we're spinning. It's a processor hint for spinlocks.
  _mm_pause();
@@ -19328,15 +18995,18 @@ static void ggml_graph_compute_thread_sync_node(int * node_n, struct ggml_comput

  static void ggml_graph_compute_thread_sync_task(int * task_phase, struct ggml_compute_state * state, const bool do_yield) {
  // wait for other threads to finish
- const int last_task_phase = * task_phase;
+ const int last_task_phase = *task_phase;

  while (true) {
  if (do_yield) {
  sched_yield();
  }

- * task_phase = atomic_load(&state->shared->node_task);
- if (* task_phase != last_task_phase) break;
+ *task_phase = atomic_load(&state->shared->node_task);
+ if (*task_phase != last_task_phase) {
+ break;
+ }
+
  #if defined(__SSE3__)
  // Tell the processor we're spinning. It's a processor hint for spinlocks.
  _mm_pause();
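Note: the two sync helpers above keep the same spin-wait shape, only reformatted with braces around the break. The general pattern (a hedged standalone sketch, not the ggml implementation itself) is: optionally yield, reload the shared atomic, stop when it changes, and issue a pause hint while spinning:

#include <stdatomic.h>
#include <sched.h>
#if defined(__SSE3__)
#include <immintrin.h>
#endif

// Spin until *flag changes from last_seen; returns the new value.
static int spin_until_changed(atomic_int * flag, int last_seen, int do_yield) {
    while (1) {
        if (do_yield) {
            sched_yield();
        }
        const int cur = atomic_load(flag);
        if (cur != last_seen) {
            return cur;
        }
#if defined(__SSE3__)
        _mm_pause(); // processor hint: this is a spin loop
#endif
    }
}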
@@ -19536,22 +19206,6 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
  {
  const enum ggml_type vec_dot_type = type_traits[node->src[0]->type].vec_dot_type;

- #if defined(GGML_USE_CLBLAST)
- if (ggml_cl_can_mul_mat(node->src[0], node->src[1], node)) {
- cur = ggml_cl_mul_mat_get_wsize(node->src[0], node->src[1], node);
- } else
- #endif
- #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
- if (ggml_compute_forward_mul_mat_use_blas(node)) {
- if (node->src[0]->type != GGML_TYPE_F32) {
- // here we need memory for fully dequantized matrix from src0
- // take into account that src0 can be broadcasted into src1[2,3]
- cur = ggml_type_size(GGML_TYPE_F32)
- * node->src[0]->ne[0]*node->src[0]->ne[1]
- * node->src[1]->ne[2]*node->src[1]->ne[3];
- }
- } else
- #endif
  if (node->src[1]->type != vec_dot_type) {
  cur = ggml_row_size(vec_dot_type, ggml_nelements(node->src[1]));
  }
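Note: with the CLBlast and Accelerate/OpenBLAS special cases removed, the mul_mat work buffer only has to hold src1 quantized to vec_dot_type. The caller-side flow around ggml_graph_plan is unchanged; a hedged usage sketch follows, assuming an existing struct ggml_cgraph * graph and an n_threads value, with error handling omitted and the work buffer allocated with plain malloc for illustration:

// Plan the graph, provide the work buffer the plan requests, then compute.
struct ggml_cplan cplan = ggml_graph_plan(graph, n_threads);
uint8_t * work_data = NULL;
if (cplan.work_size > 0) {
    work_data = malloc(cplan.work_size);
    cplan.work_data = work_data;
}
enum ggml_status status = ggml_graph_compute(graph, &cplan);
free(work_data);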
@@ -19670,6 +19324,59 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
  return cplan;
  }

+ static enum ggml_status ggml_graph_compute_parallel(struct ggml_compute_state * workers, int n_threads) {
+ enum ggml_status compute_status = GGML_STATUS_SUCCESS;
+
+ #ifdef GGML_USE_OPENMP
+ if (n_threads > 1) {
+ #pragma omp parallel num_threads(n_threads)
+ {
+ #pragma omp single
+ {
+ // update the number of threads from the actual number of threads that we got from OpenMP
+ n_threads = omp_get_num_threads();
+ workers[0].shared->n_threads = n_threads;
+ workers[0].shared->n_active = n_threads;
+ }
+ ggml_graph_compute_thread(&workers[omp_get_thread_num()]);
+ }
+ } else {
+ ggml_graph_compute_thread(&workers[0]);
+ }
+ #else
+ // create thread pool
+ if (n_threads > 1) {
+ for (int j = 1; j < n_threads; ++j) {
+ const int rc = ggml_thread_create(&workers[j].thrd, NULL, ggml_graph_compute_thread, &workers[j]);
+ GGML_ASSERT(rc == 0);
+ UNUSED(rc);
+ }
+ }
+
+ // this is a work thread too
+ ggml_graph_compute_thread(&workers[0]);
+
+ // join or kill thread pool
+ if (n_threads > 1) {
+ for (int j = 1; j < n_threads; j++) {
+ const int rc = ggml_thread_join(workers[j].thrd, NULL);
+ GGML_ASSERT(rc == 0);
+ UNUSED(rc);
+ }
+ }
+ #endif
+ // don't leave affinity set on the main thread
+ clear_numa_thread_affinity();
+
+ for (int j = 0; j < n_threads; j++) {
+ if (workers[j].ec != GGML_STATUS_SUCCESS) {
+ compute_status = workers[j].ec;
+ break;
+ }
+ }
+ return compute_status;
+ }
+
  enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
  {
  GGML_ASSERT(cplan);
@@ -19680,7 +19387,11 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
  }
  }

- const int n_threads = cplan->n_threads;
+ int n_threads = cplan->n_threads;
+
+ #if defined(GGML_USE_OPENMP)
+ n_threads = MIN(n_threads, omp_get_max_threads());
+ #endif

  struct ggml_compute_state_shared state_shared = {
  /*.cgraph =*/ cgraph,
@@ -19696,47 +19407,20 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
  /*.current_chunk; =*/ 0,
  };
  struct ggml_compute_state * workers = alloca(sizeof(struct ggml_compute_state)*n_threads);
-
- // create thread pool
- if (n_threads > 1) {
- for (int j = 1; j < n_threads; ++j) {
- workers[j] = (struct ggml_compute_state) {
- .thrd = 0,
- .ith = j,
- .shared = &state_shared,
- .ec = GGML_STATUS_SUCCESS,
- };
-
- const int rc = ggml_thread_create(&workers[j].thrd, NULL, ggml_graph_compute_thread, &workers[j]);
- GGML_ASSERT(rc == 0);
- UNUSED(rc);
- }
- }
-
- workers[0].ith = 0;
- workers[0].shared = &state_shared;
- workers[0].ec = GGML_STATUS_SUCCESS;
-
  const int64_t perf_start_cycles = ggml_perf_cycles();
  const int64_t perf_start_time_us = ggml_perf_time_us();

- // this is a work thread too
- ggml_graph_compute_thread(&workers[0]);
- enum ggml_status compute_status = workers[0].ec;
-
- // don't leave affinity set on the main thread
- clear_numa_thread_affinity();
-
- // join or kill thread pool
- if (n_threads > 1) {
- for (int j = 1; j < n_threads; j++) {
- const int rc = ggml_thread_join(workers[j].thrd, NULL);
- GGML_ASSERT(rc == 0);
- if (workers[j].ec != GGML_STATUS_SUCCESS)
- compute_status = workers[j].ec;
- }
+ for (int j = 0; j < n_threads; ++j) {
+ workers[j] = (struct ggml_compute_state) {
+ .thrd = 0,
+ .ith = j,
+ .shared = &state_shared,
+ .ec = GGML_STATUS_SUCCESS,
+ };
  }

+ enum ggml_status compute_status = ggml_graph_compute_parallel(workers, n_threads);
+
  // performance stats (graph)
  {
  int64_t perf_cycles_cur = ggml_perf_cycles() - perf_start_cycles;
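Note: the new ggml_graph_compute_parallel prefers OpenMP when GGML_USE_OPENMP is defined and falls back to the existing pthread pool otherwise. In the OpenMP branch, one parallel region is opened, a single thread records the team size actually granted, and every team member then runs the per-thread worker. A standalone sketch of that pattern with a dummy worker (illustrative only; compile with -fopenmp):

#include <omp.h>
#include <stdio.h>

static void worker(int ith, int nth) {
    // stand-in for ggml_graph_compute_thread(&workers[ith])
    printf("thread %d of %d\n", ith, nth);
}

int main(void) {
    int n_threads = 8; // requested; the runtime may grant fewer
#pragma omp parallel num_threads(n_threads)
    {
#pragma omp single
        {
            n_threads = omp_get_num_threads(); // actual team size
        }
        // implicit barrier after 'single': all threads see the updated count
        worker(omp_get_thread_num(), n_threads);
    }
    return 0;
}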
@@ -22819,7 +22503,7 @@ int ggml_cpu_has_wasm_simd(void) {
  }

  int ggml_cpu_has_blas(void) {
- #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUDA) || defined(GGML_USE_VULKAN) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_SYCL)
+ #if defined(GGML_USE_BLAS) || defined(GGML_USE_CUDA) || defined(GGML_USE_VULKAN) || defined(GGML_USE_SYCL)
  return 1;
  #else
  return 0;
@@ -22834,14 +22518,6 @@ int ggml_cpu_has_cuda(void) {
  #endif
  }

- int ggml_cpu_has_clblast(void) {
- #if defined(GGML_USE_CLBLAST)
- return 1;
- #else
- return 0;
- #endif
- }
-
  int ggml_cpu_has_vulkan(void) {
  #if defined(GGML_USE_VULKAN)
  return 1;
@@ -22875,8 +22551,7 @@ int ggml_cpu_has_rpc(void) {
  }

  int ggml_cpu_has_gpublas(void) {
- return ggml_cpu_has_cuda() || ggml_cpu_has_clblast() || ggml_cpu_has_vulkan() || ggml_cpu_has_kompute() ||
- ggml_cpu_has_sycl();
+ return ggml_cpu_has_cuda() || ggml_cpu_has_vulkan() || ggml_cpu_has_kompute() || ggml_cpu_has_sycl();
  }

  int ggml_cpu_has_sse3(void) {
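Note: ggml_cpu_has_blas now reports the new GGML_USE_BLAS backend (alongside CUDA, Vulkan and SYCL), and the CLBlast query is gone from both ggml_cpu_has_blas and ggml_cpu_has_gpublas. A small hedged sketch of how an application might log the compiled-in capabilities using these existing queries:

printf("blas=%d cuda=%d vulkan=%d sycl=%d gpublas=%d\n",
       ggml_cpu_has_blas(), ggml_cpu_has_cuda(), ggml_cpu_has_vulkan(),
       ggml_cpu_has_sycl(), ggml_cpu_has_gpublas());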