llama_cpp 0.15.4 → 0.16.0

Files changed (147)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +10 -0
  3. data/ext/llama_cpp/extconf.rb +1 -2
  4. data/ext/llama_cpp/llama_cpp.cpp +15 -3
  5. data/lib/llama_cpp/version.rb +2 -2
  6. data/sig/llama_cpp.rbs +13 -1
  7. data/vendor/tmp/llama.cpp/Makefile +62 -35
  8. data/vendor/tmp/llama.cpp/ggml-alloc.c +4 -4
  9. data/vendor/tmp/llama.cpp/ggml-backend.c +5 -5
  10. data/vendor/tmp/llama.cpp/ggml-backend.h +1 -1
  11. data/vendor/tmp/llama.cpp/ggml-cuda/acc.cu +47 -0
  12. data/vendor/tmp/llama.cpp/ggml-cuda/arange.cu +34 -0
  13. data/vendor/tmp/llama.cpp/ggml-cuda/argsort.cu +103 -0
  14. data/vendor/tmp/llama.cpp/ggml-cuda/binbcast.cu +280 -0
  15. data/vendor/tmp/llama.cpp/ggml-cuda/clamp.cu +34 -0
  16. data/vendor/tmp/llama.cpp/ggml-cuda/concat.cu +196 -0
  17. data/vendor/tmp/llama.cpp/ggml-cuda/convert.cu +686 -0
  18. data/vendor/tmp/llama.cpp/ggml-cuda/cpy.cu +490 -0
  19. data/vendor/tmp/llama.cpp/ggml-cuda/diagmask.cu +40 -0
  20. data/vendor/tmp/llama.cpp/ggml-cuda/dmmv.cu +662 -0
  21. data/vendor/tmp/llama.cpp/ggml-cuda/fattn-tile-f16.cu +319 -0
  22. data/vendor/tmp/llama.cpp/ggml-cuda/fattn-tile-f32.cu +312 -0
  23. data/vendor/tmp/llama.cpp/ggml-cuda/fattn.cu +345 -0
  24. data/vendor/tmp/llama.cpp/ggml-cuda/getrows.cu +178 -0
  25. data/vendor/tmp/llama.cpp/ggml-cuda/im2col.cu +104 -0
  26. data/vendor/tmp/llama.cpp/ggml-cuda/mmq.cu +1564 -0
  27. data/vendor/tmp/llama.cpp/ggml-cuda/mmvq.cu +404 -0
  28. data/vendor/tmp/llama.cpp/ggml-cuda/norm.cu +221 -0
  29. data/vendor/tmp/llama.cpp/ggml-cuda/pad.cu +49 -0
  30. data/vendor/tmp/llama.cpp/ggml-cuda/pool2d.cu +94 -0
  31. data/vendor/tmp/llama.cpp/ggml-cuda/quantize.cu +45 -0
  32. data/vendor/tmp/llama.cpp/ggml-cuda/rope.cu +271 -0
  33. data/vendor/tmp/llama.cpp/ggml-cuda/scale.cu +31 -0
  34. data/vendor/tmp/llama.cpp/ggml-cuda/softmax.cu +205 -0
  35. data/vendor/tmp/llama.cpp/ggml-cuda/sumrows.cu +40 -0
  36. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu +5 -0
  37. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu +5 -0
  38. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu +5 -0
  39. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu +5 -0
  40. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu +5 -0
  41. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu +5 -0
  42. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu +5 -0
  43. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu +5 -0
  44. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu +5 -0
  45. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu +5 -0
  46. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu +5 -0
  47. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu +5 -0
  48. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu +5 -0
  49. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu +5 -0
  50. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu +5 -0
  51. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu +5 -0
  52. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu +5 -0
  53. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu +5 -0
  54. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu +5 -0
  55. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu +5 -0
  56. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu +5 -0
  57. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu +5 -0
  58. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu +5 -0
  59. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu +5 -0
  60. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu +5 -0
  61. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu +5 -0
  62. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu +5 -0
  63. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu +5 -0
  64. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu +5 -0
  65. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu +5 -0
  66. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu +5 -0
  67. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu +5 -0
  68. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu +5 -0
  69. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu +5 -0
  70. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu +5 -0
  71. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu +5 -0
  72. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu +5 -0
  73. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu +5 -0
  74. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu +5 -0
  75. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu +5 -0
  76. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu +5 -0
  77. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu +5 -0
  78. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu +5 -0
  79. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu +5 -0
  80. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu +5 -0
  81. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu +5 -0
  82. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu +5 -0
  83. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu +5 -0
  84. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu +5 -0
  85. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu +5 -0
  86. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu +5 -0
  87. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu +5 -0
  88. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu +5 -0
  89. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu +5 -0
  90. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu +5 -0
  91. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu +5 -0
  92. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu +5 -0
  93. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu +5 -0
  94. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu +5 -0
  95. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu +5 -0
  96. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu +5 -0
  97. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu +5 -0
  98. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu +5 -0
  99. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu +5 -0
  100. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu +5 -0
  101. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu +5 -0
  102. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu +5 -0
  103. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu +5 -0
  104. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu +5 -0
  105. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu +5 -0
  106. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu +5 -0
  107. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu +5 -0
  108. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu +5 -0
  109. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu +5 -0
  110. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu +5 -0
  111. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu +5 -0
  112. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu +5 -0
  113. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu +5 -0
  114. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu +5 -0
  115. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu +5 -0
  116. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu +5 -0
  117. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu +5 -0
  118. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu +5 -0
  119. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu +5 -0
  120. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu +5 -0
  121. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu +5 -0
  122. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb16.cu +10 -0
  123. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb32.cu +9 -0
  124. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb16.cu +10 -0
  125. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb32.cu +10 -0
  126. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb8.cu +8 -0
  127. data/vendor/tmp/llama.cpp/ggml-cuda/tsembd.cu +47 -0
  128. data/vendor/tmp/llama.cpp/ggml-cuda/unary.cu +266 -0
  129. data/vendor/tmp/llama.cpp/ggml-cuda/upscale.cu +51 -0
  130. data/vendor/tmp/llama.cpp/ggml-cuda.cu +8 -6
  131. data/vendor/tmp/llama.cpp/ggml-kompute.cpp +21 -6
  132. data/vendor/tmp/llama.cpp/ggml-metal.h +1 -1
  133. data/vendor/tmp/llama.cpp/ggml-metal.m +34 -24
  134. data/vendor/tmp/llama.cpp/ggml-metal.metal +83 -59
  135. data/vendor/tmp/llama.cpp/ggml-rpc.cpp +2 -2
  136. data/vendor/tmp/llama.cpp/ggml-sycl.cpp +7 -67
  137. data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +99301 -39793
  138. data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +456 -329
  139. data/vendor/tmp/llama.cpp/ggml.c +178 -330
  140. data/vendor/tmp/llama.cpp/ggml.h +9 -28
  141. data/vendor/tmp/llama.cpp/llama.cpp +242 -426
  142. data/vendor/tmp/llama.cpp/llama.h +17 -43
  143. metadata +121 -6
  144. data/vendor/tmp/llama.cpp/ggml-mpi.c +0 -216
  145. data/vendor/tmp/llama.cpp/ggml-mpi.h +0 -39
  146. data/vendor/tmp/llama.cpp/ggml-opencl.cpp +0 -2305
  147. data/vendor/tmp/llama.cpp/ggml-opencl.h +0 -36
data/vendor/tmp/llama.cpp/ggml.c CHANGED
@@ -5,6 +5,7 @@
  #include "ggml-quants.h"
  #include "ggml.h"
 
+
  #if defined(_MSC_VER) || defined(__MINGW32__)
  #include <malloc.h> // using malloc.h with MSC/MINGW
  #elif !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__)
@@ -28,6 +29,10 @@
  #include <syscall.h>
  #endif
 
+ #ifdef GGML_USE_OPENMP
+ #include <omp.h>
+ #endif
+
  #ifdef GGML_USE_METAL
  #include <unistd.h>
  #endif
@@ -292,17 +297,12 @@ inline static void * ggml_calloc(size_t num, size_t size) {
 
  #if defined(GGML_USE_ACCELERATE)
  #include <Accelerate/Accelerate.h>
- #if defined(GGML_USE_CLBLAST) // allow usage of CLBlast alongside Accelerate functions
- #include "ggml-opencl.h"
- #endif
  #elif defined(GGML_USE_OPENBLAS)
  #if defined(GGML_BLAS_USE_MKL)
  #include <mkl.h>
  #else
  #include <cblas.h>
  #endif
- #elif defined(GGML_USE_CLBLAST)
- #include "ggml-opencl.h"
  #endif
 
  // floating point type used to accumulate sums
@@ -1756,7 +1756,7 @@ struct ggml_compute_state_shared {
  int64_t perf_node_start_cycles;
  int64_t perf_node_start_time_us;
 
- const int n_threads;
+ int n_threads;
 
  // synchronization primitives
  atomic_int n_active; // num active threads
@@ -2267,6 +2267,11 @@ inline static float ggml_silu_f32(float x) {
  return x/(1.0f + expf(-x));
  }
 
+ #if __FINITE_MATH_ONLY__
+ #error "some routines in ggml.c require non-finite math arithmetics -- pass -fno-finite-math-only to the compiler to fix"
+ #error "ref: https://github.com/ggerganov/llama.cpp/pull/7154#issuecomment-2143844461"
+ #endif
+
  #if defined(__ARM_NEON) && defined(__aarch64__)
 
  // adapted from arm limited optimized routine
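Note: the new __FINITE_MATH_ONLY__ guard exists because parts of ggml.c depend on IEEE non-finite arithmetic, e.g. masked attention logits set to -INFINITY are expected to drive expf to exactly zero during softmax (see the referenced PR #7154). A minimal standalone sketch of the behavior the guard protects, assuming a conforming libm; it is illustrative only and not part of the patch:

#include <math.h>
#include <stdio.h>

int main(void) {
    // with default math flags this prints 0.000000; under
    // -ffinite-math-only the compiler may assume the operand is finite
    // and miscompile such expressions, which is why ggml now refuses to build
    const float masked_logit = -INFINITY;
    printf("expf(-inf)  = %f\n", expf(masked_logit));
    printf("isinf(-inf) = %d\n", isinf(masked_logit) != 0);
    return 0;
}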
@@ -3370,10 +3375,6 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
  GGML_PRINT_DEBUG("%s: g_state initialized in %f ms\n", __func__, (t_end - t_start)/1000.0f);
  }
 
- #if defined(GGML_USE_CLBLAST)
- ggml_cl_init();
- #endif
-
  ggml_setup_op_has_task_pass();
 
  is_first_call = false;
@@ -6249,16 +6250,13 @@ static struct ggml_tensor * ggml_rope_impl(
  struct ggml_tensor * c,
  int n_dims,
  int mode,
- int n_ctx,
- int n_orig_ctx,
+ int n_ctx_orig,
  float freq_base,
  float freq_scale,
  float ext_factor,
  float attn_factor,
  float beta_fast,
  float beta_slow,
- float xpos_base,
- bool xpos_down,
  bool inplace) {
  GGML_ASSERT((mode & 1) == 0 && "mode & 1 == 1 is no longer supported");
 
@@ -6279,15 +6277,13 @@ static struct ggml_tensor * ggml_rope_impl(
 
  struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
 
- int32_t params[13] = { /*n_past*/ 0, n_dims, mode, n_ctx, n_orig_ctx };
+ int32_t params[11] = { /*n_past*/ 0, n_dims, mode, /*n_ctx*/ 0, n_ctx_orig };
  memcpy(params + 5, &freq_base, sizeof(float));
  memcpy(params + 6, &freq_scale, sizeof(float));
  memcpy(params + 7, &ext_factor, sizeof(float));
  memcpy(params + 8, &attn_factor, sizeof(float));
  memcpy(params + 9, &beta_fast, sizeof(float));
  memcpy(params + 10, &beta_slow, sizeof(float));
- memcpy(params + 11, &xpos_base, sizeof(float));
- memcpy(params + 12, &xpos_down, sizeof(bool));
  ggml_set_op_params(result, params, sizeof(params));
 
  result->op = GGML_OP_ROPE;
@@ -6304,10 +6300,9 @@ struct ggml_tensor * ggml_rope(
  struct ggml_tensor * a,
  struct ggml_tensor * b,
  int n_dims,
- int mode,
- int n_ctx) {
+ int mode) {
  return ggml_rope_impl(
- ctx, a, b, NULL, n_dims, mode, n_ctx, 0, 10000.0f, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, false, false
+ ctx, a, b, NULL, n_dims, mode, 0, 10000.0f, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f, false
  );
  }
 
@@ -6316,10 +6311,9 @@ struct ggml_tensor * ggml_rope_inplace(
  struct ggml_tensor * a,
  struct ggml_tensor * b,
  int n_dims,
- int mode,
- int n_ctx) {
+ int mode) {
  return ggml_rope_impl(
- ctx, a, b, NULL, n_dims, mode, n_ctx, 0, 10000.0f, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, false, true
+ ctx, a, b, NULL, n_dims, mode, 0, 10000.0f, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f, true
  );
  }
 
@@ -6330,8 +6324,7 @@ struct ggml_tensor * ggml_rope_ext(
  struct ggml_tensor * c,
  int n_dims,
  int mode,
- int n_ctx,
- int n_orig_ctx,
+ int n_ctx_orig,
  float freq_base,
  float freq_scale,
  float ext_factor,
@@ -6339,8 +6332,8 @@ struct ggml_tensor * ggml_rope_ext(
  float beta_fast,
  float beta_slow) {
  return ggml_rope_impl(
- ctx, a, b, c, n_dims, mode, n_ctx, n_orig_ctx, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow, 0.0f, false, false
+ ctx, a, b, c, n_dims, mode, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow, false
  );
  }
 
@@ -6351,8 +6344,7 @@ struct ggml_tensor * ggml_rope_ext_inplace(
  struct ggml_tensor * c,
  int n_dims,
  int mode,
- int n_ctx,
- int n_orig_ctx,
+ int n_ctx_orig,
  float freq_base,
  float freq_scale,
  float ext_factor,
@@ -6360,8 +6352,8 @@ struct ggml_tensor * ggml_rope_ext_inplace(
  float beta_fast,
  float beta_slow) {
  return ggml_rope_impl(
- ctx, a, b, c, n_dims, mode, n_ctx, n_orig_ctx, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow, 0.0f, false, true
+ ctx, a, b, c, n_dims, mode, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow, true
  );
  }
 
@@ -6371,8 +6363,7 @@ struct ggml_tensor * ggml_rope_custom(
  struct ggml_tensor * b,
  int n_dims,
  int mode,
- int n_ctx,
- int n_orig_ctx,
+ int n_ctx_orig,
  float freq_base,
  float freq_scale,
  float ext_factor,
@@ -6380,8 +6371,8 @@ struct ggml_tensor * ggml_rope_custom(
  float beta_fast,
  float beta_slow) {
  return ggml_rope_impl(
- ctx, a, b, NULL, n_dims, mode, n_ctx, n_orig_ctx, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow, 0.0f, false, false
+ ctx, a, b, NULL, n_dims, mode, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow, false
  );
  }
 
@@ -6391,8 +6382,7 @@ struct ggml_tensor * ggml_rope_custom_inplace(
  struct ggml_tensor * b,
  int n_dims,
  int mode,
- int n_ctx,
- int n_orig_ctx,
+ int n_ctx_orig,
  float freq_base,
  float freq_scale,
  float ext_factor,
@@ -6400,21 +6390,11 @@ struct ggml_tensor * ggml_rope_custom_inplace(
  float beta_fast,
  float beta_slow) {
  return ggml_rope_impl(
- ctx, a, b, NULL, n_dims, mode, n_ctx, n_orig_ctx, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow, 0.0f, false, true
+ ctx, a, b, NULL, n_dims, mode, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow, true
  );
  }
 
- struct ggml_tensor * ggml_rope_xpos_inplace(
- struct ggml_context * ctx,
- struct ggml_tensor * a,
- struct ggml_tensor * b,
- int n_dims,
- float base,
- bool down) {
- return ggml_rope_impl(ctx, a, b, NULL, n_dims, 0, 0, 0, 10000.0f, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f, base, down, true);
- }
-
  // ggml_rope_back
 
  struct ggml_tensor * ggml_rope_back(
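Note: for callers, the visible API change in the hunks above is that n_ctx is gone from the public RoPE constructors (only n_ctx_orig survives, on the extended variants), the xPos parameters are dropped from ggml_rope_impl, and ggml_rope_xpos_inplace is removed outright. A sketch of an updated call site, matching the signatures in this diff; the tensors Qcur and inp_pos and the numeric values are illustrative placeholders, not part of the patch:

// before: ggml_rope(ctx, Qcur, inp_pos, n_rot, mode, n_ctx);
// after:  only the extended variant needs the trained context length
struct ggml_tensor * q_basic = ggml_rope(ctx, Qcur, inp_pos, n_rot, /*mode=*/0);

struct ggml_tensor * q_yarn = ggml_rope_ext(
    ctx, Qcur, inp_pos, /*freq_factors=*/NULL, n_rot, /*mode=*/0,
    /*n_ctx_orig=*/4096,                     // original training context, was n_orig_ctx
    /*freq_base=*/10000.0f, /*freq_scale=*/1.0f,
    /*ext_factor=*/0.0f, /*attn_factor=*/1.0f,
    /*beta_fast=*/32.0f, /*beta_slow=*/1.0f); // typical YaRN ramp values, illustrative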
@@ -6424,16 +6404,13 @@ struct ggml_tensor * ggml_rope_back(
  struct ggml_tensor * c,
  int n_dims,
  int mode,
- int n_ctx,
- int n_orig_ctx,
+ int n_ctx_orig,
  float freq_base,
  float freq_scale,
  float ext_factor,
  float attn_factor,
  float beta_fast,
- float beta_slow,
- float xpos_base,
- bool xpos_down) {
+ float beta_slow) {
  GGML_ASSERT(ggml_is_vector(b));
  GGML_ASSERT(b->type == GGML_TYPE_I32);
  GGML_ASSERT(a->ne[2] == b->ne[0]);
@@ -6449,15 +6426,13 @@ struct ggml_tensor * ggml_rope_back(
 
  struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
 
- int32_t params[13] = { /*n_past*/ 0, n_dims, mode, n_ctx, n_orig_ctx };
+ int32_t params[11] = { /*n_past*/ 0, n_dims, mode, /*n_ctx*/ 0, n_ctx_orig };
  memcpy(params + 5, &freq_base, sizeof(float));
  memcpy(params + 6, &freq_scale, sizeof(float));
  memcpy(params + 7, &ext_factor, sizeof(float));
  memcpy(params + 8, &attn_factor, sizeof(float));
  memcpy(params + 9, &beta_fast, sizeof(float));
  memcpy(params + 10, &beta_slow, sizeof(float));
- memcpy(params + 11, &xpos_base, sizeof(float));
- memcpy(params + 12, &xpos_down, sizeof(bool));
  ggml_set_op_params(result, params, sizeof(params));
 
  result->op = GGML_OP_ROPE_BACK;
@@ -9043,17 +9018,6 @@ static void ggml_compute_forward_add_f32(
  const int ith = params->ith;
  const int nth = params->nth;
 
- #ifdef GGML_USE_CLBLAST
- if (src1->backend == GGML_BACKEND_TYPE_GPU) {
- // TODO: OpenCL kernel support full broadcast
- GGML_ASSERT(ggml_can_repeat_rows(src1, src0));
- if (ith == 0) {
- ggml_cl_add(src0, src1, dst);
- }
- return;
- }
- #endif
-
  const int nr = ggml_nrows(src0);
 
  GGML_TENSOR_BINARY_OP_LOCALS
@@ -10161,17 +10125,6 @@ static void ggml_compute_forward_mul_f32(
  const int ith = params->ith;
  const int nth = params->nth;
 
- #if defined(GGML_USE_CLBLAST)
- if (src1->backend == GGML_BACKEND_TYPE_GPU) {
- // TODO: OpenCL kernel support full broadcast
- GGML_ASSERT(ggml_can_repeat_rows(src1, src0));
- if (ith == 0) {
- ggml_cl_mul(src0, src1, dst);
- }
- return;
- }
- #endif
-
  const int64_t nr = ggml_nrows(src0);
 
  GGML_TENSOR_BINARY_OP_LOCALS
@@ -12407,15 +12360,6 @@ static void ggml_compute_forward_mul_mat(
  // nb01 >= nb00 - src0 is not transposed
  // compute by src0 rows
 
- #if defined(GGML_USE_CLBLAST)
- if (ggml_cl_can_mul_mat(src0, src1, dst)) {
- if (params->ith == 0 && params->type == GGML_TASK_TYPE_COMPUTE) {
- ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize);
- }
- return;
- }
- #endif
-
  #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
  if (ggml_compute_forward_mul_mat_use_blas(dst)) {
  const int64_t ne_plane = ne01*ne00;
@@ -12863,8 +12807,6 @@ static void ggml_compute_forward_out_prod_f32(
  // nb01 >= nb00 - src0 is not transposed
  // compute by src0 rows
 
- // TODO: #if defined(GGML_USE_CLBLAST)
-
  #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
  bool use_blas = ggml_is_matrix(src0) &&
  ggml_is_matrix(src1) &&
@@ -13062,7 +13004,7 @@ static void ggml_compute_forward_out_prod_q_f32(
  // nb01 >= nb00 - src0 is not transposed
  // compute by src0 rows
 
- // TODO: #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
+ // TODO: #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
 
  if (params->type == GGML_TASK_TYPE_INIT) {
  if (ith != 0) {
@@ -14259,8 +14201,7 @@ static float rope_yarn_ramp(const float low, const float high, const int i0) {
  // MIT licensed. Copyright (c) 2023 Jeffrey Quesnelle and Bowen Peng.
  static void rope_yarn(
  float theta_extrap, float freq_scale, float corr_dims[2], int64_t i0, float ext_factor, float mscale,
- float * cos_theta, float * sin_theta
- ) {
+ float * cos_theta, float * sin_theta) {
  // Get n-d rotational scaling corrected for extrapolation
  float theta_interp = freq_scale * theta_extrap;
  float theta = theta_interp;
@@ -14277,18 +14218,19 @@ static void rope_yarn(
 
  // Apparently solving `n_rot = 2pi * x * base^((2 * max_pos_emb) / n_dims)` for x, we get
  // `corr_dim(n_rot) = n_dims * log(max_pos_emb / (n_rot * 2pi)) / (2 * log(base))`
- static float ggml_rope_yarn_corr_dim(int n_dims, int n_orig_ctx, float n_rot, float base) {
- return n_dims * logf(n_orig_ctx / (n_rot * 2 * (float)M_PI)) / (2 * logf(base));
+ static float ggml_rope_yarn_corr_dim(int n_dims, int n_ctx_orig, float n_rot, float base) {
+ return n_dims * logf(n_ctx_orig / (n_rot * 2 * (float)M_PI)) / (2 * logf(base));
  }
 
  static void ggml_rope_cache_init(
- float theta_base, float freq_scale, float corr_dims[2], int64_t ne0, float ext_factor, float mscale,
- float * cache, float sin_sign, float theta_scale
- ) {
+ float theta_base, float freq_scale, const float * freq_factors, float corr_dims[2], int64_t ne0, float ext_factor, float mscale,
+ float * cache, float sin_sign, float theta_scale) {
+ // ref: https://github.com/jquesnelle/yarn/blob/master/scaled_rope/LlamaYaRNScaledRotaryEmbedding.py
  float theta = theta_base;
  for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
+ const float ff = freq_factors ? freq_factors[i0/2] : 1.0f;
  rope_yarn(
- theta, freq_scale, corr_dims, i0, ext_factor, mscale, &cache[i0 + 0], &cache[i0 + 1]
+ theta/ff, freq_scale, corr_dims, i0, ext_factor, mscale, &cache[i0 + 0], &cache[i0 + 1]
  );
  cache[i0 + 1] *= sin_sign;
 
@@ -14297,11 +14239,11 @@ static void ggml_rope_cache_init(
  }
 
  GGML_CALL void ggml_rope_yarn_corr_dims(
- int n_dims, int n_orig_ctx, float freq_base, float beta_fast, float beta_slow, float dims[2]
+ int n_dims, int n_ctx_orig, float freq_base, float beta_fast, float beta_slow, float dims[2]
  ) {
  // start and end correction dims
- float start = floorf(ggml_rope_yarn_corr_dim(n_dims, n_orig_ctx, beta_fast, freq_base));
- float end = ceilf(ggml_rope_yarn_corr_dim(n_dims, n_orig_ctx, beta_slow, freq_base));
+ float start = floorf(ggml_rope_yarn_corr_dim(n_dims, n_ctx_orig, beta_fast, freq_base));
+ float end = ceilf(ggml_rope_yarn_corr_dim(n_dims, n_ctx_orig, beta_slow, freq_base));
  dims[0] = MAX(0, start);
  dims[1] = MIN(n_dims - 1, end);
  }
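Note: aside from renaming n_orig_ctx to n_ctx_orig and threading freq_factors through ggml_rope_cache_init (each pair's angle is divided by its per-dimension frequency factor before the YaRN correction), the math above is unchanged. Written out from the code, the correction-dimension helper computes

\[ \mathrm{corr\_dim}(n_{\mathrm{rot}}) \;=\; \frac{n_{\mathrm{dims}}}{2\,\ln(\mathrm{base})}\,\ln\!\left(\frac{n_{\mathrm{ctx\_orig}}}{2\pi\, n_{\mathrm{rot}}}\right) \]

and ggml_rope_yarn_corr_dims clamps the ramp endpoints to \( \big[\max(0, \lfloor \mathrm{corr\_dim}(\beta_{\mathrm{fast}}) \rfloor),\; \min(n_{\mathrm{dims}}-1, \lceil \mathrm{corr\_dim}(\beta_{\mathrm{slow}}) \rceil)\big] \).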
@@ -14321,15 +14263,11 @@
 
  float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
 
- // these two only relevant for xPos RoPE:
- float xpos_base;
- bool xpos_down;
-
  //const int n_past = ((int32_t *) dst->op_params)[0];
  const int n_dims = ((int32_t *) dst->op_params)[1];
  const int mode = ((int32_t *) dst->op_params)[2];
- const int n_ctx = ((int32_t *) dst->op_params)[3];
- const int n_orig_ctx = ((int32_t *) dst->op_params)[4];
+ //const int n_ctx = ((int32_t *) dst->op_params)[3];
+ const int n_ctx_orig = ((int32_t *) dst->op_params)[4];
 
  memcpy(&freq_base, (int32_t *) dst->op_params + 5, sizeof(float));
  memcpy(&freq_scale, (int32_t *) dst->op_params + 6, sizeof(float));
@@ -14337,8 +14275,6 @@
  memcpy(&attn_factor, (int32_t *) dst->op_params + 8, sizeof(float));
  memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float));
  memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float));
- memcpy(&xpos_base, (int32_t *) dst->op_params + 11, sizeof(float));
- memcpy(&xpos_down, (int32_t *) dst->op_params + 12, sizeof(bool));
 
  GGML_TENSOR_UNARY_OP_LOCALS
 
@@ -14368,20 +14304,15 @@
  const float theta_scale = powf(freq_base, -2.0f/n_dims);
 
  float corr_dims[2];
- ggml_rope_yarn_corr_dims(n_dims, n_orig_ctx, freq_base, beta_fast, beta_slow, corr_dims);
+ ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims);
 
  const bool is_neox = mode & 2;
- const bool is_glm = mode & 4;
 
  const float * freq_factors = NULL;
- if (is_neox) {
- if (src2 != NULL) {
- GGML_ASSERT(src2->type == GGML_TYPE_F32);
- GGML_ASSERT(src2->ne[0] >= n_dims / 2);
- freq_factors = (const float *) src2->data;
- }
- } else {
- GGML_ASSERT(src2 == NULL && "TODO: freq_factors not implemented for !is_neox");
+ if (src2 != NULL) {
+ GGML_ASSERT(src2->type == GGML_TYPE_F32);
+ GGML_ASSERT(src2->ne[0] >= n_dims / 2);
+ freq_factors = (const float *) src2->data;
  }
 
  // backward process uses inverse rotation by cos and sin.
@@ -14396,94 +14327,50 @@
  const int64_t p = pos[i2];
 
  float * cache = (float *) params->wdata + (ne0 + CACHE_LINE_SIZE_F32)*ith;
- if (!is_glm && !is_neox) { // TODO: cache sin/cos for glm, neox
- ggml_rope_cache_init(p, freq_scale, corr_dims, ne0, ext_factor, attn_factor, cache, sin_sign, theta_scale);
- }
+ ggml_rope_cache_init(p, freq_scale, freq_factors, corr_dims, ne0, ext_factor, attn_factor, cache, sin_sign, theta_scale);
 
  for (int64_t i1 = 0; i1 < ne1; i1++) {
  if (ir++ < ir0) continue;
  if (ir > ir1) break;
 
- float theta_base = (float)p;
-
- if (is_glm) {
- theta_base = MIN(p, n_ctx - 2);
- float block_theta = MAX(p - (n_ctx - 2), 0);
- for (int64_t i0 = 0; i0 < ne0 / 4; i0++) {
- const float cos_theta = cosf(theta_base);
- const float sin_theta = sinf(theta_base) * sin_sign;
- const float cos_block_theta = cosf(block_theta);
- const float sin_block_theta = sinf(block_theta) * sin_sign;
-
- theta_base *= theta_scale;
- block_theta *= theta_scale;
-
- const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
- float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
-
- const float x0 = src[0];
- const float x1 = src[n_dims/2];
- const float x2 = src[n_dims];
- const float x3 = src[n_dims/2*3];
-
- dst_data[0] = x0*cos_theta - x1*sin_theta;
- dst_data[n_dims/2] = x0*sin_theta + x1*cos_theta;
- dst_data[n_dims] = x2*cos_block_theta - x3*sin_block_theta;
- dst_data[n_dims/2*3] = x2*sin_block_theta + x3*cos_block_theta;
- }
- } else if (!is_neox) {
- for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
+ if (!is_neox) {
+ for (int64_t i0 = 0; i0 < n_dims; i0 += 2) {
  const float cos_theta = cache[i0 + 0];
  const float sin_theta = cache[i0 + 1];
 
- // zeta scaling for xPos only:
- float zeta = xpos_base != 0.0f ? powf((i0 + 0.4f * ne0) / (1.4f * ne0), p / xpos_base) : 1.0f;
- if (xpos_down) zeta = 1.0f / zeta;
-
  const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
  float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
 
  const float x0 = src[0];
  const float x1 = src[1];
 
- dst_data[0] = x0*cos_theta*zeta - x1*sin_theta*zeta;
- dst_data[1] = x0*sin_theta*zeta + x1*cos_theta*zeta;
+ dst_data[0] = x0*cos_theta - x1*sin_theta;
+ dst_data[1] = x0*sin_theta + x1*cos_theta;
  }
  } else {
- // ref: https://github.com/jquesnelle/yarn/blob/master/scaled_rope/LlamaYaRNScaledRotaryEmbedding.py
- for (int64_t ic = 0; ic < ne0; ic += 2) {
- if (ic < n_dims) {
- const int64_t i0 = ic/2;
+ for (int64_t i0 = 0; i0 < n_dims; i0 += 2) {
+ const int64_t ic = i0/2;
 
- const float freq_factor = freq_factors ? freq_factors[i0] : 1.0f;
-
- float cos_theta, sin_theta;
- rope_yarn(
- theta_base/freq_factor, freq_scale, corr_dims, ic, ext_factor, attn_factor,
- &cos_theta, &sin_theta
- );
-
- sin_theta *= sin_sign;
- theta_base *= theta_scale;
+ const float cos_theta = cache[i0 + 0];
+ const float sin_theta = cache[i0 + 1];
 
- const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
- float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
+ const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + ic*nb00);
+ float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + ic*nb0);
 
- const float x0 = src[0];
- const float x1 = src[n_dims/2];
+ const float x0 = src[0];
+ const float x1 = src[n_dims/2];
 
- dst_data[0] = x0*cos_theta - x1*sin_theta;
- dst_data[n_dims/2] = x0*sin_theta + x1*cos_theta;
- } else {
- const int64_t i0 = ic;
+ dst_data[0] = x0*cos_theta - x1*sin_theta;
+ dst_data[n_dims/2] = x0*sin_theta + x1*cos_theta;
+ }
+ }
 
- const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
- float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
+ for (int64_t i0 = n_dims; i0 < ne0; i0 += 2) {
+ const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
+ float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
 
- dst_data[0] = src[0];
- dst_data[1] = src[1];
- }
- }
+ dst_data[0] = src[0];
+ dst_data[1] = src[1];
  }
  }
  }
@@ -14509,8 +14396,8 @@ static void ggml_compute_forward_rope_f16(
  //const int n_past = ((int32_t *) dst->op_params)[0];
  const int n_dims = ((int32_t *) dst->op_params)[1];
  const int mode = ((int32_t *) dst->op_params)[2];
- const int n_ctx = ((int32_t *) dst->op_params)[3];
- const int n_orig_ctx = ((int32_t *) dst->op_params)[4];
+ //const int n_ctx = ((int32_t *) dst->op_params)[3];
+ const int n_ctx_orig = ((int32_t *) dst->op_params)[4];
  memcpy(&freq_base, (int32_t *) dst->op_params + 5, sizeof(float));
  memcpy(&freq_scale, (int32_t *) dst->op_params + 6, sizeof(float));
  memcpy(&ext_factor, (int32_t *) dst->op_params + 7, sizeof(float));
@@ -14546,20 +14433,15 @@
  const float theta_scale = powf(freq_base, -2.0f/n_dims);
 
  float corr_dims[2];
- ggml_rope_yarn_corr_dims(n_dims, n_orig_ctx, freq_base, beta_fast, beta_slow, corr_dims);
+ ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims);
 
  const bool is_neox = mode & 2;
- const bool is_glm = mode & 4;
 
  const float * freq_factors = NULL;
- if (is_neox) {
- if (src2 != NULL) {
- GGML_ASSERT(src2->type == GGML_TYPE_F32);
- GGML_ASSERT(src2->ne[0] >= n_dims / 2);
- freq_factors = (const float *) src2->data;
- }
- } else {
- GGML_ASSERT(src2 == NULL && "TODO: freq_factors not implemented for !is_neox");
+ if (src2 != NULL) {
+ GGML_ASSERT(src2->type == GGML_TYPE_F32);
+ GGML_ASSERT(src2->ne[0] >= n_dims / 2);
+ freq_factors = (const float *) src2->data;
  }
 
  // backward process uses inverse rotation by cos and sin.
@@ -14574,43 +14456,14 @@
  const int64_t p = pos[i2];
 
  float * cache = (float *) params->wdata + (ne0 + CACHE_LINE_SIZE_F32)*ith;
- if (!is_glm && !is_neox) { // TODO: cache sin/cos for glm, neox
- ggml_rope_cache_init(p, freq_scale, corr_dims, ne0, ext_factor, attn_factor, cache, sin_sign, theta_scale);
- }
+ ggml_rope_cache_init(p, freq_scale, freq_factors, corr_dims, ne0, ext_factor, attn_factor, cache, sin_sign, theta_scale);
 
  for (int64_t i1 = 0; i1 < ne1; i1++) {
  if (ir++ < ir0) continue;
  if (ir > ir1) break;
 
- float theta_base = (float)p;
-
- if (is_glm) {
- theta_base = MIN(p, n_ctx - 2);
- float block_theta = MAX(p - (n_ctx - 2), 0);
- for (int64_t i0 = 0; i0 < ne0 / 4; i0++) {
- const float cos_theta = cosf(theta_base);
- const float sin_theta = sinf(theta_base) * sin_sign;
- const float cos_block_theta = cosf(block_theta);
- const float sin_block_theta = sinf(block_theta) * sin_sign;
-
- theta_base *= theta_scale;
- block_theta *= theta_scale;
-
- const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
- ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
-
- const float x0 = GGML_FP16_TO_FP32(src[0]);
- const float x1 = GGML_FP16_TO_FP32(src[n_dims/2]);
- const float x2 = GGML_FP16_TO_FP32(src[n_dims]);
- const float x3 = GGML_FP16_TO_FP32(src[n_dims/2*3]);
-
- dst_data[0] = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta);
- dst_data[n_dims/2] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
- dst_data[n_dims] = GGML_FP32_TO_FP16(x2*cos_block_theta - x3*sin_block_theta);
- dst_data[n_dims/2*3] = GGML_FP32_TO_FP16(x2*sin_block_theta + x3*cos_block_theta);
- }
- } else if (!is_neox) {
- for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
+ if (!is_neox) {
+ for (int64_t i0 = 0; i0 < n_dims; i0 += 2) {
  const float cos_theta = cache[i0 + 0];
  const float sin_theta = cache[i0 + 1];
 
@@ -14624,40 +14477,29 @@
  dst_data[1] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
  }
  } else {
- // ref: https://github.com/jquesnelle/yarn/blob/master/scaled_rope/LlamaYaRNScaledRotaryEmbedding.py
- for (int64_t ic = 0; ic < ne0; ic += 2) {
- if (ic < n_dims) {
- const int64_t i0 = ic/2;
-
- const float freq_factor = freq_factors ? freq_factors[i0] : 1.0f;
-
- float cos_theta, sin_theta;
- rope_yarn(
- theta_base/freq_factor, freq_scale, corr_dims, ic, ext_factor, attn_factor,
- &cos_theta, &sin_theta
- );
+ for (int64_t i0 = 0; i0 < n_dims; i0 += 2) {
+ const int64_t ic = i0/2;
 
- sin_theta *= sin_sign;
- theta_base *= theta_scale;
+ const float cos_theta = cache[i0 + 0];
+ const float sin_theta = cache[i0 + 1];
 
- const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
- ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
+ const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + ic*nb00);
+ ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + ic*nb0);
 
- const float x0 = GGML_FP16_TO_FP32(src[0]);
- const float x1 = GGML_FP16_TO_FP32(src[n_dims/2]);
+ const float x0 = GGML_FP16_TO_FP32(src[0]);
+ const float x1 = GGML_FP16_TO_FP32(src[n_dims/2]);
 
- dst_data[0] = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta);
- dst_data[n_dims/2] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
- } else {
- const int64_t i0 = ic;
+ dst_data[0] = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta);
+ dst_data[n_dims/2] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
+ }
+ }
 
- const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
- ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
+ for (int64_t i0 = n_dims; i0 < ne0; i0 += 2) {
+ const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
+ ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
 
- dst_data[0] = src[0];
- dst_data[1] = src[1];
- }
- }
+ dst_data[0] = src[0];
+ dst_data[1] = src[1];
  }
  }
  }
@@ -18359,9 +18201,9 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
  //const int n_past = ((int32_t *) tensor->op_params)[0];
  const int n_dims = ((int32_t *) tensor->op_params)[1];
  const int mode = ((int32_t *) tensor->op_params)[2];
- const int n_ctx = ((int32_t *) tensor->op_params)[3];
- const int n_orig_ctx = ((int32_t *) tensor->op_params)[4];
- float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow, xpos_base, xpos_down;
+ //const int n_ctx = ((int32_t *) tensor->op_params)[3];
+ const int n_ctx_orig = ((int32_t *) tensor->op_params)[4];
+ float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
 
  memcpy(&freq_base, (int32_t *) tensor->op_params + 5, sizeof(float));
  memcpy(&freq_scale, (int32_t *) tensor->op_params + 6, sizeof(float));
@@ -18369,8 +18211,6 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
  memcpy(&attn_factor, (int32_t *) tensor->op_params + 8, sizeof(float));
  memcpy(&beta_fast, (int32_t *) tensor->op_params + 9, sizeof(float));
  memcpy(&beta_slow, (int32_t *) tensor->op_params + 10, sizeof(float));
- memcpy(&xpos_base, (int32_t *) tensor->op_params + 11, sizeof(float));
- memcpy(&xpos_down, (int32_t *) tensor->op_params + 12, sizeof(bool));
 
  src0->grad = ggml_add_or_set(ctx,
  src0->grad,
@@ -18380,16 +18220,13 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
  src2,
  n_dims,
  mode,
- n_ctx,
- n_orig_ctx,
+ n_ctx_orig,
  freq_base,
  freq_scale,
  ext_factor,
  attn_factor,
  beta_fast,
- beta_slow,
- xpos_base,
- xpos_down),
+ beta_slow),
  zero_table);
  }
  } break;
@@ -18399,9 +18236,9 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
  //const int n_past = ((int32_t *) tensor->op_params)[0];
  const int n_dims = ((int32_t *) tensor->op_params)[1];
  const int mode = ((int32_t *) tensor->op_params)[2];
- const int n_ctx = ((int32_t *) tensor->op_params)[3];
- const int n_orig_ctx = ((int32_t *) tensor->op_params)[4];
- float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow, xpos_base, xpos_down;
+ //const int n_ctx = ((int32_t *) tensor->op_params)[3];
+ const int n_ctx_orig = ((int32_t *) tensor->op_params)[4];
+ float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
 
  memcpy(&freq_base, (int32_t *) tensor->op_params + 5, sizeof(float));
  memcpy(&freq_scale, (int32_t *) tensor->op_params + 6, sizeof(float));
@@ -18409,8 +18246,6 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
  memcpy(&attn_factor, (int32_t *) tensor->op_params + 8, sizeof(float));
  memcpy(&beta_fast, (int32_t *) tensor->op_params + 9, sizeof(float));
  memcpy(&beta_slow, (int32_t *) tensor->op_params + 10, sizeof(float));
- memcpy(&xpos_base, (int32_t *) tensor->op_params + 11, sizeof(float));
- memcpy(&xpos_down, (int32_t *) tensor->op_params + 12, sizeof(bool));
 
  src0->grad = ggml_add_or_set(ctx,
  src0->grad,
@@ -18420,16 +18255,13 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
  src2,
  n_dims,
  mode,
- n_ctx,
- n_orig_ctx,
+ n_ctx_orig,
  freq_base,
  freq_scale,
  ext_factor,
  attn_factor,
  beta_fast,
  beta_slow,
- xpos_base,
- xpos_down,
  false),
  zero_table);
  }
@@ -19536,11 +19368,6 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
  {
  const enum ggml_type vec_dot_type = type_traits[node->src[0]->type].vec_dot_type;
 
- #if defined(GGML_USE_CLBLAST)
- if (ggml_cl_can_mul_mat(node->src[0], node->src[1], node)) {
- cur = ggml_cl_mul_mat_get_wsize(node->src[0], node->src[1], node);
- } else
- #endif
  #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
  if (ggml_compute_forward_mul_mat_use_blas(node)) {
  if (node->src[0]->type != GGML_TYPE_F32) {
@@ -19670,6 +19497,59 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
  return cplan;
  }
 
+ static enum ggml_status ggml_graph_compute_parallel(struct ggml_compute_state * workers, int n_threads) {
+ enum ggml_status compute_status = GGML_STATUS_SUCCESS;
+
+ #ifdef GGML_USE_OPENMP
+ if (n_threads > 1) {
+ #pragma omp parallel num_threads(n_threads)
+ {
+ #pragma omp single
+ {
+ // update the number of threads from the actual number of threads that we got from OpenMP
+ n_threads = omp_get_num_threads();
+ workers[0].shared->n_threads = n_threads;
+ workers[0].shared->n_active = n_threads;
+ }
+ ggml_graph_compute_thread(&workers[omp_get_thread_num()]);
+ }
+ } else {
+ ggml_graph_compute_thread(&workers[0]);
+ }
+ #else
+ // create thread pool
+ if (n_threads > 1) {
+ for (int j = 1; j < n_threads; ++j) {
+ const int rc = ggml_thread_create(&workers[j].thrd, NULL, ggml_graph_compute_thread, &workers[j]);
+ GGML_ASSERT(rc == 0);
+ UNUSED(rc);
+ }
+ }
+
+ // this is a work thread too
+ ggml_graph_compute_thread(&workers[0]);
+
+ // join or kill thread pool
+ if (n_threads > 1) {
+ for (int j = 1; j < n_threads; j++) {
+ const int rc = ggml_thread_join(workers[j].thrd, NULL);
+ GGML_ASSERT(rc == 0);
+ UNUSED(rc);
+ }
+ }
+ #endif
+ // don't leave affinity set on the main thread
+ clear_numa_thread_affinity();
+
+ for (int j = 0; j < n_threads; j++) {
+ if (workers[j].ec != GGML_STATUS_SUCCESS) {
+ compute_status = workers[j].ec;
+ break;
+ }
+ }
+ return compute_status;
+ }
+
  enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
  {
  GGML_ASSERT(cplan);
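Note: this new function is the core of the 0.16.0 CPU-side change: graph execution can now be driven by OpenMP instead of the hand-rolled pthread pool. A self-contained sketch of the parallel/single idiom used above, where do_work is a hypothetical stand-in for ggml_graph_compute_thread; build with -fopenmp:

#include <omp.h>
#include <stdio.h>

static void do_work(int ith, int nth) {
    printf("thread %d of %d\n", ith, nth); // each thread handles slice ith
}

int main(void) {
    int n_threads = 8; // requested; OpenMP may grant fewer
    #pragma omp parallel num_threads(n_threads)
    {
        #pragma omp single
        {
            // one thread records the actual team size; the implicit
            // barrier at the end of 'single' publishes it to every thread
            n_threads = omp_get_num_threads();
        }
        do_work(omp_get_thread_num(), n_threads);
    }
    return 0;
}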
@@ -19680,7 +19560,11 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
  }
  }
 
- const int n_threads = cplan->n_threads;
+ int n_threads = cplan->n_threads;
+
+ #if defined(GGML_USE_OPENMP)
+ n_threads = MIN(n_threads, omp_get_max_threads());
+ #endif
 
  struct ggml_compute_state_shared state_shared = {
  /*.cgraph =*/ cgraph,
@@ -19696,47 +19580,20 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
  /*.current_chunk; =*/ 0,
  };
  struct ggml_compute_state * workers = alloca(sizeof(struct ggml_compute_state)*n_threads);
-
- // create thread pool
- if (n_threads > 1) {
- for (int j = 1; j < n_threads; ++j) {
- workers[j] = (struct ggml_compute_state) {
- .thrd = 0,
- .ith = j,
- .shared = &state_shared,
- .ec = GGML_STATUS_SUCCESS,
- };
-
- const int rc = ggml_thread_create(&workers[j].thrd, NULL, ggml_graph_compute_thread, &workers[j]);
- GGML_ASSERT(rc == 0);
- UNUSED(rc);
- }
- }
-
- workers[0].ith = 0;
- workers[0].shared = &state_shared;
- workers[0].ec = GGML_STATUS_SUCCESS;
-
  const int64_t perf_start_cycles = ggml_perf_cycles();
  const int64_t perf_start_time_us = ggml_perf_time_us();
 
- // this is a work thread too
- ggml_graph_compute_thread(&workers[0]);
- enum ggml_status compute_status = workers[0].ec;
-
- // don't leave affinity set on the main thread
- clear_numa_thread_affinity();
-
- // join or kill thread pool
- if (n_threads > 1) {
- for (int j = 1; j < n_threads; j++) {
- const int rc = ggml_thread_join(workers[j].thrd, NULL);
- GGML_ASSERT(rc == 0);
- if (workers[j].ec != GGML_STATUS_SUCCESS)
- compute_status = workers[j].ec;
- }
+ for (int j = 0; j < n_threads; ++j) {
+ workers[j] = (struct ggml_compute_state) {
+ .thrd = 0,
+ .ith = j,
+ .shared = &state_shared,
+ .ec = GGML_STATUS_SUCCESS,
+ };
  }
 
+ enum ggml_status compute_status = ggml_graph_compute_parallel(workers, n_threads);
+
  // performance stats (graph)
  {
  int64_t perf_cycles_cur = ggml_perf_cycles() - perf_start_cycles;
@@ -22819,7 +22676,7 @@ int ggml_cpu_has_wasm_simd(void) {
  }
 
  int ggml_cpu_has_blas(void) {
- #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUDA) || defined(GGML_USE_VULKAN) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_SYCL)
+ #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUDA) || defined(GGML_USE_VULKAN) || defined(GGML_USE_SYCL)
  return 1;
  #else
  return 0;
@@ -22834,14 +22691,6 @@ int ggml_cpu_has_cuda(void) {
  #endif
  }
 
- int ggml_cpu_has_clblast(void) {
- #if defined(GGML_USE_CLBLAST)
- return 1;
- #else
- return 0;
- #endif
- }
-
  int ggml_cpu_has_vulkan(void) {
  #if defined(GGML_USE_VULKAN)
  return 1;
@@ -22875,8 +22724,7 @@ int ggml_cpu_has_rpc(void) {
  }
 
  int ggml_cpu_has_gpublas(void) {
- return ggml_cpu_has_cuda() || ggml_cpu_has_clblast() || ggml_cpu_has_vulkan() || ggml_cpu_has_kompute() ||
- ggml_cpu_has_sycl();
+ return ggml_cpu_has_cuda() || ggml_cpu_has_vulkan() || ggml_cpu_has_kompute() || ggml_cpu_has_sycl();
  }
 
  int ggml_cpu_has_sse3(void) {
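Note: downstream code that previously probed ggml_cpu_has_clblast() must drop that call; the remaining feature probes keep their old shape. A hedged sketch of a capability dump against the 0.16.0 vendored ggml, using only functions visible in the diff above:

#include <stdio.h>
#include "ggml.h"

int main(void) {
    // each probe returns 1 if the corresponding backend was compiled in
    printf("blas:    %d\n", ggml_cpu_has_blas());
    printf("cuda:    %d\n", ggml_cpu_has_cuda());
    printf("vulkan:  %d\n", ggml_cpu_has_vulkan());
    printf("gpublas: %d\n", ggml_cpu_has_gpublas());
    return 0;
}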