llama_cpp 0.15.4 → 0.16.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (147)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +10 -0
  3. data/ext/llama_cpp/extconf.rb +1 -2
  4. data/ext/llama_cpp/llama_cpp.cpp +15 -3
  5. data/lib/llama_cpp/version.rb +2 -2
  6. data/sig/llama_cpp.rbs +13 -1
  7. data/vendor/tmp/llama.cpp/Makefile +62 -35
  8. data/vendor/tmp/llama.cpp/ggml-alloc.c +4 -4
  9. data/vendor/tmp/llama.cpp/ggml-backend.c +5 -5
  10. data/vendor/tmp/llama.cpp/ggml-backend.h +1 -1
  11. data/vendor/tmp/llama.cpp/ggml-cuda/acc.cu +47 -0
  12. data/vendor/tmp/llama.cpp/ggml-cuda/arange.cu +34 -0
  13. data/vendor/tmp/llama.cpp/ggml-cuda/argsort.cu +103 -0
  14. data/vendor/tmp/llama.cpp/ggml-cuda/binbcast.cu +280 -0
  15. data/vendor/tmp/llama.cpp/ggml-cuda/clamp.cu +34 -0
  16. data/vendor/tmp/llama.cpp/ggml-cuda/concat.cu +196 -0
  17. data/vendor/tmp/llama.cpp/ggml-cuda/convert.cu +686 -0
  18. data/vendor/tmp/llama.cpp/ggml-cuda/cpy.cu +490 -0
  19. data/vendor/tmp/llama.cpp/ggml-cuda/diagmask.cu +40 -0
  20. data/vendor/tmp/llama.cpp/ggml-cuda/dmmv.cu +662 -0
  21. data/vendor/tmp/llama.cpp/ggml-cuda/fattn-tile-f16.cu +319 -0
  22. data/vendor/tmp/llama.cpp/ggml-cuda/fattn-tile-f32.cu +312 -0
  23. data/vendor/tmp/llama.cpp/ggml-cuda/fattn.cu +345 -0
  24. data/vendor/tmp/llama.cpp/ggml-cuda/getrows.cu +178 -0
  25. data/vendor/tmp/llama.cpp/ggml-cuda/im2col.cu +104 -0
  26. data/vendor/tmp/llama.cpp/ggml-cuda/mmq.cu +1564 -0
  27. data/vendor/tmp/llama.cpp/ggml-cuda/mmvq.cu +404 -0
  28. data/vendor/tmp/llama.cpp/ggml-cuda/norm.cu +221 -0
  29. data/vendor/tmp/llama.cpp/ggml-cuda/pad.cu +49 -0
  30. data/vendor/tmp/llama.cpp/ggml-cuda/pool2d.cu +94 -0
  31. data/vendor/tmp/llama.cpp/ggml-cuda/quantize.cu +45 -0
  32. data/vendor/tmp/llama.cpp/ggml-cuda/rope.cu +271 -0
  33. data/vendor/tmp/llama.cpp/ggml-cuda/scale.cu +31 -0
  34. data/vendor/tmp/llama.cpp/ggml-cuda/softmax.cu +205 -0
  35. data/vendor/tmp/llama.cpp/ggml-cuda/sumrows.cu +40 -0
  36. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu +5 -0
  37. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu +5 -0
  38. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu +5 -0
  39. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu +5 -0
  40. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu +5 -0
  41. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu +5 -0
  42. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu +5 -0
  43. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu +5 -0
  44. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu +5 -0
  45. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu +5 -0
  46. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu +5 -0
  47. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu +5 -0
  48. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu +5 -0
  49. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu +5 -0
  50. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu +5 -0
  51. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu +5 -0
  52. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu +5 -0
  53. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu +5 -0
  54. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu +5 -0
  55. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu +5 -0
  56. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu +5 -0
  57. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu +5 -0
  58. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu +5 -0
  59. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu +5 -0
  60. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu +5 -0
  61. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu +5 -0
  62. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu +5 -0
  63. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu +5 -0
  64. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu +5 -0
  65. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu +5 -0
  66. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu +5 -0
  67. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu +5 -0
  68. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu +5 -0
  69. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu +5 -0
  70. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu +5 -0
  71. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu +5 -0
  72. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu +5 -0
  73. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu +5 -0
  74. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu +5 -0
  75. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu +5 -0
  76. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu +5 -0
  77. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu +5 -0
  78. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu +5 -0
  79. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu +5 -0
  80. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu +5 -0
  81. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu +5 -0
  82. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu +5 -0
  83. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu +5 -0
  84. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu +5 -0
  85. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu +5 -0
  86. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu +5 -0
  87. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu +5 -0
  88. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu +5 -0
  89. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu +5 -0
  90. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu +5 -0
  91. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu +5 -0
  92. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu +5 -0
  93. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu +5 -0
  94. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu +5 -0
  95. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu +5 -0
  96. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu +5 -0
  97. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu +5 -0
  98. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu +5 -0
  99. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu +5 -0
  100. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu +5 -0
  101. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu +5 -0
  102. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu +5 -0
  103. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu +5 -0
  104. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu +5 -0
  105. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu +5 -0
  106. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu +5 -0
  107. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu +5 -0
  108. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu +5 -0
  109. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu +5 -0
  110. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu +5 -0
  111. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu +5 -0
  112. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu +5 -0
  113. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu +5 -0
  114. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu +5 -0
  115. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu +5 -0
  116. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu +5 -0
  117. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu +5 -0
  118. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu +5 -0
  119. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu +5 -0
  120. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu +5 -0
  121. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu +5 -0
  122. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb16.cu +10 -0
  123. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb32.cu +9 -0
  124. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb16.cu +10 -0
  125. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb32.cu +10 -0
  126. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb8.cu +8 -0
  127. data/vendor/tmp/llama.cpp/ggml-cuda/tsembd.cu +47 -0
  128. data/vendor/tmp/llama.cpp/ggml-cuda/unary.cu +266 -0
  129. data/vendor/tmp/llama.cpp/ggml-cuda/upscale.cu +51 -0
  130. data/vendor/tmp/llama.cpp/ggml-cuda.cu +8 -6
  131. data/vendor/tmp/llama.cpp/ggml-kompute.cpp +21 -6
  132. data/vendor/tmp/llama.cpp/ggml-metal.h +1 -1
  133. data/vendor/tmp/llama.cpp/ggml-metal.m +34 -24
  134. data/vendor/tmp/llama.cpp/ggml-metal.metal +83 -59
  135. data/vendor/tmp/llama.cpp/ggml-rpc.cpp +2 -2
  136. data/vendor/tmp/llama.cpp/ggml-sycl.cpp +7 -67
  137. data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +99301 -39793
  138. data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +456 -329
  139. data/vendor/tmp/llama.cpp/ggml.c +178 -330
  140. data/vendor/tmp/llama.cpp/ggml.h +9 -28
  141. data/vendor/tmp/llama.cpp/llama.cpp +242 -426
  142. data/vendor/tmp/llama.cpp/llama.h +17 -43
  143. metadata +121 -6
  144. data/vendor/tmp/llama.cpp/ggml-mpi.c +0 -216
  145. data/vendor/tmp/llama.cpp/ggml-mpi.h +0 -39
  146. data/vendor/tmp/llama.cpp/ggml-opencl.cpp +0 -2305
  147. data/vendor/tmp/llama.cpp/ggml-opencl.h +0 -36
@@ -5,6 +5,7 @@
 #include "ggml-quants.h"
 #include "ggml.h"
 
+
 #if defined(_MSC_VER) || defined(__MINGW32__)
 #include <malloc.h> // using malloc.h with MSC/MINGW
 #elif !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__)
@@ -28,6 +29,10 @@
 #include <syscall.h>
 #endif
 
+#ifdef GGML_USE_OPENMP
+#include <omp.h>
+#endif
+
 #ifdef GGML_USE_METAL
 #include <unistd.h>
 #endif
@@ -292,17 +297,12 @@ inline static void * ggml_calloc(size_t num, size_t size) {
 
 #if defined(GGML_USE_ACCELERATE)
 #include <Accelerate/Accelerate.h>
-#if defined(GGML_USE_CLBLAST) // allow usage of CLBlast alongside Accelerate functions
-#include "ggml-opencl.h"
-#endif
 #elif defined(GGML_USE_OPENBLAS)
 #if defined(GGML_BLAS_USE_MKL)
 #include <mkl.h>
 #else
 #include <cblas.h>
 #endif
-#elif defined(GGML_USE_CLBLAST)
-#include "ggml-opencl.h"
 #endif
 
 // floating point type used to accumulate sums
@@ -1756,7 +1756,7 @@ struct ggml_compute_state_shared {
     int64_t perf_node_start_cycles;
     int64_t perf_node_start_time_us;
 
-    const int n_threads;
+    int n_threads;
 
     // synchronization primitives
     atomic_int n_active; // num active threads
@@ -2267,6 +2267,11 @@ inline static float ggml_silu_f32(float x) {
     return x/(1.0f + expf(-x));
 }
 
+#if __FINITE_MATH_ONLY__
+#error "some routines in ggml.c require non-finite math arithmetics -- pass -fno-finite-math-only to the compiler to fix"
+#error "ref: https://github.com/ggerganov/llama.cpp/pull/7154#issuecomment-2143844461"
+#endif
+
 #if defined(__ARM_NEON) && defined(__aarch64__)
 
 // adapted from arm limited optimized routine
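The new __FINITE_MATH_ONLY__ guard above exists because parts of ggml.c rely on IEEE infinities behaving normally (for example, masked scores are set to -INFINITY and expf(-INFINITY) must evaluate to 0), which -ffinite-math-only allows the compiler to break. A small, hypothetical stand-alone illustration of that assumption — not code from this diff:

// Hypothetical illustration: why ggml.c rejects -ffinite-math-only.
// Under that flag the compiler may assume Inf/NaN never occur, so expressions
// such as expf(-INFINITY) or comparisons against -INFINITY can be mis-optimized.
#include <math.h>
#include <stdio.h>

int main(void) {
    const float masked_score = -INFINITY;              // e.g. a masked attention logit
    printf("expf(-inf) = %f\n", expf(masked_score));   // expected output: 0.000000
    return 0;
}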
@@ -3370,10 +3375,6 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
         GGML_PRINT_DEBUG("%s: g_state initialized in %f ms\n", __func__, (t_end - t_start)/1000.0f);
     }
 
-#if defined(GGML_USE_CLBLAST)
-    ggml_cl_init();
-#endif
-
     ggml_setup_op_has_task_pass();
 
     is_first_call = false;
@@ -6249,16 +6250,13 @@ static struct ggml_tensor * ggml_rope_impl(
         struct ggml_tensor * c,
         int n_dims,
         int mode,
-        int n_ctx,
-        int n_orig_ctx,
+        int n_ctx_orig,
         float freq_base,
         float freq_scale,
         float ext_factor,
         float attn_factor,
         float beta_fast,
         float beta_slow,
-        float xpos_base,
-        bool xpos_down,
         bool inplace) {
     GGML_ASSERT((mode & 1) == 0 && "mode & 1 == 1 is no longer supported");
 
@@ -6279,15 +6277,13 @@ static struct ggml_tensor * ggml_rope_impl(
 
     struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
 
-    int32_t params[13] = { /*n_past*/ 0, n_dims, mode, n_ctx, n_orig_ctx };
+    int32_t params[11] = { /*n_past*/ 0, n_dims, mode, /*n_ctx*/ 0, n_ctx_orig };
     memcpy(params + 5, &freq_base, sizeof(float));
     memcpy(params + 6, &freq_scale, sizeof(float));
     memcpy(params + 7, &ext_factor, sizeof(float));
     memcpy(params + 8, &attn_factor, sizeof(float));
     memcpy(params + 9, &beta_fast, sizeof(float));
     memcpy(params + 10, &beta_slow, sizeof(float));
-    memcpy(params + 11, &xpos_base, sizeof(float));
-    memcpy(params + 12, &xpos_down, sizeof(bool));
     ggml_set_op_params(result, params, sizeof(params));
 
     result->op = GGML_OP_ROPE;
@@ -6304,10 +6300,9 @@ struct ggml_tensor * ggml_rope(
         struct ggml_tensor * a,
         struct ggml_tensor * b,
         int n_dims,
-        int mode,
-        int n_ctx) {
+        int mode) {
     return ggml_rope_impl(
-        ctx, a, b, NULL, n_dims, mode, n_ctx, 0, 10000.0f, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, false, false
+        ctx, a, b, NULL, n_dims, mode, 0, 10000.0f, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f, false
     );
 }
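For callers of the public API, the practical effect of this hunk is that ggml_rope() (and ggml_rope_inplace() below) no longer takes an n_ctx argument. A minimal migration sketch; the tensor names are placeholders, not taken from the diff:

#include "ggml.h"

// Hypothetical builder code; `ctx`, `cur` and `pos` are placeholders.
// Before (0.15.x vendored llama.cpp):
//   cur = ggml_rope(ctx, cur, pos, n_rot, /*mode*/ 0, /*n_ctx*/ n_ctx);
// After (0.16.0 vendored llama.cpp):
struct ggml_tensor * build_rope(struct ggml_context * ctx,
                                struct ggml_tensor * cur,
                                struct ggml_tensor * pos,
                                int n_rot) {
    return ggml_rope(ctx, cur, pos, n_rot, /*mode*/ 0);
}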
@@ -6316,10 +6311,9 @@ struct ggml_tensor * ggml_rope_inplace(
         struct ggml_tensor * a,
         struct ggml_tensor * b,
         int n_dims,
-        int mode,
-        int n_ctx) {
+        int mode) {
     return ggml_rope_impl(
-        ctx, a, b, NULL, n_dims, mode, n_ctx, 0, 10000.0f, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, false, true
+        ctx, a, b, NULL, n_dims, mode, 0, 10000.0f, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f, true
     );
 }
 
@@ -6330,8 +6324,7 @@ struct ggml_tensor * ggml_rope_ext(
         struct ggml_tensor * c,
         int n_dims,
         int mode,
-        int n_ctx,
-        int n_orig_ctx,
+        int n_ctx_orig,
         float freq_base,
         float freq_scale,
         float ext_factor,
@@ -6339,8 +6332,8 @@ struct ggml_tensor * ggml_rope_ext(
         float beta_fast,
         float beta_slow) {
     return ggml_rope_impl(
-        ctx, a, b, c, n_dims, mode, n_ctx, n_orig_ctx, freq_base, freq_scale,
-        ext_factor, attn_factor, beta_fast, beta_slow, 0.0f, false, false
+        ctx, a, b, c, n_dims, mode, n_ctx_orig, freq_base, freq_scale,
+        ext_factor, attn_factor, beta_fast, beta_slow, false
     );
 }
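The extended variant keeps all of the YaRN parameters but collapses the old n_ctx/n_orig_ctx pair into a single n_ctx_orig. A hedged usage sketch; the tensor names and numeric values are illustrative only, not from the diff:

#include "ggml.h"

// Hypothetical call site; `ctx`, `cur`, `pos` and `freq_factors` are placeholders.
// n_ctx_orig is the original training context length used by the YaRN correction terms.
struct ggml_tensor * build_rope_ext(struct ggml_context * ctx,
                                    struct ggml_tensor * cur,
                                    struct ggml_tensor * pos,
                                    struct ggml_tensor * freq_factors) {
    const int n_rot      = 128;
    const int mode       = 0;        // 2 selects the NeoX rotation order
    const int n_ctx_orig = 4096;     // assumed training context

    return ggml_rope_ext(ctx, cur, pos, freq_factors, n_rot, mode, n_ctx_orig,
                         /*freq_base*/ 10000.0f, /*freq_scale*/ 1.0f,
                         /*ext_factor*/ 0.0f, /*attn_factor*/ 1.0f,
                         /*beta_fast*/ 32.0f, /*beta_slow*/ 1.0f);
}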
@@ -6351,8 +6344,7 @@ struct ggml_tensor * ggml_rope_ext_inplace(
         struct ggml_tensor * c,
         int n_dims,
         int mode,
-        int n_ctx,
-        int n_orig_ctx,
+        int n_ctx_orig,
         float freq_base,
         float freq_scale,
         float ext_factor,
@@ -6360,8 +6352,8 @@ struct ggml_tensor * ggml_rope_ext_inplace(
         float beta_fast,
         float beta_slow) {
     return ggml_rope_impl(
-        ctx, a, b, c, n_dims, mode, n_ctx, n_orig_ctx, freq_base, freq_scale,
-        ext_factor, attn_factor, beta_fast, beta_slow, 0.0f, false, true
+        ctx, a, b, c, n_dims, mode, n_ctx_orig, freq_base, freq_scale,
+        ext_factor, attn_factor, beta_fast, beta_slow, true
    );
 }
 
@@ -6371,8 +6363,7 @@ struct ggml_tensor * ggml_rope_custom(
         struct ggml_tensor * b,
         int n_dims,
         int mode,
-        int n_ctx,
-        int n_orig_ctx,
+        int n_ctx_orig,
         float freq_base,
         float freq_scale,
         float ext_factor,
@@ -6380,8 +6371,8 @@ struct ggml_tensor * ggml_rope_custom(
         float beta_fast,
         float beta_slow) {
     return ggml_rope_impl(
-        ctx, a, b, NULL, n_dims, mode, n_ctx, n_orig_ctx, freq_base, freq_scale,
-        ext_factor, attn_factor, beta_fast, beta_slow, 0.0f, false, false
+        ctx, a, b, NULL, n_dims, mode, n_ctx_orig, freq_base, freq_scale,
+        ext_factor, attn_factor, beta_fast, beta_slow, false
     );
 }
 
@@ -6391,8 +6382,7 @@ struct ggml_tensor * ggml_rope_custom_inplace(
         struct ggml_tensor * b,
         int n_dims,
         int mode,
-        int n_ctx,
-        int n_orig_ctx,
+        int n_ctx_orig,
         float freq_base,
         float freq_scale,
         float ext_factor,
@@ -6400,21 +6390,11 @@ struct ggml_tensor * ggml_rope_custom_inplace(
         float beta_fast,
         float beta_slow) {
     return ggml_rope_impl(
-        ctx, a, b, NULL, n_dims, mode, n_ctx, n_orig_ctx, freq_base, freq_scale,
-        ext_factor, attn_factor, beta_fast, beta_slow, 0.0f, false, true
+        ctx, a, b, NULL, n_dims, mode, n_ctx_orig, freq_base, freq_scale,
+        ext_factor, attn_factor, beta_fast, beta_slow, true
     );
 }
 
-struct ggml_tensor * ggml_rope_xpos_inplace(
-        struct ggml_context * ctx,
-        struct ggml_tensor * a,
-        struct ggml_tensor * b,
-        int n_dims,
-        float base,
-        bool down) {
-    return ggml_rope_impl(ctx, a, b, NULL, n_dims, 0, 0, 0, 10000.0f, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f, base, down, true);
-}
-
 // ggml_rope_back
 
 struct ggml_tensor * ggml_rope_back(
@@ -6424,16 +6404,13 @@ struct ggml_tensor * ggml_rope_back(
         struct ggml_tensor * c,
         int n_dims,
         int mode,
-        int n_ctx,
-        int n_orig_ctx,
+        int n_ctx_orig,
         float freq_base,
         float freq_scale,
         float ext_factor,
         float attn_factor,
         float beta_fast,
-        float beta_slow,
-        float xpos_base,
-        bool xpos_down) {
+        float beta_slow) {
     GGML_ASSERT(ggml_is_vector(b));
     GGML_ASSERT(b->type == GGML_TYPE_I32);
     GGML_ASSERT(a->ne[2] == b->ne[0]);
@@ -6449,15 +6426,13 @@ struct ggml_tensor * ggml_rope_back(
 
     struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
 
-    int32_t params[13] = { /*n_past*/ 0, n_dims, mode, n_ctx, n_orig_ctx };
+    int32_t params[11] = { /*n_past*/ 0, n_dims, mode, /*n_ctx*/ 0, n_ctx_orig };
     memcpy(params + 5, &freq_base, sizeof(float));
     memcpy(params + 6, &freq_scale, sizeof(float));
     memcpy(params + 7, &ext_factor, sizeof(float));
     memcpy(params + 8, &attn_factor, sizeof(float));
     memcpy(params + 9, &beta_fast, sizeof(float));
     memcpy(params + 10, &beta_slow, sizeof(float));
-    memcpy(params + 11, &xpos_base, sizeof(float));
-    memcpy(params + 12, &xpos_down, sizeof(bool));
     ggml_set_op_params(result, params, sizeof(params));
 
     result->op = GGML_OP_ROPE_BACK;
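Both ggml_rope_impl and ggml_rope_back now pack their hyper-parameters into an 11-slot int32 op_params array: slots 0-4 hold n_past, n_dims, mode, a now-unused n_ctx slot and n_ctx_orig, and slots 5-10 hold the six floats byte-copied in with memcpy. A hedged, stand-alone sketch of that packing/unpacking round trip:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

// Hypothetical illustration of the GGML_OP_ROPE op_params layout after this change:
// float hyper-parameters are byte-copied into int32 slots 5..10.
int main(void) {
    int32_t params[11] = { /*n_past*/ 0, /*n_dims*/ 128, /*mode*/ 0, /*n_ctx (unused)*/ 0, /*n_ctx_orig*/ 4096 };
    const float freq_base = 10000.0f;
    memcpy(params + 5, &freq_base, sizeof(float));

    // ...later, the compute kernel reads it back the same way:
    float freq_base_out;
    memcpy(&freq_base_out, params + 5, sizeof(float));
    printf("freq_base = %f\n", freq_base_out);   // 10000.000000
    return 0;
}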
@@ -9043,17 +9018,6 @@ static void ggml_compute_forward_add_f32(
     const int ith = params->ith;
     const int nth = params->nth;
 
-#ifdef GGML_USE_CLBLAST
-    if (src1->backend == GGML_BACKEND_TYPE_GPU) {
-        // TODO: OpenCL kernel support full broadcast
-        GGML_ASSERT(ggml_can_repeat_rows(src1, src0));
-        if (ith == 0) {
-            ggml_cl_add(src0, src1, dst);
-        }
-        return;
-    }
-#endif
-
     const int nr = ggml_nrows(src0);
 
     GGML_TENSOR_BINARY_OP_LOCALS
@@ -10161,17 +10125,6 @@ static void ggml_compute_forward_mul_f32(
     const int ith = params->ith;
     const int nth = params->nth;
 
-#if defined(GGML_USE_CLBLAST)
-    if (src1->backend == GGML_BACKEND_TYPE_GPU) {
-        // TODO: OpenCL kernel support full broadcast
-        GGML_ASSERT(ggml_can_repeat_rows(src1, src0));
-        if (ith == 0) {
-            ggml_cl_mul(src0, src1, dst);
-        }
-        return;
-    }
-#endif
-
     const int64_t nr = ggml_nrows(src0);
 
     GGML_TENSOR_BINARY_OP_LOCALS
@@ -12407,15 +12360,6 @@ static void ggml_compute_forward_mul_mat(
     // nb01 >= nb00 - src0 is not transposed
     // compute by src0 rows
 
-#if defined(GGML_USE_CLBLAST)
-    if (ggml_cl_can_mul_mat(src0, src1, dst)) {
-        if (params->ith == 0 && params->type == GGML_TASK_TYPE_COMPUTE) {
-            ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize);
-        }
-        return;
-    }
-#endif
-
 #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
     if (ggml_compute_forward_mul_mat_use_blas(dst)) {
         const int64_t ne_plane = ne01*ne00;
@@ -12863,8 +12807,6 @@ static void ggml_compute_forward_out_prod_f32(
     // nb01 >= nb00 - src0 is not transposed
     // compute by src0 rows
 
-    // TODO: #if defined(GGML_USE_CLBLAST)
-
 #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
     bool use_blas = ggml_is_matrix(src0) &&
         ggml_is_matrix(src1) &&
@@ -13062,7 +13004,7 @@ static void ggml_compute_forward_out_prod_q_f32(
     // nb01 >= nb00 - src0 is not transposed
     // compute by src0 rows
 
-    // TODO: #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
+    // TODO: #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
 
     if (params->type == GGML_TASK_TYPE_INIT) {
         if (ith != 0) {
@@ -14259,8 +14201,7 @@ static float rope_yarn_ramp(const float low, const float high, const int i0) {
 // MIT licensed. Copyright (c) 2023 Jeffrey Quesnelle and Bowen Peng.
 static void rope_yarn(
     float theta_extrap, float freq_scale, float corr_dims[2], int64_t i0, float ext_factor, float mscale,
-    float * cos_theta, float * sin_theta
-) {
+    float * cos_theta, float * sin_theta) {
     // Get n-d rotational scaling corrected for extrapolation
     float theta_interp = freq_scale * theta_extrap;
     float theta = theta_interp;
@@ -14277,18 +14218,19 @@ static void rope_yarn(
 
 // Apparently solving `n_rot = 2pi * x * base^((2 * max_pos_emb) / n_dims)` for x, we get
 // `corr_dim(n_rot) = n_dims * log(max_pos_emb / (n_rot * 2pi)) / (2 * log(base))`
-static float ggml_rope_yarn_corr_dim(int n_dims, int n_orig_ctx, float n_rot, float base) {
-    return n_dims * logf(n_orig_ctx / (n_rot * 2 * (float)M_PI)) / (2 * logf(base));
+static float ggml_rope_yarn_corr_dim(int n_dims, int n_ctx_orig, float n_rot, float base) {
+    return n_dims * logf(n_ctx_orig / (n_rot * 2 * (float)M_PI)) / (2 * logf(base));
 }
 
 static void ggml_rope_cache_init(
-    float theta_base, float freq_scale, float corr_dims[2], int64_t ne0, float ext_factor, float mscale,
-    float * cache, float sin_sign, float theta_scale
-) {
+    float theta_base, float freq_scale, const float * freq_factors, float corr_dims[2], int64_t ne0, float ext_factor, float mscale,
+    float * cache, float sin_sign, float theta_scale) {
+    // ref: https://github.com/jquesnelle/yarn/blob/master/scaled_rope/LlamaYaRNScaledRotaryEmbedding.py
     float theta = theta_base;
     for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
+        const float ff = freq_factors ? freq_factors[i0/2] : 1.0f;
         rope_yarn(
-            theta, freq_scale, corr_dims, i0, ext_factor, mscale, &cache[i0 + 0], &cache[i0 + 1]
+            theta/ff, freq_scale, corr_dims, i0, ext_factor, mscale, &cache[i0 + 0], &cache[i0 + 1]
         );
         cache[i0 + 1] *= sin_sign;
 
@@ -14297,11 +14239,11 @@ static void ggml_rope_cache_init(
     }
 }
 
 GGML_CALL void ggml_rope_yarn_corr_dims(
-    int n_dims, int n_orig_ctx, float freq_base, float beta_fast, float beta_slow, float dims[2]
+    int n_dims, int n_ctx_orig, float freq_base, float beta_fast, float beta_slow, float dims[2]
 ) {
     // start and end correction dims
-    float start = floorf(ggml_rope_yarn_corr_dim(n_dims, n_orig_ctx, beta_fast, freq_base));
-    float end = ceilf(ggml_rope_yarn_corr_dim(n_dims, n_orig_ctx, beta_slow, freq_base));
+    float start = floorf(ggml_rope_yarn_corr_dim(n_dims, n_ctx_orig, beta_fast, freq_base));
+    float end = ceilf(ggml_rope_yarn_corr_dim(n_dims, n_ctx_orig, beta_slow, freq_base));
     dims[0] = MAX(0, start);
     dims[1] = MIN(n_dims - 1, end);
 }
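With this change the sin/cos cache is filled once per row group for both the normal and NeoX layouts, and each frequency pair's angle can be divided by an optional per-dimension freq_factors entry before sin/cos are stored. A hedged, simplified stand-alone sketch of that cache fill (YaRN correction/extrapolation omitted; names are placeholders):

#include <math.h>
#include <stdint.h>
#include <stdio.h>

// Hypothetical, simplified illustration of the new cache fill in ggml_rope_cache_init:
// for each even index i0 the angle is divided by freq_factors[i0/2] (when given),
// then the angle advances by theta_scale for the next pair.
static void rope_cache_sketch(float theta_base, float theta_scale,
                              const float * freq_factors, int64_t ne0, float * cache) {
    float theta = theta_base;
    for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
        const float ff = freq_factors ? freq_factors[i0/2] : 1.0f;
        cache[i0 + 0] = cosf(theta/ff);
        cache[i0 + 1] = sinf(theta/ff);
        theta *= theta_scale;
    }
}

int main(void) {
    float cache[8];
    rope_cache_sketch(/*theta_base*/ 3.0f, /*theta_scale*/ 0.5f, NULL, 8, cache);
    for (int i = 0; i < 8; i += 2) {
        printf("cos=%f sin=%f\n", cache[i], cache[i + 1]);
    }
    return 0;
}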
@@ -14321,15 +14263,11 @@ static void ggml_compute_forward_rope_f32(
 
     float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
 
-    // these two only relevant for xPos RoPE:
-    float xpos_base;
-    bool xpos_down;
-
     //const int n_past = ((int32_t *) dst->op_params)[0];
     const int n_dims = ((int32_t *) dst->op_params)[1];
     const int mode = ((int32_t *) dst->op_params)[2];
-    const int n_ctx = ((int32_t *) dst->op_params)[3];
-    const int n_orig_ctx = ((int32_t *) dst->op_params)[4];
+    //const int n_ctx = ((int32_t *) dst->op_params)[3];
+    const int n_ctx_orig = ((int32_t *) dst->op_params)[4];
 
     memcpy(&freq_base, (int32_t *) dst->op_params + 5, sizeof(float));
     memcpy(&freq_scale, (int32_t *) dst->op_params + 6, sizeof(float));
@@ -14337,8 +14275,6 @@ static void ggml_compute_forward_rope_f32(
     memcpy(&attn_factor, (int32_t *) dst->op_params + 8, sizeof(float));
     memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float));
     memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float));
-    memcpy(&xpos_base, (int32_t *) dst->op_params + 11, sizeof(float));
-    memcpy(&xpos_down, (int32_t *) dst->op_params + 12, sizeof(bool));
 
     GGML_TENSOR_UNARY_OP_LOCALS
 
@@ -14368,20 +14304,15 @@ static void ggml_compute_forward_rope_f32(
     const float theta_scale = powf(freq_base, -2.0f/n_dims);
 
     float corr_dims[2];
-    ggml_rope_yarn_corr_dims(n_dims, n_orig_ctx, freq_base, beta_fast, beta_slow, corr_dims);
+    ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims);
 
     const bool is_neox = mode & 2;
-    const bool is_glm = mode & 4;
 
     const float * freq_factors = NULL;
-    if (is_neox) {
-        if (src2 != NULL) {
-            GGML_ASSERT(src2->type == GGML_TYPE_F32);
-            GGML_ASSERT(src2->ne[0] >= n_dims / 2);
-            freq_factors = (const float *) src2->data;
-        }
-    } else {
-        GGML_ASSERT(src2 == NULL && "TODO: freq_factors not implemented for !is_neox");
+    if (src2 != NULL) {
+        GGML_ASSERT(src2->type == GGML_TYPE_F32);
+        GGML_ASSERT(src2->ne[0] >= n_dims / 2);
+        freq_factors = (const float *) src2->data;
     }
 
     // backward process uses inverse rotation by cos and sin.
@@ -14396,94 +14327,50 @@ static void ggml_compute_forward_rope_f32(
             const int64_t p = pos[i2];
 
             float * cache = (float *) params->wdata + (ne0 + CACHE_LINE_SIZE_F32)*ith;
-            if (!is_glm && !is_neox) { // TODO: cache sin/cos for glm, neox
-                ggml_rope_cache_init(p, freq_scale, corr_dims, ne0, ext_factor, attn_factor, cache, sin_sign, theta_scale);
-            }
+            ggml_rope_cache_init(p, freq_scale, freq_factors, corr_dims, ne0, ext_factor, attn_factor, cache, sin_sign, theta_scale);
 
             for (int64_t i1 = 0; i1 < ne1; i1++) {
                 if (ir++ < ir0) continue;
                 if (ir > ir1) break;
 
-                float theta_base = (float)p;
-
-                if (is_glm) {
-                    theta_base = MIN(p, n_ctx - 2);
-                    float block_theta = MAX(p - (n_ctx - 2), 0);
-                    for (int64_t i0 = 0; i0 < ne0 / 4; i0++) {
-                        const float cos_theta = cosf(theta_base);
-                        const float sin_theta = sinf(theta_base) * sin_sign;
-                        const float cos_block_theta = cosf(block_theta);
-                        const float sin_block_theta = sinf(block_theta) * sin_sign;
-
-                        theta_base *= theta_scale;
-                        block_theta *= theta_scale;
-
-                        const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
-                        float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
-
-                        const float x0 = src[0];
-                        const float x1 = src[n_dims/2];
-                        const float x2 = src[n_dims];
-                        const float x3 = src[n_dims/2*3];
-
-                        dst_data[0] = x0*cos_theta - x1*sin_theta;
-                        dst_data[n_dims/2] = x0*sin_theta + x1*cos_theta;
-                        dst_data[n_dims] = x2*cos_block_theta - x3*sin_block_theta;
-                        dst_data[n_dims/2*3] = x2*sin_block_theta + x3*cos_block_theta;
-                    }
-                } else if (!is_neox) {
-                    for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
+                if (!is_neox) {
+                    for (int64_t i0 = 0; i0 < n_dims; i0 += 2) {
                        const float cos_theta = cache[i0 + 0];
                        const float sin_theta = cache[i0 + 1];
 
-                        // zeta scaling for xPos only:
-                        float zeta = xpos_base != 0.0f ? powf((i0 + 0.4f * ne0) / (1.4f * ne0), p / xpos_base) : 1.0f;
-                        if (xpos_down) zeta = 1.0f / zeta;
-
                        const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
                        float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
 
                        const float x0 = src[0];
                        const float x1 = src[1];
 
-                        dst_data[0] = x0*cos_theta*zeta - x1*sin_theta*zeta;
-                        dst_data[1] = x0*sin_theta*zeta + x1*cos_theta*zeta;
+                        dst_data[0] = x0*cos_theta - x1*sin_theta;
+                        dst_data[1] = x0*sin_theta + x1*cos_theta;
                    }
                } else {
-                    // ref: https://github.com/jquesnelle/yarn/blob/master/scaled_rope/LlamaYaRNScaledRotaryEmbedding.py
-                    for (int64_t ic = 0; ic < ne0; ic += 2) {
-                        if (ic < n_dims) {
-                            const int64_t i0 = ic/2;
+                    for (int64_t i0 = 0; i0 < n_dims; i0 += 2) {
+                        const int64_t ic = i0/2;
 
-                            const float freq_factor = freq_factors ? freq_factors[i0] : 1.0f;
-
-                            float cos_theta, sin_theta;
-                            rope_yarn(
-                                theta_base/freq_factor, freq_scale, corr_dims, ic, ext_factor, attn_factor,
-                                &cos_theta, &sin_theta
-                            );
-
-                            sin_theta *= sin_sign;
-                            theta_base *= theta_scale;
+                        const float cos_theta = cache[i0 + 0];
+                        const float sin_theta = cache[i0 + 1];
 
-                            const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
-                            float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
+                        const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + ic*nb00);
+                        float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + ic*nb0);
 
-                            const float x0 = src[0];
-                            const float x1 = src[n_dims/2];
+                        const float x0 = src[0];
+                        const float x1 = src[n_dims/2];
 
-                            dst_data[0] = x0*cos_theta - x1*sin_theta;
-                            dst_data[n_dims/2] = x0*sin_theta + x1*cos_theta;
-                        } else {
-                            const int64_t i0 = ic;
+                        dst_data[0] = x0*cos_theta - x1*sin_theta;
+                        dst_data[n_dims/2] = x0*sin_theta + x1*cos_theta;
+                    }
+                }
 
-                            const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
-                            float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
+                for (int64_t i0 = n_dims; i0 < ne0; i0 += 2) {
+                    const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
+                    float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
 
-                            dst_data[0] = src[0];
-                            dst_data[1] = src[1];
-                        }
-                    }
+                    dst_data[0] = src[0];
+                    dst_data[1] = src[1];
                 }
             }
         }
@@ -14509,8 +14396,8 @@ static void ggml_compute_forward_rope_f16(
     //const int n_past = ((int32_t *) dst->op_params)[0];
     const int n_dims = ((int32_t *) dst->op_params)[1];
     const int mode = ((int32_t *) dst->op_params)[2];
-    const int n_ctx = ((int32_t *) dst->op_params)[3];
-    const int n_orig_ctx = ((int32_t *) dst->op_params)[4];
+    //const int n_ctx = ((int32_t *) dst->op_params)[3];
+    const int n_ctx_orig = ((int32_t *) dst->op_params)[4];
     memcpy(&freq_base, (int32_t *) dst->op_params + 5, sizeof(float));
     memcpy(&freq_scale, (int32_t *) dst->op_params + 6, sizeof(float));
     memcpy(&ext_factor, (int32_t *) dst->op_params + 7, sizeof(float));
@@ -14546,20 +14433,15 @@ static void ggml_compute_forward_rope_f16(
     const float theta_scale = powf(freq_base, -2.0f/n_dims);
 
     float corr_dims[2];
-    ggml_rope_yarn_corr_dims(n_dims, n_orig_ctx, freq_base, beta_fast, beta_slow, corr_dims);
+    ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims);
 
     const bool is_neox = mode & 2;
-    const bool is_glm = mode & 4;
 
     const float * freq_factors = NULL;
-    if (is_neox) {
-        if (src2 != NULL) {
-            GGML_ASSERT(src2->type == GGML_TYPE_F32);
-            GGML_ASSERT(src2->ne[0] >= n_dims / 2);
-            freq_factors = (const float *) src2->data;
-        }
-    } else {
-        GGML_ASSERT(src2 == NULL && "TODO: freq_factors not implemented for !is_neox");
+    if (src2 != NULL) {
+        GGML_ASSERT(src2->type == GGML_TYPE_F32);
+        GGML_ASSERT(src2->ne[0] >= n_dims / 2);
+        freq_factors = (const float *) src2->data;
     }
 
     // backward process uses inverse rotation by cos and sin.
@@ -14574,43 +14456,14 @@ static void ggml_compute_forward_rope_f16(
             const int64_t p = pos[i2];
 
             float * cache = (float *) params->wdata + (ne0 + CACHE_LINE_SIZE_F32)*ith;
-            if (!is_glm && !is_neox) { // TODO: cache sin/cos for glm, neox
-                ggml_rope_cache_init(p, freq_scale, corr_dims, ne0, ext_factor, attn_factor, cache, sin_sign, theta_scale);
-            }
+            ggml_rope_cache_init(p, freq_scale, freq_factors, corr_dims, ne0, ext_factor, attn_factor, cache, sin_sign, theta_scale);
 
             for (int64_t i1 = 0; i1 < ne1; i1++) {
                 if (ir++ < ir0) continue;
                 if (ir > ir1) break;
 
-                float theta_base = (float)p;
-
-                if (is_glm) {
-                    theta_base = MIN(p, n_ctx - 2);
-                    float block_theta = MAX(p - (n_ctx - 2), 0);
-                    for (int64_t i0 = 0; i0 < ne0 / 4; i0++) {
-                        const float cos_theta = cosf(theta_base);
-                        const float sin_theta = sinf(theta_base) * sin_sign;
-                        const float cos_block_theta = cosf(block_theta);
-                        const float sin_block_theta = sinf(block_theta) * sin_sign;
-
-                        theta_base *= theta_scale;
-                        block_theta *= theta_scale;
-
-                        const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
-                        ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
-
-                        const float x0 = GGML_FP16_TO_FP32(src[0]);
-                        const float x1 = GGML_FP16_TO_FP32(src[n_dims/2]);
-                        const float x2 = GGML_FP16_TO_FP32(src[n_dims]);
-                        const float x3 = GGML_FP16_TO_FP32(src[n_dims/2*3]);
-
-                        dst_data[0] = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta);
-                        dst_data[n_dims/2] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
-                        dst_data[n_dims] = GGML_FP32_TO_FP16(x2*cos_block_theta - x3*sin_block_theta);
-                        dst_data[n_dims/2*3] = GGML_FP32_TO_FP16(x2*sin_block_theta + x3*cos_block_theta);
-                    }
-                } else if (!is_neox) {
-                    for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
+                if (!is_neox) {
+                    for (int64_t i0 = 0; i0 < n_dims; i0 += 2) {
                        const float cos_theta = cache[i0 + 0];
                        const float sin_theta = cache[i0 + 1];
 
@@ -14624,40 +14477,29 @@ static void ggml_compute_forward_rope_f16(
                        dst_data[1] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
                    }
                } else {
-                    // ref: https://github.com/jquesnelle/yarn/blob/master/scaled_rope/LlamaYaRNScaledRotaryEmbedding.py
-                    for (int64_t ic = 0; ic < ne0; ic += 2) {
-                        if (ic < n_dims) {
-                            const int64_t i0 = ic/2;
-
-                            const float freq_factor = freq_factors ? freq_factors[i0] : 1.0f;
-
-                            float cos_theta, sin_theta;
-                            rope_yarn(
-                                theta_base/freq_factor, freq_scale, corr_dims, ic, ext_factor, attn_factor,
-                                &cos_theta, &sin_theta
-                            );
+                    for (int64_t i0 = 0; i0 < n_dims; i0 += 2) {
+                        const int64_t ic = i0/2;
 
-                            sin_theta *= sin_sign;
-                            theta_base *= theta_scale;
+                        const float cos_theta = cache[i0 + 0];
+                        const float sin_theta = cache[i0 + 1];
 
-                            const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
-                            ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
+                        const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + ic*nb00);
+                        ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + ic*nb0);
 
-                            const float x0 = GGML_FP16_TO_FP32(src[0]);
-                            const float x1 = GGML_FP16_TO_FP32(src[n_dims/2]);
+                        const float x0 = GGML_FP16_TO_FP32(src[0]);
+                        const float x1 = GGML_FP16_TO_FP32(src[n_dims/2]);
 
-                            dst_data[0] = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta);
-                            dst_data[n_dims/2] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
-                        } else {
-                            const int64_t i0 = ic;
+                        dst_data[0] = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta);
+                        dst_data[n_dims/2] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
+                    }
+                }
 
-                            const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
-                            ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
+                for (int64_t i0 = n_dims; i0 < ne0; i0 += 2) {
+                    const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
+                    ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
 
-                            dst_data[0] = src[0];
-                            dst_data[1] = src[1];
-                        }
-                    }
+                    dst_data[0] = src[0];
+                    dst_data[1] = src[1];
                 }
             }
         }
@@ -18359,9 +18201,9 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
                    //const int n_past = ((int32_t *) tensor->op_params)[0];
                    const int n_dims = ((int32_t *) tensor->op_params)[1];
                    const int mode = ((int32_t *) tensor->op_params)[2];
-                    const int n_ctx = ((int32_t *) tensor->op_params)[3];
-                    const int n_orig_ctx = ((int32_t *) tensor->op_params)[4];
-                    float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow, xpos_base, xpos_down;
+                    //const int n_ctx = ((int32_t *) tensor->op_params)[3];
+                    const int n_ctx_orig = ((int32_t *) tensor->op_params)[4];
+                    float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
 
                    memcpy(&freq_base, (int32_t *) tensor->op_params + 5, sizeof(float));
                    memcpy(&freq_scale, (int32_t *) tensor->op_params + 6, sizeof(float));
@@ -18369,8 +18211,6 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
                    memcpy(&attn_factor, (int32_t *) tensor->op_params + 8, sizeof(float));
                    memcpy(&beta_fast, (int32_t *) tensor->op_params + 9, sizeof(float));
                    memcpy(&beta_slow, (int32_t *) tensor->op_params + 10, sizeof(float));
-                    memcpy(&xpos_base, (int32_t *) tensor->op_params + 11, sizeof(float));
-                    memcpy(&xpos_down, (int32_t *) tensor->op_params + 12, sizeof(bool));
 
                    src0->grad = ggml_add_or_set(ctx,
                            src0->grad,
@@ -18380,16 +18220,13 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
                                src2,
                                n_dims,
                                mode,
-                                n_ctx,
-                                n_orig_ctx,
+                                n_ctx_orig,
                                freq_base,
                                freq_scale,
                                ext_factor,
                                attn_factor,
                                beta_fast,
-                                beta_slow,
-                                xpos_base,
-                                xpos_down),
+                                beta_slow),
                            zero_table);
                }
            } break;
@@ -18399,9 +18236,9 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
                    //const int n_past = ((int32_t *) tensor->op_params)[0];
                    const int n_dims = ((int32_t *) tensor->op_params)[1];
                    const int mode = ((int32_t *) tensor->op_params)[2];
-                    const int n_ctx = ((int32_t *) tensor->op_params)[3];
-                    const int n_orig_ctx = ((int32_t *) tensor->op_params)[4];
-                    float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow, xpos_base, xpos_down;
+                    //const int n_ctx = ((int32_t *) tensor->op_params)[3];
+                    const int n_ctx_orig = ((int32_t *) tensor->op_params)[4];
+                    float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
 
                    memcpy(&freq_base, (int32_t *) tensor->op_params + 5, sizeof(float));
                    memcpy(&freq_scale, (int32_t *) tensor->op_params + 6, sizeof(float));
@@ -18409,8 +18246,6 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
                    memcpy(&attn_factor, (int32_t *) tensor->op_params + 8, sizeof(float));
                    memcpy(&beta_fast, (int32_t *) tensor->op_params + 9, sizeof(float));
                    memcpy(&beta_slow, (int32_t *) tensor->op_params + 10, sizeof(float));
-                    memcpy(&xpos_base, (int32_t *) tensor->op_params + 11, sizeof(float));
-                    memcpy(&xpos_down, (int32_t *) tensor->op_params + 12, sizeof(bool));
 
                    src0->grad = ggml_add_or_set(ctx,
                            src0->grad,
@@ -18420,16 +18255,13 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
                                src2,
                                n_dims,
                                mode,
-                                n_ctx,
-                                n_orig_ctx,
+                                n_ctx_orig,
                                freq_base,
                                freq_scale,
                                ext_factor,
                                attn_factor,
                                beta_fast,
                                beta_slow,
-                                xpos_base,
-                                xpos_down,
                                false),
                            zero_table);
                }
@@ -19536,11 +19368,6 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
                {
                    const enum ggml_type vec_dot_type = type_traits[node->src[0]->type].vec_dot_type;
 
-#if defined(GGML_USE_CLBLAST)
-                    if (ggml_cl_can_mul_mat(node->src[0], node->src[1], node)) {
-                        cur = ggml_cl_mul_mat_get_wsize(node->src[0], node->src[1], node);
-                    } else
-#endif
 #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
                    if (ggml_compute_forward_mul_mat_use_blas(node)) {
                        if (node->src[0]->type != GGML_TYPE_F32) {
@@ -19670,6 +19497,59 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
     return cplan;
 }
 
+static enum ggml_status ggml_graph_compute_parallel(struct ggml_compute_state * workers, int n_threads) {
+    enum ggml_status compute_status = GGML_STATUS_SUCCESS;
+
+#ifdef GGML_USE_OPENMP
+    if (n_threads > 1) {
+        #pragma omp parallel num_threads(n_threads)
+        {
+            #pragma omp single
+            {
+                // update the number of threads from the actual number of threads that we got from OpenMP
+                n_threads = omp_get_num_threads();
+                workers[0].shared->n_threads = n_threads;
+                workers[0].shared->n_active = n_threads;
+            }
+            ggml_graph_compute_thread(&workers[omp_get_thread_num()]);
+        }
+    } else {
+        ggml_graph_compute_thread(&workers[0]);
+    }
+#else
+    // create thread pool
+    if (n_threads > 1) {
+        for (int j = 1; j < n_threads; ++j) {
+            const int rc = ggml_thread_create(&workers[j].thrd, NULL, ggml_graph_compute_thread, &workers[j]);
+            GGML_ASSERT(rc == 0);
+            UNUSED(rc);
+        }
+    }
+
+    // this is a work thread too
+    ggml_graph_compute_thread(&workers[0]);
+
+    // join or kill thread pool
+    if (n_threads > 1) {
+        for (int j = 1; j < n_threads; j++) {
+            const int rc = ggml_thread_join(workers[j].thrd, NULL);
+            GGML_ASSERT(rc == 0);
+            UNUSED(rc);
+        }
+    }
+#endif
+    // don't leave affinity set on the main thread
+    clear_numa_thread_affinity();
+
+    for (int j = 0; j < n_threads; j++) {
+        if (workers[j].ec != GGML_STATUS_SUCCESS) {
+            compute_status = workers[j].ec;
+            break;
+        }
+    }
+    return compute_status;
+}
+
 enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
     {
         GGML_ASSERT(cplan);
@@ -19680,7 +19560,11 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
         }
     }
 
-    const int n_threads = cplan->n_threads;
+    int n_threads = cplan->n_threads;
+
+#if defined(GGML_USE_OPENMP)
+    n_threads = MIN(n_threads, omp_get_max_threads());
+#endif
 
     struct ggml_compute_state_shared state_shared = {
         /*.cgraph =*/ cgraph,
@@ -19696,47 +19580,20 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
         /*.current_chunk; =*/ 0,
     };
     struct ggml_compute_state * workers = alloca(sizeof(struct ggml_compute_state)*n_threads);
-
-    // create thread pool
-    if (n_threads > 1) {
-        for (int j = 1; j < n_threads; ++j) {
-            workers[j] = (struct ggml_compute_state) {
-                .thrd = 0,
-                .ith = j,
-                .shared = &state_shared,
-                .ec = GGML_STATUS_SUCCESS,
-            };
-
-            const int rc = ggml_thread_create(&workers[j].thrd, NULL, ggml_graph_compute_thread, &workers[j]);
-            GGML_ASSERT(rc == 0);
-            UNUSED(rc);
-        }
-    }
-
-    workers[0].ith = 0;
-    workers[0].shared = &state_shared;
-    workers[0].ec = GGML_STATUS_SUCCESS;
-
     const int64_t perf_start_cycles = ggml_perf_cycles();
     const int64_t perf_start_time_us = ggml_perf_time_us();
 
-    // this is a work thread too
-    ggml_graph_compute_thread(&workers[0]);
-    enum ggml_status compute_status = workers[0].ec;
-
-    // don't leave affinity set on the main thread
-    clear_numa_thread_affinity();
-
-    // join or kill thread pool
-    if (n_threads > 1) {
-        for (int j = 1; j < n_threads; j++) {
-            const int rc = ggml_thread_join(workers[j].thrd, NULL);
-            GGML_ASSERT(rc == 0);
-            if (workers[j].ec != GGML_STATUS_SUCCESS)
-                compute_status = workers[j].ec;
-        }
+    for (int j = 0; j < n_threads; ++j) {
+        workers[j] = (struct ggml_compute_state) {
+            .thrd = 0,
+            .ith = j,
+            .shared = &state_shared,
+            .ec = GGML_STATUS_SUCCESS,
+        };
     }
 
+    enum ggml_status compute_status = ggml_graph_compute_parallel(workers, n_threads);
+
     // performance stats (graph)
     {
         int64_t perf_cycles_cur = ggml_perf_cycles() - perf_start_cycles;
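For embedders, the call sequence is unchanged; the difference is that graph execution now dispatches either through an OpenMP parallel region or the pthread pool, and with GGML_USE_OPENMP builds the requested thread count is capped at omp_get_max_threads(). A minimal, hypothetical driver sketch (graph construction omitted; names are placeholders):

#include <stdint.h>
#include "ggml.h"

// Hypothetical driver; `gf` is assumed to be an already-built ggml_cgraph and
// `work_buf` a caller-provided buffer at least cplan.work_size bytes large.
// The thread count is only a request: OpenMP builds clamp it to omp_get_max_threads().
enum ggml_status run_graph(struct ggml_cgraph * gf, int n_threads, uint8_t * work_buf) {
    struct ggml_cplan cplan = ggml_graph_plan(gf, n_threads);
    cplan.work_data = work_buf;
    return ggml_graph_compute(gf, &cplan);
}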
@@ -22819,7 +22676,7 @@ int ggml_cpu_has_wasm_simd(void) {
 }
 
 int ggml_cpu_has_blas(void) {
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUDA) || defined(GGML_USE_VULKAN) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_SYCL)
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUDA) || defined(GGML_USE_VULKAN) || defined(GGML_USE_SYCL)
     return 1;
 #else
     return 0;
@@ -22834,14 +22691,6 @@ int ggml_cpu_has_cuda(void) {
 #endif
 }
 
-int ggml_cpu_has_clblast(void) {
-#if defined(GGML_USE_CLBLAST)
-    return 1;
-#else
-    return 0;
-#endif
-}
-
 int ggml_cpu_has_vulkan(void) {
 #if defined(GGML_USE_VULKAN)
     return 1;
@@ -22875,8 +22724,7 @@ int ggml_cpu_has_rpc(void) {
 }
 
 int ggml_cpu_has_gpublas(void) {
-    return ggml_cpu_has_cuda() || ggml_cpu_has_clblast() || ggml_cpu_has_vulkan() || ggml_cpu_has_kompute() ||
-           ggml_cpu_has_sycl();
+    return ggml_cpu_has_cuda() || ggml_cpu_has_vulkan() || ggml_cpu_has_kompute() || ggml_cpu_has_sycl();
 }
 
 int ggml_cpu_has_sse3(void) {
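Because ggml_cpu_has_clblast() is removed, any code that probed for the OpenCL backend at runtime needs updating; the remaining feature probes keep the same pattern. A hedged sketch of such a probe (the set of backends printed is just an example):

#include <stdio.h>
#include "ggml.h"

// Hypothetical runtime capability report; ggml_cpu_has_clblast() no longer exists,
// so only the surviving probes are queried.
static void print_backends(void) {
    printf("cuda=%d vulkan=%d kompute=%d sycl=%d blas=%d gpublas=%d\n",
           ggml_cpu_has_cuda(), ggml_cpu_has_vulkan(), ggml_cpu_has_kompute(),
           ggml_cpu_has_sycl(), ggml_cpu_has_blas(), ggml_cpu_has_gpublas());
}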