llama_cpp 0.16.0 → 0.16.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (134)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +6 -0
  3. data/ext/llama_cpp/extconf.rb +2 -0
  4. data/ext/llama_cpp/llama_cpp.cpp +2 -0
  5. data/lib/llama_cpp/version.rb +2 -2
  6. data/sig/llama_cpp.rbs +2 -0
  7. data/vendor/tmp/llama.cpp/Makefile +110 -53
  8. data/vendor/tmp/llama.cpp/ggml-alloc.c +78 -22
  9. data/vendor/tmp/llama.cpp/ggml-backend-impl.h +20 -8
  10. data/vendor/tmp/llama.cpp/ggml-backend.c +178 -64
  11. data/vendor/tmp/llama.cpp/ggml-backend.h +3 -3
  12. data/vendor/tmp/llama.cpp/ggml-blas.cpp +363 -0
  13. data/vendor/tmp/llama.cpp/ggml-blas.h +23 -0
  14. data/vendor/tmp/llama.cpp/ggml-common.h +6 -0
  15. data/vendor/tmp/llama.cpp/ggml-cuda/argsort.cu +1 -0
  16. data/vendor/tmp/llama.cpp/ggml-cuda/dmmv.cu +21 -9
  17. data/vendor/tmp/llama.cpp/ggml-cuda/fattn-tile-f16.cu +1 -1
  18. data/vendor/tmp/llama.cpp/ggml-cuda/mmq.cu +15 -1491
  19. data/vendor/tmp/llama.cpp/ggml-cuda/mmvq.cu +76 -61
  20. data/vendor/tmp/llama.cpp/ggml-cuda/quantize.cu +77 -10
  21. data/vendor/tmp/llama.cpp/ggml-cuda/softmax.cu +1 -0
  22. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu +1 -1
  23. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu +1 -1
  24. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu +1 -1
  25. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu +1 -1
  26. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu +1 -1
  27. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu +1 -1
  28. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu +1 -1
  29. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu +1 -1
  30. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu +1 -1
  31. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu +1 -1
  32. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu +1 -1
  33. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu +1 -1
  34. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu +1 -1
  35. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu +1 -1
  36. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu +1 -1
  37. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu +1 -1
  38. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu +1 -1
  39. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu +1 -1
  40. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu +1 -1
  41. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu +1 -1
  42. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu +1 -1
  43. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu +1 -1
  44. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu +1 -1
  45. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu +1 -1
  46. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu +1 -1
  47. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu +1 -1
  48. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu +1 -1
  49. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu +1 -1
  50. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu +1 -1
  51. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu +1 -1
  52. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu +1 -1
  53. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu +1 -1
  54. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu +1 -1
  55. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu +1 -1
  56. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu +1 -1
  57. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu +1 -1
  58. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu +1 -1
  59. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu +1 -1
  60. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu +1 -1
  61. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu +1 -1
  62. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu +1 -1
  63. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu +1 -1
  64. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu +1 -1
  65. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu +1 -1
  66. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu +1 -1
  67. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu +1 -1
  68. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu +1 -1
  69. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu +1 -1
  70. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu +1 -1
  71. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu +1 -1
  72. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu +1 -1
  73. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu +1 -1
  74. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu +1 -1
  75. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu +1 -1
  76. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu +1 -1
  77. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu +1 -1
  78. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu +1 -1
  79. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu +1 -1
  80. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu +1 -1
  81. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu +1 -1
  82. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu +1 -1
  83. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu +1 -1
  84. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu +1 -1
  85. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu +1 -1
  86. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu +1 -1
  87. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu +1 -1
  88. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu +1 -1
  89. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu +1 -1
  90. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu +1 -1
  91. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu +1 -1
  92. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu +1 -1
  93. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu +1 -1
  94. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu +1 -1
  95. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu +1 -1
  96. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu +1 -1
  97. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu +1 -1
  98. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu +1 -1
  99. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu +1 -1
  100. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu +1 -1
  101. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu +1 -1
  102. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu +1 -1
  103. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu +1 -1
  104. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu +1 -1
  105. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu +1 -1
  106. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu +1 -1
  107. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu +1 -1
  108. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb16.cu +1 -1
  109. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb32.cu +1 -1
  110. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb16.cu +1 -1
  111. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb32.cu +1 -1
  112. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb8.cu +1 -1
  113. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q2_k.cu +5 -0
  114. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q3_k.cu +5 -0
  115. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_0.cu +5 -0
  116. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_1.cu +5 -0
  117. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_k.cu +5 -0
  118. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_0.cu +5 -0
  119. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_1.cu +5 -0
  120. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_k.cu +5 -0
  121. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q6_k.cu +5 -0
  122. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q8_0.cu +5 -0
  123. data/vendor/tmp/llama.cpp/ggml-cuda/unary.cu +20 -0
  124. data/vendor/tmp/llama.cpp/ggml-cuda.cu +95 -129
  125. data/vendor/tmp/llama.cpp/ggml-kompute.cpp +8 -7
  126. data/vendor/tmp/llama.cpp/ggml-metal.m +11 -9
  127. data/vendor/tmp/llama.cpp/ggml-rpc.cpp +13 -12
  128. data/vendor/tmp/llama.cpp/ggml-sycl.cpp +19 -23
  129. data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +1230 -1129
  130. data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +181 -148
  131. data/vendor/tmp/llama.cpp/ggml.c +102 -275
  132. data/vendor/tmp/llama.cpp/llama.cpp +103 -47
  133. data/vendor/tmp/llama.cpp/llama.h +4 -0
  134. metadata +15 -3
@@ -297,12 +297,6 @@ inline static void * ggml_calloc(size_t num, size_t size) {
297
297
 
298
298
  #if defined(GGML_USE_ACCELERATE)
299
299
  #include <Accelerate/Accelerate.h>
300
- #elif defined(GGML_USE_OPENBLAS)
301
- #if defined(GGML_BLAS_USE_MKL)
302
- #include <mkl.h>
303
- #else
304
- #include <cblas.h>
305
- #endif
306
300
  #endif
307
301
 
308
302
  // floating point type used to accumulate sums
@@ -3212,35 +3206,42 @@ GGML_CALL bool ggml_is_transposed(const struct ggml_tensor * tensor) {
3212
3206
  return tensor->nb[0] > tensor->nb[1];
3213
3207
  }
3214
3208
 
3215
- GGML_CALL bool ggml_is_contiguous(const struct ggml_tensor * tensor) {
3216
- static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
3209
+ static bool ggml_is_contiguous_n(const struct ggml_tensor * tensor, int n) {
3210
+ size_t next_nb = ggml_type_size(tensor->type);
3211
+ if (tensor->ne[0] != ggml_blck_size(tensor->type) && tensor->nb[0] != next_nb) {
3212
+ return false;
3213
+ }
3214
+ next_nb *= tensor->ne[0]/ggml_blck_size(tensor->type);
3215
+ for (int i = 1; i < GGML_MAX_DIMS; i++) {
3216
+ if (tensor->ne[i] != 1) {
3217
+ if (i > n) {
3218
+ if (tensor->nb[i] != next_nb) {
3219
+ return false;
3220
+ }
3221
+ next_nb *= tensor->ne[i];
3222
+ } else {
3223
+ // this dimension does not need to be contiguous
3224
+ next_nb = tensor->ne[i]*tensor->nb[i];
3225
+ }
3226
+ }
3227
+ }
3228
+ return true;
3229
+ }
3217
3230
 
3218
- return
3219
- tensor->nb[0] == ggml_type_size(tensor->type) &&
3220
- tensor->nb[1] == (tensor->nb[0]*tensor->ne[0])/ggml_blck_size(tensor->type) &&
3221
- tensor->nb[2] == tensor->nb[1]*tensor->ne[1] &&
3222
- tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
3231
+ GGML_CALL bool ggml_is_contiguous(const struct ggml_tensor * tensor) {
3232
+ return ggml_is_contiguous_0(tensor);
3223
3233
  }
3224
3234
 
3225
3235
  GGML_CALL bool ggml_is_contiguous_0(const struct ggml_tensor * tensor) {
3226
- return ggml_is_contiguous(tensor);
3236
+ return ggml_is_contiguous_n(tensor, 0);
3227
3237
  }
3228
3238
 
3229
3239
  GGML_CALL bool ggml_is_contiguous_1(const struct ggml_tensor * tensor) {
3230
- static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
3231
-
3232
- return
3233
- tensor->nb[0] == ggml_type_size(tensor->type) &&
3234
- tensor->nb[2] == tensor->nb[1]*tensor->ne[1] &&
3235
- tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
3240
+ return ggml_is_contiguous_n(tensor, 1);
3236
3241
  }
3237
3242
 
3238
3243
  GGML_CALL bool ggml_is_contiguous_2(const struct ggml_tensor * tensor) {
3239
- static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
3240
-
3241
- return
3242
- tensor->nb[0] == ggml_type_size(tensor->type) &&
3243
- tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
3244
+ return ggml_is_contiguous_n(tensor, 2);
3244
3245
  }
3245
3246
 
3246
3247
  GGML_CALL bool ggml_is_permuted(const struct ggml_tensor * tensor) {
@@ -3272,20 +3273,20 @@ bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor
3272
3273
  static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
3273
3274
 
3274
3275
  return
3275
- (t0->ne[0] == t1->ne[0] ) &&
3276
- (t0->ne[1] == t1->ne[1] ) &&
3277
- (t0->ne[2] == t1->ne[2] ) &&
3278
- (t0->ne[3] == t1->ne[3] );
3276
+ (t0->ne[0] == t1->ne[0]) &&
3277
+ (t0->ne[1] == t1->ne[1]) &&
3278
+ (t0->ne[2] == t1->ne[2]) &&
3279
+ (t0->ne[3] == t1->ne[3]);
3279
3280
  }
3280
3281
 
3281
3282
  bool ggml_are_same_stride(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
3282
3283
  static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
3283
3284
 
3284
3285
  return
3285
- (t0->nb[0] == t1->nb[0] ) &&
3286
- (t0->nb[1] == t1->nb[1] ) &&
3287
- (t0->nb[2] == t1->nb[2] ) &&
3288
- (t0->nb[3] == t1->nb[3] );
3286
+ (t0->nb[0] == t1->nb[0]) &&
3287
+ (t0->nb[1] == t1->nb[1]) &&
3288
+ (t0->nb[2] == t1->nb[2]) &&
3289
+ (t0->nb[3] == t1->nb[3]);
3289
3290
  }
3290
3291
 
3291
3292
  // check if t1 can be represented as a repeatition of t0
@@ -4078,32 +4079,26 @@ float ggml_get_f32_1d(const struct ggml_tensor * tensor, int i) {
4078
4079
  switch (tensor->type) {
4079
4080
  case GGML_TYPE_I8:
4080
4081
  {
4081
- GGML_ASSERT(tensor->nb[0] == sizeof(int8_t));
4082
4082
  return ((int8_t *)(tensor->data))[i];
4083
4083
  }
4084
4084
  case GGML_TYPE_I16:
4085
4085
  {
4086
- GGML_ASSERT(tensor->nb[0] == sizeof(int16_t));
4087
4086
  return ((int16_t *)(tensor->data))[i];
4088
4087
  }
4089
4088
  case GGML_TYPE_I32:
4090
4089
  {
4091
- GGML_ASSERT(tensor->nb[0] == sizeof(int32_t));
4092
4090
  return ((int32_t *)(tensor->data))[i];
4093
4091
  }
4094
4092
  case GGML_TYPE_F16:
4095
4093
  {
4096
- GGML_ASSERT(tensor->nb[0] == sizeof(ggml_fp16_t));
4097
4094
  return GGML_FP16_TO_FP32(((ggml_fp16_t *)(tensor->data))[i]);
4098
4095
  }
4099
4096
  case GGML_TYPE_BF16:
4100
4097
  {
4101
- GGML_ASSERT(tensor->nb[0] == sizeof(ggml_bf16_t));
4102
4098
  return GGML_BF16_TO_FP32(((ggml_bf16_t *)(tensor->data))[i]);
4103
4099
  }
4104
4100
  case GGML_TYPE_F32:
4105
4101
  {
4106
- GGML_ASSERT(tensor->nb[0] == sizeof(float));
4107
4102
  return ((float *)(tensor->data))[i];
4108
4103
  }
4109
4104
  default:
@@ -4125,32 +4120,26 @@ void ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value) {
4125
4120
  switch (tensor->type) {
4126
4121
  case GGML_TYPE_I8:
4127
4122
  {
4128
- GGML_ASSERT(tensor->nb[0] == sizeof(int8_t));
4129
4123
  ((int8_t *)(tensor->data))[i] = value;
4130
4124
  } break;
4131
4125
  case GGML_TYPE_I16:
4132
4126
  {
4133
- GGML_ASSERT(tensor->nb[0] == sizeof(int16_t));
4134
4127
  ((int16_t *)(tensor->data))[i] = value;
4135
4128
  } break;
4136
4129
  case GGML_TYPE_I32:
4137
4130
  {
4138
- GGML_ASSERT(tensor->nb[0] == sizeof(int32_t));
4139
4131
  ((int32_t *)(tensor->data))[i] = value;
4140
4132
  } break;
4141
4133
  case GGML_TYPE_F16:
4142
4134
  {
4143
- GGML_ASSERT(tensor->nb[0] == sizeof(ggml_fp16_t));
4144
4135
  ((ggml_fp16_t *)(tensor->data))[i] = GGML_FP32_TO_FP16(value);
4145
4136
  } break;
4146
4137
  case GGML_TYPE_BF16:
4147
4138
  {
4148
- GGML_ASSERT(tensor->nb[0] == sizeof(ggml_bf16_t));
4149
4139
  ((ggml_bf16_t *)(tensor->data))[i] = GGML_FP32_TO_BF16(value);
4150
4140
  } break;
4151
4141
  case GGML_TYPE_F32:
4152
4142
  {
4153
- GGML_ASSERT(tensor->nb[0] == sizeof(float));
4154
4143
  ((float *)(tensor->data))[i] = value;
4155
4144
  } break;
4156
4145
  default:
@@ -7343,13 +7332,15 @@ struct ggml_tensor * ggml_add_rel_pos_inplace(
7343
7332
  return ggml_add_rel_pos_impl(ctx, a, pw, ph, true);
7344
7333
  }
7345
7334
 
7346
- // gmml_unary
7335
+ // ggml_unary
7347
7336
 
7348
7337
  static struct ggml_tensor * ggml_unary_impl(
7349
7338
  struct ggml_context * ctx,
7350
7339
  struct ggml_tensor * a,
7351
7340
  enum ggml_unary_op op,
7352
7341
  bool inplace) {
7342
+ GGML_ASSERT(ggml_is_contiguous_1(a));
7343
+
7353
7344
  bool is_node = false;
7354
7345
 
7355
7346
  if (!inplace && (a->grad)) {
@@ -11014,6 +11005,8 @@ static void ggml_compute_forward_abs_f32(
11014
11005
  const struct ggml_tensor * src0 = dst->src[0];
11015
11006
 
11016
11007
  assert(params->ith == 0);
11008
+ assert(ggml_is_contiguous_1(src0));
11009
+ assert(ggml_is_contiguous_1(dst));
11017
11010
  assert(ggml_are_same_shape(src0, dst));
11018
11011
 
11019
11012
  if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
@@ -11023,9 +11016,6 @@ static void ggml_compute_forward_abs_f32(
11023
11016
  const int n = ggml_nrows(src0);
11024
11017
  const int nc = src0->ne[0];
11025
11018
 
11026
- assert(dst->nb[0] == sizeof(float));
11027
- assert(src0->nb[0] == sizeof(float));
11028
-
11029
11019
  for (int i = 0; i < n; i++) {
11030
11020
  ggml_vec_abs_f32(nc,
11031
11021
  (float *) ((char *) dst->data + i*( dst->nb[1])),
@@ -11060,6 +11050,8 @@ static void ggml_compute_forward_sgn_f32(
11060
11050
  const struct ggml_tensor * src0 = dst->src[0];
11061
11051
 
11062
11052
  assert(params->ith == 0);
11053
+ assert(ggml_is_contiguous_1(src0));
11054
+ assert(ggml_is_contiguous_1(dst));
11063
11055
  assert(ggml_are_same_shape(src0, dst));
11064
11056
 
11065
11057
  if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
@@ -11069,9 +11061,6 @@ static void ggml_compute_forward_sgn_f32(
11069
11061
  const int n = ggml_nrows(src0);
11070
11062
  const int nc = src0->ne[0];
11071
11063
 
11072
- assert(dst->nb[0] == sizeof(float));
11073
- assert(src0->nb[0] == sizeof(float));
11074
-
11075
11064
  for (int i = 0; i < n; i++) {
11076
11065
  ggml_vec_sgn_f32(nc,
11077
11066
  (float *) ((char *) dst->data + i*( dst->nb[1])),
@@ -11106,6 +11095,8 @@ static void ggml_compute_forward_neg_f32(
11106
11095
  const struct ggml_tensor * src0 = dst->src[0];
11107
11096
 
11108
11097
  assert(params->ith == 0);
11098
+ assert(ggml_is_contiguous_1(src0));
11099
+ assert(ggml_is_contiguous_1(dst));
11109
11100
  assert(ggml_are_same_shape(src0, dst));
11110
11101
 
11111
11102
  if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
@@ -11115,9 +11106,6 @@ static void ggml_compute_forward_neg_f32(
11115
11106
  const int n = ggml_nrows(src0);
11116
11107
  const int nc = src0->ne[0];
11117
11108
 
11118
- assert(dst->nb[0] == sizeof(float));
11119
- assert(src0->nb[0] == sizeof(float));
11120
-
11121
11109
  for (int i = 0; i < n; i++) {
11122
11110
  ggml_vec_neg_f32(nc,
11123
11111
  (float *) ((char *) dst->data + i*( dst->nb[1])),
@@ -11152,6 +11140,8 @@ static void ggml_compute_forward_step_f32(
11152
11140
  const struct ggml_tensor * src0 = dst->src[0];
11153
11141
 
11154
11142
  assert(params->ith == 0);
11143
+ assert(ggml_is_contiguous_1(src0));
11144
+ assert(ggml_is_contiguous_1(dst));
11155
11145
  assert(ggml_are_same_shape(src0, dst));
11156
11146
 
11157
11147
  if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
@@ -11161,9 +11151,6 @@ static void ggml_compute_forward_step_f32(
11161
11151
  const int n = ggml_nrows(src0);
11162
11152
  const int nc = src0->ne[0];
11163
11153
 
11164
- assert(dst->nb[0] == sizeof(float));
11165
- assert(src0->nb[0] == sizeof(float));
11166
-
11167
11154
  for (int i = 0; i < n; i++) {
11168
11155
  ggml_vec_step_f32(nc,
11169
11156
  (float *) ((char *) dst->data + i*( dst->nb[1])),
@@ -11198,6 +11185,8 @@ static void ggml_compute_forward_tanh_f32(
11198
11185
  const struct ggml_tensor * src0 = dst->src[0];
11199
11186
 
11200
11187
  assert(params->ith == 0);
11188
+ assert(ggml_is_contiguous_1(src0));
11189
+ assert(ggml_is_contiguous_1(dst));
11201
11190
  assert(ggml_are_same_shape(src0, dst));
11202
11191
 
11203
11192
  if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
@@ -11207,9 +11196,6 @@ static void ggml_compute_forward_tanh_f32(
11207
11196
  const int n = ggml_nrows(src0);
11208
11197
  const int nc = src0->ne[0];
11209
11198
 
11210
- assert(dst->nb[0] == sizeof(float));
11211
- assert(src0->nb[0] == sizeof(float));
11212
-
11213
11199
  for (int i = 0; i < n; i++) {
11214
11200
  ggml_vec_tanh_f32(nc,
11215
11201
  (float *) ((char *) dst->data + i*( dst->nb[1])),
@@ -11244,6 +11230,8 @@ static void ggml_compute_forward_elu_f32(
11244
11230
  const struct ggml_tensor * src0 = dst->src[0];
11245
11231
 
11246
11232
  assert(params->ith == 0);
11233
+ assert(ggml_is_contiguous_1(src0));
11234
+ assert(ggml_is_contiguous_1(dst));
11247
11235
  assert(ggml_are_same_shape(src0, dst));
11248
11236
 
11249
11237
  if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
@@ -11253,9 +11241,6 @@ static void ggml_compute_forward_elu_f32(
11253
11241
  const int n = ggml_nrows(src0);
11254
11242
  const int nc = src0->ne[0];
11255
11243
 
11256
- assert(dst->nb[0] == sizeof(float));
11257
- assert(src0->nb[0] == sizeof(float));
11258
-
11259
11244
  for (int i = 0; i < n; i++) {
11260
11245
  ggml_vec_elu_f32(nc,
11261
11246
  (float *) ((char *) dst->data + i*( dst->nb[1])),
@@ -11290,6 +11275,8 @@ static void ggml_compute_forward_relu_f32(
11290
11275
  const struct ggml_tensor * src0 = dst->src[0];
11291
11276
 
11292
11277
  assert(params->ith == 0);
11278
+ assert(ggml_is_contiguous_1(src0));
11279
+ assert(ggml_is_contiguous_1(dst));
11293
11280
  assert(ggml_are_same_shape(src0, dst));
11294
11281
 
11295
11282
  if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
@@ -11299,9 +11286,6 @@ static void ggml_compute_forward_relu_f32(
11299
11286
  const int n = ggml_nrows(src0);
11300
11287
  const int nc = src0->ne[0];
11301
11288
 
11302
- assert(dst->nb[0] == sizeof(float));
11303
- assert(src0->nb[0] == sizeof(float));
11304
-
11305
11289
  for (int i = 0; i < n; i++) {
11306
11290
  ggml_vec_relu_f32(nc,
11307
11291
  (float *) ((char *) dst->data + i*( dst->nb[1])),
@@ -11336,6 +11320,8 @@ static void ggml_compute_forward_sigmoid_f32(
11336
11320
  const struct ggml_tensor * src0 = dst->src[0];
11337
11321
 
11338
11322
  assert(params->ith == 0);
11323
+ assert(ggml_is_contiguous_1(src0));
11324
+ assert(ggml_is_contiguous_1(dst));
11339
11325
  assert(ggml_are_same_shape(src0, dst));
11340
11326
 
11341
11327
  if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
@@ -11345,9 +11331,6 @@ static void ggml_compute_forward_sigmoid_f32(
11345
11331
  const int n = ggml_nrows(src0);
11346
11332
  const int nc = src0->ne[0];
11347
11333
 
11348
- assert(dst->nb[0] == sizeof(float));
11349
- assert(src0->nb[0] == sizeof(float));
11350
-
11351
11334
  for (int i = 0; i < n; i++) {
11352
11335
  ggml_vec_sigmoid_f32(nc,
11353
11336
  (float *) ((char *) dst->data + i*( dst->nb[1])),
@@ -11381,9 +11364,9 @@ static void ggml_compute_forward_gelu_f32(
11381
11364
 
11382
11365
  const struct ggml_tensor * src0 = dst->src[0];
11383
11366
 
11384
- GGML_ASSERT(ggml_is_contiguous_1(src0));
11385
- GGML_ASSERT(ggml_is_contiguous_1(dst));
11386
- GGML_ASSERT(ggml_are_same_shape(src0, dst));
11367
+ assert(ggml_is_contiguous_1(src0));
11368
+ assert(ggml_is_contiguous_1(dst));
11369
+ assert(ggml_are_same_shape(src0, dst));
11387
11370
 
11388
11371
  if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
11389
11372
  return;
@@ -11444,9 +11427,9 @@ static void ggml_compute_forward_gelu_quick_f32(
11444
11427
 
11445
11428
  const struct ggml_tensor * src0 = dst->src[0];
11446
11429
 
11447
- GGML_ASSERT(ggml_is_contiguous_1(src0));
11448
- GGML_ASSERT(ggml_is_contiguous_1(dst));
11449
- GGML_ASSERT(ggml_are_same_shape(src0, dst));
11430
+ assert(ggml_is_contiguous_1(src0));
11431
+ assert(ggml_is_contiguous_1(dst));
11432
+ assert(ggml_are_same_shape(src0, dst));
11450
11433
 
11451
11434
  if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
11452
11435
  return;
@@ -11507,9 +11490,9 @@ static void ggml_compute_forward_silu_f32(
11507
11490
 
11508
11491
  const struct ggml_tensor * src0 = dst->src[0];
11509
11492
 
11510
- GGML_ASSERT(ggml_is_contiguous_1(src0));
11511
- GGML_ASSERT(ggml_is_contiguous_1(dst));
11512
- GGML_ASSERT(ggml_are_same_shape(src0, dst));
11493
+ assert(ggml_is_contiguous_1(src0));
11494
+ assert(ggml_is_contiguous_1(dst));
11495
+ assert(ggml_are_same_shape(src0, dst));
11513
11496
 
11514
11497
  if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
11515
11498
  return;
@@ -11570,6 +11553,8 @@ static void ggml_compute_forward_leaky_relu_f32(
11570
11553
  const struct ggml_tensor * src0 = dst->src[0];
11571
11554
 
11572
11555
  assert(params->ith == 0);
11556
+ assert(ggml_is_contiguous_1(src0));
11557
+ assert(ggml_is_contiguous_1(dst));
11573
11558
  assert(ggml_are_same_shape(src0, dst));
11574
11559
 
11575
11560
  if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
@@ -11619,11 +11604,11 @@ static void ggml_compute_forward_silu_back_f32(
11619
11604
  const struct ggml_tensor * src0 = dst->src[0];
11620
11605
  const struct ggml_tensor * grad = dst->src[1];
11621
11606
 
11622
- GGML_ASSERT(ggml_is_contiguous_1(grad));
11623
- GGML_ASSERT(ggml_is_contiguous_1(src0));
11624
- GGML_ASSERT(ggml_is_contiguous_1(dst));
11625
- GGML_ASSERT(ggml_are_same_shape(src0, dst));
11626
- GGML_ASSERT(ggml_are_same_shape(src0, grad));
11607
+ assert(ggml_is_contiguous_1(grad));
11608
+ assert(ggml_is_contiguous_1(src0));
11609
+ assert(ggml_is_contiguous_1(dst));
11610
+ assert(ggml_are_same_shape(src0, dst));
11611
+ assert(ggml_are_same_shape(src0, grad));
11627
11612
 
11628
11613
  if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
11629
11614
  return;
@@ -11685,6 +11670,8 @@ static void ggml_compute_forward_hardswish_f32(
11685
11670
  const struct ggml_tensor * src0 = dst->src[0];
11686
11671
 
11687
11672
  assert(params->ith == 0);
11673
+ assert(ggml_is_contiguous_1(src0));
11674
+ assert(ggml_is_contiguous_1(dst));
11688
11675
  assert(ggml_are_same_shape(src0, dst));
11689
11676
 
11690
11677
  if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
@@ -11694,9 +11681,6 @@ static void ggml_compute_forward_hardswish_f32(
11694
11681
  const int n = ggml_nrows(src0);
11695
11682
  const int nc = src0->ne[0];
11696
11683
 
11697
- assert(dst->nb[0] == sizeof(float));
11698
- assert(src0->nb[0] == sizeof(float));
11699
-
11700
11684
  for (int i = 0; i < n; i++) {
11701
11685
  ggml_vec_hardswish_f32(nc,
11702
11686
  (float *) ((char *) dst->data + i*( dst->nb[1])),
@@ -11728,6 +11712,8 @@ static void ggml_compute_forward_hardsigmoid_f32(
11728
11712
  const struct ggml_tensor * src0 = dst->src[0];
11729
11713
 
11730
11714
  assert(params->ith == 0);
11715
+ assert(ggml_is_contiguous_1(src0));
11716
+ assert(ggml_is_contiguous_1(dst));
11731
11717
  assert(ggml_are_same_shape(src0, dst));
11732
11718
 
11733
11719
  if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
@@ -11737,9 +11723,6 @@ static void ggml_compute_forward_hardsigmoid_f32(
11737
11723
  const int n = ggml_nrows(src0);
11738
11724
  const int nc = src0->ne[0];
11739
11725
 
11740
- assert(dst->nb[0] == sizeof(float));
11741
- assert(src0->nb[0] == sizeof(float));
11742
-
11743
11726
  for (int i = 0; i < n; i++) {
11744
11727
  ggml_vec_hardsigmoid_f32(nc,
11745
11728
  (float *) ((char *) dst->data + i*( dst->nb[1])),
@@ -12190,39 +12173,6 @@ static void ggml_compute_forward_group_norm(
12190
12173
 
12191
12174
  // ggml_compute_forward_mul_mat
12192
12175
 
12193
- #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
12194
- // helper function to determine if it is better to use BLAS or not
12195
- // for large matrices, BLAS is faster
12196
- static bool ggml_compute_forward_mul_mat_use_blas(struct ggml_tensor * dst) {
12197
- const struct ggml_tensor * src0 = dst->src[0];
12198
- const struct ggml_tensor * src1 = dst->src[1];
12199
-
12200
- //const int64_t ne00 = src0->ne[0];
12201
- //const int64_t ne01 = src0->ne[1];
12202
-
12203
- const int64_t ne10 = src1->ne[0];
12204
-
12205
- const int64_t ne0 = dst->ne[0];
12206
- const int64_t ne1 = dst->ne[1];
12207
-
12208
- // NOTE: with GGML_OP_MUL_MAT_ID we don't want to go through the BLAS branch because it will dequantize (to_float)
12209
- // all the experts for each batch element and the processing would become incredibly slow
12210
- // TODO: find the optimal values for these
12211
- if (dst->op != GGML_OP_MUL_MAT_ID &&
12212
- ggml_is_contiguous(src0) &&
12213
- ggml_is_contiguous(src1) &&
12214
- //src0->type == GGML_TYPE_F32 &&
12215
- src1->type == GGML_TYPE_F32 &&
12216
- (ne0 >= 32 && ne1 >= 32 && ne10 >= 32)) {
12217
-
12218
- /*printf("BLAS: %d %d %d %d %d\n", ne0, ne1, ne10, ne00, ne01);*/
12219
- return true;
12220
- }
12221
-
12222
- return false;
12223
- }
12224
- #endif
12225
-
12226
12176
  static void ggml_compute_forward_mul_mat_one_chunk(
12227
12177
  const struct ggml_compute_params * params,
12228
12178
  struct ggml_tensor * dst,
@@ -12360,73 +12310,6 @@ static void ggml_compute_forward_mul_mat(
12360
12310
  // nb01 >= nb00 - src0 is not transposed
12361
12311
  // compute by src0 rows
12362
12312
 
12363
- #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
12364
- if (ggml_compute_forward_mul_mat_use_blas(dst)) {
12365
- const int64_t ne_plane = ne01*ne00;
12366
- const size_t desired_wsize = ne13*ne12*ne_plane*sizeof(float);
12367
- UNUSED(desired_wsize);
12368
-
12369
- if (params->type == GGML_TASK_TYPE_INIT) {
12370
- if (type != GGML_TYPE_F32) {
12371
- assert(params->wsize >= desired_wsize);
12372
- // parallelize by src0 rows
12373
- for (int64_t i13 = 0; i13 < ne13; i13++) {
12374
- for (int64_t i12 = 0; i12 < ne12; i12++) {
12375
- // broadcast src0 into src1 across 2nd,3rd dimension
12376
- const int64_t i03 = i13/r3;
12377
- const int64_t i02 = i12/r2;
12378
-
12379
- const void * x = (char *) src0->data + i02*nb02 + i03*nb03;
12380
- float * const wdata = (float *) params->wdata + i13*ne12*ne_plane + i12*ne_plane;
12381
- ggml_to_float_t const to_float = type_traits[type].to_float;
12382
-
12383
- for (int64_t i01 = ith; i01 < ne01; i01 += nth) {
12384
- to_float((const char *) x + i01*nb01, wdata + i01*ne00, ne00);
12385
- }
12386
- }
12387
- }
12388
- }
12389
- return;
12390
- }
12391
-
12392
- if (params->type == GGML_TASK_TYPE_FINALIZE) {
12393
- return;
12394
- }
12395
-
12396
- // perform sgemm, parallelization controlled by blas lib
12397
- if (ith != 0) {
12398
- return;
12399
- }
12400
-
12401
- //const int64_t tgemm0 = ggml_perf_time_us();
12402
- for (int64_t i13 = 0; i13 < ne13; i13++) {
12403
- for (int64_t i12 = 0; i12 < ne12; i12++) {
12404
- const int64_t i03 = i13/r3;
12405
- const int64_t i02 = i12/r2;
12406
-
12407
- const void * x = (char *) src0->data + i02*nb02 + i03*nb03;
12408
- const float * y = (float *) ((char *) src1->data + i12*nb12 + i13*nb13);
12409
- float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
12410
-
12411
- if (type != GGML_TYPE_F32) {
12412
- x = (float *) params->wdata + i13*ne12*ne_plane + i12*ne_plane;
12413
- }
12414
-
12415
- cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
12416
- ne1, ne01, ne10,
12417
- 1.0f, y, ne10,
12418
- x, ne00,
12419
- 0.0f, d, ne01);
12420
- }
12421
- }
12422
- //printf("cblas_sgemm = %.3f ms, %lld flops\n", (ggml_perf_time_us() - tgemm0)/1000.0, ne13*ne12*ne1*ne01*ne10*2);
12423
-
12424
- //printf("CBLAS = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3);
12425
-
12426
- return;
12427
- }
12428
- #endif
12429
-
12430
12313
  #if GGML_USE_LLAMAFILE
12431
12314
  const bool src1_cont = ggml_is_contiguous(src1);
12432
12315
 
@@ -12807,19 +12690,7 @@ static void ggml_compute_forward_out_prod_f32(
12807
12690
  // nb01 >= nb00 - src0 is not transposed
12808
12691
  // compute by src0 rows
12809
12692
 
12810
- #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
12811
- bool use_blas = ggml_is_matrix(src0) &&
12812
- ggml_is_matrix(src1) &&
12813
- ggml_is_contiguous(src0) &&
12814
- (ggml_is_contiguous(src1) || ggml_is_transposed(src1));
12815
- #endif
12816
-
12817
12693
  if (params->type == GGML_TASK_TYPE_INIT) {
12818
- #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) // gemm beta will zero dst
12819
- if (use_blas) {
12820
- return;
12821
- }
12822
- #endif
12823
12694
  if (ith != 0) {
12824
12695
  return;
12825
12696
  }
@@ -12831,50 +12702,6 @@ static void ggml_compute_forward_out_prod_f32(
12831
12702
  return;
12832
12703
  }
12833
12704
 
12834
- #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
12835
- if (use_blas) {
12836
- if (params->ith != 0) { // All threads other than the first do no work.
12837
- return;
12838
- }
12839
- // Arguments to ggml_compute_forward_out_prod (expressed as major,minor)
12840
- // src0: (k,n)
12841
- // src1: (k,m)
12842
- // dst: (m,n)
12843
- //
12844
- // Arguments to sgemm (see https://github.com/Reference-LAPACK/lapack/blob/master/BLAS/SRC/sgemm.f)
12845
- // Also expressed as (major,minor)
12846
- // a: (m,k): so src1 transposed
12847
- // b: (k,n): so src0
12848
- // c: (m,n)
12849
- //
12850
- // However, if ggml_is_transposed(src1) is true, then
12851
- // src1->data already contains a transposed version, so sgemm mustn't
12852
- // transpose it further.
12853
-
12854
- int n = src0->ne[0];
12855
- int k = src0->ne[1];
12856
- int m = src1->ne[0];
12857
-
12858
- int transposeA, lda;
12859
-
12860
- if (!ggml_is_transposed(src1)) {
12861
- transposeA = CblasTrans;
12862
- lda = m;
12863
- } else {
12864
- transposeA = CblasNoTrans;
12865
- lda = k;
12866
- }
12867
-
12868
- float * a = (float *) ((char *) src1->data);
12869
- float * b = (float *) ((char *) src0->data);
12870
- float * c = (float *) ((char *) dst->data);
12871
-
12872
- cblas_sgemm(CblasRowMajor, transposeA, CblasNoTrans, m, n, k, 1.0, a, lda, b, n, 0.0, c, n);
12873
-
12874
- return;
12875
- }
12876
- #endif
12877
-
12878
12705
  // dst[:,:,:,:] = 0
12879
12706
  // for i2,i3:
12880
12707
  // for i1:
@@ -13004,8 +12831,6 @@ static void ggml_compute_forward_out_prod_q_f32(
13004
12831
  // nb01 >= nb00 - src0 is not transposed
13005
12832
  // compute by src0 rows
13006
12833
 
13007
- // TODO: #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
13008
-
13009
12834
  if (params->type == GGML_TASK_TYPE_INIT) {
13010
12835
  if (ith != 0) {
13011
12836
  return;
@@ -13402,6 +13227,8 @@ static void ggml_compute_forward_get_rows_q(
13402
13227
  const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10);
13403
13228
  const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
13404
13229
 
13230
+ assert(i01 >= 0 && i01 < ne01);
13231
+
13405
13232
  dequantize_row_q(
13406
13233
  (const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03),
13407
13234
  (float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc);
@@ -13445,6 +13272,8 @@ static void ggml_compute_forward_get_rows_f16(
13445
13272
  const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10);
13446
13273
  const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
13447
13274
 
13275
+ assert(i01 >= 0 && i01 < ne01);
13276
+
13448
13277
  ggml_fp16_to_fp32_row(
13449
13278
  (const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03),
13450
13279
  (float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc);
@@ -13488,7 +13317,9 @@ static void ggml_compute_forward_get_rows_bf16(
13488
13317
  const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10);
13489
13318
  const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
13490
13319
 
13491
- ggml_bf16_to_fp32_row(
13320
+ assert(i01 >= 0 && i01 < ne01);
13321
+
13322
+ ggml_bf16_to_fp32_row(
13492
13323
  (const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03),
13493
13324
  (float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc);
13494
13325
  }
@@ -13531,6 +13362,8 @@ static void ggml_compute_forward_get_rows_f32(
13531
13362
  const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10);
13532
13363
  const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
13533
13364
 
13365
+ assert(i01 >= 0 && i01 < ne01);
13366
+
13534
13367
  ggml_vec_cpy_f32(nc,
13535
13368
  (float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3),
13536
13369
  (float *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03));
@@ -16686,7 +16519,10 @@ static void ggml_compute_forward_map_unary_f32(
16686
16519
 
16687
16520
  const struct ggml_tensor * src0 = dst->src[0];
16688
16521
 
16689
- GGML_ASSERT(ggml_are_same_shape(src0, dst));
16522
+ assert(params->ith == 0);
16523
+ assert(ggml_is_contiguous_1(src0));
16524
+ assert(ggml_is_contiguous_1(dst));
16525
+ assert(ggml_are_same_shape(src0, dst));
16690
16526
 
16691
16527
  if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
16692
16528
  return;
@@ -16695,9 +16531,6 @@ static void ggml_compute_forward_map_unary_f32(
16695
16531
  const int n = ggml_nrows(src0);
16696
16532
  const int nc = src0->ne[0];
16697
16533
 
16698
- assert( dst->nb[0] == sizeof(float));
16699
- assert(src0->nb[0] == sizeof(float));
16700
-
16701
16534
  for (int i = 0; i < n; i++) {
16702
16535
  fun(nc,
16703
16536
  (float *) ((char *) dst->data + i*( dst->nb[1])),
@@ -16735,6 +16568,9 @@ static void ggml_compute_forward_map_binary_f32(
16735
16568
  const struct ggml_tensor * src1 = dst->src[1];
16736
16569
 
16737
16570
  assert(params->ith == 0);
16571
+ assert(ggml_is_contiguous_1(src0));
16572
+ assert(ggml_is_contiguous_1(src1));
16573
+ assert(ggml_is_contiguous_1(dst));
16738
16574
  assert(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));
16739
16575
 
16740
16576
  if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
@@ -16744,10 +16580,6 @@ static void ggml_compute_forward_map_binary_f32(
16744
16580
  const int n = ggml_nrows(src0);
16745
16581
  const int nc = src0->ne[0];
16746
16582
 
16747
- assert( dst->nb[0] == sizeof(float));
16748
- assert(src0->nb[0] == sizeof(float));
16749
- assert(src1->nb[0] == sizeof(float));
16750
-
16751
16583
  for (int i = 0; i < n; i++) {
16752
16584
  fun(nc,
16753
16585
  (float *) ((char *) dst->data + i*( dst->nb[1])),
@@ -18905,6 +18737,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads, int n_cur_
18905
18737
  switch (node->op) {
18906
18738
  case GGML_OP_CPY:
18907
18739
  case GGML_OP_DUP:
18740
+ case GGML_OP_CONT:
18908
18741
  case GGML_OP_ADD:
18909
18742
  case GGML_OP_ADD1:
18910
18743
  case GGML_OP_ACC:
@@ -18989,7 +18822,6 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads, int n_cur_
18989
18822
  } break;
18990
18823
  case GGML_OP_SCALE:
18991
18824
  case GGML_OP_SET:
18992
- case GGML_OP_CONT:
18993
18825
  case GGML_OP_RESHAPE:
18994
18826
  case GGML_OP_VIEW:
18995
18827
  case GGML_OP_PERMUTE:
@@ -19149,8 +18981,11 @@ static void ggml_graph_compute_thread_sync_node(int * node_n, struct ggml_comput
19149
18981
  sched_yield();
19150
18982
  }
19151
18983
 
19152
- * node_n = atomic_load(&state->shared->node_n);
19153
- if (* node_n != last_node_n) break;
18984
+ *node_n = atomic_load(&state->shared->node_n);
18985
+ if (*node_n != last_node_n) {
18986
+ break;
18987
+ }
18988
+
19154
18989
  #if defined(__SSE3__)
19155
18990
  // Tell the processor we're spinning. It's a processor hint for spinlocks.
19156
18991
  _mm_pause();
@@ -19160,15 +18995,18 @@ static void ggml_graph_compute_thread_sync_node(int * node_n, struct ggml_comput
19160
18995
 
19161
18996
  static void ggml_graph_compute_thread_sync_task(int * task_phase, struct ggml_compute_state * state, const bool do_yield) {
19162
18997
  // wait for other threads to finish
19163
- const int last_task_phase = * task_phase;
18998
+ const int last_task_phase = *task_phase;
19164
18999
 
19165
19000
  while (true) {
19166
19001
  if (do_yield) {
19167
19002
  sched_yield();
19168
19003
  }
19169
19004
 
19170
- * task_phase = atomic_load(&state->shared->node_task);
19171
- if (* task_phase != last_task_phase) break;
19005
+ *task_phase = atomic_load(&state->shared->node_task);
19006
+ if (*task_phase != last_task_phase) {
19007
+ break;
19008
+ }
19009
+
19172
19010
  #if defined(__SSE3__)
19173
19011
  // Tell the processor we're spinning. It's a processor hint for spinlocks.
19174
19012
  _mm_pause();
@@ -19368,17 +19206,6 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
19368
19206
  {
19369
19207
  const enum ggml_type vec_dot_type = type_traits[node->src[0]->type].vec_dot_type;
19370
19208
 
19371
- #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
19372
- if (ggml_compute_forward_mul_mat_use_blas(node)) {
19373
- if (node->src[0]->type != GGML_TYPE_F32) {
19374
- // here we need memory for fully dequantized matrix from src0
19375
- // take into account that src0 can be broadcasted into src1[2,3]
19376
- cur = ggml_type_size(GGML_TYPE_F32)
19377
- * node->src[0]->ne[0]*node->src[0]->ne[1]
19378
- * node->src[1]->ne[2]*node->src[1]->ne[3];
19379
- }
19380
- } else
19381
- #endif
19382
19209
  if (node->src[1]->type != vec_dot_type) {
19383
19210
  cur = ggml_row_size(vec_dot_type, ggml_nelements(node->src[1]));
19384
19211
  }
@@ -22676,7 +22503,7 @@ int ggml_cpu_has_wasm_simd(void) {
22676
22503
  }
22677
22504
 
22678
22505
  int ggml_cpu_has_blas(void) {
22679
- #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUDA) || defined(GGML_USE_VULKAN) || defined(GGML_USE_SYCL)
22506
+ #if defined(GGML_USE_BLAS) || defined(GGML_USE_CUDA) || defined(GGML_USE_VULKAN) || defined(GGML_USE_SYCL)
22680
22507
  return 1;
22681
22508
  #else
22682
22509
  return 0;