llama_cpp 0.16.0 → 0.16.1

Files changed (134)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +6 -0
  3. data/ext/llama_cpp/extconf.rb +2 -0
  4. data/ext/llama_cpp/llama_cpp.cpp +2 -0
  5. data/lib/llama_cpp/version.rb +2 -2
  6. data/sig/llama_cpp.rbs +2 -0
  7. data/vendor/tmp/llama.cpp/Makefile +110 -53
  8. data/vendor/tmp/llama.cpp/ggml-alloc.c +78 -22
  9. data/vendor/tmp/llama.cpp/ggml-backend-impl.h +20 -8
  10. data/vendor/tmp/llama.cpp/ggml-backend.c +178 -64
  11. data/vendor/tmp/llama.cpp/ggml-backend.h +3 -3
  12. data/vendor/tmp/llama.cpp/ggml-blas.cpp +363 -0
  13. data/vendor/tmp/llama.cpp/ggml-blas.h +23 -0
  14. data/vendor/tmp/llama.cpp/ggml-common.h +6 -0
  15. data/vendor/tmp/llama.cpp/ggml-cuda/argsort.cu +1 -0
  16. data/vendor/tmp/llama.cpp/ggml-cuda/dmmv.cu +21 -9
  17. data/vendor/tmp/llama.cpp/ggml-cuda/fattn-tile-f16.cu +1 -1
  18. data/vendor/tmp/llama.cpp/ggml-cuda/mmq.cu +15 -1491
  19. data/vendor/tmp/llama.cpp/ggml-cuda/mmvq.cu +76 -61
  20. data/vendor/tmp/llama.cpp/ggml-cuda/quantize.cu +77 -10
  21. data/vendor/tmp/llama.cpp/ggml-cuda/softmax.cu +1 -0
  22. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu +1 -1
  23. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu +1 -1
  24. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu +1 -1
  25. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu +1 -1
  26. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu +1 -1
  27. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu +1 -1
  28. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu +1 -1
  29. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu +1 -1
  30. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu +1 -1
  31. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu +1 -1
  32. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu +1 -1
  33. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu +1 -1
  34. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu +1 -1
  35. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu +1 -1
  36. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu +1 -1
  37. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu +1 -1
  38. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu +1 -1
  39. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu +1 -1
  40. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu +1 -1
  41. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu +1 -1
  42. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu +1 -1
  43. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu +1 -1
  44. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu +1 -1
  45. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu +1 -1
  46. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu +1 -1
  47. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu +1 -1
  48. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu +1 -1
  49. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu +1 -1
  50. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu +1 -1
  51. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu +1 -1
  52. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu +1 -1
  53. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu +1 -1
  54. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu +1 -1
  55. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu +1 -1
  56. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu +1 -1
  57. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu +1 -1
  58. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu +1 -1
  59. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu +1 -1
  60. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu +1 -1
  61. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu +1 -1
  62. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu +1 -1
  63. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu +1 -1
  64. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu +1 -1
  65. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu +1 -1
  66. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu +1 -1
  67. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu +1 -1
  68. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu +1 -1
  69. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu +1 -1
  70. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu +1 -1
  71. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu +1 -1
  72. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu +1 -1
  73. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu +1 -1
  74. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu +1 -1
  75. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu +1 -1
  76. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu +1 -1
  77. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu +1 -1
  78. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu +1 -1
  79. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu +1 -1
  80. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu +1 -1
  81. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu +1 -1
  82. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu +1 -1
  83. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu +1 -1
  84. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu +1 -1
  85. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu +1 -1
  86. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu +1 -1
  87. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu +1 -1
  88. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu +1 -1
  89. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu +1 -1
  90. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu +1 -1
  91. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu +1 -1
  92. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu +1 -1
  93. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu +1 -1
  94. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu +1 -1
  95. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu +1 -1
  96. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu +1 -1
  97. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu +1 -1
  98. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu +1 -1
  99. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu +1 -1
  100. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu +1 -1
  101. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu +1 -1
  102. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu +1 -1
  103. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu +1 -1
  104. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu +1 -1
  105. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu +1 -1
  106. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu +1 -1
  107. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu +1 -1
  108. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb16.cu +1 -1
  109. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb32.cu +1 -1
  110. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb16.cu +1 -1
  111. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb32.cu +1 -1
  112. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb8.cu +1 -1
  113. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q2_k.cu +5 -0
  114. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q3_k.cu +5 -0
  115. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_0.cu +5 -0
  116. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_1.cu +5 -0
  117. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_k.cu +5 -0
  118. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_0.cu +5 -0
  119. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_1.cu +5 -0
  120. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_k.cu +5 -0
  121. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q6_k.cu +5 -0
  122. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q8_0.cu +5 -0
  123. data/vendor/tmp/llama.cpp/ggml-cuda/unary.cu +20 -0
  124. data/vendor/tmp/llama.cpp/ggml-cuda.cu +95 -129
  125. data/vendor/tmp/llama.cpp/ggml-kompute.cpp +8 -7
  126. data/vendor/tmp/llama.cpp/ggml-metal.m +11 -9
  127. data/vendor/tmp/llama.cpp/ggml-rpc.cpp +13 -12
  128. data/vendor/tmp/llama.cpp/ggml-sycl.cpp +19 -23
  129. data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +1230 -1129
  130. data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +181 -148
  131. data/vendor/tmp/llama.cpp/ggml.c +102 -275
  132. data/vendor/tmp/llama.cpp/llama.cpp +103 -47
  133. data/vendor/tmp/llama.cpp/llama.h +4 -0
  134. metadata +15 -3
@@ -297,12 +297,6 @@ inline static void * ggml_calloc(size_t num, size_t size) {
 
 #if defined(GGML_USE_ACCELERATE)
 #include <Accelerate/Accelerate.h>
-#elif defined(GGML_USE_OPENBLAS)
-#if defined(GGML_BLAS_USE_MKL)
-#include <mkl.h>
-#else
-#include <cblas.h>
-#endif
 #endif
 
 // floating point type used to accumulate sums
@@ -3212,35 +3206,42 @@ GGML_CALL bool ggml_is_transposed(const struct ggml_tensor * tensor) {
     return tensor->nb[0] > tensor->nb[1];
 }
 
-GGML_CALL bool ggml_is_contiguous(const struct ggml_tensor * tensor) {
-    static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
+static bool ggml_is_contiguous_n(const struct ggml_tensor * tensor, int n) {
+    size_t next_nb = ggml_type_size(tensor->type);
+    if (tensor->ne[0] != ggml_blck_size(tensor->type) && tensor->nb[0] != next_nb) {
+        return false;
+    }
+    next_nb *= tensor->ne[0]/ggml_blck_size(tensor->type);
+    for (int i = 1; i < GGML_MAX_DIMS; i++) {
+        if (tensor->ne[i] != 1) {
+            if (i > n) {
+                if (tensor->nb[i] != next_nb) {
+                    return false;
+                }
+                next_nb *= tensor->ne[i];
+            } else {
+                // this dimension does not need to be contiguous
+                next_nb = tensor->ne[i]*tensor->nb[i];
+            }
+        }
+    }
+    return true;
+}
 
-    return
-        tensor->nb[0] == ggml_type_size(tensor->type) &&
-        tensor->nb[1] == (tensor->nb[0]*tensor->ne[0])/ggml_blck_size(tensor->type) &&
-        tensor->nb[2] == tensor->nb[1]*tensor->ne[1] &&
-        tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
+GGML_CALL bool ggml_is_contiguous(const struct ggml_tensor * tensor) {
+    return ggml_is_contiguous_0(tensor);
 }
 
 GGML_CALL bool ggml_is_contiguous_0(const struct ggml_tensor * tensor) {
-    return ggml_is_contiguous(tensor);
+    return ggml_is_contiguous_n(tensor, 0);
 }
 
 GGML_CALL bool ggml_is_contiguous_1(const struct ggml_tensor * tensor) {
-    static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
-
-    return
-        tensor->nb[0] == ggml_type_size(tensor->type) &&
-        tensor->nb[2] == tensor->nb[1]*tensor->ne[1] &&
-        tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
+    return ggml_is_contiguous_n(tensor, 1);
 }
 
 GGML_CALL bool ggml_is_contiguous_2(const struct ggml_tensor * tensor) {
-    static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
-
-    return
-        tensor->nb[0] == ggml_type_size(tensor->type) &&
-        tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
+    return ggml_is_contiguous_n(tensor, 2);
 }
 
 GGML_CALL bool ggml_is_permuted(const struct ggml_tensor * tensor) {
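
The new ggml_is_contiguous_n generalizes the old checks: dimensions 0..n may carry arbitrary strides, while every higher dimension must pack tightly on top of them, so ggml_is_contiguous_1 now means "each row is dense" rather than "the whole tensor is dense". A minimal standalone sketch of that idea, using a simplified toy_tensor struct with fixed 4-byte elements rather than the real ggml types:

    #include <stdbool.h>
    #include <stddef.h>
    #include <stdio.h>

    #define MAX_DIMS 4

    // Simplified model: ne = extents, nb = strides in bytes, 4-byte elements.
    struct toy_tensor {
        long   ne[MAX_DIMS];
        size_t nb[MAX_DIMS];
    };

    // Contiguity in the spirit of ggml_is_contiguous_n: dims 0..n may be
    // strided freely, dims above n must follow each other without gaps.
    static bool toy_is_contiguous_n(const struct toy_tensor * t, int n) {
        size_t next_nb = sizeof(float);
        if (t->nb[0] != next_nb) {
            return false;
        }
        next_nb *= t->ne[0];
        for (int i = 1; i < MAX_DIMS; i++) {
            if (t->ne[i] != 1) {
                if (i > n) {
                    if (t->nb[i] != next_nb) {
                        return false;
                    }
                    next_nb *= t->ne[i];
                } else {
                    // this dimension is allowed to be padded/strided
                    next_nb = t->ne[i] * t->nb[i];
                }
            }
        }
        return true;
    }

    int main(void) {
        // 3 rows of 4 floats each, with one float of padding at the end of every row.
        struct toy_tensor t = { { 4, 3, 1, 1 }, { 4, 20, 60, 60 } };
        printf("contiguous_0: %d\n", toy_is_contiguous_n(&t, 0)); // 0: rows are padded
        printf("contiguous_1: %d\n", toy_is_contiguous_n(&t, 1)); // 1: each row is dense
        return 0;
    }

With one float of padding per row, the tensor fails the full-contiguity check but still satisfies the per-row one, which is the property the unary kernels below start asserting.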
@@ -3272,20 +3273,20 @@ bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor
     static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
 
     return
-        (t0->ne[0] == t1->ne[0] ) &&
-        (t0->ne[1] == t1->ne[1] ) &&
-        (t0->ne[2] == t1->ne[2] ) &&
-        (t0->ne[3] == t1->ne[3] );
+        (t0->ne[0] == t1->ne[0]) &&
+        (t0->ne[1] == t1->ne[1]) &&
+        (t0->ne[2] == t1->ne[2]) &&
+        (t0->ne[3] == t1->ne[3]);
 }
 
 bool ggml_are_same_stride(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
     static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
 
     return
-        (t0->nb[0] == t1->nb[0] ) &&
-        (t0->nb[1] == t1->nb[1] ) &&
-        (t0->nb[2] == t1->nb[2] ) &&
-        (t0->nb[3] == t1->nb[3] );
+        (t0->nb[0] == t1->nb[0]) &&
+        (t0->nb[1] == t1->nb[1]) &&
+        (t0->nb[2] == t1->nb[2]) &&
+        (t0->nb[3] == t1->nb[3]);
 }
 
 // check if t1 can be represented as a repeatition of t0
@@ -4078,32 +4079,26 @@ float ggml_get_f32_1d(const struct ggml_tensor * tensor, int i) {
     switch (tensor->type) {
         case GGML_TYPE_I8:
             {
-                GGML_ASSERT(tensor->nb[0] == sizeof(int8_t));
                 return ((int8_t *)(tensor->data))[i];
             }
         case GGML_TYPE_I16:
             {
-                GGML_ASSERT(tensor->nb[0] == sizeof(int16_t));
                 return ((int16_t *)(tensor->data))[i];
             }
        case GGML_TYPE_I32:
             {
-                GGML_ASSERT(tensor->nb[0] == sizeof(int32_t));
                 return ((int32_t *)(tensor->data))[i];
             }
        case GGML_TYPE_F16:
             {
-                GGML_ASSERT(tensor->nb[0] == sizeof(ggml_fp16_t));
                 return GGML_FP16_TO_FP32(((ggml_fp16_t *)(tensor->data))[i]);
             }
        case GGML_TYPE_BF16:
             {
-                GGML_ASSERT(tensor->nb[0] == sizeof(ggml_bf16_t));
                 return GGML_BF16_TO_FP32(((ggml_bf16_t *)(tensor->data))[i]);
             }
        case GGML_TYPE_F32:
             {
-                GGML_ASSERT(tensor->nb[0] == sizeof(float));
                 return ((float *)(tensor->data))[i];
             }
        default:
@@ -4125,32 +4120,26 @@ void ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value) {
     switch (tensor->type) {
         case GGML_TYPE_I8:
             {
-                GGML_ASSERT(tensor->nb[0] == sizeof(int8_t));
                 ((int8_t *)(tensor->data))[i] = value;
             } break;
        case GGML_TYPE_I16:
             {
-                GGML_ASSERT(tensor->nb[0] == sizeof(int16_t));
                 ((int16_t *)(tensor->data))[i] = value;
             } break;
        case GGML_TYPE_I32:
             {
-                GGML_ASSERT(tensor->nb[0] == sizeof(int32_t));
                 ((int32_t *)(tensor->data))[i] = value;
             } break;
        case GGML_TYPE_F16:
             {
-                GGML_ASSERT(tensor->nb[0] == sizeof(ggml_fp16_t));
                 ((ggml_fp16_t *)(tensor->data))[i] = GGML_FP32_TO_FP16(value);
             } break;
        case GGML_TYPE_BF16:
             {
-                GGML_ASSERT(tensor->nb[0] == sizeof(ggml_bf16_t));
                 ((ggml_bf16_t *)(tensor->data))[i] = GGML_FP32_TO_BF16(value);
             } break;
        case GGML_TYPE_F32:
             {
-                GGML_ASSERT(tensor->nb[0] == sizeof(float));
                 ((float *)(tensor->data))[i] = value;
             } break;
        default:
@@ -7343,13 +7332,15 @@ struct ggml_tensor * ggml_add_rel_pos_inplace(
     return ggml_add_rel_pos_impl(ctx, a, pw, ph, true);
 }
 
-// gmml_unary
+// ggml_unary
 
 static struct ggml_tensor * ggml_unary_impl(
         struct ggml_context * ctx,
         struct ggml_tensor * a,
         enum ggml_unary_op op,
         bool inplace) {
+    GGML_ASSERT(ggml_is_contiguous_1(a));
+
     bool is_node = false;
 
     if (!inplace && (a->grad)) {
@@ -11014,6 +11005,8 @@ static void ggml_compute_forward_abs_f32(
     const struct ggml_tensor * src0 = dst->src[0];
 
     assert(params->ith == 0);
+    assert(ggml_is_contiguous_1(src0));
+    assert(ggml_is_contiguous_1(dst));
     assert(ggml_are_same_shape(src0, dst));
 
     if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
@@ -11023,9 +11016,6 @@ static void ggml_compute_forward_abs_f32(
     const int n = ggml_nrows(src0);
     const int nc = src0->ne[0];
 
-    assert(dst->nb[0] == sizeof(float));
-    assert(src0->nb[0] == sizeof(float));
-
     for (int i = 0; i < n; i++) {
         ggml_vec_abs_f32(nc,
                 (float *) ((char *) dst->data + i*( dst->nb[1])),
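
The asserts added here (and repeated in the sibling kernels that follow) guard the access pattern every one of these unary ops relies on: walk the tensor row by row via nb[1] and treat each row as a dense float array, which is exactly what the per-row contiguity check guarantees. A hedged standalone sketch of that pattern; abs_rows and its signature are illustrative, not ggml code:

    #include <math.h>
    #include <stddef.h>
    #include <stdio.h>

    // Apply fabsf over n rows of nc dense floats each, where rows may be
    // separated by an arbitrary byte stride (the "contiguous_1" case).
    static void abs_rows(float * dst, const float * src,
                         int n, int nc, size_t dst_row_nb, size_t src_row_nb) {
        for (int i = 0; i < n; i++) {
            float       * d = (float *) ((char *) dst + i*dst_row_nb);
            const float * s = (const float *) ((const char *) src + i*src_row_nb);
            for (int j = 0; j < nc; j++) {
                d[j] = fabsf(s[j]);
            }
        }
    }

    int main(void) {
        // 2 rows x 3 cols, stored with one float of padding per row (stride = 4 floats).
        float src[8] = { -1, 2, -3, 0,  4, -5, 6, 0 };
        float dst[8] = { 0 };
        abs_rows(dst, src, 2, 3, 4*sizeof(float), 4*sizeof(float));
        printf("%g %g %g | %g %g %g\n", dst[0], dst[1], dst[2], dst[4], dst[5], dst[6]);
        return 0;
    }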
@@ -11060,6 +11050,8 @@ static void ggml_compute_forward_sgn_f32(
     const struct ggml_tensor * src0 = dst->src[0];
 
     assert(params->ith == 0);
+    assert(ggml_is_contiguous_1(src0));
+    assert(ggml_is_contiguous_1(dst));
     assert(ggml_are_same_shape(src0, dst));
 
     if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
@@ -11069,9 +11061,6 @@ static void ggml_compute_forward_sgn_f32(
     const int n = ggml_nrows(src0);
     const int nc = src0->ne[0];
 
-    assert(dst->nb[0] == sizeof(float));
-    assert(src0->nb[0] == sizeof(float));
-
     for (int i = 0; i < n; i++) {
         ggml_vec_sgn_f32(nc,
                 (float *) ((char *) dst->data + i*( dst->nb[1])),
@@ -11106,6 +11095,8 @@ static void ggml_compute_forward_neg_f32(
     const struct ggml_tensor * src0 = dst->src[0];
 
     assert(params->ith == 0);
+    assert(ggml_is_contiguous_1(src0));
+    assert(ggml_is_contiguous_1(dst));
     assert(ggml_are_same_shape(src0, dst));
 
     if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
@@ -11115,9 +11106,6 @@ static void ggml_compute_forward_neg_f32(
     const int n = ggml_nrows(src0);
     const int nc = src0->ne[0];
 
-    assert(dst->nb[0] == sizeof(float));
-    assert(src0->nb[0] == sizeof(float));
-
     for (int i = 0; i < n; i++) {
         ggml_vec_neg_f32(nc,
                 (float *) ((char *) dst->data + i*( dst->nb[1])),
@@ -11152,6 +11140,8 @@ static void ggml_compute_forward_step_f32(
     const struct ggml_tensor * src0 = dst->src[0];
 
     assert(params->ith == 0);
+    assert(ggml_is_contiguous_1(src0));
+    assert(ggml_is_contiguous_1(dst));
     assert(ggml_are_same_shape(src0, dst));
 
     if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
@@ -11161,9 +11151,6 @@ static void ggml_compute_forward_step_f32(
     const int n = ggml_nrows(src0);
     const int nc = src0->ne[0];
 
-    assert(dst->nb[0] == sizeof(float));
-    assert(src0->nb[0] == sizeof(float));
-
     for (int i = 0; i < n; i++) {
         ggml_vec_step_f32(nc,
                 (float *) ((char *) dst->data + i*( dst->nb[1])),
@@ -11198,6 +11185,8 @@ static void ggml_compute_forward_tanh_f32(
     const struct ggml_tensor * src0 = dst->src[0];
 
     assert(params->ith == 0);
+    assert(ggml_is_contiguous_1(src0));
+    assert(ggml_is_contiguous_1(dst));
     assert(ggml_are_same_shape(src0, dst));
 
     if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
@@ -11207,9 +11196,6 @@ static void ggml_compute_forward_tanh_f32(
     const int n = ggml_nrows(src0);
     const int nc = src0->ne[0];
 
-    assert(dst->nb[0] == sizeof(float));
-    assert(src0->nb[0] == sizeof(float));
-
     for (int i = 0; i < n; i++) {
         ggml_vec_tanh_f32(nc,
                 (float *) ((char *) dst->data + i*( dst->nb[1])),
@@ -11244,6 +11230,8 @@ static void ggml_compute_forward_elu_f32(
     const struct ggml_tensor * src0 = dst->src[0];
 
     assert(params->ith == 0);
+    assert(ggml_is_contiguous_1(src0));
+    assert(ggml_is_contiguous_1(dst));
     assert(ggml_are_same_shape(src0, dst));
 
     if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
@@ -11253,9 +11241,6 @@ static void ggml_compute_forward_elu_f32(
     const int n = ggml_nrows(src0);
     const int nc = src0->ne[0];
 
-    assert(dst->nb[0] == sizeof(float));
-    assert(src0->nb[0] == sizeof(float));
-
     for (int i = 0; i < n; i++) {
         ggml_vec_elu_f32(nc,
                 (float *) ((char *) dst->data + i*( dst->nb[1])),
@@ -11290,6 +11275,8 @@ static void ggml_compute_forward_relu_f32(
     const struct ggml_tensor * src0 = dst->src[0];
 
     assert(params->ith == 0);
+    assert(ggml_is_contiguous_1(src0));
+    assert(ggml_is_contiguous_1(dst));
     assert(ggml_are_same_shape(src0, dst));
 
     if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
@@ -11299,9 +11286,6 @@ static void ggml_compute_forward_relu_f32(
     const int n = ggml_nrows(src0);
     const int nc = src0->ne[0];
 
-    assert(dst->nb[0] == sizeof(float));
-    assert(src0->nb[0] == sizeof(float));
-
     for (int i = 0; i < n; i++) {
         ggml_vec_relu_f32(nc,
                 (float *) ((char *) dst->data + i*( dst->nb[1])),
@@ -11336,6 +11320,8 @@ static void ggml_compute_forward_sigmoid_f32(
     const struct ggml_tensor * src0 = dst->src[0];
 
     assert(params->ith == 0);
+    assert(ggml_is_contiguous_1(src0));
+    assert(ggml_is_contiguous_1(dst));
     assert(ggml_are_same_shape(src0, dst));
 
     if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
@@ -11345,9 +11331,6 @@ static void ggml_compute_forward_sigmoid_f32(
     const int n = ggml_nrows(src0);
     const int nc = src0->ne[0];
 
-    assert(dst->nb[0] == sizeof(float));
-    assert(src0->nb[0] == sizeof(float));
-
     for (int i = 0; i < n; i++) {
         ggml_vec_sigmoid_f32(nc,
                 (float *) ((char *) dst->data + i*( dst->nb[1])),
@@ -11381,9 +11364,9 @@ static void ggml_compute_forward_gelu_f32(
 
     const struct ggml_tensor * src0 = dst->src[0];
 
-    GGML_ASSERT(ggml_is_contiguous_1(src0));
-    GGML_ASSERT(ggml_is_contiguous_1(dst));
-    GGML_ASSERT(ggml_are_same_shape(src0, dst));
+    assert(ggml_is_contiguous_1(src0));
+    assert(ggml_is_contiguous_1(dst));
+    assert(ggml_are_same_shape(src0, dst));
 
     if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
         return;
@@ -11444,9 +11427,9 @@ static void ggml_compute_forward_gelu_quick_f32(
 
     const struct ggml_tensor * src0 = dst->src[0];
 
-    GGML_ASSERT(ggml_is_contiguous_1(src0));
-    GGML_ASSERT(ggml_is_contiguous_1(dst));
-    GGML_ASSERT(ggml_are_same_shape(src0, dst));
+    assert(ggml_is_contiguous_1(src0));
+    assert(ggml_is_contiguous_1(dst));
+    assert(ggml_are_same_shape(src0, dst));
 
     if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
         return;
@@ -11507,9 +11490,9 @@ static void ggml_compute_forward_silu_f32(
 
     const struct ggml_tensor * src0 = dst->src[0];
 
-    GGML_ASSERT(ggml_is_contiguous_1(src0));
-    GGML_ASSERT(ggml_is_contiguous_1(dst));
-    GGML_ASSERT(ggml_are_same_shape(src0, dst));
+    assert(ggml_is_contiguous_1(src0));
+    assert(ggml_is_contiguous_1(dst));
+    assert(ggml_are_same_shape(src0, dst));
 
     if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
         return;
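
Note that the gelu, gelu_quick and silu kernels switch these checks from GGML_ASSERT to plain assert, which is compiled out when the build defines NDEBUG, whereas GGML_ASSERT stays active in release builds. A small sketch of that difference; ALWAYS_ASSERT is a hypothetical stand-in, not the actual GGML_ASSERT definition:

    #include <assert.h>
    #include <stdio.h>
    #include <stdlib.h>

    // Hypothetical always-on check: it does not disappear when NDEBUG is
    // defined, unlike the standard assert() used in the kernels above.
    #define ALWAYS_ASSERT(x)                                       \
        do {                                                       \
            if (!(x)) {                                            \
                fprintf(stderr, "assertion failed: %s\n", #x);     \
                abort();                                           \
            }                                                      \
        } while (0)

    int main(void) {
        int rows_contiguous = 1;
        assert(rows_contiguous);        // compiled out when built with -DNDEBUG
        ALWAYS_ASSERT(rows_contiguous); // checked in every build
        puts("ok");
        return 0;
    }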
@@ -11570,6 +11553,8 @@ static void ggml_compute_forward_leaky_relu_f32(
     const struct ggml_tensor * src0 = dst->src[0];
 
     assert(params->ith == 0);
+    assert(ggml_is_contiguous_1(src0));
+    assert(ggml_is_contiguous_1(dst));
     assert(ggml_are_same_shape(src0, dst));
 
     if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
@@ -11619,11 +11604,11 @@ static void ggml_compute_forward_silu_back_f32(
     const struct ggml_tensor * src0 = dst->src[0];
     const struct ggml_tensor * grad = dst->src[1];
 
-    GGML_ASSERT(ggml_is_contiguous_1(grad));
-    GGML_ASSERT(ggml_is_contiguous_1(src0));
-    GGML_ASSERT(ggml_is_contiguous_1(dst));
-    GGML_ASSERT(ggml_are_same_shape(src0, dst));
-    GGML_ASSERT(ggml_are_same_shape(src0, grad));
+    assert(ggml_is_contiguous_1(grad));
+    assert(ggml_is_contiguous_1(src0));
+    assert(ggml_is_contiguous_1(dst));
+    assert(ggml_are_same_shape(src0, dst));
+    assert(ggml_are_same_shape(src0, grad));
 
     if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
         return;
@@ -11685,6 +11670,8 @@ static void ggml_compute_forward_hardswish_f32(
     const struct ggml_tensor * src0 = dst->src[0];
 
     assert(params->ith == 0);
+    assert(ggml_is_contiguous_1(src0));
+    assert(ggml_is_contiguous_1(dst));
     assert(ggml_are_same_shape(src0, dst));
 
     if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
@@ -11694,9 +11681,6 @@ static void ggml_compute_forward_hardswish_f32(
     const int n = ggml_nrows(src0);
     const int nc = src0->ne[0];
 
-    assert(dst->nb[0] == sizeof(float));
-    assert(src0->nb[0] == sizeof(float));
-
     for (int i = 0; i < n; i++) {
         ggml_vec_hardswish_f32(nc,
                 (float *) ((char *) dst->data + i*( dst->nb[1])),
@@ -11728,6 +11712,8 @@ static void ggml_compute_forward_hardsigmoid_f32(
     const struct ggml_tensor * src0 = dst->src[0];
 
     assert(params->ith == 0);
+    assert(ggml_is_contiguous_1(src0));
+    assert(ggml_is_contiguous_1(dst));
     assert(ggml_are_same_shape(src0, dst));
 
     if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
@@ -11737,9 +11723,6 @@ static void ggml_compute_forward_hardsigmoid_f32(
     const int n = ggml_nrows(src0);
     const int nc = src0->ne[0];
 
-    assert(dst->nb[0] == sizeof(float));
-    assert(src0->nb[0] == sizeof(float));
-
     for (int i = 0; i < n; i++) {
         ggml_vec_hardsigmoid_f32(nc,
                 (float *) ((char *) dst->data + i*( dst->nb[1])),
@@ -12190,39 +12173,6 @@ static void ggml_compute_forward_group_norm(
 
 // ggml_compute_forward_mul_mat
 
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
-// helper function to determine if it is better to use BLAS or not
-// for large matrices, BLAS is faster
-static bool ggml_compute_forward_mul_mat_use_blas(struct ggml_tensor * dst) {
-    const struct ggml_tensor * src0 = dst->src[0];
-    const struct ggml_tensor * src1 = dst->src[1];
-
-    //const int64_t ne00 = src0->ne[0];
-    //const int64_t ne01 = src0->ne[1];
-
-    const int64_t ne10 = src1->ne[0];
-
-    const int64_t ne0 = dst->ne[0];
-    const int64_t ne1 = dst->ne[1];
-
-    // NOTE: with GGML_OP_MUL_MAT_ID we don't want to go through the BLAS branch because it will dequantize (to_float)
-    //       all the experts for each batch element and the processing would become incredibly slow
-    // TODO: find the optimal values for these
-    if (dst->op != GGML_OP_MUL_MAT_ID &&
-        ggml_is_contiguous(src0) &&
-        ggml_is_contiguous(src1) &&
-        //src0->type == GGML_TYPE_F32 &&
-        src1->type == GGML_TYPE_F32 &&
-        (ne0 >= 32 && ne1 >= 32 && ne10 >= 32)) {
-
-        /*printf("BLAS: %d %d %d %d %d\n", ne0, ne1, ne10, ne00, ne01);*/
-        return true;
-    }
-
-    return false;
-}
-#endif
-
 static void ggml_compute_forward_mul_mat_one_chunk(
     const struct ggml_compute_params * params,
     struct ggml_tensor * dst,
@@ -12360,73 +12310,6 @@ static void ggml_compute_forward_mul_mat(
     // nb01 >= nb00 - src0 is not transposed
     // compute by src0 rows
 
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
-    if (ggml_compute_forward_mul_mat_use_blas(dst)) {
-        const int64_t ne_plane = ne01*ne00;
-        const size_t desired_wsize = ne13*ne12*ne_plane*sizeof(float);
-        UNUSED(desired_wsize);
-
-        if (params->type == GGML_TASK_TYPE_INIT) {
-            if (type != GGML_TYPE_F32) {
-                assert(params->wsize >= desired_wsize);
-                // parallelize by src0 rows
-                for (int64_t i13 = 0; i13 < ne13; i13++) {
-                    for (int64_t i12 = 0; i12 < ne12; i12++) {
-                        // broadcast src0 into src1 across 2nd,3rd dimension
-                        const int64_t i03 = i13/r3;
-                        const int64_t i02 = i12/r2;
-
-                        const void * x = (char *) src0->data + i02*nb02 + i03*nb03;
-                        float * const wdata = (float *) params->wdata + i13*ne12*ne_plane + i12*ne_plane;
-                        ggml_to_float_t const to_float = type_traits[type].to_float;
-
-                        for (int64_t i01 = ith; i01 < ne01; i01 += nth) {
-                            to_float((const char *) x + i01*nb01, wdata + i01*ne00, ne00);
-                        }
-                    }
-                }
-            }
-            return;
-        }
-
-        if (params->type == GGML_TASK_TYPE_FINALIZE) {
-            return;
-        }
-
-        // perform sgemm, parallelization controlled by blas lib
-        if (ith != 0) {
-            return;
-        }
-
-        //const int64_t tgemm0 = ggml_perf_time_us();
-        for (int64_t i13 = 0; i13 < ne13; i13++) {
-            for (int64_t i12 = 0; i12 < ne12; i12++) {
-                const int64_t i03 = i13/r3;
-                const int64_t i02 = i12/r2;
-
-                const void * x = (char *) src0->data + i02*nb02 + i03*nb03;
-                const float * y = (float *) ((char *) src1->data + i12*nb12 + i13*nb13);
-                float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
-
-                if (type != GGML_TYPE_F32) {
-                    x = (float *) params->wdata + i13*ne12*ne_plane + i12*ne_plane;
-                }
-
-                cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
-                        ne1, ne01, ne10,
-                        1.0f, y, ne10,
-                              x, ne00,
-                        0.0f, d, ne01);
-            }
-        }
-        //printf("cblas_sgemm = %.3f ms, %lld flops\n", (ggml_perf_time_us() - tgemm0)/1000.0, ne13*ne12*ne1*ne01*ne10*2);
-
-        //printf("CBLAS = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3);
-
-        return;
-    }
-#endif
-
 #if GGML_USE_LLAMAFILE
     const bool src1_cont = ggml_is_contiguous(src1);
 
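
With this hunk the F32 fast path that handed mul_mat to cblas_sgemm is gone from ggml.c; judging from the files-changed list above (ggml-blas.cpp, ggml-blas.h), that role moves to the new BLAS backend. For reference, a self-contained sketch of the same row-major sgemm call shape the removed branch used, with illustrative sizes and values only:

    #include <stdio.h>
    #include <cblas.h>   // link against a CBLAS implementation, e.g. -lopenblas

    int main(void) {
        // D = Y * X^T, row-major, matching the removed call shape:
        // cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, ne1, ne01, ne10, ...)
        // Here Y is 2x3 and X is 4x3, so D is 2x4.
        const int ne1 = 2, ne01 = 4, ne10 = 3;
        float y[2*3] = { 1, 2, 3,
                         4, 5, 6 };
        float x[4*3] = { 1, 0, 0,
                         0, 1, 0,
                         0, 0, 1,
                         1, 1, 1 };
        float d[2*4] = { 0 };

        cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
                    ne1, ne01, ne10,
                    1.0f, y, ne10,
                          x, ne10,
                    0.0f, d, ne01);

        for (int i = 0; i < ne1; i++) {
            for (int j = 0; j < ne01; j++) {
                printf("%6.1f", d[i*ne01 + j]);
            }
            printf("\n");
        }
        return 0;
    }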
@@ -12807,19 +12690,7 @@ static void ggml_compute_forward_out_prod_f32(
     // nb01 >= nb00 - src0 is not transposed
     // compute by src0 rows
 
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
-    bool use_blas = ggml_is_matrix(src0) &&
-        ggml_is_matrix(src1) &&
-        ggml_is_contiguous(src0) &&
-        (ggml_is_contiguous(src1) || ggml_is_transposed(src1));
-#endif
-
     if (params->type == GGML_TASK_TYPE_INIT) {
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) // gemm beta will zero dst
-        if (use_blas) {
-            return;
-        }
-#endif
         if (ith != 0) {
             return;
         }
@@ -12831,50 +12702,6 @@ static void ggml_compute_forward_out_prod_f32(
         return;
     }
 
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
-    if (use_blas) {
-        if (params->ith != 0) { // All threads other than the first do no work.
-            return;
-        }
-        // Arguments to ggml_compute_forward_out_prod (expressed as major,minor)
-        // src0: (k,n)
-        // src1: (k,m)
-        // dst: (m,n)
-        //
-        // Arguments to sgemm (see https://github.com/Reference-LAPACK/lapack/blob/master/BLAS/SRC/sgemm.f)
-        // Also expressed as (major,minor)
-        // a: (m,k): so src1 transposed
-        // b: (k,n): so src0
-        // c: (m,n)
-        //
-        // However, if ggml_is_transposed(src1) is true, then
-        // src1->data already contains a transposed version, so sgemm mustn't
-        // transpose it further.
-
-        int n = src0->ne[0];
-        int k = src0->ne[1];
-        int m = src1->ne[0];
-
-        int transposeA, lda;
-
-        if (!ggml_is_transposed(src1)) {
-            transposeA = CblasTrans;
-            lda = m;
-        } else {
-            transposeA = CblasNoTrans;
-            lda = k;
-        }
-
-        float * a = (float *) ((char *) src1->data);
-        float * b = (float *) ((char *) src0->data);
-        float * c = (float *) ((char *) dst->data);
-
-        cblas_sgemm(CblasRowMajor, transposeA, CblasNoTrans, m, n, k, 1.0, a, lda, b, n, 0.0, c, n);
-
-        return;
-    }
-#endif
-
     // dst[:,:,:,:] = 0
     // for i2,i3:
     //   for i1:
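
The removed out_prod branch documents its sgemm mapping in the comments above (src0 as (k,n), src1 as (k,m), dst as (m,n), all major,minor). As a plain-C reference for what that call computed in the non-transposed case, a hedged naive-loop equivalent with made-up sizes; out_prod_ref is illustrative, not the code path ggml now takes:

    #include <stdio.h>

    // C (m x n) = src1^T * src0, with src1 stored as (k x m) and src0 as (k x n),
    // all row-major, matching the mapping in the removed comments.
    static void out_prod_ref(float * c, const float * src0, const float * src1,
                             int m, int n, int k) {
        for (int i = 0; i < m; i++) {
            for (int j = 0; j < n; j++) {
                float sum = 0.0f;
                for (int l = 0; l < k; l++) {
                    sum += src1[l*m + i] * src0[l*n + j];
                }
                c[i*n + j] = sum;
            }
        }
    }

    int main(void) {
        const int m = 2, n = 3, k = 2;
        float src1[2*2] = { 1, 2,      // k x m
                            3, 4 };
        float src0[2*3] = { 1, 0, 1,   // k x n
                            0, 1, 1 };
        float c[2*3];
        out_prod_ref(c, src0, src1, m, n, k);
        for (int i = 0; i < m; i++) {
            printf("%4.1f %4.1f %4.1f\n", c[i*n + 0], c[i*n + 1], c[i*n + 2]);
        }
        return 0;
    }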
@@ -13004,8 +12831,6 @@ static void ggml_compute_forward_out_prod_q_f32(
     // nb01 >= nb00 - src0 is not transposed
     // compute by src0 rows
 
-    // TODO: #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
-
     if (params->type == GGML_TASK_TYPE_INIT) {
         if (ith != 0) {
             return;
@@ -13402,6 +13227,8 @@ static void ggml_compute_forward_get_rows_q(
         const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10);
         const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
 
+        assert(i01 >= 0 && i01 < ne01);
+
         dequantize_row_q(
                 (const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03),
                 (float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc);
@@ -13445,6 +13272,8 @@ static void ggml_compute_forward_get_rows_f16(
         const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10);
         const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
 
+        assert(i01 >= 0 && i01 < ne01);
+
         ggml_fp16_to_fp32_row(
                 (const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03),
                 (float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc);
@@ -13488,7 +13317,9 @@ static void ggml_compute_forward_get_rows_bf16(
         const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10);
         const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
 
-        ggml_bf16_to_fp32_row(
+        assert(i01 >= 0 && i01 < ne01);
+
+        ggml_bf16_to_fp32_row(
                 (const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03),
                 (float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc);
     }
@@ -13531,6 +13362,8 @@ static void ggml_compute_forward_get_rows_f32(
         const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10);
         const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
 
+        assert(i01 >= 0 && i01 < ne01);
+
         ggml_vec_cpy_f32(nc,
                 (float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3),
                 (float *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03));
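
All four get_rows variants gain the same bounds assert on the row id read from src1 before it is used to index into src0. A minimal sketch of that gather-with-bounds-check pattern on plain arrays; the names and signature are illustrative, not the real ggml ones:

    #include <assert.h>
    #include <stdio.h>
    #include <string.h>

    // Copy the rows of src selected by ids into dst, asserting that every id
    // is in range, mirroring the bounds assert added in the hunks above.
    static void get_rows_f32(float * dst, const float * src, const int * ids,
                             int n_ids, int n_src_rows, int row_len) {
        for (int i = 0; i < n_ids; i++) {
            const int r = ids[i];
            assert(r >= 0 && r < n_src_rows); // reject out-of-range row indices
            memcpy(dst + i*row_len, src + r*row_len, row_len*sizeof(float));
        }
    }

    int main(void) {
        const float src[3*2] = { 0, 0,  1, 1,  2, 2 }; // 3 rows of 2 floats
        const int   ids[2]   = { 2, 0 };
        float dst[2*2];
        get_rows_f32(dst, src, ids, 2, 3, 2);
        printf("%g %g | %g %g\n", dst[0], dst[1], dst[2], dst[3]); // 2 2 | 0 0
        return 0;
    }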
@@ -16686,7 +16519,10 @@ static void ggml_compute_forward_map_unary_f32(
 
     const struct ggml_tensor * src0 = dst->src[0];
 
-    GGML_ASSERT(ggml_are_same_shape(src0, dst));
+    assert(params->ith == 0);
+    assert(ggml_is_contiguous_1(src0));
+    assert(ggml_is_contiguous_1(dst));
+    assert(ggml_are_same_shape(src0, dst));
 
     if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
         return;
@@ -16695,9 +16531,6 @@ static void ggml_compute_forward_map_unary_f32(
     const int n = ggml_nrows(src0);
     const int nc = src0->ne[0];
 
-    assert( dst->nb[0] == sizeof(float));
-    assert(src0->nb[0] == sizeof(float));
-
     for (int i = 0; i < n; i++) {
         fun(nc,
                 (float *) ((char *) dst->data + i*( dst->nb[1])),
@@ -16735,6 +16568,9 @@ static void ggml_compute_forward_map_binary_f32(
     const struct ggml_tensor * src1 = dst->src[1];
 
     assert(params->ith == 0);
+    assert(ggml_is_contiguous_1(src0));
+    assert(ggml_is_contiguous_1(src1));
+    assert(ggml_is_contiguous_1(dst));
     assert(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));
 
     if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
@@ -16744,10 +16580,6 @@ static void ggml_compute_forward_map_binary_f32(
     const int n = ggml_nrows(src0);
     const int nc = src0->ne[0];
 
-    assert( dst->nb[0] == sizeof(float));
-    assert(src0->nb[0] == sizeof(float));
-    assert(src1->nb[0] == sizeof(float));
-
     for (int i = 0; i < n; i++) {
         fun(nc,
                 (float *) ((char *) dst->data + i*( dst->nb[1])),
@@ -18905,6 +18737,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads, int n_cur_
     switch (node->op) {
         case GGML_OP_CPY:
         case GGML_OP_DUP:
+        case GGML_OP_CONT:
         case GGML_OP_ADD:
         case GGML_OP_ADD1:
         case GGML_OP_ACC:
@@ -18989,7 +18822,6 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads, int n_cur_
             } break;
         case GGML_OP_SCALE:
         case GGML_OP_SET:
-        case GGML_OP_CONT:
        case GGML_OP_RESHAPE:
        case GGML_OP_VIEW:
        case GGML_OP_PERMUTE:
@@ -19149,8 +18981,11 @@ static void ggml_graph_compute_thread_sync_node(int * node_n, struct ggml_comput
             sched_yield();
         }
 
-        * node_n = atomic_load(&state->shared->node_n);
-        if (* node_n != last_node_n) break;
+        *node_n = atomic_load(&state->shared->node_n);
+        if (*node_n != last_node_n) {
+            break;
+        }
+
 #if defined(__SSE3__)
         // Tell the processor we're spinning. It's a processor hint for spinlocks.
         _mm_pause();
@@ -19160,15 +18995,18 @@ static void ggml_graph_compute_thread_sync_node(int * node_n, struct ggml_comput
 
 static void ggml_graph_compute_thread_sync_task(int * task_phase, struct ggml_compute_state * state, const bool do_yield) {
     // wait for other threads to finish
-    const int last_task_phase = * task_phase;
+    const int last_task_phase = *task_phase;
 
     while (true) {
         if (do_yield) {
             sched_yield();
         }
 
-        * task_phase = atomic_load(&state->shared->node_task);
-        if (* task_phase != last_task_phase) break;
+        *task_phase = atomic_load(&state->shared->node_task);
+        if (*task_phase != last_task_phase) {
+            break;
+        }
+
 #if defined(__SSE3__)
         // Tell the processor we're spinning. It's a processor hint for spinlocks.
         _mm_pause();
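
Both sync helpers keep the same shape after this cleanup: optionally yield, atomically re-read the shared counter, and issue a pause hint while spinning. A self-contained hedged sketch of that idiom using C11 atomics and POSIX threads; wait_for_next_phase and worker are illustrative names, not ggml functions:

    #include <pthread.h>
    #include <sched.h>       // sched_yield (POSIX)
    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdio.h>

    #if defined(__SSE3__)
    #include <immintrin.h>   // _mm_pause
    #endif

    // Spin until the shared phase counter moves past last_phase: optional yield,
    // atomic load, and a CPU pause hint inside the loop.
    static int wait_for_next_phase(atomic_int * shared_phase, int last_phase, bool do_yield) {
        while (true) {
            if (do_yield) {
                sched_yield();
            }
            const int phase = atomic_load(shared_phase);
            if (phase != last_phase) {
                return phase;
            }
    #if defined(__SSE3__)
            _mm_pause(); // tell the processor we are in a spin-wait loop
    #endif
        }
    }

    static atomic_int g_phase;

    static void * worker(void * arg) {
        (void) arg;
        atomic_store(&g_phase, 1); // advance the phase from another thread
        return NULL;
    }

    int main(void) {
        pthread_t t;
        pthread_create(&t, NULL, worker, NULL);
        const int phase = wait_for_next_phase(&g_phase, 0, /*do_yield=*/true);
        pthread_join(t, NULL);
        printf("observed phase %d\n", phase); // prints 1
        return 0;
    }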
@@ -19368,17 +19206,6 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
             {
                 const enum ggml_type vec_dot_type = type_traits[node->src[0]->type].vec_dot_type;
 
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
-                if (ggml_compute_forward_mul_mat_use_blas(node)) {
-                    if (node->src[0]->type != GGML_TYPE_F32) {
-                        // here we need memory for fully dequantized matrix from src0
-                        // take into account that src0 can be broadcasted into src1[2,3]
-                        cur = ggml_type_size(GGML_TYPE_F32)
-                            * node->src[0]->ne[0]*node->src[0]->ne[1]
-                            * node->src[1]->ne[2]*node->src[1]->ne[3];
-                    }
-                } else
-#endif
                 if (node->src[1]->type != vec_dot_type) {
                     cur = ggml_row_size(vec_dot_type, ggml_nelements(node->src[1]));
                 }
@@ -22676,7 +22503,7 @@ int ggml_cpu_has_wasm_simd(void) {
 }
 
 int ggml_cpu_has_blas(void) {
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUDA) || defined(GGML_USE_VULKAN) || defined(GGML_USE_SYCL)
+#if defined(GGML_USE_BLAS) || defined(GGML_USE_CUDA) || defined(GGML_USE_VULKAN) || defined(GGML_USE_SYCL)
     return 1;
 #else
     return 0;