llama_cpp 0.12.2 → 0.12.4

This diff shows the contents of publicly available package versions as released to one of the supported registries; it is provided for informational purposes only and reflects the changes between those versions as they appear in their public registry.
@@ -218,6 +218,7 @@ inline static void * ggml_aligned_malloc(size_t size) {
218
218
  break;
219
219
  }
220
220
  GGML_PRINT("%s: %s (attempted to allocate %6.2f MB)\n", __func__, error_desc, size/(1024.0*1024.0));
221
+ GGML_ASSERT(false);
221
222
  return NULL;
222
223
  }
223
224
  return aligned_memory;
@@ -230,6 +231,38 @@ inline static void * ggml_aligned_malloc(size_t size) {
230
231
  #endif
231
232
  #endif
232
233
 
234
+ inline static void * ggml_malloc(size_t size) {
235
+ if (size == 0) {
236
+ GGML_PRINT("WARNING: Behavior may be unexpected when allocating 0 bytes for ggml_malloc!\n");
237
+ return NULL;
238
+ }
239
+ void * result = malloc(size);
240
+ if (result == NULL) {
241
+ GGML_PRINT("%s: failed to allocate %6.2f MB\n", __func__, size/(1024.0*1024.0));
242
+ GGML_ASSERT(false);
243
+ }
244
+ return result;
245
+ }
246
+
247
+ // calloc
248
+ inline static void * ggml_calloc(size_t num, size_t size) {
249
+ if (num == 0 || size == 0) {
250
+ GGML_PRINT("WARNING: Behavior may be unexpected when allocating 0 bytes for ggml_calloc!\n");
251
+ return NULL;
252
+ }
253
+ void * result = calloc(num, size);
254
+ if (result == NULL) {
255
+ GGML_PRINT("%s: failed to allocate %6.2f MB\n", __func__, size/(1024.0*1024.0));
256
+ GGML_ASSERT(false);
257
+ }
258
+ return result;
259
+ }
260
+
261
+ #define GGML_MALLOC(size) ggml_malloc(size)
262
+ #define GGML_CALLOC(num, size) ggml_calloc(num, size)
263
+
264
+ #define GGML_FREE(ptr) free(ptr)
265
+
233
266
  #define UNUSED GGML_UNUSED
234
267
  #define SWAP(x, y, T) do { T SWAP = x; x = y; y = SWAP; } while (0)
235
268
 
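The new allocation wrappers above differ from plain malloc/calloc in two ways: a zero-byte request returns NULL with a warning, and an allocation failure prints the requested size and aborts via GGML_ASSERT, so callers can assume a valid pointer. A minimal usage sketch (the sizes are hypothetical):

// scratch buffers allocated through the new macros; on out-of-memory the
// wrapper asserts, so no NULL check is needed after the calls
float   * scratch = GGML_MALLOC(1024 * sizeof(float));   // uninitialized
int32_t * counts  = GGML_CALLOC(256, sizeof(int32_t));   // zero-initialized
/* ... use scratch and counts ... */
GGML_FREE(counts);
GGML_FREE(scratch);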
@@ -248,6 +281,10 @@ inline static void * ggml_aligned_malloc(size_t size) {
248
281
  #include "ggml-cuda.h"
249
282
  #elif defined(GGML_USE_CLBLAST)
250
283
  #include "ggml-opencl.h"
284
+ #elif defined(GGML_USE_VULKAN)
285
+ #include "ggml-vulkan.h"
286
+ #elif defined(GGML_USE_SYCL)
287
+ #include "ggml-sycl.h"
251
288
  #endif
252
289
 
253
290
  // floating point type used to accumulate sums
@@ -394,12 +431,6 @@ static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float);
394
431
  static void ggml_vec_dot_f32(const int n, float * restrict s, const float * restrict x, const float * restrict y);
395
432
  static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t * restrict x, ggml_fp16_t * restrict y);
396
433
 
397
- ggml_collect_imatrix_t g_imatrix_collect = NULL;
398
-
399
- void ggml_set_imatrix_collection(ggml_collect_imatrix_t imatrix_collect) {
400
- g_imatrix_collect = imatrix_collect;
401
- }
402
-
403
434
  static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
404
435
  [GGML_TYPE_I8] = {
405
436
  .type_name = "i8",
@@ -601,6 +632,17 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
601
632
  .vec_dot = ggml_vec_dot_iq2_xs_q8_K,
602
633
  .vec_dot_type = GGML_TYPE_Q8_K,
603
634
  },
635
+ [GGML_TYPE_IQ3_XXS] = {
636
+ .type_name = "iq3_xxs",
637
+ .blck_size = QK_K,
638
+ .type_size = sizeof(block_iq3_xxs),
639
+ .is_quantized = true,
640
+ .to_float = (ggml_to_float_t) dequantize_row_iq3_xxs,
641
+ .from_float = quantize_row_iq3_xxs,
642
+ .from_float_reference = (ggml_from_float_t)quantize_row_iq3_xxs_reference,
643
+ .vec_dot = ggml_vec_dot_iq3_xxs_q8_K,
644
+ .vec_dot_type = GGML_TYPE_Q8_K,
645
+ },
604
646
  [GGML_TYPE_Q8_K] = {
605
647
  .type_name = "q8_K",
606
648
  .blck_size = QK_K,
@@ -1424,6 +1466,9 @@ inline static void ggml_vec_tanh_f32 (const int n, float * y, const float * x) {
1424
1466
  inline static void ggml_vec_elu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : expf(x[i])-1; }
1425
1467
  inline static void ggml_vec_relu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : 0.f; }
1426
1468
  inline static void ggml_vec_leaky_relu_f32 (const int n, float * y, const float * x, const float ns) { for (int i = 0; i < n; ++i) y[i] = ((x[i] > 0.f) ? x[i] : 0.f) + ns * ((x[i] < 0.0f) ? x[i] : 0.f); }
1469
+ // TODO: optimize performance
1470
+ inline static void ggml_vec_hardswish_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i] * fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f)); }
1471
+ inline static void ggml_vec_hardsigmoid_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f)); }
1427
1472
 
1428
1473
  static const float GELU_COEF_A = 0.044715f;
1429
1474
  static const float GELU_QUICK_COEF = -1.702f;
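For reference, the two new vector activations above implement hardsigmoid(x) = clamp((x + 3) / 6, 0, 1) and hardswish(x) = x * hardsigmoid(x). A standalone scalar sketch of the same math:

#include <math.h>

// scalar form of the clamp used by ggml_vec_hardsigmoid_f32 / ggml_vec_hardswish_f32
static float hardsigmoid_ref(float x) { return fminf(1.0f, fmaxf(0.0f, (x + 3.0f) / 6.0f)); }
static float hardswish_ref  (float x) { return x * hardsigmoid_ref(x); }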
@@ -1782,9 +1827,11 @@ static const char * GGML_UNARY_OP_NAME[GGML_UNARY_OP_COUNT] = {
1782
1827
  "GELU",
1783
1828
  "GELU_QUICK",
1784
1829
  "SILU",
1830
+ "HARDSWISH",
1831
+ "HARDSIGMOID",
1785
1832
  };
1786
1833
 
1787
- static_assert(GGML_UNARY_OP_COUNT == 10, "GGML_UNARY_OP_COUNT != 10");
1834
+ static_assert(GGML_UNARY_OP_COUNT == 12, "GGML_UNARY_OP_COUNT != 12");
1788
1835
 
1789
1836
 
1790
1837
  static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN");
@@ -2141,6 +2188,7 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) {
2141
2188
  case GGML_FTYPE_MOSTLY_Q6_K: wtype = GGML_TYPE_Q6_K; break;
2142
2189
  case GGML_FTYPE_MOSTLY_IQ2_XXS: wtype = GGML_TYPE_IQ2_XXS; break;
2143
2190
  case GGML_FTYPE_MOSTLY_IQ2_XS: wtype = GGML_TYPE_IQ2_XS; break;
2191
+ case GGML_FTYPE_MOSTLY_IQ3_XXS: wtype = GGML_TYPE_IQ3_XXS; break;
2144
2192
  case GGML_FTYPE_UNKNOWN: wtype = GGML_TYPE_COUNT; break;
2145
2193
  case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16: wtype = GGML_TYPE_COUNT; break;
2146
2194
  }
@@ -2294,6 +2342,10 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
2294
2342
  ggml_init_cublas();
2295
2343
  #elif defined(GGML_USE_CLBLAST)
2296
2344
  ggml_cl_init();
2345
+ #elif defined(GGML_USE_VULKAN)
2346
+ ggml_vk_init();
2347
+ #elif defined(GGML_USE_SYCL)
2348
+ ggml_init_sycl();
2297
2349
  #endif
2298
2350
 
2299
2351
  ggml_setup_op_has_task_pass();
@@ -3951,6 +4003,20 @@ struct ggml_tensor * ggml_silu_back(
3951
4003
  return result;
3952
4004
  }
3953
4005
 
4006
+ // ggml hardswish
4007
+ struct ggml_tensor * ggml_hardswish(
4008
+ struct ggml_context * ctx,
4009
+ struct ggml_tensor * a) {
4010
+ return ggml_unary(ctx, a, GGML_UNARY_OP_HARDSWISH);
4011
+ }
4012
+
4013
+ // ggml hardsigmoid
4014
+ struct ggml_tensor * ggml_hardsigmoid(
4015
+ struct ggml_context * ctx,
4016
+ struct ggml_tensor * a) {
4017
+ return ggml_unary(ctx, a, GGML_UNARY_OP_HARDSIGMOID);
4018
+ }
4019
+
3954
4020
  // ggml_norm
3955
4021
 
3956
4022
  static struct ggml_tensor * ggml_norm_impl(
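The two builders above only wrap ggml_unary with the new op codes, so they compose like the other activations. A hedged usage sketch (the tensor x and the context ctx are assumed to exist elsewhere):

// apply the new activations to an F32 tensor x that already lives in ctx
struct ggml_tensor * hs = ggml_hardswish  (ctx, x);  // x * clamp((x+3)/6, 0, 1)
struct ggml_tensor * hg = ggml_hardsigmoid(ctx, x);  //     clamp((x+3)/6, 0, 1)
// both are GGML_OP_UNARY nodes and are scheduled on a single thread for now
// (n_tasks = 1), as noted by the "to opt for multiple threads" comments later in this diff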
@@ -5283,7 +5349,7 @@ GGML_API struct ggml_tensor * ggml_conv_1d(
5283
5349
  int s0,
5284
5350
  int p0,
5285
5351
  int d0) {
5286
- struct ggml_tensor * im2col = ggml_im2col(ctx, a, b, s0, 0, p0, 0, d0, 0, false); // [N, OL, IC * K]
5352
+ struct ggml_tensor * im2col = ggml_im2col(ctx, a, b, s0, 0, p0, 0, d0, 0, false, GGML_TYPE_F16); // [N, OL, IC * K]
5287
5353
 
5288
5354
  struct ggml_tensor * result =
5289
5355
  ggml_mul_mat(ctx,
@@ -5350,6 +5416,30 @@ GGML_API struct ggml_tensor * ggml_conv_transpose_1d(
5350
5416
  return result;
5351
5417
  }
5352
5418
 
5419
+ // ggml_conv_depthwise
5420
+ struct ggml_tensor * ggml_conv_depthwise_2d(
5421
+ struct ggml_context * ctx,
5422
+ struct ggml_tensor * a,
5423
+ struct ggml_tensor * b,
5424
+ int s0,
5425
+ int s1,
5426
+ int p0,
5427
+ int p1,
5428
+ int d0,
5429
+ int d1) {
5430
+
5431
+ struct ggml_tensor * new_a = ggml_reshape_4d(ctx, a, a->ne[0], a->ne[1], 1, a->ne[2] * a->ne[3]);
5432
+ struct ggml_tensor * im2col = ggml_im2col(ctx, new_a,
5433
+ ggml_reshape_4d(ctx, b, b->ne[0], b->ne[1], 1, b->ne[2] * b->ne[3]),
5434
+ s0, s1, p0, p1, d0, d1, true, GGML_TYPE_F16); // [N * IC, OH, OW, KH * KW]
5435
+ struct ggml_tensor * new_b = ggml_reshape_4d(ctx, im2col, im2col->ne[0], im2col->ne[2] * im2col->ne[1], b->ne[2], b->ne[3]); // [N * IC, OH, OW, KH * KW] => [N, IC, OH * OW, KH * KW]
5436
+
5437
+ new_a = ggml_reshape_4d(ctx, new_a, (new_a->ne[0] * new_a->ne[1]), new_a->ne[2], new_a->ne[3], 1); // [OC,1, KH, KW] => [1, OC, 1, KH * KW]
5438
+ struct ggml_tensor * result = ggml_mul_mat(ctx, new_a, new_b);
5439
+ result = ggml_reshape_4d(ctx, result, im2col->ne[1], im2col->ne[2], b->ne[2], b->ne[3]); // [N, OC, OH, OW]
5440
+
5441
+ return result;
5442
+ }
5353
5443
  // ggml_conv_2d
5354
5444
 
5355
5445
  // im2col: [N, IC, IH, IW] => [N, OH, OW, IC*KH*KW]
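ggml_conv_depthwise_2d convolves each input channel with its own kernel by reshaping so that im2col and a per-channel mat-mul do the work. A hedged usage sketch; the shapes are illustrative and, as with ggml_conv_2d, the kernel is assumed to be F16:

// kernel a: [KW, KH, 1, C] (one 3x3 filter per channel), image b: [W, H, C, N]
struct ggml_tensor * a = ggml_new_tensor_4d(ctx, GGML_TYPE_F16,   3,   3,  1, 64);
struct ggml_tensor * b = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 224, 224, 64,  1);
// stride 1, padding 1, dilation 1 keeps the 224x224 spatial size
struct ggml_tensor * y = ggml_conv_depthwise_2d(ctx, a, b, 1, 1, 1, 1, 1, 1);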
@@ -5366,7 +5456,8 @@ struct ggml_tensor * ggml_im2col(
5366
5456
  int p1,
5367
5457
  int d0,
5368
5458
  int d1,
5369
- bool is_2D) {
5459
+ bool is_2D,
5460
+ enum ggml_type dst_type) {
5370
5461
 
5371
5462
  if(is_2D) {
5372
5463
  GGML_ASSERT(a->ne[2] == b->ne[2]);
@@ -5390,7 +5481,7 @@ struct ggml_tensor * ggml_im2col(
5390
5481
  is_2D ? b->ne[3] : 1,
5391
5482
  };
5392
5483
 
5393
- struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F16, 4, ne);
5484
+ struct ggml_tensor * result = ggml_new_tensor(ctx, dst_type, 4, ne);
5394
5485
  int32_t params[] = { s0, s1, p0, p1, d0, d1, (is_2D ? 1 : 0) };
5395
5486
  ggml_set_op_params(result, params, sizeof(params));
5396
5487
 
@@ -5415,7 +5506,7 @@ struct ggml_tensor * ggml_conv_2d(
5415
5506
  int p1,
5416
5507
  int d0,
5417
5508
  int d1) {
5418
- struct ggml_tensor * im2col = ggml_im2col(ctx, a, b, s0, s1, p0, p1, d0, d1, true); // [N, OH, OW, IC * KH * KW]
5509
+ struct ggml_tensor * im2col = ggml_im2col(ctx, a, b, s0, s1, p0, p1, d0, d1, true, GGML_TYPE_F16); // [N, OH, OW, IC * KH * KW]
5419
5510
 
5420
5511
  struct ggml_tensor * result =
5421
5512
  ggml_mul_mat(ctx,
@@ -5541,12 +5632,13 @@ struct ggml_tensor * ggml_pool_2d(
5541
5632
  is_node = true;
5542
5633
  }
5543
5634
 
5635
+ struct ggml_tensor * result;
5544
5636
  const int64_t ne[3] = {
5545
5637
  ggml_calc_pool_output_size(a->ne[0], k0, s0, p0),
5546
5638
  ggml_calc_pool_output_size(a->ne[1], k1, s1, p1),
5547
5639
  a->ne[2],
5548
5640
  };
5549
- struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 3, ne);
5641
+ result = ggml_new_tensor(ctx, GGML_TYPE_F32, 3, ne);
5550
5642
 
5551
5643
  int32_t params[] = { op, k0, k1, s0, s1, p0, p1 };
5552
5644
  ggml_set_op_params(result, params, sizeof(params));
@@ -5554,7 +5646,6 @@ struct ggml_tensor * ggml_pool_2d(
5554
5646
  result->op = GGML_OP_POOL_2D;
5555
5647
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5556
5648
  result->src[0] = a;
5557
-
5558
5649
  return result;
5559
5650
  }
5560
5651
 
@@ -7169,6 +7260,17 @@ static void ggml_compute_forward_add_f32(
7169
7260
  const int ith = params->ith;
7170
7261
  const int nth = params->nth;
7171
7262
 
7263
+ #ifdef GGML_USE_CLBLAST
7264
+ if (src1->backend == GGML_BACKEND_GPU) {
7265
+ // TODO: OpenCL kernel support full broadcast
7266
+ GGML_ASSERT(ggml_can_repeat_rows(src1, src0));
7267
+ if (ith == 0) {
7268
+ ggml_cl_add(src0, src1, dst);
7269
+ }
7270
+ return;
7271
+ }
7272
+ #endif
7273
+
7172
7274
  const int nr = ggml_nrows(src0);
7173
7275
 
7174
7276
  GGML_TENSOR_BINARY_OP_LOCALS
@@ -7449,7 +7551,12 @@ static void ggml_compute_forward_add(
7449
7551
  switch (src0->type) {
7450
7552
  case GGML_TYPE_F32:
7451
7553
  {
7452
- ggml_compute_forward_add_f32(params, src0, src1, dst);
7554
+ if (src1->type == GGML_TYPE_F32) {
7555
+ ggml_compute_forward_add_f32(params, src0, src1, dst);
7556
+ }
7557
+ else {
7558
+ GGML_ASSERT(false);
7559
+ }
7453
7560
  } break;
7454
7561
  case GGML_TYPE_F16:
7455
7562
  {
@@ -7475,6 +7582,7 @@ static void ggml_compute_forward_add(
7475
7582
  case GGML_TYPE_Q6_K:
7476
7583
  case GGML_TYPE_IQ2_XXS:
7477
7584
  case GGML_TYPE_IQ2_XS:
7585
+ case GGML_TYPE_IQ3_XXS:
7478
7586
  {
7479
7587
  ggml_compute_forward_add_q_f32(params, src0, src1, dst);
7480
7588
  } break;
@@ -7741,6 +7849,7 @@ static void ggml_compute_forward_add1(
7741
7849
  case GGML_TYPE_Q6_K:
7742
7850
  case GGML_TYPE_IQ2_XXS:
7743
7851
  case GGML_TYPE_IQ2_XS:
7852
+ case GGML_TYPE_IQ3_XXS:
7744
7853
  {
7745
7854
  ggml_compute_forward_add1_q_f32(params, src0, src1, dst);
7746
7855
  } break;
@@ -7770,6 +7879,9 @@ static void ggml_compute_forward_acc_f32(
7770
7879
  bool inplace = (bool) ((int32_t *) dst->op_params)[4];
7771
7880
 
7772
7881
  if (!inplace && (params->type == GGML_TASK_INIT)) {
7882
+ if (params->ith != 0) {
7883
+ return;
7884
+ }
7773
7885
  // memcpy needs to be synchronized across threads to avoid race conditions.
7774
7886
  // => do it in INIT phase
7775
7887
  memcpy(
@@ -7857,6 +7969,7 @@ static void ggml_compute_forward_acc(
7857
7969
  case GGML_TYPE_Q6_K:
7858
7970
  case GGML_TYPE_IQ2_XXS:
7859
7971
  case GGML_TYPE_IQ2_XS:
7972
+ case GGML_TYPE_IQ3_XXS:
7860
7973
  default:
7861
7974
  {
7862
7975
  GGML_ASSERT(false);
@@ -7958,7 +8071,7 @@ static void ggml_compute_forward_mul_f32(
7958
8071
  const int ith = params->ith;
7959
8072
  const int nth = params->nth;
7960
8073
 
7961
- #ifdef GGML_USE_CLBLAST
8074
+ #if defined(GGML_USE_CLBLAST)
7962
8075
  if (src1->backend == GGML_BACKEND_GPU) {
7963
8076
  // TODO: OpenCL kernel support full broadcast
7964
8077
  GGML_ASSERT(ggml_can_repeat_rows(src1, src0));
@@ -9339,6 +9452,87 @@ static void ggml_compute_forward_silu_back(
9339
9452
  }
9340
9453
  }
9341
9454
 
9455
+
9456
+ static void ggml_compute_forward_hardswish_f32(
9457
+ const struct ggml_compute_params * params,
9458
+ const struct ggml_tensor * src0,
9459
+ struct ggml_tensor * dst) {
9460
+ assert(params->ith == 0);
9461
+ assert(ggml_are_same_shape(src0, dst));
9462
+
9463
+ if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
9464
+ return;
9465
+ }
9466
+
9467
+ const int n = ggml_nrows(src0);
9468
+ const int nc = src0->ne[0];
9469
+
9470
+ assert(dst->nb[0] == sizeof(float));
9471
+ assert(src0->nb[0] == sizeof(float));
9472
+
9473
+ for (int i = 0; i < n; i++) {
9474
+ ggml_vec_hardswish_f32(nc,
9475
+ (float *) ((char *) dst->data + i*( dst->nb[1])),
9476
+ (float *) ((char *) src0->data + i*(src0->nb[1])));
9477
+ }
9478
+ }
9479
+ static void ggml_compute_forward_hardswish(
9480
+ const struct ggml_compute_params * params,
9481
+ const struct ggml_tensor * src0,
9482
+ struct ggml_tensor * dst) {
9483
+ switch (src0->type) {
9484
+ case GGML_TYPE_F32:
9485
+ {
9486
+ ggml_compute_forward_hardswish_f32(params, src0, dst);
9487
+ } break;
9488
+ default:
9489
+ {
9490
+ GGML_ASSERT(false);
9491
+ } break;
9492
+ }
9493
+ }
9494
+
9495
+ static void ggml_compute_forward_hardsigmoid_f32(
9496
+ const struct ggml_compute_params * params,
9497
+ const struct ggml_tensor * src0,
9498
+ struct ggml_tensor * dst) {
9499
+ assert(params->ith == 0);
9500
+ assert(ggml_are_same_shape(src0, dst));
9501
+
9502
+ if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
9503
+ return;
9504
+ }
9505
+
9506
+ const int n = ggml_nrows(src0);
9507
+ const int nc = src0->ne[0];
9508
+
9509
+ assert(dst->nb[0] == sizeof(float));
9510
+ assert(src0->nb[0] == sizeof(float));
9511
+
9512
+ for (int i = 0; i < n; i++) {
9513
+ ggml_vec_hardsigmoid_f32(nc,
9514
+ (float *) ((char *) dst->data + i*( dst->nb[1])),
9515
+ (float *) ((char *) src0->data + i*(src0->nb[1])));
9516
+ }
9517
+ }
9518
+
9519
+ static void ggml_compute_forward_hardsigmoid(
9520
+ const struct ggml_compute_params * params,
9521
+ const struct ggml_tensor * src0,
9522
+ struct ggml_tensor * dst) {
9523
+ switch (src0->type) {
9524
+ case GGML_TYPE_F32:
9525
+ {
9526
+ ggml_compute_forward_hardsigmoid_f32(params, src0, dst);
9527
+ } break;
9528
+ default:
9529
+ {
9530
+ GGML_ASSERT(false);
9531
+ } break;
9532
+ }
9533
+ }
9534
+
9535
+
9342
9536
  // ggml_compute_forward_norm
9343
9537
 
9344
9538
  static void ggml_compute_forward_norm_f32(
@@ -9790,10 +9984,6 @@ static void ggml_compute_forward_mul_mat(
9790
9984
  const int ith = params->ith;
9791
9985
  const int nth = params->nth;
9792
9986
 
9793
- if (ith == 1 && g_imatrix_collect) {
9794
- g_imatrix_collect(src0, src1);
9795
- }
9796
-
9797
9987
  const enum ggml_type type = src0->type;
9798
9988
 
9799
9989
  const bool src1_cont = ggml_is_contiguous(src1);
@@ -9835,11 +10025,30 @@ static void ggml_compute_forward_mul_mat(
9835
10025
 
9836
10026
  #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
9837
10027
  if (ggml_compute_forward_mul_mat_use_blas(dst)) {
9838
- if (params->ith != 0) {
9839
- return;
9840
- }
10028
+ const int64_t ne_plane = ne01*ne00;
10029
+ const size_t desired_wsize = ne13*ne12*ne_plane*sizeof(float);
10030
+ UNUSED(desired_wsize);
9841
10031
 
9842
10032
  if (params->type == GGML_TASK_INIT) {
10033
+ if (type != GGML_TYPE_F32) {
10034
+ assert(params->wsize >= desired_wsize);
10035
+ // parallelize by src0 rows
10036
+ for (int64_t i13 = 0; i13 < ne13; i13++) {
10037
+ for (int64_t i12 = 0; i12 < ne12; i12++) {
10038
+ // broadcast src0 into src1 across 2nd,3rd dimension
10039
+ const int64_t i03 = i13/r3;
10040
+ const int64_t i02 = i12/r2;
10041
+
10042
+ const void * x = (char *) src0->data + i02*nb02 + i03*nb03;
10043
+ float * const wdata = (float *) params->wdata + i13*ne12*ne_plane + i12*ne_plane;
10044
+ ggml_to_float_t const to_float = type_traits[type].to_float;
10045
+
10046
+ for (int64_t i01 = ith; i01 < ne01; i01 += nth) {
10047
+ to_float((const char *) x + i01*nb01, wdata + i01*ne00, ne00);
10048
+ }
10049
+ }
10050
+ }
10051
+ }
9843
10052
  return;
9844
10053
  }
9845
10054
 
@@ -9847,9 +10056,14 @@ static void ggml_compute_forward_mul_mat(
9847
10056
  return;
9848
10057
  }
9849
10058
 
10059
+ // perform sgemm, parallelization controlled by blas lib
10060
+ if (ith != 0) {
10061
+ return;
10062
+ }
10063
+
10064
+ //const int64_t tgemm0 = ggml_perf_time_us();
9850
10065
  for (int64_t i13 = 0; i13 < ne13; i13++) {
9851
10066
  for (int64_t i12 = 0; i12 < ne12; i12++) {
9852
- // broadcast src0 into src1 across 2nd,3rd dimension
9853
10067
  const int64_t i03 = i13/r3;
9854
10068
  const int64_t i02 = i12/r2;
9855
10069
 
@@ -9858,17 +10072,7 @@ static void ggml_compute_forward_mul_mat(
9858
10072
  float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
9859
10073
 
9860
10074
  if (type != GGML_TYPE_F32) {
9861
- float * const wdata = params->wdata;
9862
- ggml_to_float_t const to_float = type_traits[type].to_float;
9863
-
9864
- size_t id = 0;
9865
- for (int64_t i01 = 0; i01 < ne01; ++i01) {
9866
- to_float((const char *) x + i01*nb01, wdata + id, ne00);
9867
- id += ne00;
9868
- }
9869
-
9870
- assert(id*sizeof(float) <= params->wsize);
9871
- x = wdata;
10075
+ x = (float *) params->wdata + i13*ne12*ne_plane + i12*ne_plane;
9872
10076
  }
9873
10077
 
9874
10078
  cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
@@ -9878,6 +10082,7 @@ static void ggml_compute_forward_mul_mat(
9878
10082
  0.0f, d, ne01);
9879
10083
  }
9880
10084
  }
10085
+ //printf("cblas_sgemm = %.3f ms, %lld flops\n", (ggml_perf_time_us() - tgemm0)/1000.0, ne13*ne12*ne1*ne01*ne10*2);
9881
10086
 
9882
10087
  //printf("CBLAS = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3);
9883
10088
 
@@ -9886,6 +10091,9 @@ static void ggml_compute_forward_mul_mat(
9886
10091
  #endif
9887
10092
 
9888
10093
  if (params->type == GGML_TASK_INIT) {
10094
+ if (ith != 0) {
10095
+ return;
10096
+ }
9889
10097
  if (src1->type != vec_dot_type) {
9890
10098
  char * wdata = params->wdata;
9891
10099
  const size_t row_size = ggml_row_size(vec_dot_type, ne10);
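The INIT phase is now entered by every thread (see the reworked compute loop later in this diff), so setup code follows one of two idioms: bail out on all threads except 0 for inherently serial work, as in the guard added here, or split the work with the strided-row pattern used by the BLAS dequantization above. A minimal sketch of both:

if (params->type == GGML_TASK_INIT) {
    // idiom 1: serial setup - only thread 0 performs the memset/quantization prep
    if (params->ith != 0) {
        return;
    }
    /* ... one-time initialization ... */
    return;
}

// idiom 2: parallel work - thread ith handles rows ith, ith + nth, ith + 2*nth, ...
for (int64_t i01 = params->ith; i01 < ne01; i01 += params->nth) {
    /* ... process row i01 ... */
}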
@@ -10050,6 +10258,9 @@ static void ggml_compute_forward_mul_mat_id(
10050
10258
  #define MMID_MATRIX_ROW(row_id, i1) matrix_rows[(row_id)*ne11 + (i1)]
10051
10259
 
10052
10260
  if (params->type == GGML_TASK_INIT) {
10261
+ if (ith != 0) {
10262
+ return;
10263
+ }
10053
10264
  char * wdata = params->wdata;
10054
10265
  if (src1->type != vec_dot_type) {
10055
10266
  const size_t row_size = ggml_row_size(vec_dot_type, ne10);
@@ -10097,10 +10308,6 @@ static void ggml_compute_forward_mul_mat_id(
10097
10308
 
10098
10309
  const struct ggml_tensor * src0_cur = dst->src[cur_a + 2];
10099
10310
 
10100
- if (ith == 1 && g_imatrix_collect) {
10101
- g_imatrix_collect(src0_cur, src1);
10102
- }
10103
-
10104
10311
  const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
10105
10312
  const size_t row_size = ggml_row_size(vec_dot_type, ne10);
10106
10313
 
@@ -10239,6 +10446,9 @@ static void ggml_compute_forward_out_prod_f32(
10239
10446
  return;
10240
10447
  }
10241
10448
  #endif
10449
+ if (ith != 0) {
10450
+ return;
10451
+ }
10242
10452
  ggml_vec_set_f32(ne0*ne1*ne2*ne3, dst->data, 0);
10243
10453
  return;
10244
10454
  }
@@ -10422,6 +10632,9 @@ static void ggml_compute_forward_out_prod_q_f32(
10422
10632
  // TODO: #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
10423
10633
 
10424
10634
  if (params->type == GGML_TASK_INIT) {
10635
+ if (ith != 0) {
10636
+ return;
10637
+ }
10425
10638
  ggml_vec_set_f32(ne0*ne1*ne2*ne3, dst->data, 0);
10426
10639
  return;
10427
10640
  }
@@ -10508,6 +10721,7 @@ static void ggml_compute_forward_out_prod(
10508
10721
  case GGML_TYPE_Q6_K:
10509
10722
  case GGML_TYPE_IQ2_XXS:
10510
10723
  case GGML_TYPE_IQ2_XS:
10724
+ case GGML_TYPE_IQ3_XXS:
10511
10725
  {
10512
10726
  ggml_compute_forward_out_prod_q_f32(params, src0, src1, dst);
10513
10727
  } break;
@@ -10606,6 +10820,9 @@ static void ggml_compute_forward_set_f32(
10606
10820
  bool inplace = (bool) ((int32_t *) dst->op_params)[4];
10607
10821
 
10608
10822
  if (!inplace && (params->type == GGML_TASK_INIT)) {
10823
+ if (params->ith != 0) {
10824
+ return;
10825
+ }
10609
10826
  // memcpy needs to be synchronized across threads to avoid race conditions.
10610
10827
  // => do it in INIT phase
10611
10828
  memcpy(
@@ -10684,6 +10901,7 @@ static void ggml_compute_forward_set(
10684
10901
  case GGML_TYPE_Q6_K:
10685
10902
  case GGML_TYPE_IQ2_XXS:
10686
10903
  case GGML_TYPE_IQ2_XS:
10904
+ case GGML_TYPE_IQ3_XXS:
10687
10905
  default:
10688
10906
  {
10689
10907
  GGML_ASSERT(false);
@@ -10880,6 +11098,7 @@ static void ggml_compute_forward_get_rows(
10880
11098
  case GGML_TYPE_Q6_K:
10881
11099
  case GGML_TYPE_IQ2_XXS:
10882
11100
  case GGML_TYPE_IQ2_XS:
11101
+ case GGML_TYPE_IQ3_XXS:
10883
11102
  {
10884
11103
  ggml_compute_forward_get_rows_q(params, src0, src1, dst);
10885
11104
  } break;
@@ -10930,6 +11149,9 @@ static void ggml_compute_forward_get_rows_back_f32_f16(
10930
11149
  // ggml_compute_forward_dup_same_cont(params, opt0, dst);
10931
11150
 
10932
11151
  if (params->type == GGML_TASK_INIT) {
11152
+ if (params->ith != 0) {
11153
+ return;
11154
+ }
10933
11155
  memset(dst->data, 0, ggml_nbytes(dst));
10934
11156
  }
10935
11157
 
@@ -10964,6 +11186,9 @@ static void ggml_compute_forward_get_rows_back_f32(
10964
11186
  // ggml_compute_forward_dup_same_cont(params, opt0, dst);
10965
11187
 
10966
11188
  if (params->type == GGML_TASK_INIT) {
11189
+ if (params->ith != 0) {
11190
+ return;
11191
+ }
10967
11192
  memset(dst->data, 0, ggml_nbytes(dst));
10968
11193
  }
10969
11194
 
@@ -11101,6 +11326,9 @@ static void ggml_compute_forward_diag_mask_f32(
11101
11326
  GGML_ASSERT(n_past >= 0);
11102
11327
 
11103
11328
  if (!inplace && (params->type == GGML_TASK_INIT)) {
11329
+ if (ith != 0) {
11330
+ return;
11331
+ }
11104
11332
  // memcpy needs to be synchronized across threads to avoid race conditions.
11105
11333
  // => do it in INIT phase
11106
11334
  GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0));
@@ -11518,6 +11746,7 @@ static void ggml_compute_forward_alibi(
11518
11746
  case GGML_TYPE_Q6_K:
11519
11747
  case GGML_TYPE_IQ2_XXS:
11520
11748
  case GGML_TYPE_IQ2_XS:
11749
+ case GGML_TYPE_IQ3_XXS:
11521
11750
  case GGML_TYPE_Q8_K:
11522
11751
  case GGML_TYPE_I8:
11523
11752
  case GGML_TYPE_I16:
@@ -11594,6 +11823,7 @@ static void ggml_compute_forward_clamp(
11594
11823
  case GGML_TYPE_Q6_K:
11595
11824
  case GGML_TYPE_IQ2_XXS:
11596
11825
  case GGML_TYPE_IQ2_XS:
11826
+ case GGML_TYPE_IQ3_XXS:
11597
11827
  case GGML_TYPE_Q8_K:
11598
11828
  case GGML_TYPE_I8:
11599
11829
  case GGML_TYPE_I16:
@@ -12071,6 +12301,9 @@ static void ggml_compute_forward_conv_transpose_1d_f16_f32(
12071
12301
  GGML_ASSERT(nb10 == sizeof(float));
12072
12302
 
12073
12303
  if (params->type == GGML_TASK_INIT) {
12304
+ if (ith != 0) {
12305
+ return;
12306
+ }
12074
12307
  memset(params->wdata, 0, params->wsize);
12075
12308
 
12076
12309
  // permute kernel data (src0) from (K x Cout x Cin) to (Cin x K x Cout)
@@ -12165,6 +12398,9 @@ static void ggml_compute_forward_conv_transpose_1d_f32(
12165
12398
  GGML_ASSERT(nb10 == sizeof(float));
12166
12399
 
12167
12400
  if (params->type == GGML_TASK_INIT) {
12401
+ if (ith != 0) {
12402
+ return;
12403
+ }
12168
12404
  memset(params->wdata, 0, params->wsize);
12169
12405
 
12170
12406
  // prepare kernel data (src0) from (K x Cout x Cin) to (Cin x K x Cout)
@@ -12257,6 +12493,92 @@ static void ggml_compute_forward_conv_transpose_1d(
12257
12493
  }
12258
12494
  }
12259
12495
 
12496
+ // src0: kernel [OC, IC, KH, KW]
12497
+ // src1: image [N, IC, IH, IW]
12498
+ // dst: result [N, OH, OW, IC*KH*KW]
12499
+ static void ggml_compute_forward_im2col_f32(
12500
+ const struct ggml_compute_params * params,
12501
+ const struct ggml_tensor * src0,
12502
+ const struct ggml_tensor * src1,
12503
+ struct ggml_tensor * dst) {
12504
+ GGML_ASSERT(src0->type == GGML_TYPE_F16);
12505
+ GGML_ASSERT(src1->type == GGML_TYPE_F32);
12506
+ GGML_ASSERT( dst->type == GGML_TYPE_F32);
12507
+
12508
+ int64_t t0 = ggml_perf_time_us();
12509
+ UNUSED(t0);
12510
+
12511
+ GGML_TENSOR_BINARY_OP_LOCALS;
12512
+
12513
+ const int32_t s0 = ((const int32_t *)(dst->op_params))[0];
12514
+ const int32_t s1 = ((const int32_t *)(dst->op_params))[1];
12515
+ const int32_t p0 = ((const int32_t *)(dst->op_params))[2];
12516
+ const int32_t p1 = ((const int32_t *)(dst->op_params))[3];
12517
+ const int32_t d0 = ((const int32_t *)(dst->op_params))[4];
12518
+ const int32_t d1 = ((const int32_t *)(dst->op_params))[5];
12519
+ const bool is_2D = ((const int32_t *)(dst->op_params))[6] == 1;
12520
+
12521
+ const int ith = params->ith;
12522
+ const int nth = params->nth;
12523
+
12524
+ const int64_t N = is_2D ? ne13 : ne12;
12525
+ const int64_t IC = is_2D ? ne12 : ne11;
12526
+ const int64_t IH = is_2D ? ne11 : 1;
12527
+ const int64_t IW = ne10;
12528
+
12529
+ const int64_t KH = is_2D ? ne01 : 1;
12530
+ const int64_t KW = ne00;
12531
+
12532
+ const int64_t OH = is_2D ? ne2 : 1;
12533
+ const int64_t OW = ne1;
12534
+
12535
+ int ofs0 = is_2D ? nb13 : nb12;
12536
+ int ofs1 = is_2D ? nb12 : nb11;
12537
+
12538
+ GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
12539
+ GGML_ASSERT(nb10 == sizeof(float));
12540
+
12541
+ if (params->type == GGML_TASK_INIT) {
12542
+ return;
12543
+ }
12544
+
12545
+ if (params->type == GGML_TASK_FINALIZE) {
12546
+ return;
12547
+ }
12548
+
12549
+ // im2col: [N, IC, IH, IW] => [N, OH, OW, IC*KH*KW]
12550
+ {
12551
+ float * const wdata = (float *) dst->data;
12552
+
12553
+ for (int64_t in = 0; in < N; in++) {
12554
+ for (int64_t ioh = 0; ioh < OH; ioh++) { // 1
12555
+ for (int64_t iow = 0; iow < OW; iow++) {
12556
+ for (int64_t iic = ith; iic < IC; iic += nth) {
12557
+
12558
+ // micro kernel
12559
+ float * dst_data = wdata + (in*OH*OW + ioh*OW + iow)*(IC*KH*KW); // [IC, KH, KW]
12560
+ const float * const src_data = (float *)((char *) src1->data + in*ofs0 + iic*ofs1); // [IH, IW]
12561
+
12562
+ for (int64_t ikh = 0; ikh < KH; ikh++) { // 1
12563
+ for (int64_t ikw = 0; ikw < KW; ikw++) {
12564
+ const int64_t iiw = iow*s0 + ikw*d0 - p0;
12565
+ const int64_t iih = ioh*s1 + ikh*d1 - p1;
12566
+
12567
+ if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) {
12568
+ dst_data[iic*(KH*KW) + ikh*KW + ikw] = 0;
12569
+ } else {
12570
+ dst_data[iic*(KH*KW) + ikh*KW + ikw] = (src_data[iih*IW + iiw]);
12571
+ }
12572
+ }
12573
+ }
12574
+ }
12575
+ }
12576
+ }
12577
+ }
12578
+ }
12579
+ }
12580
+
12581
+
12260
12582
  // src0: kernel [OC, IC, KH, KW]
12261
12583
  // src1: image [N, IC, IH, IW]
12262
12584
  // dst: result [N, OH, OW, IC*KH*KW]
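The new F32 path uses the same index arithmetic as the F16 one: output column (iow, ioh) samples the input at iiw = iow*s0 + ikw*d0 - p0 and iih = ioh*s1 + ikh*d1 - p1, and out-of-bounds taps are written as zero. A small worked example with assumed parameters:

// assumed 1D slice: IW = 8, KW = 3, stride s0 = 2, padding p0 = 1, dilation d0 = 1
// output width OW = (IW + 2*p0 - d0*(KW - 1) - 1)/s0 + 1 = (8 + 2 - 2 - 1)/2 + 1 = 4
// for output column iow = 0 the three taps read input positions
//   iiw = 0*2 + {0, 1, 2}*1 - 1 = {-1, 0, 1}   -> the -1 tap falls in the padding and stores 0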
@@ -12347,14 +12669,14 @@ static void ggml_compute_forward_im2col(
12347
12669
  const struct ggml_tensor * src0,
12348
12670
  const struct ggml_tensor * src1,
12349
12671
  struct ggml_tensor * dst) {
12350
- switch (src0->type) {
12672
+ switch (dst->type) {
12351
12673
  case GGML_TYPE_F16:
12352
12674
  {
12353
12675
  ggml_compute_forward_im2col_f16(params, src0, src1, dst);
12354
12676
  } break;
12355
12677
  case GGML_TYPE_F32:
12356
12678
  {
12357
- GGML_ASSERT(false);
12679
+ ggml_compute_forward_im2col_f32(params, src0, src1, dst);
12358
12680
  } break;
12359
12681
  default:
12360
12682
  {
@@ -12363,6 +12685,7 @@ static void ggml_compute_forward_im2col(
12363
12685
  }
12364
12686
  }
12365
12687
 
12688
+
12366
12689
  // ggml_compute_forward_conv_transpose_2d
12367
12690
 
12368
12691
  static void ggml_compute_forward_conv_transpose_2d(
@@ -12388,6 +12711,9 @@ static void ggml_compute_forward_conv_transpose_2d(
12388
12711
  GGML_ASSERT(nb10 == sizeof(float));
12389
12712
 
12390
12713
  if (params->type == GGML_TASK_INIT) {
12714
+ if (ith != 0) {
12715
+ return;
12716
+ }
12391
12717
  memset(params->wdata, 0, params->wsize);
12392
12718
 
12393
12719
  // permute kernel data (src0) from (Kw x Kh x Cout x Cin) to (Cin x Kw x Kh x Cout)
@@ -12541,8 +12867,8 @@ static void ggml_compute_forward_pool_2d(
12541
12867
  const struct ggml_compute_params * params,
12542
12868
  const struct ggml_tensor * src,
12543
12869
  struct ggml_tensor * dst) {
12544
- assert(src->type == GGML_TYPE_F32);
12545
- assert(params->ith == 0);
12870
+ GGML_ASSERT(src->type == GGML_TYPE_F32);
12871
+ GGML_ASSERT(params->ith == 0);
12546
12872
 
12547
12873
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
12548
12874
  return;
@@ -13931,6 +14257,14 @@ static void ggml_compute_forward_unary(
13931
14257
  {
13932
14258
  ggml_compute_forward_silu(params, src0, dst);
13933
14259
  } break;
14260
+ case GGML_UNARY_OP_HARDSWISH:
14261
+ {
14262
+ ggml_compute_forward_hardswish(params, src0, dst);
14263
+ } break;
14264
+ case GGML_UNARY_OP_HARDSIGMOID:
14265
+ {
14266
+ ggml_compute_forward_hardsigmoid(params, src0, dst);
14267
+ } break;
13934
14268
  default:
13935
14269
  {
13936
14270
  GGML_ASSERT(false);
@@ -13994,6 +14328,9 @@ static void ggml_compute_forward_add_rel_pos_f32(
13994
14328
 
13995
14329
  const bool inplace = (bool) ((int32_t *) dst->op_params)[0];
13996
14330
  if (!inplace && params->type == GGML_TASK_INIT) {
14331
+ if (params->ith != 0) {
14332
+ return;
14333
+ }
13997
14334
  memcpy((char *) dst->data, (char *) src0->data, ggml_nbytes(dst));
13998
14335
  return;
13999
14336
  }
@@ -14509,8 +14846,26 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
14509
14846
  }
14510
14847
  GGML_ASSERT(tensor->src[0] == NULL || tensor->src[0]->backend == GGML_BACKEND_CPU);
14511
14848
  GGML_ASSERT(tensor->src[1] == NULL || tensor->src[1]->backend == GGML_BACKEND_CPU);
14849
+ #elif defined(GGML_USE_VULKAN)
14850
+ const bool skip_cpu = ggml_vk_compute_forward(params, tensor);
14851
+ #ifdef GGML_VULKAN_CHECK_RESULTS
14852
+ if (skip_cpu) {
14853
+ ggml_vk_check_results_1(params, tensor);
14854
+ }
14855
+ #endif
14856
+ if (skip_cpu) {
14857
+ return;
14858
+ }
14859
+ GGML_ASSERT(tensor->src[0] == NULL || tensor->src[0]->backend == GGML_BACKEND_CPU);
14860
+ GGML_ASSERT(tensor->src[1] == NULL || tensor->src[1]->backend == GGML_BACKEND_CPU);
14512
14861
  #endif // GGML_USE_CUBLAS
14513
14862
 
14863
+ #ifdef GGML_USE_SYCL
14864
+ bool skip_cpu = ggml_sycl_compute_forward(params, tensor);
14865
+ if (skip_cpu) {
14866
+ return;
14867
+ }
14868
+ #endif // GGML_USE_SYCL
14514
14869
  switch (tensor->op) {
14515
14870
  case GGML_OP_DUP:
14516
14871
  {
@@ -14913,13 +15268,13 @@ struct ggml_hash_set ggml_hash_set_new(size_t size) {
14913
15268
  size = ggml_hash_size(size);
14914
15269
  struct ggml_hash_set result;
14915
15270
  result.size = size;
14916
- result.keys = malloc(sizeof(struct ggml_tensor *) * size);
15271
+ result.keys = GGML_MALLOC(sizeof(struct ggml_tensor *) * size);
14917
15272
  memset(result.keys, 0, sizeof(struct ggml_tensor *) * size);
14918
15273
  return result;
14919
15274
  }
14920
15275
 
14921
15276
  static void ggml_hash_set_free(struct ggml_hash_set hash_set) {
14922
- free(hash_set.keys);
15277
+ GGML_FREE(hash_set.keys);
14923
15278
  }
14924
15279
 
14925
15280
  struct hash_map {
@@ -14928,17 +15283,17 @@ struct hash_map {
14928
15283
  };
14929
15284
 
14930
15285
  static struct hash_map * ggml_new_hash_map(size_t size) {
14931
- struct hash_map * result = malloc(sizeof(struct hash_map));
15286
+ struct hash_map * result = GGML_MALLOC(sizeof(struct hash_map));
14932
15287
  result->set = ggml_hash_set_new(size);
14933
- result->vals = malloc(sizeof(struct ggml_tensor *) * result->set.size);
15288
+ result->vals = GGML_MALLOC(sizeof(struct ggml_tensor *) * result->set.size);
14934
15289
  memset(result->vals, 0, sizeof(struct ggml_tensor *) * result->set.size);
14935
15290
  return result;
14936
15291
  }
14937
15292
 
14938
15293
  static void ggml_hash_map_free(struct hash_map * map) {
14939
15294
  ggml_hash_set_free(map->set);
14940
- free(map->vals);
14941
- free(map);
15295
+ GGML_FREE(map->vals);
15296
+ GGML_FREE(map);
14942
15297
  }
14943
15298
 
14944
15299
  // gradient checkpointing
@@ -16287,8 +16642,9 @@ struct ggml_compute_state_shared {
16287
16642
  const int n_threads;
16288
16643
 
16289
16644
  // synchronization primitives
16290
- atomic_int n_active; // num active threads
16291
- atomic_int node_n; // active graph node
16645
+ atomic_int n_active; // num active threads
16646
+ atomic_int node_n; // active graph node
16647
+ atomic_int node_task; // active graph node task phase
16292
16648
 
16293
16649
  bool (*abort_callback)(void * data); // abort ggml_graph_compute when true
16294
16650
  void * abort_callback_data;
@@ -16344,6 +16700,8 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
16344
16700
  case GGML_UNARY_OP_TANH:
16345
16701
  case GGML_UNARY_OP_ELU:
16346
16702
  case GGML_UNARY_OP_RELU:
16703
+ case GGML_UNARY_OP_HARDSWISH: // to opt for multiple threads
16704
+ case GGML_UNARY_OP_HARDSIGMOID: // to opt for multiple threads
16347
16705
  {
16348
16706
  n_tasks = 1;
16349
16707
  } break;
@@ -16420,7 +16778,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
16420
16778
  } break;
16421
16779
  case GGML_OP_SOFT_MAX:
16422
16780
  {
16423
- n_tasks = MIN(MIN(4, n_threads), ggml_nrows(node->src[0]));
16781
+ n_tasks = MIN(n_threads, ggml_nrows(node->src[0]));
16424
16782
  } break;
16425
16783
  case GGML_OP_CONV_TRANSPOSE_1D:
16426
16784
  {
@@ -16534,6 +16892,34 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
16534
16892
  return n_tasks;
16535
16893
  }
16536
16894
 
16895
+ static void ggml_graph_compute_thread_sync_node(int * node_n, struct ggml_compute_state * state, const bool do_yield) {
16896
+ // wait for other threads to finish
16897
+ const int last_node_n = * node_n;
16898
+
16899
+ while (true) {
16900
+ if (do_yield) {
16901
+ sched_yield();
16902
+ }
16903
+
16904
+ * node_n = atomic_load(&state->shared->node_n);
16905
+ if (* node_n != last_node_n) break;
16906
+ }
16907
+ }
16908
+
16909
+ static void ggml_graph_compute_thread_sync_task(int * task_phase, struct ggml_compute_state * state, const bool do_yield) {
16910
+ // wait for other threads to finish
16911
+ const int last_task_phase = * task_phase;
16912
+
16913
+ while (true) {
16914
+ if (do_yield) {
16915
+ sched_yield();
16916
+ }
16917
+
16918
+ * task_phase = atomic_load(&state->shared->node_task);
16919
+ if (* task_phase != last_task_phase) break;
16920
+ }
16921
+ }
16922
+
16537
16923
  static thread_ret_t ggml_graph_compute_thread(void * data) {
16538
16924
  struct ggml_compute_state * state = (struct ggml_compute_state *) data;
16539
16925
 
@@ -16544,7 +16930,8 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
16544
16930
 
16545
16931
  set_numa_thread_affinity(state->ith, n_threads);
16546
16932
 
16547
- int node_n = -1;
16933
+ int node_n = -1;
16934
+ int task_phase = GGML_TASK_FINALIZE;
16548
16935
 
16549
16936
  while (true) {
16550
16937
  if (cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) {
@@ -16576,7 +16963,6 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
16576
16963
  // distribute new work or execute it direct if 1T
16577
16964
  while (++node_n < cgraph->n_nodes) {
16578
16965
  GGML_PRINT_DEBUG_5("%s: %d/%d\n", __func__, node_n, cgraph->n_nodes);
16579
-
16580
16966
  struct ggml_tensor * node = cgraph->nodes[node_n];
16581
16967
  const int n_tasks = ggml_get_n_tasks(node, n_threads);
16582
16968
 
@@ -16585,13 +16971,13 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
16585
16971
 
16586
16972
  params.nth = n_tasks;
16587
16973
 
16588
- /* INIT */
16589
- if (GGML_OP_HAS_INIT[node->op]) {
16590
- params.type = GGML_TASK_INIT;
16591
- ggml_compute_forward(&params, node);
16592
- }
16593
-
16594
16974
  if (n_tasks == 1) {
16975
+ /* INIT */
16976
+ if (GGML_OP_HAS_INIT[node->op]) {
16977
+ params.type = GGML_TASK_INIT;
16978
+ ggml_compute_forward(&params, node);
16979
+ }
16980
+
16595
16981
  // TODO: maybe push node_n to the atomic but if other threads see n_tasks is 1,
16596
16982
  // they do something more efficient than spinning (?)
16597
16983
  params.type = GGML_TASK_COMPUTE;
@@ -16612,38 +16998,24 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
16612
16998
  }
16613
16999
  }
16614
17000
 
16615
- atomic_store(&state->shared->n_active, n_threads);
16616
- atomic_store(&state->shared->node_n, node_n);
17001
+ task_phase = GGML_TASK_INIT;
17002
+ atomic_store(&state->shared->n_active, n_threads);
17003
+ atomic_store(&state->shared->node_n, node_n);
17004
+ atomic_store(&state->shared->node_task, task_phase);
16617
17005
  } else {
16618
- // wait for other threads to finish
16619
- const int last = node_n;
16620
-
16621
- const bool do_yield = last < 0 || cgraph->nodes[last]->op == GGML_OP_MUL_MAT;
16622
-
16623
- while (true) {
16624
- // TODO: this sched_yield can have significant impact on the performance - either positive or negative
16625
- // depending on the workload and the operating system.
16626
- // since it is not clear what is the best approach, it should potentially become user-configurable
16627
- // ref: https://github.com/ggerganov/ggml/issues/291
16628
- // UPD: adding the do_yield flag seems to resolve the issue universally
16629
- if (do_yield) {
16630
- sched_yield();
16631
- }
16632
-
16633
- node_n = atomic_load(&state->shared->node_n);
16634
- if (node_n != last) break;
16635
- };
17006
+ ggml_graph_compute_thread_sync_node(&node_n, state, false);
17007
+ ggml_graph_compute_thread_sync_task(&task_phase, state, false);
16636
17008
  }
16637
17009
 
16638
17010
  // check if we should stop
16639
17011
  if (node_n >= cgraph->n_nodes) break;
16640
17012
 
16641
- /* COMPUTE */
17013
+ /* INIT & COMPUTE */
16642
17014
  struct ggml_tensor * node = cgraph->nodes[node_n];
16643
17015
  const int n_tasks = ggml_get_n_tasks(node, n_threads);
16644
17016
 
16645
17017
  struct ggml_compute_params params = {
16646
- /*.type =*/ GGML_TASK_COMPUTE,
17018
+ /*.type =*/ GGML_TASK_INIT,
16647
17019
  /*.ith =*/ state->ith,
16648
17020
  /*.nth =*/ n_tasks,
16649
17021
  /*.wsize =*/ cplan->work_size,
@@ -16651,8 +17023,39 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
16651
17023
  };
16652
17024
 
16653
17025
  if (state->ith < n_tasks) {
17026
+ if (GGML_OP_HAS_INIT[node->op]) {
17027
+ ggml_compute_forward(&params, node);
17028
+ }
17029
+ }
17030
+
17031
+ if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) {
17032
+ task_phase = GGML_TASK_COMPUTE;
17033
+ atomic_store(&state->shared->n_active, n_threads);
17034
+ atomic_store(&state->shared->node_task, task_phase);
17035
+ }
17036
+ else {
17037
+ // TODO: this sched_yield can have significant impact on the performance - either positive or negative
17038
+ // depending on the workload and the operating system.
17039
+ // since it is not clear what is the best approach, it should potentially become user-configurable
17040
+ // ref: https://github.com/ggerganov/ggml/issues/291
17041
+ // UPD: adding the do_yield flag seems to resolve the issue universally
17042
+ const bool do_yield = node_n < 0 || cgraph->nodes[node_n]->op == GGML_OP_MUL_MAT;
17043
+ ggml_graph_compute_thread_sync_task(&task_phase, state, do_yield);
17044
+ }
17045
+
17046
+ if (state->ith < n_tasks) {
17047
+ params.type = GGML_TASK_COMPUTE;
16654
17048
  ggml_compute_forward(&params, node);
16655
17049
  }
17050
+
17051
+ if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) {
17052
+ task_phase = GGML_TASK_FINALIZE;
17053
+ atomic_store(&state->shared->n_active, n_threads);
17054
+ atomic_store(&state->shared->node_task, task_phase);
17055
+ }
17056
+ else {
17057
+ ggml_graph_compute_thread_sync_task(&task_phase, state, false);
17058
+ }
16656
17059
  }
16657
17060
 
16658
17061
  return GGML_EXIT_SUCCESS;
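The reworked loop above replaces the single node_n spin with a node_task phase counter, so all participating threads run INIT, then COMPUTE, then FINALIZE of a node in lock-step: the last thread to finish a phase re-arms n_active and advances node_task, while the others spin on it. A stripped-down sketch of that barrier shape (not the actual ggml code, just the synchronization pattern):

#include <stdatomic.h>

// the last of n_threads workers to arrive advances the shared phase;
// everyone else spins until the phase moves past cur_phase
static void phase_barrier(atomic_int * n_active, atomic_int * phase,
                          int n_threads, int cur_phase, int next_phase) {
    if (atomic_fetch_sub(n_active, 1) == 1) {
        atomic_store(n_active, n_threads); // re-arm the counter for the next phase
        atomic_store(phase, next_phase);   // release the spinning threads
    } else {
        while (atomic_load(phase) == cur_phase) { /* spin, optionally sched_yield() */ }
    }
}

In the real loop the phases are GGML_TASK_INIT -> GGML_TASK_COMPUTE -> GGML_TASK_FINALIZE, and the wait before the COMPUTE phase yields when the current node is a MUL_MAT.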
@@ -16668,12 +17071,16 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
16668
17071
  struct ggml_cplan cplan;
16669
17072
  memset(&cplan, 0, sizeof(struct ggml_cplan));
16670
17073
 
17074
+ int max_tasks = 1;
17075
+
16671
17076
  // thread scheduling for the different operations + work buffer size estimation
16672
17077
  for (int i = 0; i < cgraph->n_nodes; i++) {
16673
17078
  struct ggml_tensor * node = cgraph->nodes[i];
16674
17079
 
16675
17080
  const int n_tasks = ggml_get_n_tasks(node, n_threads);
16676
17081
 
17082
+ max_tasks = MAX(max_tasks, n_tasks);
17083
+
16677
17084
  size_t cur = 0;
16678
17085
 
16679
17086
  switch (node->op) {
@@ -16709,8 +17116,11 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
16709
17116
  #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
16710
17117
  if (ggml_compute_forward_mul_mat_use_blas(node)) {
16711
17118
  if (node->src[0]->type != GGML_TYPE_F32) {
16712
- // here we need memory just for single 2D matrix from src0
16713
- cur = ggml_type_size(GGML_TYPE_F32)*(node->src[0]->ne[0]*node->src[0]->ne[1]);
17119
+ // here we need memory for fully dequantized matrix from src0
17120
+ // take into account that src0 can be broadcasted into src1[2,3]
17121
+ cur = ggml_type_size(GGML_TYPE_F32)
17122
+ * node->src[0]->ne[0]*node->src[0]->ne[1]
17123
+ * node->src[1]->ne[2]*node->src[1]->ne[3];
16714
17124
  }
16715
17125
  } else
16716
17126
  #endif
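Because the BLAS path now dequantizes src0 once for every broadcast plane during INIT, the per-node work buffer must hold all ne[2]*ne[3] planes of src1 instead of a single 2D matrix. A worked example with assumed shapes:

// assumed: src0 is a quantized 4096 x 4096 matrix, src1 has ne[2] = 8 and ne[3] = 1
// old size: 4 bytes * 4096 * 4096         =  64 MiB (one dequantized plane)
// new size: 4 bytes * 4096 * 4096 * 8 * 1 = 512 MiB (one plane per src1[2,3] index)
size_t cur = ggml_type_size(GGML_TYPE_F32) * 4096 * 4096 * 8 * 1;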
@@ -16837,7 +17247,7 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
16837
17247
  work_size += CACHE_LINE_SIZE*(n_threads - 1);
16838
17248
  }
16839
17249
 
16840
- cplan.n_threads = n_threads;
17250
+ cplan.n_threads = MIN(max_tasks, n_threads);
16841
17251
  cplan.work_size = work_size;
16842
17252
  cplan.work_data = NULL;
16843
17253
 
@@ -16854,6 +17264,17 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
16854
17264
  }
16855
17265
  }
16856
17266
 
17267
+ #ifdef GGML_USE_VULKAN
17268
+ for (int i = 0; i < cgraph->n_nodes; i++) {
17269
+ ggml_vk_preallocate_buffers_graph(cgraph->nodes[i]);
17270
+ }
17271
+ ggml_vk_preallocate_buffers();
17272
+
17273
+ for (int i = 0; i < cgraph->n_nodes; i++) {
17274
+ ggml_vk_build_graph(cgraph->nodes[i], i == cgraph->n_nodes - 1);
17275
+ }
17276
+ #endif
17277
+
16857
17278
  const int n_threads = cplan->n_threads;
16858
17279
 
16859
17280
  struct ggml_compute_state_shared state_shared = {
@@ -16864,6 +17285,7 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
16864
17285
  /*.n_threads =*/ n_threads,
16865
17286
  /*.n_active =*/ n_threads,
16866
17287
  /*.node_n =*/ -1,
17288
+ /*.node_task =*/ GGML_TASK_FINALIZE,
16867
17289
  /*.abort_callback =*/ NULL,
16868
17290
  /*.abort_callback_data =*/ NULL,
16869
17291
  };
@@ -16904,6 +17326,10 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
16904
17326
  }
16905
17327
  }
16906
17328
 
17329
+ #ifdef GGML_USE_VULKAN
17330
+ ggml_vk_graph_cleanup();
17331
+ #endif
17332
+
16907
17333
  // performance stats (graph)
16908
17334
  {
16909
17335
  int64_t perf_cycles_cur = ggml_perf_cycles() - perf_start_cycles;
@@ -18538,6 +18964,29 @@ enum ggml_opt_result ggml_opt_resume_g(
18538
18964
 
18539
18965
  ////////////////////////////////////////////////////////////////////////////////
18540
18966
 
18967
+ void ggml_quantize_init(enum ggml_type type) {
18968
+ ggml_critical_section_start();
18969
+
18970
+ switch (type) {
18971
+ case GGML_TYPE_IQ2_XXS: iq2xs_init_impl(256); break;
18972
+ case GGML_TYPE_IQ2_XS: iq2xs_init_impl(512); break;
18973
+ case GGML_TYPE_IQ3_XXS: iq3xs_init_impl(256); break;
18974
+ default: // nothing
18975
+ break;
18976
+ }
18977
+
18978
+ ggml_critical_section_end();
18979
+ }
18980
+
18981
+ void ggml_quantize_free(void) {
18982
+ ggml_critical_section_start();
18983
+
18984
+ iq2xs_free_impl(256);
18985
+ iq2xs_free_impl(512);
18986
+
18987
+ ggml_critical_section_end();
18988
+ }
18989
+
18541
18990
  size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist) {
18542
18991
  assert(k % QK4_0 == 0);
18543
18992
  const int nb = k / QK4_0;
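ggml_quantize_init prepares the shared lookup grids needed by the IQ2/IQ3 codebook types and ggml_quantize_free releases them; since ggml_quantize_chunk now calls the init itself, an explicit call is only needed to control when the (critical-section guarded) setup happens. A hedged usage sketch:

// optional: pay the IQ3_XXS grid setup cost up front; this is a no-op if
// ggml_quantize_chunk has already initialized the type
ggml_quantize_init(GGML_TYPE_IQ3_XXS);

/* ... quantize row chunks via ggml_quantize_chunk(GGML_TYPE_IQ3_XXS, ...) ... */

// release the shared grids once quantization is finished
ggml_quantize_free();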
@@ -18665,9 +19114,15 @@ size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t *
18665
19114
  return (n/QK8_0*sizeof(block_q8_0));
18666
19115
  }
18667
19116
 
19117
+ bool ggml_quantize_requires_imatrix(enum ggml_type type) {
19118
+ return
19119
+ type == GGML_TYPE_IQ2_XXS ||
19120
+ type == GGML_TYPE_IQ2_XS;
19121
+ }
19122
+
18668
19123
  size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start,
18669
19124
  int nrows, int n_per_row, int64_t * hist, const float * imatrix) {
18670
- (void)imatrix;
19125
+ ggml_quantize_init(type); // this is noop if already initialized
18671
19126
  size_t result = 0;
18672
19127
  int n = nrows * n_per_row;
18673
19128
  switch (type) {
@@ -18778,15 +19233,24 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i
18778
19233
  result = quantize_iq2_xs(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
18779
19234
  GGML_ASSERT(result == row_size * nrows);
18780
19235
  } break;
19236
+ case GGML_TYPE_IQ3_XXS:
19237
+ {
19238
+ GGML_ASSERT(start % QK_K == 0);
19239
+ GGML_ASSERT(start % n_per_row == 0);
19240
+ size_t start_row = start / n_per_row;
19241
+ size_t row_size = ggml_row_size(type, n_per_row);
19242
+ result = quantize_iq3_xxs(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
19243
+ GGML_ASSERT(result == row_size * nrows);
19244
+ } break;
18781
19245
  case GGML_TYPE_F16:
18782
19246
  {
18783
- int elemsize = sizeof(ggml_fp16_t);
19247
+ size_t elemsize = sizeof(ggml_fp16_t);
18784
19248
  ggml_fp32_to_fp16_row(src + start, (ggml_fp16_t *)dst + start, n);
18785
19249
  result = n * elemsize;
18786
19250
  } break;
18787
19251
  case GGML_TYPE_F32:
18788
19252
  {
18789
- int elemsize = sizeof(float);
19253
+ size_t elemsize = sizeof(float);
18790
19254
  result = n * elemsize;
18791
19255
  memcpy((uint8_t *)dst + start * elemsize, src + start, result);
18792
19256
  } break;
@@ -18904,6 +19368,25 @@ struct gguf_context {
18904
19368
  void * data;
18905
19369
  };
18906
19370
 
19371
+ static size_t gguf_type_size(enum gguf_type type) {
19372
+ GGML_ASSERT(0 <= type && type < GGUF_TYPE_COUNT);
19373
+ return GGUF_TYPE_SIZE[type];
19374
+ }
19375
+
19376
+ static void gguf_tensor_info_sanitize(struct gguf_tensor_info * info) {
19377
+ GGML_ASSERT(info->n_dims <= GGML_MAX_DIMS);
19378
+ GGML_ASSERT(0 <= info->type && info->type < GGML_TYPE_COUNT);
19379
+
19380
+ for (uint32_t i = 0; i < info->n_dims; ++i) {
19381
+ GGML_ASSERT(info->ne[i] > 0);
19382
+ }
19383
+
19384
+ // prevent overflow for total number of elements
19385
+ GGML_ASSERT(INT64_MAX/info->ne[1] > info->ne[0]);
19386
+ GGML_ASSERT(INT64_MAX/info->ne[2] > info->ne[0]*info->ne[1]);
19387
+ GGML_ASSERT(INT64_MAX/info->ne[3] > info->ne[0]*info->ne[1]*info->ne[2]);
19388
+ }
19389
+
18907
19390
  static bool gguf_fread_el(FILE * file, void * dst, size_t size, size_t * offset) {
18908
19391
  const size_t n = fread(dst, 1, size, file);
18909
19392
  *offset += n;
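The new GGUF sanity checks share one pattern: a count read from the file is validated against SIZE_MAX divided by the element size before it feeds a malloc, so the multiplication cannot wrap around to a small allocation. A minimal sketch of the pattern:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

// elem_size must be non-zero; returns NULL instead of allocating a
// wrapped-around (too small) buffer when n * elem_size would overflow size_t
static void * checked_array_alloc(uint64_t n, size_t elem_size) {
    if (n >= SIZE_MAX / elem_size) {
        fprintf(stderr, "array size is too large (%llu)\n", (unsigned long long) n);
        return NULL;
    }
    return malloc((size_t) n * elem_size);
}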
@@ -18916,8 +19399,17 @@ static bool gguf_fread_str(FILE * file, struct gguf_str * p, size_t * offset) {
18916
19399
 
18917
19400
  bool ok = true;
18918
19401
 
18919
- ok = ok && gguf_fread_el(file, &p->n, sizeof(p->n), offset); p->data = calloc(p->n + 1, 1);
18920
- ok = ok && gguf_fread_el(file, p->data, p->n, offset);
19402
+ ok = ok && gguf_fread_el(file, &p->n, sizeof(p->n), offset);
19403
+
19404
+ // early exit if string length is invalid, prevents from integer overflow
19405
+ if (p->n == SIZE_MAX) {
19406
+ fprintf(stderr, "%s: invalid string length (%" PRIu64 ")\n", __func__, p->n);
19407
+ return false;
19408
+ }
19409
+
19410
+ p->data = GGML_CALLOC(p->n + 1, 1);
19411
+
19412
+ ok = ok && gguf_fread_el(file, p->data, p->n, offset);
18921
19413
 
18922
19414
  return ok;
18923
19415
  }
@@ -18989,6 +19481,12 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
18989
19481
  return NULL;
18990
19482
  }
18991
19483
 
19484
+ // sanity-checks to prevent from integer/buffer overflows
19485
+
19486
+ ok = ok && (ctx->header.n_tensors < (SIZE_MAX/2)/sizeof(struct gguf_tensor_info));
19487
+ ok = ok && (ctx->header.n_tensors < (SIZE_MAX/2)/ggml_tensor_overhead());
19488
+ ok = ok && (ctx->header.n_kv < (SIZE_MAX/2)/sizeof(struct gguf_kv));
19489
+
18992
19490
  if (!ok) {
18993
19491
  fprintf(stderr, "%s: failed to read header\n", __func__);
18994
19492
  fclose(file);
@@ -18999,7 +19497,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
18999
19497
 
19000
19498
  // read the kv pairs
19001
19499
  {
19002
- ctx->kv = malloc(ctx->header.n_kv * sizeof(struct gguf_kv));
19500
+ ctx->kv = GGML_MALLOC(ctx->header.n_kv * sizeof(struct gguf_kv));
19003
19501
 
19004
19502
  for (uint64_t i = 0; i < ctx->header.n_kv; ++i) {
19005
19503
  struct gguf_kv * kv = &ctx->kv[i];
@@ -19027,7 +19525,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
19027
19525
  case GGUF_TYPE_ARRAY:
19028
19526
  {
19029
19527
  ok = ok && gguf_fread_el(file, &kv->value.arr.type, sizeof(kv->value.arr.type), &offset);
19030
- ok = ok && gguf_fread_el(file, &kv->value.arr.n, sizeof(kv->value.arr.n), &offset);
19528
+ ok = ok && gguf_fread_el(file, &kv->value.arr.n, sizeof(kv->value.arr.n), &offset);
19031
19529
 
19032
19530
  switch (kv->value.arr.type) {
19033
19531
  case GGUF_TYPE_UINT8:
@@ -19042,21 +19540,39 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
19042
19540
  case GGUF_TYPE_FLOAT64:
19043
19541
  case GGUF_TYPE_BOOL:
19044
19542
  {
19045
- kv->value.arr.data = malloc(kv->value.arr.n * GGUF_TYPE_SIZE[kv->value.arr.type]);
19046
- ok = ok && gguf_fread_el(file, kv->value.arr.data, kv->value.arr.n * GGUF_TYPE_SIZE[kv->value.arr.type], &offset);
19543
+ // prevent from integer overflow in the malloc below
19544
+ if (kv->value.arr.n >= SIZE_MAX/gguf_type_size(kv->value.arr.type)) {
19545
+ fprintf(stderr, "%s: array size is too large (%" PRIu64 ")\n", __func__, kv->value.arr.n);
19546
+ fclose(file);
19547
+ gguf_free(ctx);
19548
+ return NULL;
19549
+ }
19550
+
19551
+ kv->value.arr.data = GGML_MALLOC(kv->value.arr.n * gguf_type_size(kv->value.arr.type));
19552
+
19553
+ ok = ok && gguf_fread_el(file, kv->value.arr.data, kv->value.arr.n * gguf_type_size(kv->value.arr.type), &offset);
19047
19554
  } break;
19048
19555
  case GGUF_TYPE_STRING:
19049
19556
  {
19050
- kv->value.arr.data = malloc(kv->value.arr.n * sizeof(struct gguf_str));
19557
+ // prevent from integer overflow in the malloc below
19558
+ if (kv->value.arr.n >= SIZE_MAX/sizeof(struct gguf_str)) {
19559
+ fprintf(stderr, "%s: array size is too large (%" PRIu64 ")\n", __func__, kv->value.arr.n);
19560
+ fclose(file);
19561
+ gguf_free(ctx);
19562
+ return NULL;
19563
+ }
19564
+
19565
+ kv->value.arr.data = GGML_MALLOC(kv->value.arr.n * sizeof(struct gguf_str));
19566
+
19051
19567
  for (uint64_t j = 0; j < kv->value.arr.n; ++j) {
19052
19568
  ok = ok && gguf_fread_str(file, &((struct gguf_str *) kv->value.arr.data)[j], &offset);
19053
19569
  }
19054
19570
  } break;
19055
19571
  case GGUF_TYPE_ARRAY:
19056
- case GGUF_TYPE_COUNT: GGML_ASSERT(false && "invalid type"); break;
19572
+ default: GGML_ASSERT(false && "invalid type"); break;
19057
19573
  }
19058
19574
  } break;
19059
- case GGUF_TYPE_COUNT: GGML_ASSERT(false && "invalid type");
19575
+ default: GGML_ASSERT(false && "invalid type");
19060
19576
  }
19061
19577
 
19062
19578
  if (!ok) {
@@ -19074,7 +19590,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
19074
19590
 
19075
19591
  // read the tensor infos
19076
19592
  {
19077
- ctx->infos = malloc(ctx->header.n_tensors * sizeof(struct gguf_tensor_info));
19593
+ ctx->infos = GGML_MALLOC(ctx->header.n_tensors * sizeof(struct gguf_tensor_info));
19078
19594
 
19079
19595
  for (uint64_t i = 0; i < ctx->header.n_tensors; ++i) {
19080
19596
  struct gguf_tensor_info * info = &ctx->infos[i];
@@ -19085,12 +19601,18 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
19085
19601
 
19086
19602
  ok = ok && gguf_fread_str(file, &info->name, &offset);
19087
19603
  ok = ok && gguf_fread_el (file, &info->n_dims, sizeof(info->n_dims), &offset);
19604
+
19605
+ ok = ok && (info->n_dims <= GGML_MAX_DIMS);
19606
+
19088
19607
  for (uint32_t j = 0; j < info->n_dims; ++j) {
19089
19608
  ok = ok && gguf_fread_el(file, &info->ne[j], sizeof(info->ne[j]), &offset);
19090
19609
  }
19610
+
19091
19611
  ok = ok && gguf_fread_el (file, &info->type, sizeof(info->type), &offset);
19092
19612
  ok = ok && gguf_fread_el (file, &info->offset, sizeof(info->offset), &offset);
19093
19613
 
19614
+ gguf_tensor_info_sanitize(info);
19615
+
19094
19616
  if (!ok) {
19095
19617
  fprintf(stderr, "%s: failed to read tensor info\n", __func__);
19096
19618
  fclose(file);
@@ -19244,12 +19766,12 @@ void gguf_free(struct gguf_context * ctx) {
19244
19766
  struct gguf_kv * kv = &ctx->kv[i];
19245
19767
 
19246
19768
  if (kv->key.data) {
19247
- free(kv->key.data);
19769
+ GGML_FREE(kv->key.data);
19248
19770
  }
19249
19771
 
19250
19772
  if (kv->type == GGUF_TYPE_STRING) {
19251
19773
  if (kv->value.str.data) {
19252
- free(kv->value.str.data);
19774
+ GGML_FREE(kv->value.str.data);
19253
19775
  }
19254
19776
  }
19255
19777
 
@@ -19259,16 +19781,16 @@ void gguf_free(struct gguf_context * ctx) {
19259
19781
  for (uint64_t j = 0; j < kv->value.arr.n; ++j) {
19260
19782
  struct gguf_str * str = &((struct gguf_str *) kv->value.arr.data)[j];
19261
19783
  if (str->data) {
19262
- free(str->data);
19784
+ GGML_FREE(str->data);
19263
19785
  }
19264
19786
  }
19265
19787
  }
19266
- free(kv->value.arr.data);
19788
+ GGML_FREE(kv->value.arr.data);
19267
19789
  }
19268
19790
  }
19269
19791
  }
19270
19792
 
19271
- free(ctx->kv);
19793
+ GGML_FREE(ctx->kv);
19272
19794
  }
19273
19795
 
19274
19796
  if (ctx->infos) {
@@ -19276,11 +19798,11 @@ void gguf_free(struct gguf_context * ctx) {
19276
19798
  struct gguf_tensor_info * info = &ctx->infos[i];
19277
19799
 
19278
19800
  if (info->name.data) {
19279
- free(info->name.data);
19801
+ GGML_FREE(info->name.data);
19280
19802
  }
19281
19803
  }
19282
19804
 
19283
- free(ctx->infos);
19805
+ GGML_FREE(ctx->infos);
19284
19806
  }
19285
19807
 
19286
19808
  GGML_ALIGNED_FREE(ctx);
@@ -19581,8 +20103,8 @@ void gguf_set_arr_data(struct gguf_context * ctx, const char * key, enum gguf_ty
19581
20103
  ctx->kv[idx].type = GGUF_TYPE_ARRAY;
19582
20104
  ctx->kv[idx].value.arr.type = type;
19583
20105
  ctx->kv[idx].value.arr.n = n;
19584
- ctx->kv[idx].value.arr.data = malloc(n*GGUF_TYPE_SIZE[type]);
19585
- memcpy(ctx->kv[idx].value.arr.data, data, n*GGUF_TYPE_SIZE[type]);
20106
+ ctx->kv[idx].value.arr.data = GGML_MALLOC(n*gguf_type_size(type));
20107
+ memcpy(ctx->kv[idx].value.arr.data, data, n*gguf_type_size(type));
19586
20108
  }
19587
20109
 
19588
20110
  void gguf_set_arr_str(struct gguf_context * ctx, const char * key, const char ** data, int n) {
@@ -19591,7 +20113,7 @@ void gguf_set_arr_str(struct gguf_context * ctx, const char * key, const char **
19591
20113
  ctx->kv[idx].type = GGUF_TYPE_ARRAY;
19592
20114
  ctx->kv[idx].value.arr.type = GGUF_TYPE_STRING;
19593
20115
  ctx->kv[idx].value.arr.n = n;
19594
- ctx->kv[idx].value.arr.data = malloc(n*sizeof(struct gguf_str));
20116
+ ctx->kv[idx].value.arr.data = GGML_MALLOC(n*sizeof(struct gguf_str));
19595
20117
  for (int i = 0; i < n; i++) {
19596
20118
  struct gguf_str * str = &((struct gguf_str *)ctx->kv[idx].value.arr.data)[i];
19597
20119
  str->n = strlen(data[i]);
@@ -19618,19 +20140,19 @@ void gguf_set_kv(struct gguf_context * ctx, struct gguf_context * src) {
19618
20140
  case GGUF_TYPE_ARRAY:
19619
20141
  {
19620
20142
  if (src->kv[i].value.arr.type == GGUF_TYPE_STRING) {
19621
- const char ** data = malloc(src->kv[i].value.arr.n*sizeof(char *));
20143
+ const char ** data = GGML_MALLOC(src->kv[i].value.arr.n*sizeof(char *));
19622
20144
  for (uint32_t j = 0; j < src->kv[i].value.arr.n; j++) {
19623
20145
  data[j] = ((struct gguf_str *)src->kv[i].value.arr.data)[j].data;
19624
20146
  }
19625
20147
  gguf_set_arr_str(ctx, src->kv[i].key.data, data, src->kv[i].value.arr.n);
19626
- free((void *)data);
20148
+ GGML_FREE((void *)data);
19627
20149
  } else if (src->kv[i].value.arr.type == GGUF_TYPE_ARRAY) {
19628
20150
  GGML_ASSERT(false && "nested arrays not supported");
19629
20151
  } else {
19630
20152
  gguf_set_arr_data(ctx, src->kv[i].key.data, src->kv[i].value.arr.type, src->kv[i].value.arr.data, src->kv[i].value.arr.n);
19631
20153
  }
19632
20154
  } break;
19633
- case GGUF_TYPE_COUNT: GGML_ASSERT(false && "invalid type"); break;
20155
+ default: GGML_ASSERT(false && "invalid type"); break;
19634
20156
  }
19635
20157
  }
19636
20158
  }
@@ -19706,7 +20228,7 @@ struct gguf_buf {
19706
20228
 
19707
20229
  static struct gguf_buf gguf_buf_init(size_t size) {
19708
20230
  struct gguf_buf buf = {
19709
- /*buf.data =*/ size == 0 ? NULL : malloc(size),
20231
+ /*buf.data =*/ size == 0 ? NULL : GGML_MALLOC(size),
19710
20232
  /*buf.size =*/ size,
19711
20233
  /*buf.offset =*/ 0,
19712
20234
  };
@@ -19716,7 +20238,7 @@ static struct gguf_buf gguf_buf_init(size_t size) {
19716
20238
 
19717
20239
  static void gguf_buf_free(struct gguf_buf buf) {
19718
20240
  if (buf.data) {
19719
- free(buf.data);
20241
+ GGML_FREE(buf.data);
19720
20242
  }
19721
20243
  }
19722
20244
 
@@ -19797,7 +20319,7 @@ static void gguf_write_to_buf(const struct gguf_context * ctx, struct gguf_buf *
19797
20319
  case GGUF_TYPE_FLOAT64:
19798
20320
  case GGUF_TYPE_BOOL:
19799
20321
  {
19800
- gguf_bwrite_el(buf, kv->value.arr.data, kv->value.arr.n * GGUF_TYPE_SIZE[kv->value.arr.type]);
20322
+ gguf_bwrite_el(buf, kv->value.arr.data, kv->value.arr.n * gguf_type_size(kv->value.arr.type));
19801
20323
  } break;
19802
20324
  case GGUF_TYPE_STRING:
19803
20325
  {
@@ -19806,10 +20328,10 @@ static void gguf_write_to_buf(const struct gguf_context * ctx, struct gguf_buf *
19806
20328
  }
19807
20329
  } break;
19808
20330
  case GGUF_TYPE_ARRAY:
19809
- case GGUF_TYPE_COUNT: GGML_ASSERT(false && "invalid type"); break;
20331
+ default: GGML_ASSERT(false && "invalid type"); break;
19810
20332
  }
19811
20333
  } break;
19812
- case GGUF_TYPE_COUNT: GGML_ASSERT(false && "invalid type");
20334
+ default: GGML_ASSERT(false && "invalid type");
19813
20335
  }
19814
20336
  }
19815
20337
 
@@ -20010,7 +20532,7 @@ int ggml_cpu_has_wasm_simd(void) {
20010
20532
  }
20011
20533
 
20012
20534
  int ggml_cpu_has_blas(void) {
20013
- #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
20535
+ #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_VULKAN) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_SYCL)
20014
20536
  return 1;
20015
20537
  #else
20016
20538
  return 0;
@@ -20033,8 +20555,33 @@ int ggml_cpu_has_clblast(void) {
20033
20555
  #endif
20034
20556
  }
20035
20557
 
20558
+ int ggml_cpu_has_vulkan(void) {
20559
+ #if defined(GGML_USE_VULKAN)
20560
+ return 1;
20561
+ #else
20562
+ return 0;
20563
+ #endif
20564
+ }
20565
+
20566
+ int ggml_cpu_has_kompute(void) {
20567
+ #if defined(GGML_USE_KOMPUTE)
20568
+ return 1;
20569
+ #else
20570
+ return 0;
20571
+ #endif
20572
+ }
20573
+
20574
+ int ggml_cpu_has_sycl(void) {
20575
+ #if defined(GGML_USE_SYCL)
20576
+ return 1;
20577
+ #else
20578
+ return 0;
20579
+ #endif
20580
+ }
20581
+
20036
20582
  int ggml_cpu_has_gpublas(void) {
20037
- return ggml_cpu_has_cublas() || ggml_cpu_has_clblast();
20583
+ return ggml_cpu_has_cublas() || ggml_cpu_has_clblast() || ggml_cpu_has_vulkan() || ggml_cpu_has_kompute() ||
20584
+ ggml_cpu_has_sycl();
20038
20585
  }
20039
20586
 
20040
20587
  int ggml_cpu_has_sse3(void) {
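The new ggml_cpu_has_vulkan, ggml_cpu_has_kompute and ggml_cpu_has_sycl helpers report compile-time backend support, and ggml_cpu_has_gpublas is now true when any GPU-capable backend was built in. A small sketch that prints what a given build supports:

#include <stdio.h>
#include "ggml.h"

// print which GPU backends this ggml build was compiled with (1 = enabled)
static void print_gpu_backends(void) {
    printf("cublas:  %d\n", ggml_cpu_has_cublas());
    printf("clblast: %d\n", ggml_cpu_has_clblast());
    printf("vulkan:  %d\n", ggml_cpu_has_vulkan());
    printf("kompute: %d\n", ggml_cpu_has_kompute());
    printf("sycl:    %d\n", ggml_cpu_has_sycl());
    printf("gpublas: %d\n", ggml_cpu_has_gpublas());
}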