llama_cpp 0.12.2 → 0.12.4 (diff of the bundled ggml.c)

@@ -218,6 +218,7 @@ inline static void * ggml_aligned_malloc(size_t size) {
218
218
  break;
219
219
  }
220
220
  GGML_PRINT("%s: %s (attempted to allocate %6.2f MB)\n", __func__, error_desc, size/(1024.0*1024.0));
221
+ GGML_ASSERT(false);
221
222
  return NULL;
222
223
  }
223
224
  return aligned_memory;
@@ -230,6 +231,38 @@ inline static void * ggml_aligned_malloc(size_t size) {
230
231
  #endif
231
232
  #endif
232
233
 
234
+ inline static void * ggml_malloc(size_t size) {
235
+ if (size == 0) {
236
+ GGML_PRINT("WARNING: Behavior may be unexpected when allocating 0 bytes for ggml_malloc!\n");
237
+ return NULL;
238
+ }
239
+ void * result = malloc(size);
240
+ if (result == NULL) {
241
+ GGML_PRINT("%s: failed to allocate %6.2f MB\n", __func__, size/(1024.0*1024.0));
242
+ GGML_ASSERT(false);
243
+ }
244
+ return result;
245
+ }
246
+
247
+ // calloc
248
+ inline static void * ggml_calloc(size_t num, size_t size) {
249
+ if (num == 0 || size == 0) {
250
+ GGML_PRINT("WARNING: Behavior may be unexpected when allocating 0 bytes for ggml_calloc!\n");
251
+ return NULL;
252
+ }
253
+ void * result = calloc(num, size);
254
+ if (result == NULL) {
255
+ GGML_PRINT("%s: failed to allocate %6.2f MB\n", __func__, size/(1024.0*1024.0));
256
+ GGML_ASSERT(false);
257
+ }
258
+ return result;
259
+ }
260
+
261
+ #define GGML_MALLOC(size) ggml_malloc(size)
262
+ #define GGML_CALLOC(num, size) ggml_calloc(num, size)
263
+
264
+ #define GGML_FREE(ptr) free(ptr)
265
+
233
266
  #define UNUSED GGML_UNUSED
234
267
  #define SWAP(x, y, T) do { T SWAP = x; x = y; y = SWAP; } while (0)
235
268
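The new GGML_MALLOC/GGML_CALLOC/GGML_FREE wrappers above centralize heap allocation and now abort (via GGML_ASSERT) on failure instead of letting NULL propagate. A minimal self-contained sketch of the same pattern, using an illustrative stand-in name rather than the ggml internals:

```c
#include <stdio.h>
#include <stdlib.h>

// Illustrative stand-in for ggml_malloc: warn on zero-size requests,
// abort on allocation failure instead of returning NULL to the caller.
static void * xmalloc(size_t size) {
    if (size == 0) {
        fprintf(stderr, "warning: zero-byte allocation requested\n");
        return NULL;
    }
    void * p = malloc(size);
    if (p == NULL) {
        fprintf(stderr, "failed to allocate %.2f MB\n", size / (1024.0 * 1024.0));
        abort();                      // mirrors GGML_ASSERT(false)
    }
    return p;
}

int main(void) {
    float * buf = xmalloc(1024 * sizeof(float));
    buf[0] = 1.0f;
    printf("%f\n", buf[0]);
    free(buf);                        // GGML_FREE(ptr) is plain free(ptr) in this release
    return 0;
}
```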
 
@@ -248,6 +281,10 @@ inline static void * ggml_aligned_malloc(size_t size) {
248
281
  #include "ggml-cuda.h"
249
282
  #elif defined(GGML_USE_CLBLAST)
250
283
  #include "ggml-opencl.h"
284
+ #elif defined(GGML_USE_VULKAN)
285
+ #include "ggml-vulkan.h"
286
+ #elif defined(GGML_USE_SYCL)
287
+ #include "ggml-sycl.h"
251
288
  #endif
252
289
 
253
290
  // floating point type used to accumulate sums
@@ -394,12 +431,6 @@ static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float);
394
431
  static void ggml_vec_dot_f32(const int n, float * restrict s, const float * restrict x, const float * restrict y);
395
432
  static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t * restrict x, ggml_fp16_t * restrict y);
396
433
 
397
- ggml_collect_imatrix_t g_imatrix_collect = NULL;
398
-
399
- void ggml_set_imatrix_collection(ggml_collect_imatrix_t imatrix_collect) {
400
- g_imatrix_collect = imatrix_collect;
401
- }
402
-
403
434
  static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
404
435
  [GGML_TYPE_I8] = {
405
436
  .type_name = "i8",
@@ -601,6 +632,17 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
601
632
  .vec_dot = ggml_vec_dot_iq2_xs_q8_K,
602
633
  .vec_dot_type = GGML_TYPE_Q8_K,
603
634
  },
635
+ [GGML_TYPE_IQ3_XXS] = {
636
+ .type_name = "iq3_xxs",
637
+ .blck_size = QK_K,
638
+ .type_size = sizeof(block_iq3_xxs),
639
+ .is_quantized = true,
640
+ .to_float = (ggml_to_float_t) dequantize_row_iq3_xxs,
641
+ .from_float = quantize_row_iq3_xxs,
642
+ .from_float_reference = (ggml_from_float_t)quantize_row_iq3_xxs_reference,
643
+ .vec_dot = ggml_vec_dot_iq3_xxs_q8_K,
644
+ .vec_dot_type = GGML_TYPE_Q8_K,
645
+ },
604
646
  [GGML_TYPE_Q8_K] = {
605
647
  .type_name = "q8_K",
606
648
  .blck_size = QK_K,
@@ -1424,6 +1466,9 @@ inline static void ggml_vec_tanh_f32 (const int n, float * y, const float * x) {
1424
1466
  inline static void ggml_vec_elu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : expf(x[i])-1; }
1425
1467
  inline static void ggml_vec_relu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : 0.f; }
1426
1468
  inline static void ggml_vec_leaky_relu_f32 (const int n, float * y, const float * x, const float ns) { for (int i = 0; i < n; ++i) y[i] = ((x[i] > 0.f) ? x[i] : 0.f) + ns * ((x[i] < 0.0f) ? x[i] : 0.f); }
1469
+ // TODO: optimize performance
1470
+ inline static void ggml_vec_hardswish_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i] * fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f)); }
1471
+ inline static void ggml_vec_hardsigmoid_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f)); }
1427
1472
 
1428
1473
  static const float GELU_COEF_A = 0.044715f;
1429
1474
  static const float GELU_QUICK_COEF = -1.702f;
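The two new activations follow the usual definitions, hardsigmoid(x) = clamp((x + 3)/6, 0, 1) and hardswish(x) = x · hardsigmoid(x), which is exactly what the element-wise loops above compute. A standalone scalar check (illustrative, not part of ggml):

```c
#include <math.h>
#include <stdio.h>

static float hardsigmoid(float x) { return fminf(1.0f, fmaxf(0.0f, (x + 3.0f) / 6.0f)); }
static float hardswish  (float x) { return x * hardsigmoid(x); }

int main(void) {
    const float xs[] = { -4.0f, -1.0f, 0.0f, 1.0f, 4.0f };
    for (int i = 0; i < 5; ++i) {
        printf("x = %5.1f  hardsigmoid = %.4f  hardswish = %.4f\n",
               xs[i], hardsigmoid(xs[i]), hardswish(xs[i]));
    }
    return 0;
}
```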
@@ -1782,9 +1827,11 @@ static const char * GGML_UNARY_OP_NAME[GGML_UNARY_OP_COUNT] = {
1782
1827
  "GELU",
1783
1828
  "GELU_QUICK",
1784
1829
  "SILU",
1830
+ "HARDSWISH",
1831
+ "HARDSIGMOID",
1785
1832
  };
1786
1833
 
1787
- static_assert(GGML_UNARY_OP_COUNT == 10, "GGML_UNARY_OP_COUNT != 10");
1834
+ static_assert(GGML_UNARY_OP_COUNT == 12, "GGML_UNARY_OP_COUNT != 12");
1788
1835
 
1789
1836
 
1790
1837
  static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN");
@@ -2141,6 +2188,7 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) {
2141
2188
  case GGML_FTYPE_MOSTLY_Q6_K: wtype = GGML_TYPE_Q6_K; break;
2142
2189
  case GGML_FTYPE_MOSTLY_IQ2_XXS: wtype = GGML_TYPE_IQ2_XXS; break;
2143
2190
  case GGML_FTYPE_MOSTLY_IQ2_XS: wtype = GGML_TYPE_IQ2_XS; break;
2191
+ case GGML_FTYPE_MOSTLY_IQ3_XXS: wtype = GGML_TYPE_IQ3_XXS; break;
2144
2192
  case GGML_FTYPE_UNKNOWN: wtype = GGML_TYPE_COUNT; break;
2145
2193
  case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16: wtype = GGML_TYPE_COUNT; break;
2146
2194
  }
@@ -2294,6 +2342,10 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
2294
2342
  ggml_init_cublas();
2295
2343
  #elif defined(GGML_USE_CLBLAST)
2296
2344
  ggml_cl_init();
2345
+ #elif defined(GGML_USE_VULKAN)
2346
+ ggml_vk_init();
2347
+ #elif defined(GGML_USE_SYCL)
2348
+ ggml_init_sycl();
2297
2349
  #endif
2298
2350
 
2299
2351
  ggml_setup_op_has_task_pass();
@@ -3951,6 +4003,20 @@ struct ggml_tensor * ggml_silu_back(
3951
4003
  return result;
3952
4004
  }
3953
4005
 
4006
+ // ggml hardswish
4007
+ struct ggml_tensor * ggml_hardswish(
4008
+ struct ggml_context * ctx,
4009
+ struct ggml_tensor * a) {
4010
+ return ggml_unary(ctx, a, GGML_UNARY_OP_HARDSWISH);
4011
+ }
4012
+
4013
+ // ggml hardsigmoid
4014
+ struct ggml_tensor * ggml_hardsigmoid(
4015
+ struct ggml_context * ctx,
4016
+ struct ggml_tensor * a) {
4017
+ return ggml_unary(ctx, a, GGML_UNARY_OP_HARDSIGMOID);
4018
+ }
4019
+
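A rough sketch of how the new unary ops are used from the public API: build a tensor, wrap it with ggml_hardswish, and run the graph. The buffer size and helper calls here are assumptions based on the ggml API of this era (declarations live in the ggml.h that ships with the gem), not code from this diff:

```c
#include <stdio.h>
#include "ggml.h"   // assumes the bundled ggml headers are on the include path

int main(void) {
    struct ggml_init_params params = {
        /*.mem_size   =*/ 16 * 1024 * 1024,   // illustrative scratch size
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx = ggml_init(params);

    struct ggml_tensor * x = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4);
    ggml_set_f32(x, -1.0f);                           // fill with a test value

    struct ggml_tensor * y = ggml_hardswish(ctx, x);  // new unary op in this release

    struct ggml_cgraph * gf = ggml_new_graph(ctx);
    ggml_build_forward_expand(gf, y);
    ggml_graph_compute_with_ctx(ctx, gf, /*n_threads=*/1);

    printf("hardswish(-1) = %f\n", ggml_get_f32_1d(y, 0));   // expect about -0.3333
    ggml_free(ctx);
    return 0;
}
```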
3954
4020
  // ggml_norm
3955
4021
 
3956
4022
  static struct ggml_tensor * ggml_norm_impl(
@@ -5283,7 +5349,7 @@ GGML_API struct ggml_tensor * ggml_conv_1d(
5283
5349
  int s0,
5284
5350
  int p0,
5285
5351
  int d0) {
5286
- struct ggml_tensor * im2col = ggml_im2col(ctx, a, b, s0, 0, p0, 0, d0, 0, false); // [N, OL, IC * K]
5352
+ struct ggml_tensor * im2col = ggml_im2col(ctx, a, b, s0, 0, p0, 0, d0, 0, false, GGML_TYPE_F16); // [N, OL, IC * K]
5287
5353
 
5288
5354
  struct ggml_tensor * result =
5289
5355
  ggml_mul_mat(ctx,
@@ -5350,6 +5416,30 @@ GGML_API struct ggml_tensor * ggml_conv_transpose_1d(
5350
5416
  return result;
5351
5417
  }
5352
5418
 
5419
+ // ggml_conv_depthwise
5420
+ struct ggml_tensor * ggml_conv_depthwise_2d(
5421
+ struct ggml_context * ctx,
5422
+ struct ggml_tensor * a,
5423
+ struct ggml_tensor * b,
5424
+ int s0,
5425
+ int s1,
5426
+ int p0,
5427
+ int p1,
5428
+ int d0,
5429
+ int d1) {
5430
+
5431
+ struct ggml_tensor * new_a = ggml_reshape_4d(ctx, a, a->ne[0], a->ne[1], 1, a->ne[2] * a->ne[3]);
5432
+ struct ggml_tensor * im2col = ggml_im2col(ctx, new_a,
5433
+ ggml_reshape_4d(ctx, b, b->ne[0], b->ne[1], 1, b->ne[2] * b->ne[3]),
5434
+ s0, s1, p0, p1, d0, d1, true, GGML_TYPE_F16); // [N * IC, OH, OW, KH * KW]
5435
+ struct ggml_tensor * new_b = ggml_reshape_4d(ctx, im2col, im2col->ne[0], im2col->ne[2] * im2col->ne[1], b->ne[2], b->ne[3]); // [N * IC, OH, OW, KH * KW] => [N, IC, OH * OW, KH * KW]
5436
+
5437
+ new_a = ggml_reshape_4d(ctx, new_a, (new_a->ne[0] * new_a->ne[1]), new_a->ne[2], new_a->ne[3], 1); // [OC,1, KH, KW] => [1, OC, 1, KH * KW]
5438
+ struct ggml_tensor * result = ggml_mul_mat(ctx, new_a, new_b);
5439
+ result = ggml_reshape_4d(ctx, result, im2col->ne[1], im2col->ne[2], b->ne[2], b->ne[3]); // [N, OC, OH, OW]
5440
+
5441
+ return result;
5442
+ }
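For reference, the output spatial size used implicitly by the reshapes above follows the standard convolution formula, out = (in + 2*p - d*(k - 1) - 1)/s + 1, and a depthwise convolution applies one kernel per input channel instead of mixing channels. A quick check of that formula with made-up dimensions (illustrative helper, not ggml code):

```c
#include <stdio.h>

// standard conv output size: floor((in + 2*pad - dilation*(kernel-1) - 1) / stride) + 1
static int conv_out_size(int in, int kernel, int stride, int pad, int dilation) {
    return (in + 2 * pad - dilation * (kernel - 1) - 1) / stride + 1;
}

int main(void) {
    // e.g. a 112x112 feature map, 3x3 depthwise kernel, stride 2, pad 1, dilation 1
    int oh = conv_out_size(112, 3, 2, 1, 1);
    int ow = conv_out_size(112, 3, 2, 1, 1);
    printf("output: %dx%d\n", oh, ow);   // 56x56
    return 0;
}
```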
5353
5443
  // ggml_conv_2d
5354
5444
 
5355
5445
  // im2col: [N, IC, IH, IW] => [N, OH, OW, IC*KH*KW]
@@ -5366,7 +5456,8 @@ struct ggml_tensor * ggml_im2col(
5366
5456
  int p1,
5367
5457
  int d0,
5368
5458
  int d1,
5369
- bool is_2D) {
5459
+ bool is_2D,
5460
+ enum ggml_type dst_type) {
5370
5461
 
5371
5462
  if(is_2D) {
5372
5463
  GGML_ASSERT(a->ne[2] == b->ne[2]);
@@ -5390,7 +5481,7 @@ struct ggml_tensor * ggml_im2col(
5390
5481
  is_2D ? b->ne[3] : 1,
5391
5482
  };
5392
5483
 
5393
- struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F16, 4, ne);
5484
+ struct ggml_tensor * result = ggml_new_tensor(ctx, dst_type, 4, ne);
5394
5485
  int32_t params[] = { s0, s1, p0, p1, d0, d1, (is_2D ? 1 : 0) };
5395
5486
  ggml_set_op_params(result, params, sizeof(params));
5396
5487
 
@@ -5415,7 +5506,7 @@ struct ggml_tensor * ggml_conv_2d(
5415
5506
  int p1,
5416
5507
  int d0,
5417
5508
  int d1) {
5418
- struct ggml_tensor * im2col = ggml_im2col(ctx, a, b, s0, s1, p0, p1, d0, d1, true); // [N, OH, OW, IC * KH * KW]
5509
+ struct ggml_tensor * im2col = ggml_im2col(ctx, a, b, s0, s1, p0, p1, d0, d1, true, GGML_TYPE_F16); // [N, OH, OW, IC * KH * KW]
5419
5510
 
5420
5511
  struct ggml_tensor * result =
5421
5512
  ggml_mul_mat(ctx,
@@ -5541,12 +5632,13 @@ struct ggml_tensor * ggml_pool_2d(
5541
5632
  is_node = true;
5542
5633
  }
5543
5634
 
5635
+ struct ggml_tensor * result;
5544
5636
  const int64_t ne[3] = {
5545
5637
  ggml_calc_pool_output_size(a->ne[0], k0, s0, p0),
5546
5638
  ggml_calc_pool_output_size(a->ne[1], k1, s1, p1),
5547
5639
  a->ne[2],
5548
5640
  };
5549
- struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 3, ne);
5641
+ result = ggml_new_tensor(ctx, GGML_TYPE_F32, 3, ne);
5550
5642
 
5551
5643
  int32_t params[] = { op, k0, k1, s0, s1, p0, p1 };
5552
5644
  ggml_set_op_params(result, params, sizeof(params));
@@ -5554,7 +5646,6 @@ struct ggml_tensor * ggml_pool_2d(
5554
5646
  result->op = GGML_OP_POOL_2D;
5555
5647
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5556
5648
  result->src[0] = a;
5557
-
5558
5649
  return result;
5559
5650
  }
5560
5651
 
@@ -7169,6 +7260,17 @@ static void ggml_compute_forward_add_f32(
7169
7260
  const int ith = params->ith;
7170
7261
  const int nth = params->nth;
7171
7262
 
7263
+ #ifdef GGML_USE_CLBLAST
7264
+ if (src1->backend == GGML_BACKEND_GPU) {
7265
+ // TODO: OpenCL kernel support full broadcast
7266
+ GGML_ASSERT(ggml_can_repeat_rows(src1, src0));
7267
+ if (ith == 0) {
7268
+ ggml_cl_add(src0, src1, dst);
7269
+ }
7270
+ return;
7271
+ }
7272
+ #endif
7273
+
7172
7274
  const int nr = ggml_nrows(src0);
7173
7275
 
7174
7276
  GGML_TENSOR_BINARY_OP_LOCALS
@@ -7449,7 +7551,12 @@ static void ggml_compute_forward_add(
7449
7551
  switch (src0->type) {
7450
7552
  case GGML_TYPE_F32:
7451
7553
  {
7452
- ggml_compute_forward_add_f32(params, src0, src1, dst);
7554
+ if (src1->type == GGML_TYPE_F32) {
7555
+ ggml_compute_forward_add_f32(params, src0, src1, dst);
7556
+ }
7557
+ else {
7558
+ GGML_ASSERT(false);
7559
+ }
7453
7560
  } break;
7454
7561
  case GGML_TYPE_F16:
7455
7562
  {
@@ -7475,6 +7582,7 @@ static void ggml_compute_forward_add(
7475
7582
  case GGML_TYPE_Q6_K:
7476
7583
  case GGML_TYPE_IQ2_XXS:
7477
7584
  case GGML_TYPE_IQ2_XS:
7585
+ case GGML_TYPE_IQ3_XXS:
7478
7586
  {
7479
7587
  ggml_compute_forward_add_q_f32(params, src0, src1, dst);
7480
7588
  } break;
@@ -7741,6 +7849,7 @@ static void ggml_compute_forward_add1(
7741
7849
  case GGML_TYPE_Q6_K:
7742
7850
  case GGML_TYPE_IQ2_XXS:
7743
7851
  case GGML_TYPE_IQ2_XS:
7852
+ case GGML_TYPE_IQ3_XXS:
7744
7853
  {
7745
7854
  ggml_compute_forward_add1_q_f32(params, src0, src1, dst);
7746
7855
  } break;
@@ -7770,6 +7879,9 @@ static void ggml_compute_forward_acc_f32(
7770
7879
  bool inplace = (bool) ((int32_t *) dst->op_params)[4];
7771
7880
 
7772
7881
  if (!inplace && (params->type == GGML_TASK_INIT)) {
7882
+ if (params->ith != 0) {
7883
+ return;
7884
+ }
7773
7885
  // memcpy needs to be synchronized across threads to avoid race conditions.
7774
7886
  // => do it in INIT phase
7775
7887
  memcpy(
@@ -7857,6 +7969,7 @@ static void ggml_compute_forward_acc(
7857
7969
  case GGML_TYPE_Q6_K:
7858
7970
  case GGML_TYPE_IQ2_XXS:
7859
7971
  case GGML_TYPE_IQ2_XS:
7972
+ case GGML_TYPE_IQ3_XXS:
7860
7973
  default:
7861
7974
  {
7862
7975
  GGML_ASSERT(false);
@@ -7958,7 +8071,7 @@ static void ggml_compute_forward_mul_f32(
7958
8071
  const int ith = params->ith;
7959
8072
  const int nth = params->nth;
7960
8073
 
7961
- #ifdef GGML_USE_CLBLAST
8074
+ #if defined(GGML_USE_CLBLAST)
7962
8075
  if (src1->backend == GGML_BACKEND_GPU) {
7963
8076
  // TODO: OpenCL kernel support full broadcast
7964
8077
  GGML_ASSERT(ggml_can_repeat_rows(src1, src0));
@@ -9339,6 +9452,87 @@ static void ggml_compute_forward_silu_back(
9339
9452
  }
9340
9453
  }
9341
9454
 
9455
+
9456
+ static void ggml_compute_forward_hardswish_f32(
9457
+ const struct ggml_compute_params * params,
9458
+ const struct ggml_tensor * src0,
9459
+ struct ggml_tensor * dst) {
9460
+ assert(params->ith == 0);
9461
+ assert(ggml_are_same_shape(src0, dst));
9462
+
9463
+ if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
9464
+ return;
9465
+ }
9466
+
9467
+ const int n = ggml_nrows(src0);
9468
+ const int nc = src0->ne[0];
9469
+
9470
+ assert(dst->nb[0] == sizeof(float));
9471
+ assert(src0->nb[0] == sizeof(float));
9472
+
9473
+ for (int i = 0; i < n; i++) {
9474
+ ggml_vec_hardswish_f32(nc,
9475
+ (float *) ((char *) dst->data + i*( dst->nb[1])),
9476
+ (float *) ((char *) src0->data + i*(src0->nb[1])));
9477
+ }
9478
+ }
9479
+ static void ggml_compute_forward_hardswish(
9480
+ const struct ggml_compute_params * params,
9481
+ const struct ggml_tensor * src0,
9482
+ struct ggml_tensor * dst) {
9483
+ switch (src0->type) {
9484
+ case GGML_TYPE_F32:
9485
+ {
9486
+ ggml_compute_forward_hardswish_f32(params, src0, dst);
9487
+ } break;
9488
+ default:
9489
+ {
9490
+ GGML_ASSERT(false);
9491
+ } break;
9492
+ }
9493
+ }
9494
+
9495
+ static void ggml_compute_forward_hardsigmoid_f32(
9496
+ const struct ggml_compute_params * params,
9497
+ const struct ggml_tensor * src0,
9498
+ struct ggml_tensor * dst) {
9499
+ assert(params->ith == 0);
9500
+ assert(ggml_are_same_shape(src0, dst));
9501
+
9502
+ if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
9503
+ return;
9504
+ }
9505
+
9506
+ const int n = ggml_nrows(src0);
9507
+ const int nc = src0->ne[0];
9508
+
9509
+ assert(dst->nb[0] == sizeof(float));
9510
+ assert(src0->nb[0] == sizeof(float));
9511
+
9512
+ for (int i = 0; i < n; i++) {
9513
+ ggml_vec_hardsigmoid_f32(nc,
9514
+ (float *) ((char *) dst->data + i*( dst->nb[1])),
9515
+ (float *) ((char *) src0->data + i*(src0->nb[1])));
9516
+ }
9517
+ }
9518
+
9519
+ static void ggml_compute_forward_hardsigmoid(
9520
+ const struct ggml_compute_params * params,
9521
+ const struct ggml_tensor * src0,
9522
+ struct ggml_tensor * dst) {
9523
+ switch (src0->type) {
9524
+ case GGML_TYPE_F32:
9525
+ {
9526
+ ggml_compute_forward_hardsigmoid_f32(params, src0, dst);
9527
+ } break;
9528
+ default:
9529
+ {
9530
+ GGML_ASSERT(false);
9531
+ } break;
9532
+ }
9533
+ }
9534
+
9535
+
9342
9536
  // ggml_compute_forward_norm
9343
9537
 
9344
9538
  static void ggml_compute_forward_norm_f32(
@@ -9790,10 +9984,6 @@ static void ggml_compute_forward_mul_mat(
9790
9984
  const int ith = params->ith;
9791
9985
  const int nth = params->nth;
9792
9986
 
9793
- if (ith == 1 && g_imatrix_collect) {
9794
- g_imatrix_collect(src0, src1);
9795
- }
9796
-
9797
9987
  const enum ggml_type type = src0->type;
9798
9988
 
9799
9989
  const bool src1_cont = ggml_is_contiguous(src1);
@@ -9835,11 +10025,30 @@ static void ggml_compute_forward_mul_mat(
9835
10025
 
9836
10026
  #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
9837
10027
  if (ggml_compute_forward_mul_mat_use_blas(dst)) {
9838
- if (params->ith != 0) {
9839
- return;
9840
- }
10028
+ const int64_t ne_plane = ne01*ne00;
10029
+ const size_t desired_wsize = ne13*ne12*ne_plane*sizeof(float);
10030
+ UNUSED(desired_wsize);
9841
10031
 
9842
10032
  if (params->type == GGML_TASK_INIT) {
10033
+ if (type != GGML_TYPE_F32) {
10034
+ assert(params->wsize >= desired_wsize);
10035
+ // parallelize by src0 rows
10036
+ for (int64_t i13 = 0; i13 < ne13; i13++) {
10037
+ for (int64_t i12 = 0; i12 < ne12; i12++) {
10038
+ // broadcast src0 into src1 across 2nd,3rd dimension
10039
+ const int64_t i03 = i13/r3;
10040
+ const int64_t i02 = i12/r2;
10041
+
10042
+ const void * x = (char *) src0->data + i02*nb02 + i03*nb03;
10043
+ float * const wdata = (float *) params->wdata + i13*ne12*ne_plane + i12*ne_plane;
10044
+ ggml_to_float_t const to_float = type_traits[type].to_float;
10045
+
10046
+ for (int64_t i01 = ith; i01 < ne01; i01 += nth) {
10047
+ to_float((const char *) x + i01*nb01, wdata + i01*ne00, ne00);
10048
+ }
10049
+ }
10050
+ }
10051
+ }
9843
10052
  return;
9844
10053
  }
9845
10054
 
@@ -9847,9 +10056,14 @@ static void ggml_compute_forward_mul_mat(
9847
10056
  return;
9848
10057
  }
9849
10058
 
10059
+ // perform sgemm, parallelization controlled by blas lib
10060
+ if (ith != 0) {
10061
+ return;
10062
+ }
10063
+
10064
+ //const int64_t tgemm0 = ggml_perf_time_us();
9850
10065
  for (int64_t i13 = 0; i13 < ne13; i13++) {
9851
10066
  for (int64_t i12 = 0; i12 < ne12; i12++) {
9852
- // broadcast src0 into src1 across 2nd,3rd dimension
9853
10067
  const int64_t i03 = i13/r3;
9854
10068
  const int64_t i02 = i12/r2;
9855
10069
 
@@ -9858,17 +10072,7 @@ static void ggml_compute_forward_mul_mat(
9858
10072
  float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
9859
10073
 
9860
10074
  if (type != GGML_TYPE_F32) {
9861
- float * const wdata = params->wdata;
9862
- ggml_to_float_t const to_float = type_traits[type].to_float;
9863
-
9864
- size_t id = 0;
9865
- for (int64_t i01 = 0; i01 < ne01; ++i01) {
9866
- to_float((const char *) x + i01*nb01, wdata + id, ne00);
9867
- id += ne00;
9868
- }
9869
-
9870
- assert(id*sizeof(float) <= params->wsize);
9871
- x = wdata;
10075
+ x = (float *) params->wdata + i13*ne12*ne_plane + i12*ne_plane;
9872
10076
  }
9873
10077
 
9874
10078
  cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
@@ -9878,6 +10082,7 @@ static void ggml_compute_forward_mul_mat(
9878
10082
  0.0f, d, ne01);
9879
10083
  }
9880
10084
  }
10085
+ //printf("cblas_sgemm = %.3f ms, %lld flops\n", (ggml_perf_time_us() - tgemm0)/1000.0, ne13*ne12*ne1*ne01*ne10*2);
9881
10086
 
9882
10087
  //printf("CBLAS = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3);
9883
10088
 
@@ -9886,6 +10091,9 @@ static void ggml_compute_forward_mul_mat(
9886
10091
  #endif
9887
10092
 
9888
10093
  if (params->type == GGML_TASK_INIT) {
10094
+ if (ith != 0) {
10095
+ return;
10096
+ }
9889
10097
  if (src1->type != vec_dot_type) {
9890
10098
  char * wdata = params->wdata;
9891
10099
  const size_t row_size = ggml_row_size(vec_dot_type, ne10);
@@ -10050,6 +10258,9 @@ static void ggml_compute_forward_mul_mat_id(
10050
10258
  #define MMID_MATRIX_ROW(row_id, i1) matrix_rows[(row_id)*ne11 + (i1)]
10051
10259
 
10052
10260
  if (params->type == GGML_TASK_INIT) {
10261
+ if (ith != 0) {
10262
+ return;
10263
+ }
10053
10264
  char * wdata = params->wdata;
10054
10265
  if (src1->type != vec_dot_type) {
10055
10266
  const size_t row_size = ggml_row_size(vec_dot_type, ne10);
@@ -10097,10 +10308,6 @@ static void ggml_compute_forward_mul_mat_id(
10097
10308
 
10098
10309
  const struct ggml_tensor * src0_cur = dst->src[cur_a + 2];
10099
10310
 
10100
- if (ith == 1 && g_imatrix_collect) {
10101
- g_imatrix_collect(src0_cur, src1);
10102
- }
10103
-
10104
10311
  const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
10105
10312
  const size_t row_size = ggml_row_size(vec_dot_type, ne10);
10106
10313
 
@@ -10239,6 +10446,9 @@ static void ggml_compute_forward_out_prod_f32(
10239
10446
  return;
10240
10447
  }
10241
10448
  #endif
10449
+ if (ith != 0) {
10450
+ return;
10451
+ }
10242
10452
  ggml_vec_set_f32(ne0*ne1*ne2*ne3, dst->data, 0);
10243
10453
  return;
10244
10454
  }
@@ -10422,6 +10632,9 @@ static void ggml_compute_forward_out_prod_q_f32(
10422
10632
  // TODO: #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
10423
10633
 
10424
10634
  if (params->type == GGML_TASK_INIT) {
10635
+ if (ith != 0) {
10636
+ return;
10637
+ }
10425
10638
  ggml_vec_set_f32(ne0*ne1*ne2*ne3, dst->data, 0);
10426
10639
  return;
10427
10640
  }
@@ -10508,6 +10721,7 @@ static void ggml_compute_forward_out_prod(
10508
10721
  case GGML_TYPE_Q6_K:
10509
10722
  case GGML_TYPE_IQ2_XXS:
10510
10723
  case GGML_TYPE_IQ2_XS:
10724
+ case GGML_TYPE_IQ3_XXS:
10511
10725
  {
10512
10726
  ggml_compute_forward_out_prod_q_f32(params, src0, src1, dst);
10513
10727
  } break;
@@ -10606,6 +10820,9 @@ static void ggml_compute_forward_set_f32(
10606
10820
  bool inplace = (bool) ((int32_t *) dst->op_params)[4];
10607
10821
 
10608
10822
  if (!inplace && (params->type == GGML_TASK_INIT)) {
10823
+ if (params->ith != 0) {
10824
+ return;
10825
+ }
10609
10826
  // memcpy needs to be synchronized across threads to avoid race conditions.
10610
10827
  // => do it in INIT phase
10611
10828
  memcpy(
@@ -10684,6 +10901,7 @@ static void ggml_compute_forward_set(
10684
10901
  case GGML_TYPE_Q6_K:
10685
10902
  case GGML_TYPE_IQ2_XXS:
10686
10903
  case GGML_TYPE_IQ2_XS:
10904
+ case GGML_TYPE_IQ3_XXS:
10687
10905
  default:
10688
10906
  {
10689
10907
  GGML_ASSERT(false);
@@ -10880,6 +11098,7 @@ static void ggml_compute_forward_get_rows(
10880
11098
  case GGML_TYPE_Q6_K:
10881
11099
  case GGML_TYPE_IQ2_XXS:
10882
11100
  case GGML_TYPE_IQ2_XS:
11101
+ case GGML_TYPE_IQ3_XXS:
10883
11102
  {
10884
11103
  ggml_compute_forward_get_rows_q(params, src0, src1, dst);
10885
11104
  } break;
@@ -10930,6 +11149,9 @@ static void ggml_compute_forward_get_rows_back_f32_f16(
10930
11149
  // ggml_compute_forward_dup_same_cont(params, opt0, dst);
10931
11150
 
10932
11151
  if (params->type == GGML_TASK_INIT) {
11152
+ if (params->ith != 0) {
11153
+ return;
11154
+ }
10933
11155
  memset(dst->data, 0, ggml_nbytes(dst));
10934
11156
  }
10935
11157
 
@@ -10964,6 +11186,9 @@ static void ggml_compute_forward_get_rows_back_f32(
10964
11186
  // ggml_compute_forward_dup_same_cont(params, opt0, dst);
10965
11187
 
10966
11188
  if (params->type == GGML_TASK_INIT) {
11189
+ if (params->ith != 0) {
11190
+ return;
11191
+ }
10967
11192
  memset(dst->data, 0, ggml_nbytes(dst));
10968
11193
  }
10969
11194
 
@@ -11101,6 +11326,9 @@ static void ggml_compute_forward_diag_mask_f32(
11101
11326
  GGML_ASSERT(n_past >= 0);
11102
11327
 
11103
11328
  if (!inplace && (params->type == GGML_TASK_INIT)) {
11329
+ if (ith != 0) {
11330
+ return;
11331
+ }
11104
11332
  // memcpy needs to be synchronized across threads to avoid race conditions.
11105
11333
  // => do it in INIT phase
11106
11334
  GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0));
@@ -11518,6 +11746,7 @@ static void ggml_compute_forward_alibi(
11518
11746
  case GGML_TYPE_Q6_K:
11519
11747
  case GGML_TYPE_IQ2_XXS:
11520
11748
  case GGML_TYPE_IQ2_XS:
11749
+ case GGML_TYPE_IQ3_XXS:
11521
11750
  case GGML_TYPE_Q8_K:
11522
11751
  case GGML_TYPE_I8:
11523
11752
  case GGML_TYPE_I16:
@@ -11594,6 +11823,7 @@ static void ggml_compute_forward_clamp(
11594
11823
  case GGML_TYPE_Q6_K:
11595
11824
  case GGML_TYPE_IQ2_XXS:
11596
11825
  case GGML_TYPE_IQ2_XS:
11826
+ case GGML_TYPE_IQ3_XXS:
11597
11827
  case GGML_TYPE_Q8_K:
11598
11828
  case GGML_TYPE_I8:
11599
11829
  case GGML_TYPE_I16:
@@ -12071,6 +12301,9 @@ static void ggml_compute_forward_conv_transpose_1d_f16_f32(
12071
12301
  GGML_ASSERT(nb10 == sizeof(float));
12072
12302
 
12073
12303
  if (params->type == GGML_TASK_INIT) {
12304
+ if (ith != 0) {
12305
+ return;
12306
+ }
12074
12307
  memset(params->wdata, 0, params->wsize);
12075
12308
 
12076
12309
  // permute kernel data (src0) from (K x Cout x Cin) to (Cin x K x Cout)
@@ -12165,6 +12398,9 @@ static void ggml_compute_forward_conv_transpose_1d_f32(
12165
12398
  GGML_ASSERT(nb10 == sizeof(float));
12166
12399
 
12167
12400
  if (params->type == GGML_TASK_INIT) {
12401
+ if (ith != 0) {
12402
+ return;
12403
+ }
12168
12404
  memset(params->wdata, 0, params->wsize);
12169
12405
 
12170
12406
  // prepare kernel data (src0) from (K x Cout x Cin) to (Cin x K x Cout)
@@ -12257,6 +12493,92 @@ static void ggml_compute_forward_conv_transpose_1d(
12257
12493
  }
12258
12494
  }
12259
12495
 
12496
+ // src0: kernel [OC, IC, KH, KW]
12497
+ // src1: image [N, IC, IH, IW]
12498
+ // dst: result [N, OH, OW, IC*KH*KW]
12499
+ static void ggml_compute_forward_im2col_f32(
12500
+ const struct ggml_compute_params * params,
12501
+ const struct ggml_tensor * src0,
12502
+ const struct ggml_tensor * src1,
12503
+ struct ggml_tensor * dst) {
12504
+ GGML_ASSERT(src0->type == GGML_TYPE_F16);
12505
+ GGML_ASSERT(src1->type == GGML_TYPE_F32);
12506
+ GGML_ASSERT( dst->type == GGML_TYPE_F32);
12507
+
12508
+ int64_t t0 = ggml_perf_time_us();
12509
+ UNUSED(t0);
12510
+
12511
+ GGML_TENSOR_BINARY_OP_LOCALS;
12512
+
12513
+ const int32_t s0 = ((const int32_t *)(dst->op_params))[0];
12514
+ const int32_t s1 = ((const int32_t *)(dst->op_params))[1];
12515
+ const int32_t p0 = ((const int32_t *)(dst->op_params))[2];
12516
+ const int32_t p1 = ((const int32_t *)(dst->op_params))[3];
12517
+ const int32_t d0 = ((const int32_t *)(dst->op_params))[4];
12518
+ const int32_t d1 = ((const int32_t *)(dst->op_params))[5];
12519
+ const bool is_2D = ((const int32_t *)(dst->op_params))[6] == 1;
12520
+
12521
+ const int ith = params->ith;
12522
+ const int nth = params->nth;
12523
+
12524
+ const int64_t N = is_2D ? ne13 : ne12;
12525
+ const int64_t IC = is_2D ? ne12 : ne11;
12526
+ const int64_t IH = is_2D ? ne11 : 1;
12527
+ const int64_t IW = ne10;
12528
+
12529
+ const int64_t KH = is_2D ? ne01 : 1;
12530
+ const int64_t KW = ne00;
12531
+
12532
+ const int64_t OH = is_2D ? ne2 : 1;
12533
+ const int64_t OW = ne1;
12534
+
12535
+ int ofs0 = is_2D ? nb13 : nb12;
12536
+ int ofs1 = is_2D ? nb12 : nb11;
12537
+
12538
+ GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
12539
+ GGML_ASSERT(nb10 == sizeof(float));
12540
+
12541
+ if (params->type == GGML_TASK_INIT) {
12542
+ return;
12543
+ }
12544
+
12545
+ if (params->type == GGML_TASK_FINALIZE) {
12546
+ return;
12547
+ }
12548
+
12549
+ // im2col: [N, IC, IH, IW] => [N, OH, OW, IC*KH*KW]
12550
+ {
12551
+ float * const wdata = (float *) dst->data;
12552
+
12553
+ for (int64_t in = 0; in < N; in++) {
12554
+ for (int64_t ioh = 0; ioh < OH; ioh++) { // 1
12555
+ for (int64_t iow = 0; iow < OW; iow++) {
12556
+ for (int64_t iic = ith; iic < IC; iic += nth) {
12557
+
12558
+ // micro kernel
12559
+ float * dst_data = wdata + (in*OH*OW + ioh*OW + iow)*(IC*KH*KW); // [IC, KH, KW]
12560
+ const float * const src_data = (float *)((char *) src1->data + in*ofs0 + iic*ofs1); // [IH, IW]
12561
+
12562
+ for (int64_t ikh = 0; ikh < KH; ikh++) { // 1
12563
+ for (int64_t ikw = 0; ikw < KW; ikw++) {
12564
+ const int64_t iiw = iow*s0 + ikw*d0 - p0;
12565
+ const int64_t iih = ioh*s1 + ikh*d1 - p1;
12566
+
12567
+ if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) {
12568
+ dst_data[iic*(KH*KW) + ikh*KW + ikw] = 0;
12569
+ } else {
12570
+ dst_data[iic*(KH*KW) + ikh*KW + ikw] = (src_data[iih*IW + iiw]);
12571
+ }
12572
+ }
12573
+ }
12574
+ }
12575
+ }
12576
+ }
12577
+ }
12578
+ }
12579
+ }
12580
+
12581
+
12260
12582
  // src0: kernel [OC, IC, KH, KW]
12261
12583
  // src1: image [N, IC, IH, IW]
12262
12584
  // dst: result [N, OH, OW, IC*KH*KW]
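To make the indexing in the new F32 im2col path concrete, here is a tiny standalone im2col over a single-channel image: each output row holds one KH×KW patch, and positions where the window leaves the image are zero-filled, just as in the kernel above (illustrative sketch, heavily simplified):

```c
#include <stdio.h>

int main(void) {
    enum { IH = 3, IW = 3, KH = 2, KW = 2, S = 1, P = 0, D = 1,
           OH = (IH + 2*P - D*(KH - 1) - 1)/S + 1,    // 2
           OW = (IW + 2*P - D*(KW - 1) - 1)/S + 1 };  // 2

    const float img[IH*IW] = { 1, 2, 3,  4, 5, 6,  7, 8, 9 };
    float cols[OH*OW][KH*KW];                          // [OH*OW, KH*KW]

    for (int ioh = 0; ioh < OH; ioh++) {
        for (int iow = 0; iow < OW; iow++) {
            for (int ikh = 0; ikh < KH; ikh++) {
                for (int ikw = 0; ikw < KW; ikw++) {
                    const int iih = ioh*S + ikh*D - P;
                    const int iiw = iow*S + ikw*D - P;
                    const int ok  = iih >= 0 && iih < IH && iiw >= 0 && iiw < IW;
                    cols[ioh*OW + iow][ikh*KW + ikw] = ok ? img[iih*IW + iiw] : 0.0f;
                }
            }
        }
    }

    for (int r = 0; r < OH*OW; r++) {                  // prints the 4x4 patch matrix
        for (int c = 0; c < KH*KW; c++) printf("%4.0f", cols[r][c]);
        printf("\n");
    }
    return 0;
}
```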
@@ -12347,14 +12669,14 @@ static void ggml_compute_forward_im2col(
12347
12669
  const struct ggml_tensor * src0,
12348
12670
  const struct ggml_tensor * src1,
12349
12671
  struct ggml_tensor * dst) {
12350
- switch (src0->type) {
12672
+ switch (dst->type) {
12351
12673
  case GGML_TYPE_F16:
12352
12674
  {
12353
12675
  ggml_compute_forward_im2col_f16(params, src0, src1, dst);
12354
12676
  } break;
12355
12677
  case GGML_TYPE_F32:
12356
12678
  {
12357
- GGML_ASSERT(false);
12679
+ ggml_compute_forward_im2col_f32(params, src0, src1, dst);
12358
12680
  } break;
12359
12681
  default:
12360
12682
  {
@@ -12363,6 +12685,7 @@ static void ggml_compute_forward_im2col(
12363
12685
  }
12364
12686
  }
12365
12687
 
12688
+
12366
12689
  // ggml_compute_forward_conv_transpose_2d
12367
12690
 
12368
12691
  static void ggml_compute_forward_conv_transpose_2d(
@@ -12388,6 +12711,9 @@ static void ggml_compute_forward_conv_transpose_2d(
12388
12711
  GGML_ASSERT(nb10 == sizeof(float));
12389
12712
 
12390
12713
  if (params->type == GGML_TASK_INIT) {
12714
+ if (ith != 0) {
12715
+ return;
12716
+ }
12391
12717
  memset(params->wdata, 0, params->wsize);
12392
12718
 
12393
12719
  // permute kernel data (src0) from (Kw x Kh x Cout x Cin) to (Cin x Kw x Kh x Cout)
@@ -12541,8 +12867,8 @@ static void ggml_compute_forward_pool_2d(
12541
12867
  const struct ggml_compute_params * params,
12542
12868
  const struct ggml_tensor * src,
12543
12869
  struct ggml_tensor * dst) {
12544
- assert(src->type == GGML_TYPE_F32);
12545
- assert(params->ith == 0);
12870
+ GGML_ASSERT(src->type == GGML_TYPE_F32);
12871
+ GGML_ASSERT(params->ith == 0);
12546
12872
 
12547
12873
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
12548
12874
  return;
@@ -13931,6 +14257,14 @@ static void ggml_compute_forward_unary(
13931
14257
  {
13932
14258
  ggml_compute_forward_silu(params, src0, dst);
13933
14259
  } break;
14260
+ case GGML_UNARY_OP_HARDSWISH:
14261
+ {
14262
+ ggml_compute_forward_hardswish(params, src0, dst);
14263
+ } break;
14264
+ case GGML_UNARY_OP_HARDSIGMOID:
14265
+ {
14266
+ ggml_compute_forward_hardsigmoid(params, src0, dst);
14267
+ } break;
13934
14268
  default:
13935
14269
  {
13936
14270
  GGML_ASSERT(false);
@@ -13994,6 +14328,9 @@ static void ggml_compute_forward_add_rel_pos_f32(
13994
14328
 
13995
14329
  const bool inplace = (bool) ((int32_t *) dst->op_params)[0];
13996
14330
  if (!inplace && params->type == GGML_TASK_INIT) {
14331
+ if (params->ith != 0) {
14332
+ return;
14333
+ }
13997
14334
  memcpy((char *) dst->data, (char *) src0->data, ggml_nbytes(dst));
13998
14335
  return;
13999
14336
  }
@@ -14509,8 +14846,26 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
14509
14846
  }
14510
14847
  GGML_ASSERT(tensor->src[0] == NULL || tensor->src[0]->backend == GGML_BACKEND_CPU);
14511
14848
  GGML_ASSERT(tensor->src[1] == NULL || tensor->src[1]->backend == GGML_BACKEND_CPU);
14849
+ #elif defined(GGML_USE_VULKAN)
14850
+ const bool skip_cpu = ggml_vk_compute_forward(params, tensor);
14851
+ #ifdef GGML_VULKAN_CHECK_RESULTS
14852
+ if (skip_cpu) {
14853
+ ggml_vk_check_results_1(params, tensor);
14854
+ }
14855
+ #endif
14856
+ if (skip_cpu) {
14857
+ return;
14858
+ }
14859
+ GGML_ASSERT(tensor->src[0] == NULL || tensor->src[0]->backend == GGML_BACKEND_CPU);
14860
+ GGML_ASSERT(tensor->src[1] == NULL || tensor->src[1]->backend == GGML_BACKEND_CPU);
14512
14861
  #endif // GGML_USE_CUBLAS
14513
14862
 
14863
+ #ifdef GGML_USE_SYCL
14864
+ bool skip_cpu = ggml_sycl_compute_forward(params, tensor);
14865
+ if (skip_cpu) {
14866
+ return;
14867
+ }
14868
+ #endif // GGML_USE_SYCL
14514
14869
  switch (tensor->op) {
14515
14870
  case GGML_OP_DUP:
14516
14871
  {
@@ -14913,13 +15268,13 @@ struct ggml_hash_set ggml_hash_set_new(size_t size) {
14913
15268
  size = ggml_hash_size(size);
14914
15269
  struct ggml_hash_set result;
14915
15270
  result.size = size;
14916
- result.keys = malloc(sizeof(struct ggml_tensor *) * size);
15271
+ result.keys = GGML_MALLOC(sizeof(struct ggml_tensor *) * size);
14917
15272
  memset(result.keys, 0, sizeof(struct ggml_tensor *) * size);
14918
15273
  return result;
14919
15274
  }
14920
15275
 
14921
15276
  static void ggml_hash_set_free(struct ggml_hash_set hash_set) {
14922
- free(hash_set.keys);
15277
+ GGML_FREE(hash_set.keys);
14923
15278
  }
14924
15279
 
14925
15280
  struct hash_map {
@@ -14928,17 +15283,17 @@ struct hash_map {
14928
15283
  };
14929
15284
 
14930
15285
  static struct hash_map * ggml_new_hash_map(size_t size) {
14931
- struct hash_map * result = malloc(sizeof(struct hash_map));
15286
+ struct hash_map * result = GGML_MALLOC(sizeof(struct hash_map));
14932
15287
  result->set = ggml_hash_set_new(size);
14933
- result->vals = malloc(sizeof(struct ggml_tensor *) * result->set.size);
15288
+ result->vals = GGML_MALLOC(sizeof(struct ggml_tensor *) * result->set.size);
14934
15289
  memset(result->vals, 0, sizeof(struct ggml_tensor *) * result->set.size);
14935
15290
  return result;
14936
15291
  }
14937
15292
 
14938
15293
  static void ggml_hash_map_free(struct hash_map * map) {
14939
15294
  ggml_hash_set_free(map->set);
14940
- free(map->vals);
14941
- free(map);
15295
+ GGML_FREE(map->vals);
15296
+ GGML_FREE(map);
14942
15297
  }
14943
15298
 
14944
15299
  // gradient checkpointing
@@ -16287,8 +16642,9 @@ struct ggml_compute_state_shared {
16287
16642
  const int n_threads;
16288
16643
 
16289
16644
  // synchronization primitives
16290
- atomic_int n_active; // num active threads
16291
- atomic_int node_n; // active graph node
16645
+ atomic_int n_active; // num active threads
16646
+ atomic_int node_n; // active graph node
16647
+ atomic_int node_task; // active graph node task phase
16292
16648
 
16293
16649
  bool (*abort_callback)(void * data); // abort ggml_graph_compute when true
16294
16650
  void * abort_callback_data;
@@ -16344,6 +16700,8 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
16344
16700
  case GGML_UNARY_OP_TANH:
16345
16701
  case GGML_UNARY_OP_ELU:
16346
16702
  case GGML_UNARY_OP_RELU:
16703
+ case GGML_UNARY_OP_HARDSWISH: // to opt for multiple threads
16704
+ case GGML_UNARY_OP_HARDSIGMOID: // to opt for multiple threads
16347
16705
  {
16348
16706
  n_tasks = 1;
16349
16707
  } break;
@@ -16420,7 +16778,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
16420
16778
  } break;
16421
16779
  case GGML_OP_SOFT_MAX:
16422
16780
  {
16423
- n_tasks = MIN(MIN(4, n_threads), ggml_nrows(node->src[0]));
16781
+ n_tasks = MIN(n_threads, ggml_nrows(node->src[0]));
16424
16782
  } break;
16425
16783
  case GGML_OP_CONV_TRANSPOSE_1D:
16426
16784
  {
@@ -16534,6 +16892,34 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
16534
16892
  return n_tasks;
16535
16893
  }
16536
16894
 
16895
+ static void ggml_graph_compute_thread_sync_node(int * node_n, struct ggml_compute_state * state, const bool do_yield) {
16896
+ // wait for other threads to finish
16897
+ const int last_node_n = * node_n;
16898
+
16899
+ while (true) {
16900
+ if (do_yield) {
16901
+ sched_yield();
16902
+ }
16903
+
16904
+ * node_n = atomic_load(&state->shared->node_n);
16905
+ if (* node_n != last_node_n) break;
16906
+ }
16907
+ }
16908
+
16909
+ static void ggml_graph_compute_thread_sync_task(int * task_phase, struct ggml_compute_state * state, const bool do_yield) {
16910
+ // wait for other threads to finish
16911
+ const int last_task_phase = * task_phase;
16912
+
16913
+ while (true) {
16914
+ if (do_yield) {
16915
+ sched_yield();
16916
+ }
16917
+
16918
+ * task_phase = atomic_load(&state->shared->node_task);
16919
+ if (* task_phase != last_task_phase) break;
16920
+ }
16921
+ }
16922
+
16537
16923
  static thread_ret_t ggml_graph_compute_thread(void * data) {
16538
16924
  struct ggml_compute_state * state = (struct ggml_compute_state *) data;
16539
16925
 
@@ -16544,7 +16930,8 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
16544
16930
 
16545
16931
  set_numa_thread_affinity(state->ith, n_threads);
16546
16932
 
16547
- int node_n = -1;
16933
+ int node_n = -1;
16934
+ int task_phase = GGML_TASK_FINALIZE;
16548
16935
 
16549
16936
  while (true) {
16550
16937
  if (cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) {
@@ -16576,7 +16963,6 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
16576
16963
  // distribute new work or execute it direct if 1T
16577
16964
  while (++node_n < cgraph->n_nodes) {
16578
16965
  GGML_PRINT_DEBUG_5("%s: %d/%d\n", __func__, node_n, cgraph->n_nodes);
16579
-
16580
16966
  struct ggml_tensor * node = cgraph->nodes[node_n];
16581
16967
  const int n_tasks = ggml_get_n_tasks(node, n_threads);
16582
16968
 
@@ -16585,13 +16971,13 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
16585
16971
 
16586
16972
  params.nth = n_tasks;
16587
16973
 
16588
- /* INIT */
16589
- if (GGML_OP_HAS_INIT[node->op]) {
16590
- params.type = GGML_TASK_INIT;
16591
- ggml_compute_forward(&params, node);
16592
- }
16593
-
16594
16974
  if (n_tasks == 1) {
16975
+ /* INIT */
16976
+ if (GGML_OP_HAS_INIT[node->op]) {
16977
+ params.type = GGML_TASK_INIT;
16978
+ ggml_compute_forward(&params, node);
16979
+ }
16980
+
16595
16981
  // TODO: maybe push node_n to the atomic but if other threads see n_tasks is 1,
16596
16982
  // they do something more efficient than spinning (?)
16597
16983
  params.type = GGML_TASK_COMPUTE;
@@ -16612,38 +16998,24 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
16612
16998
  }
16613
16999
  }
16614
17000
 
16615
- atomic_store(&state->shared->n_active, n_threads);
16616
- atomic_store(&state->shared->node_n, node_n);
17001
+ task_phase = GGML_TASK_INIT;
17002
+ atomic_store(&state->shared->n_active, n_threads);
17003
+ atomic_store(&state->shared->node_n, node_n);
17004
+ atomic_store(&state->shared->node_task, task_phase);
16617
17005
  } else {
16618
- // wait for other threads to finish
16619
- const int last = node_n;
16620
-
16621
- const bool do_yield = last < 0 || cgraph->nodes[last]->op == GGML_OP_MUL_MAT;
16622
-
16623
- while (true) {
16624
- // TODO: this sched_yield can have significant impact on the performance - either positive or negative
16625
- // depending on the workload and the operating system.
16626
- // since it is not clear what is the best approach, it should potentially become user-configurable
16627
- // ref: https://github.com/ggerganov/ggml/issues/291
16628
- // UPD: adding the do_yield flag seems to resolve the issue universally
16629
- if (do_yield) {
16630
- sched_yield();
16631
- }
16632
-
16633
- node_n = atomic_load(&state->shared->node_n);
16634
- if (node_n != last) break;
16635
- };
17006
+ ggml_graph_compute_thread_sync_node(&node_n, state, false);
17007
+ ggml_graph_compute_thread_sync_task(&task_phase, state, false);
16636
17008
  }
16637
17009
 
16638
17010
  // check if we should stop
16639
17011
  if (node_n >= cgraph->n_nodes) break;
16640
17012
 
16641
- /* COMPUTE */
17013
+ /* INIT & COMPUTE */
16642
17014
  struct ggml_tensor * node = cgraph->nodes[node_n];
16643
17015
  const int n_tasks = ggml_get_n_tasks(node, n_threads);
16644
17016
 
16645
17017
  struct ggml_compute_params params = {
16646
- /*.type =*/ GGML_TASK_COMPUTE,
17018
+ /*.type =*/ GGML_TASK_INIT,
16647
17019
  /*.ith =*/ state->ith,
16648
17020
  /*.nth =*/ n_tasks,
16649
17021
  /*.wsize =*/ cplan->work_size,
@@ -16651,8 +17023,39 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
16651
17023
  };
16652
17024
 
16653
17025
  if (state->ith < n_tasks) {
17026
+ if (GGML_OP_HAS_INIT[node->op]) {
17027
+ ggml_compute_forward(&params, node);
17028
+ }
17029
+ }
17030
+
17031
+ if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) {
17032
+ task_phase = GGML_TASK_COMPUTE;
17033
+ atomic_store(&state->shared->n_active, n_threads);
17034
+ atomic_store(&state->shared->node_task, task_phase);
17035
+ }
17036
+ else {
17037
+ // TODO: this sched_yield can have significant impact on the performance - either positive or negative
17038
+ // depending on the workload and the operating system.
17039
+ // since it is not clear what is the best approach, it should potentially become user-configurable
17040
+ // ref: https://github.com/ggerganov/ggml/issues/291
17041
+ // UPD: adding the do_yield flag seems to resolve the issue universally
17042
+ const bool do_yield = node_n < 0 || cgraph->nodes[node_n]->op == GGML_OP_MUL_MAT;
17043
+ ggml_graph_compute_thread_sync_task(&task_phase, state, do_yield);
17044
+ }
17045
+
17046
+ if (state->ith < n_tasks) {
17047
+ params.type = GGML_TASK_COMPUTE;
16654
17048
  ggml_compute_forward(&params, node);
16655
17049
  }
17050
+
17051
+ if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) {
17052
+ task_phase = GGML_TASK_FINALIZE;
17053
+ atomic_store(&state->shared->n_active, n_threads);
17054
+ atomic_store(&state->shared->node_task, task_phase);
17055
+ }
17056
+ else {
17057
+ ggml_graph_compute_thread_sync_task(&task_phase, state, false);
17058
+ }
16656
17059
  }
16657
17060
 
16658
17061
  return GGML_EXIT_SUCCESS;
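The scheduler change above splits each node into explicit INIT → COMPUTE → FINALIZE phases: the last thread to finish a phase resets n_active and advances the shared node_task value, while the other threads spin (optionally yielding) until the phase changes. A minimal self-contained sketch of that hand-rolled phase barrier, with made-up phase names and printouts rather than the real ggml scheduler:

```c
#include <pthread.h>
#include <sched.h>
#include <stdatomic.h>
#include <stdio.h>

#define N_THREADS 4

// shared state, mirroring n_active / node_task in ggml_compute_state_shared
static atomic_int n_active = N_THREADS;  // threads still inside the current phase
static atomic_int phase    = 0;          // 0 = "init", 1 = "compute", 2 = "done"

// last thread out advances the phase; the others spin until it changes
static void advance_or_wait(int * local_phase) {
    if (atomic_fetch_sub(&n_active, 1) == 1) {
        atomic_store(&n_active, N_THREADS);      // reset for the next phase
        atomic_store(&phase, *local_phase + 1);  // publish the new phase
    } else {
        while (atomic_load(&phase) == *local_phase) {
            sched_yield();                       // cf. the do_yield flag in ggml
        }
    }
    (*local_phase)++;
}

static void * worker(void * arg) {
    const int ith = *(const int *) arg;
    int local_phase = 0;

    printf("thread %d: init-phase work\n", ith);
    advance_or_wait(&local_phase);

    printf("thread %d: compute-phase work\n", ith);
    advance_or_wait(&local_phase);

    return NULL;
}

int main(void) {
    pthread_t tid[N_THREADS];
    int       ids[N_THREADS];
    for (int i = 0; i < N_THREADS; ++i) {
        ids[i] = i;
        pthread_create(&tid[i], NULL, worker, &ids[i]);
    }
    for (int i = 0; i < N_THREADS; ++i) {
        pthread_join(tid[i], NULL);
    }
    return 0;
}
```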
@@ -16668,12 +17071,16 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
16668
17071
  struct ggml_cplan cplan;
16669
17072
  memset(&cplan, 0, sizeof(struct ggml_cplan));
16670
17073
 
17074
+ int max_tasks = 1;
17075
+
16671
17076
  // thread scheduling for the different operations + work buffer size estimation
16672
17077
  for (int i = 0; i < cgraph->n_nodes; i++) {
16673
17078
  struct ggml_tensor * node = cgraph->nodes[i];
16674
17079
 
16675
17080
  const int n_tasks = ggml_get_n_tasks(node, n_threads);
16676
17081
 
17082
+ max_tasks = MAX(max_tasks, n_tasks);
17083
+
16677
17084
  size_t cur = 0;
16678
17085
 
16679
17086
  switch (node->op) {
@@ -16709,8 +17116,11 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
16709
17116
  #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
16710
17117
  if (ggml_compute_forward_mul_mat_use_blas(node)) {
16711
17118
  if (node->src[0]->type != GGML_TYPE_F32) {
16712
- // here we need memory just for single 2D matrix from src0
16713
- cur = ggml_type_size(GGML_TYPE_F32)*(node->src[0]->ne[0]*node->src[0]->ne[1]);
17119
+ // here we need memory for fully dequantized matrix from src0
17120
+ // take into account that src0 can be broadcasted into src1[2,3]
17121
+ cur = ggml_type_size(GGML_TYPE_F32)
17122
+ * node->src[0]->ne[0]*node->src[0]->ne[1]
17123
+ * node->src[1]->ne[2]*node->src[1]->ne[3];
16714
17124
  }
16715
17125
  } else
16716
17126
  #endif
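The work-buffer estimate above grows from one dequantized 2D plane to all broadcast planes: F32 bytes × ne[0] × ne[1] of src0 × ne[2] × ne[3] of src1, matching the INIT-phase dequantization earlier in this diff. A rough worked example with hypothetical shapes (not taken from the source):

```c
#include <stdint.h>
#include <stdio.h>

int main(void) {
    // hypothetical quantized src0 of 4096x4096, broadcast over src1 with ne[2] = 8, ne[3] = 1
    const int64_t ne00 = 4096, ne01 = 4096;   // src0 plane
    const int64_t ne12 = 8,    ne13 = 1;      // src1 batch dims
    const int64_t cur  = (int64_t) sizeof(float) * ne00 * ne01 * ne12 * ne13;
    printf("work buffer: %lld bytes (%.1f MiB)\n", (long long) cur, cur / (1024.0 * 1024.0));
    return 0;   // 536870912 bytes, i.e. 512.0 MiB for this made-up shape
}
```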
@@ -16837,7 +17247,7 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
16837
17247
  work_size += CACHE_LINE_SIZE*(n_threads - 1);
16838
17248
  }
16839
17249
 
16840
- cplan.n_threads = n_threads;
17250
+ cplan.n_threads = MIN(max_tasks, n_threads);
16841
17251
  cplan.work_size = work_size;
16842
17252
  cplan.work_data = NULL;
16843
17253
 
@@ -16854,6 +17264,17 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
16854
17264
  }
16855
17265
  }
16856
17266
 
17267
+ #ifdef GGML_USE_VULKAN
17268
+ for (int i = 0; i < cgraph->n_nodes; i++) {
17269
+ ggml_vk_preallocate_buffers_graph(cgraph->nodes[i]);
17270
+ }
17271
+ ggml_vk_preallocate_buffers();
17272
+
17273
+ for (int i = 0; i < cgraph->n_nodes; i++) {
17274
+ ggml_vk_build_graph(cgraph->nodes[i], i == cgraph->n_nodes - 1);
17275
+ }
17276
+ #endif
17277
+
16857
17278
  const int n_threads = cplan->n_threads;
16858
17279
 
16859
17280
  struct ggml_compute_state_shared state_shared = {
@@ -16864,6 +17285,7 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
16864
17285
  /*.n_threads =*/ n_threads,
16865
17286
  /*.n_active =*/ n_threads,
16866
17287
  /*.node_n =*/ -1,
17288
+ /*.node_task =*/ GGML_TASK_FINALIZE,
16867
17289
  /*.abort_callback =*/ NULL,
16868
17290
  /*.abort_callback_data =*/ NULL,
16869
17291
  };
@@ -16904,6 +17326,10 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
16904
17326
  }
16905
17327
  }
16906
17328
 
17329
+ #ifdef GGML_USE_VULKAN
17330
+ ggml_vk_graph_cleanup();
17331
+ #endif
17332
+
16907
17333
  // performance stats (graph)
16908
17334
  {
16909
17335
  int64_t perf_cycles_cur = ggml_perf_cycles() - perf_start_cycles;
@@ -18538,6 +18964,29 @@ enum ggml_opt_result ggml_opt_resume_g(
18538
18964
 
18539
18965
  ////////////////////////////////////////////////////////////////////////////////
18540
18966
 
18967
+ void ggml_quantize_init(enum ggml_type type) {
18968
+ ggml_critical_section_start();
18969
+
18970
+ switch (type) {
18971
+ case GGML_TYPE_IQ2_XXS: iq2xs_init_impl(256); break;
18972
+ case GGML_TYPE_IQ2_XS: iq2xs_init_impl(512); break;
18973
+ case GGML_TYPE_IQ3_XXS: iq3xs_init_impl(256); break;
18974
+ default: // nothing
18975
+ break;
18976
+ }
18977
+
18978
+ ggml_critical_section_end();
18979
+ }
18980
+
18981
+ void ggml_quantize_free(void) {
18982
+ ggml_critical_section_start();
18983
+
18984
+ iq2xs_free_impl(256);
18985
+ iq2xs_free_impl(512);
18986
+
18987
+ ggml_critical_section_end();
18988
+ }
18989
+
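With the lazy initialization above, ggml_quantize_chunk now sets up the IQ2/IQ3 lookup tables on first use, and callers can release them with ggml_quantize_free when finished. A hedged usage sketch; the shape, type, and histogram size are arbitrary example values, and the chunk signature is the one shown later in this diff:

```c
#include <stdio.h>
#include <stdlib.h>
#include "ggml.h"   // assumes the bundled ggml headers are on the include path

int main(void) {
    const int nrows = 4, n_per_row = 256;           // arbitrary example shape
    float * src = calloc((size_t) nrows * n_per_row, sizeof(float));
    void  * dst = malloc(ggml_row_size(GGML_TYPE_Q4_0, n_per_row) * nrows);
    int64_t hist[16] = {0};

    // ggml_quantize_init() is invoked internally on first use;
    // Q4_0 needs no importance matrix, so imatrix may be NULL
    size_t written = ggml_quantize_chunk(GGML_TYPE_Q4_0, src, dst,
                                         /*start=*/0, nrows, n_per_row, hist, /*imatrix=*/NULL);
    printf("quantized %zu bytes\n", written);

    ggml_quantize_free();   // releases the IQ2/IQ3 tables if they were created
    free(src);
    free(dst);
    return 0;
}
```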
18541
18990
  size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist) {
18542
18991
  assert(k % QK4_0 == 0);
18543
18992
  const int nb = k / QK4_0;
@@ -18665,9 +19114,15 @@ size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t *
18665
19114
  return (n/QK8_0*sizeof(block_q8_0));
18666
19115
  }
18667
19116
 
19117
+ bool ggml_quantize_requires_imatrix(enum ggml_type type) {
19118
+ return
19119
+ type == GGML_TYPE_IQ2_XXS ||
19120
+ type == GGML_TYPE_IQ2_XS;
19121
+ }
19122
+
18668
19123
  size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start,
18669
19124
  int nrows, int n_per_row, int64_t * hist, const float * imatrix) {
18670
- (void)imatrix;
19125
+ ggml_quantize_init(type); // this is noop if already initialized
18671
19126
  size_t result = 0;
18672
19127
  int n = nrows * n_per_row;
18673
19128
  switch (type) {
@@ -18778,15 +19233,24 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i
18778
19233
  result = quantize_iq2_xs(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
18779
19234
  GGML_ASSERT(result == row_size * nrows);
18780
19235
  } break;
19236
+ case GGML_TYPE_IQ3_XXS:
19237
+ {
19238
+ GGML_ASSERT(start % QK_K == 0);
19239
+ GGML_ASSERT(start % n_per_row == 0);
19240
+ size_t start_row = start / n_per_row;
19241
+ size_t row_size = ggml_row_size(type, n_per_row);
19242
+ result = quantize_iq3_xxs(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
19243
+ GGML_ASSERT(result == row_size * nrows);
19244
+ } break;
18781
19245
  case GGML_TYPE_F16:
18782
19246
  {
18783
- int elemsize = sizeof(ggml_fp16_t);
19247
+ size_t elemsize = sizeof(ggml_fp16_t);
18784
19248
  ggml_fp32_to_fp16_row(src + start, (ggml_fp16_t *)dst + start, n);
18785
19249
  result = n * elemsize;
18786
19250
  } break;
18787
19251
  case GGML_TYPE_F32:
18788
19252
  {
18789
- int elemsize = sizeof(float);
19253
+ size_t elemsize = sizeof(float);
18790
19254
  result = n * elemsize;
18791
19255
  memcpy((uint8_t *)dst + start * elemsize, src + start, result);
18792
19256
  } break;
@@ -18904,6 +19368,25 @@ struct gguf_context {
18904
19368
  void * data;
18905
19369
  };
18906
19370
 
19371
+ static size_t gguf_type_size(enum gguf_type type) {
19372
+ GGML_ASSERT(0 <= type && type < GGUF_TYPE_COUNT);
19373
+ return GGUF_TYPE_SIZE[type];
19374
+ }
19375
+
19376
+ static void gguf_tensor_info_sanitize(struct gguf_tensor_info * info) {
19377
+ GGML_ASSERT(info->n_dims <= GGML_MAX_DIMS);
19378
+ GGML_ASSERT(0 <= info->type && info->type < GGML_TYPE_COUNT);
19379
+
19380
+ for (uint32_t i = 0; i < info->n_dims; ++i) {
19381
+ GGML_ASSERT(info->ne[i] > 0);
19382
+ }
19383
+
19384
+ // prevent overflow for total number of elements
19385
+ GGML_ASSERT(INT64_MAX/info->ne[1] > info->ne[0]);
19386
+ GGML_ASSERT(INT64_MAX/info->ne[2] > info->ne[0]*info->ne[1]);
19387
+ GGML_ASSERT(INT64_MAX/info->ne[3] > info->ne[0]*info->ne[1]*info->ne[2]);
19388
+ }
19389
+
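The sanity checks above guard the element-count products against 64-bit overflow by testing with division before multiplying: for positive operands, a*b overflows exactly when a > INT64_MAX / b. A small illustrative helper in the same spirit (not ggml code):

```c
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

// returns false instead of overflowing when computing a*b for positive a, b
static bool checked_mul_i64(int64_t a, int64_t b, int64_t * out) {
    if (a > 0 && b > 0 && a > INT64_MAX / b) {
        return false;   // a*b would exceed INT64_MAX
    }
    *out = a * b;
    return true;
}

int main(void) {
    int64_t n;
    printf("4096*4096 ok:   %d\n", checked_mul_i64(4096, 4096, &n));      // 1
    printf("INT64_MAX*2 ok: %d\n", checked_mul_i64(INT64_MAX, 2, &n));    // 0
    return 0;
}
```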
18907
19390
  static bool gguf_fread_el(FILE * file, void * dst, size_t size, size_t * offset) {
18908
19391
  const size_t n = fread(dst, 1, size, file);
18909
19392
  *offset += n;
@@ -18916,8 +19399,17 @@ static bool gguf_fread_str(FILE * file, struct gguf_str * p, size_t * offset) {
18916
19399
 
18917
19400
  bool ok = true;
18918
19401
 
18919
- ok = ok && gguf_fread_el(file, &p->n, sizeof(p->n), offset); p->data = calloc(p->n + 1, 1);
18920
- ok = ok && gguf_fread_el(file, p->data, p->n, offset);
19402
+ ok = ok && gguf_fread_el(file, &p->n, sizeof(p->n), offset);
19403
+
19404
+ // early exit if string length is invalid, prevents from integer overflow
19405
+ if (p->n == SIZE_MAX) {
19406
+ fprintf(stderr, "%s: invalid string length (%" PRIu64 ")\n", __func__, p->n);
19407
+ return false;
19408
+ }
19409
+
19410
+ p->data = GGML_CALLOC(p->n + 1, 1);
19411
+
19412
+ ok = ok && gguf_fread_el(file, p->data, p->n, offset);
18921
19413
 
18922
19414
  return ok;
18923
19415
  }
@@ -18989,6 +19481,12 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
18989
19481
  return NULL;
18990
19482
  }
18991
19483
 
19484
+ // sanity-checks to prevent from integer/buffer overflows
19485
+
19486
+ ok = ok && (ctx->header.n_tensors < (SIZE_MAX/2)/sizeof(struct gguf_tensor_info));
19487
+ ok = ok && (ctx->header.n_tensors < (SIZE_MAX/2)/ggml_tensor_overhead());
19488
+ ok = ok && (ctx->header.n_kv < (SIZE_MAX/2)/sizeof(struct gguf_kv));
19489
+
18992
19490
  if (!ok) {
18993
19491
  fprintf(stderr, "%s: failed to read header\n", __func__);
18994
19492
  fclose(file);
@@ -18999,7 +19497,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
18999
19497
 
19000
19498
  // read the kv pairs
19001
19499
  {
19002
- ctx->kv = malloc(ctx->header.n_kv * sizeof(struct gguf_kv));
19500
+ ctx->kv = GGML_MALLOC(ctx->header.n_kv * sizeof(struct gguf_kv));
19003
19501
 
19004
19502
  for (uint64_t i = 0; i < ctx->header.n_kv; ++i) {
19005
19503
  struct gguf_kv * kv = &ctx->kv[i];
@@ -19027,7 +19525,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
19027
19525
  case GGUF_TYPE_ARRAY:
19028
19526
  {
19029
19527
  ok = ok && gguf_fread_el(file, &kv->value.arr.type, sizeof(kv->value.arr.type), &offset);
19030
- ok = ok && gguf_fread_el(file, &kv->value.arr.n, sizeof(kv->value.arr.n), &offset);
19528
+ ok = ok && gguf_fread_el(file, &kv->value.arr.n, sizeof(kv->value.arr.n), &offset);
19031
19529
 
19032
19530
  switch (kv->value.arr.type) {
19033
19531
  case GGUF_TYPE_UINT8:
@@ -19042,21 +19540,39 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
19042
19540
  case GGUF_TYPE_FLOAT64:
19043
19541
  case GGUF_TYPE_BOOL:
19044
19542
  {
19045
- kv->value.arr.data = malloc(kv->value.arr.n * GGUF_TYPE_SIZE[kv->value.arr.type]);
19046
- ok = ok && gguf_fread_el(file, kv->value.arr.data, kv->value.arr.n * GGUF_TYPE_SIZE[kv->value.arr.type], &offset);
19543
+ // prevent from integer overflow in the malloc below
19544
+ if (kv->value.arr.n >= SIZE_MAX/gguf_type_size(kv->value.arr.type)) {
19545
+ fprintf(stderr, "%s: array size is too large (%" PRIu64 ")\n", __func__, kv->value.arr.n);
19546
+ fclose(file);
19547
+ gguf_free(ctx);
19548
+ return NULL;
19549
+ }
19550
+
19551
+ kv->value.arr.data = GGML_MALLOC(kv->value.arr.n * gguf_type_size(kv->value.arr.type));
19552
+
19553
+ ok = ok && gguf_fread_el(file, kv->value.arr.data, kv->value.arr.n * gguf_type_size(kv->value.arr.type), &offset);
19047
19554
  } break;
19048
19555
  case GGUF_TYPE_STRING:
19049
19556
  {
19050
- kv->value.arr.data = malloc(kv->value.arr.n * sizeof(struct gguf_str));
19557
+ // prevent from integer overflow in the malloc below
19558
+ if (kv->value.arr.n >= SIZE_MAX/sizeof(struct gguf_str)) {
19559
+ fprintf(stderr, "%s: array size is too large (%" PRIu64 ")\n", __func__, kv->value.arr.n);
19560
+ fclose(file);
19561
+ gguf_free(ctx);
19562
+ return NULL;
19563
+ }
19564
+
19565
+ kv->value.arr.data = GGML_MALLOC(kv->value.arr.n * sizeof(struct gguf_str));
19566
+
19051
19567
  for (uint64_t j = 0; j < kv->value.arr.n; ++j) {
19052
19568
  ok = ok && gguf_fread_str(file, &((struct gguf_str *) kv->value.arr.data)[j], &offset);
19053
19569
  }
19054
19570
  } break;
19055
19571
  case GGUF_TYPE_ARRAY:
19056
- case GGUF_TYPE_COUNT: GGML_ASSERT(false && "invalid type"); break;
19572
+ default: GGML_ASSERT(false && "invalid type"); break;
19057
19573
  }
19058
19574
  } break;
19059
- case GGUF_TYPE_COUNT: GGML_ASSERT(false && "invalid type");
19575
+ default: GGML_ASSERT(false && "invalid type");
19060
19576
  }
19061
19577
 
19062
19578
  if (!ok) {
@@ -19074,7 +19590,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
19074
19590
 
19075
19591
  // read the tensor infos
19076
19592
  {
19077
- ctx->infos = malloc(ctx->header.n_tensors * sizeof(struct gguf_tensor_info));
19593
+ ctx->infos = GGML_MALLOC(ctx->header.n_tensors * sizeof(struct gguf_tensor_info));
19078
19594
 
19079
19595
  for (uint64_t i = 0; i < ctx->header.n_tensors; ++i) {
19080
19596
  struct gguf_tensor_info * info = &ctx->infos[i];
@@ -19085,12 +19601,18 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
19085
19601
 
19086
19602
  ok = ok && gguf_fread_str(file, &info->name, &offset);
19087
19603
  ok = ok && gguf_fread_el (file, &info->n_dims, sizeof(info->n_dims), &offset);
19604
+
19605
+ ok = ok && (info->n_dims <= GGML_MAX_DIMS);
19606
+
19088
19607
  for (uint32_t j = 0; j < info->n_dims; ++j) {
19089
19608
  ok = ok && gguf_fread_el(file, &info->ne[j], sizeof(info->ne[j]), &offset);
19090
19609
  }
19610
+
19091
19611
  ok = ok && gguf_fread_el (file, &info->type, sizeof(info->type), &offset);
19092
19612
  ok = ok && gguf_fread_el (file, &info->offset, sizeof(info->offset), &offset);
19093
19613
 
19614
+ gguf_tensor_info_sanitize(info);
19615
+
19094
19616
  if (!ok) {
19095
19617
  fprintf(stderr, "%s: failed to read tensor info\n", __func__);
19096
19618
  fclose(file);
@@ -19244,12 +19766,12 @@ void gguf_free(struct gguf_context * ctx) {
19244
19766
  struct gguf_kv * kv = &ctx->kv[i];
19245
19767
 
19246
19768
  if (kv->key.data) {
19247
- free(kv->key.data);
19769
+ GGML_FREE(kv->key.data);
19248
19770
  }
19249
19771
 
19250
19772
  if (kv->type == GGUF_TYPE_STRING) {
19251
19773
  if (kv->value.str.data) {
19252
- free(kv->value.str.data);
19774
+ GGML_FREE(kv->value.str.data);
19253
19775
  }
19254
19776
  }
19255
19777
 
@@ -19259,16 +19781,16 @@ void gguf_free(struct gguf_context * ctx) {
19259
19781
  for (uint64_t j = 0; j < kv->value.arr.n; ++j) {
19260
19782
  struct gguf_str * str = &((struct gguf_str *) kv->value.arr.data)[j];
19261
19783
  if (str->data) {
19262
- free(str->data);
19784
+ GGML_FREE(str->data);
19263
19785
  }
19264
19786
  }
19265
19787
  }
19266
- free(kv->value.arr.data);
19788
+ GGML_FREE(kv->value.arr.data);
19267
19789
  }
19268
19790
  }
19269
19791
  }
19270
19792
 
19271
- free(ctx->kv);
19793
+ GGML_FREE(ctx->kv);
19272
19794
  }
19273
19795
 
19274
19796
  if (ctx->infos) {
@@ -19276,11 +19798,11 @@ void gguf_free(struct gguf_context * ctx) {
19276
19798
  struct gguf_tensor_info * info = &ctx->infos[i];
19277
19799
 
19278
19800
  if (info->name.data) {
19279
- free(info->name.data);
19801
+ GGML_FREE(info->name.data);
19280
19802
  }
19281
19803
  }
19282
19804
 
19283
- free(ctx->infos);
19805
+ GGML_FREE(ctx->infos);
19284
19806
  }
19285
19807
 
19286
19808
  GGML_ALIGNED_FREE(ctx);
@@ -19581,8 +20103,8 @@ void gguf_set_arr_data(struct gguf_context * ctx, const char * key, enum gguf_ty
19581
20103
  ctx->kv[idx].type = GGUF_TYPE_ARRAY;
19582
20104
  ctx->kv[idx].value.arr.type = type;
19583
20105
  ctx->kv[idx].value.arr.n = n;
19584
- ctx->kv[idx].value.arr.data = malloc(n*GGUF_TYPE_SIZE[type]);
19585
- memcpy(ctx->kv[idx].value.arr.data, data, n*GGUF_TYPE_SIZE[type]);
20106
+ ctx->kv[idx].value.arr.data = GGML_MALLOC(n*gguf_type_size(type));
20107
+ memcpy(ctx->kv[idx].value.arr.data, data, n*gguf_type_size(type));
19586
20108
  }
19587
20109
 
19588
20110
  void gguf_set_arr_str(struct gguf_context * ctx, const char * key, const char ** data, int n) {
@@ -19591,7 +20113,7 @@ void gguf_set_arr_str(struct gguf_context * ctx, const char * key, const char **
19591
20113
  ctx->kv[idx].type = GGUF_TYPE_ARRAY;
19592
20114
  ctx->kv[idx].value.arr.type = GGUF_TYPE_STRING;
19593
20115
  ctx->kv[idx].value.arr.n = n;
19594
- ctx->kv[idx].value.arr.data = malloc(n*sizeof(struct gguf_str));
20116
+ ctx->kv[idx].value.arr.data = GGML_MALLOC(n*sizeof(struct gguf_str));
19595
20117
  for (int i = 0; i < n; i++) {
19596
20118
  struct gguf_str * str = &((struct gguf_str *)ctx->kv[idx].value.arr.data)[i];
19597
20119
  str->n = strlen(data[i]);
@@ -19618,19 +20140,19 @@ void gguf_set_kv(struct gguf_context * ctx, struct gguf_context * src) {
19618
20140
  case GGUF_TYPE_ARRAY:
19619
20141
  {
19620
20142
  if (src->kv[i].value.arr.type == GGUF_TYPE_STRING) {
19621
- const char ** data = malloc(src->kv[i].value.arr.n*sizeof(char *));
20143
+ const char ** data = GGML_MALLOC(src->kv[i].value.arr.n*sizeof(char *));
19622
20144
  for (uint32_t j = 0; j < src->kv[i].value.arr.n; j++) {
19623
20145
  data[j] = ((struct gguf_str *)src->kv[i].value.arr.data)[j].data;
19624
20146
  }
19625
20147
  gguf_set_arr_str(ctx, src->kv[i].key.data, data, src->kv[i].value.arr.n);
19626
- free((void *)data);
20148
+ GGML_FREE((void *)data);
19627
20149
  } else if (src->kv[i].value.arr.type == GGUF_TYPE_ARRAY) {
19628
20150
  GGML_ASSERT(false && "nested arrays not supported");
19629
20151
  } else {
19630
20152
  gguf_set_arr_data(ctx, src->kv[i].key.data, src->kv[i].value.arr.type, src->kv[i].value.arr.data, src->kv[i].value.arr.n);
19631
20153
  }
19632
20154
  } break;
19633
- case GGUF_TYPE_COUNT: GGML_ASSERT(false && "invalid type"); break;
20155
+ default: GGML_ASSERT(false && "invalid type"); break;
19634
20156
  }
19635
20157
  }
19636
20158
  }
@@ -19706,7 +20228,7 @@ struct gguf_buf {
19706
20228
 
19707
20229
  static struct gguf_buf gguf_buf_init(size_t size) {
19708
20230
  struct gguf_buf buf = {
19709
- /*buf.data =*/ size == 0 ? NULL : malloc(size),
20231
+ /*buf.data =*/ size == 0 ? NULL : GGML_MALLOC(size),
19710
20232
  /*buf.size =*/ size,
19711
20233
  /*buf.offset =*/ 0,
19712
20234
  };
@@ -19716,7 +20238,7 @@ static struct gguf_buf gguf_buf_init(size_t size) {
19716
20238
 
19717
20239
  static void gguf_buf_free(struct gguf_buf buf) {
19718
20240
  if (buf.data) {
19719
- free(buf.data);
20241
+ GGML_FREE(buf.data);
19720
20242
  }
19721
20243
  }
19722
20244
 
@@ -19797,7 +20319,7 @@ static void gguf_write_to_buf(const struct gguf_context * ctx, struct gguf_buf *
19797
20319
  case GGUF_TYPE_FLOAT64:
19798
20320
  case GGUF_TYPE_BOOL:
19799
20321
  {
19800
- gguf_bwrite_el(buf, kv->value.arr.data, kv->value.arr.n * GGUF_TYPE_SIZE[kv->value.arr.type]);
20322
+ gguf_bwrite_el(buf, kv->value.arr.data, kv->value.arr.n * gguf_type_size(kv->value.arr.type));
19801
20323
  } break;
19802
20324
  case GGUF_TYPE_STRING:
19803
20325
  {
@@ -19806,10 +20328,10 @@ static void gguf_write_to_buf(const struct gguf_context * ctx, struct gguf_buf *
19806
20328
  }
19807
20329
  } break;
19808
20330
  case GGUF_TYPE_ARRAY:
19809
- case GGUF_TYPE_COUNT: GGML_ASSERT(false && "invalid type"); break;
20331
+ default: GGML_ASSERT(false && "invalid type"); break;
19810
20332
  }
19811
20333
  } break;
19812
- case GGUF_TYPE_COUNT: GGML_ASSERT(false && "invalid type");
20334
+ default: GGML_ASSERT(false && "invalid type");
19813
20335
  }
19814
20336
  }
19815
20337
 
@@ -20010,7 +20532,7 @@ int ggml_cpu_has_wasm_simd(void) {
20010
20532
  }
20011
20533
 
20012
20534
  int ggml_cpu_has_blas(void) {
20013
- #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
20535
+ #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_VULKAN) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_SYCL)
20014
20536
  return 1;
20015
20537
  #else
20016
20538
  return 0;
@@ -20033,8 +20555,33 @@ int ggml_cpu_has_clblast(void) {
20033
20555
  #endif
20034
20556
  }
20035
20557
 
20558
+ int ggml_cpu_has_vulkan(void) {
20559
+ #if defined(GGML_USE_VULKAN)
20560
+ return 1;
20561
+ #else
20562
+ return 0;
20563
+ #endif
20564
+ }
20565
+
20566
+ int ggml_cpu_has_kompute(void) {
20567
+ #if defined(GGML_USE_KOMPUTE)
20568
+ return 1;
20569
+ #else
20570
+ return 0;
20571
+ #endif
20572
+ }
20573
+
20574
+ int ggml_cpu_has_sycl(void) {
20575
+ #if defined(GGML_USE_SYCL)
20576
+ return 1;
20577
+ #else
20578
+ return 0;
20579
+ #endif
20580
+ }
20581
+
20036
20582
  int ggml_cpu_has_gpublas(void) {
20037
- return ggml_cpu_has_cublas() || ggml_cpu_has_clblast();
20583
+ return ggml_cpu_has_cublas() || ggml_cpu_has_clblast() || ggml_cpu_has_vulkan() || ggml_cpu_has_kompute() ||
20584
+ ggml_cpu_has_sycl();
20038
20585
  }
20039
20586
 
20040
20587
  int ggml_cpu_has_sse3(void) {
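The new backend predicates make it easy to see which acceleration paths the bundled ggml was compiled with. A quick probe using the functions added in this release (assumes the matching ggml.h is on the include path):

```c
#include <stdio.h>
#include "ggml.h"

int main(void) {
    printf("cublas:  %d\n", ggml_cpu_has_cublas());
    printf("clblast: %d\n", ggml_cpu_has_clblast());
    printf("vulkan:  %d\n", ggml_cpu_has_vulkan());   // new in this release
    printf("kompute: %d\n", ggml_cpu_has_kompute());  // new in this release
    printf("sycl:    %d\n", ggml_cpu_has_sycl());     // new in this release
    printf("gpublas: %d\n", ggml_cpu_has_gpublas());
    return 0;
}
```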