llama_cpp 0.12.3 → 0.12.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -218,6 +218,7 @@ inline static void * ggml_aligned_malloc(size_t size) {
218
218
  break;
219
219
  }
220
220
  GGML_PRINT("%s: %s (attempted to allocate %6.2f MB)\n", __func__, error_desc, size/(1024.0*1024.0));
221
+ GGML_ASSERT(false);
221
222
  return NULL;
222
223
  }
223
224
  return aligned_memory;
@@ -230,6 +231,38 @@ inline static void * ggml_aligned_malloc(size_t size) {
230
231
  #endif
231
232
  #endif
232
233
 
234
+ inline static void * ggml_malloc(size_t size) {
235
+ if (size == 0) {
236
+ GGML_PRINT("WARNING: Behavior may be unexpected when allocating 0 bytes for ggml_malloc!\n");
237
+ return NULL;
238
+ }
239
+ void * result = malloc(size);
240
+ if (result == NULL) {
241
+ GGML_PRINT("%s: failed to allocate %6.2f MB\n", __func__, size/(1024.0*1024.0));
242
+ GGML_ASSERT(false);
243
+ }
244
+ return result;
245
+ }
246
+
247
+ // calloc
248
+ inline static void * ggml_calloc(size_t num, size_t size) {
249
+ if (num == 0 || size == 0) {
250
+ GGML_PRINT("WARNING: Behavior may be unexpected when allocating 0 bytes for ggml_calloc!\n");
251
+ return NULL;
252
+ }
253
+ void * result = calloc(num, size);
254
+ if (result == NULL) {
255
+ GGML_PRINT("%s: failed to allocate %6.2f MB\n", __func__, size/(1024.0*1024.0));
256
+ GGML_ASSERT(false);
257
+ }
258
+ return result;
259
+ }
260
+
261
+ #define GGML_MALLOC(size) ggml_malloc(size)
262
+ #define GGML_CALLOC(num, size) ggml_calloc(num, size)
263
+
264
+ #define GGML_FREE(ptr) free(ptr)
265
+
233
266
  #define UNUSED GGML_UNUSED
234
267
  #define SWAP(x, y, T) do { T SWAP = x; x = y; y = SWAP; } while (0)
235
268
 
@@ -248,6 +281,10 @@ inline static void * ggml_aligned_malloc(size_t size) {
248
281
  #include "ggml-cuda.h"
249
282
  #elif defined(GGML_USE_CLBLAST)
250
283
  #include "ggml-opencl.h"
284
+ #elif defined(GGML_USE_VULKAN)
285
+ #include "ggml-vulkan.h"
286
+ #elif defined(GGML_USE_SYCL)
287
+ #include "ggml-sycl.h"
251
288
  #endif
252
289
 
253
290
  // floating point type used to accumulate sums
@@ -595,6 +632,17 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
595
632
  .vec_dot = ggml_vec_dot_iq2_xs_q8_K,
596
633
  .vec_dot_type = GGML_TYPE_Q8_K,
597
634
  },
635
+ [GGML_TYPE_IQ3_XXS] = {
636
+ .type_name = "iq3_xxs",
637
+ .blck_size = QK_K,
638
+ .type_size = sizeof(block_iq3_xxs),
639
+ .is_quantized = true,
640
+ .to_float = (ggml_to_float_t) dequantize_row_iq3_xxs,
641
+ .from_float = quantize_row_iq3_xxs,
642
+ .from_float_reference = (ggml_from_float_t)quantize_row_iq3_xxs_reference,
643
+ .vec_dot = ggml_vec_dot_iq3_xxs_q8_K,
644
+ .vec_dot_type = GGML_TYPE_Q8_K,
645
+ },
598
646
  [GGML_TYPE_Q8_K] = {
599
647
  .type_name = "q8_K",
600
648
  .blck_size = QK_K,
@@ -2140,6 +2188,7 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) {
2140
2188
  case GGML_FTYPE_MOSTLY_Q6_K: wtype = GGML_TYPE_Q6_K; break;
2141
2189
  case GGML_FTYPE_MOSTLY_IQ2_XXS: wtype = GGML_TYPE_IQ2_XXS; break;
2142
2190
  case GGML_FTYPE_MOSTLY_IQ2_XS: wtype = GGML_TYPE_IQ2_XS; break;
2191
+ case GGML_FTYPE_MOSTLY_IQ3_XXS: wtype = GGML_TYPE_IQ3_XXS; break;
2143
2192
  case GGML_FTYPE_UNKNOWN: wtype = GGML_TYPE_COUNT; break;
2144
2193
  case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16: wtype = GGML_TYPE_COUNT; break;
2145
2194
  }
@@ -2293,6 +2342,10 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
2293
2342
  ggml_init_cublas();
2294
2343
  #elif defined(GGML_USE_CLBLAST)
2295
2344
  ggml_cl_init();
2345
+ #elif defined(GGML_USE_VULKAN)
2346
+ ggml_vk_init_cpu_assist();
2347
+ #elif defined(GGML_USE_SYCL)
2348
+ ggml_init_sycl();
2296
2349
  #endif
2297
2350
 
2298
2351
  ggml_setup_op_has_task_pass();
@@ -2417,7 +2470,8 @@ size_t ggml_get_max_tensor_size(const struct ggml_context * ctx) {
2417
2470
  size_t max_size = 0;
2418
2471
 
2419
2472
  for (struct ggml_tensor * tensor = ggml_get_first_tensor(ctx); tensor != NULL; tensor = ggml_get_next_tensor(ctx, tensor)) {
2420
- max_size = MAX(max_size, ggml_nbytes(tensor));
2473
+ size_t bytes = ggml_nbytes(tensor);
2474
+ max_size = MAX(max_size, bytes);
2421
2475
  }
2422
2476
 
2423
2477
  return max_size;
@@ -5296,7 +5350,7 @@ GGML_API struct ggml_tensor * ggml_conv_1d(
5296
5350
  int s0,
5297
5351
  int p0,
5298
5352
  int d0) {
5299
- struct ggml_tensor * im2col = ggml_im2col(ctx, a, b, s0, 0, p0, 0, d0, 0, false); // [N, OL, IC * K]
5353
+ struct ggml_tensor * im2col = ggml_im2col(ctx, a, b, s0, 0, p0, 0, d0, 0, false, GGML_TYPE_F16); // [N, OL, IC * K]
5300
5354
 
5301
5355
  struct ggml_tensor * result =
5302
5356
  ggml_mul_mat(ctx,
@@ -5374,16 +5428,15 @@ struct ggml_tensor * ggml_conv_depthwise_2d(
5374
5428
  int p1,
5375
5429
  int d0,
5376
5430
  int d1) {
5431
+
5377
5432
  struct ggml_tensor * new_a = ggml_reshape_4d(ctx, a, a->ne[0], a->ne[1], 1, a->ne[2] * a->ne[3]);
5378
5433
  struct ggml_tensor * im2col = ggml_im2col(ctx, new_a,
5379
5434
  ggml_reshape_4d(ctx, b, b->ne[0], b->ne[1], 1, b->ne[2] * b->ne[3]),
5380
- s0, s1, p0, p1, d0, d1, true); // [N * IC, OH, OW, KH * KW]
5381
-
5382
- struct ggml_tensor * result =
5383
- ggml_mul_mat(ctx,
5384
- ggml_reshape_4d(ctx, new_a, (new_a->ne[0] * new_a->ne[1]), new_a->ne[2], new_a->ne[3], 1), // [OC,1, KH, KW] => [1, OC, 1, KH * KW]
5385
- ggml_reshape_4d(ctx, im2col, im2col->ne[0], im2col->ne[2] * im2col->ne[1], b->ne[2], b->ne[3])); // [N * IC, OH, OW, KH * KW] => [N, IC, OH * OW, KH * KW]
5435
+ s0, s1, p0, p1, d0, d1, true, GGML_TYPE_F16); // [N * IC, OH, OW, KH * KW]
5436
+ struct ggml_tensor * new_b = ggml_reshape_4d(ctx, im2col, im2col->ne[0], im2col->ne[2] * im2col->ne[1], b->ne[2], b->ne[3]); // [N * IC, OH, OW, KH * KW] => [N, IC, OH * OW, KH * KW]
5386
5437
 
5438
+ new_a = ggml_reshape_4d(ctx, new_a, (new_a->ne[0] * new_a->ne[1]), new_a->ne[2], new_a->ne[3], 1); // [OC,1, KH, KW] => [1, OC, 1, KH * KW]
5439
+ struct ggml_tensor * result = ggml_mul_mat(ctx, new_a, new_b);
5387
5440
  result = ggml_reshape_4d(ctx, result, im2col->ne[1], im2col->ne[2], b->ne[2], b->ne[3]); // [N, OC, OH, OW]
5388
5441
 
5389
5442
  return result;
@@ -5404,7 +5457,8 @@ struct ggml_tensor * ggml_im2col(
5404
5457
  int p1,
5405
5458
  int d0,
5406
5459
  int d1,
5407
- bool is_2D) {
5460
+ bool is_2D,
5461
+ enum ggml_type dst_type) {
5408
5462
 
5409
5463
  if(is_2D) {
5410
5464
  GGML_ASSERT(a->ne[2] == b->ne[2]);
@@ -5428,7 +5482,7 @@ struct ggml_tensor * ggml_im2col(
5428
5482
  is_2D ? b->ne[3] : 1,
5429
5483
  };
5430
5484
 
5431
- struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F16, 4, ne);
5485
+ struct ggml_tensor * result = ggml_new_tensor(ctx, dst_type, 4, ne);
5432
5486
  int32_t params[] = { s0, s1, p0, p1, d0, d1, (is_2D ? 1 : 0) };
5433
5487
  ggml_set_op_params(result, params, sizeof(params));
5434
5488
 
@@ -5453,7 +5507,7 @@ struct ggml_tensor * ggml_conv_2d(
5453
5507
  int p1,
5454
5508
  int d0,
5455
5509
  int d1) {
5456
- struct ggml_tensor * im2col = ggml_im2col(ctx, a, b, s0, s1, p0, p1, d0, d1, true); // [N, OH, OW, IC * KH * KW]
5510
+ struct ggml_tensor * im2col = ggml_im2col(ctx, a, b, s0, s1, p0, p1, d0, d1, true, GGML_TYPE_F16); // [N, OH, OW, IC * KH * KW]
5457
5511
 
5458
5512
  struct ggml_tensor * result =
5459
5513
  ggml_mul_mat(ctx,
@@ -5579,12 +5633,13 @@ struct ggml_tensor * ggml_pool_2d(
5579
5633
  is_node = true;
5580
5634
  }
5581
5635
 
5636
+ struct ggml_tensor * result;
5582
5637
  const int64_t ne[3] = {
5583
5638
  ggml_calc_pool_output_size(a->ne[0], k0, s0, p0),
5584
5639
  ggml_calc_pool_output_size(a->ne[1], k1, s1, p1),
5585
5640
  a->ne[2],
5586
5641
  };
5587
- struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 3, ne);
5642
+ result = ggml_new_tensor(ctx, GGML_TYPE_F32, 3, ne);
5588
5643
 
5589
5644
  int32_t params[] = { op, k0, k1, s0, s1, p0, p1 };
5590
5645
  ggml_set_op_params(result, params, sizeof(params));
@@ -5592,7 +5647,6 @@ struct ggml_tensor * ggml_pool_2d(
5592
5647
  result->op = GGML_OP_POOL_2D;
5593
5648
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5594
5649
  result->src[0] = a;
5595
-
5596
5650
  return result;
5597
5651
  }
5598
5652
 
@@ -7207,6 +7261,17 @@ static void ggml_compute_forward_add_f32(
7207
7261
  const int ith = params->ith;
7208
7262
  const int nth = params->nth;
7209
7263
 
7264
+ #ifdef GGML_USE_CLBLAST
7265
+ if (src1->backend == GGML_BACKEND_GPU) {
7266
+ // TODO: OpenCL kernel support full broadcast
7267
+ GGML_ASSERT(ggml_can_repeat_rows(src1, src0));
7268
+ if (ith == 0) {
7269
+ ggml_cl_add(src0, src1, dst);
7270
+ }
7271
+ return;
7272
+ }
7273
+ #endif
7274
+
7210
7275
  const int nr = ggml_nrows(src0);
7211
7276
 
7212
7277
  GGML_TENSOR_BINARY_OP_LOCALS
@@ -7487,7 +7552,12 @@ static void ggml_compute_forward_add(
7487
7552
  switch (src0->type) {
7488
7553
  case GGML_TYPE_F32:
7489
7554
  {
7490
- ggml_compute_forward_add_f32(params, src0, src1, dst);
7555
+ if (src1->type == GGML_TYPE_F32) {
7556
+ ggml_compute_forward_add_f32(params, src0, src1, dst);
7557
+ }
7558
+ else {
7559
+ GGML_ASSERT(false);
7560
+ }
7491
7561
  } break;
7492
7562
  case GGML_TYPE_F16:
7493
7563
  {
@@ -7513,6 +7583,7 @@ static void ggml_compute_forward_add(
7513
7583
  case GGML_TYPE_Q6_K:
7514
7584
  case GGML_TYPE_IQ2_XXS:
7515
7585
  case GGML_TYPE_IQ2_XS:
7586
+ case GGML_TYPE_IQ3_XXS:
7516
7587
  {
7517
7588
  ggml_compute_forward_add_q_f32(params, src0, src1, dst);
7518
7589
  } break;
@@ -7779,6 +7850,7 @@ static void ggml_compute_forward_add1(
7779
7850
  case GGML_TYPE_Q6_K:
7780
7851
  case GGML_TYPE_IQ2_XXS:
7781
7852
  case GGML_TYPE_IQ2_XS:
7853
+ case GGML_TYPE_IQ3_XXS:
7782
7854
  {
7783
7855
  ggml_compute_forward_add1_q_f32(params, src0, src1, dst);
7784
7856
  } break;
@@ -7898,6 +7970,7 @@ static void ggml_compute_forward_acc(
7898
7970
  case GGML_TYPE_Q6_K:
7899
7971
  case GGML_TYPE_IQ2_XXS:
7900
7972
  case GGML_TYPE_IQ2_XS:
7973
+ case GGML_TYPE_IQ3_XXS:
7901
7974
  default:
7902
7975
  {
7903
7976
  GGML_ASSERT(false);
@@ -7999,7 +8072,7 @@ static void ggml_compute_forward_mul_f32(
7999
8072
  const int ith = params->ith;
8000
8073
  const int nth = params->nth;
8001
8074
 
8002
- #ifdef GGML_USE_CLBLAST
8075
+ #if defined(GGML_USE_CLBLAST)
8003
8076
  if (src1->backend == GGML_BACKEND_GPU) {
8004
8077
  // TODO: OpenCL kernel support full broadcast
8005
8078
  GGML_ASSERT(ggml_can_repeat_rows(src1, src0));
@@ -9954,7 +10027,7 @@ static void ggml_compute_forward_mul_mat(
9954
10027
  #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
9955
10028
  if (ggml_compute_forward_mul_mat_use_blas(dst)) {
9956
10029
  const int64_t ne_plane = ne01*ne00;
9957
- const int64_t desired_wsize = ne13*ne12*ne_plane*sizeof(float);
10030
+ const size_t desired_wsize = ne13*ne12*ne_plane*sizeof(float);
9958
10031
  UNUSED(desired_wsize);
9959
10032
 
9960
10033
  if (params->type == GGML_TASK_INIT) {
@@ -10649,6 +10722,7 @@ static void ggml_compute_forward_out_prod(
10649
10722
  case GGML_TYPE_Q6_K:
10650
10723
  case GGML_TYPE_IQ2_XXS:
10651
10724
  case GGML_TYPE_IQ2_XS:
10725
+ case GGML_TYPE_IQ3_XXS:
10652
10726
  {
10653
10727
  ggml_compute_forward_out_prod_q_f32(params, src0, src1, dst);
10654
10728
  } break;
@@ -10828,6 +10902,7 @@ static void ggml_compute_forward_set(
10828
10902
  case GGML_TYPE_Q6_K:
10829
10903
  case GGML_TYPE_IQ2_XXS:
10830
10904
  case GGML_TYPE_IQ2_XS:
10905
+ case GGML_TYPE_IQ3_XXS:
10831
10906
  default:
10832
10907
  {
10833
10908
  GGML_ASSERT(false);
@@ -11024,6 +11099,7 @@ static void ggml_compute_forward_get_rows(
11024
11099
  case GGML_TYPE_Q6_K:
11025
11100
  case GGML_TYPE_IQ2_XXS:
11026
11101
  case GGML_TYPE_IQ2_XS:
11102
+ case GGML_TYPE_IQ3_XXS:
11027
11103
  {
11028
11104
  ggml_compute_forward_get_rows_q(params, src0, src1, dst);
11029
11105
  } break;
@@ -11671,6 +11747,7 @@ static void ggml_compute_forward_alibi(
11671
11747
  case GGML_TYPE_Q6_K:
11672
11748
  case GGML_TYPE_IQ2_XXS:
11673
11749
  case GGML_TYPE_IQ2_XS:
11750
+ case GGML_TYPE_IQ3_XXS:
11674
11751
  case GGML_TYPE_Q8_K:
11675
11752
  case GGML_TYPE_I8:
11676
11753
  case GGML_TYPE_I16:
@@ -11747,6 +11824,7 @@ static void ggml_compute_forward_clamp(
11747
11824
  case GGML_TYPE_Q6_K:
11748
11825
  case GGML_TYPE_IQ2_XXS:
11749
11826
  case GGML_TYPE_IQ2_XS:
11827
+ case GGML_TYPE_IQ3_XXS:
11750
11828
  case GGML_TYPE_Q8_K:
11751
11829
  case GGML_TYPE_I8:
11752
11830
  case GGML_TYPE_I16:
@@ -11810,8 +11888,10 @@ GGML_CALL void ggml_rope_yarn_corr_dims(
11810
11888
  int n_dims, int n_orig_ctx, float freq_base, float beta_fast, float beta_slow, float dims[2]
11811
11889
  ) {
11812
11890
  // start and end correction dims
11813
- dims[0] = MAX(0, floorf(ggml_rope_yarn_corr_dim(n_dims, n_orig_ctx, beta_fast, freq_base)));
11814
- dims[1] = MIN(n_dims - 1, ceilf(ggml_rope_yarn_corr_dim(n_dims, n_orig_ctx, beta_slow, freq_base)));
11891
+ float start = floorf(ggml_rope_yarn_corr_dim(n_dims, n_orig_ctx, beta_fast, freq_base));
11892
+ float end = ceilf(ggml_rope_yarn_corr_dim(n_dims, n_orig_ctx, beta_slow, freq_base));
11893
+ dims[0] = MAX(0, start);
11894
+ dims[1] = MIN(n_dims - 1, end);
11815
11895
  }
11816
11896
 
11817
11897
  static void ggml_compute_forward_rope_f32(
@@ -12416,6 +12496,92 @@ static void ggml_compute_forward_conv_transpose_1d(
12416
12496
  }
12417
12497
  }
12418
12498
 
12499
+ // src0: kernel [OC, IC, KH, KW]
12500
+ // src1: image [N, IC, IH, IW]
12501
+ // dst: result [N, OH, OW, IC*KH*KW]
12502
+ static void ggml_compute_forward_im2col_f32(
12503
+ const struct ggml_compute_params * params,
12504
+ const struct ggml_tensor * src0,
12505
+ const struct ggml_tensor * src1,
12506
+ struct ggml_tensor * dst) {
12507
+ GGML_ASSERT(src0->type == GGML_TYPE_F16);
12508
+ GGML_ASSERT(src1->type == GGML_TYPE_F32);
12509
+ GGML_ASSERT( dst->type == GGML_TYPE_F32);
12510
+
12511
+ int64_t t0 = ggml_perf_time_us();
12512
+ UNUSED(t0);
12513
+
12514
+ GGML_TENSOR_BINARY_OP_LOCALS;
12515
+
12516
+ const int32_t s0 = ((const int32_t *)(dst->op_params))[0];
12517
+ const int32_t s1 = ((const int32_t *)(dst->op_params))[1];
12518
+ const int32_t p0 = ((const int32_t *)(dst->op_params))[2];
12519
+ const int32_t p1 = ((const int32_t *)(dst->op_params))[3];
12520
+ const int32_t d0 = ((const int32_t *)(dst->op_params))[4];
12521
+ const int32_t d1 = ((const int32_t *)(dst->op_params))[5];
12522
+ const bool is_2D = ((const int32_t *)(dst->op_params))[6] == 1;
12523
+
12524
+ const int ith = params->ith;
12525
+ const int nth = params->nth;
12526
+
12527
+ const int64_t N = is_2D ? ne13 : ne12;
12528
+ const int64_t IC = is_2D ? ne12 : ne11;
12529
+ const int64_t IH = is_2D ? ne11 : 1;
12530
+ const int64_t IW = ne10;
12531
+
12532
+ const int64_t KH = is_2D ? ne01 : 1;
12533
+ const int64_t KW = ne00;
12534
+
12535
+ const int64_t OH = is_2D ? ne2 : 1;
12536
+ const int64_t OW = ne1;
12537
+
12538
+ int ofs0 = is_2D ? nb13 : nb12;
12539
+ int ofs1 = is_2D ? nb12 : nb11;
12540
+
12541
+ GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
12542
+ GGML_ASSERT(nb10 == sizeof(float));
12543
+
12544
+ if (params->type == GGML_TASK_INIT) {
12545
+ return;
12546
+ }
12547
+
12548
+ if (params->type == GGML_TASK_FINALIZE) {
12549
+ return;
12550
+ }
12551
+
12552
+ // im2col: [N, IC, IH, IW] => [N, OH, OW, IC*KH*KW]
12553
+ {
12554
+ float * const wdata = (float *) dst->data;
12555
+
12556
+ for (int64_t in = 0; in < N; in++) {
12557
+ for (int64_t ioh = 0; ioh < OH; ioh++) { // 1
12558
+ for (int64_t iow = 0; iow < OW; iow++) {
12559
+ for (int64_t iic = ith; iic < IC; iic += nth) {
12560
+
12561
+ // micro kernel
12562
+ float * dst_data = wdata + (in*OH*OW + ioh*OW + iow)*(IC*KH*KW); // [IC, KH, KW]
12563
+ const float * const src_data = (float *)((char *) src1->data + in*ofs0 + iic*ofs1); // [IH, IW]
12564
+
12565
+ for (int64_t ikh = 0; ikh < KH; ikh++) { // 1
12566
+ for (int64_t ikw = 0; ikw < KW; ikw++) {
12567
+ const int64_t iiw = iow*s0 + ikw*d0 - p0;
12568
+ const int64_t iih = ioh*s1 + ikh*d1 - p1;
12569
+
12570
+ if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) {
12571
+ dst_data[iic*(KH*KW) + ikh*KW + ikw] = 0;
12572
+ } else {
12573
+ dst_data[iic*(KH*KW) + ikh*KW + ikw] = (src_data[iih*IW + iiw]);
12574
+ }
12575
+ }
12576
+ }
12577
+ }
12578
+ }
12579
+ }
12580
+ }
12581
+ }
12582
+ }
12583
+
12584
+
12419
12585
  // src0: kernel [OC, IC, KH, KW]
12420
12586
  // src1: image [N, IC, IH, IW]
12421
12587
  // dst: result [N, OH, OW, IC*KH*KW]
@@ -12506,14 +12672,14 @@ static void ggml_compute_forward_im2col(
12506
12672
  const struct ggml_tensor * src0,
12507
12673
  const struct ggml_tensor * src1,
12508
12674
  struct ggml_tensor * dst) {
12509
- switch (src0->type) {
12675
+ switch (dst->type) {
12510
12676
  case GGML_TYPE_F16:
12511
12677
  {
12512
12678
  ggml_compute_forward_im2col_f16(params, src0, src1, dst);
12513
12679
  } break;
12514
12680
  case GGML_TYPE_F32:
12515
12681
  {
12516
- GGML_ASSERT(false);
12682
+ ggml_compute_forward_im2col_f32(params, src0, src1, dst);
12517
12683
  } break;
12518
12684
  default:
12519
12685
  {
@@ -12704,8 +12870,8 @@ static void ggml_compute_forward_pool_2d(
12704
12870
  const struct ggml_compute_params * params,
12705
12871
  const struct ggml_tensor * src,
12706
12872
  struct ggml_tensor * dst) {
12707
- assert(src->type == GGML_TYPE_F32);
12708
- assert(params->ith == 0);
12873
+ GGML_ASSERT(src->type == GGML_TYPE_F32);
12874
+ GGML_ASSERT(params->ith == 0);
12709
12875
 
12710
12876
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
12711
12877
  return;
@@ -14683,8 +14849,26 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
14683
14849
  }
14684
14850
  GGML_ASSERT(tensor->src[0] == NULL || tensor->src[0]->backend == GGML_BACKEND_CPU);
14685
14851
  GGML_ASSERT(tensor->src[1] == NULL || tensor->src[1]->backend == GGML_BACKEND_CPU);
14852
+ #elif defined(GGML_USE_VULKAN)
14853
+ const bool skip_cpu = ggml_vk_compute_forward_cpu_assist(params, tensor);
14854
+ #ifdef GGML_VULKAN_CHECK_RESULTS
14855
+ if (skip_cpu) {
14856
+ ggml_vk_check_results_1_cpu_assist(params, tensor);
14857
+ }
14858
+ #endif
14859
+ if (skip_cpu) {
14860
+ return;
14861
+ }
14862
+ GGML_ASSERT(tensor->src[0] == NULL || tensor->src[0]->backend == GGML_BACKEND_CPU);
14863
+ GGML_ASSERT(tensor->src[1] == NULL || tensor->src[1]->backend == GGML_BACKEND_CPU);
14686
14864
  #endif // GGML_USE_CUBLAS
14687
14865
 
14866
+ #ifdef GGML_USE_SYCL
14867
+ bool skip_cpu = ggml_sycl_compute_forward(params, tensor);
14868
+ if (skip_cpu) {
14869
+ return;
14870
+ }
14871
+ #endif // GGML_USE_SYCL
14688
14872
  switch (tensor->op) {
14689
14873
  case GGML_OP_DUP:
14690
14874
  {
@@ -15087,13 +15271,13 @@ struct ggml_hash_set ggml_hash_set_new(size_t size) {
15087
15271
  size = ggml_hash_size(size);
15088
15272
  struct ggml_hash_set result;
15089
15273
  result.size = size;
15090
- result.keys = malloc(sizeof(struct ggml_tensor *) * size);
15274
+ result.keys = GGML_MALLOC(sizeof(struct ggml_tensor *) * size);
15091
15275
  memset(result.keys, 0, sizeof(struct ggml_tensor *) * size);
15092
15276
  return result;
15093
15277
  }
15094
15278
 
15095
15279
  static void ggml_hash_set_free(struct ggml_hash_set hash_set) {
15096
- free(hash_set.keys);
15280
+ GGML_FREE(hash_set.keys);
15097
15281
  }
15098
15282
 
15099
15283
  struct hash_map {
@@ -15102,17 +15286,17 @@ struct hash_map {
15102
15286
  };
15103
15287
 
15104
15288
  static struct hash_map * ggml_new_hash_map(size_t size) {
15105
- struct hash_map * result = malloc(sizeof(struct hash_map));
15289
+ struct hash_map * result = GGML_MALLOC(sizeof(struct hash_map));
15106
15290
  result->set = ggml_hash_set_new(size);
15107
- result->vals = malloc(sizeof(struct ggml_tensor *) * result->set.size);
15291
+ result->vals = GGML_MALLOC(sizeof(struct ggml_tensor *) * result->set.size);
15108
15292
  memset(result->vals, 0, sizeof(struct ggml_tensor *) * result->set.size);
15109
15293
  return result;
15110
15294
  }
15111
15295
 
15112
15296
  static void ggml_hash_map_free(struct hash_map * map) {
15113
15297
  ggml_hash_set_free(map->set);
15114
- free(map->vals);
15115
- free(map);
15298
+ GGML_FREE(map->vals);
15299
+ GGML_FREE(map);
15116
15300
  }
15117
15301
 
15118
15302
  // gradient checkpointing
@@ -16597,7 +16781,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
16597
16781
  } break;
16598
16782
  case GGML_OP_SOFT_MAX:
16599
16783
  {
16600
- n_tasks = MIN(MIN(4, n_threads), ggml_nrows(node->src[0]));
16784
+ n_tasks = MIN(n_threads, ggml_nrows(node->src[0]));
16601
16785
  } break;
16602
16786
  case GGML_OP_CONV_TRANSPOSE_1D:
16603
16787
  {
@@ -16890,12 +17074,16 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
16890
17074
  struct ggml_cplan cplan;
16891
17075
  memset(&cplan, 0, sizeof(struct ggml_cplan));
16892
17076
 
17077
+ int max_tasks = 1;
17078
+
16893
17079
  // thread scheduling for the different operations + work buffer size estimation
16894
17080
  for (int i = 0; i < cgraph->n_nodes; i++) {
16895
17081
  struct ggml_tensor * node = cgraph->nodes[i];
16896
17082
 
16897
17083
  const int n_tasks = ggml_get_n_tasks(node, n_threads);
16898
17084
 
17085
+ max_tasks = MAX(max_tasks, n_tasks);
17086
+
16899
17087
  size_t cur = 0;
16900
17088
 
16901
17089
  switch (node->op) {
@@ -17062,7 +17250,7 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
17062
17250
  work_size += CACHE_LINE_SIZE*(n_threads - 1);
17063
17251
  }
17064
17252
 
17065
- cplan.n_threads = n_threads;
17253
+ cplan.n_threads = MIN(max_tasks, n_threads);
17066
17254
  cplan.work_size = work_size;
17067
17255
  cplan.work_data = NULL;
17068
17256
 
@@ -17079,6 +17267,17 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
17079
17267
  }
17080
17268
  }
17081
17269
 
17270
+ #ifdef GGML_USE_VULKAN
17271
+ for (int i = 0; i < cgraph->n_nodes; i++) {
17272
+ ggml_vk_preallocate_buffers_graph_cpu_assist(cgraph->nodes[i]);
17273
+ }
17274
+ ggml_vk_preallocate_buffers_cpu_assist();
17275
+
17276
+ for (int i = 0; i < cgraph->n_nodes; i++) {
17277
+ ggml_vk_build_graph_cpu_assist(cgraph->nodes[i], i == cgraph->n_nodes - 1);
17278
+ }
17279
+ #endif
17280
+
17082
17281
  const int n_threads = cplan->n_threads;
17083
17282
 
17084
17283
  struct ggml_compute_state_shared state_shared = {
@@ -17130,6 +17329,10 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
17130
17329
  }
17131
17330
  }
17132
17331
 
17332
+ #ifdef GGML_USE_VULKAN
17333
+ ggml_vk_graph_cleanup_cpu_assist();
17334
+ #endif
17335
+
17133
17336
  // performance stats (graph)
17134
17337
  {
17135
17338
  int64_t perf_cycles_cur = ggml_perf_cycles() - perf_start_cycles;
@@ -18770,6 +18973,7 @@ void ggml_quantize_init(enum ggml_type type) {
18770
18973
  switch (type) {
18771
18974
  case GGML_TYPE_IQ2_XXS: iq2xs_init_impl(256); break;
18772
18975
  case GGML_TYPE_IQ2_XS: iq2xs_init_impl(512); break;
18976
+ case GGML_TYPE_IQ3_XXS: iq3xs_init_impl(256); break;
18773
18977
  default: // nothing
18774
18978
  break;
18775
18979
  }
@@ -19032,6 +19236,15 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i
19032
19236
  result = quantize_iq2_xs(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
19033
19237
  GGML_ASSERT(result == row_size * nrows);
19034
19238
  } break;
19239
+ case GGML_TYPE_IQ3_XXS:
19240
+ {
19241
+ GGML_ASSERT(start % QK_K == 0);
19242
+ GGML_ASSERT(start % n_per_row == 0);
19243
+ size_t start_row = start / n_per_row;
19244
+ size_t row_size = ggml_row_size(type, n_per_row);
19245
+ result = quantize_iq3_xxs(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
19246
+ GGML_ASSERT(result == row_size * nrows);
19247
+ } break;
19035
19248
  case GGML_TYPE_F16:
19036
19249
  {
19037
19250
  size_t elemsize = sizeof(ggml_fp16_t);
@@ -19158,6 +19371,25 @@ struct gguf_context {
19158
19371
  void * data;
19159
19372
  };
19160
19373
 
19374
+ static size_t gguf_type_size(enum gguf_type type) {
19375
+ GGML_ASSERT(0 <= type && type < GGUF_TYPE_COUNT);
19376
+ return GGUF_TYPE_SIZE[type];
19377
+ }
19378
+
19379
+ static void gguf_tensor_info_sanitize(struct gguf_tensor_info * info) {
19380
+ GGML_ASSERT(info->n_dims <= GGML_MAX_DIMS);
19381
+ GGML_ASSERT(0 <= info->type && info->type < GGML_TYPE_COUNT);
19382
+
19383
+ for (uint32_t i = 0; i < info->n_dims; ++i) {
19384
+ GGML_ASSERT(info->ne[i] > 0);
19385
+ }
19386
+
19387
+ // prevent overflow for total number of elements
19388
+ GGML_ASSERT(INT64_MAX/info->ne[1] > info->ne[0]);
19389
+ GGML_ASSERT(INT64_MAX/info->ne[2] > info->ne[0]*info->ne[1]);
19390
+ GGML_ASSERT(INT64_MAX/info->ne[3] > info->ne[0]*info->ne[1]*info->ne[2]);
19391
+ }
19392
+
19161
19393
  static bool gguf_fread_el(FILE * file, void * dst, size_t size, size_t * offset) {
19162
19394
  const size_t n = fread(dst, 1, size, file);
19163
19395
  *offset += n;
@@ -19170,8 +19402,17 @@ static bool gguf_fread_str(FILE * file, struct gguf_str * p, size_t * offset) {
19170
19402
 
19171
19403
  bool ok = true;
19172
19404
 
19173
- ok = ok && gguf_fread_el(file, &p->n, sizeof(p->n), offset); p->data = calloc(p->n + 1, 1);
19174
- ok = ok && gguf_fread_el(file, p->data, p->n, offset);
19405
+ ok = ok && gguf_fread_el(file, &p->n, sizeof(p->n), offset);
19406
+
19407
+ // early exit if string length is invalid, prevents from integer overflow
19408
+ if (p->n == SIZE_MAX) {
19409
+ fprintf(stderr, "%s: invalid string length (%" PRIu64 ")\n", __func__, p->n);
19410
+ return false;
19411
+ }
19412
+
19413
+ p->data = GGML_CALLOC(p->n + 1, 1);
19414
+
19415
+ ok = ok && gguf_fread_el(file, p->data, p->n, offset);
19175
19416
 
19176
19417
  return ok;
19177
19418
  }
@@ -19243,6 +19484,12 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
19243
19484
  return NULL;
19244
19485
  }
19245
19486
 
19487
+ // sanity-checks to prevent from integer/buffer overflows
19488
+
19489
+ ok = ok && (ctx->header.n_tensors < (SIZE_MAX/2)/sizeof(struct gguf_tensor_info));
19490
+ ok = ok && (ctx->header.n_tensors < (SIZE_MAX/2)/ggml_tensor_overhead());
19491
+ ok = ok && (ctx->header.n_kv < (SIZE_MAX/2)/sizeof(struct gguf_kv));
19492
+
19246
19493
  if (!ok) {
19247
19494
  fprintf(stderr, "%s: failed to read header\n", __func__);
19248
19495
  fclose(file);
@@ -19253,7 +19500,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
19253
19500
 
19254
19501
  // read the kv pairs
19255
19502
  {
19256
- ctx->kv = malloc(ctx->header.n_kv * sizeof(struct gguf_kv));
19503
+ ctx->kv = GGML_MALLOC(ctx->header.n_kv * sizeof(struct gguf_kv));
19257
19504
 
19258
19505
  for (uint64_t i = 0; i < ctx->header.n_kv; ++i) {
19259
19506
  struct gguf_kv * kv = &ctx->kv[i];
@@ -19281,7 +19528,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
19281
19528
  case GGUF_TYPE_ARRAY:
19282
19529
  {
19283
19530
  ok = ok && gguf_fread_el(file, &kv->value.arr.type, sizeof(kv->value.arr.type), &offset);
19284
- ok = ok && gguf_fread_el(file, &kv->value.arr.n, sizeof(kv->value.arr.n), &offset);
19531
+ ok = ok && gguf_fread_el(file, &kv->value.arr.n, sizeof(kv->value.arr.n), &offset);
19285
19532
 
19286
19533
  switch (kv->value.arr.type) {
19287
19534
  case GGUF_TYPE_UINT8:
@@ -19296,21 +19543,39 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
19296
19543
  case GGUF_TYPE_FLOAT64:
19297
19544
  case GGUF_TYPE_BOOL:
19298
19545
  {
19299
- kv->value.arr.data = malloc(kv->value.arr.n * GGUF_TYPE_SIZE[kv->value.arr.type]);
19300
- ok = ok && gguf_fread_el(file, kv->value.arr.data, kv->value.arr.n * GGUF_TYPE_SIZE[kv->value.arr.type], &offset);
19546
+ // prevent from integer overflow in the malloc below
19547
+ if (kv->value.arr.n >= SIZE_MAX/gguf_type_size(kv->value.arr.type)) {
19548
+ fprintf(stderr, "%s: array size is too large (%" PRIu64 ")\n", __func__, kv->value.arr.n);
19549
+ fclose(file);
19550
+ gguf_free(ctx);
19551
+ return NULL;
19552
+ }
19553
+
19554
+ kv->value.arr.data = GGML_MALLOC(kv->value.arr.n * gguf_type_size(kv->value.arr.type));
19555
+
19556
+ ok = ok && gguf_fread_el(file, kv->value.arr.data, kv->value.arr.n * gguf_type_size(kv->value.arr.type), &offset);
19301
19557
  } break;
19302
19558
  case GGUF_TYPE_STRING:
19303
19559
  {
19304
- kv->value.arr.data = malloc(kv->value.arr.n * sizeof(struct gguf_str));
19560
+ // prevent from integer overflow in the malloc below
19561
+ if (kv->value.arr.n >= SIZE_MAX/sizeof(struct gguf_str)) {
19562
+ fprintf(stderr, "%s: array size is too large (%" PRIu64 ")\n", __func__, kv->value.arr.n);
19563
+ fclose(file);
19564
+ gguf_free(ctx);
19565
+ return NULL;
19566
+ }
19567
+
19568
+ kv->value.arr.data = GGML_MALLOC(kv->value.arr.n * sizeof(struct gguf_str));
19569
+
19305
19570
  for (uint64_t j = 0; j < kv->value.arr.n; ++j) {
19306
19571
  ok = ok && gguf_fread_str(file, &((struct gguf_str *) kv->value.arr.data)[j], &offset);
19307
19572
  }
19308
19573
  } break;
19309
19574
  case GGUF_TYPE_ARRAY:
19310
- case GGUF_TYPE_COUNT: GGML_ASSERT(false && "invalid type"); break;
19575
+ default: GGML_ASSERT(false && "invalid type"); break;
19311
19576
  }
19312
19577
  } break;
19313
- case GGUF_TYPE_COUNT: GGML_ASSERT(false && "invalid type");
19578
+ default: GGML_ASSERT(false && "invalid type");
19314
19579
  }
19315
19580
 
19316
19581
  if (!ok) {
@@ -19328,7 +19593,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
19328
19593
 
19329
19594
  // read the tensor infos
19330
19595
  {
19331
- ctx->infos = malloc(ctx->header.n_tensors * sizeof(struct gguf_tensor_info));
19596
+ ctx->infos = GGML_MALLOC(ctx->header.n_tensors * sizeof(struct gguf_tensor_info));
19332
19597
 
19333
19598
  for (uint64_t i = 0; i < ctx->header.n_tensors; ++i) {
19334
19599
  struct gguf_tensor_info * info = &ctx->infos[i];
@@ -19339,12 +19604,18 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
19339
19604
 
19340
19605
  ok = ok && gguf_fread_str(file, &info->name, &offset);
19341
19606
  ok = ok && gguf_fread_el (file, &info->n_dims, sizeof(info->n_dims), &offset);
19607
+
19608
+ ok = ok && (info->n_dims <= GGML_MAX_DIMS);
19609
+
19342
19610
  for (uint32_t j = 0; j < info->n_dims; ++j) {
19343
19611
  ok = ok && gguf_fread_el(file, &info->ne[j], sizeof(info->ne[j]), &offset);
19344
19612
  }
19613
+
19345
19614
  ok = ok && gguf_fread_el (file, &info->type, sizeof(info->type), &offset);
19346
19615
  ok = ok && gguf_fread_el (file, &info->offset, sizeof(info->offset), &offset);
19347
19616
 
19617
+ gguf_tensor_info_sanitize(info);
19618
+
19348
19619
  if (!ok) {
19349
19620
  fprintf(stderr, "%s: failed to read tensor info\n", __func__);
19350
19621
  fclose(file);
@@ -19498,12 +19769,12 @@ void gguf_free(struct gguf_context * ctx) {
19498
19769
  struct gguf_kv * kv = &ctx->kv[i];
19499
19770
 
19500
19771
  if (kv->key.data) {
19501
- free(kv->key.data);
19772
+ GGML_FREE(kv->key.data);
19502
19773
  }
19503
19774
 
19504
19775
  if (kv->type == GGUF_TYPE_STRING) {
19505
19776
  if (kv->value.str.data) {
19506
- free(kv->value.str.data);
19777
+ GGML_FREE(kv->value.str.data);
19507
19778
  }
19508
19779
  }
19509
19780
 
@@ -19513,16 +19784,16 @@ void gguf_free(struct gguf_context * ctx) {
19513
19784
  for (uint64_t j = 0; j < kv->value.arr.n; ++j) {
19514
19785
  struct gguf_str * str = &((struct gguf_str *) kv->value.arr.data)[j];
19515
19786
  if (str->data) {
19516
- free(str->data);
19787
+ GGML_FREE(str->data);
19517
19788
  }
19518
19789
  }
19519
19790
  }
19520
- free(kv->value.arr.data);
19791
+ GGML_FREE(kv->value.arr.data);
19521
19792
  }
19522
19793
  }
19523
19794
  }
19524
19795
 
19525
- free(ctx->kv);
19796
+ GGML_FREE(ctx->kv);
19526
19797
  }
19527
19798
 
19528
19799
  if (ctx->infos) {
@@ -19530,11 +19801,11 @@ void gguf_free(struct gguf_context * ctx) {
19530
19801
  struct gguf_tensor_info * info = &ctx->infos[i];
19531
19802
 
19532
19803
  if (info->name.data) {
19533
- free(info->name.data);
19804
+ GGML_FREE(info->name.data);
19534
19805
  }
19535
19806
  }
19536
19807
 
19537
- free(ctx->infos);
19808
+ GGML_FREE(ctx->infos);
19538
19809
  }
19539
19810
 
19540
19811
  GGML_ALIGNED_FREE(ctx);
@@ -19835,8 +20106,8 @@ void gguf_set_arr_data(struct gguf_context * ctx, const char * key, enum gguf_ty
19835
20106
  ctx->kv[idx].type = GGUF_TYPE_ARRAY;
19836
20107
  ctx->kv[idx].value.arr.type = type;
19837
20108
  ctx->kv[idx].value.arr.n = n;
19838
- ctx->kv[idx].value.arr.data = malloc(n*GGUF_TYPE_SIZE[type]);
19839
- memcpy(ctx->kv[idx].value.arr.data, data, n*GGUF_TYPE_SIZE[type]);
20109
+ ctx->kv[idx].value.arr.data = GGML_MALLOC(n*gguf_type_size(type));
20110
+ memcpy(ctx->kv[idx].value.arr.data, data, n*gguf_type_size(type));
19840
20111
  }
19841
20112
 
19842
20113
  void gguf_set_arr_str(struct gguf_context * ctx, const char * key, const char ** data, int n) {
@@ -19845,7 +20116,7 @@ void gguf_set_arr_str(struct gguf_context * ctx, const char * key, const char **
19845
20116
  ctx->kv[idx].type = GGUF_TYPE_ARRAY;
19846
20117
  ctx->kv[idx].value.arr.type = GGUF_TYPE_STRING;
19847
20118
  ctx->kv[idx].value.arr.n = n;
19848
- ctx->kv[idx].value.arr.data = malloc(n*sizeof(struct gguf_str));
20119
+ ctx->kv[idx].value.arr.data = GGML_MALLOC(n*sizeof(struct gguf_str));
19849
20120
  for (int i = 0; i < n; i++) {
19850
20121
  struct gguf_str * str = &((struct gguf_str *)ctx->kv[idx].value.arr.data)[i];
19851
20122
  str->n = strlen(data[i]);
@@ -19872,19 +20143,19 @@ void gguf_set_kv(struct gguf_context * ctx, struct gguf_context * src) {
19872
20143
  case GGUF_TYPE_ARRAY:
19873
20144
  {
19874
20145
  if (src->kv[i].value.arr.type == GGUF_TYPE_STRING) {
19875
- const char ** data = malloc(src->kv[i].value.arr.n*sizeof(char *));
20146
+ const char ** data = GGML_MALLOC(src->kv[i].value.arr.n*sizeof(char *));
19876
20147
  for (uint32_t j = 0; j < src->kv[i].value.arr.n; j++) {
19877
20148
  data[j] = ((struct gguf_str *)src->kv[i].value.arr.data)[j].data;
19878
20149
  }
19879
20150
  gguf_set_arr_str(ctx, src->kv[i].key.data, data, src->kv[i].value.arr.n);
19880
- free((void *)data);
20151
+ GGML_FREE((void *)data);
19881
20152
  } else if (src->kv[i].value.arr.type == GGUF_TYPE_ARRAY) {
19882
20153
  GGML_ASSERT(false && "nested arrays not supported");
19883
20154
  } else {
19884
20155
  gguf_set_arr_data(ctx, src->kv[i].key.data, src->kv[i].value.arr.type, src->kv[i].value.arr.data, src->kv[i].value.arr.n);
19885
20156
  }
19886
20157
  } break;
19887
- case GGUF_TYPE_COUNT: GGML_ASSERT(false && "invalid type"); break;
20158
+ default: GGML_ASSERT(false && "invalid type"); break;
19888
20159
  }
19889
20160
  }
19890
20161
  }
@@ -19960,7 +20231,7 @@ struct gguf_buf {
19960
20231
 
19961
20232
  static struct gguf_buf gguf_buf_init(size_t size) {
19962
20233
  struct gguf_buf buf = {
19963
- /*buf.data =*/ size == 0 ? NULL : malloc(size),
20234
+ /*buf.data =*/ size == 0 ? NULL : GGML_MALLOC(size),
19964
20235
  /*buf.size =*/ size,
19965
20236
  /*buf.offset =*/ 0,
19966
20237
  };
@@ -19970,7 +20241,7 @@ static struct gguf_buf gguf_buf_init(size_t size) {
19970
20241
 
19971
20242
  static void gguf_buf_free(struct gguf_buf buf) {
19972
20243
  if (buf.data) {
19973
- free(buf.data);
20244
+ GGML_FREE(buf.data);
19974
20245
  }
19975
20246
  }
19976
20247
 
@@ -20051,7 +20322,7 @@ static void gguf_write_to_buf(const struct gguf_context * ctx, struct gguf_buf *
20051
20322
  case GGUF_TYPE_FLOAT64:
20052
20323
  case GGUF_TYPE_BOOL:
20053
20324
  {
20054
- gguf_bwrite_el(buf, kv->value.arr.data, kv->value.arr.n * GGUF_TYPE_SIZE[kv->value.arr.type]);
20325
+ gguf_bwrite_el(buf, kv->value.arr.data, kv->value.arr.n * gguf_type_size(kv->value.arr.type));
20055
20326
  } break;
20056
20327
  case GGUF_TYPE_STRING:
20057
20328
  {
@@ -20060,10 +20331,10 @@ static void gguf_write_to_buf(const struct gguf_context * ctx, struct gguf_buf *
20060
20331
  }
20061
20332
  } break;
20062
20333
  case GGUF_TYPE_ARRAY:
20063
- case GGUF_TYPE_COUNT: GGML_ASSERT(false && "invalid type"); break;
20334
+ default: GGML_ASSERT(false && "invalid type"); break;
20064
20335
  }
20065
20336
  } break;
20066
- case GGUF_TYPE_COUNT: GGML_ASSERT(false && "invalid type");
20337
+ default: GGML_ASSERT(false && "invalid type");
20067
20338
  }
20068
20339
  }
20069
20340
 
@@ -20264,7 +20535,7 @@ int ggml_cpu_has_wasm_simd(void) {
20264
20535
  }
20265
20536
 
20266
20537
  int ggml_cpu_has_blas(void) {
20267
- #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
20538
+ #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_VULKAN) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_SYCL)
20268
20539
  return 1;
20269
20540
  #else
20270
20541
  return 0;
@@ -20287,8 +20558,33 @@ int ggml_cpu_has_clblast(void) {
20287
20558
  #endif
20288
20559
  }
20289
20560
 
20561
+ int ggml_cpu_has_vulkan(void) {
20562
+ #if defined(GGML_USE_VULKAN)
20563
+ return 1;
20564
+ #else
20565
+ return 0;
20566
+ #endif
20567
+ }
20568
+
20569
+ int ggml_cpu_has_kompute(void) {
20570
+ #if defined(GGML_USE_KOMPUTE)
20571
+ return 1;
20572
+ #else
20573
+ return 0;
20574
+ #endif
20575
+ }
20576
+
20577
+ int ggml_cpu_has_sycl(void) {
20578
+ #if defined(GGML_USE_SYCL)
20579
+ return 1;
20580
+ #else
20581
+ return 0;
20582
+ #endif
20583
+ }
20584
+
20290
20585
  int ggml_cpu_has_gpublas(void) {
20291
- return ggml_cpu_has_cublas() || ggml_cpu_has_clblast();
20586
+ return ggml_cpu_has_cublas() || ggml_cpu_has_clblast() || ggml_cpu_has_vulkan() || ggml_cpu_has_kompute() ||
20587
+ ggml_cpu_has_sycl();
20292
20588
  }
20293
20589
 
20294
20590
  int ggml_cpu_has_sse3(void) {