llama_cpp 0.12.7 → 0.14.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -320,6 +320,17 @@ static ggml_fp16_t ggml_table_exp_f16[1 << 16];
320
320
  // precomputed f32 table for f16 (256 KB) (ggml-impl.h)
321
321
  float ggml_table_f32_f16[1 << 16];
322
322
 
323
+ const char * ggml_status_to_string(enum ggml_status status) {
324
+ switch (status) {
325
+ case GGML_STATUS_ALLOC_FAILED: return "GGML status: error (failed to allocate memory)";
326
+ case GGML_STATUS_FAILED: return "GGML status: error (operation failed)";
327
+ case GGML_STATUS_SUCCESS: return "GGML status: success";
328
+ case GGML_STATUS_ABORTED: return "GGML status: warning (operation aborted)";
329
+ }
330
+
331
+ return "GGML status: unknown";
332
+ }
333
+
323
334
  // note: do not use these inside ggml.c
324
335
  // these are meant to be used via the ggml.h API
325
336
  float ggml_fp16_to_fp32(ggml_fp16_t x) {
@@ -355,6 +366,10 @@ void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int n) {
355
366
  }
356
367
  }
357
368
 
369
+ bool ggml_guid_matches(ggml_guid_t guid_a, ggml_guid_t guid_b) {
370
+ return memcmp(guid_a, guid_b, sizeof(ggml_guid)) == 0;
371
+ }
372
+
358
373
  //
359
374
  // timing
360
375
  //
@@ -678,6 +693,30 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
678
693
  .vec_dot_type = GGML_TYPE_Q8_K,
679
694
  .nrows = 1,
680
695
  },
696
+ [GGML_TYPE_IQ3_S] = {
697
+ .type_name = "iq3_s",
698
+ .blck_size = QK_K,
699
+ .type_size = sizeof(block_iq3_s),
700
+ .is_quantized = true,
701
+ .to_float = (ggml_to_float_t) dequantize_row_iq3_s,
702
+ .from_float = quantize_row_iq3_s,
703
+ .from_float_reference = (ggml_from_float_t)quantize_row_iq3_s_reference,
704
+ .vec_dot = ggml_vec_dot_iq3_s_q8_K,
705
+ .vec_dot_type = GGML_TYPE_Q8_K,
706
+ .nrows = 1,
707
+ },
708
+ [GGML_TYPE_IQ2_S] = {
709
+ .type_name = "iq2_s",
710
+ .blck_size = QK_K,
711
+ .type_size = sizeof(block_iq2_s),
712
+ .is_quantized = true,
713
+ .to_float = (ggml_to_float_t) dequantize_row_iq2_s,
714
+ .from_float = quantize_row_iq2_s,
715
+ .from_float_reference = (ggml_from_float_t)quantize_row_iq2_s_reference,
716
+ .vec_dot = ggml_vec_dot_iq2_s_q8_K,
717
+ .vec_dot_type = GGML_TYPE_Q8_K,
718
+ .nrows = 1,
719
+ },
681
720
  [GGML_TYPE_IQ1_S] = {
682
721
  .type_name = "iq1_s",
683
722
  .blck_size = QK_K,
@@ -702,6 +741,26 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
702
741
  .vec_dot_type = GGML_TYPE_Q8_0,
703
742
  .nrows = 1,
704
743
  },
744
+ [GGML_TYPE_IQ4_XS] = {
745
+ .type_name = "iq4_xs",
746
+ #if QK_K == 64
747
+ .blck_size = QK4_NL,
748
+ #else
749
+ .blck_size = QK_K,
750
+ #endif
751
+ .type_size = sizeof(block_iq4_xs),
752
+ .is_quantized = true,
753
+ .to_float = (ggml_to_float_t) dequantize_row_iq4_xs,
754
+ .from_float = quantize_row_iq4_xs,
755
+ .from_float_reference = (ggml_from_float_t)quantize_row_iq4_xs_reference,
756
+ .vec_dot = ggml_vec_dot_iq4_xs_q8_K,
757
+ #if QK_K == 64
758
+ .vec_dot_type = GGML_TYPE_Q8_0,
759
+ #else
760
+ .vec_dot_type = GGML_TYPE_Q8_K,
761
+ #endif
762
+ .nrows = 1,
763
+ },
705
764
  [GGML_TYPE_Q8_K] = {
706
765
  .type_name = "q8_K",
707
766
  .blck_size = QK_K,
@@ -1560,9 +1619,15 @@ inline static void ggml_vec_gelu_f16(const int n, ggml_fp16_t * y, const ggml_fp
1560
1619
  inline static void ggml_vec_gelu_f32(const int n, float * y, const float * x) {
1561
1620
  uint16_t t;
1562
1621
  for (int i = 0; i < n; ++i) {
1563
- ggml_fp16_t fp16 = GGML_FP32_TO_FP16(x[i]);
1564
- memcpy(&t, &fp16, sizeof(uint16_t));
1565
- y[i] = GGML_FP16_TO_FP32(ggml_table_gelu_f16[t]);
1622
+ if (x[i] <= -10.0f) {
1623
+ y[i] = 0.0f;
1624
+ } else if (x[i] >= 10.0f) {
1625
+ y[i] = x[i];
1626
+ } else {
1627
+ ggml_fp16_t fp16 = GGML_FP32_TO_FP16(x[i]);
1628
+ memcpy(&t, &fp16, sizeof(uint16_t));
1629
+ y[i] = GGML_FP16_TO_FP32(ggml_table_gelu_f16[t]);
1630
+ }
1566
1631
  }
1567
1632
  }
1568
1633
  #else
@@ -1768,6 +1833,8 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
1768
1833
  "POOL_2D",
1769
1834
  "UPSCALE",
1770
1835
  "PAD",
1836
+ "ARANGE",
1837
+ "TIMESTEP_EMBEDDING",
1771
1838
  "ARGSORT",
1772
1839
  "LEAKY_RELU",
1773
1840
 
@@ -1796,7 +1863,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
1796
1863
  "CROSS_ENTROPY_LOSS_BACK",
1797
1864
  };
1798
1865
 
1799
- static_assert(GGML_OP_COUNT == 72, "GGML_OP_COUNT != 72");
1866
+ static_assert(GGML_OP_COUNT == 74, "GGML_OP_COUNT != 74");
1800
1867
 
1801
1868
  static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
1802
1869
  "none",
@@ -1854,6 +1921,8 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
1854
1921
  "pool_2d(x)",
1855
1922
  "upscale(x)",
1856
1923
  "pad(x)",
1924
+ "arange(start, stop, step)",
1925
+ "timestep_embedding(timesteps, dim, max_period)",
1857
1926
  "argsort(x)",
1858
1927
  "leaky_relu(x)",
1859
1928
 
@@ -1882,7 +1951,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
1882
1951
  "cross_entropy_loss_back(x,y)",
1883
1952
  };
1884
1953
 
1885
- static_assert(GGML_OP_COUNT == 72, "GGML_OP_COUNT != 72");
1954
+ static_assert(GGML_OP_COUNT == 74, "GGML_OP_COUNT != 74");
1886
1955
 
1887
1956
  static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
1888
1957
 
@@ -2085,7 +2154,10 @@ void ggml_numa_init(enum ggml_numa_strategy numa_flag) {
2085
2154
  getcpu_ret = getcpu(&current_cpu, &g_state.numa.current_node);
2086
2155
  #else
2087
2156
  // old glibc doesn't have a wrapper for this call. Fall back on direct syscall
2088
- getcpu_ret = syscall(SYS_getcpu,&current_cpu,&g_state.numa.current_node);
2157
+ # if !defined(SYS_getcpu) && defined(SYS_get_cpu)
2158
+ # define SYS_getcpu SYS_get_cpu // some older glibc versions use this name
2159
+ # endif
2160
+ getcpu_ret = syscall(SYS_getcpu, &current_cpu, &g_state.numa.current_node);
2089
2161
  #endif
2090
2162
 
2091
2163
  if (g_state.numa.n_nodes < 1 || g_state.numa.total_cpus < 1 || getcpu_ret != 0) {
@@ -2304,6 +2376,9 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) {
2304
2376
  case GGML_FTYPE_MOSTLY_IQ3_XXS: wtype = GGML_TYPE_IQ3_XXS; break;
2305
2377
  case GGML_FTYPE_MOSTLY_IQ1_S: wtype = GGML_TYPE_IQ1_S; break;
2306
2378
  case GGML_FTYPE_MOSTLY_IQ4_NL: wtype = GGML_TYPE_IQ4_NL; break;
2379
+ case GGML_FTYPE_MOSTLY_IQ4_XS: wtype = GGML_TYPE_IQ4_XS; break;
2380
+ case GGML_FTYPE_MOSTLY_IQ3_S: wtype = GGML_TYPE_IQ3_S; break;
2381
+ case GGML_FTYPE_MOSTLY_IQ2_S: wtype = GGML_TYPE_IQ2_S; break;
2307
2382
  case GGML_FTYPE_UNKNOWN: wtype = GGML_TYPE_COUNT; break;
2308
2383
  case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16: wtype = GGML_TYPE_COUNT; break;
2309
2384
  }
@@ -2708,7 +2783,7 @@ static struct ggml_tensor * ggml_new_tensor_impl(
2708
2783
  }
2709
2784
  }
2710
2785
 
2711
- struct ggml_object * const obj_new = ggml_new_object(ctx, GGML_OBJECT_TENSOR, GGML_TENSOR_SIZE + obj_alloc_size);
2786
+ struct ggml_object * const obj_new = ggml_new_object(ctx, GGML_OBJECT_TYPE_TENSOR, GGML_TENSOR_SIZE + obj_alloc_size);
2712
2787
 
2713
2788
  // TODO: for recoverable errors, we would need to free the data allocated from the scratch buffer here
2714
2789
 
@@ -2716,7 +2791,7 @@ static struct ggml_tensor * ggml_new_tensor_impl(
2716
2791
 
2717
2792
  *result = (struct ggml_tensor) {
2718
2793
  /*.type =*/ type,
2719
- /*.backend =*/ GGML_BACKEND_CPU,
2794
+ /*.backend =*/ GGML_BACKEND_TYPE_CPU,
2720
2795
  /*.buffer =*/ NULL,
2721
2796
  /*.ne =*/ { 1, 1, 1, 1 },
2722
2797
  /*.nb =*/ { 0, 0, 0, 0 },
@@ -2838,11 +2913,21 @@ static int32_t ggml_get_op_params_i32(const struct ggml_tensor * tensor, uint32_
2838
2913
  return ((const int32_t *)(tensor->op_params))[i];
2839
2914
  }
2840
2915
 
2916
+ static float ggml_get_op_params_f32(const struct ggml_tensor * tensor, uint32_t i) {
2917
+ assert(i < GGML_MAX_OP_PARAMS / sizeof(float));
2918
+ return ((const float *)(tensor->op_params))[i];
2919
+ }
2920
+
2841
2921
  static void ggml_set_op_params_i32(struct ggml_tensor * tensor, uint32_t i, int32_t value) {
2842
2922
  assert(i < GGML_MAX_OP_PARAMS / sizeof(int32_t));
2843
2923
  ((int32_t *)(tensor->op_params))[i] = value;
2844
2924
  }
2845
2925
 
2926
+ static void ggml_set_op_params_f32(struct ggml_tensor * tensor, uint32_t i, float value) {
2927
+ assert(i < GGML_MAX_OP_PARAMS / sizeof(float));
2928
+ ((float *)(tensor->op_params))[i] = value;
2929
+ }
2930
+
2846
2931
  struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor) {
2847
2932
  memset(tensor->data, 0, ggml_nbytes(tensor));
2848
2933
  return tensor;
@@ -3289,7 +3374,7 @@ struct ggml_tensor * ggml_get_first_tensor(const struct ggml_context * ctx) {
3289
3374
  char * const mem_buffer = ctx->mem_buffer;
3290
3375
 
3291
3376
  while (obj != NULL) {
3292
- if (obj->type == GGML_OBJECT_TENSOR) {
3377
+ if (obj->type == GGML_OBJECT_TYPE_TENSOR) {
3293
3378
  return (struct ggml_tensor *)(mem_buffer + obj->offs);
3294
3379
  }
3295
3380
 
@@ -3306,7 +3391,7 @@ struct ggml_tensor * ggml_get_next_tensor(const struct ggml_context * ctx, struc
3306
3391
  char * const mem_buffer = ctx->mem_buffer;
3307
3392
 
3308
3393
  while (obj != NULL) {
3309
- if (obj->type == GGML_OBJECT_TENSOR) {
3394
+ if (obj->type == GGML_OBJECT_TYPE_TENSOR) {
3310
3395
  return (struct ggml_tensor *)(mem_buffer + obj->offs);
3311
3396
  }
3312
3397
 
@@ -3322,7 +3407,7 @@ struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * nam
3322
3407
  char * const mem_buffer = ctx->mem_buffer;
3323
3408
 
3324
3409
  while (obj != NULL) {
3325
- if (obj->type == GGML_OBJECT_TENSOR) {
3410
+ if (obj->type == GGML_OBJECT_TYPE_TENSOR) {
3326
3411
  struct ggml_tensor * cur = (struct ggml_tensor *)(mem_buffer + obj->offs);
3327
3412
  if (strcmp(cur->name, name) == 0) {
3328
3413
  return cur;
@@ -5729,11 +5814,13 @@ struct ggml_tensor * ggml_pool_1d(
5729
5814
  is_node = true;
5730
5815
  }
5731
5816
 
5732
- const int64_t ne[2] = {
5817
+ const int64_t ne[4] = {
5733
5818
  ggml_calc_pool_output_size(a->ne[0], k0, s0, p0),
5734
5819
  a->ne[1],
5820
+ a->ne[2],
5821
+ a->ne[3],
5735
5822
  };
5736
- struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne);
5823
+ struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
5737
5824
 
5738
5825
  int32_t params[] = { op, k0, s0, p0 };
5739
5826
  ggml_set_op_params(result, params, sizeof(params));
@@ -5839,6 +5926,55 @@ struct ggml_tensor * ggml_upscale(
5839
5926
  return ggml_upscale_impl(ctx, a, scale_factor);
5840
5927
  }
5841
5928
 
5929
+ struct ggml_tensor * ggml_arange(
5930
+ struct ggml_context * ctx,
5931
+ float start,
5932
+ float stop,
5933
+ float step) {
5934
+
5935
+ GGML_ASSERT(stop > start);
5936
+
5937
+ const int64_t steps = (int64_t) ceilf((stop - start) / step);
5938
+
5939
+ struct ggml_tensor * result = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, steps);
5940
+
5941
+ result->op = GGML_OP_ARANGE;
5942
+ ggml_set_op_params_f32(result, 0, start);
5943
+ ggml_set_op_params_f32(result, 1, stop);
5944
+ ggml_set_op_params_f32(result, 2, step);
5945
+
5946
+ return result;
5947
+ }
5948
+
5949
+ struct ggml_tensor * ggml_timestep_embedding(
5950
+ struct ggml_context * ctx,
5951
+ struct ggml_tensor * timesteps,
5952
+ int dim,
5953
+ int max_period) {
5954
+ bool is_node = false;
5955
+
5956
+ if (timesteps->grad) {
5957
+ GGML_ASSERT(false); // TODO: implement backward
5958
+ is_node = true;
5959
+ }
5960
+
5961
+ int actual_dim = dim;
5962
+ if (dim % 2 != 0) {
5963
+ actual_dim = dim + 1;
5964
+ }
5965
+
5966
+ struct ggml_tensor * result = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, actual_dim, timesteps->ne[0]);
5967
+
5968
+ result->op = GGML_OP_TIMESTEP_EMBEDDING;
5969
+ ggml_set_op_params_i32(result, 0, dim);
5970
+ ggml_set_op_params_i32(result, 1, max_period);
5971
+
5972
+ result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5973
+ result->src[0] = timesteps;
5974
+
5975
+ return result;
5976
+ }
5977
+
5842
5978
  // ggml_argsort
5843
5979
 
5844
5980
  struct ggml_tensor * ggml_argsort(
@@ -5866,7 +6002,7 @@ struct ggml_tensor * ggml_top_k(
5866
6002
  int k) {
5867
6003
  GGML_ASSERT(a->ne[0] >= k);
5868
6004
 
5869
- struct ggml_tensor * result = ggml_argsort(ctx, a, GGML_SORT_DESC);
6005
+ struct ggml_tensor * result = ggml_argsort(ctx, a, GGML_SORT_ORDER_DESC);
5870
6006
 
5871
6007
  result = ggml_view_4d(ctx, result,
5872
6008
  k, result->ne[1], result->ne[2], result->ne[3],
@@ -6660,7 +6796,7 @@ static void ggml_compute_forward_dup_same_cont(
6660
6796
  GGML_ASSERT(ggml_is_contiguous(dst) && ggml_is_contiguous(src0));
6661
6797
  GGML_ASSERT(src0->type == dst->type);
6662
6798
 
6663
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
6799
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
6664
6800
  return;
6665
6801
  }
6666
6802
 
@@ -6692,7 +6828,7 @@ static void ggml_compute_forward_dup_f16(
6692
6828
 
6693
6829
  GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0));
6694
6830
 
6695
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
6831
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
6696
6832
  return;
6697
6833
  }
6698
6834
 
@@ -6965,7 +7101,7 @@ static void ggml_compute_forward_dup_f32(
6965
7101
 
6966
7102
  GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0));
6967
7103
 
6968
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
7104
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
6969
7105
  return;
6970
7106
  }
6971
7107
 
@@ -7218,7 +7354,7 @@ static void ggml_compute_forward_dup_bytes(
7218
7354
  GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0));
7219
7355
  GGML_ASSERT(src0->type == dst->type);
7220
7356
 
7221
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
7357
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
7222
7358
  return;
7223
7359
  }
7224
7360
 
@@ -7398,7 +7534,7 @@ static void ggml_compute_forward_add_f32(
7398
7534
 
7399
7535
  GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst));
7400
7536
 
7401
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
7537
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
7402
7538
  return;
7403
7539
  }
7404
7540
 
@@ -7406,7 +7542,7 @@ static void ggml_compute_forward_add_f32(
7406
7542
  const int nth = params->nth;
7407
7543
 
7408
7544
  #ifdef GGML_USE_CLBLAST
7409
- if (src1->backend == GGML_BACKEND_GPU) {
7545
+ if (src1->backend == GGML_BACKEND_TYPE_GPU) {
7410
7546
  // TODO: OpenCL kernel support full broadcast
7411
7547
  GGML_ASSERT(ggml_can_repeat_rows(src1, src0));
7412
7548
  if (ith == 0) {
@@ -7488,7 +7624,7 @@ static void ggml_compute_forward_add_f16_f32(
7488
7624
 
7489
7625
  GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));
7490
7626
 
7491
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
7627
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
7492
7628
  return;
7493
7629
  }
7494
7630
 
@@ -7567,7 +7703,7 @@ static void ggml_compute_forward_add_f16_f16(
7567
7703
 
7568
7704
  GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));
7569
7705
 
7570
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
7706
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
7571
7707
  return;
7572
7708
  }
7573
7709
 
@@ -7623,7 +7759,7 @@ static void ggml_compute_forward_add_q_f32(
7623
7759
 
7624
7760
  GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));
7625
7761
 
7626
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
7762
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
7627
7763
  return;
7628
7764
  }
7629
7765
 
@@ -7738,6 +7874,9 @@ static void ggml_compute_forward_add(
7738
7874
  case GGML_TYPE_IQ3_XXS:
7739
7875
  case GGML_TYPE_IQ1_S:
7740
7876
  case GGML_TYPE_IQ4_NL:
7877
+ case GGML_TYPE_IQ4_XS:
7878
+ case GGML_TYPE_IQ3_S:
7879
+ case GGML_TYPE_IQ2_S:
7741
7880
  {
7742
7881
  ggml_compute_forward_add_q_f32(params, dst);
7743
7882
  } break;
@@ -7760,7 +7899,7 @@ static void ggml_compute_forward_add1_f32(
7760
7899
  GGML_ASSERT(ggml_are_same_shape(src0, dst));
7761
7900
  GGML_ASSERT(ggml_is_scalar(src1));
7762
7901
 
7763
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
7902
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
7764
7903
  return;
7765
7904
  }
7766
7905
 
@@ -7814,7 +7953,7 @@ static void ggml_compute_forward_add1_f16_f32(
7814
7953
  GGML_ASSERT(ggml_are_same_shape(src0, dst));
7815
7954
  GGML_ASSERT(ggml_is_scalar(src1));
7816
7955
 
7817
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
7956
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
7818
7957
  return;
7819
7958
  }
7820
7959
 
@@ -7866,7 +8005,7 @@ static void ggml_compute_forward_add1_f16_f16(
7866
8005
  GGML_ASSERT(ggml_are_same_shape(src0, dst));
7867
8006
  GGML_ASSERT(ggml_is_scalar(src1));
7868
8007
 
7869
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
8008
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
7870
8009
  return;
7871
8010
  }
7872
8011
 
@@ -7918,7 +8057,7 @@ static void ggml_compute_forward_add1_q_f32(
7918
8057
  GGML_ASSERT(ggml_are_same_shape(src0, dst));
7919
8058
  GGML_ASSERT(ggml_is_scalar(src1));
7920
8059
 
7921
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
8060
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
7922
8061
  return;
7923
8062
  }
7924
8063
 
@@ -8017,6 +8156,9 @@ static void ggml_compute_forward_add1(
8017
8156
  case GGML_TYPE_IQ3_XXS:
8018
8157
  case GGML_TYPE_IQ1_S:
8019
8158
  case GGML_TYPE_IQ4_NL:
8159
+ case GGML_TYPE_IQ4_XS:
8160
+ case GGML_TYPE_IQ3_S:
8161
+ case GGML_TYPE_IQ2_S:
8020
8162
  {
8021
8163
  ggml_compute_forward_add1_q_f32(params, dst);
8022
8164
  } break;
@@ -8047,7 +8189,7 @@ static void ggml_compute_forward_acc_f32(
8047
8189
  size_t offset = ((int32_t *) dst->op_params)[3];
8048
8190
  bool inplace = (bool) ((int32_t *) dst->op_params)[4];
8049
8191
 
8050
- if (!inplace && (params->type == GGML_TASK_INIT)) {
8192
+ if (!inplace && (params->type == GGML_TASK_TYPE_INIT)) {
8051
8193
  if (params->ith != 0) {
8052
8194
  return;
8053
8195
  }
@@ -8059,7 +8201,7 @@ static void ggml_compute_forward_acc_f32(
8059
8201
  ggml_nbytes(dst));
8060
8202
  }
8061
8203
 
8062
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
8204
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
8063
8205
  return;
8064
8206
  }
8065
8207
 
@@ -8141,6 +8283,9 @@ static void ggml_compute_forward_acc(
8141
8283
  case GGML_TYPE_IQ3_XXS:
8142
8284
  case GGML_TYPE_IQ1_S:
8143
8285
  case GGML_TYPE_IQ4_NL:
8286
+ case GGML_TYPE_IQ4_XS:
8287
+ case GGML_TYPE_IQ3_S:
8288
+ case GGML_TYPE_IQ2_S:
8144
8289
  default:
8145
8290
  {
8146
8291
  GGML_ASSERT(false);
@@ -8160,7 +8305,7 @@ static void ggml_compute_forward_sub_f32(
8160
8305
  assert(params->ith == 0);
8161
8306
  assert(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));
8162
8307
 
8163
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
8308
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
8164
8309
  return;
8165
8310
  }
8166
8311
 
@@ -8241,14 +8386,14 @@ static void ggml_compute_forward_mul_f32(
8241
8386
 
8242
8387
  GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst));
8243
8388
 
8244
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
8389
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
8245
8390
  return;
8246
8391
  }
8247
8392
  const int ith = params->ith;
8248
8393
  const int nth = params->nth;
8249
8394
 
8250
8395
  #if defined(GGML_USE_CLBLAST)
8251
- if (src1->backend == GGML_BACKEND_GPU) {
8396
+ if (src1->backend == GGML_BACKEND_TYPE_GPU) {
8252
8397
  // TODO: OpenCL kernel support full broadcast
8253
8398
  GGML_ASSERT(ggml_can_repeat_rows(src1, src0));
8254
8399
  if (ith == 0) {
@@ -8349,7 +8494,7 @@ static void ggml_compute_forward_div_f32(
8349
8494
 
8350
8495
  GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst));
8351
8496
 
8352
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
8497
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
8353
8498
  return;
8354
8499
  }
8355
8500
 
@@ -8444,7 +8589,7 @@ static void ggml_compute_forward_sqr_f32(
8444
8589
  assert(params->ith == 0);
8445
8590
  assert(ggml_are_same_shape(src0, dst));
8446
8591
 
8447
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
8592
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
8448
8593
  return;
8449
8594
  }
8450
8595
 
@@ -8490,7 +8635,7 @@ static void ggml_compute_forward_sqrt_f32(
8490
8635
  assert(params->ith == 0);
8491
8636
  assert(ggml_are_same_shape(src0, dst));
8492
8637
 
8493
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
8638
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
8494
8639
  return;
8495
8640
  }
8496
8641
 
@@ -8536,7 +8681,7 @@ static void ggml_compute_forward_log_f32(
8536
8681
  GGML_ASSERT(params->ith == 0);
8537
8682
  GGML_ASSERT(ggml_are_same_shape(src0, dst));
8538
8683
 
8539
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
8684
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
8540
8685
  return;
8541
8686
  }
8542
8687
 
@@ -8582,7 +8727,7 @@ static void ggml_compute_forward_sum_f32(
8582
8727
  assert(params->ith == 0);
8583
8728
  assert(ggml_is_scalar(dst));
8584
8729
 
8585
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
8730
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
8586
8731
  return;
8587
8732
  }
8588
8733
 
@@ -8617,7 +8762,7 @@ static void ggml_compute_forward_sum_f16(
8617
8762
  assert(params->ith == 0);
8618
8763
  assert(ggml_is_scalar(dst));
8619
8764
 
8620
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
8765
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
8621
8766
  return;
8622
8767
  }
8623
8768
 
@@ -8674,7 +8819,7 @@ static void ggml_compute_forward_sum_rows_f32(
8674
8819
 
8675
8820
  GGML_ASSERT(params->ith == 0);
8676
8821
 
8677
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
8822
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
8678
8823
  return;
8679
8824
  }
8680
8825
 
@@ -8729,7 +8874,7 @@ static void ggml_compute_forward_mean_f32(
8729
8874
 
8730
8875
  assert(params->ith == 0);
8731
8876
 
8732
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
8877
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
8733
8878
  return;
8734
8879
  }
8735
8880
 
@@ -8788,7 +8933,7 @@ static void ggml_compute_forward_argmax_f32(
8788
8933
 
8789
8934
  assert(params->ith == 0);
8790
8935
 
8791
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
8936
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
8792
8937
  return;
8793
8938
  }
8794
8939
 
@@ -8839,7 +8984,7 @@ static void ggml_compute_forward_repeat_f32(
8839
8984
  GGML_ASSERT(params->ith == 0);
8840
8985
  GGML_ASSERT(ggml_can_repeat(src0, dst));
8841
8986
 
8842
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
8987
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
8843
8988
  return;
8844
8989
  }
8845
8990
 
@@ -8884,7 +9029,7 @@ static void ggml_compute_forward_repeat_f16(
8884
9029
  GGML_ASSERT(params->ith == 0);
8885
9030
  GGML_ASSERT(ggml_can_repeat(src0, dst));
8886
9031
 
8887
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
9032
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
8888
9033
  return;
8889
9034
  }
8890
9035
 
@@ -8958,7 +9103,7 @@ static void ggml_compute_forward_repeat_back_f32(
8958
9103
  GGML_ASSERT(params->ith == 0);
8959
9104
  GGML_ASSERT(ggml_can_repeat(dst, src0));
8960
9105
 
8961
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
9106
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
8962
9107
  return;
8963
9108
  }
8964
9109
 
@@ -9035,7 +9180,7 @@ static void ggml_compute_forward_concat_f32(
9035
9180
  const struct ggml_tensor * src0 = dst->src[0];
9036
9181
  const struct ggml_tensor * src1 = dst->src[1];
9037
9182
 
9038
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
9183
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
9039
9184
  return;
9040
9185
  }
9041
9186
 
@@ -9107,7 +9252,7 @@ static void ggml_compute_forward_abs_f32(
9107
9252
  assert(params->ith == 0);
9108
9253
  assert(ggml_are_same_shape(src0, dst));
9109
9254
 
9110
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
9255
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
9111
9256
  return;
9112
9257
  }
9113
9258
 
@@ -9153,7 +9298,7 @@ static void ggml_compute_forward_sgn_f32(
9153
9298
  assert(params->ith == 0);
9154
9299
  assert(ggml_are_same_shape(src0, dst));
9155
9300
 
9156
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
9301
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
9157
9302
  return;
9158
9303
  }
9159
9304
 
@@ -9199,7 +9344,7 @@ static void ggml_compute_forward_neg_f32(
9199
9344
  assert(params->ith == 0);
9200
9345
  assert(ggml_are_same_shape(src0, dst));
9201
9346
 
9202
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
9347
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
9203
9348
  return;
9204
9349
  }
9205
9350
 
@@ -9245,7 +9390,7 @@ static void ggml_compute_forward_step_f32(
9245
9390
  assert(params->ith == 0);
9246
9391
  assert(ggml_are_same_shape(src0, dst));
9247
9392
 
9248
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
9393
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
9249
9394
  return;
9250
9395
  }
9251
9396
 
@@ -9291,7 +9436,7 @@ static void ggml_compute_forward_tanh_f32(
9291
9436
  assert(params->ith == 0);
9292
9437
  assert(ggml_are_same_shape(src0, dst));
9293
9438
 
9294
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
9439
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
9295
9440
  return;
9296
9441
  }
9297
9442
 
@@ -9337,7 +9482,7 @@ static void ggml_compute_forward_elu_f32(
9337
9482
  assert(params->ith == 0);
9338
9483
  assert(ggml_are_same_shape(src0, dst));
9339
9484
 
9340
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
9485
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
9341
9486
  return;
9342
9487
  }
9343
9488
 
@@ -9383,7 +9528,7 @@ static void ggml_compute_forward_relu_f32(
9383
9528
  assert(params->ith == 0);
9384
9529
  assert(ggml_are_same_shape(src0, dst));
9385
9530
 
9386
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
9531
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
9387
9532
  return;
9388
9533
  }
9389
9534
 
@@ -9430,7 +9575,7 @@ static void ggml_compute_forward_gelu_f32(
9430
9575
  GGML_ASSERT(ggml_is_contiguous_except_dim_1(dst));
9431
9576
  GGML_ASSERT(ggml_are_same_shape(src0, dst));
9432
9577
 
9433
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
9578
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
9434
9579
  return;
9435
9580
  }
9436
9581
 
@@ -9493,7 +9638,7 @@ static void ggml_compute_forward_gelu_quick_f32(
9493
9638
  GGML_ASSERT(ggml_is_contiguous_except_dim_1(dst));
9494
9639
  GGML_ASSERT(ggml_are_same_shape(src0, dst));
9495
9640
 
9496
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
9641
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
9497
9642
  return;
9498
9643
  }
9499
9644
 
@@ -9556,7 +9701,7 @@ static void ggml_compute_forward_silu_f32(
9556
9701
  GGML_ASSERT(ggml_is_contiguous_except_dim_1(dst));
9557
9702
  GGML_ASSERT(ggml_are_same_shape(src0, dst));
9558
9703
 
9559
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
9704
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
9560
9705
  return;
9561
9706
  }
9562
9707
 
@@ -9617,7 +9762,7 @@ static void ggml_compute_forward_leaky_relu_f32(
9617
9762
  assert(params->ith == 0);
9618
9763
  assert(ggml_are_same_shape(src0, dst));
9619
9764
 
9620
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
9765
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
9621
9766
  return;
9622
9767
  }
9623
9768
 
@@ -9670,7 +9815,7 @@ static void ggml_compute_forward_silu_back_f32(
9670
9815
  GGML_ASSERT(ggml_are_same_shape(src0, dst));
9671
9816
  GGML_ASSERT(ggml_are_same_shape(src0, grad));
9672
9817
 
9673
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
9818
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
9674
9819
  return;
9675
9820
  }
9676
9821
 
@@ -9732,7 +9877,7 @@ static void ggml_compute_forward_hardswish_f32(
9732
9877
  assert(params->ith == 0);
9733
9878
  assert(ggml_are_same_shape(src0, dst));
9734
9879
 
9735
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
9880
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
9736
9881
  return;
9737
9882
  }
9738
9883
 
@@ -9775,7 +9920,7 @@ static void ggml_compute_forward_hardsigmoid_f32(
9775
9920
  assert(params->ith == 0);
9776
9921
  assert(ggml_are_same_shape(src0, dst));
9777
9922
 
9778
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
9923
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
9779
9924
  return;
9780
9925
  }
9781
9926
 
@@ -9821,7 +9966,7 @@ static void ggml_compute_forward_norm_f32(
9821
9966
 
9822
9967
  GGML_ASSERT(ggml_are_same_shape(src0, dst));
9823
9968
 
9824
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
9969
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
9825
9970
  return;
9826
9971
  }
9827
9972
 
@@ -9896,7 +10041,7 @@ static void ggml_compute_forward_rms_norm_f32(
9896
10041
 
9897
10042
  GGML_ASSERT(ggml_are_same_shape(src0, dst));
9898
10043
 
9899
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
10044
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
9900
10045
  return;
9901
10046
  }
9902
10047
 
@@ -9967,7 +10112,7 @@ static void ggml_compute_forward_rms_norm_back_f32(
9967
10112
 
9968
10113
  GGML_ASSERT(ggml_are_same_shape(src0, dst) && ggml_are_same_shape(src0, src1));
9969
10114
 
9970
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
10115
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
9971
10116
  return;
9972
10117
  }
9973
10118
 
@@ -10145,7 +10290,7 @@ static void ggml_compute_forward_group_norm_f32(
10145
10290
 
10146
10291
  GGML_ASSERT(ggml_are_same_shape(src0, dst));
10147
10292
 
10148
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
10293
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
10149
10294
  return;
10150
10295
  }
10151
10296
 
@@ -10163,7 +10308,7 @@ static void ggml_compute_forward_group_norm_f32(
10163
10308
  int n_channels = src0->ne[2];
10164
10309
  int n_groups = dst->op_params[0];
10165
10310
  int n_channels_per_group = (n_channels + n_groups - 1) / n_groups;
10166
- for (int i = ith; i < n_groups; i+=nth) {
10311
+ for (int i = ith; i < n_groups; i += nth) {
10167
10312
  int start = i * n_channels_per_group;
10168
10313
  int end = start + n_channels_per_group;
10169
10314
  if (end > n_channels) {
@@ -10177,28 +10322,32 @@ static void ggml_compute_forward_group_norm_f32(
10177
10322
  for (int64_t i01 = 0; i01 < ne01; i01++) {
10178
10323
  const float * x = (float *)((char *) src0->data + i01 * nb01 + i02 * nb02 + i03 * nb03);
10179
10324
 
10325
+ ggml_float sumr = 0.0;
10180
10326
  for (int64_t i00 = 0; i00 < ne00; i00++) {
10181
- sum += (ggml_float)x[i00];
10327
+ sumr += (ggml_float)x[i00];
10182
10328
  }
10329
+ sum += sumr;
10183
10330
  }
10184
10331
  }
10185
- float mean = sum / (ne00 * ne01 * step);
10186
- ggml_float sum2 = 0.0;
10332
+ const float mean = sum / (ne00 * ne01 * step);
10187
10333
 
10334
+ ggml_float sum2 = 0.0;
10188
10335
  for (int64_t i02 = start; i02 < end; i02++) {
10189
10336
  for (int64_t i01 = 0; i01 < ne01; i01++) {
10190
10337
  const float * x = (float *)((char *) src0->data + i01 * nb01 + i02 * nb02 + i03 * nb03);
10191
10338
 
10192
10339
  float * y = (float *)((char *) dst->data + i01 * nb1 + i02 * nb2 + i03 * nb3);
10193
10340
 
10341
+ ggml_float sumr = 0.0;
10194
10342
  for (int64_t i00 = 0; i00 < ne00; i00++) {
10195
10343
  float v = x[i00] - mean;
10196
10344
  y[i00] = v;
10197
- sum2 += (ggml_float)(v * v);
10345
+ sumr += (ggml_float)(v * v);
10198
10346
  }
10347
+ sum2 += sumr;
10199
10348
  }
10200
10349
  }
10201
- float variance = sum2 / (ne00 * ne01 * step);
10350
+ const float variance = sum2 / (ne00 * ne01 * step);
10202
10351
  const float scale = 1.0f / sqrtf(variance + eps);
10203
10352
 
10204
10353
  for (int64_t i02 = start; i02 < end; i02++) {
@@ -10312,7 +10461,7 @@ static void ggml_compute_forward_mul_mat(
10312
10461
 
10313
10462
  #if defined(GGML_USE_CLBLAST)
10314
10463
  if (ggml_cl_can_mul_mat(src0, src1, dst)) {
10315
- if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) {
10464
+ if (params->ith == 0 && params->type == GGML_TASK_TYPE_COMPUTE) {
10316
10465
  ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize);
10317
10466
  }
10318
10467
  return;
@@ -10325,7 +10474,7 @@ static void ggml_compute_forward_mul_mat(
10325
10474
  const size_t desired_wsize = ne13*ne12*ne_plane*sizeof(float);
10326
10475
  UNUSED(desired_wsize);
10327
10476
 
10328
- if (params->type == GGML_TASK_INIT) {
10477
+ if (params->type == GGML_TASK_TYPE_INIT) {
10329
10478
  if (type != GGML_TYPE_F32) {
10330
10479
  assert(params->wsize >= desired_wsize);
10331
10480
  // parallelize by src0 rows
@@ -10348,7 +10497,7 @@ static void ggml_compute_forward_mul_mat(
10348
10497
  return;
10349
10498
  }
10350
10499
 
10351
- if (params->type == GGML_TASK_FINALIZE) {
10500
+ if (params->type == GGML_TASK_TYPE_FINALIZE) {
10352
10501
  return;
10353
10502
  }
10354
10503
 
@@ -10386,7 +10535,7 @@ static void ggml_compute_forward_mul_mat(
10386
10535
  }
10387
10536
  #endif
10388
10537
 
10389
- if (params->type == GGML_TASK_INIT) {
10538
+ if (params->type == GGML_TASK_TYPE_INIT) {
10390
10539
  if (ith != 0) {
10391
10540
  return;
10392
10541
  }
@@ -10410,7 +10559,7 @@ static void ggml_compute_forward_mul_mat(
10410
10559
  return;
10411
10560
  }
10412
10561
 
10413
- if (params->type == GGML_TASK_FINALIZE) {
10562
+ if (params->type == GGML_TASK_TYPE_FINALIZE) {
10414
10563
  return;
10415
10564
  }
10416
10565
 
@@ -10567,7 +10716,7 @@ static void ggml_compute_forward_mul_mat_id(
10567
10716
 
10568
10717
  #define MMID_MATRIX_ROW(row_id, i1) matrix_rows[(row_id)*ne11 + (i1)]
10569
10718
 
10570
- if (params->type == GGML_TASK_INIT) {
10719
+ if (params->type == GGML_TASK_TYPE_INIT) {
10571
10720
  if (ith != 0) {
10572
10721
  return;
10573
10722
  }
@@ -10604,7 +10753,7 @@ static void ggml_compute_forward_mul_mat_id(
10604
10753
  return;
10605
10754
  }
10606
10755
 
10607
- if (params->type == GGML_TASK_FINALIZE) {
10756
+ if (params->type == GGML_TASK_TYPE_FINALIZE) {
10608
10757
  return;
10609
10758
  }
10610
10759
 
@@ -10752,7 +10901,7 @@ static void ggml_compute_forward_out_prod_f32(
10752
10901
  (ggml_is_contiguous(src1) || ggml_is_transposed(src1));
10753
10902
  #endif
10754
10903
 
10755
- if (params->type == GGML_TASK_INIT) {
10904
+ if (params->type == GGML_TASK_TYPE_INIT) {
10756
10905
  #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) // gemm beta will zero dst
10757
10906
  if (use_blas) {
10758
10907
  return;
@@ -10765,7 +10914,7 @@ static void ggml_compute_forward_out_prod_f32(
10765
10914
  return;
10766
10915
  }
10767
10916
 
10768
- if (params->type == GGML_TASK_FINALIZE) {
10917
+ if (params->type == GGML_TASK_TYPE_FINALIZE) {
10769
10918
  return;
10770
10919
  }
10771
10920
 
@@ -10945,7 +11094,7 @@ static void ggml_compute_forward_out_prod_q_f32(
10945
11094
  // TODO: #if defined(GGML_USE_CUBLAS) ggml_cuda_out_prod
10946
11095
  // TODO: #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
10947
11096
 
10948
- if (params->type == GGML_TASK_INIT) {
11097
+ if (params->type == GGML_TASK_TYPE_INIT) {
10949
11098
  if (ith != 0) {
10950
11099
  return;
10951
11100
  }
@@ -10953,7 +11102,7 @@ static void ggml_compute_forward_out_prod_q_f32(
10953
11102
  return;
10954
11103
  }
10955
11104
 
10956
- if (params->type == GGML_TASK_FINALIZE) {
11105
+ if (params->type == GGML_TASK_TYPE_FINALIZE) {
10957
11106
  return;
10958
11107
  }
10959
11108
 
@@ -11039,6 +11188,9 @@ static void ggml_compute_forward_out_prod(
11039
11188
  case GGML_TYPE_IQ3_XXS:
11040
11189
  case GGML_TYPE_IQ1_S:
11041
11190
  case GGML_TYPE_IQ4_NL:
11191
+ case GGML_TYPE_IQ4_XS:
11192
+ case GGML_TYPE_IQ3_S:
11193
+ case GGML_TYPE_IQ2_S:
11042
11194
  {
11043
11195
  ggml_compute_forward_out_prod_q_f32(params, dst);
11044
11196
  } break;
@@ -11070,7 +11222,7 @@ static void ggml_compute_forward_scale_f32(
11070
11222
  GGML_ASSERT(ggml_is_contiguous(dst));
11071
11223
  GGML_ASSERT(ggml_are_same_shape(src0, dst));
11072
11224
 
11073
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
11225
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
11074
11226
  return;
11075
11227
  }
11076
11228
 
@@ -11142,7 +11294,7 @@ static void ggml_compute_forward_set_f32(
11142
11294
  size_t offset = ((int32_t *) dst->op_params)[3];
11143
11295
  bool inplace = (bool) ((int32_t *) dst->op_params)[4];
11144
11296
 
11145
- if (!inplace && (params->type == GGML_TASK_INIT)) {
11297
+ if (!inplace && (params->type == GGML_TASK_TYPE_INIT)) {
11146
11298
  if (params->ith != 0) {
11147
11299
  return;
11148
11300
  }
@@ -11154,7 +11306,7 @@ static void ggml_compute_forward_set_f32(
11154
11306
  ggml_nbytes(dst));
11155
11307
  }
11156
11308
 
11157
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
11309
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
11158
11310
  return;
11159
11311
  }
11160
11312
 
@@ -11227,6 +11379,9 @@ static void ggml_compute_forward_set(
11227
11379
  case GGML_TYPE_IQ3_XXS:
11228
11380
  case GGML_TYPE_IQ1_S:
11229
11381
  case GGML_TYPE_IQ4_NL:
11382
+ case GGML_TYPE_IQ4_XS:
11383
+ case GGML_TYPE_IQ3_S:
11384
+ case GGML_TYPE_IQ2_S:
11230
11385
  default:
11231
11386
  {
11232
11387
  GGML_ASSERT(false);
@@ -11301,7 +11456,7 @@ static void ggml_compute_forward_get_rows_q(
11301
11456
 
11302
11457
  assert(params->ith == 0);
11303
11458
 
11304
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
11459
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
11305
11460
  return;
11306
11461
  }
11307
11462
 
@@ -11341,7 +11496,7 @@ static void ggml_compute_forward_get_rows_f16(
11341
11496
 
11342
11497
  assert(params->ith == 0);
11343
11498
 
11344
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
11499
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
11345
11500
  return;
11346
11501
  }
11347
11502
 
@@ -11378,7 +11533,7 @@ static void ggml_compute_forward_get_rows_f32(
11378
11533
 
11379
11534
  assert(params->ith == 0);
11380
11535
 
11381
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
11536
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
11382
11537
  return;
11383
11538
  }
11384
11539
 
@@ -11429,6 +11584,9 @@ static void ggml_compute_forward_get_rows(
11429
11584
  case GGML_TYPE_IQ3_XXS:
11430
11585
  case GGML_TYPE_IQ1_S:
11431
11586
  case GGML_TYPE_IQ4_NL:
11587
+ case GGML_TYPE_IQ4_XS:
11588
+ case GGML_TYPE_IQ3_S:
11589
+ case GGML_TYPE_IQ2_S:
11432
11590
  {
11433
11591
  ggml_compute_forward_get_rows_q(params, dst);
11434
11592
  } break;
@@ -11480,14 +11638,14 @@ static void ggml_compute_forward_get_rows_back_f32_f16(
11480
11638
 
11481
11639
  // ggml_compute_forward_dup_same_cont(params, opt0, dst);
11482
11640
 
11483
- if (params->type == GGML_TASK_INIT) {
11641
+ if (params->type == GGML_TASK_TYPE_INIT) {
11484
11642
  if (params->ith != 0) {
11485
11643
  return;
11486
11644
  }
11487
11645
  memset(dst->data, 0, ggml_nbytes(dst));
11488
11646
  }
11489
11647
 
11490
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
11648
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
11491
11649
  return;
11492
11650
  }
11493
11651
 
@@ -11519,14 +11677,14 @@ static void ggml_compute_forward_get_rows_back_f32(
11519
11677
 
11520
11678
  // ggml_compute_forward_dup_same_cont(params, opt0, dst);
11521
11679
 
11522
- if (params->type == GGML_TASK_INIT) {
11680
+ if (params->type == GGML_TASK_TYPE_INIT) {
11523
11681
  if (params->ith != 0) {
11524
11682
  return;
11525
11683
  }
11526
11684
  memset(dst->data, 0, ggml_nbytes(dst));
11527
11685
  }
11528
11686
 
11529
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
11687
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
11530
11688
  return;
11531
11689
  }
11532
11690
 
@@ -11596,7 +11754,7 @@ static void ggml_compute_forward_diag_f32(
11596
11754
 
11597
11755
  GGML_ASSERT(params->ith == 0);
11598
11756
 
11599
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
11757
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
11600
11758
  return;
11601
11759
  }
11602
11760
 
@@ -11665,7 +11823,7 @@ static void ggml_compute_forward_diag_mask_f32(
11665
11823
 
11666
11824
  GGML_ASSERT(n_past >= 0);
11667
11825
 
11668
- if (!inplace && (params->type == GGML_TASK_INIT)) {
11826
+ if (!inplace && (params->type == GGML_TASK_TYPE_INIT)) {
11669
11827
  if (ith != 0) {
11670
11828
  return;
11671
11829
  }
@@ -11679,7 +11837,7 @@ static void ggml_compute_forward_diag_mask_f32(
11679
11837
  ggml_nbytes(dst));
11680
11838
  }
11681
11839
 
11682
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
11840
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
11683
11841
  return;
11684
11842
  }
11685
11843
 
@@ -11753,7 +11911,7 @@ static void ggml_compute_forward_soft_max_f32(
11753
11911
  assert(ggml_is_contiguous(dst));
11754
11912
  assert(ggml_are_same_shape(src0, dst));
11755
11913
 
11756
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
11914
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
11757
11915
  return;
11758
11916
  }
11759
11917
 
@@ -11891,7 +12049,7 @@ static void ggml_compute_forward_soft_max_back_f32(
11891
12049
  GGML_ASSERT(ggml_are_same_shape(src0, dst));
11892
12050
  GGML_ASSERT(ggml_are_same_shape(src1, dst));
11893
12051
 
11894
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
12052
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
11895
12053
  return;
11896
12054
  }
11897
12055
 
@@ -11985,7 +12143,7 @@ static void ggml_compute_forward_alibi_f32(
11985
12143
 
11986
12144
  assert(params->ith == 0);
11987
12145
 
11988
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
12146
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
11989
12147
  return;
11990
12148
  }
11991
12149
 
@@ -12044,7 +12202,7 @@ static void ggml_compute_forward_alibi_f16(
12044
12202
 
12045
12203
  assert(params->ith == 0);
12046
12204
 
12047
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
12205
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
12048
12206
  return;
12049
12207
  }
12050
12208
 
@@ -12129,6 +12287,9 @@ static void ggml_compute_forward_alibi(
12129
12287
  case GGML_TYPE_IQ3_XXS:
12130
12288
  case GGML_TYPE_IQ1_S:
12131
12289
  case GGML_TYPE_IQ4_NL:
12290
+ case GGML_TYPE_IQ4_XS:
12291
+ case GGML_TYPE_IQ3_S:
12292
+ case GGML_TYPE_IQ2_S:
12132
12293
  case GGML_TYPE_Q8_K:
12133
12294
  case GGML_TYPE_I8:
12134
12295
  case GGML_TYPE_I16:
@@ -12150,7 +12311,7 @@ static void ggml_compute_forward_clamp_f32(
12150
12311
 
12151
12312
  assert(params->ith == 0);
12152
12313
 
12153
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
12314
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
12154
12315
  return;
12155
12316
  }
12156
12317
 
@@ -12212,6 +12373,9 @@ static void ggml_compute_forward_clamp(
12212
12373
  case GGML_TYPE_IQ3_XXS:
12213
12374
  case GGML_TYPE_IQ1_S:
12214
12375
  case GGML_TYPE_IQ4_NL:
12376
+ case GGML_TYPE_IQ4_XS:
12377
+ case GGML_TYPE_IQ3_S:
12378
+ case GGML_TYPE_IQ2_S:
12215
12379
  case GGML_TYPE_Q8_K:
12216
12380
  case GGML_TYPE_I8:
12217
12381
  case GGML_TYPE_I16:
@@ -12289,7 +12453,7 @@ static void ggml_compute_forward_rope_f32(
12289
12453
  const struct ggml_tensor * src0 = dst->src[0];
12290
12454
  const struct ggml_tensor * src1 = dst->src[1];
12291
12455
 
12292
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
12456
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
12293
12457
  return;
12294
12458
  }
12295
12459
 
@@ -12467,7 +12631,7 @@ static void ggml_compute_forward_rope_f16(
12467
12631
  const struct ggml_tensor * src0 = dst->src[0];
12468
12632
  const struct ggml_tensor * src1 = dst->src[1];
12469
12633
 
12470
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
12634
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
12471
12635
  return;
12472
12636
  }
12473
12637
 
@@ -12698,7 +12862,7 @@ static void ggml_compute_forward_conv_transpose_1d_f16_f32(
12698
12862
  GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
12699
12863
  GGML_ASSERT(nb10 == sizeof(float));
12700
12864
 
12701
- if (params->type == GGML_TASK_INIT) {
12865
+ if (params->type == GGML_TASK_TYPE_INIT) {
12702
12866
  if (ith != 0) {
12703
12867
  return;
12704
12868
  }
@@ -12738,7 +12902,7 @@ static void ggml_compute_forward_conv_transpose_1d_f16_f32(
12738
12902
  return;
12739
12903
  }
12740
12904
 
12741
- if (params->type == GGML_TASK_FINALIZE) {
12905
+ if (params->type == GGML_TASK_TYPE_FINALIZE) {
12742
12906
  return;
12743
12907
  }
12744
12908
 
@@ -12797,7 +12961,7 @@ static void ggml_compute_forward_conv_transpose_1d_f32(
12797
12961
  GGML_ASSERT(nb00 == sizeof(float));
12798
12962
  GGML_ASSERT(nb10 == sizeof(float));
12799
12963
 
12800
- if (params->type == GGML_TASK_INIT) {
12964
+ if (params->type == GGML_TASK_TYPE_INIT) {
12801
12965
  if (ith != 0) {
12802
12966
  return;
12803
12967
  }
@@ -12837,7 +13001,7 @@ static void ggml_compute_forward_conv_transpose_1d_f32(
12837
13001
  return;
12838
13002
  }
12839
13003
 
12840
- if (params->type == GGML_TASK_FINALIZE) {
13004
+ if (params->type == GGML_TASK_TYPE_FINALIZE) {
12841
13005
  return;
12842
13006
  }
12843
13007
 
@@ -12941,11 +13105,11 @@ static void ggml_compute_forward_im2col_f32(
12941
13105
  GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
12942
13106
  GGML_ASSERT(nb10 == sizeof(float));
12943
13107
 
12944
- if (params->type == GGML_TASK_INIT) {
13108
+ if (params->type == GGML_TASK_TYPE_INIT) {
12945
13109
  return;
12946
13110
  }
12947
13111
 
12948
- if (params->type == GGML_TASK_FINALIZE) {
13112
+ if (params->type == GGML_TASK_TYPE_FINALIZE) {
12949
13113
  return;
12950
13114
  }
12951
13115
 
@@ -13029,11 +13193,11 @@ static void ggml_compute_forward_im2col_f16(
13029
13193
  GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
13030
13194
  GGML_ASSERT(nb10 == sizeof(float));
13031
13195
 
13032
- if (params->type == GGML_TASK_INIT) {
13196
+ if (params->type == GGML_TASK_TYPE_INIT) {
13033
13197
  return;
13034
13198
  }
13035
13199
 
13036
- if (params->type == GGML_TASK_FINALIZE) {
13200
+ if (params->type == GGML_TASK_TYPE_FINALIZE) {
13037
13201
  return;
13038
13202
  }
13039
13203
 
@@ -13115,7 +13279,7 @@ static void ggml_compute_forward_conv_transpose_2d(
13115
13279
  GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
13116
13280
  GGML_ASSERT(nb10 == sizeof(float));
13117
13281
 
13118
- if (params->type == GGML_TASK_INIT) {
13282
+ if (params->type == GGML_TASK_TYPE_INIT) {
13119
13283
  if (ith != 0) {
13120
13284
  return;
13121
13285
  }
@@ -13157,7 +13321,7 @@ static void ggml_compute_forward_conv_transpose_2d(
13157
13321
  return;
13158
13322
  }
13159
13323
 
13160
- if (params->type == GGML_TASK_FINALIZE) {
13324
+ if (params->type == GGML_TASK_TYPE_FINALIZE) {
13161
13325
  return;
13162
13326
  }
13163
13327
 
@@ -13209,7 +13373,7 @@ static void ggml_compute_forward_pool_1d_sk_p0(
13209
13373
  assert(src->type == GGML_TYPE_F32);
13210
13374
  assert(params->ith == 0);
13211
13375
 
13212
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
13376
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
13213
13377
  return;
13214
13378
  }
13215
13379
 
@@ -13278,7 +13442,7 @@ static void ggml_compute_forward_pool_2d(
13278
13442
  GGML_ASSERT(src->type == GGML_TYPE_F32);
13279
13443
  GGML_ASSERT(params->ith == 0);
13280
13444
 
13281
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
13445
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
13282
13446
  return;
13283
13447
  }
13284
13448
 
@@ -13351,7 +13515,7 @@ static void ggml_compute_forward_upscale_f32(
13351
13515
 
13352
13516
  const struct ggml_tensor * src0 = dst->src[0];
13353
13517
 
13354
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
13518
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
13355
13519
  return;
13356
13520
  }
13357
13521
 
@@ -13411,7 +13575,7 @@ static void ggml_compute_forward_pad_f32(
13411
13575
 
13412
13576
  const struct ggml_tensor * src0 = dst->src[0];
13413
13577
 
13414
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
13578
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
13415
13579
  return;
13416
13580
  }
13417
13581
 
@@ -13464,6 +13628,106 @@ static void ggml_compute_forward_pad(
13464
13628
  }
13465
13629
  }
13466
13630
 
13631
+
13632
+ // ggml_compute_forward_arange
13633
+
13634
+ static void ggml_compute_forward_arange_f32(
13635
+ const struct ggml_compute_params * params,
13636
+ struct ggml_tensor * dst) {
13637
+
13638
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
13639
+ return;
13640
+ }
13641
+
13642
+ GGML_ASSERT(dst->nb[0] == sizeof(float));
13643
+
13644
+ const int ith = params->ith;
13645
+ const int nth = params->nth;
13646
+
13647
+ const float start = ggml_get_op_params_f32(dst, 0);
13648
+ const float stop = ggml_get_op_params_f32(dst, 1);
13649
+ const float step = ggml_get_op_params_f32(dst, 2);
13650
+
13651
+ const int64_t steps = (int64_t) ceilf((stop - start) / step);
13652
+
13653
+ GGML_ASSERT(ggml_nelements(dst) == steps);
13654
+
13655
+ for (int64_t i = ith; i < steps; i+= nth) {
13656
+ float value = start + step * i;
13657
+ ((float *)dst->data)[i] = value;
13658
+ }
13659
+ }
13660
+
13661
+ static void ggml_compute_forward_arange(
13662
+ const struct ggml_compute_params * params,
13663
+ struct ggml_tensor * dst) {
13664
+ switch (dst->type) {
13665
+ case GGML_TYPE_F32:
13666
+ {
13667
+ ggml_compute_forward_arange_f32(params, dst);
13668
+ } break;
13669
+ default:
13670
+ {
13671
+ GGML_ASSERT(false);
13672
+ } break;
13673
+ }
13674
+ }
13675
+
13676
+ static void ggml_compute_forward_timestep_embedding_f32(
13677
+ const struct ggml_compute_params * params,
13678
+ struct ggml_tensor * dst) {
13679
+
13680
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
13681
+ return;
13682
+ }
13683
+
13684
+ const struct ggml_tensor * src0 = dst->src[0];
13685
+
13686
+ GGML_ASSERT(src0->nb[0] == sizeof(float));
13687
+
13688
+ const int ith = params->ith;
13689
+ const int nth = params->nth;
13690
+
13691
+ GGML_TENSOR_UNARY_OP_LOCALS
13692
+
13693
+ const int dim = ggml_get_op_params_i32(dst, 0);
13694
+ const int max_period = ggml_get_op_params_i32(dst, 1);
13695
+
13696
+ int half = dim / 2;
13697
+
13698
+ for (int64_t i = 0; i < ne00; i++) {
13699
+ float * embed_data = (float *)((char *) dst->data + i*nb1);
13700
+ for (int64_t j = ith; j < half; j += nth) {
13701
+ float timestep = ((float *)src0->data)[i];
13702
+ float freq = (float)expf(-logf(max_period) * j / half);
13703
+ float arg = timestep * freq;
13704
+ embed_data[j] = cosf(arg);
13705
+ embed_data[j + half] = sinf(arg);
13706
+ }
13707
+ if (dim % 2 != 0 && ith == 0) {
13708
+ embed_data[dim] = 0.f;
13709
+ }
13710
+ }
13711
+ }
13712
+
13713
+ static void ggml_compute_forward_timestep_embedding(
13714
+ const struct ggml_compute_params * params,
13715
+ struct ggml_tensor * dst) {
13716
+
13717
+ const struct ggml_tensor * src0 = dst->src[0];
13718
+
13719
+ switch (src0->type) {
13720
+ case GGML_TYPE_F32:
13721
+ {
13722
+ ggml_compute_forward_timestep_embedding_f32(params, dst);
13723
+ } break;
13724
+ default:
13725
+ {
13726
+ GGML_ASSERT(false);
13727
+ } break;
13728
+ }
13729
+ }
13730
+
13467
13731
  // ggml_compute_forward_argsort
13468
13732
 
13469
13733
  static void ggml_compute_forward_argsort_f32(
@@ -13472,7 +13736,7 @@ static void ggml_compute_forward_argsort_f32(
13472
13736
 
13473
13737
  const struct ggml_tensor * src0 = dst->src[0];
13474
13738
 
13475
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
13739
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
13476
13740
  return;
13477
13741
  }
13478
13742
 
@@ -13498,8 +13762,8 @@ static void ggml_compute_forward_argsort_f32(
13498
13762
  // C doesn't have a functional sort, so we do a bubble sort instead
13499
13763
  for (int64_t j = 0; j < ne0; j++) {
13500
13764
  for (int64_t k = j + 1; k < ne0; k++) {
13501
- if ((order == GGML_SORT_ASC && src_data[dst_data[j]] > src_data[dst_data[k]]) ||
13502
- (order == GGML_SORT_DESC && src_data[dst_data[j]] < src_data[dst_data[k]])) {
13765
+ if ((order == GGML_SORT_ORDER_ASC && src_data[dst_data[j]] > src_data[dst_data[k]]) ||
13766
+ (order == GGML_SORT_ORDER_DESC && src_data[dst_data[j]] < src_data[dst_data[k]])) {
13503
13767
  int32_t tmp = dst_data[j];
13504
13768
  dst_data[j] = dst_data[k];
13505
13769
  dst_data[k] = tmp;
@@ -13582,11 +13846,11 @@ static void ggml_compute_forward_flash_attn_f32(
13582
13846
  GGML_ASSERT(nb1 <= nb2);
13583
13847
  GGML_ASSERT(nb2 <= nb3);
13584
13848
 
13585
- if (params->type == GGML_TASK_INIT) {
13849
+ if (params->type == GGML_TASK_TYPE_INIT) {
13586
13850
  return;
13587
13851
  }
13588
13852
 
13589
- if (params->type == GGML_TASK_FINALIZE) {
13853
+ if (params->type == GGML_TASK_TYPE_FINALIZE) {
13590
13854
  return;
13591
13855
  }
13592
13856
 
@@ -13774,11 +14038,11 @@ static void ggml_compute_forward_flash_attn_f16(
13774
14038
  GGML_ASSERT(nb1 <= nb2);
13775
14039
  GGML_ASSERT(nb2 <= nb3);
13776
14040
 
13777
- if (params->type == GGML_TASK_INIT) {
14041
+ if (params->type == GGML_TASK_TYPE_INIT) {
13778
14042
  return;
13779
14043
  }
13780
14044
 
13781
- if (params->type == GGML_TASK_FINALIZE) {
14045
+ if (params->type == GGML_TASK_TYPE_FINALIZE) {
13782
14046
  return;
13783
14047
  }
13784
14048
 
@@ -14033,11 +14297,11 @@ static void ggml_compute_forward_flash_ff_f16(
14033
14297
  GGML_ASSERT(nb1 <= nb2);
14034
14298
  GGML_ASSERT(nb2 <= nb3);
14035
14299
 
14036
- if (params->type == GGML_TASK_INIT) {
14300
+ if (params->type == GGML_TASK_TYPE_INIT) {
14037
14301
  return;
14038
14302
  }
14039
14303
 
14040
- if (params->type == GGML_TASK_FINALIZE) {
14304
+ if (params->type == GGML_TASK_TYPE_FINALIZE) {
14041
14305
  return;
14042
14306
  }
14043
14307
 
@@ -14192,14 +14456,14 @@ static void ggml_compute_forward_flash_attn_back_f32(
14192
14456
  GGML_ASSERT(nb1 <= nb2);
14193
14457
  GGML_ASSERT(nb2 <= nb3);
14194
14458
 
14195
- if (params->type == GGML_TASK_INIT) {
14459
+ if (params->type == GGML_TASK_TYPE_INIT) {
14196
14460
  if (ith == 0) {
14197
14461
  memset(dst->data, 0, nb0*ne0*ne1*ne2*ne3);
14198
14462
  }
14199
14463
  return;
14200
14464
  }
14201
14465
 
14202
- if (params->type == GGML_TASK_FINALIZE) {
14466
+ if (params->type == GGML_TASK_TYPE_FINALIZE) {
14203
14467
  return;
14204
14468
  }
14205
14469
 
@@ -14515,7 +14779,7 @@ static void ggml_compute_forward_win_part_f32(
14515
14779
 
14516
14780
  const struct ggml_tensor * src0 = dst->src[0];
14517
14781
 
14518
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
14782
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
14519
14783
  return;
14520
14784
  }
14521
14785
 
@@ -14581,7 +14845,7 @@ static void ggml_compute_forward_win_unpart_f32(
14581
14845
 
14582
14846
  const struct ggml_tensor * src0 = dst->src[0];
14583
14847
 
14584
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
14848
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
14585
14849
  return;
14586
14850
  }
14587
14851
 
@@ -14709,7 +14973,7 @@ static void ggml_compute_forward_get_rel_pos_f16(
14709
14973
 
14710
14974
  const struct ggml_tensor * src0 = dst->src[0];
14711
14975
 
14712
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
14976
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
14713
14977
  return;
14714
14978
  }
14715
14979
 
@@ -14761,14 +15025,14 @@ static void ggml_compute_forward_add_rel_pos_f32(
14761
15025
  const struct ggml_tensor * src2 = dst->src[2];
14762
15026
 
14763
15027
  const bool inplace = (bool) ((int32_t *) dst->op_params)[0];
14764
- if (!inplace && params->type == GGML_TASK_INIT) {
15028
+ if (!inplace && params->type == GGML_TASK_TYPE_INIT) {
14765
15029
  if (params->ith != 0) {
14766
15030
  return;
14767
15031
  }
14768
15032
  memcpy((char *) dst->data, (char *) src0->data, ggml_nbytes(dst));
14769
15033
  return;
14770
15034
  }
14771
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
15035
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
14772
15036
  return;
14773
15037
  }
14774
15038
 
@@ -14850,7 +15114,7 @@ static void ggml_compute_forward_map_unary_f32(
14850
15114
 
14851
15115
  GGML_ASSERT(ggml_are_same_shape(src0, dst));
14852
15116
 
14853
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
15117
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
14854
15118
  return;
14855
15119
  }
14856
15120
 
@@ -14899,7 +15163,7 @@ static void ggml_compute_forward_map_binary_f32(
14899
15163
  assert(params->ith == 0);
14900
15164
  assert(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));
14901
15165
 
14902
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
15166
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
14903
15167
  return;
14904
15168
  }
14905
15169
 
@@ -14948,7 +15212,7 @@ static void ggml_compute_forward_map_custom1_f32(
14948
15212
 
14949
15213
  assert(params->ith == 0);
14950
15214
 
14951
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
15215
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
14952
15216
  return;
14953
15217
  }
14954
15218
 
@@ -14967,7 +15231,7 @@ static void ggml_compute_forward_map_custom2_f32(
14967
15231
 
14968
15232
  assert(params->ith == 0);
14969
15233
 
14970
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
15234
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
14971
15235
  return;
14972
15236
  }
14973
15237
 
@@ -14987,7 +15251,7 @@ static void ggml_compute_forward_map_custom3_f32(
14987
15251
 
14988
15252
  assert(params->ith == 0);
14989
15253
 
14990
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
15254
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
14991
15255
  return;
14992
15256
  }
14993
15257
 
@@ -15002,13 +15266,14 @@ static void ggml_compute_forward_map_custom1(
15002
15266
 
15003
15267
  const struct ggml_tensor * a = dst->src[0];
15004
15268
 
15005
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
15269
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
15006
15270
  return;
15007
15271
  }
15008
15272
 
15009
- struct ggml_map_custom1_op_params * p = (struct ggml_map_custom1_op_params *) dst->op_params;
15273
+ struct ggml_map_custom1_op_params p;
15274
+ memcpy(&p, dst->op_params, sizeof(p));
15010
15275
 
15011
- p->fun(dst, a, params->ith, params->nth, p->userdata);
15276
+ p.fun(dst, a, params->ith, params->nth, p.userdata);
15012
15277
  }
15013
15278
 
15014
15279
  // ggml_compute_forward_map_custom2
@@ -15020,13 +15285,14 @@ static void ggml_compute_forward_map_custom2(
15020
15285
  const struct ggml_tensor * a = dst->src[0];
15021
15286
  const struct ggml_tensor * b = dst->src[1];
15022
15287
 
15023
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
15288
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
15024
15289
  return;
15025
15290
  }
15026
15291
 
15027
- struct ggml_map_custom2_op_params * p = (struct ggml_map_custom2_op_params *) dst->op_params;
15292
+ struct ggml_map_custom2_op_params p;
15293
+ memcpy(&p, dst->op_params, sizeof(p));
15028
15294
 
15029
- p->fun(dst, a, b, params->ith, params->nth, p->userdata);
15295
+ p.fun(dst, a, b, params->ith, params->nth, p.userdata);
15030
15296
  }
15031
15297
 
15032
15298
  // ggml_compute_forward_map_custom3
@@ -15039,13 +15305,14 @@ static void ggml_compute_forward_map_custom3(
15039
15305
  const struct ggml_tensor * b = dst->src[1];
15040
15306
  const struct ggml_tensor * c = dst->src[2];
15041
15307
 
15042
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
15308
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
15043
15309
  return;
15044
15310
  }
15045
15311
 
15046
- struct ggml_map_custom3_op_params * p = (struct ggml_map_custom3_op_params *) dst->op_params;
15312
+ struct ggml_map_custom3_op_params p;
15313
+ memcpy(&p, dst->op_params, sizeof(p));
15047
15314
 
15048
- p->fun(dst, a, b, c, params->ith, params->nth, p->userdata);
15315
+ p.fun(dst, a, b, c, params->ith, params->nth, p.userdata);
15049
15316
  }
15050
15317
 
15051
15318
  // ggml_compute_forward_cross_entropy_loss
@@ -15073,14 +15340,14 @@ static void ggml_compute_forward_cross_entropy_loss_f32(
15073
15340
 
15074
15341
  GGML_ASSERT(params->wsize >= sizeof(float) * (nth + nth * nc));
15075
15342
 
15076
- if (params->type == GGML_TASK_INIT) {
15343
+ if (params->type == GGML_TASK_TYPE_INIT) {
15077
15344
  if (ith == 0) {
15078
15345
  memset(sums, 0, sizeof(float) * (nth + nth * nc));
15079
15346
  }
15080
15347
  return;
15081
15348
  }
15082
15349
 
15083
- if (params->type == GGML_TASK_FINALIZE) {
15350
+ if (params->type == GGML_TASK_TYPE_FINALIZE) {
15084
15351
  if (ith == 0) {
15085
15352
  float * dp = (float *) dst->data;
15086
15353
  ggml_vec_sum_f32(nth, dp, sums);
@@ -15195,7 +15462,7 @@ static void ggml_compute_forward_cross_entropy_loss_back_f32(
15195
15462
  const int64_t ith = params->ith;
15196
15463
  const int64_t nth = params->nth;
15197
15464
 
15198
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
15465
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
15199
15466
  return;
15200
15467
  }
15201
15468
 
@@ -15302,8 +15569,8 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
15302
15569
  if (skip_cpu) {
15303
15570
  return;
15304
15571
  }
15305
- GGML_ASSERT(tensor->src[0] == NULL || tensor->src[0]->backend == GGML_BACKEND_CPU);
15306
- GGML_ASSERT(tensor->src[1] == NULL || tensor->src[1]->backend == GGML_BACKEND_CPU);
15572
+ GGML_ASSERT(tensor->src[0] == NULL || tensor->src[0]->backend == GGML_BACKEND_TYPE_CPU);
15573
+ GGML_ASSERT(tensor->src[1] == NULL || tensor->src[1]->backend == GGML_BACKEND_TYPE_CPU);
15307
15574
  #elif defined(GGML_USE_VULKAN)
15308
15575
  const bool skip_cpu = ggml_vk_compute_forward_cpu_assist(params, tensor);
15309
15576
  #ifdef GGML_VULKAN_CHECK_RESULTS
@@ -15314,8 +15581,8 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
15314
15581
  if (skip_cpu) {
15315
15582
  return;
15316
15583
  }
15317
- GGML_ASSERT(tensor->src[0] == NULL || tensor->src[0]->backend == GGML_BACKEND_CPU);
15318
- GGML_ASSERT(tensor->src[1] == NULL || tensor->src[1]->backend == GGML_BACKEND_CPU);
15584
+ GGML_ASSERT(tensor->src[0] == NULL || tensor->src[0]->backend == GGML_BACKEND_TYPE_CPU);
15585
+ GGML_ASSERT(tensor->src[1] == NULL || tensor->src[1]->backend == GGML_BACKEND_TYPE_CPU);
15319
15586
  #endif // GGML_USE_CUBLAS
15320
15587
 
15321
15588
  #ifdef GGML_USE_SYCL
@@ -15529,6 +15796,14 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
15529
15796
  {
15530
15797
  ggml_compute_forward_pad(params, tensor);
15531
15798
  } break;
15799
+ case GGML_OP_ARANGE:
15800
+ {
15801
+ ggml_compute_forward_arange(params, tensor);
15802
+ } break;
15803
+ case GGML_OP_TIMESTEP_EMBEDDING:
15804
+ {
15805
+ ggml_compute_forward_timestep_embedding(params, tensor);
15806
+ } break;
15532
15807
  case GGML_OP_ARGSORT:
15533
15808
  {
15534
15809
  ggml_compute_forward_argsort(params, tensor);
@@ -16531,6 +16806,14 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
16531
16806
  {
16532
16807
  GGML_ASSERT(false); // TODO: not implemented
16533
16808
  } break;
16809
+ case GGML_OP_ARANGE:
16810
+ {
16811
+ GGML_ASSERT(false); // TODO: not implemented
16812
+ } break;
16813
+ case GGML_OP_TIMESTEP_EMBEDDING:
16814
+ {
16815
+ GGML_ASSERT(false); // TODO: not implemented
16816
+ } break;
16534
16817
  case GGML_OP_ARGSORT:
16535
16818
  {
16536
16819
  GGML_ASSERT(false); // TODO: not implemented
@@ -16861,7 +17144,7 @@ size_t ggml_graph_overhead(void) {
16861
17144
 
16862
17145
  struct ggml_cgraph * ggml_new_graph_custom(struct ggml_context * ctx, size_t size, bool grads) {
16863
17146
  const size_t obj_size = ggml_graph_nbytes(size, grads);
16864
- struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_GRAPH, obj_size);
17147
+ struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_TYPE_GRAPH, obj_size);
16865
17148
  struct ggml_cgraph * cgraph = (struct ggml_cgraph *) ((char *) ctx->mem_buffer + obj->offs);
16866
17149
 
16867
17150
  struct ggml_tensor ** data_start = (struct ggml_tensor **) (cgraph + 1);
@@ -17131,6 +17414,7 @@ struct ggml_compute_state {
17131
17414
  ggml_thread_t thrd;
17132
17415
  int ith;
17133
17416
  struct ggml_compute_state_shared * shared;
17417
+ enum ggml_status ec;
17134
17418
  };
17135
17419
 
17136
17420
  static void ggml_graph_compute_perf_stats_node(struct ggml_tensor * node, const struct ggml_compute_state_shared * st) {
@@ -17282,6 +17566,14 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
17282
17566
  {
17283
17567
  n_tasks = n_threads;
17284
17568
  } break;
17569
+ case GGML_OP_ARANGE:
17570
+ {
17571
+ n_tasks = n_threads;
17572
+ } break;
17573
+ case GGML_OP_TIMESTEP_EMBEDDING:
17574
+ {
17575
+ n_tasks = n_threads;
17576
+ } break;
17285
17577
  case GGML_OP_ARGSORT:
17286
17578
  {
17287
17579
  n_tasks = n_threads;
@@ -17311,29 +17603,32 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
17311
17603
  } break;
17312
17604
  case GGML_OP_MAP_CUSTOM1:
17313
17605
  {
17314
- struct ggml_map_custom1_op_params * p = (struct ggml_map_custom1_op_params *) node->op_params;
17315
- if (p->n_tasks == GGML_N_TASKS_MAX) {
17606
+ struct ggml_map_custom1_op_params p;
17607
+ memcpy(&p, node->op_params, sizeof(p));
17608
+ if (p.n_tasks == GGML_N_TASKS_MAX) {
17316
17609
  n_tasks = n_threads;
17317
17610
  } else {
17318
- n_tasks = MIN(p->n_tasks, n_threads);
17611
+ n_tasks = MIN(p.n_tasks, n_threads);
17319
17612
  }
17320
17613
  } break;
17321
17614
  case GGML_OP_MAP_CUSTOM2:
17322
17615
  {
17323
- struct ggml_map_custom2_op_params * p = (struct ggml_map_custom2_op_params *) node->op_params;
17324
- if (p->n_tasks == GGML_N_TASKS_MAX) {
17616
+ struct ggml_map_custom2_op_params p;
17617
+ memcpy(&p, node->op_params, sizeof(p));
17618
+ if (p.n_tasks == GGML_N_TASKS_MAX) {
17325
17619
  n_tasks = n_threads;
17326
17620
  } else {
17327
- n_tasks = MIN(p->n_tasks, n_threads);
17621
+ n_tasks = MIN(p.n_tasks, n_threads);
17328
17622
  }
17329
17623
  } break;
17330
17624
  case GGML_OP_MAP_CUSTOM3:
17331
17625
  {
17332
- struct ggml_map_custom3_op_params * p = (struct ggml_map_custom3_op_params *) node->op_params;
17333
- if (p->n_tasks == GGML_N_TASKS_MAX) {
17626
+ struct ggml_map_custom3_op_params p;
17627
+ memcpy(&p, node->op_params, sizeof(p));
17628
+ if (p.n_tasks == GGML_N_TASKS_MAX) {
17334
17629
  n_tasks = n_threads;
17335
17630
  } else {
17336
- n_tasks = MIN(p->n_tasks, n_threads);
17631
+ n_tasks = MIN(p.n_tasks, n_threads);
17337
17632
  }
17338
17633
  } break;
17339
17634
  case GGML_OP_CROSS_ENTROPY_LOSS:
@@ -17408,19 +17703,20 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
17408
17703
  set_numa_thread_affinity(state->ith);
17409
17704
 
17410
17705
  int node_n = -1;
17411
- int task_phase = GGML_TASK_FINALIZE;
17706
+ int task_phase = GGML_TASK_TYPE_FINALIZE;
17412
17707
 
17413
17708
  while (true) {
17414
17709
  if (cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) {
17415
17710
  state->shared->node_n += 1;
17416
- return (thread_ret_t) GGML_EXIT_ABORTED;
17711
+ state->ec = GGML_STATUS_ABORTED;
17712
+ return 0;
17417
17713
  }
17418
17714
 
17419
17715
  if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) {
17420
17716
  // all other threads are finished and spinning
17421
17717
  // do finalize and init here so we don't have synchronize again
17422
17718
  struct ggml_compute_params params = {
17423
- /*.type =*/ GGML_TASK_FINALIZE,
17719
+ /*.type =*/ GGML_TASK_TYPE_FINALIZE,
17424
17720
  /*.ith =*/ 0,
17425
17721
  /*.nth =*/ 0,
17426
17722
  /*.wsize =*/ cplan->work_size,
@@ -17451,17 +17747,17 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
17451
17747
  if (n_tasks == 1) {
17452
17748
  /* INIT */
17453
17749
  if (GGML_OP_HAS_INIT[node->op]) {
17454
- params.type = GGML_TASK_INIT;
17750
+ params.type = GGML_TASK_TYPE_INIT;
17455
17751
  ggml_compute_forward(&params, node);
17456
17752
  }
17457
17753
 
17458
17754
  // TODO: maybe push node_n to the atomic but if other threads see n_tasks is 1,
17459
17755
  // they do something more efficient than spinning (?)
17460
- params.type = GGML_TASK_COMPUTE;
17756
+ params.type = GGML_TASK_TYPE_COMPUTE;
17461
17757
  ggml_compute_forward(&params, node);
17462
17758
 
17463
17759
  if (GGML_OP_HAS_FINALIZE[node->op]) {
17464
- params.type = GGML_TASK_FINALIZE;
17760
+ params.type = GGML_TASK_TYPE_FINALIZE;
17465
17761
  ggml_compute_forward(&params, node);
17466
17762
  }
17467
17763
 
@@ -17475,7 +17771,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
17475
17771
  }
17476
17772
  }
17477
17773
 
17478
- task_phase = GGML_TASK_INIT;
17774
+ task_phase = GGML_TASK_TYPE_INIT;
17479
17775
  atomic_store(&state->shared->n_active, n_threads);
17480
17776
  atomic_store(&state->shared->node_n, node_n);
17481
17777
  atomic_store(&state->shared->node_task, task_phase);
@@ -17492,7 +17788,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
17492
17788
  const int n_tasks = ggml_get_n_tasks(node, n_threads);
17493
17789
 
17494
17790
  struct ggml_compute_params params = {
17495
- /*.type =*/ GGML_TASK_INIT,
17791
+ /*.type =*/ GGML_TASK_TYPE_INIT,
17496
17792
  /*.ith =*/ state->ith,
17497
17793
  /*.nth =*/ n_tasks,
17498
17794
  /*.wsize =*/ cplan->work_size,
@@ -17506,7 +17802,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
17506
17802
  }
17507
17803
 
17508
17804
  if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) {
17509
- task_phase = GGML_TASK_COMPUTE;
17805
+ task_phase = GGML_TASK_TYPE_COMPUTE;
17510
17806
  atomic_store(&state->shared->n_active, n_threads);
17511
17807
  atomic_store(&state->shared->node_task, task_phase);
17512
17808
  }
@@ -17521,12 +17817,12 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
17521
17817
  }
17522
17818
 
17523
17819
  if (state->ith < n_tasks) {
17524
- params.type = GGML_TASK_COMPUTE;
17820
+ params.type = GGML_TASK_TYPE_COMPUTE;
17525
17821
  ggml_compute_forward(&params, node);
17526
17822
  }
17527
17823
 
17528
17824
  if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) {
17529
- task_phase = GGML_TASK_FINALIZE;
17825
+ task_phase = GGML_TASK_TYPE_FINALIZE;
17530
17826
  atomic_store(&state->shared->n_active, n_threads);
17531
17827
  atomic_store(&state->shared->node_task, task_phase);
17532
17828
  }
@@ -17535,7 +17831,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
17535
17831
  }
17536
17832
  }
17537
17833
 
17538
- return GGML_EXIT_SUCCESS;
17834
+ return 0;
17539
17835
  }
17540
17836
 
17541
17837
  struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threads) {
@@ -17731,7 +18027,7 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
17731
18027
  return cplan;
17732
18028
  }
17733
18029
 
17734
- int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
18030
+ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
17735
18031
  {
17736
18032
  GGML_ASSERT(cplan);
17737
18033
  GGML_ASSERT(cplan->n_threads > 0);
@@ -17762,7 +18058,7 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
17762
18058
  /*.n_threads =*/ n_threads,
17763
18059
  /*.n_active =*/ n_threads,
17764
18060
  /*.node_n =*/ -1,
17765
- /*.node_task =*/ GGML_TASK_FINALIZE,
18061
+ /*.node_task =*/ GGML_TASK_TYPE_FINALIZE,
17766
18062
  /*.abort_callback =*/ NULL,
17767
18063
  /*.abort_callback_data =*/ NULL,
17768
18064
  };
@@ -17775,6 +18071,7 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
17775
18071
  .thrd = 0,
17776
18072
  .ith = j,
17777
18073
  .shared = &state_shared,
18074
+ .ec = GGML_STATUS_SUCCESS,
17778
18075
  };
17779
18076
 
17780
18077
  const int rc = ggml_thread_create(&workers[j].thrd, NULL, ggml_graph_compute_thread, &workers[j]);
@@ -17785,12 +18082,14 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
17785
18082
 
17786
18083
  workers[0].ith = 0;
17787
18084
  workers[0].shared = &state_shared;
18085
+ workers[0].ec = GGML_STATUS_SUCCESS;
17788
18086
 
17789
18087
  const int64_t perf_start_cycles = ggml_perf_cycles();
17790
18088
  const int64_t perf_start_time_us = ggml_perf_time_us();
17791
18089
 
17792
18090
  // this is a work thread too
17793
- int compute_status = (size_t) ggml_graph_compute_thread(&workers[0]);
18091
+ ggml_graph_compute_thread(&workers[0]);
18092
+ enum ggml_status compute_status = workers[0].ec;
17794
18093
 
17795
18094
  // don't leave affinity set on the main thread
17796
18095
  clear_numa_thread_affinity();
@@ -17800,6 +18099,8 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
17800
18099
  for (int j = 1; j < n_threads; j++) {
17801
18100
  const int rc = ggml_thread_join(workers[j].thrd, NULL);
17802
18101
  GGML_ASSERT(rc == 0);
18102
+ if (workers[j].ec != GGML_STATUS_SUCCESS)
18103
+ compute_status = workers[j].ec;
17803
18104
  }
17804
18105
  }
17805
18106
 
@@ -17827,14 +18128,14 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
17827
18128
  return compute_status;
17828
18129
  }
17829
18130
 
17830
- void ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads) {
18131
+ enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads) {
17831
18132
  struct ggml_cplan cplan = ggml_graph_plan(cgraph, n_threads);
17832
18133
 
17833
- struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_WORK_BUFFER, cplan.work_size);
18134
+ struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_TYPE_WORK_BUFFER, cplan.work_size);
17834
18135
 
17835
18136
  cplan.work_data = (uint8_t *)ctx->mem_buffer + obj->offs;
17836
18137
 
17837
- ggml_graph_compute(cgraph, &cplan);
18138
+ return ggml_graph_compute(cgraph, &cplan);
17838
18139
  }
17839
18140
 
17840
18141
  struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name) {
@@ -18638,7 +18939,7 @@ static enum ggml_opt_result ggml_opt_adam(
18638
18939
  float * pf = params.past > 0 ? opt->adam.pf->data : NULL; // past function values
18639
18940
 
18640
18941
  struct ggml_cplan cplan = ggml_graph_plan(gb, params.n_threads);
18641
- struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_WORK_BUFFER, cplan.work_size);
18942
+ struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_TYPE_WORK_BUFFER, cplan.work_size);
18642
18943
  cplan.work_data = (uint8_t *)ctx->mem_buffer + obj->offs;
18643
18944
 
18644
18945
  bool cancel = false;
@@ -18650,7 +18951,7 @@ static enum ggml_opt_result ggml_opt_adam(
18650
18951
  if (callback) {
18651
18952
  callback(callback_data, accum_step, &sched, &cancel);
18652
18953
  if (cancel) {
18653
- return GGML_OPT_CANCEL;
18954
+ return GGML_OPT_RESULT_CANCEL;
18654
18955
  }
18655
18956
  }
18656
18957
  // ggml_graph_reset (gf);
@@ -18741,7 +19042,7 @@ static enum ggml_opt_result ggml_opt_adam(
18741
19042
  if (callback) {
18742
19043
  callback(callback_data, accum_step, &sched, &cancel);
18743
19044
  if (cancel) {
18744
- return GGML_OPT_CANCEL;;
19045
+ return GGML_OPT_RESULT_CANCEL;;
18745
19046
  }
18746
19047
  }
18747
19048
  // ggml_graph_reset (gf);
@@ -18758,7 +19059,7 @@ static enum ggml_opt_result ggml_opt_adam(
18758
19059
  if (fabsf(fx - fx_prev[0])/fx < params.adam.eps_f) {
18759
19060
  GGML_PRINT_DEBUG("converged\n");
18760
19061
 
18761
- return GGML_OPT_OK;
19062
+ return GGML_OPT_RESULT_OK;
18762
19063
  }
18763
19064
 
18764
19065
  // delta-based convergence test
@@ -18768,7 +19069,7 @@ static enum ggml_opt_result ggml_opt_adam(
18768
19069
  const float rate = (pf[(iter0 + t)%params.past] - fx)/fx;
18769
19070
 
18770
19071
  if (fabsf(rate) < params.delta) {
18771
- return GGML_OPT_OK;
19072
+ return GGML_OPT_RESULT_OK;
18772
19073
  }
18773
19074
  }
18774
19075
 
@@ -18784,7 +19085,7 @@ static enum ggml_opt_result ggml_opt_adam(
18784
19085
  ++n_no_improvement[0];
18785
19086
 
18786
19087
  if (n_no_improvement[0] >= params.max_no_improvement) {
18787
- return GGML_OPT_OK;
19088
+ return GGML_OPT_RESULT_OK;
18788
19089
  }
18789
19090
  }
18790
19091
  }
@@ -18802,7 +19103,7 @@ static enum ggml_opt_result ggml_opt_adam(
18802
19103
  }
18803
19104
  }
18804
19105
 
18805
- return GGML_OPT_DID_NOT_CONVERGE;
19106
+ return GGML_OPT_RESULT_DID_NOT_CONVERGE;
18806
19107
  }
18807
19108
 
18808
19109
  //
@@ -18883,7 +19184,7 @@ static enum ggml_opt_result linesearch_backtracking(
18883
19184
  float sched = 0;
18884
19185
  callback(callback_data, accum_step, &sched, cancel);
18885
19186
  if (*cancel) {
18886
- return GGML_OPT_CANCEL;
19187
+ return GGML_OPT_RESULT_CANCEL;
18887
19188
  }
18888
19189
  }
18889
19190
  // ggml_graph_reset (gf);
@@ -18956,7 +19257,7 @@ static enum ggml_opt_result ggml_opt_lbfgs(
18956
19257
  if (params.lbfgs.linesearch == GGML_LINESEARCH_BACKTRACKING_WOLFE ||
18957
19258
  params.lbfgs.linesearch == GGML_LINESEARCH_BACKTRACKING_STRONG_WOLFE) {
18958
19259
  if (params.lbfgs.wolfe <= params.lbfgs.ftol || 1.f <= params.lbfgs.wolfe) {
18959
- return GGML_OPT_INVALID_WOLFE;
19260
+ return GGML_OPT_RESULT_INVALID_WOLFE;
18960
19261
  }
18961
19262
  }
18962
19263
 
@@ -18985,7 +19286,7 @@ static enum ggml_opt_result ggml_opt_lbfgs(
18985
19286
  }
18986
19287
 
18987
19288
  struct ggml_cplan cplan = ggml_graph_plan(gb, params.n_threads);
18988
- struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_WORK_BUFFER, cplan.work_size);
19289
+ struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_TYPE_WORK_BUFFER, cplan.work_size);
18989
19290
  cplan.work_data = (uint8_t *)ctx->mem_buffer + obj->offs;
18990
19291
 
18991
19292
  float * x = opt->lbfgs.x->data; // current parameters
@@ -19026,7 +19327,7 @@ static enum ggml_opt_result ggml_opt_lbfgs(
19026
19327
  float sched = 0;
19027
19328
  callback(callback_data, accum_step, &sched, &cancel);
19028
19329
  if (cancel) {
19029
- return GGML_OPT_CANCEL;
19330
+ return GGML_OPT_RESULT_CANCEL;
19030
19331
  }
19031
19332
  }
19032
19333
  // ggml_graph_reset (gf);
@@ -19054,7 +19355,7 @@ static enum ggml_opt_result ggml_opt_lbfgs(
19054
19355
 
19055
19356
  // already optimized
19056
19357
  if (gnorm/xnorm <= params.lbfgs.eps) {
19057
- return GGML_OPT_OK;
19358
+ return GGML_OPT_RESULT_OK;
19058
19359
  }
19059
19360
 
19060
19361
  if (opt->just_initialized) {
@@ -19099,7 +19400,7 @@ static enum ggml_opt_result ggml_opt_lbfgs(
19099
19400
  // way to test and don't want to break something with so many changes lined up
19100
19401
  ls = linesearch_backtracking(&params, nx, x, &fx, g, d, step, xp, f, gb, &cplan, np, ps, &cancel, callback, callback_data);
19101
19402
  if (cancel) {
19102
- return GGML_OPT_CANCEL;
19403
+ return GGML_OPT_RESULT_CANCEL;
19103
19404
  }
19104
19405
 
19105
19406
  if (ls < 0) {
@@ -19122,7 +19423,7 @@ static enum ggml_opt_result ggml_opt_lbfgs(
19122
19423
  }
19123
19424
  if (gnorm/xnorm <= params.lbfgs.eps) {
19124
19425
  // converged
19125
- return GGML_OPT_OK;
19426
+ return GGML_OPT_RESULT_OK;
19126
19427
  }
19127
19428
 
19128
19429
  // delta-based convergence test
@@ -19132,7 +19433,7 @@ static enum ggml_opt_result ggml_opt_lbfgs(
19132
19433
  const float rate = (pf[k[0]%params.past] - fx)/fx;
19133
19434
 
19134
19435
  if (fabsf(rate) < params.delta) {
19135
- return GGML_OPT_OK;
19436
+ return GGML_OPT_RESULT_OK;
19136
19437
  }
19137
19438
  }
19138
19439
 
@@ -19148,14 +19449,14 @@ static enum ggml_opt_result ggml_opt_lbfgs(
19148
19449
  n_no_improvement[0]++;
19149
19450
 
19150
19451
  if (n_no_improvement[0] >= params.max_no_improvement) {
19151
- return GGML_OPT_OK;
19452
+ return GGML_OPT_RESULT_OK;
19152
19453
  }
19153
19454
  }
19154
19455
  }
19155
19456
 
19156
19457
  if (params.lbfgs.n_iter != 0 && params.lbfgs.n_iter < it + 1) {
19157
19458
  // reached the maximum number of iterations
19158
- return GGML_OPT_DID_NOT_CONVERGE;
19459
+ return GGML_OPT_RESULT_DID_NOT_CONVERGE;
19159
19460
  }
19160
19461
 
19161
19462
  // update vectors s and y:
@@ -19211,17 +19512,17 @@ static enum ggml_opt_result ggml_opt_lbfgs(
19211
19512
 
19212
19513
  GGML_ASSERT(false && "lbfgs failed");
19213
19514
 
19214
- return GGML_OPT_DID_NOT_CONVERGE;
19515
+ return GGML_OPT_RESULT_DID_NOT_CONVERGE;
19215
19516
  }
19216
19517
 
19217
19518
  struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type) {
19218
19519
  struct ggml_opt_params result;
19219
19520
 
19220
19521
  switch (type) {
19221
- case GGML_OPT_ADAM:
19522
+ case GGML_OPT_TYPE_ADAM:
19222
19523
  {
19223
19524
  result = (struct ggml_opt_params) {
19224
- .type = GGML_OPT_ADAM,
19525
+ .type = GGML_OPT_TYPE_ADAM,
19225
19526
  .graph_size = GGML_DEFAULT_GRAPH_SIZE,
19226
19527
  .n_threads = 1, // FIXME: GGML_DEFAULT_N_THREADS ?
19227
19528
  .past = 0,
@@ -19249,10 +19550,10 @@ struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type) {
19249
19550
  },
19250
19551
  };
19251
19552
  } break;
19252
- case GGML_OPT_LBFGS:
19553
+ case GGML_OPT_TYPE_LBFGS:
19253
19554
  {
19254
19555
  result = (struct ggml_opt_params) {
19255
- .type = GGML_OPT_LBFGS,
19556
+ .type = GGML_OPT_TYPE_LBFGS,
19256
19557
  .graph_size = GGML_DEFAULT_GRAPH_SIZE,
19257
19558
  .n_threads = 1,
19258
19559
  .past = 0,
@@ -19297,12 +19598,12 @@ GGML_API void ggml_opt_init(
19297
19598
  opt->just_initialized = true;
19298
19599
  if (opt->ctx == NULL) {
19299
19600
  struct ggml_init_params ctx_opt_params;
19300
- if (opt->params.type == GGML_OPT_ADAM) {
19601
+ if (opt->params.type == GGML_OPT_TYPE_ADAM) {
19301
19602
  ctx_opt_params.mem_size = GGML_MEM_ALIGN*3 + ggml_tensor_overhead()*3 + ggml_type_size(GGML_TYPE_F32)*nx*3;
19302
19603
  if (opt->params.past > 0) {
19303
19604
  ctx_opt_params.mem_size += GGML_MEM_ALIGN + ggml_tensor_overhead() + ggml_type_size(GGML_TYPE_F32)*opt->params.past;
19304
19605
  }
19305
- } else if (opt->params.type == GGML_OPT_LBFGS) {
19606
+ } else if (opt->params.type == GGML_OPT_TYPE_LBFGS) {
19306
19607
  ctx_opt_params.mem_size = GGML_MEM_ALIGN*9 + ggml_tensor_overhead()*9 + ggml_type_size(GGML_TYPE_F32)*(nx*5 + opt->params.lbfgs.m*2 + nx*opt->params.lbfgs.m*2);
19307
19608
  if (opt->params.past > 0) {
19308
19609
  ctx_opt_params.mem_size += GGML_MEM_ALIGN + ggml_tensor_overhead() + ggml_type_size(GGML_TYPE_F32)*opt->params.past;
@@ -19314,7 +19615,7 @@ GGML_API void ggml_opt_init(
19314
19615
  opt->ctx = ggml_init(ctx_opt_params);
19315
19616
  }
19316
19617
  switch (opt->params.type) {
19317
- case GGML_OPT_ADAM:
19618
+ case GGML_OPT_TYPE_ADAM:
19318
19619
  {
19319
19620
  opt->adam.g = ggml_new_tensor_1d(opt->ctx, GGML_TYPE_F32, nx);
19320
19621
  opt->adam.m = ggml_new_tensor_1d(opt->ctx, GGML_TYPE_F32, nx);
@@ -19328,7 +19629,7 @@ GGML_API void ggml_opt_init(
19328
19629
  ggml_set_zero(opt->adam.pf);
19329
19630
  }
19330
19631
  } break;
19331
- case GGML_OPT_LBFGS:
19632
+ case GGML_OPT_TYPE_LBFGS:
19332
19633
  {
19333
19634
  opt->lbfgs.x = ggml_new_tensor_1d(opt->ctx, GGML_TYPE_F32, nx);
19334
19635
  opt->lbfgs.xp = ggml_new_tensor_1d(opt->ctx, GGML_TYPE_F32, nx);
@@ -19372,13 +19673,13 @@ enum ggml_opt_result ggml_opt(
19372
19673
 
19373
19674
  ctx = ggml_init(params_ctx);
19374
19675
  if (ctx == NULL) {
19375
- return GGML_OPT_NO_CONTEXT;
19676
+ return GGML_OPT_RESULT_NO_CONTEXT;
19376
19677
  }
19377
19678
 
19378
19679
  free_ctx = true;
19379
19680
  }
19380
19681
 
19381
- enum ggml_opt_result result = GGML_OPT_OK;
19682
+ enum ggml_opt_result result = GGML_OPT_RESULT_OK;
19382
19683
 
19383
19684
  struct ggml_opt_context * opt = (struct ggml_opt_context *) alloca(sizeof(struct ggml_opt_context));
19384
19685
 
@@ -19417,14 +19718,14 @@ enum ggml_opt_result ggml_opt_resume_g(
19417
19718
  void * callback_data) {
19418
19719
 
19419
19720
  // build forward + backward compute graphs
19420
- enum ggml_opt_result result = GGML_OPT_OK;
19721
+ enum ggml_opt_result result = GGML_OPT_RESULT_OK;
19421
19722
 
19422
19723
  switch (opt->params.type) {
19423
- case GGML_OPT_ADAM:
19724
+ case GGML_OPT_TYPE_ADAM:
19424
19725
  {
19425
19726
  result = ggml_opt_adam(ctx, opt, opt->params, f, gf, gb, callback, callback_data);
19426
19727
  } break;
19427
- case GGML_OPT_LBFGS:
19728
+ case GGML_OPT_TYPE_LBFGS:
19428
19729
  {
19429
19730
  result = ggml_opt_lbfgs(ctx, opt, opt->params, f, gf, gb, callback, callback_data);
19430
19731
  } break;
@@ -19461,8 +19762,10 @@ void ggml_quantize_init(enum ggml_type type) {
19461
19762
  switch (type) {
19462
19763
  case GGML_TYPE_IQ2_XXS:
19463
19764
  case GGML_TYPE_IQ2_XS:
19765
+ case GGML_TYPE_IQ2_S:
19464
19766
  case GGML_TYPE_IQ1_S: iq2xs_init_impl(type); break;
19465
19767
  case GGML_TYPE_IQ3_XXS: iq3xs_init_impl(256); break;
19768
+ case GGML_TYPE_IQ3_S: iq3xs_init_impl(512); break;
19466
19769
  default: // nothing
19467
19770
  break;
19468
19771
  }
@@ -19737,6 +20040,24 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i
19737
20040
  result = quantize_iq3_xxs(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
19738
20041
  GGML_ASSERT(result == row_size * nrows);
19739
20042
  } break;
20043
+ case GGML_TYPE_IQ3_S:
20044
+ {
20045
+ GGML_ASSERT(start % QK_K == 0);
20046
+ GGML_ASSERT(start % n_per_row == 0);
20047
+ size_t start_row = start / n_per_row;
20048
+ size_t row_size = ggml_row_size(type, n_per_row);
20049
+ result = quantize_iq3_s(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
20050
+ GGML_ASSERT(result == row_size * nrows);
20051
+ } break;
20052
+ case GGML_TYPE_IQ2_S:
20053
+ {
20054
+ GGML_ASSERT(start % QK_K == 0);
20055
+ GGML_ASSERT(start % n_per_row == 0);
20056
+ size_t start_row = start / n_per_row;
20057
+ size_t row_size = ggml_row_size(type, n_per_row);
20058
+ result = quantize_iq2_s(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
20059
+ GGML_ASSERT(result == row_size * nrows);
20060
+ } break;
19740
20061
  case GGML_TYPE_IQ1_S:
19741
20062
  {
19742
20063
  GGML_ASSERT(start % QK_K == 0);
@@ -19747,6 +20068,9 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i
19747
20068
  GGML_ASSERT(result == row_size * nrows);
19748
20069
  } break;
19749
20070
  case GGML_TYPE_IQ4_NL:
20071
+ #if QK_K == 64
20072
+ case GGML_TYPE_IQ4_XS:
20073
+ #endif
19750
20074
  {
19751
20075
  GGML_ASSERT(start % QK4_NL == 0);
19752
20076
  GGML_ASSERT(start % n_per_row == 0);
@@ -19755,6 +20079,17 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i
19755
20079
  result = quantize_iq4_nl(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
19756
20080
  GGML_ASSERT(result == row_size * nrows);
19757
20081
  } break;
20082
+ #if QK_K != 64
20083
+ case GGML_TYPE_IQ4_XS:
20084
+ {
20085
+ GGML_ASSERT(start % QK_K == 0);
20086
+ GGML_ASSERT(start % n_per_row == 0);
20087
+ size_t start_row = start / n_per_row;
20088
+ size_t row_size = ggml_row_size(type, n_per_row);
20089
+ result = quantize_iq4_xs(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
20090
+ GGML_ASSERT(result == row_size * nrows);
20091
+ } break;
20092
+ #endif
19758
20093
  case GGML_TYPE_F16:
19759
20094
  {
19760
20095
  size_t elemsize = sizeof(ggml_fp16_t);