llama_cpp 0.12.7 → 0.14.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -320,6 +320,17 @@ static ggml_fp16_t ggml_table_exp_f16[1 << 16];
320
320
  // precomputed f32 table for f16 (256 KB) (ggml-impl.h)
321
321
  float ggml_table_f32_f16[1 << 16];
322
322
 
323
// Map a ggml_status value to a constant, human-readable message.
// Each of the four statuses handled in the switch returns its own string
// literal; any other value leaves the switch and yields "GGML status: unknown".
// The result always points at a string literal, never at allocated memory.
+ const char * ggml_status_to_string(enum ggml_status status) {
324
+ switch (status) {
325
+ case GGML_STATUS_ALLOC_FAILED: return "GGML status: error (failed to allocate memory)";
326
+ case GGML_STATUS_FAILED: return "GGML status: error (operation failed)";
327
+ case GGML_STATUS_SUCCESS: return "GGML status: success";
328
+ case GGML_STATUS_ABORTED: return "GGML status: warning (operation aborted)";
329
+ }
330
+
// Reached only when status matched none of the cases above.
331
+ return "GGML status: unknown";
332
+ }
333
+
323
334
  // note: do not use these inside ggml.c
324
335
  // these are meant to be used via the ggml.h API
325
336
  float ggml_fp16_to_fp32(ggml_fp16_t x) {
@@ -355,6 +366,10 @@ void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int n) {
355
366
  }
356
367
  }
357
368
 
369
// Report whether two GUIDs are identical: memcmp() over sizeof(ggml_guid)
// bytes, returning true only when every byte matches.
// NOTE(review): nothing here checks for NULL or for the size of the pointed-to
// storage; assumes both arguments reference at least sizeof(ggml_guid) readable
// bytes — confirm at call sites.
+ bool ggml_guid_matches(ggml_guid_t guid_a, ggml_guid_t guid_b) {
370
+ return memcmp(guid_a, guid_b, sizeof(ggml_guid)) == 0;
371
+ }
372
+
358
373
  //
359
374
  // timing
360
375
  //
@@ -678,6 +693,30 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
678
693
  .vec_dot_type = GGML_TYPE_Q8_K,
679
694
  .nrows = 1,
680
695
  },
696
+ [GGML_TYPE_IQ3_S] = {
697
+ .type_name = "iq3_s",
698
+ .blck_size = QK_K,
699
+ .type_size = sizeof(block_iq3_s),
700
+ .is_quantized = true,
701
+ .to_float = (ggml_to_float_t) dequantize_row_iq3_s,
702
+ .from_float = quantize_row_iq3_s,
703
+ .from_float_reference = (ggml_from_float_t)quantize_row_iq3_s_reference,
704
+ .vec_dot = ggml_vec_dot_iq3_s_q8_K,
705
+ .vec_dot_type = GGML_TYPE_Q8_K,
706
+ .nrows = 1,
707
+ },
708
+ [GGML_TYPE_IQ2_S] = {
709
+ .type_name = "iq2_s",
710
+ .blck_size = QK_K,
711
+ .type_size = sizeof(block_iq2_s),
712
+ .is_quantized = true,
713
+ .to_float = (ggml_to_float_t) dequantize_row_iq2_s,
714
+ .from_float = quantize_row_iq2_s,
715
+ .from_float_reference = (ggml_from_float_t)quantize_row_iq2_s_reference,
716
+ .vec_dot = ggml_vec_dot_iq2_s_q8_K,
717
+ .vec_dot_type = GGML_TYPE_Q8_K,
718
+ .nrows = 1,
719
+ },
681
720
  [GGML_TYPE_IQ1_S] = {
682
721
  .type_name = "iq1_s",
683
722
  .blck_size = QK_K,
@@ -702,6 +741,26 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
702
741
  .vec_dot_type = GGML_TYPE_Q8_0,
703
742
  .nrows = 1,
704
743
  },
744
+ [GGML_TYPE_IQ4_XS] = {
745
+ .type_name = "iq4_xs",
746
+ #if QK_K == 64
747
+ .blck_size = QK4_NL,
748
+ #else
749
+ .blck_size = QK_K,
750
+ #endif
751
+ .type_size = sizeof(block_iq4_xs),
752
+ .is_quantized = true,
753
+ .to_float = (ggml_to_float_t) dequantize_row_iq4_xs,
754
+ .from_float = quantize_row_iq4_xs,
755
+ .from_float_reference = (ggml_from_float_t)quantize_row_iq4_xs_reference,
756
+ .vec_dot = ggml_vec_dot_iq4_xs_q8_K,
757
+ #if QK_K == 64
758
+ .vec_dot_type = GGML_TYPE_Q8_0,
759
+ #else
760
+ .vec_dot_type = GGML_TYPE_Q8_K,
761
+ #endif
762
+ .nrows = 1,
763
+ },
705
764
  [GGML_TYPE_Q8_K] = {
706
765
  .type_name = "q8_K",
707
766
  .blck_size = QK_K,
@@ -1560,9 +1619,15 @@ inline static void ggml_vec_gelu_f16(const int n, ggml_fp16_t * y, const ggml_fp
1560
1619
  // Apply the table-driven GELU to n floats, reading x[i] and writing y[i].
  // Removed (-) lines: every element was converted to fp16 and looked up in
  // ggml_table_gelu_f16.
  // Added (+) lines: the tails are handled without the table -- x[i] <= -10
  // yields 0 and x[i] >= 10 yields x[i] unchanged; only values strictly between
  // those bounds still take the fp16 conversion + table lookup.
  inline static void ggml_vec_gelu_f32(const int n, float * y, const float * x) {
1561
1620
  uint16_t t;
1562
1621
  for (int i = 0; i < n; ++i) {
1563
- ggml_fp16_t fp16 = GGML_FP32_TO_FP16(x[i]);
1564
- memcpy(&t, &fp16, sizeof(uint16_t));
1565
- y[i] = GGML_FP16_TO_FP32(ggml_table_gelu_f16[t]);
1622
+ if (x[i] <= -10.0f) {
1623
+ y[i] = 0.0f;
1624
+ } else if (x[i] >= 10.0f) {
1625
+ y[i] = x[i];
1626
+ } else {
1627
+ ggml_fp16_t fp16 = GGML_FP32_TO_FP16(x[i]);
// Copy the fp16 bit pattern into a uint16_t so it can serve as the table index.
1628
+ memcpy(&t, &fp16, sizeof(uint16_t));
1629
+ y[i] = GGML_FP16_TO_FP32(ggml_table_gelu_f16[t]);
1630
+ }
1566
1631
  }
1567
1632
  }
1568
1633
  #else
@@ -1768,6 +1833,8 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
1768
1833
  "POOL_2D",
1769
1834
  "UPSCALE",
1770
1835
  "PAD",
1836
+ "ARANGE",
1837
+ "TIMESTEP_EMBEDDING",
1771
1838
  "ARGSORT",
1772
1839
  "LEAKY_RELU",
1773
1840
 
@@ -1796,7 +1863,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
1796
1863
  "CROSS_ENTROPY_LOSS_BACK",
1797
1864
  };
1798
1865
 
1799
- static_assert(GGML_OP_COUNT == 72, "GGML_OP_COUNT != 72");
1866
+ static_assert(GGML_OP_COUNT == 74, "GGML_OP_COUNT != 74");
1800
1867
 
1801
1868
  static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
1802
1869
  "none",
@@ -1854,6 +1921,8 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
1854
1921
  "pool_2d(x)",
1855
1922
  "upscale(x)",
1856
1923
  "pad(x)",
1924
+ "arange(start, stop, step)",
1925
+ "timestep_embedding(timesteps, dim, max_period)",
1857
1926
  "argsort(x)",
1858
1927
  "leaky_relu(x)",
1859
1928
 
@@ -1882,7 +1951,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
1882
1951
  "cross_entropy_loss_back(x,y)",
1883
1952
  };
1884
1953
 
1885
- static_assert(GGML_OP_COUNT == 72, "GGML_OP_COUNT != 72");
1954
+ static_assert(GGML_OP_COUNT == 74, "GGML_OP_COUNT != 74");
1886
1955
 
1887
1956
  static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
1888
1957
 
@@ -2085,7 +2154,10 @@ void ggml_numa_init(enum ggml_numa_strategy numa_flag) {
2085
2154
  getcpu_ret = getcpu(&current_cpu, &g_state.numa.current_node);
2086
2155
  #else
2087
2156
  // old glibc doesn't have a wrapper for this call. Fall back on direct syscall
2088
- getcpu_ret = syscall(SYS_getcpu,&current_cpu,&g_state.numa.current_node);
2157
+ # if !defined(SYS_getcpu) && defined(SYS_get_cpu)
2158
+ # define SYS_getcpu SYS_get_cpu // some older glibc versions use this name
2159
+ # endif
2160
+ getcpu_ret = syscall(SYS_getcpu, &current_cpu, &g_state.numa.current_node);
2089
2161
  #endif
2090
2162
 
2091
2163
  if (g_state.numa.n_nodes < 1 || g_state.numa.total_cpus < 1 || getcpu_ret != 0) {
@@ -2304,6 +2376,9 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) {
2304
2376
  case GGML_FTYPE_MOSTLY_IQ3_XXS: wtype = GGML_TYPE_IQ3_XXS; break;
2305
2377
  case GGML_FTYPE_MOSTLY_IQ1_S: wtype = GGML_TYPE_IQ1_S; break;
2306
2378
  case GGML_FTYPE_MOSTLY_IQ4_NL: wtype = GGML_TYPE_IQ4_NL; break;
2379
+ case GGML_FTYPE_MOSTLY_IQ4_XS: wtype = GGML_TYPE_IQ4_XS; break;
2380
+ case GGML_FTYPE_MOSTLY_IQ3_S: wtype = GGML_TYPE_IQ3_S; break;
2381
+ case GGML_FTYPE_MOSTLY_IQ2_S: wtype = GGML_TYPE_IQ2_S; break;
2307
2382
  case GGML_FTYPE_UNKNOWN: wtype = GGML_TYPE_COUNT; break;
2308
2383
  case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16: wtype = GGML_TYPE_COUNT; break;
2309
2384
  }
@@ -2708,7 +2783,7 @@ static struct ggml_tensor * ggml_new_tensor_impl(
2708
2783
  }
2709
2784
  }
2710
2785
 
2711
- struct ggml_object * const obj_new = ggml_new_object(ctx, GGML_OBJECT_TENSOR, GGML_TENSOR_SIZE + obj_alloc_size);
2786
+ struct ggml_object * const obj_new = ggml_new_object(ctx, GGML_OBJECT_TYPE_TENSOR, GGML_TENSOR_SIZE + obj_alloc_size);
2712
2787
 
2713
2788
  // TODO: for recoverable errors, we would need to free the data allocated from the scratch buffer here
2714
2789
 
@@ -2716,7 +2791,7 @@ static struct ggml_tensor * ggml_new_tensor_impl(
2716
2791
 
2717
2792
  *result = (struct ggml_tensor) {
2718
2793
  /*.type =*/ type,
2719
- /*.backend =*/ GGML_BACKEND_CPU,
2794
+ /*.backend =*/ GGML_BACKEND_TYPE_CPU,
2720
2795
  /*.buffer =*/ NULL,
2721
2796
  /*.ne =*/ { 1, 1, 1, 1 },
2722
2797
  /*.nb =*/ { 0, 0, 0, 0 },
@@ -2838,11 +2913,21 @@ static int32_t ggml_get_op_params_i32(const struct ggml_tensor * tensor, uint32_
2838
2913
  return ((const int32_t *)(tensor->op_params))[i];
2839
2914
  }
2840
2915
 
2916
// Read the i-th float stored in tensor->op_params.
// The index is bounds-checked against GGML_MAX_OP_PARAMS / sizeof(float) with
// assert() only, so the check disappears if assert() is compiled out.
// NOTE(review): op_params is accessed through a (const float *) cast; if the
// field is declared with a non-float element type this is type punning, and
// memcpy would be the strictly conforming form — declaration not visible here,
// confirm.
+ static float ggml_get_op_params_f32(const struct ggml_tensor * tensor, uint32_t i) {
2917
+ assert(i < GGML_MAX_OP_PARAMS / sizeof(float));
2918
+ return ((const float *)(tensor->op_params))[i];
2919
+ }
2920
+
2841
2921
  static void ggml_set_op_params_i32(struct ggml_tensor * tensor, uint32_t i, int32_t value) {
2842
2922
  assert(i < GGML_MAX_OP_PARAMS / sizeof(int32_t));
2843
2923
  ((int32_t *)(tensor->op_params))[i] = value;
2844
2924
  }
2845
2925
 
2926
// Store value as the i-th float in tensor->op_params.
// The index is bounds-checked against GGML_MAX_OP_PARAMS / sizeof(float) with
// assert() only, so the check disappears if assert() is compiled out.
// NOTE(review): op_params is written through a (float *) cast; if the field is
// declared with a non-float element type this is type punning, and memcpy
// would be the strictly conforming form — declaration not visible here, confirm.
+ static void ggml_set_op_params_f32(struct ggml_tensor * tensor, uint32_t i, float value) {
2927
+ assert(i < GGML_MAX_OP_PARAMS / sizeof(float));
2928
+ ((float *)(tensor->op_params))[i] = value;
2929
+ }
2930
+
2846
2931
  struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor) {
2847
2932
  memset(tensor->data, 0, ggml_nbytes(tensor));
2848
2933
  return tensor;
@@ -3289,7 +3374,7 @@ struct ggml_tensor * ggml_get_first_tensor(const struct ggml_context * ctx) {
3289
3374
  char * const mem_buffer = ctx->mem_buffer;
3290
3375
 
3291
3376
  while (obj != NULL) {
3292
- if (obj->type == GGML_OBJECT_TENSOR) {
3377
+ if (obj->type == GGML_OBJECT_TYPE_TENSOR) {
3293
3378
  return (struct ggml_tensor *)(mem_buffer + obj->offs);
3294
3379
  }
3295
3380
 
@@ -3306,7 +3391,7 @@ struct ggml_tensor * ggml_get_next_tensor(const struct ggml_context * ctx, struc
3306
3391
  char * const mem_buffer = ctx->mem_buffer;
3307
3392
 
3308
3393
  while (obj != NULL) {
3309
- if (obj->type == GGML_OBJECT_TENSOR) {
3394
+ if (obj->type == GGML_OBJECT_TYPE_TENSOR) {
3310
3395
  return (struct ggml_tensor *)(mem_buffer + obj->offs);
3311
3396
  }
3312
3397
 
@@ -3322,7 +3407,7 @@ struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * nam
3322
3407
  char * const mem_buffer = ctx->mem_buffer;
3323
3408
 
3324
3409
  while (obj != NULL) {
3325
- if (obj->type == GGML_OBJECT_TENSOR) {
3410
+ if (obj->type == GGML_OBJECT_TYPE_TENSOR) {
3326
3411
  struct ggml_tensor * cur = (struct ggml_tensor *)(mem_buffer + obj->offs);
3327
3412
  if (strcmp(cur->name, name) == 0) {
3328
3413
  return cur;
@@ -5729,11 +5814,13 @@ struct ggml_tensor * ggml_pool_1d(
5729
5814
  is_node = true;
5730
5815
  }
5731
5816
 
5732
- const int64_t ne[2] = {
5817
+ const int64_t ne[4] = {
5733
5818
  ggml_calc_pool_output_size(a->ne[0], k0, s0, p0),
5734
5819
  a->ne[1],
5820
+ a->ne[2],
5821
+ a->ne[3],
5735
5822
  };
5736
- struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne);
5823
+ struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
5737
5824
 
5738
5825
  int32_t params[] = { op, k0, s0, p0 };
5739
5826
  ggml_set_op_params(result, params, sizeof(params));
@@ -5839,6 +5926,55 @@ struct ggml_tensor * ggml_upscale(
5839
5926
  return ggml_upscale_impl(ctx, a, scale_factor);
5840
5927
  }
5841
5928
 
5929
// Create a 1-D F32 tensor tagged GGML_OP_ARANGE for the half-open range
// [start, stop) with increment step.
//   ctx   - context the result tensor is created in
//   start - first value of the range
//   stop  - exclusive upper bound; must be greater than start
//   step  - increment; must be strictly positive
// The element count is ceil((stop - start) / step). start, stop and step are
// stored in op_params slots 0, 1 and 2; no element values are written here.
+ struct ggml_tensor * ggml_arange(
5930
+ struct ggml_context * ctx,
5931
+ float start,
5932
+ float stop,
5933
+ float step) {
5934
+
// step must be > 0: a zero step makes the quotient below infinite (converting
// that to int64_t is undefined behaviour), and a negative or NaN step would
// produce a non-positive or unspecified element count.
5935
+ GGML_ASSERT(stop > start && step > 0.0f);
5936
+
5937
+ const int64_t steps = (int64_t) ceilf((stop - start) / step);
5938
+
5939
+ struct ggml_tensor * result = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, steps);
5940
+
5941
+ result->op = GGML_OP_ARANGE;
5942
+ ggml_set_op_params_f32(result, 0, start);
5943
+ ggml_set_op_params_f32(result, 1, stop);
5944
+ ggml_set_op_params_f32(result, 2, step);
5945
+
5946
+ return result;
5947
+ }
5948
+
5949
// Create a 2-D F32 tensor tagged GGML_OP_TIMESTEP_EMBEDDING with timesteps as
// its only source (src[0]).
//   ctx        - context the result tensor is created in
//   timesteps  - input tensor; its ne[0] becomes the second dimension of the result
//   dim        - requested embedding width, stored unchanged in op_params[0]
//   max_period - stored unchanged in op_params[1]
// The first dimension of the result is dim rounded up to the next even number,
// so the allocated width can be one larger than the dim recorded in op_params.
+ struct ggml_tensor * ggml_timestep_embedding(
5950
+ struct ggml_context * ctx,
5951
+ struct ggml_tensor * timesteps,
5952
+ int dim,
5953
+ int max_period) {
5954
+ bool is_node = false;
5955
+
// Backward pass is not implemented: an input that carries a gradient hits the
// assertion below before is_node can have any effect.
5956
+ if (timesteps->grad) {
5957
+ GGML_ASSERT(false); // TODO: implement backward
5958
+ is_node = true;
5959
+ }
5960
+
// Odd widths are padded by one element for the allocation only.
5961
+ int actual_dim = dim;
5962
+ if (dim % 2 != 0) {
5963
+ actual_dim = dim + 1;
5964
+ }
5965
+
5966
+ struct ggml_tensor * result = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, actual_dim, timesteps->ne[0]);
5967
+
5968
+ result->op = GGML_OP_TIMESTEP_EMBEDDING;
5969
+ ggml_set_op_params_i32(result, 0, dim);
5970
+ ggml_set_op_params_i32(result, 1, max_period);
5971
+
5972
+ result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5973
+ result->src[0] = timesteps;
5974
+
5975
+ return result;
5976
+ }
5977
+
5842
5978
  // ggml_argsort
5843
5979
 
5844
5980
  struct ggml_tensor * ggml_argsort(
@@ -5866,7 +6002,7 @@ struct ggml_tensor * ggml_top_k(
5866
6002
  int k) {
5867
6003
  GGML_ASSERT(a->ne[0] >= k);
5868
6004
 
5869
- struct ggml_tensor * result = ggml_argsort(ctx, a, GGML_SORT_DESC);
6005
+ struct ggml_tensor * result = ggml_argsort(ctx, a, GGML_SORT_ORDER_DESC);
5870
6006
 
5871
6007
  result = ggml_view_4d(ctx, result,
5872
6008
  k, result->ne[1], result->ne[2], result->ne[3],
@@ -6660,7 +6796,7 @@ static void ggml_compute_forward_dup_same_cont(
6660
6796
  GGML_ASSERT(ggml_is_contiguous(dst) && ggml_is_contiguous(src0));
6661
6797
  GGML_ASSERT(src0->type == dst->type);
6662
6798
 
6663
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
6799
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
6664
6800
  return;
6665
6801
  }
6666
6802
 
@@ -6692,7 +6828,7 @@ static void ggml_compute_forward_dup_f16(
6692
6828
 
6693
6829
  GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0));
6694
6830
 
6695
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
6831
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
6696
6832
  return;
6697
6833
  }
6698
6834
 
@@ -6965,7 +7101,7 @@ static void ggml_compute_forward_dup_f32(
6965
7101
 
6966
7102
  GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0));
6967
7103
 
6968
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
7104
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
6969
7105
  return;
6970
7106
  }
6971
7107
 
@@ -7218,7 +7354,7 @@ static void ggml_compute_forward_dup_bytes(
7218
7354
  GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0));
7219
7355
  GGML_ASSERT(src0->type == dst->type);
7220
7356
 
7221
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
7357
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
7222
7358
  return;
7223
7359
  }
7224
7360
 
@@ -7398,7 +7534,7 @@ static void ggml_compute_forward_add_f32(
7398
7534
 
7399
7535
  GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst));
7400
7536
 
7401
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
7537
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
7402
7538
  return;
7403
7539
  }
7404
7540
 
@@ -7406,7 +7542,7 @@ static void ggml_compute_forward_add_f32(
7406
7542
  const int nth = params->nth;
7407
7543
 
7408
7544
  #ifdef GGML_USE_CLBLAST
7409
- if (src1->backend == GGML_BACKEND_GPU) {
7545
+ if (src1->backend == GGML_BACKEND_TYPE_GPU) {
7410
7546
  // TODO: OpenCL kernel support full broadcast
7411
7547
  GGML_ASSERT(ggml_can_repeat_rows(src1, src0));
7412
7548
  if (ith == 0) {
@@ -7488,7 +7624,7 @@ static void ggml_compute_forward_add_f16_f32(
7488
7624
 
7489
7625
  GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));
7490
7626
 
7491
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
7627
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
7492
7628
  return;
7493
7629
  }
7494
7630
 
@@ -7567,7 +7703,7 @@ static void ggml_compute_forward_add_f16_f16(
7567
7703
 
7568
7704
  GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));
7569
7705
 
7570
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
7706
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
7571
7707
  return;
7572
7708
  }
7573
7709
 
@@ -7623,7 +7759,7 @@ static void ggml_compute_forward_add_q_f32(
7623
7759
 
7624
7760
  GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));
7625
7761
 
7626
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
7762
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
7627
7763
  return;
7628
7764
  }
7629
7765
 
@@ -7738,6 +7874,9 @@ static void ggml_compute_forward_add(
7738
7874
  case GGML_TYPE_IQ3_XXS:
7739
7875
  case GGML_TYPE_IQ1_S:
7740
7876
  case GGML_TYPE_IQ4_NL:
7877
+ case GGML_TYPE_IQ4_XS:
7878
+ case GGML_TYPE_IQ3_S:
7879
+ case GGML_TYPE_IQ2_S:
7741
7880
  {
7742
7881
  ggml_compute_forward_add_q_f32(params, dst);
7743
7882
  } break;
@@ -7760,7 +7899,7 @@ static void ggml_compute_forward_add1_f32(
7760
7899
  GGML_ASSERT(ggml_are_same_shape(src0, dst));
7761
7900
  GGML_ASSERT(ggml_is_scalar(src1));
7762
7901
 
7763
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
7902
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
7764
7903
  return;
7765
7904
  }
7766
7905
 
@@ -7814,7 +7953,7 @@ static void ggml_compute_forward_add1_f16_f32(
7814
7953
  GGML_ASSERT(ggml_are_same_shape(src0, dst));
7815
7954
  GGML_ASSERT(ggml_is_scalar(src1));
7816
7955
 
7817
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
7956
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
7818
7957
  return;
7819
7958
  }
7820
7959
 
@@ -7866,7 +8005,7 @@ static void ggml_compute_forward_add1_f16_f16(
7866
8005
  GGML_ASSERT(ggml_are_same_shape(src0, dst));
7867
8006
  GGML_ASSERT(ggml_is_scalar(src1));
7868
8007
 
7869
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
8008
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
7870
8009
  return;
7871
8010
  }
7872
8011
 
@@ -7918,7 +8057,7 @@ static void ggml_compute_forward_add1_q_f32(
7918
8057
  GGML_ASSERT(ggml_are_same_shape(src0, dst));
7919
8058
  GGML_ASSERT(ggml_is_scalar(src1));
7920
8059
 
7921
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
8060
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
7922
8061
  return;
7923
8062
  }
7924
8063
 
@@ -8017,6 +8156,9 @@ static void ggml_compute_forward_add1(
8017
8156
  case GGML_TYPE_IQ3_XXS:
8018
8157
  case GGML_TYPE_IQ1_S:
8019
8158
  case GGML_TYPE_IQ4_NL:
8159
+ case GGML_TYPE_IQ4_XS:
8160
+ case GGML_TYPE_IQ3_S:
8161
+ case GGML_TYPE_IQ2_S:
8020
8162
  {
8021
8163
  ggml_compute_forward_add1_q_f32(params, dst);
8022
8164
  } break;
@@ -8047,7 +8189,7 @@ static void ggml_compute_forward_acc_f32(
8047
8189
  size_t offset = ((int32_t *) dst->op_params)[3];
8048
8190
  bool inplace = (bool) ((int32_t *) dst->op_params)[4];
8049
8191
 
8050
- if (!inplace && (params->type == GGML_TASK_INIT)) {
8192
+ if (!inplace && (params->type == GGML_TASK_TYPE_INIT)) {
8051
8193
  if (params->ith != 0) {
8052
8194
  return;
8053
8195
  }
@@ -8059,7 +8201,7 @@ static void ggml_compute_forward_acc_f32(
8059
8201
  ggml_nbytes(dst));
8060
8202
  }
8061
8203
 
8062
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
8204
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
8063
8205
  return;
8064
8206
  }
8065
8207
 
@@ -8141,6 +8283,9 @@ static void ggml_compute_forward_acc(
8141
8283
  case GGML_TYPE_IQ3_XXS:
8142
8284
  case GGML_TYPE_IQ1_S:
8143
8285
  case GGML_TYPE_IQ4_NL:
8286
+ case GGML_TYPE_IQ4_XS:
8287
+ case GGML_TYPE_IQ3_S:
8288
+ case GGML_TYPE_IQ2_S:
8144
8289
  default:
8145
8290
  {
8146
8291
  GGML_ASSERT(false);
@@ -8160,7 +8305,7 @@ static void ggml_compute_forward_sub_f32(
8160
8305
  assert(params->ith == 0);
8161
8306
  assert(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));
8162
8307
 
8163
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
8308
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
8164
8309
  return;
8165
8310
  }
8166
8311
 
@@ -8241,14 +8386,14 @@ static void ggml_compute_forward_mul_f32(
8241
8386
 
8242
8387
  GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst));
8243
8388
 
8244
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
8389
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
8245
8390
  return;
8246
8391
  }
8247
8392
  const int ith = params->ith;
8248
8393
  const int nth = params->nth;
8249
8394
 
8250
8395
  #if defined(GGML_USE_CLBLAST)
8251
- if (src1->backend == GGML_BACKEND_GPU) {
8396
+ if (src1->backend == GGML_BACKEND_TYPE_GPU) {
8252
8397
  // TODO: OpenCL kernel support full broadcast
8253
8398
  GGML_ASSERT(ggml_can_repeat_rows(src1, src0));
8254
8399
  if (ith == 0) {
@@ -8349,7 +8494,7 @@ static void ggml_compute_forward_div_f32(
8349
8494
 
8350
8495
  GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst));
8351
8496
 
8352
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
8497
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
8353
8498
  return;
8354
8499
  }
8355
8500
 
@@ -8444,7 +8589,7 @@ static void ggml_compute_forward_sqr_f32(
8444
8589
  assert(params->ith == 0);
8445
8590
  assert(ggml_are_same_shape(src0, dst));
8446
8591
 
8447
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
8592
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
8448
8593
  return;
8449
8594
  }
8450
8595
 
@@ -8490,7 +8635,7 @@ static void ggml_compute_forward_sqrt_f32(
8490
8635
  assert(params->ith == 0);
8491
8636
  assert(ggml_are_same_shape(src0, dst));
8492
8637
 
8493
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
8638
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
8494
8639
  return;
8495
8640
  }
8496
8641
 
@@ -8536,7 +8681,7 @@ static void ggml_compute_forward_log_f32(
8536
8681
  GGML_ASSERT(params->ith == 0);
8537
8682
  GGML_ASSERT(ggml_are_same_shape(src0, dst));
8538
8683
 
8539
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
8684
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
8540
8685
  return;
8541
8686
  }
8542
8687
 
@@ -8582,7 +8727,7 @@ static void ggml_compute_forward_sum_f32(
8582
8727
  assert(params->ith == 0);
8583
8728
  assert(ggml_is_scalar(dst));
8584
8729
 
8585
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
8730
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
8586
8731
  return;
8587
8732
  }
8588
8733
 
@@ -8617,7 +8762,7 @@ static void ggml_compute_forward_sum_f16(
8617
8762
  assert(params->ith == 0);
8618
8763
  assert(ggml_is_scalar(dst));
8619
8764
 
8620
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
8765
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
8621
8766
  return;
8622
8767
  }
8623
8768
 
@@ -8674,7 +8819,7 @@ static void ggml_compute_forward_sum_rows_f32(
8674
8819
 
8675
8820
  GGML_ASSERT(params->ith == 0);
8676
8821
 
8677
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
8822
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
8678
8823
  return;
8679
8824
  }
8680
8825
 
@@ -8729,7 +8874,7 @@ static void ggml_compute_forward_mean_f32(
8729
8874
 
8730
8875
  assert(params->ith == 0);
8731
8876
 
8732
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
8877
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
8733
8878
  return;
8734
8879
  }
8735
8880
 
@@ -8788,7 +8933,7 @@ static void ggml_compute_forward_argmax_f32(
8788
8933
 
8789
8934
  assert(params->ith == 0);
8790
8935
 
8791
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
8936
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
8792
8937
  return;
8793
8938
  }
8794
8939
 
@@ -8839,7 +8984,7 @@ static void ggml_compute_forward_repeat_f32(
8839
8984
  GGML_ASSERT(params->ith == 0);
8840
8985
  GGML_ASSERT(ggml_can_repeat(src0, dst));
8841
8986
 
8842
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
8987
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
8843
8988
  return;
8844
8989
  }
8845
8990
 
@@ -8884,7 +9029,7 @@ static void ggml_compute_forward_repeat_f16(
8884
9029
  GGML_ASSERT(params->ith == 0);
8885
9030
  GGML_ASSERT(ggml_can_repeat(src0, dst));
8886
9031
 
8887
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
9032
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
8888
9033
  return;
8889
9034
  }
8890
9035
 
@@ -8958,7 +9103,7 @@ static void ggml_compute_forward_repeat_back_f32(
8958
9103
  GGML_ASSERT(params->ith == 0);
8959
9104
  GGML_ASSERT(ggml_can_repeat(dst, src0));
8960
9105
 
8961
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
9106
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
8962
9107
  return;
8963
9108
  }
8964
9109
 
@@ -9035,7 +9180,7 @@ static void ggml_compute_forward_concat_f32(
9035
9180
  const struct ggml_tensor * src0 = dst->src[0];
9036
9181
  const struct ggml_tensor * src1 = dst->src[1];
9037
9182
 
9038
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
9183
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
9039
9184
  return;
9040
9185
  }
9041
9186
 
@@ -9107,7 +9252,7 @@ static void ggml_compute_forward_abs_f32(
9107
9252
  assert(params->ith == 0);
9108
9253
  assert(ggml_are_same_shape(src0, dst));
9109
9254
 
9110
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
9255
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
9111
9256
  return;
9112
9257
  }
9113
9258
 
@@ -9153,7 +9298,7 @@ static void ggml_compute_forward_sgn_f32(
9153
9298
  assert(params->ith == 0);
9154
9299
  assert(ggml_are_same_shape(src0, dst));
9155
9300
 
9156
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
9301
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
9157
9302
  return;
9158
9303
  }
9159
9304
 
@@ -9199,7 +9344,7 @@ static void ggml_compute_forward_neg_f32(
9199
9344
  assert(params->ith == 0);
9200
9345
  assert(ggml_are_same_shape(src0, dst));
9201
9346
 
9202
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
9347
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
9203
9348
  return;
9204
9349
  }
9205
9350
 
@@ -9245,7 +9390,7 @@ static void ggml_compute_forward_step_f32(
9245
9390
  assert(params->ith == 0);
9246
9391
  assert(ggml_are_same_shape(src0, dst));
9247
9392
 
9248
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
9393
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
9249
9394
  return;
9250
9395
  }
9251
9396
 
@@ -9291,7 +9436,7 @@ static void ggml_compute_forward_tanh_f32(
9291
9436
  assert(params->ith == 0);
9292
9437
  assert(ggml_are_same_shape(src0, dst));
9293
9438
 
9294
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
9439
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
9295
9440
  return;
9296
9441
  }
9297
9442
 
@@ -9337,7 +9482,7 @@ static void ggml_compute_forward_elu_f32(
9337
9482
  assert(params->ith == 0);
9338
9483
  assert(ggml_are_same_shape(src0, dst));
9339
9484
 
9340
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
9485
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
9341
9486
  return;
9342
9487
  }
9343
9488
 
@@ -9383,7 +9528,7 @@ static void ggml_compute_forward_relu_f32(
9383
9528
  assert(params->ith == 0);
9384
9529
  assert(ggml_are_same_shape(src0, dst));
9385
9530
 
9386
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
9531
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
9387
9532
  return;
9388
9533
  }
9389
9534
 
@@ -9430,7 +9575,7 @@ static void ggml_compute_forward_gelu_f32(
9430
9575
  GGML_ASSERT(ggml_is_contiguous_except_dim_1(dst));
9431
9576
  GGML_ASSERT(ggml_are_same_shape(src0, dst));
9432
9577
 
9433
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
9578
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
9434
9579
  return;
9435
9580
  }
9436
9581
 
@@ -9493,7 +9638,7 @@ static void ggml_compute_forward_gelu_quick_f32(
9493
9638
  GGML_ASSERT(ggml_is_contiguous_except_dim_1(dst));
9494
9639
  GGML_ASSERT(ggml_are_same_shape(src0, dst));
9495
9640
 
9496
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
9641
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
9497
9642
  return;
9498
9643
  }
9499
9644
 
@@ -9556,7 +9701,7 @@ static void ggml_compute_forward_silu_f32(
9556
9701
  GGML_ASSERT(ggml_is_contiguous_except_dim_1(dst));
9557
9702
  GGML_ASSERT(ggml_are_same_shape(src0, dst));
9558
9703
 
9559
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
9704
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
9560
9705
  return;
9561
9706
  }
9562
9707
 
@@ -9617,7 +9762,7 @@ static void ggml_compute_forward_leaky_relu_f32(
9617
9762
  assert(params->ith == 0);
9618
9763
  assert(ggml_are_same_shape(src0, dst));
9619
9764
 
9620
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
9765
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
9621
9766
  return;
9622
9767
  }
9623
9768
 
@@ -9670,7 +9815,7 @@ static void ggml_compute_forward_silu_back_f32(
9670
9815
  GGML_ASSERT(ggml_are_same_shape(src0, dst));
9671
9816
  GGML_ASSERT(ggml_are_same_shape(src0, grad));
9672
9817
 
9673
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
9818
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
9674
9819
  return;
9675
9820
  }
9676
9821
 
@@ -9732,7 +9877,7 @@ static void ggml_compute_forward_hardswish_f32(
9732
9877
  assert(params->ith == 0);
9733
9878
  assert(ggml_are_same_shape(src0, dst));
9734
9879
 
9735
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
9880
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
9736
9881
  return;
9737
9882
  }
9738
9883
 
@@ -9775,7 +9920,7 @@ static void ggml_compute_forward_hardsigmoid_f32(
9775
9920
  assert(params->ith == 0);
9776
9921
  assert(ggml_are_same_shape(src0, dst));
9777
9922
 
9778
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
9923
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
9779
9924
  return;
9780
9925
  }
9781
9926
 
@@ -9821,7 +9966,7 @@ static void ggml_compute_forward_norm_f32(
9821
9966
 
9822
9967
  GGML_ASSERT(ggml_are_same_shape(src0, dst));
9823
9968
 
9824
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
9969
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
9825
9970
  return;
9826
9971
  }
9827
9972
 
@@ -9896,7 +10041,7 @@ static void ggml_compute_forward_rms_norm_f32(
9896
10041
 
9897
10042
  GGML_ASSERT(ggml_are_same_shape(src0, dst));
9898
10043
 
9899
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
10044
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
9900
10045
  return;
9901
10046
  }
9902
10047
 
@@ -9967,7 +10112,7 @@ static void ggml_compute_forward_rms_norm_back_f32(
9967
10112
 
9968
10113
  GGML_ASSERT(ggml_are_same_shape(src0, dst) && ggml_are_same_shape(src0, src1));
9969
10114
 
9970
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
10115
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
9971
10116
  return;
9972
10117
  }
9973
10118
 
@@ -10145,7 +10290,7 @@ static void ggml_compute_forward_group_norm_f32(
10145
10290
 
10146
10291
  GGML_ASSERT(ggml_are_same_shape(src0, dst));
10147
10292
 
10148
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
10293
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
10149
10294
  return;
10150
10295
  }
10151
10296
 
@@ -10163,7 +10308,7 @@ static void ggml_compute_forward_group_norm_f32(
10163
10308
  int n_channels = src0->ne[2];
10164
10309
  int n_groups = dst->op_params[0];
10165
10310
  int n_channels_per_group = (n_channels + n_groups - 1) / n_groups;
10166
- for (int i = ith; i < n_groups; i+=nth) {
10311
+ for (int i = ith; i < n_groups; i += nth) {
10167
10312
  int start = i * n_channels_per_group;
10168
10313
  int end = start + n_channels_per_group;
10169
10314
  if (end > n_channels) {
@@ -10177,28 +10322,32 @@ static void ggml_compute_forward_group_norm_f32(
10177
10322
  for (int64_t i01 = 0; i01 < ne01; i01++) {
10178
10323
  const float * x = (float *)((char *) src0->data + i01 * nb01 + i02 * nb02 + i03 * nb03);
10179
10324
 
10325
+ ggml_float sumr = 0.0;
10180
10326
  for (int64_t i00 = 0; i00 < ne00; i00++) {
10181
- sum += (ggml_float)x[i00];
10327
+ sumr += (ggml_float)x[i00];
10182
10328
  }
10329
+ sum += sumr;
10183
10330
  }
10184
10331
  }
10185
- float mean = sum / (ne00 * ne01 * step);
10186
- ggml_float sum2 = 0.0;
10332
+ const float mean = sum / (ne00 * ne01 * step);
10187
10333
 
10334
+ ggml_float sum2 = 0.0;
10188
10335
  for (int64_t i02 = start; i02 < end; i02++) {
10189
10336
  for (int64_t i01 = 0; i01 < ne01; i01++) {
10190
10337
  const float * x = (float *)((char *) src0->data + i01 * nb01 + i02 * nb02 + i03 * nb03);
10191
10338
 
10192
10339
  float * y = (float *)((char *) dst->data + i01 * nb1 + i02 * nb2 + i03 * nb3);
10193
10340
 
10341
+ ggml_float sumr = 0.0;
10194
10342
  for (int64_t i00 = 0; i00 < ne00; i00++) {
10195
10343
  float v = x[i00] - mean;
10196
10344
  y[i00] = v;
10197
- sum2 += (ggml_float)(v * v);
10345
+ sumr += (ggml_float)(v * v);
10198
10346
  }
10347
+ sum2 += sumr;
10199
10348
  }
10200
10349
  }
10201
- float variance = sum2 / (ne00 * ne01 * step);
10350
+ const float variance = sum2 / (ne00 * ne01 * step);
10202
10351
  const float scale = 1.0f / sqrtf(variance + eps);
10203
10352
 
10204
10353
  for (int64_t i02 = start; i02 < end; i02++) {
@@ -10312,7 +10461,7 @@ static void ggml_compute_forward_mul_mat(
10312
10461
 
10313
10462
  #if defined(GGML_USE_CLBLAST)
10314
10463
  if (ggml_cl_can_mul_mat(src0, src1, dst)) {
10315
- if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) {
10464
+ if (params->ith == 0 && params->type == GGML_TASK_TYPE_COMPUTE) {
10316
10465
  ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize);
10317
10466
  }
10318
10467
  return;
@@ -10325,7 +10474,7 @@ static void ggml_compute_forward_mul_mat(
10325
10474
  const size_t desired_wsize = ne13*ne12*ne_plane*sizeof(float);
10326
10475
  UNUSED(desired_wsize);
10327
10476
 
10328
- if (params->type == GGML_TASK_INIT) {
10477
+ if (params->type == GGML_TASK_TYPE_INIT) {
10329
10478
  if (type != GGML_TYPE_F32) {
10330
10479
  assert(params->wsize >= desired_wsize);
10331
10480
  // parallelize by src0 rows
@@ -10348,7 +10497,7 @@ static void ggml_compute_forward_mul_mat(
10348
10497
  return;
10349
10498
  }
10350
10499
 
10351
- if (params->type == GGML_TASK_FINALIZE) {
10500
+ if (params->type == GGML_TASK_TYPE_FINALIZE) {
10352
10501
  return;
10353
10502
  }
10354
10503
 
@@ -10386,7 +10535,7 @@ static void ggml_compute_forward_mul_mat(
10386
10535
  }
10387
10536
  #endif
10388
10537
 
10389
- if (params->type == GGML_TASK_INIT) {
10538
+ if (params->type == GGML_TASK_TYPE_INIT) {
10390
10539
  if (ith != 0) {
10391
10540
  return;
10392
10541
  }
@@ -10410,7 +10559,7 @@ static void ggml_compute_forward_mul_mat(
10410
10559
  return;
10411
10560
  }
10412
10561
 
10413
- if (params->type == GGML_TASK_FINALIZE) {
10562
+ if (params->type == GGML_TASK_TYPE_FINALIZE) {
10414
10563
  return;
10415
10564
  }
10416
10565
 
@@ -10567,7 +10716,7 @@ static void ggml_compute_forward_mul_mat_id(
10567
10716
 
10568
10717
  #define MMID_MATRIX_ROW(row_id, i1) matrix_rows[(row_id)*ne11 + (i1)]
10569
10718
 
10570
- if (params->type == GGML_TASK_INIT) {
10719
+ if (params->type == GGML_TASK_TYPE_INIT) {
10571
10720
  if (ith != 0) {
10572
10721
  return;
10573
10722
  }
@@ -10604,7 +10753,7 @@ static void ggml_compute_forward_mul_mat_id(
10604
10753
  return;
10605
10754
  }
10606
10755
 
10607
- if (params->type == GGML_TASK_FINALIZE) {
10756
+ if (params->type == GGML_TASK_TYPE_FINALIZE) {
10608
10757
  return;
10609
10758
  }
10610
10759
 
@@ -10752,7 +10901,7 @@ static void ggml_compute_forward_out_prod_f32(
10752
10901
  (ggml_is_contiguous(src1) || ggml_is_transposed(src1));
10753
10902
  #endif
10754
10903
 
10755
- if (params->type == GGML_TASK_INIT) {
10904
+ if (params->type == GGML_TASK_TYPE_INIT) {
10756
10905
  #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) // gemm beta will zero dst
10757
10906
  if (use_blas) {
10758
10907
  return;
@@ -10765,7 +10914,7 @@ static void ggml_compute_forward_out_prod_f32(
10765
10914
  return;
10766
10915
  }
10767
10916
 
10768
- if (params->type == GGML_TASK_FINALIZE) {
10917
+ if (params->type == GGML_TASK_TYPE_FINALIZE) {
10769
10918
  return;
10770
10919
  }
10771
10920
 
@@ -10945,7 +11094,7 @@ static void ggml_compute_forward_out_prod_q_f32(
10945
11094
  // TODO: #if defined(GGML_USE_CUBLAS) ggml_cuda_out_prod
10946
11095
  // TODO: #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
10947
11096
 
10948
- if (params->type == GGML_TASK_INIT) {
11097
+ if (params->type == GGML_TASK_TYPE_INIT) {
10949
11098
  if (ith != 0) {
10950
11099
  return;
10951
11100
  }
@@ -10953,7 +11102,7 @@ static void ggml_compute_forward_out_prod_q_f32(
10953
11102
  return;
10954
11103
  }
10955
11104
 
10956
- if (params->type == GGML_TASK_FINALIZE) {
11105
+ if (params->type == GGML_TASK_TYPE_FINALIZE) {
10957
11106
  return;
10958
11107
  }
10959
11108
 
@@ -11039,6 +11188,9 @@ static void ggml_compute_forward_out_prod(
11039
11188
  case GGML_TYPE_IQ3_XXS:
11040
11189
  case GGML_TYPE_IQ1_S:
11041
11190
  case GGML_TYPE_IQ4_NL:
11191
+ case GGML_TYPE_IQ4_XS:
11192
+ case GGML_TYPE_IQ3_S:
11193
+ case GGML_TYPE_IQ2_S:
11042
11194
  {
11043
11195
  ggml_compute_forward_out_prod_q_f32(params, dst);
11044
11196
  } break;
@@ -11070,7 +11222,7 @@ static void ggml_compute_forward_scale_f32(
11070
11222
  GGML_ASSERT(ggml_is_contiguous(dst));
11071
11223
  GGML_ASSERT(ggml_are_same_shape(src0, dst));
11072
11224
 
11073
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
11225
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
11074
11226
  return;
11075
11227
  }
11076
11228
 
@@ -11142,7 +11294,7 @@ static void ggml_compute_forward_set_f32(
11142
11294
  size_t offset = ((int32_t *) dst->op_params)[3];
11143
11295
  bool inplace = (bool) ((int32_t *) dst->op_params)[4];
11144
11296
 
11145
- if (!inplace && (params->type == GGML_TASK_INIT)) {
11297
+ if (!inplace && (params->type == GGML_TASK_TYPE_INIT)) {
11146
11298
  if (params->ith != 0) {
11147
11299
  return;
11148
11300
  }
@@ -11154,7 +11306,7 @@ static void ggml_compute_forward_set_f32(
11154
11306
  ggml_nbytes(dst));
11155
11307
  }
11156
11308
 
11157
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
11309
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
11158
11310
  return;
11159
11311
  }
11160
11312
 
@@ -11227,6 +11379,9 @@ static void ggml_compute_forward_set(
11227
11379
  case GGML_TYPE_IQ3_XXS:
11228
11380
  case GGML_TYPE_IQ1_S:
11229
11381
  case GGML_TYPE_IQ4_NL:
11382
+ case GGML_TYPE_IQ4_XS:
11383
+ case GGML_TYPE_IQ3_S:
11384
+ case GGML_TYPE_IQ2_S:
11230
11385
  default:
11231
11386
  {
11232
11387
  GGML_ASSERT(false);
@@ -11301,7 +11456,7 @@ static void ggml_compute_forward_get_rows_q(
11301
11456
 
11302
11457
  assert(params->ith == 0);
11303
11458
 
11304
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
11459
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
11305
11460
  return;
11306
11461
  }
11307
11462
 
@@ -11341,7 +11496,7 @@ static void ggml_compute_forward_get_rows_f16(
11341
11496
 
11342
11497
  assert(params->ith == 0);
11343
11498
 
11344
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
11499
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
11345
11500
  return;
11346
11501
  }
11347
11502
 
@@ -11378,7 +11533,7 @@ static void ggml_compute_forward_get_rows_f32(
11378
11533
 
11379
11534
  assert(params->ith == 0);
11380
11535
 
11381
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
11536
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
11382
11537
  return;
11383
11538
  }
11384
11539
 
@@ -11429,6 +11584,9 @@ static void ggml_compute_forward_get_rows(
11429
11584
  case GGML_TYPE_IQ3_XXS:
11430
11585
  case GGML_TYPE_IQ1_S:
11431
11586
  case GGML_TYPE_IQ4_NL:
11587
+ case GGML_TYPE_IQ4_XS:
11588
+ case GGML_TYPE_IQ3_S:
11589
+ case GGML_TYPE_IQ2_S:
11432
11590
  {
11433
11591
  ggml_compute_forward_get_rows_q(params, dst);
11434
11592
  } break;
@@ -11480,14 +11638,14 @@ static void ggml_compute_forward_get_rows_back_f32_f16(
11480
11638
 
11481
11639
  // ggml_compute_forward_dup_same_cont(params, opt0, dst);
11482
11640
 
11483
- if (params->type == GGML_TASK_INIT) {
11641
+ if (params->type == GGML_TASK_TYPE_INIT) {
11484
11642
  if (params->ith != 0) {
11485
11643
  return;
11486
11644
  }
11487
11645
  memset(dst->data, 0, ggml_nbytes(dst));
11488
11646
  }
11489
11647
 
11490
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
11648
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
11491
11649
  return;
11492
11650
  }
11493
11651
 
@@ -11519,14 +11677,14 @@ static void ggml_compute_forward_get_rows_back_f32(
11519
11677
 
11520
11678
  // ggml_compute_forward_dup_same_cont(params, opt0, dst);
11521
11679
 
11522
- if (params->type == GGML_TASK_INIT) {
11680
+ if (params->type == GGML_TASK_TYPE_INIT) {
11523
11681
  if (params->ith != 0) {
11524
11682
  return;
11525
11683
  }
11526
11684
  memset(dst->data, 0, ggml_nbytes(dst));
11527
11685
  }
11528
11686
 
11529
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
11687
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
11530
11688
  return;
11531
11689
  }
11532
11690
 
@@ -11596,7 +11754,7 @@ static void ggml_compute_forward_diag_f32(
11596
11754
 
11597
11755
  GGML_ASSERT(params->ith == 0);
11598
11756
 
11599
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
11757
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
11600
11758
  return;
11601
11759
  }
11602
11760
 
@@ -11665,7 +11823,7 @@ static void ggml_compute_forward_diag_mask_f32(
11665
11823
 
11666
11824
  GGML_ASSERT(n_past >= 0);
11667
11825
 
11668
- if (!inplace && (params->type == GGML_TASK_INIT)) {
11826
+ if (!inplace && (params->type == GGML_TASK_TYPE_INIT)) {
11669
11827
  if (ith != 0) {
11670
11828
  return;
11671
11829
  }
@@ -11679,7 +11837,7 @@ static void ggml_compute_forward_diag_mask_f32(
11679
11837
  ggml_nbytes(dst));
11680
11838
  }
11681
11839
 
11682
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
11840
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
11683
11841
  return;
11684
11842
  }
11685
11843
 
@@ -11753,7 +11911,7 @@ static void ggml_compute_forward_soft_max_f32(
11753
11911
  assert(ggml_is_contiguous(dst));
11754
11912
  assert(ggml_are_same_shape(src0, dst));
11755
11913
 
11756
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
11914
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
11757
11915
  return;
11758
11916
  }
11759
11917
 
@@ -11891,7 +12049,7 @@ static void ggml_compute_forward_soft_max_back_f32(
11891
12049
  GGML_ASSERT(ggml_are_same_shape(src0, dst));
11892
12050
  GGML_ASSERT(ggml_are_same_shape(src1, dst));
11893
12051
 
11894
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
12052
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
11895
12053
  return;
11896
12054
  }
11897
12055
 
@@ -11985,7 +12143,7 @@ static void ggml_compute_forward_alibi_f32(
11985
12143
 
11986
12144
  assert(params->ith == 0);
11987
12145
 
11988
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
12146
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
11989
12147
  return;
11990
12148
  }
11991
12149
 
@@ -12044,7 +12202,7 @@ static void ggml_compute_forward_alibi_f16(
12044
12202
 
12045
12203
  assert(params->ith == 0);
12046
12204
 
12047
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
12205
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
12048
12206
  return;
12049
12207
  }
12050
12208
 
@@ -12129,6 +12287,9 @@ static void ggml_compute_forward_alibi(
12129
12287
  case GGML_TYPE_IQ3_XXS:
12130
12288
  case GGML_TYPE_IQ1_S:
12131
12289
  case GGML_TYPE_IQ4_NL:
12290
+ case GGML_TYPE_IQ4_XS:
12291
+ case GGML_TYPE_IQ3_S:
12292
+ case GGML_TYPE_IQ2_S:
12132
12293
  case GGML_TYPE_Q8_K:
12133
12294
  case GGML_TYPE_I8:
12134
12295
  case GGML_TYPE_I16:
@@ -12150,7 +12311,7 @@ static void ggml_compute_forward_clamp_f32(
12150
12311
 
12151
12312
  assert(params->ith == 0);
12152
12313
 
12153
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
12314
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
12154
12315
  return;
12155
12316
  }
12156
12317
 
@@ -12212,6 +12373,9 @@ static void ggml_compute_forward_clamp(
12212
12373
  case GGML_TYPE_IQ3_XXS:
12213
12374
  case GGML_TYPE_IQ1_S:
12214
12375
  case GGML_TYPE_IQ4_NL:
12376
+ case GGML_TYPE_IQ4_XS:
12377
+ case GGML_TYPE_IQ3_S:
12378
+ case GGML_TYPE_IQ2_S:
12215
12379
  case GGML_TYPE_Q8_K:
12216
12380
  case GGML_TYPE_I8:
12217
12381
  case GGML_TYPE_I16:
@@ -12289,7 +12453,7 @@ static void ggml_compute_forward_rope_f32(
12289
12453
  const struct ggml_tensor * src0 = dst->src[0];
12290
12454
  const struct ggml_tensor * src1 = dst->src[1];
12291
12455
 
12292
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
12456
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
12293
12457
  return;
12294
12458
  }
12295
12459
 
@@ -12467,7 +12631,7 @@ static void ggml_compute_forward_rope_f16(
12467
12631
  const struct ggml_tensor * src0 = dst->src[0];
12468
12632
  const struct ggml_tensor * src1 = dst->src[1];
12469
12633
 
12470
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
12634
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
12471
12635
  return;
12472
12636
  }
12473
12637
 
@@ -12698,7 +12862,7 @@ static void ggml_compute_forward_conv_transpose_1d_f16_f32(
12698
12862
  GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
12699
12863
  GGML_ASSERT(nb10 == sizeof(float));
12700
12864
 
12701
- if (params->type == GGML_TASK_INIT) {
12865
+ if (params->type == GGML_TASK_TYPE_INIT) {
12702
12866
  if (ith != 0) {
12703
12867
  return;
12704
12868
  }
@@ -12738,7 +12902,7 @@ static void ggml_compute_forward_conv_transpose_1d_f16_f32(
12738
12902
  return;
12739
12903
  }
12740
12904
 
12741
- if (params->type == GGML_TASK_FINALIZE) {
12905
+ if (params->type == GGML_TASK_TYPE_FINALIZE) {
12742
12906
  return;
12743
12907
  }
12744
12908
 
@@ -12797,7 +12961,7 @@ static void ggml_compute_forward_conv_transpose_1d_f32(
12797
12961
  GGML_ASSERT(nb00 == sizeof(float));
12798
12962
  GGML_ASSERT(nb10 == sizeof(float));
12799
12963
 
12800
- if (params->type == GGML_TASK_INIT) {
12964
+ if (params->type == GGML_TASK_TYPE_INIT) {
12801
12965
  if (ith != 0) {
12802
12966
  return;
12803
12967
  }
@@ -12837,7 +13001,7 @@ static void ggml_compute_forward_conv_transpose_1d_f32(
12837
13001
  return;
12838
13002
  }
12839
13003
 
12840
- if (params->type == GGML_TASK_FINALIZE) {
13004
+ if (params->type == GGML_TASK_TYPE_FINALIZE) {
12841
13005
  return;
12842
13006
  }
12843
13007
 
@@ -12941,11 +13105,11 @@ static void ggml_compute_forward_im2col_f32(
12941
13105
  GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
12942
13106
  GGML_ASSERT(nb10 == sizeof(float));
12943
13107
 
12944
- if (params->type == GGML_TASK_INIT) {
13108
+ if (params->type == GGML_TASK_TYPE_INIT) {
12945
13109
  return;
12946
13110
  }
12947
13111
 
12948
- if (params->type == GGML_TASK_FINALIZE) {
13112
+ if (params->type == GGML_TASK_TYPE_FINALIZE) {
12949
13113
  return;
12950
13114
  }
12951
13115
 
@@ -13029,11 +13193,11 @@ static void ggml_compute_forward_im2col_f16(
13029
13193
  GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
13030
13194
  GGML_ASSERT(nb10 == sizeof(float));
13031
13195
 
13032
- if (params->type == GGML_TASK_INIT) {
13196
+ if (params->type == GGML_TASK_TYPE_INIT) {
13033
13197
  return;
13034
13198
  }
13035
13199
 
13036
- if (params->type == GGML_TASK_FINALIZE) {
13200
+ if (params->type == GGML_TASK_TYPE_FINALIZE) {
13037
13201
  return;
13038
13202
  }
13039
13203
 
@@ -13115,7 +13279,7 @@ static void ggml_compute_forward_conv_transpose_2d(
13115
13279
  GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
13116
13280
  GGML_ASSERT(nb10 == sizeof(float));
13117
13281
 
13118
- if (params->type == GGML_TASK_INIT) {
13282
+ if (params->type == GGML_TASK_TYPE_INIT) {
13119
13283
  if (ith != 0) {
13120
13284
  return;
13121
13285
  }
@@ -13157,7 +13321,7 @@ static void ggml_compute_forward_conv_transpose_2d(
13157
13321
  return;
13158
13322
  }
13159
13323
 
13160
- if (params->type == GGML_TASK_FINALIZE) {
13324
+ if (params->type == GGML_TASK_TYPE_FINALIZE) {
13161
13325
  return;
13162
13326
  }
13163
13327
 
@@ -13209,7 +13373,7 @@ static void ggml_compute_forward_pool_1d_sk_p0(
13209
13373
  assert(src->type == GGML_TYPE_F32);
13210
13374
  assert(params->ith == 0);
13211
13375
 
13212
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
13376
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
13213
13377
  return;
13214
13378
  }
13215
13379
 
@@ -13278,7 +13442,7 @@ static void ggml_compute_forward_pool_2d(
13278
13442
  GGML_ASSERT(src->type == GGML_TYPE_F32);
13279
13443
  GGML_ASSERT(params->ith == 0);
13280
13444
 
13281
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
13445
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
13282
13446
  return;
13283
13447
  }
13284
13448
 
@@ -13351,7 +13515,7 @@ static void ggml_compute_forward_upscale_f32(
13351
13515
 
13352
13516
  const struct ggml_tensor * src0 = dst->src[0];
13353
13517
 
13354
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
13518
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
13355
13519
  return;
13356
13520
  }
13357
13521
 
@@ -13411,7 +13575,7 @@ static void ggml_compute_forward_pad_f32(
13411
13575
 
13412
13576
  const struct ggml_tensor * src0 = dst->src[0];
13413
13577
 
13414
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
13578
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
13415
13579
  return;
13416
13580
  }
13417
13581
 
@@ -13464,6 +13628,106 @@ static void ggml_compute_forward_pad(
13464
13628
  }
13465
13629
  }
13466
13630
 
13631
+
13632
+ // ggml_compute_forward_arange
13633
+
13634
+ static void ggml_compute_forward_arange_f32(
13635
+ const struct ggml_compute_params * params,
13636
+ struct ggml_tensor * dst) {
13637
+
13638
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
13639
+ return;
13640
+ }
13641
+
13642
+ GGML_ASSERT(dst->nb[0] == sizeof(float));
13643
+
13644
+ const int ith = params->ith;
13645
+ const int nth = params->nth;
13646
+
13647
+ const float start = ggml_get_op_params_f32(dst, 0);
13648
+ const float stop = ggml_get_op_params_f32(dst, 1);
13649
+ const float step = ggml_get_op_params_f32(dst, 2);
13650
+
13651
+ const int64_t steps = (int64_t) ceilf((stop - start) / step);
13652
+
13653
+ GGML_ASSERT(ggml_nelements(dst) == steps);
13654
+
13655
+ for (int64_t i = ith; i < steps; i+= nth) {
13656
+ float value = start + step * i;
13657
+ ((float *)dst->data)[i] = value;
13658
+ }
13659
+ }
13660
+
13661
+ static void ggml_compute_forward_arange(
13662
+ const struct ggml_compute_params * params,
13663
+ struct ggml_tensor * dst) {
13664
+ switch (dst->type) {
13665
+ case GGML_TYPE_F32:
13666
+ {
13667
+ ggml_compute_forward_arange_f32(params, dst);
13668
+ } break;
13669
+ default:
13670
+ {
13671
+ GGML_ASSERT(false);
13672
+ } break;
13673
+ }
13674
+ }
13675
+
13676
+ static void ggml_compute_forward_timestep_embedding_f32(
13677
+ const struct ggml_compute_params * params,
13678
+ struct ggml_tensor * dst) {
13679
+
13680
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
13681
+ return;
13682
+ }
13683
+
13684
+ const struct ggml_tensor * src0 = dst->src[0];
13685
+
13686
+ GGML_ASSERT(src0->nb[0] == sizeof(float));
13687
+
13688
+ const int ith = params->ith;
13689
+ const int nth = params->nth;
13690
+
13691
+ GGML_TENSOR_UNARY_OP_LOCALS
13692
+
13693
+ const int dim = ggml_get_op_params_i32(dst, 0);
13694
+ const int max_period = ggml_get_op_params_i32(dst, 1);
13695
+
13696
+ int half = dim / 2;
13697
+
13698
+ for (int64_t i = 0; i < ne00; i++) {
13699
+ float * embed_data = (float *)((char *) dst->data + i*nb1);
13700
+ for (int64_t j = ith; j < half; j += nth) {
13701
+ float timestep = ((float *)src0->data)[i];
13702
+ float freq = (float)expf(-logf(max_period) * j / half);
13703
+ float arg = timestep * freq;
13704
+ embed_data[j] = cosf(arg);
13705
+ embed_data[j + half] = sinf(arg);
13706
+ }
13707
+ if (dim % 2 != 0 && ith == 0) {
13708
+ embed_data[dim] = 0.f;
13709
+ }
13710
+ }
13711
+ }
13712
+
13713
+ static void ggml_compute_forward_timestep_embedding(
13714
+ const struct ggml_compute_params * params,
13715
+ struct ggml_tensor * dst) {
13716
+
13717
+ const struct ggml_tensor * src0 = dst->src[0];
13718
+
13719
+ switch (src0->type) {
13720
+ case GGML_TYPE_F32:
13721
+ {
13722
+ ggml_compute_forward_timestep_embedding_f32(params, dst);
13723
+ } break;
13724
+ default:
13725
+ {
13726
+ GGML_ASSERT(false);
13727
+ } break;
13728
+ }
13729
+ }
13730
+
13467
13731
  // ggml_compute_forward_argsort
13468
13732
 
13469
13733
  static void ggml_compute_forward_argsort_f32(
@@ -13472,7 +13736,7 @@ static void ggml_compute_forward_argsort_f32(
13472
13736
 
13473
13737
  const struct ggml_tensor * src0 = dst->src[0];
13474
13738
 
13475
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
13739
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
13476
13740
  return;
13477
13741
  }
13478
13742
 
@@ -13498,8 +13762,8 @@ static void ggml_compute_forward_argsort_f32(
13498
13762
  // C doesn't have a functional sort, so we do a bubble sort instead
13499
13763
  for (int64_t j = 0; j < ne0; j++) {
13500
13764
  for (int64_t k = j + 1; k < ne0; k++) {
13501
- if ((order == GGML_SORT_ASC && src_data[dst_data[j]] > src_data[dst_data[k]]) ||
13502
- (order == GGML_SORT_DESC && src_data[dst_data[j]] < src_data[dst_data[k]])) {
13765
+ if ((order == GGML_SORT_ORDER_ASC && src_data[dst_data[j]] > src_data[dst_data[k]]) ||
13766
+ (order == GGML_SORT_ORDER_DESC && src_data[dst_data[j]] < src_data[dst_data[k]])) {
13503
13767
  int32_t tmp = dst_data[j];
13504
13768
  dst_data[j] = dst_data[k];
13505
13769
  dst_data[k] = tmp;
@@ -13582,11 +13846,11 @@ static void ggml_compute_forward_flash_attn_f32(
13582
13846
  GGML_ASSERT(nb1 <= nb2);
13583
13847
  GGML_ASSERT(nb2 <= nb3);
13584
13848
 
13585
- if (params->type == GGML_TASK_INIT) {
13849
+ if (params->type == GGML_TASK_TYPE_INIT) {
13586
13850
  return;
13587
13851
  }
13588
13852
 
13589
- if (params->type == GGML_TASK_FINALIZE) {
13853
+ if (params->type == GGML_TASK_TYPE_FINALIZE) {
13590
13854
  return;
13591
13855
  }
13592
13856
 
@@ -13774,11 +14038,11 @@ static void ggml_compute_forward_flash_attn_f16(
13774
14038
  GGML_ASSERT(nb1 <= nb2);
13775
14039
  GGML_ASSERT(nb2 <= nb3);
13776
14040
 
13777
- if (params->type == GGML_TASK_INIT) {
14041
+ if (params->type == GGML_TASK_TYPE_INIT) {
13778
14042
  return;
13779
14043
  }
13780
14044
 
13781
- if (params->type == GGML_TASK_FINALIZE) {
14045
+ if (params->type == GGML_TASK_TYPE_FINALIZE) {
13782
14046
  return;
13783
14047
  }
13784
14048
 
@@ -14033,11 +14297,11 @@ static void ggml_compute_forward_flash_ff_f16(
14033
14297
  GGML_ASSERT(nb1 <= nb2);
14034
14298
  GGML_ASSERT(nb2 <= nb3);
14035
14299
 
14036
- if (params->type == GGML_TASK_INIT) {
14300
+ if (params->type == GGML_TASK_TYPE_INIT) {
14037
14301
  return;
14038
14302
  }
14039
14303
 
14040
- if (params->type == GGML_TASK_FINALIZE) {
14304
+ if (params->type == GGML_TASK_TYPE_FINALIZE) {
14041
14305
  return;
14042
14306
  }
14043
14307
 
@@ -14192,14 +14456,14 @@ static void ggml_compute_forward_flash_attn_back_f32(
14192
14456
  GGML_ASSERT(nb1 <= nb2);
14193
14457
  GGML_ASSERT(nb2 <= nb3);
14194
14458
 
14195
- if (params->type == GGML_TASK_INIT) {
14459
+ if (params->type == GGML_TASK_TYPE_INIT) {
14196
14460
  if (ith == 0) {
14197
14461
  memset(dst->data, 0, nb0*ne0*ne1*ne2*ne3);
14198
14462
  }
14199
14463
  return;
14200
14464
  }
14201
14465
 
14202
- if (params->type == GGML_TASK_FINALIZE) {
14466
+ if (params->type == GGML_TASK_TYPE_FINALIZE) {
14203
14467
  return;
14204
14468
  }
14205
14469
 
@@ -14515,7 +14779,7 @@ static void ggml_compute_forward_win_part_f32(
14515
14779
 
14516
14780
  const struct ggml_tensor * src0 = dst->src[0];
14517
14781
 
14518
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
14782
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
14519
14783
  return;
14520
14784
  }
14521
14785
 
@@ -14581,7 +14845,7 @@ static void ggml_compute_forward_win_unpart_f32(
14581
14845
 
14582
14846
  const struct ggml_tensor * src0 = dst->src[0];
14583
14847
 
14584
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
14848
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
14585
14849
  return;
14586
14850
  }
14587
14851
 
@@ -14709,7 +14973,7 @@ static void ggml_compute_forward_get_rel_pos_f16(
14709
14973
 
14710
14974
  const struct ggml_tensor * src0 = dst->src[0];
14711
14975
 
14712
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
14976
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
14713
14977
  return;
14714
14978
  }
14715
14979
 
@@ -14761,14 +15025,14 @@ static void ggml_compute_forward_add_rel_pos_f32(
14761
15025
  const struct ggml_tensor * src2 = dst->src[2];
14762
15026
 
14763
15027
  const bool inplace = (bool) ((int32_t *) dst->op_params)[0];
14764
- if (!inplace && params->type == GGML_TASK_INIT) {
15028
+ if (!inplace && params->type == GGML_TASK_TYPE_INIT) {
14765
15029
  if (params->ith != 0) {
14766
15030
  return;
14767
15031
  }
14768
15032
  memcpy((char *) dst->data, (char *) src0->data, ggml_nbytes(dst));
14769
15033
  return;
14770
15034
  }
14771
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
15035
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
14772
15036
  return;
14773
15037
  }
14774
15038
 
@@ -14850,7 +15114,7 @@ static void ggml_compute_forward_map_unary_f32(
14850
15114
 
14851
15115
  GGML_ASSERT(ggml_are_same_shape(src0, dst));
14852
15116
 
14853
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
15117
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
14854
15118
  return;
14855
15119
  }
14856
15120
 
@@ -14899,7 +15163,7 @@ static void ggml_compute_forward_map_binary_f32(
14899
15163
  assert(params->ith == 0);
14900
15164
  assert(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));
14901
15165
 
14902
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
15166
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
14903
15167
  return;
14904
15168
  }
14905
15169
 
@@ -14948,7 +15212,7 @@ static void ggml_compute_forward_map_custom1_f32(
14948
15212
 
14949
15213
  assert(params->ith == 0);
14950
15214
 
14951
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
15215
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
14952
15216
  return;
14953
15217
  }
14954
15218
 
@@ -14967,7 +15231,7 @@ static void ggml_compute_forward_map_custom2_f32(
14967
15231
 
14968
15232
  assert(params->ith == 0);
14969
15233
 
14970
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
15234
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
14971
15235
  return;
14972
15236
  }
14973
15237
 
@@ -14987,7 +15251,7 @@ static void ggml_compute_forward_map_custom3_f32(
14987
15251
 
14988
15252
  assert(params->ith == 0);
14989
15253
 
14990
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
15254
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
14991
15255
  return;
14992
15256
  }
14993
15257
 
@@ -15002,13 +15266,14 @@ static void ggml_compute_forward_map_custom1(
15002
15266
 
15003
15267
  const struct ggml_tensor * a = dst->src[0];
15004
15268
 
15005
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
15269
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
15006
15270
  return;
15007
15271
  }
15008
15272
 
15009
- struct ggml_map_custom1_op_params * p = (struct ggml_map_custom1_op_params *) dst->op_params;
15273
+ struct ggml_map_custom1_op_params p;
15274
+ memcpy(&p, dst->op_params, sizeof(p));
15010
15275
 
15011
- p->fun(dst, a, params->ith, params->nth, p->userdata);
15276
+ p.fun(dst, a, params->ith, params->nth, p.userdata);
15012
15277
  }
15013
15278
 
15014
15279
  // ggml_compute_forward_map_custom2
@@ -15020,13 +15285,14 @@ static void ggml_compute_forward_map_custom2(
15020
15285
  const struct ggml_tensor * a = dst->src[0];
15021
15286
  const struct ggml_tensor * b = dst->src[1];
15022
15287
 
15023
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
15288
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
15024
15289
  return;
15025
15290
  }
15026
15291
 
15027
- struct ggml_map_custom2_op_params * p = (struct ggml_map_custom2_op_params *) dst->op_params;
15292
+ struct ggml_map_custom2_op_params p;
15293
+ memcpy(&p, dst->op_params, sizeof(p));
15028
15294
 
15029
- p->fun(dst, a, b, params->ith, params->nth, p->userdata);
15295
+ p.fun(dst, a, b, params->ith, params->nth, p.userdata);
15030
15296
  }
15031
15297
 
15032
15298
  // ggml_compute_forward_map_custom3
@@ -15039,13 +15305,14 @@ static void ggml_compute_forward_map_custom3(
15039
15305
  const struct ggml_tensor * b = dst->src[1];
15040
15306
  const struct ggml_tensor * c = dst->src[2];
15041
15307
 
15042
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
15308
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
15043
15309
  return;
15044
15310
  }
15045
15311
 
15046
- struct ggml_map_custom3_op_params * p = (struct ggml_map_custom3_op_params *) dst->op_params;
15312
+ struct ggml_map_custom3_op_params p;
15313
+ memcpy(&p, dst->op_params, sizeof(p));
15047
15314
 
15048
- p->fun(dst, a, b, c, params->ith, params->nth, p->userdata);
15315
+ p.fun(dst, a, b, c, params->ith, params->nth, p.userdata);
15049
15316
  }
15050
15317
 
15051
15318
  // ggml_compute_forward_cross_entropy_loss
@@ -15073,14 +15340,14 @@ static void ggml_compute_forward_cross_entropy_loss_f32(
15073
15340
 
15074
15341
  GGML_ASSERT(params->wsize >= sizeof(float) * (nth + nth * nc));
15075
15342
 
15076
- if (params->type == GGML_TASK_INIT) {
15343
+ if (params->type == GGML_TASK_TYPE_INIT) {
15077
15344
  if (ith == 0) {
15078
15345
  memset(sums, 0, sizeof(float) * (nth + nth * nc));
15079
15346
  }
15080
15347
  return;
15081
15348
  }
15082
15349
 
15083
- if (params->type == GGML_TASK_FINALIZE) {
15350
+ if (params->type == GGML_TASK_TYPE_FINALIZE) {
15084
15351
  if (ith == 0) {
15085
15352
  float * dp = (float *) dst->data;
15086
15353
  ggml_vec_sum_f32(nth, dp, sums);
@@ -15195,7 +15462,7 @@ static void ggml_compute_forward_cross_entropy_loss_back_f32(
15195
15462
  const int64_t ith = params->ith;
15196
15463
  const int64_t nth = params->nth;
15197
15464
 
15198
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
15465
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
15199
15466
  return;
15200
15467
  }
15201
15468
 
@@ -15302,8 +15569,8 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
15302
15569
  if (skip_cpu) {
15303
15570
  return;
15304
15571
  }
15305
- GGML_ASSERT(tensor->src[0] == NULL || tensor->src[0]->backend == GGML_BACKEND_CPU);
15306
- GGML_ASSERT(tensor->src[1] == NULL || tensor->src[1]->backend == GGML_BACKEND_CPU);
15572
+ GGML_ASSERT(tensor->src[0] == NULL || tensor->src[0]->backend == GGML_BACKEND_TYPE_CPU);
15573
+ GGML_ASSERT(tensor->src[1] == NULL || tensor->src[1]->backend == GGML_BACKEND_TYPE_CPU);
15307
15574
  #elif defined(GGML_USE_VULKAN)
15308
15575
  const bool skip_cpu = ggml_vk_compute_forward_cpu_assist(params, tensor);
15309
15576
  #ifdef GGML_VULKAN_CHECK_RESULTS
@@ -15314,8 +15581,8 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
15314
15581
  if (skip_cpu) {
15315
15582
  return;
15316
15583
  }
15317
- GGML_ASSERT(tensor->src[0] == NULL || tensor->src[0]->backend == GGML_BACKEND_CPU);
15318
- GGML_ASSERT(tensor->src[1] == NULL || tensor->src[1]->backend == GGML_BACKEND_CPU);
15584
+ GGML_ASSERT(tensor->src[0] == NULL || tensor->src[0]->backend == GGML_BACKEND_TYPE_CPU);
15585
+ GGML_ASSERT(tensor->src[1] == NULL || tensor->src[1]->backend == GGML_BACKEND_TYPE_CPU);
15319
15586
  #endif // GGML_USE_CUBLAS
15320
15587
 
15321
15588
  #ifdef GGML_USE_SYCL
@@ -15529,6 +15796,14 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
15529
15796
  {
15530
15797
  ggml_compute_forward_pad(params, tensor);
15531
15798
  } break;
15799
+ case GGML_OP_ARANGE:
15800
+ {
15801
+ ggml_compute_forward_arange(params, tensor);
15802
+ } break;
15803
+ case GGML_OP_TIMESTEP_EMBEDDING:
15804
+ {
15805
+ ggml_compute_forward_timestep_embedding(params, tensor);
15806
+ } break;
15532
15807
  case GGML_OP_ARGSORT:
15533
15808
  {
15534
15809
  ggml_compute_forward_argsort(params, tensor);
@@ -16531,6 +16806,14 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
16531
16806
  {
16532
16807
  GGML_ASSERT(false); // TODO: not implemented
16533
16808
  } break;
16809
+ case GGML_OP_ARANGE:
16810
+ {
16811
+ GGML_ASSERT(false); // TODO: not implemented
16812
+ } break;
16813
+ case GGML_OP_TIMESTEP_EMBEDDING:
16814
+ {
16815
+ GGML_ASSERT(false); // TODO: not implemented
16816
+ } break;
16534
16817
  case GGML_OP_ARGSORT:
16535
16818
  {
16536
16819
  GGML_ASSERT(false); // TODO: not implemented
@@ -16861,7 +17144,7 @@ size_t ggml_graph_overhead(void) {
16861
17144
 
16862
17145
  struct ggml_cgraph * ggml_new_graph_custom(struct ggml_context * ctx, size_t size, bool grads) {
16863
17146
  const size_t obj_size = ggml_graph_nbytes(size, grads);
16864
- struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_GRAPH, obj_size);
17147
+ struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_TYPE_GRAPH, obj_size);
16865
17148
  struct ggml_cgraph * cgraph = (struct ggml_cgraph *) ((char *) ctx->mem_buffer + obj->offs);
16866
17149
 
16867
17150
  struct ggml_tensor ** data_start = (struct ggml_tensor **) (cgraph + 1);
@@ -17131,6 +17414,7 @@ struct ggml_compute_state {
17131
17414
  ggml_thread_t thrd;
17132
17415
  int ith;
17133
17416
  struct ggml_compute_state_shared * shared;
17417
+ enum ggml_status ec;
17134
17418
  };
17135
17419
 
17136
17420
  static void ggml_graph_compute_perf_stats_node(struct ggml_tensor * node, const struct ggml_compute_state_shared * st) {
@@ -17282,6 +17566,14 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
17282
17566
  {
17283
17567
  n_tasks = n_threads;
17284
17568
  } break;
17569
+ case GGML_OP_ARANGE:
17570
+ {
17571
+ n_tasks = n_threads;
17572
+ } break;
17573
+ case GGML_OP_TIMESTEP_EMBEDDING:
17574
+ {
17575
+ n_tasks = n_threads;
17576
+ } break;
17285
17577
  case GGML_OP_ARGSORT:
17286
17578
  {
17287
17579
  n_tasks = n_threads;
@@ -17311,29 +17603,32 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
17311
17603
  } break;
17312
17604
  case GGML_OP_MAP_CUSTOM1:
17313
17605
  {
17314
- struct ggml_map_custom1_op_params * p = (struct ggml_map_custom1_op_params *) node->op_params;
17315
- if (p->n_tasks == GGML_N_TASKS_MAX) {
17606
+ struct ggml_map_custom1_op_params p;
17607
+ memcpy(&p, node->op_params, sizeof(p));
17608
+ if (p.n_tasks == GGML_N_TASKS_MAX) {
17316
17609
  n_tasks = n_threads;
17317
17610
  } else {
17318
- n_tasks = MIN(p->n_tasks, n_threads);
17611
+ n_tasks = MIN(p.n_tasks, n_threads);
17319
17612
  }
17320
17613
  } break;
17321
17614
  case GGML_OP_MAP_CUSTOM2:
17322
17615
  {
17323
- struct ggml_map_custom2_op_params * p = (struct ggml_map_custom2_op_params *) node->op_params;
17324
- if (p->n_tasks == GGML_N_TASKS_MAX) {
17616
+ struct ggml_map_custom2_op_params p;
17617
+ memcpy(&p, node->op_params, sizeof(p));
17618
+ if (p.n_tasks == GGML_N_TASKS_MAX) {
17325
17619
  n_tasks = n_threads;
17326
17620
  } else {
17327
- n_tasks = MIN(p->n_tasks, n_threads);
17621
+ n_tasks = MIN(p.n_tasks, n_threads);
17328
17622
  }
17329
17623
  } break;
17330
17624
  case GGML_OP_MAP_CUSTOM3:
17331
17625
  {
17332
- struct ggml_map_custom3_op_params * p = (struct ggml_map_custom3_op_params *) node->op_params;
17333
- if (p->n_tasks == GGML_N_TASKS_MAX) {
17626
+ struct ggml_map_custom3_op_params p;
17627
+ memcpy(&p, node->op_params, sizeof(p));
17628
+ if (p.n_tasks == GGML_N_TASKS_MAX) {
17334
17629
  n_tasks = n_threads;
17335
17630
  } else {
17336
- n_tasks = MIN(p->n_tasks, n_threads);
17631
+ n_tasks = MIN(p.n_tasks, n_threads);
17337
17632
  }
17338
17633
  } break;
17339
17634
  case GGML_OP_CROSS_ENTROPY_LOSS:
@@ -17408,19 +17703,20 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
17408
17703
  set_numa_thread_affinity(state->ith);
17409
17704
 
17410
17705
  int node_n = -1;
17411
- int task_phase = GGML_TASK_FINALIZE;
17706
+ int task_phase = GGML_TASK_TYPE_FINALIZE;
17412
17707
 
17413
17708
  while (true) {
17414
17709
  if (cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) {
17415
17710
  state->shared->node_n += 1;
17416
- return (thread_ret_t) GGML_EXIT_ABORTED;
17711
+ state->ec = GGML_STATUS_ABORTED;
17712
+ return 0;
17417
17713
  }
17418
17714
 
17419
17715
  if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) {
17420
17716
  // all other threads are finished and spinning
17421
17717
  // do finalize and init here so we don't have synchronize again
17422
17718
  struct ggml_compute_params params = {
17423
- /*.type =*/ GGML_TASK_FINALIZE,
17719
+ /*.type =*/ GGML_TASK_TYPE_FINALIZE,
17424
17720
  /*.ith =*/ 0,
17425
17721
  /*.nth =*/ 0,
17426
17722
  /*.wsize =*/ cplan->work_size,
@@ -17451,17 +17747,17 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
17451
17747
  if (n_tasks == 1) {
17452
17748
  /* INIT */
17453
17749
  if (GGML_OP_HAS_INIT[node->op]) {
17454
- params.type = GGML_TASK_INIT;
17750
+ params.type = GGML_TASK_TYPE_INIT;
17455
17751
  ggml_compute_forward(&params, node);
17456
17752
  }
17457
17753
 
17458
17754
  // TODO: maybe push node_n to the atomic but if other threads see n_tasks is 1,
17459
17755
  // they do something more efficient than spinning (?)
17460
- params.type = GGML_TASK_COMPUTE;
17756
+ params.type = GGML_TASK_TYPE_COMPUTE;
17461
17757
  ggml_compute_forward(&params, node);
17462
17758
 
17463
17759
  if (GGML_OP_HAS_FINALIZE[node->op]) {
17464
- params.type = GGML_TASK_FINALIZE;
17760
+ params.type = GGML_TASK_TYPE_FINALIZE;
17465
17761
  ggml_compute_forward(&params, node);
17466
17762
  }
17467
17763
 
@@ -17475,7 +17771,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
17475
17771
  }
17476
17772
  }
17477
17773
 
17478
- task_phase = GGML_TASK_INIT;
17774
+ task_phase = GGML_TASK_TYPE_INIT;
17479
17775
  atomic_store(&state->shared->n_active, n_threads);
17480
17776
  atomic_store(&state->shared->node_n, node_n);
17481
17777
  atomic_store(&state->shared->node_task, task_phase);
@@ -17492,7 +17788,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
17492
17788
  const int n_tasks = ggml_get_n_tasks(node, n_threads);
17493
17789
 
17494
17790
  struct ggml_compute_params params = {
17495
- /*.type =*/ GGML_TASK_INIT,
17791
+ /*.type =*/ GGML_TASK_TYPE_INIT,
17496
17792
  /*.ith =*/ state->ith,
17497
17793
  /*.nth =*/ n_tasks,
17498
17794
  /*.wsize =*/ cplan->work_size,
@@ -17506,7 +17802,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
17506
17802
  }
17507
17803
 
17508
17804
  if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) {
17509
- task_phase = GGML_TASK_COMPUTE;
17805
+ task_phase = GGML_TASK_TYPE_COMPUTE;
17510
17806
  atomic_store(&state->shared->n_active, n_threads);
17511
17807
  atomic_store(&state->shared->node_task, task_phase);
17512
17808
  }
@@ -17521,12 +17817,12 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
17521
17817
  }
17522
17818
 
17523
17819
  if (state->ith < n_tasks) {
17524
- params.type = GGML_TASK_COMPUTE;
17820
+ params.type = GGML_TASK_TYPE_COMPUTE;
17525
17821
  ggml_compute_forward(&params, node);
17526
17822
  }
17527
17823
 
17528
17824
  if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) {
17529
- task_phase = GGML_TASK_FINALIZE;
17825
+ task_phase = GGML_TASK_TYPE_FINALIZE;
17530
17826
  atomic_store(&state->shared->n_active, n_threads);
17531
17827
  atomic_store(&state->shared->node_task, task_phase);
17532
17828
  }
@@ -17535,7 +17831,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
17535
17831
  }
17536
17832
  }
17537
17833
 
17538
- return GGML_EXIT_SUCCESS;
17834
+ return 0;
17539
17835
  }
17540
17836
 
17541
17837
  struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threads) {
@@ -17731,7 +18027,7 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
17731
18027
  return cplan;
17732
18028
  }
17733
18029
 
17734
- int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
18030
+ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
17735
18031
  {
17736
18032
  GGML_ASSERT(cplan);
17737
18033
  GGML_ASSERT(cplan->n_threads > 0);
@@ -17762,7 +18058,7 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
17762
18058
  /*.n_threads =*/ n_threads,
17763
18059
  /*.n_active =*/ n_threads,
17764
18060
  /*.node_n =*/ -1,
17765
- /*.node_task =*/ GGML_TASK_FINALIZE,
18061
+ /*.node_task =*/ GGML_TASK_TYPE_FINALIZE,
17766
18062
  /*.abort_callback =*/ NULL,
17767
18063
  /*.abort_callback_data =*/ NULL,
17768
18064
  };
@@ -17775,6 +18071,7 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
17775
18071
  .thrd = 0,
17776
18072
  .ith = j,
17777
18073
  .shared = &state_shared,
18074
+ .ec = GGML_STATUS_SUCCESS,
17778
18075
  };
17779
18076
 
17780
18077
  const int rc = ggml_thread_create(&workers[j].thrd, NULL, ggml_graph_compute_thread, &workers[j]);
@@ -17785,12 +18082,14 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
17785
18082
 
17786
18083
  workers[0].ith = 0;
17787
18084
  workers[0].shared = &state_shared;
18085
+ workers[0].ec = GGML_STATUS_SUCCESS;
17788
18086
 
17789
18087
  const int64_t perf_start_cycles = ggml_perf_cycles();
17790
18088
  const int64_t perf_start_time_us = ggml_perf_time_us();
17791
18089
 
17792
18090
  // this is a work thread too
17793
- int compute_status = (size_t) ggml_graph_compute_thread(&workers[0]);
18091
+ ggml_graph_compute_thread(&workers[0]);
18092
+ enum ggml_status compute_status = workers[0].ec;
17794
18093
 
17795
18094
  // don't leave affinity set on the main thread
17796
18095
  clear_numa_thread_affinity();
@@ -17800,6 +18099,8 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
17800
18099
  for (int j = 1; j < n_threads; j++) {
17801
18100
  const int rc = ggml_thread_join(workers[j].thrd, NULL);
17802
18101
  GGML_ASSERT(rc == 0);
18102
+ if (workers[j].ec != GGML_STATUS_SUCCESS)
18103
+ compute_status = workers[j].ec;
17803
18104
  }
17804
18105
  }
17805
18106
 
@@ -17827,14 +18128,14 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
17827
18128
  return compute_status;
17828
18129
  }
17829
18130
 
17830
- void ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads) {
18131
+ enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads) {
17831
18132
  struct ggml_cplan cplan = ggml_graph_plan(cgraph, n_threads);
17832
18133
 
17833
- struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_WORK_BUFFER, cplan.work_size);
18134
+ struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_TYPE_WORK_BUFFER, cplan.work_size);
17834
18135
 
17835
18136
  cplan.work_data = (uint8_t *)ctx->mem_buffer + obj->offs;
17836
18137
 
17837
- ggml_graph_compute(cgraph, &cplan);
18138
+ return ggml_graph_compute(cgraph, &cplan);
17838
18139
  }
17839
18140
 
17840
18141
  struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name) {
@@ -18638,7 +18939,7 @@ static enum ggml_opt_result ggml_opt_adam(
18638
18939
  float * pf = params.past > 0 ? opt->adam.pf->data : NULL; // past function values
18639
18940
 
18640
18941
  struct ggml_cplan cplan = ggml_graph_plan(gb, params.n_threads);
18641
- struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_WORK_BUFFER, cplan.work_size);
18942
+ struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_TYPE_WORK_BUFFER, cplan.work_size);
18642
18943
  cplan.work_data = (uint8_t *)ctx->mem_buffer + obj->offs;
18643
18944
 
18644
18945
  bool cancel = false;
@@ -18650,7 +18951,7 @@ static enum ggml_opt_result ggml_opt_adam(
18650
18951
  if (callback) {
18651
18952
  callback(callback_data, accum_step, &sched, &cancel);
18652
18953
  if (cancel) {
18653
- return GGML_OPT_CANCEL;
18954
+ return GGML_OPT_RESULT_CANCEL;
18654
18955
  }
18655
18956
  }
18656
18957
  // ggml_graph_reset (gf);
@@ -18741,7 +19042,7 @@ static enum ggml_opt_result ggml_opt_adam(
18741
19042
  if (callback) {
18742
19043
  callback(callback_data, accum_step, &sched, &cancel);
18743
19044
  if (cancel) {
18744
- return GGML_OPT_CANCEL;;
19045
+ return GGML_OPT_RESULT_CANCEL;;
18745
19046
  }
18746
19047
  }
18747
19048
  // ggml_graph_reset (gf);
@@ -18758,7 +19059,7 @@ static enum ggml_opt_result ggml_opt_adam(
18758
19059
  if (fabsf(fx - fx_prev[0])/fx < params.adam.eps_f) {
18759
19060
  GGML_PRINT_DEBUG("converged\n");
18760
19061
 
18761
- return GGML_OPT_OK;
19062
+ return GGML_OPT_RESULT_OK;
18762
19063
  }
18763
19064
 
18764
19065
  // delta-based convergence test
@@ -18768,7 +19069,7 @@ static enum ggml_opt_result ggml_opt_adam(
18768
19069
  const float rate = (pf[(iter0 + t)%params.past] - fx)/fx;
18769
19070
 
18770
19071
  if (fabsf(rate) < params.delta) {
18771
- return GGML_OPT_OK;
19072
+ return GGML_OPT_RESULT_OK;
18772
19073
  }
18773
19074
  }
18774
19075
 
@@ -18784,7 +19085,7 @@ static enum ggml_opt_result ggml_opt_adam(
18784
19085
  ++n_no_improvement[0];
18785
19086
 
18786
19087
  if (n_no_improvement[0] >= params.max_no_improvement) {
18787
- return GGML_OPT_OK;
19088
+ return GGML_OPT_RESULT_OK;
18788
19089
  }
18789
19090
  }
18790
19091
  }
@@ -18802,7 +19103,7 @@ static enum ggml_opt_result ggml_opt_adam(
18802
19103
  }
18803
19104
  }
18804
19105
 
18805
- return GGML_OPT_DID_NOT_CONVERGE;
19106
+ return GGML_OPT_RESULT_DID_NOT_CONVERGE;
18806
19107
  }
18807
19108
 
18808
19109
  //
@@ -18883,7 +19184,7 @@ static enum ggml_opt_result linesearch_backtracking(
18883
19184
  float sched = 0;
18884
19185
  callback(callback_data, accum_step, &sched, cancel);
18885
19186
  if (*cancel) {
18886
- return GGML_OPT_CANCEL;
19187
+ return GGML_OPT_RESULT_CANCEL;
18887
19188
  }
18888
19189
  }
18889
19190
  // ggml_graph_reset (gf);
@@ -18956,7 +19257,7 @@ static enum ggml_opt_result ggml_opt_lbfgs(
18956
19257
  if (params.lbfgs.linesearch == GGML_LINESEARCH_BACKTRACKING_WOLFE ||
18957
19258
  params.lbfgs.linesearch == GGML_LINESEARCH_BACKTRACKING_STRONG_WOLFE) {
18958
19259
  if (params.lbfgs.wolfe <= params.lbfgs.ftol || 1.f <= params.lbfgs.wolfe) {
18959
- return GGML_OPT_INVALID_WOLFE;
19260
+ return GGML_OPT_RESULT_INVALID_WOLFE;
18960
19261
  }
18961
19262
  }
18962
19263
 
@@ -18985,7 +19286,7 @@ static enum ggml_opt_result ggml_opt_lbfgs(
18985
19286
  }
18986
19287
 
18987
19288
  struct ggml_cplan cplan = ggml_graph_plan(gb, params.n_threads);
18988
- struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_WORK_BUFFER, cplan.work_size);
19289
+ struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_TYPE_WORK_BUFFER, cplan.work_size);
18989
19290
  cplan.work_data = (uint8_t *)ctx->mem_buffer + obj->offs;
18990
19291
 
18991
19292
  float * x = opt->lbfgs.x->data; // current parameters
@@ -19026,7 +19327,7 @@ static enum ggml_opt_result ggml_opt_lbfgs(
19026
19327
  float sched = 0;
19027
19328
  callback(callback_data, accum_step, &sched, &cancel);
19028
19329
  if (cancel) {
19029
- return GGML_OPT_CANCEL;
19330
+ return GGML_OPT_RESULT_CANCEL;
19030
19331
  }
19031
19332
  }
19032
19333
  // ggml_graph_reset (gf);
@@ -19054,7 +19355,7 @@ static enum ggml_opt_result ggml_opt_lbfgs(
19054
19355
 
19055
19356
  // already optimized
19056
19357
  if (gnorm/xnorm <= params.lbfgs.eps) {
19057
- return GGML_OPT_OK;
19358
+ return GGML_OPT_RESULT_OK;
19058
19359
  }
19059
19360
 
19060
19361
  if (opt->just_initialized) {
@@ -19099,7 +19400,7 @@ static enum ggml_opt_result ggml_opt_lbfgs(
19099
19400
  // way to test and don't want to break something with so many changes lined up
19100
19401
  ls = linesearch_backtracking(&params, nx, x, &fx, g, d, step, xp, f, gb, &cplan, np, ps, &cancel, callback, callback_data);
19101
19402
  if (cancel) {
19102
- return GGML_OPT_CANCEL;
19403
+ return GGML_OPT_RESULT_CANCEL;
19103
19404
  }
19104
19405
 
19105
19406
  if (ls < 0) {
@@ -19122,7 +19423,7 @@ static enum ggml_opt_result ggml_opt_lbfgs(
19122
19423
  }
19123
19424
  if (gnorm/xnorm <= params.lbfgs.eps) {
19124
19425
  // converged
19125
- return GGML_OPT_OK;
19426
+ return GGML_OPT_RESULT_OK;
19126
19427
  }
19127
19428
 
19128
19429
  // delta-based convergence test
@@ -19132,7 +19433,7 @@ static enum ggml_opt_result ggml_opt_lbfgs(
19132
19433
  const float rate = (pf[k[0]%params.past] - fx)/fx;
19133
19434
 
19134
19435
  if (fabsf(rate) < params.delta) {
19135
- return GGML_OPT_OK;
19436
+ return GGML_OPT_RESULT_OK;
19136
19437
  }
19137
19438
  }
19138
19439
 
@@ -19148,14 +19449,14 @@ static enum ggml_opt_result ggml_opt_lbfgs(
19148
19449
  n_no_improvement[0]++;
19149
19450
 
19150
19451
  if (n_no_improvement[0] >= params.max_no_improvement) {
19151
- return GGML_OPT_OK;
19452
+ return GGML_OPT_RESULT_OK;
19152
19453
  }
19153
19454
  }
19154
19455
  }
19155
19456
 
19156
19457
  if (params.lbfgs.n_iter != 0 && params.lbfgs.n_iter < it + 1) {
19157
19458
  // reached the maximum number of iterations
19158
- return GGML_OPT_DID_NOT_CONVERGE;
19459
+ return GGML_OPT_RESULT_DID_NOT_CONVERGE;
19159
19460
  }
19160
19461
 
19161
19462
  // update vectors s and y:
@@ -19211,17 +19512,17 @@ static enum ggml_opt_result ggml_opt_lbfgs(
19211
19512
 
19212
19513
  GGML_ASSERT(false && "lbfgs failed");
19213
19514
 
19214
- return GGML_OPT_DID_NOT_CONVERGE;
19515
+ return GGML_OPT_RESULT_DID_NOT_CONVERGE;
19215
19516
  }
19216
19517
 
19217
19518
  struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type) {
19218
19519
  struct ggml_opt_params result;
19219
19520
 
19220
19521
  switch (type) {
19221
- case GGML_OPT_ADAM:
19522
+ case GGML_OPT_TYPE_ADAM:
19222
19523
  {
19223
19524
  result = (struct ggml_opt_params) {
19224
- .type = GGML_OPT_ADAM,
19525
+ .type = GGML_OPT_TYPE_ADAM,
19225
19526
  .graph_size = GGML_DEFAULT_GRAPH_SIZE,
19226
19527
  .n_threads = 1, // FIXME: GGML_DEFAULT_N_THREADS ?
19227
19528
  .past = 0,
@@ -19249,10 +19550,10 @@ struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type) {
19249
19550
  },
19250
19551
  };
19251
19552
  } break;
19252
- case GGML_OPT_LBFGS:
19553
+ case GGML_OPT_TYPE_LBFGS:
19253
19554
  {
19254
19555
  result = (struct ggml_opt_params) {
19255
- .type = GGML_OPT_LBFGS,
19556
+ .type = GGML_OPT_TYPE_LBFGS,
19256
19557
  .graph_size = GGML_DEFAULT_GRAPH_SIZE,
19257
19558
  .n_threads = 1,
19258
19559
  .past = 0,
@@ -19297,12 +19598,12 @@ GGML_API void ggml_opt_init(
19297
19598
  opt->just_initialized = true;
19298
19599
  if (opt->ctx == NULL) {
19299
19600
  struct ggml_init_params ctx_opt_params;
19300
- if (opt->params.type == GGML_OPT_ADAM) {
19601
+ if (opt->params.type == GGML_OPT_TYPE_ADAM) {
19301
19602
  ctx_opt_params.mem_size = GGML_MEM_ALIGN*3 + ggml_tensor_overhead()*3 + ggml_type_size(GGML_TYPE_F32)*nx*3;
19302
19603
  if (opt->params.past > 0) {
19303
19604
  ctx_opt_params.mem_size += GGML_MEM_ALIGN + ggml_tensor_overhead() + ggml_type_size(GGML_TYPE_F32)*opt->params.past;
19304
19605
  }
19305
- } else if (opt->params.type == GGML_OPT_LBFGS) {
19606
+ } else if (opt->params.type == GGML_OPT_TYPE_LBFGS) {
19306
19607
  ctx_opt_params.mem_size = GGML_MEM_ALIGN*9 + ggml_tensor_overhead()*9 + ggml_type_size(GGML_TYPE_F32)*(nx*5 + opt->params.lbfgs.m*2 + nx*opt->params.lbfgs.m*2);
19307
19608
  if (opt->params.past > 0) {
19308
19609
  ctx_opt_params.mem_size += GGML_MEM_ALIGN + ggml_tensor_overhead() + ggml_type_size(GGML_TYPE_F32)*opt->params.past;
@@ -19314,7 +19615,7 @@ GGML_API void ggml_opt_init(
19314
19615
  opt->ctx = ggml_init(ctx_opt_params);
19315
19616
  }
19316
19617
  switch (opt->params.type) {
19317
- case GGML_OPT_ADAM:
19618
+ case GGML_OPT_TYPE_ADAM:
19318
19619
  {
19319
19620
  opt->adam.g = ggml_new_tensor_1d(opt->ctx, GGML_TYPE_F32, nx);
19320
19621
  opt->adam.m = ggml_new_tensor_1d(opt->ctx, GGML_TYPE_F32, nx);
@@ -19328,7 +19629,7 @@ GGML_API void ggml_opt_init(
19328
19629
  ggml_set_zero(opt->adam.pf);
19329
19630
  }
19330
19631
  } break;
19331
- case GGML_OPT_LBFGS:
19632
+ case GGML_OPT_TYPE_LBFGS:
19332
19633
  {
19333
19634
  opt->lbfgs.x = ggml_new_tensor_1d(opt->ctx, GGML_TYPE_F32, nx);
19334
19635
  opt->lbfgs.xp = ggml_new_tensor_1d(opt->ctx, GGML_TYPE_F32, nx);
@@ -19372,13 +19673,13 @@ enum ggml_opt_result ggml_opt(
19372
19673
 
19373
19674
  ctx = ggml_init(params_ctx);
19374
19675
  if (ctx == NULL) {
19375
- return GGML_OPT_NO_CONTEXT;
19676
+ return GGML_OPT_RESULT_NO_CONTEXT;
19376
19677
  }
19377
19678
 
19378
19679
  free_ctx = true;
19379
19680
  }
19380
19681
 
19381
- enum ggml_opt_result result = GGML_OPT_OK;
19682
+ enum ggml_opt_result result = GGML_OPT_RESULT_OK;
19382
19683
 
19383
19684
  struct ggml_opt_context * opt = (struct ggml_opt_context *) alloca(sizeof(struct ggml_opt_context));
19384
19685
 
@@ -19417,14 +19718,14 @@ enum ggml_opt_result ggml_opt_resume_g(
19417
19718
  void * callback_data) {
19418
19719
 
19419
19720
  // build forward + backward compute graphs
19420
- enum ggml_opt_result result = GGML_OPT_OK;
19721
+ enum ggml_opt_result result = GGML_OPT_RESULT_OK;
19421
19722
 
19422
19723
  switch (opt->params.type) {
19423
- case GGML_OPT_ADAM:
19724
+ case GGML_OPT_TYPE_ADAM:
19424
19725
  {
19425
19726
  result = ggml_opt_adam(ctx, opt, opt->params, f, gf, gb, callback, callback_data);
19426
19727
  } break;
19427
- case GGML_OPT_LBFGS:
19728
+ case GGML_OPT_TYPE_LBFGS:
19428
19729
  {
19429
19730
  result = ggml_opt_lbfgs(ctx, opt, opt->params, f, gf, gb, callback, callback_data);
19430
19731
  } break;
@@ -19461,8 +19762,10 @@ void ggml_quantize_init(enum ggml_type type) {
19461
19762
  switch (type) {
19462
19763
  case GGML_TYPE_IQ2_XXS:
19463
19764
  case GGML_TYPE_IQ2_XS:
19765
+ case GGML_TYPE_IQ2_S:
19464
19766
  case GGML_TYPE_IQ1_S: iq2xs_init_impl(type); break;
19465
19767
  case GGML_TYPE_IQ3_XXS: iq3xs_init_impl(256); break;
19768
+ case GGML_TYPE_IQ3_S: iq3xs_init_impl(512); break;
19466
19769
  default: // nothing
19467
19770
  break;
19468
19771
  }
@@ -19737,6 +20040,24 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i
19737
20040
  result = quantize_iq3_xxs(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
19738
20041
  GGML_ASSERT(result == row_size * nrows);
19739
20042
  } break;
20043
+ case GGML_TYPE_IQ3_S:
20044
+ {
20045
+ GGML_ASSERT(start % QK_K == 0);
20046
+ GGML_ASSERT(start % n_per_row == 0);
20047
+ size_t start_row = start / n_per_row;
20048
+ size_t row_size = ggml_row_size(type, n_per_row);
20049
+ result = quantize_iq3_s(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
20050
+ GGML_ASSERT(result == row_size * nrows);
20051
+ } break;
20052
+ case GGML_TYPE_IQ2_S:
20053
+ {
20054
+ GGML_ASSERT(start % QK_K == 0);
20055
+ GGML_ASSERT(start % n_per_row == 0);
20056
+ size_t start_row = start / n_per_row;
20057
+ size_t row_size = ggml_row_size(type, n_per_row);
20058
+ result = quantize_iq2_s(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
20059
+ GGML_ASSERT(result == row_size * nrows);
20060
+ } break;
19740
20061
  case GGML_TYPE_IQ1_S:
19741
20062
  {
19742
20063
  GGML_ASSERT(start % QK_K == 0);
@@ -19747,6 +20068,9 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i
19747
20068
  GGML_ASSERT(result == row_size * nrows);
19748
20069
  } break;
19749
20070
  case GGML_TYPE_IQ4_NL:
20071
+ #if QK_K == 64
20072
+ case GGML_TYPE_IQ4_XS:
20073
+ #endif
19750
20074
  {
19751
20075
  GGML_ASSERT(start % QK4_NL == 0);
19752
20076
  GGML_ASSERT(start % n_per_row == 0);
@@ -19755,6 +20079,17 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i
19755
20079
  result = quantize_iq4_nl(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
19756
20080
  GGML_ASSERT(result == row_size * nrows);
19757
20081
  } break;
20082
+ #if QK_K != 64
20083
+ case GGML_TYPE_IQ4_XS:
20084
+ {
20085
+ GGML_ASSERT(start % QK_K == 0);
20086
+ GGML_ASSERT(start % n_per_row == 0);
20087
+ size_t start_row = start / n_per_row;
20088
+ size_t row_size = ggml_row_size(type, n_per_row);
20089
+ result = quantize_iq4_xs(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
20090
+ GGML_ASSERT(result == row_size * nrows);
20091
+ } break;
20092
+ #endif
19758
20093
  case GGML_TYPE_F16:
19759
20094
  {
19760
20095
  size_t elemsize = sizeof(ggml_fp16_t);