llama_cpp 0.13.0 → 0.14.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -320,6 +320,17 @@ static ggml_fp16_t ggml_table_exp_f16[1 << 16];
  // precomputed f32 table for f16 (256 KB) (ggml-impl.h)
  float ggml_table_f32_f16[1 << 16];

+ const char * ggml_status_to_string(enum ggml_status status) {
+ switch (status) {
+ case GGML_STATUS_ALLOC_FAILED: return "GGML status: error (failed to allocate memory)";
+ case GGML_STATUS_FAILED: return "GGML status: error (operation failed)";
+ case GGML_STATUS_SUCCESS: return "GGML status: success";
+ case GGML_STATUS_ABORTED: return "GGML status: warning (operation aborted)";
+ }
+
+ return "GGML status: unknown";
+ }
+
  // note: do not use these inside ggml.c
  // these are meant to be used via the ggml.h API
  float ggml_fp16_to_fp32(ggml_fp16_t x) {
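Editorial note: ggml_status_to_string pairs with the enum ggml_status values that ggml_graph_compute starts returning later in this diff. A minimal, hypothetical caller-side sketch (not part of the diff):

    enum ggml_status st = GGML_STATUS_ABORTED;
    fprintf(stderr, "%s\n", ggml_status_to_string(st)); // prints "GGML status: warning (operation aborted)"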
@@ -459,6 +470,19 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
  .type_size = sizeof(int32_t),
  .is_quantized = false,
  },
+ [GGML_TYPE_I64] = {
+ .type_name = "i64",
+ .blck_size = 1,
+ .type_size = sizeof(int64_t),
+ .is_quantized = false,
+ },
+ [GGML_TYPE_F64] = {
+ .type_name = "f64",
+ .blck_size = 1,
+ .type_size = sizeof(double),
+ .is_quantized = false,
+ .nrows = 1,
+ },
  [GGML_TYPE_F32] = {
  .type_name = "f32",
  .blck_size = 1,
@@ -846,7 +870,7 @@ inline static float vaddvq_f32(float32x4_t v) {
  #define GGML_F16x8 float16x8_t
  #define GGML_F16x8_ZERO vdupq_n_f16(0.0f)
  #define GGML_F16x8_SET1(x) vdupq_n_f16(x)
- #define GGML_F16x8_LOAD(x) vld1q_f16((const __fp16 *)(x))
+ #define GGML_F16x8_LOAD(x) vld1q_f16((const ggml_fp16_internal_t *)(x))
  #define GGML_F16x8_STORE vst1q_f16
  #define GGML_F16x8_FMA(a, b, c) vfmaq_f16(a, b, c)
  #define GGML_F16x8_ADD vaddq_f16
@@ -889,7 +913,7 @@ inline static float vaddvq_f32(float32x4_t v) {
  #define GGML_F32Cx4 float32x4_t
  #define GGML_F32Cx4_ZERO vdupq_n_f32(0.0f)
  #define GGML_F32Cx4_SET1(x) vdupq_n_f32(x)
- #define GGML_F32Cx4_LOAD(x) vcvt_f32_f16(vld1_f16((const __fp16 *)(x)))
+ #define GGML_F32Cx4_LOAD(x) vcvt_f32_f16(vld1_f16((const ggml_fp16_internal_t *)(x)))
  #define GGML_F32Cx4_STORE(x, y) vst1_f16(x, vcvt_f16_f32(y))
  #define GGML_F32Cx4_FMA(a, b, c) vfmaq_f32(a, b, c)
  #define GGML_F32Cx4_ADD vaddq_f32
@@ -1822,12 +1846,16 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
  "POOL_2D",
  "UPSCALE",
  "PAD",
+ "ARANGE",
+ "TIMESTEP_EMBEDDING",
  "ARGSORT",
  "LEAKY_RELU",

  "FLASH_ATTN",
  "FLASH_FF",
  "FLASH_ATTN_BACK",
+ "SSM_CONV",
+ "SSM_SCAN",
  "WIN_PART",
  "WIN_UNPART",
  "GET_REL_POS",
@@ -1850,7 +1878,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
  "CROSS_ENTROPY_LOSS_BACK",
  };

- static_assert(GGML_OP_COUNT == 72, "GGML_OP_COUNT != 72");
+ static_assert(GGML_OP_COUNT == 76, "GGML_OP_COUNT != 76");

  static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
  "none",
@@ -1908,12 +1936,16 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
  "pool_2d(x)",
  "upscale(x)",
  "pad(x)",
+ "arange(start, stop, step)",
+ "timestep_embedding(timesteps, dim, max_period)",
  "argsort(x)",
  "leaky_relu(x)",

  "flash_attn(x)",
  "flash_ff(x)",
  "flash_attn_back(x)",
+ "ssm_conv(x)",
+ "ssm_scan(x)",
  "win_part(x)",
  "win_unpart(x)",
  "get_rel_pos(x)",
@@ -1936,7 +1968,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
  "cross_entropy_loss_back(x,y)",
  };

- static_assert(GGML_OP_COUNT == 72, "GGML_OP_COUNT != 72");
+ static_assert(GGML_OP_COUNT == 76, "GGML_OP_COUNT != 76");

  static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");

@@ -2139,7 +2171,10 @@ void ggml_numa_init(enum ggml_numa_strategy numa_flag) {
  getcpu_ret = getcpu(&current_cpu, &g_state.numa.current_node);
  #else
  // old glibc doesn't have a wrapper for this call. Fall back on direct syscall
- getcpu_ret = syscall(SYS_getcpu,&current_cpu,&g_state.numa.current_node);
+ # if !defined(SYS_getcpu) && defined(SYS_get_cpu)
+ # define SYS_getcpu SYS_get_cpu // some older glibc versions use this name
+ # endif
+ getcpu_ret = syscall(SYS_getcpu, &current_cpu, &g_state.numa.current_node);
  #endif

  if (g_state.numa.n_nodes < 1 || g_state.numa.total_cpus < 1 || getcpu_ret != 0) {
@@ -2895,11 +2930,21 @@ static int32_t ggml_get_op_params_i32(const struct ggml_tensor * tensor, uint32_
  return ((const int32_t *)(tensor->op_params))[i];
  }

+ static float ggml_get_op_params_f32(const struct ggml_tensor * tensor, uint32_t i) {
+ assert(i < GGML_MAX_OP_PARAMS / sizeof(float));
+ return ((const float *)(tensor->op_params))[i];
+ }
+
  static void ggml_set_op_params_i32(struct ggml_tensor * tensor, uint32_t i, int32_t value) {
  assert(i < GGML_MAX_OP_PARAMS / sizeof(int32_t));
  ((int32_t *)(tensor->op_params))[i] = value;
  }

+ static void ggml_set_op_params_f32(struct ggml_tensor * tensor, uint32_t i, float value) {
+ assert(i < GGML_MAX_OP_PARAMS / sizeof(float));
+ ((float *)(tensor->op_params))[i] = value;
+ }
+
  struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor) {
  memset(tensor->data, 0, ggml_nbytes(tensor));
  return tensor;
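Editorial note: the float getters/setters mirror the existing i32 helpers and share the same tensor->op_params byte buffer; the new ARANGE op below uses them to stash its float parameters. An illustrative sketch with a hypothetical tensor t (not from the diff):

    ggml_set_op_params_f32(t, 0, 0.0f);   // start
    ggml_set_op_params_f32(t, 1, 8.0f);   // stop
    ggml_set_op_params_f32(t, 2, 2.0f);   // step
    float step = ggml_get_op_params_f32(t, 2);  // reads back 2.0f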
@@ -5898,6 +5943,55 @@ struct ggml_tensor * ggml_upscale(
  return ggml_upscale_impl(ctx, a, scale_factor);
  }

+ struct ggml_tensor * ggml_arange(
+ struct ggml_context * ctx,
+ float start,
+ float stop,
+ float step) {
+
+ GGML_ASSERT(stop > start);
+
+ const int64_t steps = (int64_t) ceilf((stop - start) / step);
+
+ struct ggml_tensor * result = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, steps);
+
+ result->op = GGML_OP_ARANGE;
+ ggml_set_op_params_f32(result, 0, start);
+ ggml_set_op_params_f32(result, 1, stop);
+ ggml_set_op_params_f32(result, 2, step);
+
+ return result;
+ }
+
+ struct ggml_tensor * ggml_timestep_embedding(
+ struct ggml_context * ctx,
+ struct ggml_tensor * timesteps,
+ int dim,
+ int max_period) {
+ bool is_node = false;
+
+ if (timesteps->grad) {
+ GGML_ASSERT(false); // TODO: implement backward
+ is_node = true;
+ }
+
+ int actual_dim = dim;
+ if (dim % 2 != 0) {
+ actual_dim = dim + 1;
+ }
+
+ struct ggml_tensor * result = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, actual_dim, timesteps->ne[0]);
+
+ result->op = GGML_OP_TIMESTEP_EMBEDDING;
+ ggml_set_op_params_i32(result, 0, dim);
+ ggml_set_op_params_i32(result, 1, max_period);
+
+ result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+ result->src[0] = timesteps;
+
+ return result;
+ }
+
  // ggml_argsort

  struct ggml_tensor * ggml_argsort(
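Editorial note: a hedged usage sketch of the two new graph ops (context setup elided, variable names hypothetical). ggml_arange yields ceil((stop - start) / step) F32 values; ggml_timestep_embedding yields a {dim rounded up to even, n_timesteps} F32 tensor filled with the usual sinusoidal embedding (cosines in the first half of each row, sines in the second):

    struct ggml_tensor * t   = ggml_arange(ctx, 0.0f, 32.0f, 1.0f);          // 32 values: 0, 1, ..., 31
    struct ggml_tensor * emb = ggml_timestep_embedding(ctx, t, 128, 10000);  // shape {128, 32}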
@@ -6077,6 +6171,108 @@ struct ggml_tensor * ggml_flash_attn_back(
  return result;
  }

+ // ggml_ssm_conv
+
+ struct ggml_tensor * ggml_ssm_conv(
+ struct ggml_context * ctx,
+ struct ggml_tensor * s,
+ struct ggml_tensor * x,
+ struct ggml_tensor * c,
+ struct ggml_tensor * sq) {
+ GGML_ASSERT(ggml_is_3d(s));
+ GGML_ASSERT(ggml_is_matrix(x));
+ GGML_ASSERT(ggml_is_matrix(c));
+ GGML_ASSERT(ggml_is_matrix(sq));
+ GGML_ASSERT(sq->type == GGML_TYPE_I32);
+
+ const int64_t d_conv = c->ne[0];
+ const int64_t d_inner = c->ne[1];
+ const int64_t n_tokens = x->ne[1];
+ const int64_t n_kv = s->ne[2];
+
+ GGML_ASSERT( s->ne[0] == d_conv - 1);
+ GGML_ASSERT( s->ne[1] == d_inner);
+ GGML_ASSERT( x->ne[0] == d_inner);
+ GGML_ASSERT(sq->ne[0] == n_kv);
+ GGML_ASSERT(sq->ne[1] == n_tokens);
+
+ bool is_node = false;
+
+ if (s->grad || x->grad || c->grad || sq->grad) {
+ GGML_ASSERT(false); // TODO: implement
+ is_node = true;
+ }
+
+ // 2-in-1 concatenated x and conv_states, {d_inner, n_tokens} with {d_conv, d_inner, n_kv}
+ struct ggml_tensor * result = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, (d_inner*n_tokens) + (d_conv*d_inner*n_kv));
+
+ result->op = GGML_OP_SSM_CONV;
+ result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+ result->src[0] = s;
+ result->src[1] = x;
+ result->src[2] = c;
+ result->src[3] = sq;
+
+ return result;
+ }
+
+ // ggml_ssm_scan
+
+ struct ggml_tensor * ggml_ssm_scan(
+ struct ggml_context * ctx,
+ struct ggml_tensor * s,
+ struct ggml_tensor * x,
+ struct ggml_tensor * dt,
+ struct ggml_tensor * A,
+ struct ggml_tensor * B,
+ struct ggml_tensor * C,
+ struct ggml_tensor * sq) {
+ GGML_ASSERT(ggml_is_contiguous(s));
+ GGML_ASSERT(ggml_is_contiguous(x));
+ GGML_ASSERT(ggml_is_contiguous(dt));
+ GGML_ASSERT(ggml_is_contiguous(A));
+ GGML_ASSERT(sq->type == GGML_TYPE_I32);
+ GGML_ASSERT(B->nb[0] == ggml_type_size(B->type));
+ GGML_ASSERT(C->nb[0] == ggml_type_size(C->type));
+ GGML_ASSERT(ggml_are_same_shape(x, dt));
+
+ {
+ const int64_t d_state = s->ne[0];
+ const int64_t d_inner = s->ne[1];
+ const int64_t n_tokens = x->ne[1];
+
+ GGML_ASSERT(x->ne[0] == d_inner);
+ GGML_ASSERT(A->ne[0] == d_state);
+ GGML_ASSERT(A->ne[1] == d_inner);
+ GGML_ASSERT(B->ne[0] == d_state);
+ GGML_ASSERT(B->ne[1] == n_tokens);
+ GGML_ASSERT(C->ne[0] == d_state);
+ GGML_ASSERT(C->ne[1] == n_tokens);
+ }
+
+ bool is_node = false;
+
+ if (s->grad || x->grad || dt->grad || A->grad || B->grad || C->grad || sq->grad) {
+ GGML_ASSERT(false); // TODO: implement
+ is_node = true;
+ }
+
+ // 2-in-1 concatenated y and ssm_states, {d_inner, n_tokens} with {d_state, d_inner, n_kv}
+ struct ggml_tensor * result = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, ggml_nelements(x) + ggml_nelements(s));
+
+ result->op = GGML_OP_SSM_SCAN;
+ result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+ result->src[0] = s;
+ result->src[1] = x;
+ result->src[2] = dt;
+ result->src[3] = A;
+ result->src[4] = B;
+ result->src[5] = C;
+ result->src[6] = sq;
+
+ return result;
+ }
+
  // ggml_win_part

  struct ggml_tensor * ggml_win_part(
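Editorial note: both SSM ops return one flat F32 tensor that concatenates the per-token output with the updated recurrent states, as the comments above describe, so a caller is expected to view the two pieces back out. A hedged sketch for ggml_ssm_conv (dimension variables hypothetical, layout {d_inner, n_tokens} followed by {d_conv, d_inner, n_kv}):

    struct ggml_tensor * out = ggml_ssm_conv(ctx, conv_state, x, conv1d_w, state_seq);
    struct ggml_tensor * xc  = ggml_view_2d(ctx, out, d_inner, n_tokens,
                                            d_inner*ggml_element_size(out), 0);
    struct ggml_tensor * new_conv_states = ggml_view_1d(ctx, out, d_conv*d_inner*n_kv,
                                            d_inner*n_tokens*ggml_element_size(out));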
@@ -10231,7 +10427,7 @@ static void ggml_compute_forward_group_norm_f32(
  int n_channels = src0->ne[2];
  int n_groups = dst->op_params[0];
  int n_channels_per_group = (n_channels + n_groups - 1) / n_groups;
- for (int i = ith; i < n_groups; i+=nth) {
+ for (int i = ith; i < n_groups; i += nth) {
  int start = i * n_channels_per_group;
  int end = start + n_channels_per_group;
  if (end > n_channels) {
@@ -10245,28 +10441,32 @@ static void ggml_compute_forward_group_norm_f32(
  for (int64_t i01 = 0; i01 < ne01; i01++) {
  const float * x = (float *)((char *) src0->data + i01 * nb01 + i02 * nb02 + i03 * nb03);

+ ggml_float sumr = 0.0;
  for (int64_t i00 = 0; i00 < ne00; i00++) {
- sum += (ggml_float)x[i00];
+ sumr += (ggml_float)x[i00];
  }
+ sum += sumr;
  }
  }
- float mean = sum / (ne00 * ne01 * step);
- ggml_float sum2 = 0.0;
+ const float mean = sum / (ne00 * ne01 * step);

+ ggml_float sum2 = 0.0;
  for (int64_t i02 = start; i02 < end; i02++) {
  for (int64_t i01 = 0; i01 < ne01; i01++) {
  const float * x = (float *)((char *) src0->data + i01 * nb01 + i02 * nb02 + i03 * nb03);

  float * y = (float *)((char *) dst->data + i01 * nb1 + i02 * nb2 + i03 * nb3);

+ ggml_float sumr = 0.0;
  for (int64_t i00 = 0; i00 < ne00; i00++) {
  float v = x[i00] - mean;
  y[i00] = v;
- sum2 += (ggml_float)(v * v);
+ sumr += (ggml_float)(v * v);
  }
+ sum2 += sumr;
  }
  }
- float variance = sum2 / (ne00 * ne01 * step);
+ const float variance = sum2 / (ne00 * ne01 * step);
  const float scale = 1.0f / sqrtf(variance + eps);

  for (int64_t i02 = start; i02 < end; i02++) {
@@ -11373,8 +11573,6 @@ static void ggml_compute_forward_get_rows_q(
  const struct ggml_tensor * src0 = dst->src[0];
  const struct ggml_tensor * src1 = dst->src[1];

- assert(params->ith == 0);
-
  if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
  return;
  }
@@ -11382,7 +11580,7 @@ static void ggml_compute_forward_get_rows_q(
  GGML_TENSOR_BINARY_OP_LOCALS

  const int64_t nc = ne00;
- const int64_t nr = ggml_nelements(src1); GGML_UNUSED(nr);
+ const int64_t nr = ggml_nelements(src1);

  const enum ggml_type type = src0->type;
  ggml_to_float_t const dequantize_row_q = type_traits[type].to_float;
@@ -11392,17 +11590,25 @@ static void ggml_compute_forward_get_rows_q(
  assert(nb00 == ggml_type_size(type));
  assert(ggml_nrows(dst) == nr);

- // TODO: multi-thread
- for (int64_t i12 = 0; i12 < ne12; ++i12) {
- for (int64_t i11 = 0; i11 < ne11; ++i11) {
- for (int64_t i10 = 0; i10 < ne10; ++i10) {
- const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
+ const int ith = params->ith;
+ const int nth = params->nth;

- dequantize_row_q(
- (const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03),
- (float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc);
- }
- }
+ // rows per thread
+ const int dr = (nr + nth - 1)/nth;
+
+ // row range for this thread
+ const int ir0 = dr*ith;
+ const int ir1 = MIN(ir0 + dr, nr);
+
+ for (int64_t i = ir0; i < ir1; ++i) {
+ const int64_t i12 = i/(ne11*ne10);
+ const int64_t i11 = (i - i12*ne11*ne10)/ne10;
+ const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10);
+ const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
+
+ dequantize_row_q(
+ (const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03),
+ (float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc);
  }
  }

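Editorial note on the new threading scheme used by all three get_rows kernels: the nr = ggml_nelements(src1) destination rows are split into contiguous chunks of dr = (nr + nth - 1)/nth per thread, and each flat row index is decomposed back into (i10, i11, i12). A small worked example with assumed extents ne10 = 4 and ne11 = 2 (not from the diff):

    const int64_t ne10 = 4, ne11 = 2;
    const int64_t i   = 5;
    const int64_t i12 = i/(ne11*ne10);                  // 0
    const int64_t i11 = (i - i12*ne11*ne10)/ne10;       // 1
    const int64_t i10 = i - i12*ne11*ne10 - i11*ne10;   // 1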
@@ -11413,8 +11619,6 @@ static void ggml_compute_forward_get_rows_f16(
  const struct ggml_tensor * src0 = dst->src[0];
  const struct ggml_tensor * src1 = dst->src[1];

- assert(params->ith == 0);
-
  if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
  return;
  }
@@ -11422,24 +11626,32 @@ static void ggml_compute_forward_get_rows_f16(
  GGML_TENSOR_BINARY_OP_LOCALS

  const int64_t nc = ne00;
- const int64_t nr = ggml_nelements(src1); GGML_UNUSED(nr);
+ const int64_t nr = ggml_nelements(src1);

  assert(ne0 == nc);
  assert(ne02 == ne11);
  assert(nb00 == sizeof(ggml_fp16_t));
  assert(ggml_nrows(dst) == nr);

- // TODO: multi-thread
- for (int64_t i12 = 0; i12 < ne12; ++i12) {
- for (int64_t i11 = 0; i11 < ne11; ++i11) {
- for (int64_t i10 = 0; i10 < ne10; ++i10) {
- const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
+ const int ith = params->ith;
+ const int nth = params->nth;

- ggml_fp16_to_fp32_row(
- (const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03),
- (float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc);
- }
- }
+ // rows per thread
+ const int dr = (nr + nth - 1)/nth;
+
+ // row range for this thread
+ const int ir0 = dr*ith;
+ const int ir1 = MIN(ir0 + dr, nr);
+
+ for (int64_t i = ir0; i < ir1; ++i) {
+ const int64_t i12 = i/(ne11*ne10);
+ const int64_t i11 = (i - i12*ne11*ne10)/ne10;
+ const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10);
+ const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
+
+ ggml_fp16_to_fp32_row(
+ (const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03),
+ (float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc);
  }
  }

@@ -11450,8 +11662,6 @@ static void ggml_compute_forward_get_rows_f32(
  const struct ggml_tensor * src0 = dst->src[0];
  const struct ggml_tensor * src1 = dst->src[1];

- assert(params->ith == 0);
-
  if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
  return;
  }
@@ -11459,24 +11669,32 @@ static void ggml_compute_forward_get_rows_f32(
  GGML_TENSOR_BINARY_OP_LOCALS

  const int64_t nc = ne00;
- const int64_t nr = ggml_nelements(src1); GGML_UNUSED(nr);
+ const int64_t nr = ggml_nelements(src1);

  assert(ne0 == nc);
  assert(ne02 == ne11);
  assert(nb00 == sizeof(float));
  assert(ggml_nrows(dst) == nr);

- // TODO: multi-thread
- for (int64_t i12 = 0; i12 < ne12; ++i12) {
- for (int64_t i11 = 0; i11 < ne11; ++i11) {
- for (int64_t i10 = 0; i10 < ne10; ++i10) {
- const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
+ const int ith = params->ith;
+ const int nth = params->nth;

- ggml_vec_cpy_f32(nc,
- (float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3),
- (float *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03));
- }
- }
+ // rows per thread
+ const int dr = (nr + nth - 1)/nth;
+
+ // row range for this thread
+ const int ir0 = dr*ith;
+ const int ir1 = MIN(ir0 + dr, nr);
+
+ for (int64_t i = ir0; i < ir1; ++i) {
+ const int64_t i12 = i/(ne11*ne10);
+ const int64_t i11 = (i - i12*ne11*ne10)/ne10;
+ const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10);
+ const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
+
+ ggml_vec_cpy_f32(nc,
+ (float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3),
+ (float *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03));
  }
  }

@@ -12213,6 +12431,8 @@ static void ggml_compute_forward_alibi(
  case GGML_TYPE_I8:
  case GGML_TYPE_I16:
  case GGML_TYPE_I32:
+ case GGML_TYPE_I64:
+ case GGML_TYPE_F64:
  case GGML_TYPE_COUNT:
  {
  GGML_ASSERT(false);
@@ -12299,6 +12519,8 @@ static void ggml_compute_forward_clamp(
  case GGML_TYPE_I8:
  case GGML_TYPE_I16:
  case GGML_TYPE_I32:
+ case GGML_TYPE_I64:
+ case GGML_TYPE_F64:
  case GGML_TYPE_COUNT:
  {
  GGML_ASSERT(false);
@@ -13547,6 +13769,106 @@ static void ggml_compute_forward_pad(
  }
  }

+
+ // ggml_compute_forward_arange
+
+ static void ggml_compute_forward_arange_f32(
+ const struct ggml_compute_params * params,
+ struct ggml_tensor * dst) {
+
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
+ return;
+ }
+
+ GGML_ASSERT(dst->nb[0] == sizeof(float));
+
+ const int ith = params->ith;
+ const int nth = params->nth;
+
+ const float start = ggml_get_op_params_f32(dst, 0);
+ const float stop = ggml_get_op_params_f32(dst, 1);
+ const float step = ggml_get_op_params_f32(dst, 2);
+
+ const int64_t steps = (int64_t) ceilf((stop - start) / step);
+
+ GGML_ASSERT(ggml_nelements(dst) == steps);
+
+ for (int64_t i = ith; i < steps; i+= nth) {
+ float value = start + step * i;
+ ((float *)dst->data)[i] = value;
+ }
+ }
+
+ static void ggml_compute_forward_arange(
+ const struct ggml_compute_params * params,
+ struct ggml_tensor * dst) {
+ switch (dst->type) {
+ case GGML_TYPE_F32:
+ {
+ ggml_compute_forward_arange_f32(params, dst);
+ } break;
+ default:
+ {
+ GGML_ASSERT(false);
+ } break;
+ }
+ }
+
+ static void ggml_compute_forward_timestep_embedding_f32(
+ const struct ggml_compute_params * params,
+ struct ggml_tensor * dst) {
+
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
+ return;
+ }
+
+ const struct ggml_tensor * src0 = dst->src[0];
+
+ GGML_ASSERT(src0->nb[0] == sizeof(float));
+
+ const int ith = params->ith;
+ const int nth = params->nth;
+
+ GGML_TENSOR_UNARY_OP_LOCALS
+
+ const int dim = ggml_get_op_params_i32(dst, 0);
+ const int max_period = ggml_get_op_params_i32(dst, 1);
+
+ int half = dim / 2;
+
+ for (int64_t i = 0; i < ne00; i++) {
+ float * embed_data = (float *)((char *) dst->data + i*nb1);
+ for (int64_t j = ith; j < half; j += nth) {
+ float timestep = ((float *)src0->data)[i];
+ float freq = (float)expf(-logf(max_period) * j / half);
+ float arg = timestep * freq;
+ embed_data[j] = cosf(arg);
+ embed_data[j + half] = sinf(arg);
+ }
+ if (dim % 2 != 0 && ith == 0) {
+ embed_data[dim] = 0.f;
+ }
+ }
+ }
+
+ static void ggml_compute_forward_timestep_embedding(
+ const struct ggml_compute_params * params,
+ struct ggml_tensor * dst) {
+
+ const struct ggml_tensor * src0 = dst->src[0];
+
+ switch (src0->type) {
+ case GGML_TYPE_F32:
+ {
+ ggml_compute_forward_timestep_embedding_f32(params, dst);
+ } break;
+ default:
+ {
+ GGML_ASSERT(false);
+ } break;
+ }
+ }
+
  // ggml_compute_forward_argsort

  static void ggml_compute_forward_argsort_f32(
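Editorial summary, not part of the diff: the timestep-embedding kernel above fills row i of the destination with the standard sinusoidal embedding of timestep t_i, with half = dim/2:

    f_j = \exp\!\left(-\frac{j \,\ln(\mathrm{max\_period})}{\mathrm{half}}\right), \qquad e_j = \cos(t_i f_j), \qquad e_{j+\mathrm{half}} = \sin(t_i f_j), \qquad 0 \le j < \mathrm{half}

and appends a single trailing zero when dim is odd.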
@@ -14590,6 +14912,257 @@ static void ggml_compute_forward_flash_attn_back(
  }
  }

+ // ggml_compute_forward_ssm_conv
+
+ static void ggml_compute_forward_ssm_conv_f32(
+ const struct ggml_compute_params * params,
+ struct ggml_tensor * dst) {
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
+ return;
+ }
+
+ const struct ggml_tensor * src0 = dst->src[0]; // conv_state
+ const struct ggml_tensor * src1 = dst->src[1]; // x
+ const struct ggml_tensor * src2 = dst->src[2]; // conv1d.weight
+ const struct ggml_tensor * src3 = dst->src[3]; // state_seq
+
+ const int ith = params->ith;
+ const int nth = params->nth;
+
+ const int nc = src2->ne[0]; // d_conv
+ const int nr = src0->ne[1]; // d_inner
+ const int n_t = src1->ne[1]; // n_tokens
+ const int n_kv = src0->ne[2]; // max number of sequences in the batch
+
+ GGML_ASSERT((nr*n_t) + (nc*nr*n_kv) == ggml_nelements(dst));
+ GGML_ASSERT(src0->nb[0] == sizeof(float));
+ GGML_ASSERT(src1->nb[0] == sizeof(float));
+ GGML_ASSERT(src2->nb[0] == sizeof(float));
+ GGML_ASSERT(src3->nb[0] == sizeof(int32_t));
+ GGML_ASSERT(src0->nb[1] == src0->ne[0]*sizeof(float));
+ // for use with the destination state offset between sequences
+ GGML_ASSERT(src2->nb[2] == src2->ne[1]*src2->ne[0]*sizeof(float));
+
+ // rows per thread
+ const int dr = (nr + nth - 1)/nth;
+
+ // row range for this thread
+ const int ir0 = dr*ith;
+ const int ir1 = MIN(ir0 + dr, nr);
+ const int ir = ir1 - ir0;
+
+ if (n_kv > 1) {
+ // multiple sequences means it's hard to know when it's the first time a state is read,
+ // so copy them all over to the destination, just to be sure.
+ for (int i3 = 0; i3 < n_kv; ++i3) {
+ float * s0 = (float *) ((char *) src0->data + ir0*(src0->nb[1]) + i3*(src0->nb[2]));
+ float * s = (float *) ((char *) dst->data + ir0*(src2->nb[1]) + i3*(src2->nb[2]) + nr*n_t*sizeof(float));
+ // can't use memcpy because of d_conv vs d_conv - 1
+ for (int i1 = 0; i1 < ir; ++i1) {
+ for (int i0 = 0; i0 < nc - 1; ++i0) {
+ // copy s0 to last (d_conv - 1) columns of s
+ s[1 + i0 + i1*nc] = s0[i0 + i1*(nc - 1)];
+ }
+ }
+ }
+ }
+
+ for (int i2 = 0; i2 < n_t; ++i2) {
+ int32_t * sq = (int32_t *) ((char *) src3->data + i2*(src3->nb[1])); // {n_kv, n_tokens}
+ float * x = (float *) ((char *) dst->data + ir0*sizeof(float) + i2*(nr*sizeof(float))); // {d_inner, n_tokens}
+ float * s = (float *) ((char *) dst->data + ir0*(src2->nb[1]) + sq[0]*(src2->nb[2]) + nr*n_t*sizeof(float)); // {d_conv, d_inner, n_kv}
+ float * s0; // {d_conv - 1, d_inner, n_kv}
+ float * x0 = (float *) ((char *) src1->data + ir0*(src1->nb[0]) + i2*(src1->nb[1])); // {d_inner, n_tokens}
+ float * c = (float *) ((char *) src2->data + ir0*(src2->nb[1])); // {d_conv, d_inner}
+ int ne0s0;
+
+ GGML_ASSERT(0 <= sq[0] && sq[0] < n_kv);
+
+ // avoid needing to copy the state for the first token
+ if (i2 == 0) {
+ s0 = (float *) ((char *) src0->data + ir0*(src0->nb[1]) + sq[0]*(src0->nb[2])); // {d_conv - 1, d_inner, n_kv}
+ ne0s0 = src0->ne[0];
+ } else {
+ // the source is the last (d_conv - 1) columns of the destination
+ s0 = s + 1;
+ ne0s0 = nc;
+ }
+
+ // d_inner
+ for (int i1 = 0; i1 < ir; ++i1) {
+ // shift state left
+ for (int i0 = 0; i0 < nc - 1; ++i0) {
+ s[i0 + i1*nc] = s0[i0 + i1*ne0s0];
+ }
+ // insert x on the last column
+ s[(nc - 1) + i1*nc] = x0[i1];
+ }
+
+ // handle copies when there are multiple output states
+ for (int i3 = 1; i3 < n_kv; ++i3) {
+ int32_t seq = sq[i3];
+ if (0 <= seq && seq < n_kv) {
+ float * s1 = s + (seq - sq[0])*nc*nr;
+ memcpy(s1, s, nc*ir*sizeof(float));
+ } else {
+ // stop at negative or too big seq_ids
+ break;
+ }
+ }
+
+ // it seems a little faster when this is separate from the state shift
+ for (int i1 = 0; i1 < ir; ++i1) {
+ // rowwise dot product
+ float sumf = 0.0f;
+ for (int i0 = 0; i0 < nc; ++i0) {
+ int i = i0 + i1*nc;
+ sumf += s[i] * c[i];
+ }
+ x[i1] = sumf;
+ }
+ }
+ }
+
+ static void ggml_compute_forward_ssm_conv(
+ const struct ggml_compute_params * params,
+ struct ggml_tensor * dst) {
+ switch (dst->src[0]->type) {
+ case GGML_TYPE_F32:
+ {
+ ggml_compute_forward_ssm_conv_f32(params, dst);
+ } break;
+ default:
+ {
+ GGML_ASSERT(false);
+ } break;
+ }
+ }
+
+ // ggml_compute_forward_ssm_scan
+
+ static void ggml_compute_forward_ssm_scan_f32(
+ const struct ggml_compute_params * params,
+ struct ggml_tensor * dst) {
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
+ return;
+ }
+
+ const struct ggml_tensor * src0 = dst->src[0]; // s
+ const struct ggml_tensor * src1 = dst->src[1]; // x
+ const struct ggml_tensor * src2 = dst->src[2]; // dt
+ const struct ggml_tensor * src3 = dst->src[3]; // A
+ const struct ggml_tensor * src4 = dst->src[4]; // B
+ const struct ggml_tensor * src5 = dst->src[5]; // C
+ const struct ggml_tensor * src6 = dst->src[6]; // sq
+
+ const int ith = params->ith;
+ const int nth = params->nth;
+
+ const int64_t nc = src0->ne[0]; // d_state
+ const int64_t nr = src0->ne[1]; // d_inner
+ const int64_t n_t = src1->ne[1]; // number of tokens in the batch
+ const int64_t n_kv = src0->ne[2]; // max number of sequences in the batch
+
+ GGML_ASSERT(ggml_nelements(src1) + ggml_nelements(src0) == ggml_nelements(dst));
+ GGML_ASSERT(src0->nb[0] == sizeof(float));
+ GGML_ASSERT(src1->nb[0] == sizeof(float));
+ GGML_ASSERT(src2->nb[0] == sizeof(float));
+ GGML_ASSERT(src3->nb[0] == sizeof(float));
+ GGML_ASSERT(src4->nb[0] == sizeof(float));
+ GGML_ASSERT(src5->nb[0] == sizeof(float));
+ // required for the dot product between s and C, and when copying the states
+ GGML_ASSERT(src0->nb[1] == src0->ne[0]*sizeof(float));
+ // required for per-sequence offsets for states
+ GGML_ASSERT(src0->nb[2] == src0->ne[0]*src0->ne[1]*sizeof(float));
+ // required to get correct offset for state destination (i.e. src1->nb[2])
+ GGML_ASSERT(src1->nb[2] == src1->ne[0]*src1->ne[1]*sizeof(float));
+
+ // rows per thread
+ const int dr = (nr + nth - 1)/nth;
+
+ // row range for this thread
+ const int ir0 = dr*ith;
+ const int ir1 = MIN(ir0 + dr, nr);
+ const int ir = ir1 - ir0;
+
+ if (n_kv > 1) {
+ // it's hard to know if the source states have already been copied
+ // when there are multiple, so copy them already.
+ for (int i3 = 0; i3 < n_kv; ++i3) {
+ float * s0 = (float *) ((char *) src0->data + ir0*(src0->nb[1]) + i3*(src0->nb[2]));
+ float * s = (float *) ((char *) dst->data + ir0*(src0->nb[1]) + i3*(src0->nb[2]) + src1->nb[2]);
+ memcpy(s, s0, nc*ir*sizeof(float));
+ }
+ }
+
+ for (int i2 = 0; i2 < n_t; ++i2) {
+ int32_t * sq = (int32_t *) ((char *) src6->data + i2*(src6->nb[1])); // {n_kv, n_tokens}
+ float * y = (float *) ((char *) dst->data + ir0*(src1->nb[0]) + i2*(src1->nb[1])); // {d_inner, n_tokens}
+ float * s = (float *) ((char *) dst->data + ir0*(src0->nb[1]) + sq[0]*(src0->nb[2]) + src1->nb[2]); // {d_state, d_inner, n_kv}
+ float * s0;
+ float * x = (float *) ((char *) src1->data + ir0*(src1->nb[0]) + i2*(src1->nb[1])); // {d_inner, n_tokens}
+ float * dt = (float *) ((char *) src2->data + ir0*(src2->nb[0]) + i2*(src2->nb[1])); // {d_inner, n_tokens}
+ float * A = (float *) ((char *) src3->data + ir0*(src3->nb[1])); // {d_state, d_inner}
+ float * B = (float *) ((char *) src4->data + i2*(src4->nb[1])); // {d_state, n_tokens}
+ float * C = (float *) ((char *) src5->data + i2*(src5->nb[1])); // {d_state, n_tokens}
+
+ GGML_ASSERT(0 <= sq[0] && sq[0] < n_kv);
+
+ // avoid needing to copy the state for the first token
+ if (i2 == 0) {
+ s0 = (float *) ((char *) src0->data + ir0*(src0->nb[1]) + sq[0]*(src0->nb[2])); // {d_state, d_inner, n_kv}
+ } else {
+ // otherwise the source is the same as the destination
+ s0 = s;
+ }
+
+ // d_inner
+ for (int i1 = 0; i1 < ir; ++i1) {
+ // ref: https://github.com/state-spaces/mamba/blob/34076d664838588a3c97727b263478ab9f621a07/mamba_ssm/ops/triton/selective_state_update.py#L78
+ float dt_soft_plus = dt[i1] <= 20.0f ? log1pf(expf(dt[i1])) : dt[i1];
+ float x_dt = x[i1] * dt_soft_plus;
+ float sumf = 0.0f;
+ // d_state
+ for (int i0 = 0; i0 < nc; ++i0) {
+ int i = i0 + i1*nc;
+ // state = prev_state * dA + dB * x
+ float state = (s0[i] * expf(dt_soft_plus * A[i])) + (B[i0] * x_dt);
+ // y = rowwise_dotprod(state, C)
+ sumf += state * C[i0];
+ s[i] = state;
+ }
+ y[i1] = sumf;
+ }
+
+ // handle copies when there are multiple output states
+ for (int i3 = 1; i3 < n_kv; ++i3) {
+ int32_t seq = sq[i3];
+ if (0 <= seq && seq < n_kv) {
+ float * s1 = s + (seq - sq[0])*nc*nr;
+ memcpy(s1, s, nc*ir*sizeof(float));
+ } else {
+ // stop at negative or too big seq_ids
+ break;
+ }
+ }
+ }
+ }
+
+ static void ggml_compute_forward_ssm_scan(
+ const struct ggml_compute_params * params,
+ struct ggml_tensor * dst) {
+ switch (dst->src[0]->type) {
+ case GGML_TYPE_F32:
+ {
+ ggml_compute_forward_ssm_scan_f32(params, dst);
+ } break;
+ default:
+ {
+ GGML_ASSERT(false);
+ } break;
+ }
+ }
+
  // ggml_compute_forward_win_part

  static void ggml_compute_forward_win_part_f32(
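Editorial summary, not part of the diff: per inner channel i1 and state dimension i0, the scan above implements the selective-state-space update from the referenced Mamba kernel,

    \Delta_{i_1} = \operatorname{softplus}(dt_{i_1}), \qquad s_{i_0 i_1} \leftarrow s_{i_0 i_1}\, e^{\Delta_{i_1} A_{i_0 i_1}} + B_{i_0}\, x_{i_1} \Delta_{i_1}, \qquad y_{i_1} = \sum_{i_0} s_{i_0 i_1}\, C_{i_0}

with the softplus short-circuited to dt itself when dt > 20 to avoid overflow in expf.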
@@ -15615,6 +16188,14 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
  {
  ggml_compute_forward_pad(params, tensor);
  } break;
+ case GGML_OP_ARANGE:
+ {
+ ggml_compute_forward_arange(params, tensor);
+ } break;
+ case GGML_OP_TIMESTEP_EMBEDDING:
+ {
+ ggml_compute_forward_timestep_embedding(params, tensor);
+ } break;
  case GGML_OP_ARGSORT:
  {
  ggml_compute_forward_argsort(params, tensor);
@@ -15641,6 +16222,14 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
  bool masked = t != 0;
  ggml_compute_forward_flash_attn_back(params, masked, tensor);
  } break;
+ case GGML_OP_SSM_CONV:
+ {
+ ggml_compute_forward_ssm_conv(params, tensor);
+ } break;
+ case GGML_OP_SSM_SCAN:
+ {
+ ggml_compute_forward_ssm_scan(params, tensor);
+ } break;
  case GGML_OP_WIN_PART:
  {
  ggml_compute_forward_win_part(params, tensor);
@@ -16617,6 +17206,14 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
  {
  GGML_ASSERT(false); // TODO: not implemented
  } break;
+ case GGML_OP_ARANGE:
+ {
+ GGML_ASSERT(false); // TODO: not implemented
+ } break;
+ case GGML_OP_TIMESTEP_EMBEDDING:
+ {
+ GGML_ASSERT(false); // TODO: not implemented
+ } break;
  case GGML_OP_ARGSORT:
  {
  GGML_ASSERT(false); // TODO: not implemented
@@ -16687,6 +17284,11 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
  {
  GGML_ASSERT(false); // not supported
  } break;
+ case GGML_OP_SSM_CONV:
+ case GGML_OP_SSM_SCAN:
+ {
+ GGML_ASSERT(false); // TODO: not implemented
+ } break;
  case GGML_OP_WIN_PART:
  case GGML_OP_WIN_UNPART:
  case GGML_OP_UNARY:
@@ -17217,6 +17819,7 @@ struct ggml_compute_state {
  ggml_thread_t thrd;
  int ith;
  struct ggml_compute_state_shared * shared;
+ enum ggml_status ec;
  };

  static void ggml_graph_compute_perf_stats_node(struct ggml_tensor * node, const struct ggml_compute_state_shared * st) {
@@ -17228,7 +17831,7 @@ static void ggml_graph_compute_perf_stats_node(struct ggml_tensor * node, const
  node->perf_time_us += time_us_cur;
  }

- static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
+ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads, int n_cur_threads) {
  int n_tasks = 0;

  switch (node->op) {
@@ -17309,6 +17912,12 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
  {
  n_tasks = n_threads;
  } break;
+ case GGML_OP_GET_ROWS:
+ {
+ // FIXME: the cost of launching additional threads decreases performance with GPU offloading
+ //n_tasks = MIN(n_threads, ggml_nelements(node->src[1]));
+ n_tasks = MIN(n_cur_threads, ggml_nelements(node->src[1]));
+ } break;
  case GGML_OP_SCALE:
  case GGML_OP_SET:
  case GGML_OP_CONT:
@@ -17316,7 +17925,6 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
  case GGML_OP_VIEW:
  case GGML_OP_PERMUTE:
  case GGML_OP_TRANSPOSE:
- case GGML_OP_GET_ROWS:
  case GGML_OP_GET_ROWS_BACK:
  case GGML_OP_DIAG:
  {
@@ -17368,6 +17976,14 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
  {
  n_tasks = n_threads;
  } break;
+ case GGML_OP_ARANGE:
+ {
+ n_tasks = n_threads;
+ } break;
+ case GGML_OP_TIMESTEP_EMBEDDING:
+ {
+ n_tasks = n_threads;
+ } break;
  case GGML_OP_ARGSORT:
  {
  n_tasks = n_threads;
@@ -17384,6 +18000,11 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
  {
  n_tasks = n_threads;
  } break;
+ case GGML_OP_SSM_CONV:
+ case GGML_OP_SSM_SCAN:
+ {
+ n_tasks = n_threads;
+ } break;
  case GGML_OP_WIN_PART:
  case GGML_OP_WIN_UNPART:
  case GGML_OP_GET_REL_POS:
@@ -17502,7 +18123,8 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
  while (true) {
  if (cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) {
  state->shared->node_n += 1;
- return (thread_ret_t) GGML_EXIT_ABORTED;
+ state->ec = GGML_STATUS_ABORTED;
+ return 0;
  }

  if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) {
@@ -17520,7 +18142,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
  /* FINALIZE */
  struct ggml_tensor * node = cgraph->nodes[node_n];
  if (GGML_OP_HAS_FINALIZE[node->op]) {
- params.nth = ggml_get_n_tasks(node, n_threads);
+ params.nth = ggml_get_n_tasks(node, n_threads, state->shared->n_threads);
  ggml_compute_forward(&params, node);
  }
  ggml_graph_compute_perf_stats_node(node, state->shared);
@@ -17530,7 +18152,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
  while (++node_n < cgraph->n_nodes) {
  GGML_PRINT_DEBUG_5("%s: %d/%d\n", __func__, node_n, cgraph->n_nodes);
  struct ggml_tensor * node = cgraph->nodes[node_n];
- const int n_tasks = ggml_get_n_tasks(node, n_threads);
+ const int n_tasks = ggml_get_n_tasks(node, n_threads, state->shared->n_threads);

  state->shared->perf_node_start_cycles = ggml_perf_cycles();
  state->shared->perf_node_start_time_us = ggml_perf_time_us();
@@ -17578,7 +18200,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {

  /* INIT & COMPUTE */
  struct ggml_tensor * node = cgraph->nodes[node_n];
- const int n_tasks = ggml_get_n_tasks(node, n_threads);
+ const int n_tasks = ggml_get_n_tasks(node, n_threads, state->shared->n_threads);

  struct ggml_compute_params params = {
  /*.type =*/ GGML_TASK_TYPE_INIT,
@@ -17624,7 +18246,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
  }
  }

- return GGML_EXIT_SUCCESS;
+ return 0;
  }

  struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threads) {
@@ -17643,7 +18265,7 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
  for (int i = 0; i < cgraph->n_nodes; i++) {
  struct ggml_tensor * node = cgraph->nodes[i];

- const int n_tasks = ggml_get_n_tasks(node, n_threads);
+ const int n_tasks = ggml_get_n_tasks(node, n_threads, 1);

  max_tasks = MAX(max_tasks, n_tasks);

@@ -17820,7 +18442,7 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
  return cplan;
  }

- int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
+ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
  {
  GGML_ASSERT(cplan);
  GGML_ASSERT(cplan->n_threads > 0);
@@ -17864,6 +18486,7 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
  .thrd = 0,
  .ith = j,
  .shared = &state_shared,
+ .ec = GGML_STATUS_SUCCESS,
  };

  const int rc = ggml_thread_create(&workers[j].thrd, NULL, ggml_graph_compute_thread, &workers[j]);
@@ -17874,12 +18497,14 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {

  workers[0].ith = 0;
  workers[0].shared = &state_shared;
+ workers[0].ec = GGML_STATUS_SUCCESS;

  const int64_t perf_start_cycles = ggml_perf_cycles();
  const int64_t perf_start_time_us = ggml_perf_time_us();

  // this is a work thread too
- int compute_status = (size_t) ggml_graph_compute_thread(&workers[0]);
+ ggml_graph_compute_thread(&workers[0]);
+ enum ggml_status compute_status = workers[0].ec;

  // don't leave affinity set on the main thread
  clear_numa_thread_affinity();
@@ -17889,6 +18514,8 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
  for (int j = 1; j < n_threads; j++) {
  const int rc = ggml_thread_join(workers[j].thrd, NULL);
  GGML_ASSERT(rc == 0);
+ if (workers[j].ec != GGML_STATUS_SUCCESS)
+ compute_status = workers[j].ec;
  }
  }

@@ -17916,14 +18543,14 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
  return compute_status;
  }

- void ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads) {
+ enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads) {
  struct ggml_cplan cplan = ggml_graph_plan(cgraph, n_threads);

  struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_TYPE_WORK_BUFFER, cplan.work_size);

  cplan.work_data = (uint8_t *)ctx->mem_buffer + obj->offs;

- ggml_graph_compute(cgraph, &cplan);
+ return ggml_graph_compute(cgraph, &cplan);
  }

  struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name) {
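Editorial note: with this change both ggml_graph_compute and ggml_graph_compute_with_ctx report an enum ggml_status instead of the old int exit codes, and aborts in worker threads are propagated through the per-thread ec field. A hypothetical caller-side sketch (not from the diff):

    enum ggml_status st = ggml_graph_compute_with_ctx(ctx, gf, /*n_threads=*/4);
    if (st != GGML_STATUS_SUCCESS) {
        fprintf(stderr, "graph compute failed: %s\n", ggml_status_to_string(st));
    }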
@@ -19572,133 +20199,6 @@ void ggml_quantize_free(void) {
  ggml_critical_section_end();
  }

- size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist) {
- assert(k % QK4_0 == 0);
- const int nb = k / QK4_0;
-
- for (int b = 0; b < n; b += k) {
- block_q4_0 * restrict y = (block_q4_0 *) dst + b/QK4_0;
-
- quantize_row_q4_0_reference(src + b, y, k);
-
- for (int i = 0; i < nb; i++) {
- for (int j = 0; j < QK4_0; j += 2) {
- const uint8_t vi0 = y[i].qs[j/2] & 0x0F;
- const uint8_t vi1 = y[i].qs[j/2] >> 4;
-
- hist[vi0]++;
- hist[vi1]++;
- }
- }
- }
-
- return (n/QK4_0*sizeof(block_q4_0));
- }
-
- size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist) {
- assert(k % QK4_1 == 0);
- const int nb = k / QK4_1;
-
- for (int b = 0; b < n; b += k) {
- block_q4_1 * restrict y = (block_q4_1 *) dst + b/QK4_1;
-
- quantize_row_q4_1_reference(src + b, y, k);
-
- for (int i = 0; i < nb; i++) {
- for (int j = 0; j < QK4_1; j += 2) {
- const uint8_t vi0 = y[i].qs[j/2] & 0x0F;
- const uint8_t vi1 = y[i].qs[j/2] >> 4;
-
- hist[vi0]++;
- hist[vi1]++;
- }
- }
- }
-
- return (n/QK4_1*sizeof(block_q4_1));
- }
-
- size_t ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t * hist) {
- assert(k % QK5_0 == 0);
- const int nb = k / QK5_0;
-
- for (int b = 0; b < n; b += k) {
- block_q5_0 * restrict y = (block_q5_0 *)dst + b/QK5_0;
-
- quantize_row_q5_0_reference(src + b, y, k);
-
- for (int i = 0; i < nb; i++) {
- uint32_t qh;
- memcpy(&qh, &y[i].qh, sizeof(qh));
-
- for (int j = 0; j < QK5_0; j += 2) {
- const uint8_t vh0 = ((qh & (1u << (j/2 + 0 ))) >> (j/2 + 0 )) << 4;
- const uint8_t vh1 = ((qh & (1u << (j/2 + 16))) >> (j/2 + 12));
-
- // cast to 16 bins
- const uint8_t vi0 = ((y[i].qs[j/2] & 0x0F) | vh0) / 2;
- const uint8_t vi1 = ((y[i].qs[j/2] >> 4) | vh1) / 2;
-
- hist[vi0]++;
- hist[vi1]++;
- }
- }
- }
-
- return (n/QK5_0*sizeof(block_q5_0));
- }
-
- size_t ggml_quantize_q5_1(const float * src, void * dst, int n, int k, int64_t * hist) {
- assert(k % QK5_1 == 0);
- const int nb = k / QK5_1;
-
- for (int b = 0; b < n; b += k) {
- block_q5_1 * restrict y = (block_q5_1 *)dst + b/QK5_1;
-
- quantize_row_q5_1_reference(src + b, y, k);
-
- for (int i = 0; i < nb; i++) {
- uint32_t qh;
- memcpy(&qh, &y[i].qh, sizeof(qh));
-
- for (int j = 0; j < QK5_1; j += 2) {
- const uint8_t vh0 = ((qh & (1u << (j/2 + 0 ))) >> (j/2 + 0 )) << 4;
- const uint8_t vh1 = ((qh & (1u << (j/2 + 16))) >> (j/2 + 12));
-
- // cast to 16 bins
- const uint8_t vi0 = ((y[i].qs[j/2] & 0x0F) | vh0) / 2;
- const uint8_t vi1 = ((y[i].qs[j/2] >> 4) | vh1) / 2;
-
- hist[vi0]++;
- hist[vi1]++;
- }
- }
- }
-
- return (n/QK5_1*sizeof(block_q5_1));
- }
-
- size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t * hist) {
- assert(k % QK8_0 == 0);
- const int nb = k / QK8_0;
-
- for (int b = 0; b < n; b += k) {
- block_q8_0 * restrict y = (block_q8_0 *)dst + b/QK8_0;
-
- quantize_row_q8_0_reference(src + b, y, k);
-
- for (int i = 0; i < nb; i++) {
- for (int j = 0; j < QK8_0; ++j) {
- const int8_t vi = y[i].qs[j];
-
- hist[vi/16 + 8]++;
- }
- }
- }
-
- return (n/QK8_0*sizeof(block_q8_0));
- }
-
  bool ggml_quantize_requires_imatrix(enum ggml_type type) {
  return
  type == GGML_TYPE_IQ2_XXS ||
@@ -19706,177 +20206,52 @@ bool ggml_quantize_requires_imatrix(enum ggml_type type) {
  type == GGML_TYPE_IQ1_S;
  }

- size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start,
- int nrows, int n_per_row, int64_t * hist, const float * imatrix) {
+ size_t ggml_quantize_chunk(
+ enum ggml_type type,
+ const float * src,
+ void * dst,
+ int start,
+ int nrows,
+ int n_per_row,
+ const float * imatrix) {
+ const int n = nrows * n_per_row;
+
+ if (ggml_quantize_requires_imatrix(type)) {
+ GGML_ASSERT(imatrix != NULL);
+ }
+
+ GGML_ASSERT(start % type_traits[type].blck_size == 0);
+ GGML_ASSERT(start % n_per_row == 0);
+
  ggml_quantize_init(type); // this is noop if already initialized
+
+ const size_t start_row = start / n_per_row;
+ const size_t row_size = ggml_row_size(type, n_per_row);
+
  size_t result = 0;
- int n = nrows * n_per_row;
+
  switch (type) {
- case GGML_TYPE_Q4_0:
- {
- GGML_ASSERT(start % QK4_0 == 0);
- GGML_ASSERT(start % n_per_row == 0);
- size_t start_row = start / n_per_row;
- size_t row_size = ggml_row_size(type, n_per_row);
- result = quantize_q4_0(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
- GGML_ASSERT(result == row_size * nrows);
- } break;
- case GGML_TYPE_Q4_1:
- {
- GGML_ASSERT(start % QK4_1 == 0);
- GGML_ASSERT(start % n_per_row == 0);
- size_t start_row = start / n_per_row;
- size_t row_size = ggml_row_size(type, n_per_row);
- result = quantize_q4_1(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
- GGML_ASSERT(result == row_size * nrows);
- } break;
- case GGML_TYPE_Q5_0:
- {
- GGML_ASSERT(start % QK5_0 == 0);
- GGML_ASSERT(start % n_per_row == 0);
- size_t start_row = start / n_per_row;
- size_t row_size = ggml_row_size(type, n_per_row);
- result = quantize_q5_0(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
- GGML_ASSERT(result == row_size * nrows);
- } break;
- case GGML_TYPE_Q5_1:
- {
- GGML_ASSERT(start % QK5_1 == 0);
- GGML_ASSERT(start % n_per_row == 0);
- size_t start_row = start / n_per_row;
- size_t row_size = ggml_row_size(type, n_per_row);
- result = quantize_q5_1(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
- GGML_ASSERT(result == row_size * nrows);
- } break;
- case GGML_TYPE_Q8_0:
- {
- GGML_ASSERT(start % QK8_0 == 0);
- block_q8_0 * block = (block_q8_0*)dst + start / QK8_0;
- result = ggml_quantize_q8_0(src + start, block, n, n, hist);
- } break;
- case GGML_TYPE_Q2_K:
- {
- GGML_ASSERT(start % QK_K == 0);
- GGML_ASSERT(start % n_per_row == 0);
- size_t start_row = start / n_per_row;
- size_t row_size = ggml_row_size(type, n_per_row);
- result = quantize_q2_K(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
- GGML_ASSERT(result == row_size * nrows);
- } break;
- case GGML_TYPE_Q3_K:
- {
- GGML_ASSERT(start % QK_K == 0);
- GGML_ASSERT(start % n_per_row == 0);
- size_t start_row = start / n_per_row;
- size_t row_size = ggml_row_size(type, n_per_row);
- result = quantize_q3_K(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
- GGML_ASSERT(result == row_size * nrows);
- } break;
- case GGML_TYPE_Q4_K:
- {
- GGML_ASSERT(start % QK_K == 0);
- GGML_ASSERT(start % n_per_row == 0);
- size_t start_row = start / n_per_row;
- size_t row_size = ggml_row_size(type, n_per_row);
- result = quantize_q4_K(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
- GGML_ASSERT(result == row_size * nrows);
- } break;
- case GGML_TYPE_Q5_K:
- {
- GGML_ASSERT(start % QK_K == 0);
- GGML_ASSERT(start % n_per_row == 0);
- size_t start_row = start / n_per_row;
- size_t row_size = ggml_row_size(type, n_per_row);
- result = quantize_q5_K(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
- GGML_ASSERT(result == row_size * nrows);
- } break;
- case GGML_TYPE_Q6_K:
- {
- GGML_ASSERT(start % QK_K == 0);
- GGML_ASSERT(start % n_per_row == 0);
- size_t start_row = start / n_per_row;
- size_t row_size = ggml_row_size(type, n_per_row);
- result = quantize_q6_K(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
- GGML_ASSERT(result == row_size * nrows);
- } break;
- case GGML_TYPE_IQ2_XXS:
- {
- GGML_ASSERT(start % QK_K == 0);
- GGML_ASSERT(start % n_per_row == 0);
- GGML_ASSERT(imatrix);
- size_t start_row = start / n_per_row;
- size_t row_size = ggml_row_size(type, n_per_row);
- result = quantize_iq2_xxs(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
- GGML_ASSERT(result == row_size * nrows);
- } break;
- case GGML_TYPE_IQ2_XS:
- {
- GGML_ASSERT(start % QK_K == 0);
- GGML_ASSERT(start % n_per_row == 0);
- GGML_ASSERT(imatrix);
- size_t start_row = start / n_per_row;
- size_t row_size = ggml_row_size(type, n_per_row);
- result = quantize_iq2_xs(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
- GGML_ASSERT(result == row_size * nrows);
- } break;
- case GGML_TYPE_IQ3_XXS:
- {
- GGML_ASSERT(start % QK_K == 0);
- GGML_ASSERT(start % n_per_row == 0);
- size_t start_row = start / n_per_row;
- size_t row_size = ggml_row_size(type, n_per_row);
- result = quantize_iq3_xxs(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
- GGML_ASSERT(result == row_size * nrows);
- } break;
- case GGML_TYPE_IQ3_S:
- {
- GGML_ASSERT(start % QK_K == 0);
- GGML_ASSERT(start % n_per_row == 0);
- size_t start_row = start / n_per_row;
- size_t row_size = ggml_row_size(type, n_per_row);
- result = quantize_iq3_s(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
- GGML_ASSERT(result == row_size * nrows);
- } break;
- case GGML_TYPE_IQ2_S:
- {
- GGML_ASSERT(start % QK_K == 0);
- GGML_ASSERT(start % n_per_row == 0);
- size_t start_row = start / n_per_row;
- size_t row_size = ggml_row_size(type, n_per_row);
- result = quantize_iq2_s(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
- GGML_ASSERT(result == row_size * nrows);
- } break;
- case GGML_TYPE_IQ1_S:
- {
- GGML_ASSERT(start % QK_K == 0);
- GGML_ASSERT(start % n_per_row == 0);
- size_t start_row = start / n_per_row;
- size_t row_size = ggml_row_size(type, n_per_row);
- result = quantize_iq1_s(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
- GGML_ASSERT(result == row_size * nrows);
- } break;
- case GGML_TYPE_IQ4_NL:
+ case GGML_TYPE_Q4_0: result = quantize_q4_0(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
+ case GGML_TYPE_Q4_1: result = quantize_q4_1(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
+ case GGML_TYPE_Q5_0: result = quantize_q5_0(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
+ case GGML_TYPE_Q5_1: result = quantize_q5_1(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
+ case GGML_TYPE_Q8_0: result = quantize_q8_0(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
+ case GGML_TYPE_Q2_K: result = quantize_q2_K(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
+ case GGML_TYPE_Q3_K: result = quantize_q3_K(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
+ case GGML_TYPE_Q4_K: result = quantize_q4_K(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
+ case GGML_TYPE_Q5_K: result = quantize_q5_K(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
+ case GGML_TYPE_Q6_K: result = quantize_q6_K(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
+ case GGML_TYPE_IQ2_XXS: result = quantize_iq2_xxs(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
+ case GGML_TYPE_IQ2_XS: result = quantize_iq2_xs (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
+ case GGML_TYPE_IQ3_XXS: result = quantize_iq3_xxs(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
+ case GGML_TYPE_IQ3_S: result = quantize_iq3_s (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
+ case GGML_TYPE_IQ2_S: result = quantize_iq2_s (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
+ case GGML_TYPE_IQ1_S: result = quantize_iq1_s (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
+ case GGML_TYPE_IQ4_NL: result = quantize_iq4_nl (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
  #if QK_K == 64
- case GGML_TYPE_IQ4_XS:
- #endif
- {
- GGML_ASSERT(start % QK4_NL == 0);
- GGML_ASSERT(start % n_per_row == 0);
- size_t start_row = start / n_per_row;
- size_t row_size = ggml_row_size(type, n_per_row);
- result = quantize_iq4_nl(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
- GGML_ASSERT(result == row_size * nrows);
- } break;
- #if QK_K != 64
- case GGML_TYPE_IQ4_XS:
- {
- GGML_ASSERT(start % QK_K == 0);
- GGML_ASSERT(start % n_per_row == 0);
- size_t start_row = start / n_per_row;
- size_t row_size = ggml_row_size(type, n_per_row);
- result = quantize_iq4_xs(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
- GGML_ASSERT(result == row_size * nrows);
- } break;
+ case GGML_TYPE_IQ4_XS: result = quantize_iq4_nl (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
+ #else
+ case GGML_TYPE_IQ4_XS: result = quantize_iq4_xs (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
  #endif
  case GGML_TYPE_F16:
  {
@@ -19893,6 +20268,9 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i
  default:
  assert(false);
  }
+
+ GGML_ASSERT(result == nrows * row_size);
+
  return result;
  }
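Editorial note: the refactored ggml_quantize_chunk drops the int64_t * hist histogram argument (the per-type ggml_quantize_q*_* helpers removed above go away with it) and now asserts result == nrows * row_size for every type. A hypothetical caller-side sketch of the new signature, with buffer sizing left to the caller; Q4_0 does not require an importance matrix, so imatrix may be NULL:

    const size_t written = ggml_quantize_chunk(GGML_TYPE_Q4_0, src, dst,
                                               /*start=*/0, nrows, n_per_row,
                                               /*imatrix=*/NULL);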