llama_cpp 0.13.0 → 0.14.1

@@ -320,6 +320,17 @@ static ggml_fp16_t ggml_table_exp_f16[1 << 16];
320
320
  // precomputed f32 table for f16 (256 KB) (ggml-impl.h)
321
321
  float ggml_table_f32_f16[1 << 16];
322
322
 
323
+ const char * ggml_status_to_string(enum ggml_status status) {
324
+ switch (status) {
325
+ case GGML_STATUS_ALLOC_FAILED: return "GGML status: error (failed to allocate memory)";
326
+ case GGML_STATUS_FAILED: return "GGML status: error (operation failed)";
327
+ case GGML_STATUS_SUCCESS: return "GGML status: success";
328
+ case GGML_STATUS_ABORTED: return "GGML status: warning (operation aborted)";
329
+ }
330
+
331
+ return "GGML status: unknown";
332
+ }
333
+
323
334
  // note: do not use these inside ggml.c
324
335
  // these are meant to be used via the ggml.h API
325
336
  float ggml_fp16_to_fp32(ggml_fp16_t x) {
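The new ggml_status_to_string helper gives callers a readable message for the ggml_status values that graph computation now returns (see the ggml_graph_compute changes further down in this diff). A trivial sketch of using it, assuming the matching declaration in this release's ggml.h; report_failure is a hypothetical helper, not part of ggml:

    #include <stdio.h>
    #include "ggml.h"

    // print a message only when a computation did not succeed (hypothetical helper)
    static void report_failure(enum ggml_status st) {
        if (st != GGML_STATUS_SUCCESS) {
            fprintf(stderr, "%s\n", ggml_status_to_string(st));
        }
    }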
@@ -459,6 +470,19 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
459
470
  .type_size = sizeof(int32_t),
460
471
  .is_quantized = false,
461
472
  },
473
+ [GGML_TYPE_I64] = {
474
+ .type_name = "i64",
475
+ .blck_size = 1,
476
+ .type_size = sizeof(int64_t),
477
+ .is_quantized = false,
478
+ },
479
+ [GGML_TYPE_F64] = {
480
+ .type_name = "f64",
481
+ .blck_size = 1,
482
+ .type_size = sizeof(double),
483
+ .is_quantized = false,
484
+ .nrows = 1,
485
+ },
462
486
  [GGML_TYPE_F32] = {
463
487
  .type_name = "f32",
464
488
  .blck_size = 1,
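The new I64 and F64 entries mean the generic type helpers now answer for these types as well. A minimal sketch, assuming the GGML_TYPE_I64 and GGML_TYPE_F64 enum values exposed by this release's ggml.h:

    #include <stdio.h>
    #include "ggml.h"

    int main(void) {
        // both new types are plain (unquantized) scalars with block size 1
        printf("i64 element size: %zu bytes\n", ggml_type_size(GGML_TYPE_I64));
        printf("f64 element size: %zu bytes\n", ggml_type_size(GGML_TYPE_F64));
        return 0;
    }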
@@ -846,7 +870,7 @@ inline static float vaddvq_f32(float32x4_t v) {
846
870
  #define GGML_F16x8 float16x8_t
847
871
  #define GGML_F16x8_ZERO vdupq_n_f16(0.0f)
848
872
  #define GGML_F16x8_SET1(x) vdupq_n_f16(x)
849
- #define GGML_F16x8_LOAD(x) vld1q_f16((const __fp16 *)(x))
873
+ #define GGML_F16x8_LOAD(x) vld1q_f16((const ggml_fp16_internal_t *)(x))
850
874
  #define GGML_F16x8_STORE vst1q_f16
851
875
  #define GGML_F16x8_FMA(a, b, c) vfmaq_f16(a, b, c)
852
876
  #define GGML_F16x8_ADD vaddq_f16
@@ -889,7 +913,7 @@ inline static float vaddvq_f32(float32x4_t v) {
889
913
  #define GGML_F32Cx4 float32x4_t
890
914
  #define GGML_F32Cx4_ZERO vdupq_n_f32(0.0f)
891
915
  #define GGML_F32Cx4_SET1(x) vdupq_n_f32(x)
892
- #define GGML_F32Cx4_LOAD(x) vcvt_f32_f16(vld1_f16((const __fp16 *)(x)))
916
+ #define GGML_F32Cx4_LOAD(x) vcvt_f32_f16(vld1_f16((const ggml_fp16_internal_t *)(x)))
893
917
  #define GGML_F32Cx4_STORE(x, y) vst1_f16(x, vcvt_f16_f32(y))
894
918
  #define GGML_F32Cx4_FMA(a, b, c) vfmaq_f32(a, b, c)
895
919
  #define GGML_F32Cx4_ADD vaddq_f32
@@ -1822,12 +1846,16 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
1822
1846
  "POOL_2D",
1823
1847
  "UPSCALE",
1824
1848
  "PAD",
1849
+ "ARANGE",
1850
+ "TIMESTEP_EMBEDDING",
1825
1851
  "ARGSORT",
1826
1852
  "LEAKY_RELU",
1827
1853
 
1828
1854
  "FLASH_ATTN",
1829
1855
  "FLASH_FF",
1830
1856
  "FLASH_ATTN_BACK",
1857
+ "SSM_CONV",
1858
+ "SSM_SCAN",
1831
1859
  "WIN_PART",
1832
1860
  "WIN_UNPART",
1833
1861
  "GET_REL_POS",
@@ -1850,7 +1878,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
1850
1878
  "CROSS_ENTROPY_LOSS_BACK",
1851
1879
  };
1852
1880
 
1853
- static_assert(GGML_OP_COUNT == 72, "GGML_OP_COUNT != 72");
1881
+ static_assert(GGML_OP_COUNT == 76, "GGML_OP_COUNT != 76");
1854
1882
 
1855
1883
  static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
1856
1884
  "none",
@@ -1908,12 +1936,16 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
1908
1936
  "pool_2d(x)",
1909
1937
  "upscale(x)",
1910
1938
  "pad(x)",
1939
+ "arange(start, stop, step)",
1940
+ "timestep_embedding(timesteps, dim, max_period)",
1911
1941
  "argsort(x)",
1912
1942
  "leaky_relu(x)",
1913
1943
 
1914
1944
  "flash_attn(x)",
1915
1945
  "flash_ff(x)",
1916
1946
  "flash_attn_back(x)",
1947
+ "ssm_conv(x)",
1948
+ "ssm_scan(x)",
1917
1949
  "win_part(x)",
1918
1950
  "win_unpart(x)",
1919
1951
  "get_rel_pos(x)",
@@ -1936,7 +1968,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
1936
1968
  "cross_entropy_loss_back(x,y)",
1937
1969
  };
1938
1970
 
1939
- static_assert(GGML_OP_COUNT == 72, "GGML_OP_COUNT != 72");
1971
+ static_assert(GGML_OP_COUNT == 76, "GGML_OP_COUNT != 76");
1940
1972
 
1941
1973
  static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
1942
1974
 
@@ -2139,7 +2171,10 @@ void ggml_numa_init(enum ggml_numa_strategy numa_flag) {
2139
2171
  getcpu_ret = getcpu(&current_cpu, &g_state.numa.current_node);
2140
2172
  #else
2141
2173
  // old glibc doesn't have a wrapper for this call. Fall back on direct syscall
2142
- getcpu_ret = syscall(SYS_getcpu,&current_cpu,&g_state.numa.current_node);
2174
+ # if !defined(SYS_getcpu) && defined(SYS_get_cpu)
2175
+ # define SYS_getcpu SYS_get_cpu // some older glibc versions use this name
2176
+ # endif
2177
+ getcpu_ret = syscall(SYS_getcpu, &current_cpu, &g_state.numa.current_node);
2143
2178
  #endif
2144
2179
 
2145
2180
  if (g_state.numa.n_nodes < 1 || g_state.numa.total_cpus < 1 || getcpu_ret != 0) {
@@ -2895,11 +2930,21 @@ static int32_t ggml_get_op_params_i32(const struct ggml_tensor * tensor, uint32_
2895
2930
  return ((const int32_t *)(tensor->op_params))[i];
2896
2931
  }
2897
2932
 
2933
+ static float ggml_get_op_params_f32(const struct ggml_tensor * tensor, uint32_t i) {
2934
+ assert(i < GGML_MAX_OP_PARAMS / sizeof(float));
2935
+ return ((const float *)(tensor->op_params))[i];
2936
+ }
2937
+
2898
2938
  static void ggml_set_op_params_i32(struct ggml_tensor * tensor, uint32_t i, int32_t value) {
2899
2939
  assert(i < GGML_MAX_OP_PARAMS / sizeof(int32_t));
2900
2940
  ((int32_t *)(tensor->op_params))[i] = value;
2901
2941
  }
2902
2942
 
2943
+ static void ggml_set_op_params_f32(struct ggml_tensor * tensor, uint32_t i, float value) {
2944
+ assert(i < GGML_MAX_OP_PARAMS / sizeof(float));
2945
+ ((float *)(tensor->op_params))[i] = value;
2946
+ }
2947
+
2903
2948
  struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor) {
2904
2949
  memset(tensor->data, 0, ggml_nbytes(tensor));
2905
2950
  return tensor;
@@ -5898,6 +5943,55 @@ struct ggml_tensor * ggml_upscale(
5898
5943
  return ggml_upscale_impl(ctx, a, scale_factor);
5899
5944
  }
5900
5945
 
5946
+ struct ggml_tensor * ggml_arange(
5947
+ struct ggml_context * ctx,
5948
+ float start,
5949
+ float stop,
5950
+ float step) {
5951
+
5952
+ GGML_ASSERT(stop > start);
5953
+
5954
+ const int64_t steps = (int64_t) ceilf((stop - start) / step);
5955
+
5956
+ struct ggml_tensor * result = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, steps);
5957
+
5958
+ result->op = GGML_OP_ARANGE;
5959
+ ggml_set_op_params_f32(result, 0, start);
5960
+ ggml_set_op_params_f32(result, 1, stop);
5961
+ ggml_set_op_params_f32(result, 2, step);
5962
+
5963
+ return result;
5964
+ }
5965
+
5966
+ struct ggml_tensor * ggml_timestep_embedding(
5967
+ struct ggml_context * ctx,
5968
+ struct ggml_tensor * timesteps,
5969
+ int dim,
5970
+ int max_period) {
5971
+ bool is_node = false;
5972
+
5973
+ if (timesteps->grad) {
5974
+ GGML_ASSERT(false); // TODO: implement backward
5975
+ is_node = true;
5976
+ }
5977
+
5978
+ int actual_dim = dim;
5979
+ if (dim % 2 != 0) {
5980
+ actual_dim = dim + 1;
5981
+ }
5982
+
5983
+ struct ggml_tensor * result = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, actual_dim, timesteps->ne[0]);
5984
+
5985
+ result->op = GGML_OP_TIMESTEP_EMBEDDING;
5986
+ ggml_set_op_params_i32(result, 0, dim);
5987
+ ggml_set_op_params_i32(result, 1, max_period);
5988
+
5989
+ result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5990
+ result->src[0] = timesteps;
5991
+
5992
+ return result;
5993
+ }
5994
+
5901
5995
  // ggml_argsort
5902
5996
 
5903
5997
  struct ggml_tensor * ggml_argsort(
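For orientation, a minimal sketch of how the two new graph builders above might be driven from the public API; the context size and thread count are arbitrary choices for the sketch, not recommendations:

    #include <stdio.h>
    #include "ggml.h"

    int main(void) {
        struct ggml_init_params ip = {
            /*.mem_size   =*/ 16*1024*1024,  // arbitrary scratch size for this sketch
            /*.mem_buffer =*/ NULL,
            /*.no_alloc   =*/ false,
        };
        struct ggml_context * ctx = ggml_init(ip);

        struct ggml_tensor * t   = ggml_arange(ctx, 0.0f, 8.0f, 1.0f);          // 8 timesteps: 0..7
        struct ggml_tensor * emb = ggml_timestep_embedding(ctx, t, 16, 10000);  // {16, 8} sinusoidal table

        struct ggml_cgraph * gf = ggml_new_graph(ctx);
        ggml_build_forward_expand(gf, emb);

        enum ggml_status st = ggml_graph_compute_with_ctx(ctx, gf, /*n_threads=*/ 4);
        printf("%s\n", ggml_status_to_string(st));

        ggml_free(ctx);
        return 0;
    }

Note that ggml_arange rounds the element count up with ceilf, so a non-integer (stop - start) / step still yields a tensor, and ggml_timestep_embedding pads an odd dim up to the next even size, as shown in the builder above.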
@@ -6077,6 +6171,108 @@ struct ggml_tensor * ggml_flash_attn_back(
6077
6171
  return result;
6078
6172
  }
6079
6173
 
6174
+ // ggml_ssm_conv
6175
+
6176
+ struct ggml_tensor * ggml_ssm_conv(
6177
+ struct ggml_context * ctx,
6178
+ struct ggml_tensor * s,
6179
+ struct ggml_tensor * x,
6180
+ struct ggml_tensor * c,
6181
+ struct ggml_tensor * sq) {
6182
+ GGML_ASSERT(ggml_is_3d(s));
6183
+ GGML_ASSERT(ggml_is_matrix(x));
6184
+ GGML_ASSERT(ggml_is_matrix(c));
6185
+ GGML_ASSERT(ggml_is_matrix(sq));
6186
+ GGML_ASSERT(sq->type == GGML_TYPE_I32);
6187
+
6188
+ const int64_t d_conv = c->ne[0];
6189
+ const int64_t d_inner = c->ne[1];
6190
+ const int64_t n_tokens = x->ne[1];
6191
+ const int64_t n_kv = s->ne[2];
6192
+
6193
+ GGML_ASSERT( s->ne[0] == d_conv - 1);
6194
+ GGML_ASSERT( s->ne[1] == d_inner);
6195
+ GGML_ASSERT( x->ne[0] == d_inner);
6196
+ GGML_ASSERT(sq->ne[0] == n_kv);
6197
+ GGML_ASSERT(sq->ne[1] == n_tokens);
6198
+
6199
+ bool is_node = false;
6200
+
6201
+ if (s->grad || x->grad || c->grad || sq->grad) {
6202
+ GGML_ASSERT(false); // TODO: implement
6203
+ is_node = true;
6204
+ }
6205
+
6206
+ // 2-in-1 concatenated x and conv_states, {d_inner, n_tokens} with {d_conv, d_inner, n_kv}
6207
+ struct ggml_tensor * result = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, (d_inner*n_tokens) + (d_conv*d_inner*n_kv));
6208
+
6209
+ result->op = GGML_OP_SSM_CONV;
6210
+ result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6211
+ result->src[0] = s;
6212
+ result->src[1] = x;
6213
+ result->src[2] = c;
6214
+ result->src[3] = sq;
6215
+
6216
+ return result;
6217
+ }
6218
+
6219
+ // ggml_ssm_scan
6220
+
6221
+ struct ggml_tensor * ggml_ssm_scan(
6222
+ struct ggml_context * ctx,
6223
+ struct ggml_tensor * s,
6224
+ struct ggml_tensor * x,
6225
+ struct ggml_tensor * dt,
6226
+ struct ggml_tensor * A,
6227
+ struct ggml_tensor * B,
6228
+ struct ggml_tensor * C,
6229
+ struct ggml_tensor * sq) {
6230
+ GGML_ASSERT(ggml_is_contiguous(s));
6231
+ GGML_ASSERT(ggml_is_contiguous(x));
6232
+ GGML_ASSERT(ggml_is_contiguous(dt));
6233
+ GGML_ASSERT(ggml_is_contiguous(A));
6234
+ GGML_ASSERT(sq->type == GGML_TYPE_I32);
6235
+ GGML_ASSERT(B->nb[0] == ggml_type_size(B->type));
6236
+ GGML_ASSERT(C->nb[0] == ggml_type_size(C->type));
6237
+ GGML_ASSERT(ggml_are_same_shape(x, dt));
6238
+
6239
+ {
6240
+ const int64_t d_state = s->ne[0];
6241
+ const int64_t d_inner = s->ne[1];
6242
+ const int64_t n_tokens = x->ne[1];
6243
+
6244
+ GGML_ASSERT(x->ne[0] == d_inner);
6245
+ GGML_ASSERT(A->ne[0] == d_state);
6246
+ GGML_ASSERT(A->ne[1] == d_inner);
6247
+ GGML_ASSERT(B->ne[0] == d_state);
6248
+ GGML_ASSERT(B->ne[1] == n_tokens);
6249
+ GGML_ASSERT(C->ne[0] == d_state);
6250
+ GGML_ASSERT(C->ne[1] == n_tokens);
6251
+ }
6252
+
6253
+ bool is_node = false;
6254
+
6255
+ if (s->grad || x->grad || dt->grad || A->grad || B->grad || C->grad || sq->grad) {
6256
+ GGML_ASSERT(false); // TODO: implement
6257
+ is_node = true;
6258
+ }
6259
+
6260
+ // 2-in-1 concatenated y and ssm_states, {d_inner, n_tokens} with {d_state, d_inner, n_kv}
6261
+ struct ggml_tensor * result = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, ggml_nelements(x) + ggml_nelements(s));
6262
+
6263
+ result->op = GGML_OP_SSM_SCAN;
6264
+ result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6265
+ result->src[0] = s;
6266
+ result->src[1] = x;
6267
+ result->src[2] = dt;
6268
+ result->src[3] = A;
6269
+ result->src[4] = B;
6270
+ result->src[5] = C;
6271
+ result->src[6] = sq;
6272
+
6273
+ return result;
6274
+ }
6275
+
6080
6276
  // ggml_win_part
6081
6277
 
6082
6278
  struct ggml_tensor * ggml_win_part(
@@ -10231,7 +10427,7 @@ static void ggml_compute_forward_group_norm_f32(
10231
10427
  int n_channels = src0->ne[2];
10232
10428
  int n_groups = dst->op_params[0];
10233
10429
  int n_channels_per_group = (n_channels + n_groups - 1) / n_groups;
10234
- for (int i = ith; i < n_groups; i+=nth) {
10430
+ for (int i = ith; i < n_groups; i += nth) {
10235
10431
  int start = i * n_channels_per_group;
10236
10432
  int end = start + n_channels_per_group;
10237
10433
  if (end > n_channels) {
@@ -10245,28 +10441,32 @@ static void ggml_compute_forward_group_norm_f32(
10245
10441
  for (int64_t i01 = 0; i01 < ne01; i01++) {
10246
10442
  const float * x = (float *)((char *) src0->data + i01 * nb01 + i02 * nb02 + i03 * nb03);
10247
10443
 
10444
+ ggml_float sumr = 0.0;
10248
10445
  for (int64_t i00 = 0; i00 < ne00; i00++) {
10249
- sum += (ggml_float)x[i00];
10446
+ sumr += (ggml_float)x[i00];
10250
10447
  }
10448
+ sum += sumr;
10251
10449
  }
10252
10450
  }
10253
- float mean = sum / (ne00 * ne01 * step);
10254
- ggml_float sum2 = 0.0;
10451
+ const float mean = sum / (ne00 * ne01 * step);
10255
10452
 
10453
+ ggml_float sum2 = 0.0;
10256
10454
  for (int64_t i02 = start; i02 < end; i02++) {
10257
10455
  for (int64_t i01 = 0; i01 < ne01; i01++) {
10258
10456
  const float * x = (float *)((char *) src0->data + i01 * nb01 + i02 * nb02 + i03 * nb03);
10259
10457
 
10260
10458
  float * y = (float *)((char *) dst->data + i01 * nb1 + i02 * nb2 + i03 * nb3);
10261
10459
 
10460
+ ggml_float sumr = 0.0;
10262
10461
  for (int64_t i00 = 0; i00 < ne00; i00++) {
10263
10462
  float v = x[i00] - mean;
10264
10463
  y[i00] = v;
10265
- sum2 += (ggml_float)(v * v);
10464
+ sumr += (ggml_float)(v * v);
10266
10465
  }
10466
+ sum2 += sumr;
10267
10467
  }
10268
10468
  }
10269
- float variance = sum2 / (ne00 * ne01 * step);
10469
+ const float variance = sum2 / (ne00 * ne01 * step);
10270
10470
  const float scale = 1.0f / sqrtf(variance + eps);
10271
10471
 
10272
10472
  for (int64_t i02 = start; i02 < end; i02++) {
@@ -11373,8 +11573,6 @@ static void ggml_compute_forward_get_rows_q(
11373
11573
  const struct ggml_tensor * src0 = dst->src[0];
11374
11574
  const struct ggml_tensor * src1 = dst->src[1];
11375
11575
 
11376
- assert(params->ith == 0);
11377
-
11378
11576
  if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
11379
11577
  return;
11380
11578
  }
@@ -11382,7 +11580,7 @@ static void ggml_compute_forward_get_rows_q(
11382
11580
  GGML_TENSOR_BINARY_OP_LOCALS
11383
11581
 
11384
11582
  const int64_t nc = ne00;
11385
- const int64_t nr = ggml_nelements(src1); GGML_UNUSED(nr);
11583
+ const int64_t nr = ggml_nelements(src1);
11386
11584
 
11387
11585
  const enum ggml_type type = src0->type;
11388
11586
  ggml_to_float_t const dequantize_row_q = type_traits[type].to_float;
@@ -11392,17 +11590,25 @@ static void ggml_compute_forward_get_rows_q(
11392
11590
  assert(nb00 == ggml_type_size(type));
11393
11591
  assert(ggml_nrows(dst) == nr);
11394
11592
 
11395
- // TODO: multi-thread
11396
- for (int64_t i12 = 0; i12 < ne12; ++i12) {
11397
- for (int64_t i11 = 0; i11 < ne11; ++i11) {
11398
- for (int64_t i10 = 0; i10 < ne10; ++i10) {
11399
- const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
11593
+ const int ith = params->ith;
11594
+ const int nth = params->nth;
11400
11595
 
11401
- dequantize_row_q(
11402
- (const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03),
11403
- (float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc);
11404
- }
11405
- }
11596
+ // rows per thread
11597
+ const int dr = (nr + nth - 1)/nth;
11598
+
11599
+ // row range for this thread
11600
+ const int ir0 = dr*ith;
11601
+ const int ir1 = MIN(ir0 + dr, nr);
11602
+
11603
+ for (int64_t i = ir0; i < ir1; ++i) {
11604
+ const int64_t i12 = i/(ne11*ne10);
11605
+ const int64_t i11 = (i - i12*ne11*ne10)/ne10;
11606
+ const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10);
11607
+ const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
11608
+
11609
+ dequantize_row_q(
11610
+ (const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03),
11611
+ (float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc);
11406
11612
  }
11407
11613
  }
11408
11614
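The get_rows kernels are now split across threads with ggml's usual rows-per-thread partition instead of running on thread 0 only. A standalone sketch of the ir0/ir1 arithmetic with hypothetical row and thread counts:

    #include <stdio.h>

    #define MIN(a, b) ((a) < (b) ? (a) : (b))

    int main(void) {
        const int nr  = 10;  // total rows to gather (hypothetical)
        const int nth = 4;   // number of threads (hypothetical)
        const int dr  = (nr + nth - 1)/nth;        // rows per thread, rounded up
        for (int ith = 0; ith < nth; ++ith) {
            const int ir0 = dr*ith;                // first row for this thread
            const int ir1 = MIN(ir0 + dr, nr);     // one past the last row
            printf("thread %d: rows [%d, %d)\n", ith, ir0, ir1);
        }
        return 0;
    }

The same partition is applied to the f16 and f32 variants in the hunks that follow.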
 
@@ -11413,8 +11619,6 @@ static void ggml_compute_forward_get_rows_f16(
11413
11619
  const struct ggml_tensor * src0 = dst->src[0];
11414
11620
  const struct ggml_tensor * src1 = dst->src[1];
11415
11621
 
11416
- assert(params->ith == 0);
11417
-
11418
11622
  if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
11419
11623
  return;
11420
11624
  }
@@ -11422,24 +11626,32 @@ static void ggml_compute_forward_get_rows_f16(
11422
11626
  GGML_TENSOR_BINARY_OP_LOCALS
11423
11627
 
11424
11628
  const int64_t nc = ne00;
11425
- const int64_t nr = ggml_nelements(src1); GGML_UNUSED(nr);
11629
+ const int64_t nr = ggml_nelements(src1);
11426
11630
 
11427
11631
  assert(ne0 == nc);
11428
11632
  assert(ne02 == ne11);
11429
11633
  assert(nb00 == sizeof(ggml_fp16_t));
11430
11634
  assert(ggml_nrows(dst) == nr);
11431
11635
 
11432
- // TODO: multi-thread
11433
- for (int64_t i12 = 0; i12 < ne12; ++i12) {
11434
- for (int64_t i11 = 0; i11 < ne11; ++i11) {
11435
- for (int64_t i10 = 0; i10 < ne10; ++i10) {
11436
- const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
11636
+ const int ith = params->ith;
11637
+ const int nth = params->nth;
11437
11638
 
11438
- ggml_fp16_to_fp32_row(
11439
- (const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03),
11440
- (float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc);
11441
- }
11442
- }
11639
+ // rows per thread
11640
+ const int dr = (nr + nth - 1)/nth;
11641
+
11642
+ // row range for this thread
11643
+ const int ir0 = dr*ith;
11644
+ const int ir1 = MIN(ir0 + dr, nr);
11645
+
11646
+ for (int64_t i = ir0; i < ir1; ++i) {
11647
+ const int64_t i12 = i/(ne11*ne10);
11648
+ const int64_t i11 = (i - i12*ne11*ne10)/ne10;
11649
+ const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10);
11650
+ const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
11651
+
11652
+ ggml_fp16_to_fp32_row(
11653
+ (const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03),
11654
+ (float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc);
11443
11655
  }
11444
11656
  }
11445
11657
 
@@ -11450,8 +11662,6 @@ static void ggml_compute_forward_get_rows_f32(
11450
11662
  const struct ggml_tensor * src0 = dst->src[0];
11451
11663
  const struct ggml_tensor * src1 = dst->src[1];
11452
11664
 
11453
- assert(params->ith == 0);
11454
-
11455
11665
  if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
11456
11666
  return;
11457
11667
  }
@@ -11459,24 +11669,32 @@ static void ggml_compute_forward_get_rows_f32(
11459
11669
  GGML_TENSOR_BINARY_OP_LOCALS
11460
11670
 
11461
11671
  const int64_t nc = ne00;
11462
- const int64_t nr = ggml_nelements(src1); GGML_UNUSED(nr);
11672
+ const int64_t nr = ggml_nelements(src1);
11463
11673
 
11464
11674
  assert(ne0 == nc);
11465
11675
  assert(ne02 == ne11);
11466
11676
  assert(nb00 == sizeof(float));
11467
11677
  assert(ggml_nrows(dst) == nr);
11468
11678
 
11469
- // TODO: multi-thread
11470
- for (int64_t i12 = 0; i12 < ne12; ++i12) {
11471
- for (int64_t i11 = 0; i11 < ne11; ++i11) {
11472
- for (int64_t i10 = 0; i10 < ne10; ++i10) {
11473
- const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
11679
+ const int ith = params->ith;
11680
+ const int nth = params->nth;
11474
11681
 
11475
- ggml_vec_cpy_f32(nc,
11476
- (float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3),
11477
- (float *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03));
11478
- }
11479
- }
11682
+ // rows per thread
11683
+ const int dr = (nr + nth - 1)/nth;
11684
+
11685
+ // row range for this thread
11686
+ const int ir0 = dr*ith;
11687
+ const int ir1 = MIN(ir0 + dr, nr);
11688
+
11689
+ for (int64_t i = ir0; i < ir1; ++i) {
11690
+ const int64_t i12 = i/(ne11*ne10);
11691
+ const int64_t i11 = (i - i12*ne11*ne10)/ne10;
11692
+ const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10);
11693
+ const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
11694
+
11695
+ ggml_vec_cpy_f32(nc,
11696
+ (float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3),
11697
+ (float *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03));
11480
11698
  }
11481
11699
  }
11482
11700
 
@@ -12213,6 +12431,8 @@ static void ggml_compute_forward_alibi(
12213
12431
  case GGML_TYPE_I8:
12214
12432
  case GGML_TYPE_I16:
12215
12433
  case GGML_TYPE_I32:
12434
+ case GGML_TYPE_I64:
12435
+ case GGML_TYPE_F64:
12216
12436
  case GGML_TYPE_COUNT:
12217
12437
  {
12218
12438
  GGML_ASSERT(false);
@@ -12299,6 +12519,8 @@ static void ggml_compute_forward_clamp(
12299
12519
  case GGML_TYPE_I8:
12300
12520
  case GGML_TYPE_I16:
12301
12521
  case GGML_TYPE_I32:
12522
+ case GGML_TYPE_I64:
12523
+ case GGML_TYPE_F64:
12302
12524
  case GGML_TYPE_COUNT:
12303
12525
  {
12304
12526
  GGML_ASSERT(false);
@@ -13547,6 +13769,106 @@ static void ggml_compute_forward_pad(
13547
13769
  }
13548
13770
  }
13549
13771
 
13772
+
13773
+ // ggml_compute_forward_arange
13774
+
13775
+ static void ggml_compute_forward_arange_f32(
13776
+ const struct ggml_compute_params * params,
13777
+ struct ggml_tensor * dst) {
13778
+
13779
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
13780
+ return;
13781
+ }
13782
+
13783
+ GGML_ASSERT(dst->nb[0] == sizeof(float));
13784
+
13785
+ const int ith = params->ith;
13786
+ const int nth = params->nth;
13787
+
13788
+ const float start = ggml_get_op_params_f32(dst, 0);
13789
+ const float stop = ggml_get_op_params_f32(dst, 1);
13790
+ const float step = ggml_get_op_params_f32(dst, 2);
13791
+
13792
+ const int64_t steps = (int64_t) ceilf((stop - start) / step);
13793
+
13794
+ GGML_ASSERT(ggml_nelements(dst) == steps);
13795
+
13796
+ for (int64_t i = ith; i < steps; i+= nth) {
13797
+ float value = start + step * i;
13798
+ ((float *)dst->data)[i] = value;
13799
+ }
13800
+ }
13801
+
13802
+ static void ggml_compute_forward_arange(
13803
+ const struct ggml_compute_params * params,
13804
+ struct ggml_tensor * dst) {
13805
+ switch (dst->type) {
13806
+ case GGML_TYPE_F32:
13807
+ {
13808
+ ggml_compute_forward_arange_f32(params, dst);
13809
+ } break;
13810
+ default:
13811
+ {
13812
+ GGML_ASSERT(false);
13813
+ } break;
13814
+ }
13815
+ }
13816
+
13817
+ static void ggml_compute_forward_timestep_embedding_f32(
13818
+ const struct ggml_compute_params * params,
13819
+ struct ggml_tensor * dst) {
13820
+
13821
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
13822
+ return;
13823
+ }
13824
+
13825
+ const struct ggml_tensor * src0 = dst->src[0];
13826
+
13827
+ GGML_ASSERT(src0->nb[0] == sizeof(float));
13828
+
13829
+ const int ith = params->ith;
13830
+ const int nth = params->nth;
13831
+
13832
+ GGML_TENSOR_UNARY_OP_LOCALS
13833
+
13834
+ const int dim = ggml_get_op_params_i32(dst, 0);
13835
+ const int max_period = ggml_get_op_params_i32(dst, 1);
13836
+
13837
+ int half = dim / 2;
13838
+
13839
+ for (int64_t i = 0; i < ne00; i++) {
13840
+ float * embed_data = (float *)((char *) dst->data + i*nb1);
13841
+ for (int64_t j = ith; j < half; j += nth) {
13842
+ float timestep = ((float *)src0->data)[i];
13843
+ float freq = (float)expf(-logf(max_period) * j / half);
13844
+ float arg = timestep * freq;
13845
+ embed_data[j] = cosf(arg);
13846
+ embed_data[j + half] = sinf(arg);
13847
+ }
13848
+ if (dim % 2 != 0 && ith == 0) {
13849
+ embed_data[dim] = 0.f;
13850
+ }
13851
+ }
13852
+ }
13853
+
13854
+ static void ggml_compute_forward_timestep_embedding(
13855
+ const struct ggml_compute_params * params,
13856
+ struct ggml_tensor * dst) {
13857
+
13858
+ const struct ggml_tensor * src0 = dst->src[0];
13859
+
13860
+ switch (src0->type) {
13861
+ case GGML_TYPE_F32:
13862
+ {
13863
+ ggml_compute_forward_timestep_embedding_f32(params, dst);
13864
+ } break;
13865
+ default:
13866
+ {
13867
+ GGML_ASSERT(false);
13868
+ } break;
13869
+ }
13870
+ }
13871
+
13550
13872
  // ggml_compute_forward_argsort
13551
13873
 
13552
13874
  static void ggml_compute_forward_argsort_f32(
@@ -14590,6 +14912,257 @@ static void ggml_compute_forward_flash_attn_back(
14590
14912
  }
14591
14913
  }
14592
14914
 
14915
+ // ggml_compute_forward_ssm_conv
14916
+
14917
+ static void ggml_compute_forward_ssm_conv_f32(
14918
+ const struct ggml_compute_params * params,
14919
+ struct ggml_tensor * dst) {
14920
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
14921
+ return;
14922
+ }
14923
+
14924
+ const struct ggml_tensor * src0 = dst->src[0]; // conv_state
14925
+ const struct ggml_tensor * src1 = dst->src[1]; // x
14926
+ const struct ggml_tensor * src2 = dst->src[2]; // conv1d.weight
14927
+ const struct ggml_tensor * src3 = dst->src[3]; // state_seq
14928
+
14929
+ const int ith = params->ith;
14930
+ const int nth = params->nth;
14931
+
14932
+ const int nc = src2->ne[0]; // d_conv
14933
+ const int nr = src0->ne[1]; // d_inner
14934
+ const int n_t = src1->ne[1]; // n_tokens
14935
+ const int n_kv = src0->ne[2]; // max number of sequences in the batch
14936
+
14937
+ GGML_ASSERT((nr*n_t) + (nc*nr*n_kv) == ggml_nelements(dst));
14938
+ GGML_ASSERT(src0->nb[0] == sizeof(float));
14939
+ GGML_ASSERT(src1->nb[0] == sizeof(float));
14940
+ GGML_ASSERT(src2->nb[0] == sizeof(float));
14941
+ GGML_ASSERT(src3->nb[0] == sizeof(int32_t));
14942
+ GGML_ASSERT(src0->nb[1] == src0->ne[0]*sizeof(float));
14943
+ // for use with the destination state offset between sequences
14944
+ GGML_ASSERT(src2->nb[2] == src2->ne[1]*src2->ne[0]*sizeof(float));
14945
+
14946
+ // rows per thread
14947
+ const int dr = (nr + nth - 1)/nth;
14948
+
14949
+ // row range for this thread
14950
+ const int ir0 = dr*ith;
14951
+ const int ir1 = MIN(ir0 + dr, nr);
14952
+ const int ir = ir1 - ir0;
14953
+
14954
+ if (n_kv > 1) {
14955
+ // multiple sequences means it's hard to know when it's the first time a state is read,
14956
+ // so copy them all over to the destination, just to be sure.
14957
+ for (int i3 = 0; i3 < n_kv; ++i3) {
14958
+ float * s0 = (float *) ((char *) src0->data + ir0*(src0->nb[1]) + i3*(src0->nb[2]));
14959
+ float * s = (float *) ((char *) dst->data + ir0*(src2->nb[1]) + i3*(src2->nb[2]) + nr*n_t*sizeof(float));
14960
+ // can't use memcpy because of d_conv vs d_conv - 1
14961
+ for (int i1 = 0; i1 < ir; ++i1) {
14962
+ for (int i0 = 0; i0 < nc - 1; ++i0) {
14963
+ // copy s0 to last (d_conv - 1) columns of s
14964
+ s[1 + i0 + i1*nc] = s0[i0 + i1*(nc - 1)];
14965
+ }
14966
+ }
14967
+ }
14968
+ }
14969
+
14970
+ for (int i2 = 0; i2 < n_t; ++i2) {
14971
+ int32_t * sq = (int32_t *) ((char *) src3->data + i2*(src3->nb[1])); // {n_kv, n_tokens}
14972
+ float * x = (float *) ((char *) dst->data + ir0*sizeof(float) + i2*(nr*sizeof(float))); // {d_inner, n_tokens}
14973
+ float * s = (float *) ((char *) dst->data + ir0*(src2->nb[1]) + sq[0]*(src2->nb[2]) + nr*n_t*sizeof(float)); // {d_conv, d_inner, n_kv}
14974
+ float * s0; // {d_conv - 1, d_inner, n_kv}
14975
+ float * x0 = (float *) ((char *) src1->data + ir0*(src1->nb[0]) + i2*(src1->nb[1])); // {d_inner, n_tokens}
14976
+ float * c = (float *) ((char *) src2->data + ir0*(src2->nb[1])); // {d_conv, d_inner}
14977
+ int ne0s0;
14978
+
14979
+ GGML_ASSERT(0 <= sq[0] && sq[0] < n_kv);
14980
+
14981
+ // avoid needing to copy the state for the first token
14982
+ if (i2 == 0) {
14983
+ s0 = (float *) ((char *) src0->data + ir0*(src0->nb[1]) + sq[0]*(src0->nb[2])); // {d_conv - 1, d_inner, n_kv}
14984
+ ne0s0 = src0->ne[0];
14985
+ } else {
14986
+ // the source is the last (d_conv - 1) columns of the destination
14987
+ s0 = s + 1;
14988
+ ne0s0 = nc;
14989
+ }
14990
+
14991
+ // d_inner
14992
+ for (int i1 = 0; i1 < ir; ++i1) {
14993
+ // shift state left
14994
+ for (int i0 = 0; i0 < nc - 1; ++i0) {
14995
+ s[i0 + i1*nc] = s0[i0 + i1*ne0s0];
14996
+ }
14997
+ // insert x on the last column
14998
+ s[(nc - 1) + i1*nc] = x0[i1];
14999
+ }
15000
+
15001
+ // handle copies when there are multiple output states
15002
+ for (int i3 = 1; i3 < n_kv; ++i3) {
15003
+ int32_t seq = sq[i3];
15004
+ if (0 <= seq && seq < n_kv) {
15005
+ float * s1 = s + (seq - sq[0])*nc*nr;
15006
+ memcpy(s1, s, nc*ir*sizeof(float));
15007
+ } else {
15008
+ // stop at negative or too big seq_ids
15009
+ break;
15010
+ }
15011
+ }
15012
+
15013
+ // it seems a little faster when this is separate from the state shift
15014
+ for (int i1 = 0; i1 < ir; ++i1) {
15015
+ // rowwise dot product
15016
+ float sumf = 0.0f;
15017
+ for (int i0 = 0; i0 < nc; ++i0) {
15018
+ int i = i0 + i1*nc;
15019
+ sumf += s[i] * c[i];
15020
+ }
15021
+ x[i1] = sumf;
15022
+ }
15023
+ }
15024
+ }
15025
+
15026
+ static void ggml_compute_forward_ssm_conv(
15027
+ const struct ggml_compute_params * params,
15028
+ struct ggml_tensor * dst) {
15029
+ switch (dst->src[0]->type) {
15030
+ case GGML_TYPE_F32:
15031
+ {
15032
+ ggml_compute_forward_ssm_conv_f32(params, dst);
15033
+ } break;
15034
+ default:
15035
+ {
15036
+ GGML_ASSERT(false);
15037
+ } break;
15038
+ }
15039
+ }
15040
+
15041
+ // ggml_compute_forward_ssm_scan
15042
+
15043
+ static void ggml_compute_forward_ssm_scan_f32(
15044
+ const struct ggml_compute_params * params,
15045
+ struct ggml_tensor * dst) {
15046
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
15047
+ return;
15048
+ }
15049
+
15050
+ const struct ggml_tensor * src0 = dst->src[0]; // s
15051
+ const struct ggml_tensor * src1 = dst->src[1]; // x
15052
+ const struct ggml_tensor * src2 = dst->src[2]; // dt
15053
+ const struct ggml_tensor * src3 = dst->src[3]; // A
15054
+ const struct ggml_tensor * src4 = dst->src[4]; // B
15055
+ const struct ggml_tensor * src5 = dst->src[5]; // C
15056
+ const struct ggml_tensor * src6 = dst->src[6]; // sq
15057
+
15058
+ const int ith = params->ith;
15059
+ const int nth = params->nth;
15060
+
15061
+ const int64_t nc = src0->ne[0]; // d_state
15062
+ const int64_t nr = src0->ne[1]; // d_inner
15063
+ const int64_t n_t = src1->ne[1]; // number of tokens in the batch
15064
+ const int64_t n_kv = src0->ne[2]; // max number of sequences in the batch
15065
+
15066
+ GGML_ASSERT(ggml_nelements(src1) + ggml_nelements(src0) == ggml_nelements(dst));
15067
+ GGML_ASSERT(src0->nb[0] == sizeof(float));
15068
+ GGML_ASSERT(src1->nb[0] == sizeof(float));
15069
+ GGML_ASSERT(src2->nb[0] == sizeof(float));
15070
+ GGML_ASSERT(src3->nb[0] == sizeof(float));
15071
+ GGML_ASSERT(src4->nb[0] == sizeof(float));
15072
+ GGML_ASSERT(src5->nb[0] == sizeof(float));
15073
+ // required for the dot product between s and C, and when copying the states
15074
+ GGML_ASSERT(src0->nb[1] == src0->ne[0]*sizeof(float));
15075
+ // required for per-sequence offsets for states
15076
+ GGML_ASSERT(src0->nb[2] == src0->ne[0]*src0->ne[1]*sizeof(float));
15077
+ // required to get correct offset for state destination (i.e. src1->nb[2])
15078
+ GGML_ASSERT(src1->nb[2] == src1->ne[0]*src1->ne[1]*sizeof(float));
15079
+
15080
+ // rows per thread
15081
+ const int dr = (nr + nth - 1)/nth;
15082
+
15083
+ // row range for this thread
15084
+ const int ir0 = dr*ith;
15085
+ const int ir1 = MIN(ir0 + dr, nr);
15086
+ const int ir = ir1 - ir0;
15087
+
15088
+ if (n_kv > 1) {
15089
+ // it's hard to know if the source states have already been copied
15090
+ // when there are multiple, so copy them already.
15091
+ for (int i3 = 0; i3 < n_kv; ++i3) {
15092
+ float * s0 = (float *) ((char *) src0->data + ir0*(src0->nb[1]) + i3*(src0->nb[2]));
15093
+ float * s = (float *) ((char *) dst->data + ir0*(src0->nb[1]) + i3*(src0->nb[2]) + src1->nb[2]);
15094
+ memcpy(s, s0, nc*ir*sizeof(float));
15095
+ }
15096
+ }
15097
+
15098
+ for (int i2 = 0; i2 < n_t; ++i2) {
15099
+ int32_t * sq = (int32_t *) ((char *) src6->data + i2*(src6->nb[1])); // {n_kv, n_tokens}
15100
+ float * y = (float *) ((char *) dst->data + ir0*(src1->nb[0]) + i2*(src1->nb[1])); // {d_inner, n_tokens}
15101
+ float * s = (float *) ((char *) dst->data + ir0*(src0->nb[1]) + sq[0]*(src0->nb[2]) + src1->nb[2]); // {d_state, d_inner, n_kv}
15102
+ float * s0;
15103
+ float * x = (float *) ((char *) src1->data + ir0*(src1->nb[0]) + i2*(src1->nb[1])); // {d_inner, n_tokens}
15104
+ float * dt = (float *) ((char *) src2->data + ir0*(src2->nb[0]) + i2*(src2->nb[1])); // {d_inner, n_tokens}
15105
+ float * A = (float *) ((char *) src3->data + ir0*(src3->nb[1])); // {d_state, d_inner}
15106
+ float * B = (float *) ((char *) src4->data + i2*(src4->nb[1])); // {d_state, n_tokens}
15107
+ float * C = (float *) ((char *) src5->data + i2*(src5->nb[1])); // {d_state, n_tokens}
15108
+
15109
+ GGML_ASSERT(0 <= sq[0] && sq[0] < n_kv);
15110
+
15111
+ // avoid needing to copy the state for the first token
15112
+ if (i2 == 0) {
15113
+ s0 = (float *) ((char *) src0->data + ir0*(src0->nb[1]) + sq[0]*(src0->nb[2])); // {d_state, d_inner, n_kv}
15114
+ } else {
15115
+ // otherwise the source is the same as the destination
15116
+ s0 = s;
15117
+ }
15118
+
15119
+ // d_inner
15120
+ for (int i1 = 0; i1 < ir; ++i1) {
15121
+ // ref: https://github.com/state-spaces/mamba/blob/34076d664838588a3c97727b263478ab9f621a07/mamba_ssm/ops/triton/selective_state_update.py#L78
15122
+ float dt_soft_plus = dt[i1] <= 20.0f ? log1pf(expf(dt[i1])) : dt[i1];
15123
+ float x_dt = x[i1] * dt_soft_plus;
15124
+ float sumf = 0.0f;
15125
+ // d_state
15126
+ for (int i0 = 0; i0 < nc; ++i0) {
15127
+ int i = i0 + i1*nc;
15128
+ // state = prev_state * dA + dB * x
15129
+ float state = (s0[i] * expf(dt_soft_plus * A[i])) + (B[i0] * x_dt);
15130
+ // y = rowwise_dotprod(state, C)
15131
+ sumf += state * C[i0];
15132
+ s[i] = state;
15133
+ }
15134
+ y[i1] = sumf;
15135
+ }
15136
+
15137
+ // handle copies when there are multiple output states
15138
+ for (int i3 = 1; i3 < n_kv; ++i3) {
15139
+ int32_t seq = sq[i3];
15140
+ if (0 <= seq && seq < n_kv) {
15141
+ float * s1 = s + (seq - sq[0])*nc*nr;
15142
+ memcpy(s1, s, nc*ir*sizeof(float));
15143
+ } else {
15144
+ // stop at negative or too big seq_ids
15145
+ break;
15146
+ }
15147
+ }
15148
+ }
15149
+ }
15150
+
15151
+ static void ggml_compute_forward_ssm_scan(
15152
+ const struct ggml_compute_params * params,
15153
+ struct ggml_tensor * dst) {
15154
+ switch (dst->src[0]->type) {
15155
+ case GGML_TYPE_F32:
15156
+ {
15157
+ ggml_compute_forward_ssm_scan_f32(params, dst);
15158
+ } break;
15159
+ default:
15160
+ {
15161
+ GGML_ASSERT(false);
15162
+ } break;
15163
+ }
15164
+ }
15165
+
14593
15166
  // ggml_compute_forward_win_part
14594
15167
 
14595
15168
  static void ggml_compute_forward_win_part_f32(
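The ssm_scan kernel above implements the selective state-space recurrence used by Mamba-style models: per channel, dt is passed through a softplus, the state is decayed by exp(dt * A) and driven by B * x * dt, and the output is the dot product of the updated state with C. A scalar reference sketch of a single step, mirroring the per-element math in ggml_compute_forward_ssm_scan_f32 (the values in main are hypothetical):

    #include <math.h>
    #include <stdio.h>

    // one selective-scan step for a single inner channel, d_state entries wide
    static float ssm_step(float * state, const float * A, const float * B,
                          const float * C, float x, float dt, int d_state) {
        const float dt_sp = dt <= 20.0f ? log1pf(expf(dt)) : dt;  // softplus(dt), clamped like the kernel
        const float x_dt  = x * dt_sp;
        float y = 0.0f;
        for (int i = 0; i < d_state; ++i) {
            state[i] = state[i] * expf(dt_sp * A[i]) + B[i] * x_dt;  // state = prev_state*dA + dB*x
            y += state[i] * C[i];                                    // y = rowwise dot(state, C)
        }
        return y;
    }

    int main(void) {
        float state[4] = {0};
        const float A[4] = {-1.0f, -0.5f, -0.25f, -0.125f};
        const float B[4] = { 0.1f,  0.2f,  0.3f,   0.4f};
        const float C[4] = { 1.0f,  1.0f,  1.0f,   1.0f};
        printf("y = %f\n", ssm_step(state, A, B, C, /*x=*/ 1.0f, /*dt=*/ 0.5f, 4));
        return 0;
    }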
@@ -15615,6 +16188,14 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
15615
16188
  {
15616
16189
  ggml_compute_forward_pad(params, tensor);
15617
16190
  } break;
16191
+ case GGML_OP_ARANGE:
16192
+ {
16193
+ ggml_compute_forward_arange(params, tensor);
16194
+ } break;
16195
+ case GGML_OP_TIMESTEP_EMBEDDING:
16196
+ {
16197
+ ggml_compute_forward_timestep_embedding(params, tensor);
16198
+ } break;
15618
16199
  case GGML_OP_ARGSORT:
15619
16200
  {
15620
16201
  ggml_compute_forward_argsort(params, tensor);
@@ -15641,6 +16222,14 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
15641
16222
  bool masked = t != 0;
15642
16223
  ggml_compute_forward_flash_attn_back(params, masked, tensor);
15643
16224
  } break;
16225
+ case GGML_OP_SSM_CONV:
16226
+ {
16227
+ ggml_compute_forward_ssm_conv(params, tensor);
16228
+ } break;
16229
+ case GGML_OP_SSM_SCAN:
16230
+ {
16231
+ ggml_compute_forward_ssm_scan(params, tensor);
16232
+ } break;
15644
16233
  case GGML_OP_WIN_PART:
15645
16234
  {
15646
16235
  ggml_compute_forward_win_part(params, tensor);
@@ -16617,6 +17206,14 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
16617
17206
  {
16618
17207
  GGML_ASSERT(false); // TODO: not implemented
16619
17208
  } break;
17209
+ case GGML_OP_ARANGE:
17210
+ {
17211
+ GGML_ASSERT(false); // TODO: not implemented
17212
+ } break;
17213
+ case GGML_OP_TIMESTEP_EMBEDDING:
17214
+ {
17215
+ GGML_ASSERT(false); // TODO: not implemented
17216
+ } break;
16620
17217
  case GGML_OP_ARGSORT:
16621
17218
  {
16622
17219
  GGML_ASSERT(false); // TODO: not implemented
@@ -16687,6 +17284,11 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
16687
17284
  {
16688
17285
  GGML_ASSERT(false); // not supported
16689
17286
  } break;
17287
+ case GGML_OP_SSM_CONV:
17288
+ case GGML_OP_SSM_SCAN:
17289
+ {
17290
+ GGML_ASSERT(false); // TODO: not implemented
17291
+ } break;
16690
17292
  case GGML_OP_WIN_PART:
16691
17293
  case GGML_OP_WIN_UNPART:
16692
17294
  case GGML_OP_UNARY:
@@ -17217,6 +17819,7 @@ struct ggml_compute_state {
17217
17819
  ggml_thread_t thrd;
17218
17820
  int ith;
17219
17821
  struct ggml_compute_state_shared * shared;
17822
+ enum ggml_status ec;
17220
17823
  };
17221
17824
 
17222
17825
  static void ggml_graph_compute_perf_stats_node(struct ggml_tensor * node, const struct ggml_compute_state_shared * st) {
@@ -17228,7 +17831,7 @@ static void ggml_graph_compute_perf_stats_node(struct ggml_tensor * node, const
17228
17831
  node->perf_time_us += time_us_cur;
17229
17832
  }
17230
17833
 
17231
- static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
17834
+ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads, int n_cur_threads) {
17232
17835
  int n_tasks = 0;
17233
17836
 
17234
17837
  switch (node->op) {
@@ -17309,6 +17912,12 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
17309
17912
  {
17310
17913
  n_tasks = n_threads;
17311
17914
  } break;
17915
+ case GGML_OP_GET_ROWS:
17916
+ {
17917
+ // FIXME: the cost of launching additional threads decreases performance with GPU offloading
17918
+ //n_tasks = MIN(n_threads, ggml_nelements(node->src[1]));
17919
+ n_tasks = MIN(n_cur_threads, ggml_nelements(node->src[1]));
17920
+ } break;
17312
17921
  case GGML_OP_SCALE:
17313
17922
  case GGML_OP_SET:
17314
17923
  case GGML_OP_CONT:
@@ -17316,7 +17925,6 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
17316
17925
  case GGML_OP_VIEW:
17317
17926
  case GGML_OP_PERMUTE:
17318
17927
  case GGML_OP_TRANSPOSE:
17319
- case GGML_OP_GET_ROWS:
17320
17928
  case GGML_OP_GET_ROWS_BACK:
17321
17929
  case GGML_OP_DIAG:
17322
17930
  {
@@ -17368,6 +17976,14 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
17368
17976
  {
17369
17977
  n_tasks = n_threads;
17370
17978
  } break;
17979
+ case GGML_OP_ARANGE:
17980
+ {
17981
+ n_tasks = n_threads;
17982
+ } break;
17983
+ case GGML_OP_TIMESTEP_EMBEDDING:
17984
+ {
17985
+ n_tasks = n_threads;
17986
+ } break;
17371
17987
  case GGML_OP_ARGSORT:
17372
17988
  {
17373
17989
  n_tasks = n_threads;
@@ -17384,6 +18000,11 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
17384
18000
  {
17385
18001
  n_tasks = n_threads;
17386
18002
  } break;
18003
+ case GGML_OP_SSM_CONV:
18004
+ case GGML_OP_SSM_SCAN:
18005
+ {
18006
+ n_tasks = n_threads;
18007
+ } break;
17387
18008
  case GGML_OP_WIN_PART:
17388
18009
  case GGML_OP_WIN_UNPART:
17389
18010
  case GGML_OP_GET_REL_POS:
@@ -17502,7 +18123,8 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
17502
18123
  while (true) {
17503
18124
  if (cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) {
17504
18125
  state->shared->node_n += 1;
17505
- return (thread_ret_t) GGML_EXIT_ABORTED;
18126
+ state->ec = GGML_STATUS_ABORTED;
18127
+ return 0;
17506
18128
  }
17507
18129
 
17508
18130
  if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) {
@@ -17520,7 +18142,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
17520
18142
  /* FINALIZE */
17521
18143
  struct ggml_tensor * node = cgraph->nodes[node_n];
17522
18144
  if (GGML_OP_HAS_FINALIZE[node->op]) {
17523
- params.nth = ggml_get_n_tasks(node, n_threads);
18145
+ params.nth = ggml_get_n_tasks(node, n_threads, state->shared->n_threads);
17524
18146
  ggml_compute_forward(&params, node);
17525
18147
  }
17526
18148
  ggml_graph_compute_perf_stats_node(node, state->shared);
@@ -17530,7 +18152,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
17530
18152
  while (++node_n < cgraph->n_nodes) {
17531
18153
  GGML_PRINT_DEBUG_5("%s: %d/%d\n", __func__, node_n, cgraph->n_nodes);
17532
18154
  struct ggml_tensor * node = cgraph->nodes[node_n];
17533
- const int n_tasks = ggml_get_n_tasks(node, n_threads);
18155
+ const int n_tasks = ggml_get_n_tasks(node, n_threads, state->shared->n_threads);
17534
18156
 
17535
18157
  state->shared->perf_node_start_cycles = ggml_perf_cycles();
17536
18158
  state->shared->perf_node_start_time_us = ggml_perf_time_us();
@@ -17578,7 +18200,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
17578
18200
 
17579
18201
  /* INIT & COMPUTE */
17580
18202
  struct ggml_tensor * node = cgraph->nodes[node_n];
17581
- const int n_tasks = ggml_get_n_tasks(node, n_threads);
18203
+ const int n_tasks = ggml_get_n_tasks(node, n_threads, state->shared->n_threads);
17582
18204
 
17583
18205
  struct ggml_compute_params params = {
17584
18206
  /*.type =*/ GGML_TASK_TYPE_INIT,
@@ -17624,7 +18246,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
17624
18246
  }
17625
18247
  }
17626
18248
 
17627
- return GGML_EXIT_SUCCESS;
18249
+ return 0;
17628
18250
  }
17629
18251
 
17630
18252
  struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threads) {
@@ -17643,7 +18265,7 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
17643
18265
  for (int i = 0; i < cgraph->n_nodes; i++) {
17644
18266
  struct ggml_tensor * node = cgraph->nodes[i];
17645
18267
 
17646
- const int n_tasks = ggml_get_n_tasks(node, n_threads);
18268
+ const int n_tasks = ggml_get_n_tasks(node, n_threads, 1);
17647
18269
 
17648
18270
  max_tasks = MAX(max_tasks, n_tasks);
17649
18271
 
@@ -17820,7 +18442,7 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
17820
18442
  return cplan;
17821
18443
  }
17822
18444
 
17823
- int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
18445
+ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
17824
18446
  {
17825
18447
  GGML_ASSERT(cplan);
17826
18448
  GGML_ASSERT(cplan->n_threads > 0);
@@ -17864,6 +18486,7 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
17864
18486
  .thrd = 0,
17865
18487
  .ith = j,
17866
18488
  .shared = &state_shared,
18489
+ .ec = GGML_STATUS_SUCCESS,
17867
18490
  };
17868
18491
 
17869
18492
  const int rc = ggml_thread_create(&workers[j].thrd, NULL, ggml_graph_compute_thread, &workers[j]);
@@ -17874,12 +18497,14 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
17874
18497
 
17875
18498
  workers[0].ith = 0;
17876
18499
  workers[0].shared = &state_shared;
18500
+ workers[0].ec = GGML_STATUS_SUCCESS;
17877
18501
 
17878
18502
  const int64_t perf_start_cycles = ggml_perf_cycles();
17879
18503
  const int64_t perf_start_time_us = ggml_perf_time_us();
17880
18504
 
17881
18505
  // this is a work thread too
17882
- int compute_status = (size_t) ggml_graph_compute_thread(&workers[0]);
18506
+ ggml_graph_compute_thread(&workers[0]);
18507
+ enum ggml_status compute_status = workers[0].ec;
17883
18508
 
17884
18509
  // don't leave affinity set on the main thread
17885
18510
  clear_numa_thread_affinity();
@@ -17889,6 +18514,8 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
17889
18514
  for (int j = 1; j < n_threads; j++) {
17890
18515
  const int rc = ggml_thread_join(workers[j].thrd, NULL);
17891
18516
  GGML_ASSERT(rc == 0);
18517
+ if (workers[j].ec != GGML_STATUS_SUCCESS)
18518
+ compute_status = workers[j].ec;
17892
18519
  }
17893
18520
  }
17894
18521
 
@@ -17916,14 +18543,14 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
17916
18543
  return compute_status;
17917
18544
  }
17918
18545
 
17919
- void ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads) {
18546
+ enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads) {
17920
18547
  struct ggml_cplan cplan = ggml_graph_plan(cgraph, n_threads);
17921
18548
 
17922
18549
  struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_TYPE_WORK_BUFFER, cplan.work_size);
17923
18550
 
17924
18551
  cplan.work_data = (uint8_t *)ctx->mem_buffer + obj->offs;
17925
18552
 
17926
- ggml_graph_compute(cgraph, &cplan);
18553
+ return ggml_graph_compute(cgraph, &cplan);
17927
18554
  }
17928
18555
 
17929
18556
  struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name) {
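With ggml_graph_compute now returning enum ggml_status instead of an int exit code, callers that drive the graph with an explicit plan can branch on the enum directly. A sketch under those assumptions; run_graph is a hypothetical wrapper, and gf is assumed to come from the usual graph-building code:

    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include "ggml.h"

    enum ggml_status run_graph(struct ggml_cgraph * gf, int n_threads) {
        struct ggml_cplan cplan = ggml_graph_plan(gf, n_threads);

        uint8_t * work = NULL;
        if (cplan.work_size > 0) {
            work = malloc(cplan.work_size);  // the plan only reports the size; the caller owns the buffer
            cplan.work_data = work;
        }

        const enum ggml_status st = ggml_graph_compute(gf, &cplan);
        if (st != GGML_STATUS_SUCCESS) {
            fprintf(stderr, "graph compute: %s\n", ggml_status_to_string(st));
        }

        free(work);
        return st;
    }

Callers that previously compared the return value against GGML_EXIT_SUCCESS / GGML_EXIT_ABORTED should switch to the GGML_STATUS_* values; an abort requested through cplan->abort_callback now surfaces as GGML_STATUS_ABORTED.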
@@ -19572,133 +20199,6 @@ void ggml_quantize_free(void) {
19572
20199
  ggml_critical_section_end();
19573
20200
  }
19574
20201
 
19575
- size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist) {
19576
- assert(k % QK4_0 == 0);
19577
- const int nb = k / QK4_0;
19578
-
19579
- for (int b = 0; b < n; b += k) {
19580
- block_q4_0 * restrict y = (block_q4_0 *) dst + b/QK4_0;
19581
-
19582
- quantize_row_q4_0_reference(src + b, y, k);
19583
-
19584
- for (int i = 0; i < nb; i++) {
19585
- for (int j = 0; j < QK4_0; j += 2) {
19586
- const uint8_t vi0 = y[i].qs[j/2] & 0x0F;
19587
- const uint8_t vi1 = y[i].qs[j/2] >> 4;
19588
-
19589
- hist[vi0]++;
19590
- hist[vi1]++;
19591
- }
19592
- }
19593
- }
19594
-
19595
- return (n/QK4_0*sizeof(block_q4_0));
19596
- }
19597
-
19598
- size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist) {
19599
- assert(k % QK4_1 == 0);
19600
- const int nb = k / QK4_1;
19601
-
19602
- for (int b = 0; b < n; b += k) {
19603
- block_q4_1 * restrict y = (block_q4_1 *) dst + b/QK4_1;
19604
-
19605
- quantize_row_q4_1_reference(src + b, y, k);
19606
-
19607
- for (int i = 0; i < nb; i++) {
19608
- for (int j = 0; j < QK4_1; j += 2) {
19609
- const uint8_t vi0 = y[i].qs[j/2] & 0x0F;
19610
- const uint8_t vi1 = y[i].qs[j/2] >> 4;
19611
-
19612
- hist[vi0]++;
19613
- hist[vi1]++;
19614
- }
19615
- }
19616
- }
19617
-
19618
- return (n/QK4_1*sizeof(block_q4_1));
19619
- }
19620
-
19621
- size_t ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t * hist) {
19622
- assert(k % QK5_0 == 0);
19623
- const int nb = k / QK5_0;
19624
-
19625
- for (int b = 0; b < n; b += k) {
19626
- block_q5_0 * restrict y = (block_q5_0 *)dst + b/QK5_0;
19627
-
19628
- quantize_row_q5_0_reference(src + b, y, k);
19629
-
19630
- for (int i = 0; i < nb; i++) {
19631
- uint32_t qh;
19632
- memcpy(&qh, &y[i].qh, sizeof(qh));
19633
-
19634
- for (int j = 0; j < QK5_0; j += 2) {
19635
- const uint8_t vh0 = ((qh & (1u << (j/2 + 0 ))) >> (j/2 + 0 )) << 4;
19636
- const uint8_t vh1 = ((qh & (1u << (j/2 + 16))) >> (j/2 + 12));
19637
-
19638
- // cast to 16 bins
19639
- const uint8_t vi0 = ((y[i].qs[j/2] & 0x0F) | vh0) / 2;
19640
- const uint8_t vi1 = ((y[i].qs[j/2] >> 4) | vh1) / 2;
19641
-
19642
- hist[vi0]++;
19643
- hist[vi1]++;
19644
- }
19645
- }
19646
- }
19647
-
19648
- return (n/QK5_0*sizeof(block_q5_0));
19649
- }
19650
-
19651
- size_t ggml_quantize_q5_1(const float * src, void * dst, int n, int k, int64_t * hist) {
19652
- assert(k % QK5_1 == 0);
19653
- const int nb = k / QK5_1;
19654
-
19655
- for (int b = 0; b < n; b += k) {
19656
- block_q5_1 * restrict y = (block_q5_1 *)dst + b/QK5_1;
19657
-
19658
- quantize_row_q5_1_reference(src + b, y, k);
19659
-
19660
- for (int i = 0; i < nb; i++) {
19661
- uint32_t qh;
19662
- memcpy(&qh, &y[i].qh, sizeof(qh));
19663
-
19664
- for (int j = 0; j < QK5_1; j += 2) {
19665
- const uint8_t vh0 = ((qh & (1u << (j/2 + 0 ))) >> (j/2 + 0 )) << 4;
19666
- const uint8_t vh1 = ((qh & (1u << (j/2 + 16))) >> (j/2 + 12));
19667
-
19668
- // cast to 16 bins
19669
- const uint8_t vi0 = ((y[i].qs[j/2] & 0x0F) | vh0) / 2;
19670
- const uint8_t vi1 = ((y[i].qs[j/2] >> 4) | vh1) / 2;
19671
-
19672
- hist[vi0]++;
19673
- hist[vi1]++;
19674
- }
19675
- }
19676
- }
19677
-
19678
- return (n/QK5_1*sizeof(block_q5_1));
19679
- }
19680
-
19681
- size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t * hist) {
19682
- assert(k % QK8_0 == 0);
19683
- const int nb = k / QK8_0;
19684
-
19685
- for (int b = 0; b < n; b += k) {
19686
- block_q8_0 * restrict y = (block_q8_0 *)dst + b/QK8_0;
19687
-
19688
- quantize_row_q8_0_reference(src + b, y, k);
19689
-
19690
- for (int i = 0; i < nb; i++) {
19691
- for (int j = 0; j < QK8_0; ++j) {
19692
- const int8_t vi = y[i].qs[j];
19693
-
19694
- hist[vi/16 + 8]++;
19695
- }
19696
- }
19697
- }
19698
-
19699
- return (n/QK8_0*sizeof(block_q8_0));
19700
- }
19701
-
19702
20202
  bool ggml_quantize_requires_imatrix(enum ggml_type type) {
19703
20203
  return
19704
20204
  type == GGML_TYPE_IQ2_XXS ||
@@ -19706,177 +20206,52 @@ bool ggml_quantize_requires_imatrix(enum ggml_type type) {
19706
20206
  type == GGML_TYPE_IQ1_S;
19707
20207
  }
19708
20208
 
19709
- size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start,
19710
- int nrows, int n_per_row, int64_t * hist, const float * imatrix) {
20209
+ size_t ggml_quantize_chunk(
20210
+ enum ggml_type type,
20211
+ const float * src,
20212
+ void * dst,
20213
+ int start,
20214
+ int nrows,
20215
+ int n_per_row,
20216
+ const float * imatrix) {
20217
+ const int n = nrows * n_per_row;
20218
+
20219
+ if (ggml_quantize_requires_imatrix(type)) {
20220
+ GGML_ASSERT(imatrix != NULL);
20221
+ }
20222
+
20223
+ GGML_ASSERT(start % type_traits[type].blck_size == 0);
20224
+ GGML_ASSERT(start % n_per_row == 0);
20225
+
19711
20226
  ggml_quantize_init(type); // this is noop if already initialized
20227
+
20228
+ const size_t start_row = start / n_per_row;
20229
+ const size_t row_size = ggml_row_size(type, n_per_row);
20230
+
19712
20231
  size_t result = 0;
19713
- int n = nrows * n_per_row;
20232
+
19714
20233
  switch (type) {
19715
- case GGML_TYPE_Q4_0:
19716
- {
19717
- GGML_ASSERT(start % QK4_0 == 0);
19718
- GGML_ASSERT(start % n_per_row == 0);
19719
- size_t start_row = start / n_per_row;
19720
- size_t row_size = ggml_row_size(type, n_per_row);
19721
- result = quantize_q4_0(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
19722
- GGML_ASSERT(result == row_size * nrows);
19723
- } break;
19724
- case GGML_TYPE_Q4_1:
19725
- {
19726
- GGML_ASSERT(start % QK4_1 == 0);
19727
- GGML_ASSERT(start % n_per_row == 0);
19728
- size_t start_row = start / n_per_row;
19729
- size_t row_size = ggml_row_size(type, n_per_row);
19730
- result = quantize_q4_1(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
19731
- GGML_ASSERT(result == row_size * nrows);
19732
- } break;
19733
- case GGML_TYPE_Q5_0:
19734
- {
19735
- GGML_ASSERT(start % QK5_0 == 0);
19736
- GGML_ASSERT(start % n_per_row == 0);
19737
- size_t start_row = start / n_per_row;
19738
- size_t row_size = ggml_row_size(type, n_per_row);
19739
- result = quantize_q5_0(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
19740
- GGML_ASSERT(result == row_size * nrows);
19741
- } break;
19742
- case GGML_TYPE_Q5_1:
19743
- {
19744
- GGML_ASSERT(start % QK5_1 == 0);
19745
- GGML_ASSERT(start % n_per_row == 0);
19746
- size_t start_row = start / n_per_row;
19747
- size_t row_size = ggml_row_size(type, n_per_row);
19748
- result = quantize_q5_1(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
19749
- GGML_ASSERT(result == row_size * nrows);
19750
- } break;
19751
- case GGML_TYPE_Q8_0:
19752
- {
19753
- GGML_ASSERT(start % QK8_0 == 0);
19754
- block_q8_0 * block = (block_q8_0*)dst + start / QK8_0;
19755
- result = ggml_quantize_q8_0(src + start, block, n, n, hist);
19756
- } break;
19757
- case GGML_TYPE_Q2_K:
19758
- {
19759
- GGML_ASSERT(start % QK_K == 0);
19760
- GGML_ASSERT(start % n_per_row == 0);
19761
- size_t start_row = start / n_per_row;
19762
- size_t row_size = ggml_row_size(type, n_per_row);
19763
- result = quantize_q2_K(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
19764
- GGML_ASSERT(result == row_size * nrows);
19765
- } break;
19766
- case GGML_TYPE_Q3_K:
19767
- {
19768
- GGML_ASSERT(start % QK_K == 0);
19769
- GGML_ASSERT(start % n_per_row == 0);
19770
- size_t start_row = start / n_per_row;
19771
- size_t row_size = ggml_row_size(type, n_per_row);
19772
- result = quantize_q3_K(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
19773
- GGML_ASSERT(result == row_size * nrows);
19774
- } break;
19775
- case GGML_TYPE_Q4_K:
19776
- {
19777
- GGML_ASSERT(start % QK_K == 0);
19778
- GGML_ASSERT(start % n_per_row == 0);
19779
- size_t start_row = start / n_per_row;
19780
- size_t row_size = ggml_row_size(type, n_per_row);
19781
- result = quantize_q4_K(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
19782
- GGML_ASSERT(result == row_size * nrows);
19783
- } break;
19784
- case GGML_TYPE_Q5_K:
19785
- {
19786
- GGML_ASSERT(start % QK_K == 0);
19787
- GGML_ASSERT(start % n_per_row == 0);
19788
- size_t start_row = start / n_per_row;
19789
- size_t row_size = ggml_row_size(type, n_per_row);
19790
- result = quantize_q5_K(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
19791
- GGML_ASSERT(result == row_size * nrows);
19792
- } break;
19793
- case GGML_TYPE_Q6_K:
19794
- {
19795
- GGML_ASSERT(start % QK_K == 0);
19796
- GGML_ASSERT(start % n_per_row == 0);
19797
- size_t start_row = start / n_per_row;
19798
- size_t row_size = ggml_row_size(type, n_per_row);
19799
- result = quantize_q6_K(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
19800
- GGML_ASSERT(result == row_size * nrows);
19801
- } break;
19802
- case GGML_TYPE_IQ2_XXS:
19803
- {
19804
- GGML_ASSERT(start % QK_K == 0);
19805
- GGML_ASSERT(start % n_per_row == 0);
19806
- GGML_ASSERT(imatrix);
19807
- size_t start_row = start / n_per_row;
19808
- size_t row_size = ggml_row_size(type, n_per_row);
19809
- result = quantize_iq2_xxs(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
19810
- GGML_ASSERT(result == row_size * nrows);
19811
- } break;
19812
- case GGML_TYPE_IQ2_XS:
19813
- {
19814
- GGML_ASSERT(start % QK_K == 0);
19815
- GGML_ASSERT(start % n_per_row == 0);
19816
- GGML_ASSERT(imatrix);
19817
- size_t start_row = start / n_per_row;
19818
- size_t row_size = ggml_row_size(type, n_per_row);
19819
- result = quantize_iq2_xs(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
19820
- GGML_ASSERT(result == row_size * nrows);
19821
- } break;
19822
- case GGML_TYPE_IQ3_XXS:
19823
- {
19824
- GGML_ASSERT(start % QK_K == 0);
19825
- GGML_ASSERT(start % n_per_row == 0);
19826
- size_t start_row = start / n_per_row;
19827
- size_t row_size = ggml_row_size(type, n_per_row);
19828
- result = quantize_iq3_xxs(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
19829
- GGML_ASSERT(result == row_size * nrows);
19830
- } break;
19831
- case GGML_TYPE_IQ3_S:
19832
- {
19833
- GGML_ASSERT(start % QK_K == 0);
19834
- GGML_ASSERT(start % n_per_row == 0);
19835
- size_t start_row = start / n_per_row;
19836
- size_t row_size = ggml_row_size(type, n_per_row);
19837
- result = quantize_iq3_s(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
19838
- GGML_ASSERT(result == row_size * nrows);
19839
- } break;
19840
- case GGML_TYPE_IQ2_S:
19841
- {
19842
- GGML_ASSERT(start % QK_K == 0);
19843
- GGML_ASSERT(start % n_per_row == 0);
19844
- size_t start_row = start / n_per_row;
19845
- size_t row_size = ggml_row_size(type, n_per_row);
19846
- result = quantize_iq2_s(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
19847
- GGML_ASSERT(result == row_size * nrows);
19848
- } break;
19849
- case GGML_TYPE_IQ1_S:
19850
- {
19851
- GGML_ASSERT(start % QK_K == 0);
19852
- GGML_ASSERT(start % n_per_row == 0);
19853
- size_t start_row = start / n_per_row;
19854
- size_t row_size = ggml_row_size(type, n_per_row);
19855
- result = quantize_iq1_s(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
19856
- GGML_ASSERT(result == row_size * nrows);
19857
- } break;
19858
- case GGML_TYPE_IQ4_NL:
20234
+ case GGML_TYPE_Q4_0: result = quantize_q4_0(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
20235
+ case GGML_TYPE_Q4_1: result = quantize_q4_1(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
20236
+ case GGML_TYPE_Q5_0: result = quantize_q5_0(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
20237
+ case GGML_TYPE_Q5_1: result = quantize_q5_1(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
20238
+ case GGML_TYPE_Q8_0: result = quantize_q8_0(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
20239
+ case GGML_TYPE_Q2_K: result = quantize_q2_K(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
20240
+ case GGML_TYPE_Q3_K: result = quantize_q3_K(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
20241
+ case GGML_TYPE_Q4_K: result = quantize_q4_K(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
20242
+ case GGML_TYPE_Q5_K: result = quantize_q5_K(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
20243
+ case GGML_TYPE_Q6_K: result = quantize_q6_K(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
20244
+ case GGML_TYPE_IQ2_XXS: result = quantize_iq2_xxs(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
20245
+ case GGML_TYPE_IQ2_XS: result = quantize_iq2_xs (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
20246
+ case GGML_TYPE_IQ3_XXS: result = quantize_iq3_xxs(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
20247
+ case GGML_TYPE_IQ3_S: result = quantize_iq3_s (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
20248
+ case GGML_TYPE_IQ2_S: result = quantize_iq2_s (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
20249
+ case GGML_TYPE_IQ1_S: result = quantize_iq1_s (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
20250
+ case GGML_TYPE_IQ4_NL: result = quantize_iq4_nl (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
19859
20251
  #if QK_K == 64
19860
- case GGML_TYPE_IQ4_XS:
19861
- #endif
19862
- {
19863
- GGML_ASSERT(start % QK4_NL == 0);
19864
- GGML_ASSERT(start % n_per_row == 0);
19865
- size_t start_row = start / n_per_row;
19866
- size_t row_size = ggml_row_size(type, n_per_row);
19867
- result = quantize_iq4_nl(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
19868
- GGML_ASSERT(result == row_size * nrows);
19869
- } break;
19870
- #if QK_K != 64
19871
- case GGML_TYPE_IQ4_XS:
19872
- {
19873
- GGML_ASSERT(start % QK_K == 0);
19874
- GGML_ASSERT(start % n_per_row == 0);
19875
- size_t start_row = start / n_per_row;
19876
- size_t row_size = ggml_row_size(type, n_per_row);
19877
- result = quantize_iq4_xs(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
19878
- GGML_ASSERT(result == row_size * nrows);
19879
- } break;
20252
+ case GGML_TYPE_IQ4_XS: result = quantize_iq4_nl (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
20253
+ #else
20254
+ case GGML_TYPE_IQ4_XS: result = quantize_iq4_xs (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
19880
20255
  #endif
19881
20256
  case GGML_TYPE_F16:
19882
20257
  {
@@ -19893,6 +20268,9 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i
19893
20268
  default:
19894
20269
  assert(false);
19895
20270
  }
20271
+
20272
+ GGML_ASSERT(result == nrows * row_size);
20273
+
19896
20274
  return result;
19897
20275
  }
19898
20276
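The old per-type ggml_quantize_q*_* entry points and the histogram argument are gone; ggml_quantize_chunk now covers every quantized type with one signature. A minimal usage sketch with hypothetical sizes; Q8_0 does not require an importance matrix, so imatrix can be NULL:

    #include <stdlib.h>
    #include "ggml.h"

    int main(void) {
        const int nrows     = 4;
        const int n_per_row = 256;  // must be a multiple of the type's block size

        float * src = calloc((size_t) nrows * n_per_row, sizeof(float));

        const size_t row_size = ggml_row_size(GGML_TYPE_Q8_0, n_per_row);
        void * dst = malloc(row_size * nrows);

        const size_t written = ggml_quantize_chunk(GGML_TYPE_Q8_0, src, dst,
                                                   /*start=*/ 0, nrows, n_per_row,
                                                   /*imatrix=*/ NULL);
        // per the assertion added above, written == row_size * nrows
        (void) written;

        free(dst);
        free(src);
        return 0;
    }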