llama_cpp 0.12.0 → 0.12.1

@@ -70,7 +70,7 @@ static_assert(sizeof(block_q8_1) == 2*sizeof(float) + QK8_1, "wrong q8_1 block s
  // 2-bit quantization
  // weight is represented as x = a * q + b
  // 16 blocks of 16 elements each
- // Effectively 2.5625 bits per weight
+ // Effectively 2.625 bits per weight
  typedef struct {
  uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
  uint8_t qs[QK_K/4]; // quants
@@ -165,6 +165,22 @@ typedef struct {
  } block_q8_K;
  static_assert(sizeof(block_q8_K) == sizeof(float) + QK_K + QK_K/16*sizeof(int16_t), "wrong q8_K block size/padding");

+ // (Almost) "true" 2-bit quantization.
+ // Due to the need to use blocks as per ggml design, it ends up using
+ // 2.0625 bpw because of the 16-bit scale for each block of 256.
+ typedef struct {
+ ggml_fp16_t d;
+ uint16_t qs[QK_K/8];
+ } block_iq2_xxs;
+ static_assert(sizeof(block_iq2_xxs) == sizeof(ggml_fp16_t) + QK_K/8*sizeof(uint16_t), "wrong iq2_xxs block size/padding");
+
+ // 2.3125 bpw quants
+ typedef struct {
+ ggml_fp16_t d;
+ uint16_t qs[QK_K/8];
+ uint8_t scales[QK_K/32];
+ } block_iq2_xs;
+ static_assert(sizeof(block_iq2_xs) == sizeof(ggml_fp16_t) + QK_K/8*sizeof(uint16_t) + QK_K/32, "wrong iq2_xs block size/padding");

  // Quantization
  void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict y, int k);
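The bits-per-weight figures quoted in the comments above follow directly from these struct layouts. A quick standalone check (a sketch, assuming the default super-block size QK_K == 256):

#include <stdio.h>

int main(void) {
    const int QK_K = 256;
    // block_iq2_xxs: fp16 scale (2 bytes) + QK_K/8 uint16 quant words
    const int iq2_xxs_bytes = 2 + (QK_K/8)*2;              // 66 bytes per 256 weights
    // block_iq2_xs: fp16 scale + QK_K/8 uint16 quant words + QK_K/32 uint8 scales
    const int iq2_xs_bytes  = 2 + (QK_K/8)*2 + QK_K/32;    // 74 bytes per 256 weights
    printf("iq2_xxs: %.4f bpw\n", 8.0*iq2_xxs_bytes/QK_K); // prints 2.0625
    printf("iq2_xs:  %.4f bpw\n", 8.0*iq2_xs_bytes/QK_K);  // prints 2.3125
    return 0;
}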
@@ -180,6 +196,8 @@ void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict
  void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict y, int k);
  void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict y, int k);
  void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict y, int k);
+ void quantize_row_iq2_xxs_reference(const float * restrict x, block_iq2_xxs * restrict y, int k);
+ void quantize_row_iq2_xs_reference (const float * restrict x, block_iq2_xs * restrict y, int k);

  void quantize_row_q4_0(const float * restrict x, void * restrict y, int k);
  void quantize_row_q4_1(const float * restrict x, void * restrict y, int k);
@@ -194,6 +212,8 @@ void quantize_row_q4_K(const float * restrict x, void * restrict y, int k);
  void quantize_row_q5_K(const float * restrict x, void * restrict y, int k);
  void quantize_row_q6_K(const float * restrict x, void * restrict y, int k);
  void quantize_row_q8_K(const float * restrict x, void * restrict y, int k);
+ void quantize_row_iq2_xxs(const float * restrict x, void * restrict y, int k);
+ void quantize_row_iq2_xs (const float * restrict x, void * restrict y, int k);

  // Dequantization
  void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict y, int k);
@@ -209,6 +229,8 @@ void dequantize_row_q4_K(const block_q4_K * restrict x, float * restrict y, int
  void dequantize_row_q5_K(const block_q5_K * restrict x, float * restrict y, int k);
  void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y, int k);
  void dequantize_row_q8_K(const block_q8_K * restrict x, float * restrict y, int k);
+ void dequantize_row_iq2_xxs(const block_iq2_xxs * restrict x, float * restrict y, int k);
+ void dequantize_row_iq2_xs (const block_iq2_xs * restrict x, float * restrict y, int k);

  // Dot product
  void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
@@ -222,3 +244,5 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, const void * restrict vx,
  void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
  void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
  void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
+ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
+ void ggml_vec_dot_iq2_xs_q8_K (int n, float * restrict s, const void * restrict vx, const void * restrict vy);
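Like the other K-family types, the new iq2 formats dot against q8_K activations. A minimal usage sketch of the declarations above (hedged: it assumes the relevant ggml headers are included, that n is a multiple of QK_K, that wx already points to a row of block_iq2_xxs data loaded from a quantized model, and that ggml_row_size() is used to size the scratch buffer):

#include <stdlib.h>

float dot_iq2_xxs_row(const void * wx, const float * activations, int n) {
    // pack the float activations into q8_K blocks (the vec_dot_type used by iq2_xxs)
    void * vy = malloc(ggml_row_size(GGML_TYPE_Q8_K, n));
    quantize_row_q8_K(activations, vy, n);

    // mixed-format dot product: iq2_xxs weight row x q8_K activation row
    float s = 0.0f;
    ggml_vec_dot_iq2_xxs_q8_K(n, &s, wx, vy);

    free(vy);
    return s;
}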
@@ -132,7 +132,7 @@ void ggml_print_backtrace(void) {
  "-ex", "bt -frame-info source-and-location",
  "-ex", "detach",
  "-ex", "quit",
- NULL);
+ (char *) NULL);
  } else {
  waitpid(pid, NULL, 0);
  }
@@ -394,6 +394,12 @@ static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float);
  static void ggml_vec_dot_f32(const int n, float * restrict s, const float * restrict x, const float * restrict y);
  static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t * restrict x, ggml_fp16_t * restrict y);

+ ggml_collect_imatrix_t g_imatrix_collect = NULL;
+
+ void ggml_set_imatrix_collection(ggml_collect_imatrix_t imatrix_collect) {
+ g_imatrix_collect = imatrix_collect;
+ }
+
  static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
  [GGML_TYPE_I8] = {
  .type_name = "i8",
@@ -573,6 +579,28 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
  .vec_dot = ggml_vec_dot_q6_K_q8_K,
  .vec_dot_type = GGML_TYPE_Q8_K,
  },
+ [GGML_TYPE_IQ2_XXS] = {
+ .type_name = "iq2_xxs",
+ .blck_size = QK_K,
+ .type_size = sizeof(block_iq2_xxs),
+ .is_quantized = true,
+ .to_float = (ggml_to_float_t) dequantize_row_iq2_xxs,
+ .from_float = quantize_row_iq2_xxs,
+ .from_float_reference = (ggml_from_float_t) quantize_row_iq2_xxs_reference,
+ .vec_dot = ggml_vec_dot_iq2_xxs_q8_K,
+ .vec_dot_type = GGML_TYPE_Q8_K,
+ },
+ [GGML_TYPE_IQ2_XS] = {
+ .type_name = "iq2_xs",
+ .blck_size = QK_K,
+ .type_size = sizeof(block_iq2_xs),
+ .is_quantized = true,
+ .to_float = (ggml_to_float_t) dequantize_row_iq2_xs,
+ .from_float = quantize_row_iq2_xs,
+ .from_float_reference = (ggml_from_float_t) quantize_row_iq2_xs_reference,
+ .vec_dot = ggml_vec_dot_iq2_xs_q8_K,
+ .vec_dot_type = GGML_TYPE_Q8_K,
+ },
  [GGML_TYPE_Q8_K] = {
  .type_name = "q8_K",
  .blck_size = QK_K,
@@ -2111,6 +2139,8 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) {
  case GGML_FTYPE_MOSTLY_Q4_K: wtype = GGML_TYPE_Q4_K; break;
  case GGML_FTYPE_MOSTLY_Q5_K: wtype = GGML_TYPE_Q5_K; break;
  case GGML_FTYPE_MOSTLY_Q6_K: wtype = GGML_TYPE_Q6_K; break;
+ case GGML_FTYPE_MOSTLY_IQ2_XXS: wtype = GGML_TYPE_IQ2_XXS; break;
+ case GGML_FTYPE_MOSTLY_IQ2_XS: wtype = GGML_TYPE_IQ2_XS; break;
  case GGML_FTYPE_UNKNOWN: wtype = GGML_TYPE_COUNT; break;
  case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16: wtype = GGML_TYPE_COUNT; break;
  }
@@ -4299,13 +4329,13 @@ struct ggml_tensor * ggml_set_2d_inplace(
  static struct ggml_tensor * ggml_cpy_impl(
  struct ggml_context * ctx,
  struct ggml_tensor * a,
- struct ggml_tensor * b,
- bool inplace) {
+ struct ggml_tensor * b) {
  GGML_ASSERT(ggml_nelements(a) == ggml_nelements(b));

  bool is_node = false;

- if (!inplace && (a->grad || b->grad)) {
+ if (a->grad || b->grad) {
+ // inplace is false and either one has a grad
  is_node = true;
  }

@@ -4329,29 +4359,21 @@ struct ggml_tensor * ggml_cpy(
  struct ggml_context * ctx,
  struct ggml_tensor * a,
  struct ggml_tensor * b) {
- return ggml_cpy_impl(ctx, a, b, false);
- }
-
- struct ggml_tensor * ggml_cpy_inplace(
- struct ggml_context * ctx,
- struct ggml_tensor * a,
- struct ggml_tensor * b) {
- return ggml_cpy_impl(ctx, a, b, true);
+ return ggml_cpy_impl(ctx, a, b);
  }

  // ggml_cont

  static struct ggml_tensor * ggml_cont_impl(
  struct ggml_context * ctx,
- struct ggml_tensor * a,
- bool inplace) {
+ struct ggml_tensor * a) {
  bool is_node = false;

- if (!inplace && a->grad) {
+ if (a->grad) {
  is_node = true;
  }

- struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
+ struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
  ggml_format_name(result, "%s (cont)", a->name);

  result->op = GGML_OP_CONT;
@@ -4364,13 +4386,7 @@ static struct ggml_tensor * ggml_cont_impl(
  struct ggml_tensor * ggml_cont(
  struct ggml_context * ctx,
  struct ggml_tensor * a) {
- return ggml_cont_impl(ctx, a, false);
- }
-
- struct ggml_tensor * ggml_cont_inplace(
- struct ggml_context * ctx,
- struct ggml_tensor * a) {
- return ggml_cont_impl(ctx, a, true);
+ return ggml_cont_impl(ctx, a);
  }

  // make contiguous, with new shape
@@ -7436,6 +7452,8 @@ static void ggml_compute_forward_add(
  case GGML_TYPE_Q4_K:
  case GGML_TYPE_Q5_K:
  case GGML_TYPE_Q6_K:
+ case GGML_TYPE_IQ2_XXS:
+ case GGML_TYPE_IQ2_XS:
  {
  ggml_compute_forward_add_q_f32(params, src0, src1, dst);
  } break;
@@ -7700,6 +7718,8 @@ static void ggml_compute_forward_add1(
  case GGML_TYPE_Q4_K:
  case GGML_TYPE_Q5_K:
  case GGML_TYPE_Q6_K:
+ case GGML_TYPE_IQ2_XXS:
+ case GGML_TYPE_IQ2_XS:
  {
  ggml_compute_forward_add1_q_f32(params, src0, src1, dst);
  } break;
@@ -7814,6 +7834,8 @@ static void ggml_compute_forward_acc(
  case GGML_TYPE_Q4_K:
  case GGML_TYPE_Q5_K:
  case GGML_TYPE_Q6_K:
+ case GGML_TYPE_IQ2_XXS:
+ case GGML_TYPE_IQ2_XS:
  default:
  {
  GGML_ASSERT(false);
@@ -9704,10 +9726,10 @@ static void ggml_compute_forward_group_norm(
  #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
  // helper function to determine if it is better to use BLAS or not
  // for large matrices, BLAS is faster
- static bool ggml_compute_forward_mul_mat_use_blas(
- const struct ggml_tensor * src0,
- const struct ggml_tensor * src1,
- struct ggml_tensor * dst) {
+ static bool ggml_compute_forward_mul_mat_use_blas(struct ggml_tensor * dst) {
+ const struct ggml_tensor * src0 = dst->src[0];
+ const struct ggml_tensor * src1 = dst->src[1];
+
  //const int64_t ne00 = src0->ne[0];
  //const int64_t ne01 = src0->ne[1];

@@ -9747,6 +9769,10 @@ static void ggml_compute_forward_mul_mat(
  const int ith = params->ith;
  const int nth = params->nth;

+ if (ith == 1 && g_imatrix_collect) {
+ g_imatrix_collect(src0, src1);
+ }
+
  const enum ggml_type type = src0->type;

  const bool src1_cont = ggml_is_contiguous(src1);
@@ -9787,7 +9813,7 @@
  #endif

  #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
- if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
+ if (ggml_compute_forward_mul_mat_use_blas(dst)) {
  if (params->ith != 0) {
  return;
  }
@@ -10050,6 +10076,10 @@ static void ggml_compute_forward_mul_mat_id(

  const struct ggml_tensor * src0_cur = dst->src[cur_a + 2];

+ if (ith == 1 && g_imatrix_collect) {
+ g_imatrix_collect(src0_cur, src1);
+ }
+
  const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
  const size_t row_size = ggml_row_size(vec_dot_type, ne10);

@@ -10455,6 +10485,8 @@ static void ggml_compute_forward_out_prod(
  case GGML_TYPE_Q4_K:
  case GGML_TYPE_Q5_K:
  case GGML_TYPE_Q6_K:
+ case GGML_TYPE_IQ2_XXS:
+ case GGML_TYPE_IQ2_XS:
  {
  ggml_compute_forward_out_prod_q_f32(params, src0, src1, dst);
  } break;
@@ -10629,6 +10661,8 @@ static void ggml_compute_forward_set(
  case GGML_TYPE_Q4_K:
  case GGML_TYPE_Q5_K:
  case GGML_TYPE_Q6_K:
+ case GGML_TYPE_IQ2_XXS:
+ case GGML_TYPE_IQ2_XS:
  default:
  {
  GGML_ASSERT(false);
@@ -10823,6 +10857,8 @@ static void ggml_compute_forward_get_rows(
  case GGML_TYPE_Q4_K:
  case GGML_TYPE_Q5_K:
  case GGML_TYPE_Q6_K:
+ case GGML_TYPE_IQ2_XXS:
+ case GGML_TYPE_IQ2_XS:
  {
  ggml_compute_forward_get_rows_q(params, src0, src1, dst);
  } break;
@@ -11459,6 +11495,8 @@ static void ggml_compute_forward_alibi(
  case GGML_TYPE_Q4_K:
  case GGML_TYPE_Q5_K:
  case GGML_TYPE_Q6_K:
+ case GGML_TYPE_IQ2_XXS:
+ case GGML_TYPE_IQ2_XS:
  case GGML_TYPE_Q8_K:
  case GGML_TYPE_I8:
  case GGML_TYPE_I16:
@@ -11533,6 +11571,8 @@ static void ggml_compute_forward_clamp(
  case GGML_TYPE_Q4_K:
  case GGML_TYPE_Q5_K:
  case GGML_TYPE_Q6_K:
+ case GGML_TYPE_IQ2_XXS:
+ case GGML_TYPE_IQ2_XS:
  case GGML_TYPE_Q8_K:
  case GGML_TYPE_I8:
  case GGML_TYPE_I16:
@@ -16301,24 +16341,6 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {

  //n_tasks = MIN(n_threads, MAX(1, nr0/128));
  //printf("nr0 = %8d, nr1 = %8d, nr0*nr1 = %8d, n_tasks%d\n", nr0, nr1, nr0*nr1, n_tasks);
-
- #if defined(GGML_USE_CUBLAS)
- if (ggml_cuda_can_mul_mat(node->src[0], node->src[1], node)) {
- n_tasks = 1; // TODO: this actually is doing nothing
- // the threads are still spinning
- }
- #elif defined(GGML_USE_CLBLAST)
- if (ggml_cl_can_mul_mat(node->src[0], node->src[1], node)) {
- n_tasks = 1; // TODO: this actually is doing nothing
- // the threads are still spinning
- }
- #endif
- #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
- if (ggml_compute_forward_mul_mat_use_blas(node->src[0], node->src[1], node)) {
- n_tasks = 1; // TODO: this actually is doing nothing
- // the threads are still spinning
- }
- #endif
  } break;
  case GGML_OP_MUL_MAT_ID:
  {
@@ -16491,6 +16513,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
  state->shared->node_n += 1;
  return (thread_ret_t) GGML_EXIT_ABORTED;
  }
+
  if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) {
  // all other threads are finished and spinning
  // do finalize and init here so we don't have synchronize again
@@ -16556,14 +16579,18 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
  } else {
  // wait for other threads to finish
  const int last = node_n;
+
+ const bool do_yield = last < 0 || cgraph->nodes[last]->op == GGML_OP_MUL_MAT;
+
  while (true) {
  // TODO: this sched_yield can have significant impact on the performance - either positive or negative
  // depending on the workload and the operating system.
  // since it is not clear what is the best approach, it should potentially become user-configurable
  // ref: https://github.com/ggerganov/ggml/issues/291
- #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
- sched_yield();
- #endif
+ // UPD: adding the do_yield flag seems to resolve the issue universally
+ if (do_yield) {
+ sched_yield();
+ }

  node_n = atomic_load(&state->shared->node_n);
  if (node_n != last) break;
@@ -16642,7 +16669,7 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
  } else
  #endif
  #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
- if (ggml_compute_forward_mul_mat_use_blas(node->src[0], node->src[1], node)) {
+ if (ggml_compute_forward_mul_mat_use_blas(node)) {
  if (node->src[0]->type != GGML_TYPE_F32) {
  // here we need memory just for single 2D matrix from src0
  cur = ggml_type_size(GGML_TYPE_F32)*(node->src[0]->ne[0]*node->src[0]->ne[1]);
@@ -18661,6 +18688,18 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i
  block_q6_K * block = (block_q6_K*)dst + start / QK_K;
  result = ggml_quantize_q6_K(src + start, block, n, n, hist);
  } break;
+ case GGML_TYPE_IQ2_XXS:
+ {
+ GGML_ASSERT(start % QK_K == 0);
+ block_iq2_xxs * block = (block_iq2_xxs*)dst + start / QK_K;
+ result = ggml_quantize_iq2_xxs(src + start, block, n, n, hist);
+ } break;
+ case GGML_TYPE_IQ2_XS:
+ {
+ GGML_ASSERT(start % QK_K == 0);
+ block_iq2_xs * block = (block_iq2_xs*)dst + start / QK_K;
+ result = ggml_quantize_iq2_xs(src + start, block, n, n, hist);
+ } break;
  case GGML_TYPE_F16:
  {
  int elemsize = sizeof(ggml_fp16_t);
@@ -19016,8 +19055,8 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
  (int64_t) info->ne[3];

  if (ne % ggml_blck_size(info->type) != 0) {
- fprintf(stderr, "%s: tensor '%s' number of elements (%" PRId64 ") is not a multiple of block size (%d)\n",
- __func__, info->name.data, ne, ggml_blck_size(info->type));
+ fprintf(stderr, "%s: tensor '%s' of type %d (%s) number of elements (%" PRId64 ") is not a multiple of block size (%d)\n",
+ __func__, info->name.data, (int)info->type, ggml_type_name(info->type), ne, ggml_blck_size(info->type));
  fclose(file);
  gguf_free(ctx);
  return NULL;
@@ -218,7 +218,9 @@
  #define GGML_MAX_PARAMS 2048
  #define GGML_MAX_CONTEXTS 64
  #define GGML_MAX_SRC 10
+ #ifndef GGML_MAX_NAME
  #define GGML_MAX_NAME 64
+ #endif
  #define GGML_MAX_OP_PARAMS 64
  #define GGML_DEFAULT_N_THREADS 4
  #define GGML_DEFAULT_GRAPH_SIZE 2048
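The new #ifndef guard makes the tensor-name limit overridable at build time. A hedged example of how a consumer might raise it; since GGML_MAX_NAME sizes a field of struct ggml_tensor, the same value has to be used when compiling ggml.c itself (e.g. via -DGGML_MAX_NAME=128):

// define before the header is included, or pass -DGGML_MAX_NAME=128 to the compiler
#define GGML_MAX_NAME 128
#include "ggml.h"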
@@ -339,6 +341,8 @@ extern "C" {
  GGML_TYPE_Q5_K = 13,
  GGML_TYPE_Q6_K = 14,
  GGML_TYPE_Q8_K = 15,
+ GGML_TYPE_IQ2_XXS = 16,
+ GGML_TYPE_IQ2_XS = 17,
  GGML_TYPE_I8,
  GGML_TYPE_I16,
  GGML_TYPE_I32,
@@ -373,6 +377,8 @@ extern "C" {
  GGML_FTYPE_MOSTLY_Q4_K = 12, // except 1d tensors
  GGML_FTYPE_MOSTLY_Q5_K = 13, // except 1d tensors
  GGML_FTYPE_MOSTLY_Q6_K = 14, // except 1d tensors
+ GGML_FTYPE_MOSTLY_IQ2_XXS = 15, // except 1d tensors
+ GGML_FTYPE_MOSTLY_IQ2_XS = 16, // except 1d tensors
  };

  // available tensor operations:
@@ -1159,22 +1165,11 @@ extern "C" {
  struct ggml_tensor * a,
  struct ggml_tensor * b);

- // a -> b, in-place, return view(b)
- GGML_API struct ggml_tensor * ggml_cpy_inplace(
- struct ggml_context * ctx,
- struct ggml_tensor * a,
- struct ggml_tensor * b);
-
  // make contiguous
  GGML_API struct ggml_tensor * ggml_cont(
  struct ggml_context * ctx,
  struct ggml_tensor * a);

- // make contiguous, in-place
- GGML_API struct ggml_tensor * ggml_cont_inplace(
- struct ggml_context * ctx,
- struct ggml_tensor * a);
-
  // make contiguous, with new shape
  GGML_API struct ggml_tensor * ggml_cont_1d(
  struct ggml_context * ctx,
@@ -2067,9 +2062,17 @@ extern "C" {
  GGML_API size_t ggml_quantize_q4_K(const float * src, void * dst, int n, int k, int64_t * hist);
  GGML_API size_t ggml_quantize_q5_K(const float * src, void * dst, int n, int k, int64_t * hist);
  GGML_API size_t ggml_quantize_q6_K(const float * src, void * dst, int n, int k, int64_t * hist);
+ GGML_API size_t ggml_quantize_iq2_xxs(const float * src, void * dst, int n, int k, int64_t * hist);
+ GGML_API size_t ggml_quantize_iq2_xs (const float * src, void * dst, int n, int k, int64_t * hist);

  GGML_API size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist);

+ //
+ // Importance matrix
+ //
+ typedef void(*ggml_collect_imatrix_t)(const struct ggml_tensor * src0, const struct ggml_tensor * src1);
+ GGML_API void ggml_set_imatrix_collection(ggml_collect_imatrix_t imatrix_collect);
+
  //
  // gguf
  //
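This importance-matrix hook ties the header and implementation changes above together: ggml_set_imatrix_collection() registers a callback, and the mul_mat / mul_mat_id forward passes invoke it with the weight tensor (src0) and the activations (src1). A minimal sketch of registering a collector (the callback body is hypothetical; it only logs which tensors were multiplied):

#include <stdio.h>
#include "ggml.h"

// invoked from the compute threads (worker with ith == 1) for each matrix multiplication
static void log_imatrix(const struct ggml_tensor * src0, const struct ggml_tensor * src1) {
    fprintf(stderr, "mul_mat: weights '%s' x activations '%s'\n", src0->name, src1->name);
}

// during setup, before evaluating the graph:
//     ggml_set_imatrix_collection(log_imatrix);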