llama_cpp 0.12.0 → 0.12.1

This diff shows the changes between publicly released versions of the package as they appear in their respective public registries; it is provided for informational purposes only.
@@ -70,7 +70,7 @@ static_assert(sizeof(block_q8_1) == 2*sizeof(float) + QK8_1, "wrong q8_1 block s
 // 2-bit quantization
 // weight is represented as x = a * q + b
 // 16 blocks of 16 elements each
-// Effectively 2.5625 bits per weight
+// Effectively 2.625 bits per weight
 typedef struct {
     uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
     uint8_t qs[QK_K/4];      // quants
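The corrected figure can be sanity-checked from the block layout: assuming QK_K == 256 (the default) and that block_q2_K also carries two 16-bit super-block scales (d and dmin, outside the context shown in this hunk), the block is 16 + 64 + 2 + 2 = 84 bytes per 256 weights. A minimal check under those assumptions:

    /* Sanity check for the corrected "2.625 bits per weight" comment.
       Assumes QK_K == 256 and that block_q2_K also holds two 16-bit
       super-block scales (d, dmin) in addition to the fields shown above. */
    #include <stdio.h>

    #define QK_K 256

    int main(void) {
        const unsigned block_bytes = QK_K/16   /* scales        */
                                   + QK_K/4    /* qs            */
                                   + 2 * 2;    /* d, dmin (f16) */
        printf("%.4f bpw\n", 8.0 * block_bytes / QK_K); /* prints 2.6250 */
        return 0;
    }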
@@ -165,6 +165,22 @@ typedef struct {
 } block_q8_K;
 static_assert(sizeof(block_q8_K) == sizeof(float) + QK_K + QK_K/16*sizeof(int16_t), "wrong q8_K block size/padding");
 
+// (Almost) "true" 2-bit quantization.
+// Due to the need to use blocks as per ggml design, it ends up using
+// 2.0625 bpw because of the 16-bit scale for each block of 256.
+typedef struct {
+    ggml_fp16_t d;
+    uint16_t qs[QK_K/8];
+} block_iq2_xxs;
+static_assert(sizeof(block_iq2_xxs) == sizeof(ggml_fp16_t) + QK_K/8*sizeof(uint16_t), "wrong iq2_xxs block size/padding");
+
+// 2.3125 bpw quants
+typedef struct {
+    ggml_fp16_t d;
+    uint16_t qs[QK_K/8];
+    uint8_t scales[QK_K/32];
+} block_iq2_xs;
+static_assert(sizeof(block_iq2_xs) == sizeof(ggml_fp16_t) + QK_K/8*sizeof(uint16_t) + QK_K/32, "wrong iq2_xs block size/padding");
 
 // Quantization
 void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict y, int k);
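The bits-per-weight figures in the two new comments follow directly from the asserted struct sizes. A minimal sketch of the arithmetic, assuming QK_K == 256 and sizeof(ggml_fp16_t) == 2:

    /* Derive the bpw figures quoted in the comments above.
       Assumes QK_K == 256 and sizeof(ggml_fp16_t) == 2. */
    #include <stdio.h>

    #define QK_K 256

    int main(void) {
        const unsigned xxs = 2 + QK_K/8 * 2;             /* d + qs          = 66 bytes */
        const unsigned xs  = 2 + QK_K/8 * 2 + QK_K/32;   /* d + qs + scales = 74 bytes */
        printf("iq2_xxs: %.4f bpw\n", 8.0 * xxs / QK_K); /* 2.0625 */
        printf("iq2_xs : %.4f bpw\n", 8.0 * xs  / QK_K); /* 2.3125 */
        return 0;
    }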
@@ -180,6 +196,8 @@ void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict
 void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict y, int k);
 void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict y, int k);
 void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict y, int k);
+void quantize_row_iq2_xxs_reference(const float * restrict x, block_iq2_xxs * restrict y, int k);
+void quantize_row_iq2_xs_reference (const float * restrict x, block_iq2_xs * restrict y, int k);
 
 void quantize_row_q4_0(const float * restrict x, void * restrict y, int k);
 void quantize_row_q4_1(const float * restrict x, void * restrict y, int k);
@@ -194,6 +212,8 @@ void quantize_row_q4_K(const float * restrict x, void * restrict y, int k);
 void quantize_row_q5_K(const float * restrict x, void * restrict y, int k);
 void quantize_row_q6_K(const float * restrict x, void * restrict y, int k);
 void quantize_row_q8_K(const float * restrict x, void * restrict y, int k);
+void quantize_row_iq2_xxs(const float * restrict x, void * restrict y, int k);
+void quantize_row_iq2_xs (const float * restrict x, void * restrict y, int k);
 
 // Dequantization
 void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict y, int k);
@@ -209,6 +229,8 @@ void dequantize_row_q4_K(const block_q4_K * restrict x, float * restrict y, int
 void dequantize_row_q5_K(const block_q5_K * restrict x, float * restrict y, int k);
 void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y, int k);
 void dequantize_row_q8_K(const block_q8_K * restrict x, float * restrict y, int k);
+void dequantize_row_iq2_xxs(const block_iq2_xxs * restrict x, float * restrict y, int k);
+void dequantize_row_iq2_xs (const block_iq2_xs * restrict x, float * restrict y, int k);
 
 // Dot product
 void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
@@ -222,3 +244,5 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, const void * restrict vx,
 void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
 void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
 void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
+void ggml_vec_dot_iq2_xxs_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
+void ggml_vec_dot_iq2_xs_q8_K (int n, float * restrict s, const void * restrict vx, const void * restrict vy);
@@ -132,7 +132,7 @@ void ggml_print_backtrace(void) {
             "-ex", "bt -frame-info source-and-location",
             "-ex", "detach",
             "-ex", "quit",
-            NULL);
+            (char *) NULL);
    } else {
        waitpid(pid, NULL, 0);
    }
@@ -394,6 +394,12 @@ static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float);
 static void ggml_vec_dot_f32(const int n, float * restrict s, const float * restrict x, const float * restrict y);
 static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t * restrict x, ggml_fp16_t * restrict y);
 
+ggml_collect_imatrix_t g_imatrix_collect = NULL;
+
+void ggml_set_imatrix_collection(ggml_collect_imatrix_t imatrix_collect) {
+    g_imatrix_collect = imatrix_collect;
+}
+
 static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
     [GGML_TYPE_I8] = {
         .type_name = "i8",
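The new hook is a plain global function pointer: a caller registers a callback once via ggml_set_imatrix_collection and ggml invokes it from the mat-mul paths further down in this diff. A minimal sketch of a consumer, assuming only the declarations added in this release; the callback body and function name are illustrative, not the actual imatrix code:

    /* Hypothetical consumer of the new importance-matrix hook.
       Only ggml_collect_imatrix_t and ggml_set_imatrix_collection come from
       this release; the logging inside the callback is made up. */
    #include <stdio.h>
    #include "ggml.h"

    static void my_imatrix_collect(const struct ggml_tensor * src0,
                                   const struct ggml_tensor * src1) {
        /* Invoked from the mat-mul paths on compute thread 1 only
           (see ggml_compute_forward_mul_mat below), so keep this cheap. */
        fprintf(stderr, "mul_mat: %s x %s\n", src0->name, src1->name);
    }

    void enable_imatrix_collection(void) {
        ggml_set_imatrix_collection(my_imatrix_collect);
    }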
@@ -573,6 +579,28 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .vec_dot = ggml_vec_dot_q6_K_q8_K,
         .vec_dot_type = GGML_TYPE_Q8_K,
     },
+    [GGML_TYPE_IQ2_XXS] = {
+        .type_name = "iq2_xxs",
+        .blck_size = QK_K,
+        .type_size = sizeof(block_iq2_xxs),
+        .is_quantized = true,
+        .to_float = (ggml_to_float_t) dequantize_row_iq2_xxs,
+        .from_float = quantize_row_iq2_xxs,
+        .from_float_reference = (ggml_from_float_t) quantize_row_iq2_xxs_reference,
+        .vec_dot = ggml_vec_dot_iq2_xxs_q8_K,
+        .vec_dot_type = GGML_TYPE_Q8_K,
+    },
+    [GGML_TYPE_IQ2_XS] = {
+        .type_name = "iq2_xs",
+        .blck_size = QK_K,
+        .type_size = sizeof(block_iq2_xs),
+        .is_quantized = true,
+        .to_float = (ggml_to_float_t) dequantize_row_iq2_xs,
+        .from_float = quantize_row_iq2_xs,
+        .from_float_reference = (ggml_from_float_t) quantize_row_iq2_xs_reference,
+        .vec_dot = ggml_vec_dot_iq2_xs_q8_K,
+        .vec_dot_type = GGML_TYPE_Q8_K,
+    },
     [GGML_TYPE_Q8_K] = {
         .type_name = "q8_K",
         .blck_size = QK_K,
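The blck_size and type_size fields in these entries are what ggml_row_size (used later in this diff) turns into bytes, so the two structs asserted earlier fully determine storage cost. A rough footprint check, assuming QK_K == 256 and the asserted block sizes of 66 and 74 bytes:

    /* Rough per-row footprint for the two new types, using the block sizes
       asserted in the header hunk above (66 and 74 bytes) and QK_K == 256. */
    #include <stdio.h>

    int main(void) {
        const int ne0 = 4096, qk = 256;      /* example row width, assumed */
        printf("iq2_xxs row: %d bytes\n", ne0 / qk * 66); /* 16 blocks * 66 = 1056 */
        printf("iq2_xs  row: %d bytes\n", ne0 / qk * 74); /* 16 blocks * 74 = 1184 */
        return 0;
    }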
@@ -2111,6 +2139,8 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) {
         case GGML_FTYPE_MOSTLY_Q4_K: wtype = GGML_TYPE_Q4_K; break;
         case GGML_FTYPE_MOSTLY_Q5_K: wtype = GGML_TYPE_Q5_K; break;
         case GGML_FTYPE_MOSTLY_Q6_K: wtype = GGML_TYPE_Q6_K; break;
+        case GGML_FTYPE_MOSTLY_IQ2_XXS: wtype = GGML_TYPE_IQ2_XXS; break;
+        case GGML_FTYPE_MOSTLY_IQ2_XS: wtype = GGML_TYPE_IQ2_XS; break;
         case GGML_FTYPE_UNKNOWN: wtype = GGML_TYPE_COUNT; break;
         case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16: wtype = GGML_TYPE_COUNT; break;
     }
@@ -4299,13 +4329,13 @@ struct ggml_tensor * ggml_set_2d_inplace(
 static struct ggml_tensor * ggml_cpy_impl(
         struct ggml_context * ctx,
         struct ggml_tensor * a,
-        struct ggml_tensor * b,
-        bool inplace) {
+        struct ggml_tensor * b) {
     GGML_ASSERT(ggml_nelements(a) == ggml_nelements(b));
 
     bool is_node = false;
 
-    if (!inplace && (a->grad || b->grad)) {
+    if (a->grad || b->grad) {
+        // inplace is false and either one have a grad
         is_node = true;
     }
 
@@ -4329,29 +4359,21 @@ struct ggml_tensor * ggml_cpy(
         struct ggml_context * ctx,
         struct ggml_tensor * a,
         struct ggml_tensor * b) {
-    return ggml_cpy_impl(ctx, a, b, false);
-}
-
-struct ggml_tensor * ggml_cpy_inplace(
-        struct ggml_context * ctx,
-        struct ggml_tensor * a,
-        struct ggml_tensor * b) {
-    return ggml_cpy_impl(ctx, a, b, true);
+    return ggml_cpy_impl(ctx, a, b);
 }
 
 // ggml_cont
 
 static struct ggml_tensor * ggml_cont_impl(
         struct ggml_context * ctx,
-        struct ggml_tensor * a,
-        bool inplace) {
+        struct ggml_tensor * a) {
     bool is_node = false;
 
-    if (!inplace && a->grad) {
+    if (a->grad) {
         is_node = true;
     }
 
-    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
+    struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
     ggml_format_name(result, "%s (cont)", a->name);
 
     result->op = GGML_OP_CONT;
@@ -4364,13 +4386,7 @@ static struct ggml_tensor * ggml_cont_impl(
 struct ggml_tensor * ggml_cont(
         struct ggml_context * ctx,
         struct ggml_tensor * a) {
-    return ggml_cont_impl(ctx, a, false);
-}
-
-struct ggml_tensor * ggml_cont_inplace(
-        struct ggml_context * ctx,
-        struct ggml_tensor * a) {
-    return ggml_cont_impl(ctx, a, true);
+    return ggml_cont_impl(ctx, a);
 }
 
 // make contiguous, with new shape
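The in-place variants disappear both here and from the public header near the end of this diff. In ggml_cpy_impl the inplace flag only suppressed is_node; in ggml_cont_impl it also returned a view instead of a duplicated tensor, so ggml_cont now always allocates a fresh result. A hedged migration sketch for old call sites:

    /* Migration sketch for callers of the removed in-place variants.
       Based on the hunks above: ggml_cpy builds the same op, merely keeping
       gradient tracking enabled; ggml_cont now always duplicates the tensor
       instead of returning a view. */
    #include "ggml.h"

    struct ggml_tensor * copy_then_cont(struct ggml_context * ctx,
                                        struct ggml_tensor * a,
                                        struct ggml_tensor * b) {
        struct ggml_tensor * copied = ggml_cpy(ctx, a, b); /* was: ggml_cpy_inplace(ctx, a, b) */
        return ggml_cont(ctx, copied);                      /* was: ggml_cont_inplace(ctx, copied) */
    }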
@@ -7436,6 +7452,8 @@ static void ggml_compute_forward_add(
         case GGML_TYPE_Q4_K:
         case GGML_TYPE_Q5_K:
         case GGML_TYPE_Q6_K:
+        case GGML_TYPE_IQ2_XXS:
+        case GGML_TYPE_IQ2_XS:
             {
                 ggml_compute_forward_add_q_f32(params, src0, src1, dst);
             } break;
@@ -7700,6 +7718,8 @@ static void ggml_compute_forward_add1(
         case GGML_TYPE_Q4_K:
         case GGML_TYPE_Q5_K:
         case GGML_TYPE_Q6_K:
+        case GGML_TYPE_IQ2_XXS:
+        case GGML_TYPE_IQ2_XS:
             {
                 ggml_compute_forward_add1_q_f32(params, src0, src1, dst);
             } break;
@@ -7814,6 +7834,8 @@ static void ggml_compute_forward_acc(
         case GGML_TYPE_Q4_K:
         case GGML_TYPE_Q5_K:
         case GGML_TYPE_Q6_K:
+        case GGML_TYPE_IQ2_XXS:
+        case GGML_TYPE_IQ2_XS:
         default:
             {
                 GGML_ASSERT(false);
@@ -9704,10 +9726,10 @@ static void ggml_compute_forward_group_norm(
 #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
 // helper function to determine if it is better to use BLAS or not
 // for large matrices, BLAS is faster
-static bool ggml_compute_forward_mul_mat_use_blas(
-        const struct ggml_tensor * src0,
-        const struct ggml_tensor * src1,
-        struct ggml_tensor * dst) {
+static bool ggml_compute_forward_mul_mat_use_blas(struct ggml_tensor * dst) {
+    const struct ggml_tensor * src0 = dst->src[0];
+    const struct ggml_tensor * src1 = dst->src[1];
+
     //const int64_t ne00 = src0->ne[0];
     //const int64_t ne01 = src0->ne[1];
 
@@ -9747,6 +9769,10 @@ static void ggml_compute_forward_mul_mat(
     const int ith = params->ith;
     const int nth = params->nth;
 
+    if (ith == 1 && g_imatrix_collect) {
+        g_imatrix_collect(src0, src1);
+    }
+
     const enum ggml_type type = src0->type;
 
     const bool src1_cont = ggml_is_contiguous(src1);
@@ -9787,7 +9813,7 @@
 #endif
 
 #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
-    if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
+    if (ggml_compute_forward_mul_mat_use_blas(dst)) {
         if (params->ith != 0) {
             return;
         }
@@ -10050,6 +10076,10 @@ static void ggml_compute_forward_mul_mat_id(
 
     const struct ggml_tensor * src0_cur = dst->src[cur_a + 2];
 
+    if (ith == 1 && g_imatrix_collect) {
+        g_imatrix_collect(src0_cur, src1);
+    }
+
     const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
     const size_t row_size = ggml_row_size(vec_dot_type, ne10);
 
@@ -10455,6 +10485,8 @@ static void ggml_compute_forward_out_prod(
         case GGML_TYPE_Q4_K:
         case GGML_TYPE_Q5_K:
         case GGML_TYPE_Q6_K:
+        case GGML_TYPE_IQ2_XXS:
+        case GGML_TYPE_IQ2_XS:
             {
                 ggml_compute_forward_out_prod_q_f32(params, src0, src1, dst);
             } break;
@@ -10629,6 +10661,8 @@ static void ggml_compute_forward_set(
         case GGML_TYPE_Q4_K:
         case GGML_TYPE_Q5_K:
         case GGML_TYPE_Q6_K:
+        case GGML_TYPE_IQ2_XXS:
+        case GGML_TYPE_IQ2_XS:
         default:
             {
                 GGML_ASSERT(false);
@@ -10823,6 +10857,8 @@ static void ggml_compute_forward_get_rows(
         case GGML_TYPE_Q4_K:
         case GGML_TYPE_Q5_K:
         case GGML_TYPE_Q6_K:
+        case GGML_TYPE_IQ2_XXS:
+        case GGML_TYPE_IQ2_XS:
             {
                 ggml_compute_forward_get_rows_q(params, src0, src1, dst);
             } break;
@@ -11459,6 +11495,8 @@ static void ggml_compute_forward_alibi(
         case GGML_TYPE_Q4_K:
         case GGML_TYPE_Q5_K:
         case GGML_TYPE_Q6_K:
+        case GGML_TYPE_IQ2_XXS:
+        case GGML_TYPE_IQ2_XS:
         case GGML_TYPE_Q8_K:
         case GGML_TYPE_I8:
         case GGML_TYPE_I16:
@@ -11533,6 +11571,8 @@ static void ggml_compute_forward_clamp(
         case GGML_TYPE_Q4_K:
         case GGML_TYPE_Q5_K:
         case GGML_TYPE_Q6_K:
+        case GGML_TYPE_IQ2_XXS:
+        case GGML_TYPE_IQ2_XS:
         case GGML_TYPE_Q8_K:
         case GGML_TYPE_I8:
         case GGML_TYPE_I16:
@@ -16301,24 +16341,6 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
 
                 //n_tasks = MIN(n_threads, MAX(1, nr0/128));
                 //printf("nr0 = %8d, nr1 = %8d, nr0*nr1 = %8d, n_tasks%d\n", nr0, nr1, nr0*nr1, n_tasks);
-
-#if defined(GGML_USE_CUBLAS)
-                if (ggml_cuda_can_mul_mat(node->src[0], node->src[1], node)) {
-                    n_tasks = 1; // TODO: this actually is doing nothing
-                                 //       the threads are still spinning
-                }
-#elif defined(GGML_USE_CLBLAST)
-                if (ggml_cl_can_mul_mat(node->src[0], node->src[1], node)) {
-                    n_tasks = 1; // TODO: this actually is doing nothing
-                                 //       the threads are still spinning
-                }
-#endif
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
-                if (ggml_compute_forward_mul_mat_use_blas(node->src[0], node->src[1], node)) {
-                    n_tasks = 1; // TODO: this actually is doing nothing
-                                 //       the threads are still spinning
-                }
-#endif
             } break;
         case GGML_OP_MUL_MAT_ID:
             {
@@ -16491,6 +16513,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
             state->shared->node_n += 1;
             return (thread_ret_t) GGML_EXIT_ABORTED;
         }
+
         if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) {
             // all other threads are finished and spinning
             // do finalize and init here so we don't have synchronize again
@@ -16556,14 +16579,18 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
         } else {
             // wait for other threads to finish
             const int last = node_n;
+
+            const bool do_yield = last < 0 || cgraph->nodes[last]->op == GGML_OP_MUL_MAT;
+
             while (true) {
                 // TODO: this sched_yield can have significant impact on the performance - either positive or negative
                 //       depending on the workload and the operating system.
                 //       since it is not clear what is the best approach, it should potentially become user-configurable
                 //       ref: https://github.com/ggerganov/ggml/issues/291
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
-                sched_yield();
-#endif
+                // UPD: adding the do_yield flag seems to resolve the issue universally
+                if (do_yield) {
+                    sched_yield();
+                }
 
                 node_n = atomic_load(&state->shared->node_n);
                 if (node_n != last) break;
@@ -16642,7 +16669,7 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
                 } else
 #endif
 #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
-                if (ggml_compute_forward_mul_mat_use_blas(node->src[0], node->src[1], node)) {
+                if (ggml_compute_forward_mul_mat_use_blas(node)) {
                     if (node->src[0]->type != GGML_TYPE_F32) {
                         // here we need memory just for single 2D matrix from src0
                         cur = ggml_type_size(GGML_TYPE_F32)*(node->src[0]->ne[0]*node->src[0]->ne[1]);
@@ -18661,6 +18688,18 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i
                 block_q6_K * block = (block_q6_K*)dst + start / QK_K;
                 result = ggml_quantize_q6_K(src + start, block, n, n, hist);
             } break;
+        case GGML_TYPE_IQ2_XXS:
+            {
+                GGML_ASSERT(start % QK_K == 0);
+                block_iq2_xxs * block = (block_iq2_xxs*)dst + start / QK_K;
+                result = ggml_quantize_iq2_xxs(src + start, block, n, n, hist);
+            } break;
+        case GGML_TYPE_IQ2_XS:
+            {
+                GGML_ASSERT(start % QK_K == 0);
+                block_iq2_xs * block = (block_iq2_xs*)dst + start / QK_K;
+                result = ggml_quantize_iq2_xs(src + start, block, n, n, hist);
+            } break;
         case GGML_TYPE_F16:
             {
                 int elemsize = sizeof(ggml_fp16_t);
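Both new cases follow the same chunking convention as the K-quants: start is an element offset that must land on a block boundary (hence the GGML_ASSERT), and the destination pointer is advanced by whole blocks. A small illustration of that offset arithmetic, assuming QK_K == 256 and the 66-byte block_iq2_xxs asserted earlier:

    /* Offset arithmetic used by the new ggml_quantize_chunk cases above.
       Illustrative values only: QK_K == 256, sizeof(block_iq2_xxs) == 66. */
    #include <stdio.h>

    int main(void) {
        const int qk_k = 256;
        const int start = 512;                    /* element offset, 512 % 256 == 0 */
        const int block_index = start / qk_k;     /* 2 */
        const int byte_offset = block_index * 66; /* 132 */
        printf("block %d, byte offset %d\n", block_index, byte_offset);
        return 0;
    }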
@@ -19016,8 +19055,8 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
             (int64_t) info->ne[3];
 
         if (ne % ggml_blck_size(info->type) != 0) {
-            fprintf(stderr, "%s: tensor '%s' number of elements (%" PRId64 ") is not a multiple of block size (%d)\n",
-                    __func__, info->name.data, ne, ggml_blck_size(info->type));
+            fprintf(stderr, "%s: tensor '%s' of type %d (%s) number of elements (%" PRId64 ") is not a multiple of block size (%d)\n",
+                    __func__, info->name.data, (int)info->type, ggml_type_name(info->type), ne, ggml_blck_size(info->type));
             fclose(file);
             gguf_free(ctx);
             return NULL;
@@ -218,7 +218,9 @@
 #define GGML_MAX_PARAMS 2048
 #define GGML_MAX_CONTEXTS 64
 #define GGML_MAX_SRC 10
+#ifndef GGML_MAX_NAME
 #define GGML_MAX_NAME 64
+#endif
 #define GGML_MAX_OP_PARAMS 64
 #define GGML_DEFAULT_N_THREADS 4
 #define GGML_DEFAULT_GRAPH_SIZE 2048
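Wrapping GGML_MAX_NAME in #ifndef lets an embedding project raise the tensor-name limit at build time without patching the header. A minimal example; the value 128 is only an illustration:

    /* Hypothetical build-time override enabled by the new #ifndef guard:
       define GGML_MAX_NAME before including ggml.h, or pass -DGGML_MAX_NAME=128
       on the compiler command line. */
    #define GGML_MAX_NAME 128
    #include "ggml.h"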
@@ -339,6 +341,8 @@ extern "C" {
     GGML_TYPE_Q5_K = 13,
     GGML_TYPE_Q6_K = 14,
     GGML_TYPE_Q8_K = 15,
+    GGML_TYPE_IQ2_XXS = 16,
+    GGML_TYPE_IQ2_XS = 17,
     GGML_TYPE_I8,
     GGML_TYPE_I16,
     GGML_TYPE_I32,
@@ -373,6 +377,8 @@ extern "C" {
     GGML_FTYPE_MOSTLY_Q4_K = 12, // except 1d tensors
     GGML_FTYPE_MOSTLY_Q5_K = 13, // except 1d tensors
     GGML_FTYPE_MOSTLY_Q6_K = 14, // except 1d tensors
+    GGML_FTYPE_MOSTLY_IQ2_XXS = 15, // except 1d tensors
+    GGML_FTYPE_MOSTLY_IQ2_XS = 16, // except 1d tensors
 };
 
 // available tensor operations:
@@ -1159,22 +1165,11 @@ extern "C" {
             struct ggml_tensor * a,
             struct ggml_tensor * b);
 
-    // a -> b, in-place, return view(b)
-    GGML_API struct ggml_tensor * ggml_cpy_inplace(
-            struct ggml_context * ctx,
-            struct ggml_tensor * a,
-            struct ggml_tensor * b);
-
     // make contiguous
     GGML_API struct ggml_tensor * ggml_cont(
             struct ggml_context * ctx,
             struct ggml_tensor * a);
 
-    // make contiguous, in-place
-    GGML_API struct ggml_tensor * ggml_cont_inplace(
-            struct ggml_context * ctx,
-            struct ggml_tensor * a);
-
     // make contiguous, with new shape
     GGML_API struct ggml_tensor * ggml_cont_1d(
             struct ggml_context * ctx,
@@ -2067,9 +2062,17 @@ extern "C" {
     GGML_API size_t ggml_quantize_q4_K(const float * src, void * dst, int n, int k, int64_t * hist);
     GGML_API size_t ggml_quantize_q5_K(const float * src, void * dst, int n, int k, int64_t * hist);
     GGML_API size_t ggml_quantize_q6_K(const float * src, void * dst, int n, int k, int64_t * hist);
+    GGML_API size_t ggml_quantize_iq2_xxs(const float * src, void * dst, int n, int k, int64_t * hist);
+    GGML_API size_t ggml_quantize_iq2_xs (const float * src, void * dst, int n, int k, int64_t * hist);
 
     GGML_API size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist);
 
+    //
+    // Importance matrix
+    //
+    typedef void(*ggml_collect_imatrix_t)(const struct ggml_tensor * src0, const struct ggml_tensor * src1);
+    GGML_API void ggml_set_imatrix_collection(ggml_collect_imatrix_t imatrix_collect);
+
     //
     // gguf
     //