llama_cpp 0.12.1 → 0.12.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -196,8 +196,6 @@ void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict
196
196
  void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict y, int k);
197
197
  void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict y, int k);
198
198
  void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict y, int k);
199
- void quantize_row_iq2_xxs_reference(const float * restrict x, block_iq2_xxs * restrict y, int k);
200
- void quantize_row_iq2_xs_reference (const float * restrict x, block_iq2_xs * restrict y, int k);
201
199
 
202
200
  void quantize_row_q4_0(const float * restrict x, void * restrict y, int k);
203
201
  void quantize_row_q4_1(const float * restrict x, void * restrict y, int k);
@@ -212,8 +210,6 @@ void quantize_row_q4_K(const float * restrict x, void * restrict y, int k);
212
210
  void quantize_row_q5_K(const float * restrict x, void * restrict y, int k);
213
211
  void quantize_row_q6_K(const float * restrict x, void * restrict y, int k);
214
212
  void quantize_row_q8_K(const float * restrict x, void * restrict y, int k);
215
- void quantize_row_iq2_xxs(const float * restrict x, void * restrict y, int k);
216
- void quantize_row_iq2_xs (const float * restrict x, void * restrict y, int k);
217
213
 
218
214
  // Dequantization
219
215
  void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict y, int k);
@@ -246,3 +242,18 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, const void * restrict vx,
246
242
  void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
247
243
  void ggml_vec_dot_iq2_xxs_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
248
244
  void ggml_vec_dot_iq2_xs_q8_K (int n, float * restrict s, const void * restrict vx, const void * restrict vy);
245
+
246
+ //
247
+ // Quantization utilizing an importance matrix (a.k.a. "Activation aWare Quantization")
248
+ //
249
+ size_t quantize_iq2_xxs(const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
250
+ size_t quantize_iq2_xs (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
251
+ size_t quantize_q2_K (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
252
+ size_t quantize_q3_K (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
253
+ size_t quantize_q4_K (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
254
+ size_t quantize_q5_K (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
255
+ size_t quantize_q6_K (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
256
+ size_t quantize_q4_0 (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
257
+ size_t quantize_q4_1 (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
258
+ size_t quantize_q5_0 (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
259
+ size_t quantize_q5_1 (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
@@ -585,8 +585,8 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
585
585
  .type_size = sizeof(block_iq2_xxs),
586
586
  .is_quantized = true,
587
587
  .to_float = (ggml_to_float_t) dequantize_row_iq2_xxs,
588
- .from_float = quantize_row_iq2_xxs,
589
- .from_float_reference = (ggml_from_float_t) quantize_row_iq2_xxs_reference,
588
+ .from_float = NULL,
589
+ .from_float_reference = NULL,
590
590
  .vec_dot = ggml_vec_dot_iq2_xxs_q8_K,
591
591
  .vec_dot_type = GGML_TYPE_Q8_K,
592
592
  },
@@ -596,8 +596,8 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
596
596
  .type_size = sizeof(block_iq2_xs),
597
597
  .is_quantized = true,
598
598
  .to_float = (ggml_to_float_t) dequantize_row_iq2_xs,
599
- .from_float = quantize_row_iq2_xs,
600
- .from_float_reference = (ggml_from_float_t) quantize_row_iq2_xs_reference,
599
+ .from_float = NULL,
600
+ .from_float_reference = NULL,
601
601
  .vec_dot = ggml_vec_dot_iq2_xs_q8_K,
602
602
  .vec_dot_type = GGML_TYPE_Q8_K,
603
603
  },
@@ -1990,19 +1990,19 @@ void ggml_print_objects(const struct ggml_context * ctx) {
1990
1990
  GGML_PRINT("%s: --- end ---\n", __func__);
1991
1991
  }
1992
1992
 
1993
- int64_t ggml_nelements(const struct ggml_tensor * tensor) {
1993
+ GGML_CALL int64_t ggml_nelements(const struct ggml_tensor * tensor) {
1994
1994
  static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
1995
1995
 
1996
1996
  return tensor->ne[0]*tensor->ne[1]*tensor->ne[2]*tensor->ne[3];
1997
1997
  }
1998
1998
 
1999
- int64_t ggml_nrows(const struct ggml_tensor * tensor) {
1999
+ GGML_CALL int64_t ggml_nrows(const struct ggml_tensor * tensor) {
2000
2000
  static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
2001
2001
 
2002
2002
  return tensor->ne[1]*tensor->ne[2]*tensor->ne[3];
2003
2003
  }
2004
2004
 
2005
- size_t ggml_nbytes(const struct ggml_tensor * tensor) {
2005
+ GGML_CALL size_t ggml_nbytes(const struct ggml_tensor * tensor) {
2006
2006
  size_t nbytes;
2007
2007
  size_t blck_size = ggml_blck_size(tensor->type);
2008
2008
  if (blck_size == 1) {
@@ -2025,15 +2025,15 @@ size_t ggml_nbytes_pad(const struct ggml_tensor * tensor) {
2025
2025
  return GGML_PAD(ggml_nbytes(tensor), GGML_MEM_ALIGN);
2026
2026
  }
2027
2027
 
2028
- int ggml_blck_size(enum ggml_type type) {
2028
+ GGML_CALL int ggml_blck_size(enum ggml_type type) {
2029
2029
  return type_traits[type].blck_size;
2030
2030
  }
2031
2031
 
2032
- size_t ggml_type_size(enum ggml_type type) {
2032
+ GGML_CALL size_t ggml_type_size(enum ggml_type type) {
2033
2033
  return type_traits[type].type_size;
2034
2034
  }
2035
2035
 
2036
- size_t ggml_row_size(enum ggml_type type, int64_t ne) {
2036
+ GGML_CALL size_t ggml_row_size(enum ggml_type type, int64_t ne) {
2037
2037
  assert(ne % ggml_blck_size(type) == 0);
2038
2038
  return ggml_type_size(type)*ne/ggml_blck_size(type);
2039
2039
  }
@@ -2042,15 +2042,15 @@ double ggml_type_sizef(enum ggml_type type) {
2042
2042
  return ((double)(type_traits[type].type_size))/type_traits[type].blck_size;
2043
2043
  }
2044
2044
 
2045
- const char * ggml_type_name(enum ggml_type type) {
2045
+ GGML_CALL const char * ggml_type_name(enum ggml_type type) {
2046
2046
  return type_traits[type].type_name;
2047
2047
  }
2048
2048
 
2049
- bool ggml_is_quantized(enum ggml_type type) {
2049
+ GGML_CALL bool ggml_is_quantized(enum ggml_type type) {
2050
2050
  return type_traits[type].is_quantized;
2051
2051
  }
2052
2052
 
2053
- const char * ggml_op_name(enum ggml_op op) {
2053
+ GGML_CALL const char * ggml_op_name(enum ggml_op op) {
2054
2054
  return GGML_OP_NAME[op];
2055
2055
  }
2056
2056
 
@@ -2062,7 +2062,7 @@ const char * ggml_unary_op_name(enum ggml_unary_op op) {
2062
2062
  return GGML_UNARY_OP_NAME[op];
2063
2063
  }
2064
2064
 
2065
- const char * ggml_op_desc(const struct ggml_tensor * t) {
2065
+ GGML_CALL const char * ggml_op_desc(const struct ggml_tensor * t) {
2066
2066
  if (t->op == GGML_OP_UNARY) {
2067
2067
  enum ggml_unary_op uop = ggml_get_unary_op(t);
2068
2068
  return ggml_unary_op_name(uop);
@@ -2072,7 +2072,7 @@ const char * ggml_op_desc(const struct ggml_tensor * t) {
2072
2072
  }
2073
2073
  }
2074
2074
 
2075
- size_t ggml_element_size(const struct ggml_tensor * tensor) {
2075
+ GGML_CALL size_t ggml_element_size(const struct ggml_tensor * tensor) {
2076
2076
  return ggml_type_size(tensor->type);
2077
2077
  }
2078
2078
 
@@ -2154,11 +2154,11 @@ size_t ggml_tensor_overhead(void) {
2154
2154
  return GGML_OBJECT_SIZE + GGML_TENSOR_SIZE;
2155
2155
  }
2156
2156
 
2157
- bool ggml_is_transposed(const struct ggml_tensor * tensor) {
2157
+ GGML_CALL bool ggml_is_transposed(const struct ggml_tensor * tensor) {
2158
2158
  return tensor->nb[0] > tensor->nb[1];
2159
2159
  }
2160
2160
 
2161
- bool ggml_is_contiguous(const struct ggml_tensor * tensor) {
2161
+ GGML_CALL bool ggml_is_contiguous(const struct ggml_tensor * tensor) {
2162
2162
  static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
2163
2163
 
2164
2164
  return
@@ -2177,7 +2177,7 @@ static inline bool ggml_is_contiguous_except_dim_1(const struct ggml_tensor * te
2177
2177
  tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
2178
2178
  }
2179
2179
 
2180
- bool ggml_is_permuted(const struct ggml_tensor * tensor) {
2180
+ GGML_CALL bool ggml_is_permuted(const struct ggml_tensor * tensor) {
2181
2181
  static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
2182
2182
 
2183
2183
  return tensor->nb[0] > tensor->nb[1] || tensor->nb[1] > tensor->nb[2] || tensor->nb[2] > tensor->nb[3];
@@ -2354,6 +2354,10 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
2354
2354
  }
2355
2355
 
2356
2356
  void ggml_free(struct ggml_context * ctx) {
2357
+ if (ctx == NULL) {
2358
+ return;
2359
+ }
2360
+
2357
2361
  // make this function thread safe
2358
2362
  ggml_critical_section_start();
2359
2363
 
@@ -3075,7 +3079,7 @@ float * ggml_get_data_f32(const struct ggml_tensor * tensor) {
3075
3079
  return (float *)(tensor->data);
3076
3080
  }
3077
3081
 
3078
- enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor) {
3082
+ GGML_CALL enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor) {
3079
3083
  GGML_ASSERT(tensor->op == GGML_OP_UNARY);
3080
3084
  return (enum ggml_unary_op) ggml_get_op_params_i32(tensor, 0);
3081
3085
  }
@@ -4362,6 +4366,23 @@ struct ggml_tensor * ggml_cpy(
4362
4366
  return ggml_cpy_impl(ctx, a, b);
4363
4367
  }
4364
4368
 
4369
+ struct ggml_tensor * ggml_cast(
4370
+ struct ggml_context * ctx,
4371
+ struct ggml_tensor * a,
4372
+ enum ggml_type type) {
4373
+ bool is_node = false;
4374
+
4375
+ struct ggml_tensor * result = ggml_new_tensor(ctx, type, GGML_MAX_DIMS, a->ne);
4376
+ ggml_format_name(result, "%s (copy)", a->name);
4377
+
4378
+ result->op = GGML_OP_CPY;
4379
+ result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
4380
+ result->src[0] = a;
4381
+ result->src[1] = result;
4382
+
4383
+ return result;
4384
+ }
4385
+
4365
4386
  // ggml_cont
4366
4387
 
4367
4388
  static struct ggml_tensor * ggml_cont_impl(
@@ -11617,7 +11638,22 @@ static float ggml_rope_yarn_corr_dim(int n_dims, int n_orig_ctx, float n_rot, fl
11617
11638
  return n_dims * logf(n_orig_ctx / (n_rot * 2 * (float)M_PI)) / (2 * logf(base));
11618
11639
  }
11619
11640
 
11620
- void ggml_rope_yarn_corr_dims(
11641
+ static void ggml_rope_cache_init(
11642
+ float theta_base, float freq_scale, float corr_dims[2], int64_t ne0, float ext_factor, float mscale,
11643
+ float * cache, float sin_sign, float theta_scale
11644
+ ) {
11645
+ float theta = theta_base;
11646
+ for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
11647
+ rope_yarn(
11648
+ theta, freq_scale, corr_dims, i0, ext_factor, mscale, &cache[i0 + 0], &cache[i0 + 1]
11649
+ );
11650
+ cache[i0 + 1] *= sin_sign;
11651
+
11652
+ theta *= theta_scale;
11653
+ }
11654
+ }
11655
+
11656
+ GGML_CALL void ggml_rope_yarn_corr_dims(
11621
11657
  int n_dims, int n_orig_ctx, float freq_base, float beta_fast, float beta_slow, float dims[2]
11622
11658
  ) {
11623
11659
  // start and end correction dims
@@ -11699,6 +11735,12 @@ static void ggml_compute_forward_rope_f32(
11699
11735
  for (int64_t i3 = 0; i3 < ne3; i3++) {
11700
11736
  for (int64_t i2 = 0; i2 < ne2; i2++) {
11701
11737
  const int64_t p = pos[i2];
11738
+
11739
+ float * cache = (float *) params->wdata + (ne0 + CACHE_LINE_SIZE_F32)*ith;
11740
+ if (!is_glm && !is_neox) { // TODO: cache sin/cos for glm, neox
11741
+ ggml_rope_cache_init(p, freq_scale, corr_dims, ne0, ext_factor, attn_factor, cache, sin_sign, theta_scale);
11742
+ }
11743
+
11702
11744
  for (int64_t i1 = 0; i1 < ne1; i1++) {
11703
11745
  if (ir++ < ir0) continue;
11704
11746
  if (ir > ir1) break;
@@ -11732,18 +11774,13 @@ static void ggml_compute_forward_rope_f32(
11732
11774
  }
11733
11775
  } else if (!is_neox) {
11734
11776
  for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
11735
- float cos_theta, sin_theta;
11736
- rope_yarn(
11737
- theta_base, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta
11738
- );
11739
- sin_theta *= sin_sign;
11777
+ const float cos_theta = cache[i0 + 0];
11778
+ const float sin_theta = cache[i0 + 1];
11740
11779
 
11741
11780
  // zeta scaling for xPos only:
11742
11781
  float zeta = xpos_base != 0.0f ? powf((i0 + 0.4f * ne0) / (1.4f * ne0), p / xpos_base) : 1.0f;
11743
11782
  if (xpos_down) zeta = 1.0f / zeta;
11744
11783
 
11745
- theta_base *= theta_scale;
11746
-
11747
11784
  const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
11748
11785
  float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
11749
11786
 
@@ -11867,6 +11904,12 @@ static void ggml_compute_forward_rope_f16(
11867
11904
  for (int64_t i3 = 0; i3 < ne3; i3++) {
11868
11905
  for (int64_t i2 = 0; i2 < ne2; i2++) {
11869
11906
  const int64_t p = pos[i2];
11907
+
11908
+ float * cache = (float *) params->wdata + (ne0 + CACHE_LINE_SIZE_F32)*ith;
11909
+ if (!is_glm && !is_neox) { // TODO: cache sin/cos for glm, neox
11910
+ ggml_rope_cache_init(p, freq_scale, corr_dims, ne0, ext_factor, attn_factor, cache, sin_sign, theta_scale);
11911
+ }
11912
+
11870
11913
  for (int64_t i1 = 0; i1 < ne1; i1++) {
11871
11914
  if (ir++ < ir0) continue;
11872
11915
  if (ir > ir1) break;
@@ -11900,13 +11943,8 @@ static void ggml_compute_forward_rope_f16(
11900
11943
  }
11901
11944
  } else if (!is_neox) {
11902
11945
  for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
11903
- float cos_theta, sin_theta;
11904
- rope_yarn(
11905
- theta_base, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta
11906
- );
11907
- sin_theta *= sin_sign;
11908
-
11909
- theta_base *= theta_scale;
11946
+ const float cos_theta = cache[i0 + 0];
11947
+ const float sin_theta = cache[i0 + 1];
11910
11948
 
11911
11949
  const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
11912
11950
  ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
@@ -14871,7 +14909,7 @@ size_t ggml_hash_find_or_insert(struct ggml_hash_set hash_set, struct ggml_tenso
14871
14909
  return i;
14872
14910
  }
14873
14911
 
14874
- static struct ggml_hash_set ggml_hash_set_new(size_t size) {
14912
+ struct ggml_hash_set ggml_hash_set_new(size_t size) {
14875
14913
  size = ggml_hash_size(size);
14876
14914
  struct ggml_hash_set result;
14877
14915
  result.size = size;
@@ -16620,7 +16658,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
16620
16658
  return GGML_EXIT_SUCCESS;
16621
16659
  }
16622
16660
 
16623
- struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
16661
+ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threads) {
16624
16662
  if (n_threads <= 0) {
16625
16663
  n_threads = GGML_DEFAULT_N_THREADS;
16626
16664
  }
@@ -16682,14 +16720,15 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
16682
16720
  } break;
16683
16721
  case GGML_OP_MUL_MAT_ID:
16684
16722
  {
16723
+ cur = 0;
16685
16724
  const struct ggml_tensor * src0 = node->src[2];
16686
16725
  const struct ggml_tensor * src1 = node->src[1];
16687
16726
  const enum ggml_type vec_dot_type = type_traits[src0->type].vec_dot_type;
16688
16727
  if (src1->type != vec_dot_type) {
16689
- cur = ggml_row_size(vec_dot_type, ggml_nelements(src1));
16728
+ cur += ggml_row_size(vec_dot_type, ggml_nelements(src1));
16690
16729
  }
16691
16730
  const int n_as = ggml_get_op_params_i32(node, 1);
16692
- cur = GGML_PAD(cur, sizeof(int64_t)); // align
16731
+ cur += GGML_PAD(cur, sizeof(int64_t)); // align
16693
16732
  cur += n_as * sizeof(int64_t); // matrix_row_counts
16694
16733
  cur += n_as * src1->ne[1] * sizeof(int64_t); // matrix_rows
16695
16734
  } break;
@@ -16700,6 +16739,7 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
16700
16739
  }
16701
16740
  } break;
16702
16741
  case GGML_OP_SOFT_MAX:
16742
+ case GGML_OP_ROPE:
16703
16743
  {
16704
16744
  cur = ggml_type_size(GGML_TYPE_F32) * node->ne[0] * n_tasks;
16705
16745
  } break;
@@ -18625,32 +18665,47 @@ size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t *
18625
18665
  return (n/QK8_0*sizeof(block_q8_0));
18626
18666
  }
18627
18667
 
18628
- size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist) {
18668
+ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start,
18669
+ int nrows, int n_per_row, int64_t * hist, const float * imatrix) {
18670
+ (void)imatrix;
18629
18671
  size_t result = 0;
18672
+ int n = nrows * n_per_row;
18630
18673
  switch (type) {
18631
18674
  case GGML_TYPE_Q4_0:
18632
18675
  {
18633
18676
  GGML_ASSERT(start % QK4_0 == 0);
18634
- block_q4_0 * block = (block_q4_0*)dst + start / QK4_0;
18635
- result = ggml_quantize_q4_0(src + start, block, n, n, hist);
18677
+ GGML_ASSERT(start % n_per_row == 0);
18678
+ size_t start_row = start / n_per_row;
18679
+ size_t row_size = ggml_row_size(type, n_per_row);
18680
+ result = quantize_q4_0(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
18681
+ GGML_ASSERT(result == row_size * nrows);
18636
18682
  } break;
18637
18683
  case GGML_TYPE_Q4_1:
18638
18684
  {
18639
18685
  GGML_ASSERT(start % QK4_1 == 0);
18640
- block_q4_1 * block = (block_q4_1*)dst + start / QK4_1;
18641
- result = ggml_quantize_q4_1(src + start, block, n, n, hist);
18686
+ GGML_ASSERT(start % n_per_row == 0);
18687
+ size_t start_row = start / n_per_row;
18688
+ size_t row_size = ggml_row_size(type, n_per_row);
18689
+ result = quantize_q4_1(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
18690
+ GGML_ASSERT(result == row_size * nrows);
18642
18691
  } break;
18643
18692
  case GGML_TYPE_Q5_0:
18644
18693
  {
18645
18694
  GGML_ASSERT(start % QK5_0 == 0);
18646
- block_q5_0 * block = (block_q5_0*)dst + start / QK5_0;
18647
- result = ggml_quantize_q5_0(src + start, block, n, n, hist);
18695
+ GGML_ASSERT(start % n_per_row == 0);
18696
+ size_t start_row = start / n_per_row;
18697
+ size_t row_size = ggml_row_size(type, n_per_row);
18698
+ result = quantize_q5_0(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
18699
+ GGML_ASSERT(result == row_size * nrows);
18648
18700
  } break;
18649
18701
  case GGML_TYPE_Q5_1:
18650
18702
  {
18651
18703
  GGML_ASSERT(start % QK5_1 == 0);
18652
- block_q5_1 * block = (block_q5_1*)dst + start / QK5_1;
18653
- result = ggml_quantize_q5_1(src + start, block, n, n, hist);
18704
+ GGML_ASSERT(start % n_per_row == 0);
18705
+ size_t start_row = start / n_per_row;
18706
+ size_t row_size = ggml_row_size(type, n_per_row);
18707
+ result = quantize_q5_1(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
18708
+ GGML_ASSERT(result == row_size * nrows);
18654
18709
  } break;
18655
18710
  case GGML_TYPE_Q8_0:
18656
18711
  {
@@ -18661,44 +18716,67 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i
18661
18716
  case GGML_TYPE_Q2_K:
18662
18717
  {
18663
18718
  GGML_ASSERT(start % QK_K == 0);
18664
- block_q2_K * block = (block_q2_K*)dst + start / QK_K;
18665
- result = ggml_quantize_q2_K(src + start, block, n, n, hist);
18719
+ GGML_ASSERT(start % n_per_row == 0);
18720
+ size_t start_row = start / n_per_row;
18721
+ size_t row_size = ggml_row_size(type, n_per_row);
18722
+ result = quantize_q2_K(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
18723
+ GGML_ASSERT(result == row_size * nrows);
18666
18724
  } break;
18667
18725
  case GGML_TYPE_Q3_K:
18668
18726
  {
18669
18727
  GGML_ASSERT(start % QK_K == 0);
18670
- block_q3_K * block = (block_q3_K*)dst + start / QK_K;
18671
- result = ggml_quantize_q3_K(src + start, block, n, n, hist);
18728
+ GGML_ASSERT(start % n_per_row == 0);
18729
+ size_t start_row = start / n_per_row;
18730
+ size_t row_size = ggml_row_size(type, n_per_row);
18731
+ result = quantize_q3_K(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
18732
+ GGML_ASSERT(result == row_size * nrows);
18672
18733
  } break;
18673
18734
  case GGML_TYPE_Q4_K:
18674
18735
  {
18675
18736
  GGML_ASSERT(start % QK_K == 0);
18676
- block_q4_K * block = (block_q4_K*)dst + start / QK_K;
18677
- result = ggml_quantize_q4_K(src + start, block, n, n, hist);
18737
+ GGML_ASSERT(start % n_per_row == 0);
18738
+ size_t start_row = start / n_per_row;
18739
+ size_t row_size = ggml_row_size(type, n_per_row);
18740
+ result = quantize_q4_K(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
18741
+ GGML_ASSERT(result == row_size * nrows);
18678
18742
  } break;
18679
18743
  case GGML_TYPE_Q5_K:
18680
18744
  {
18681
18745
  GGML_ASSERT(start % QK_K == 0);
18682
- block_q5_K * block = (block_q5_K*)dst + start / QK_K;
18683
- result = ggml_quantize_q5_K(src + start, block, n, n, hist);
18746
+ GGML_ASSERT(start % n_per_row == 0);
18747
+ size_t start_row = start / n_per_row;
18748
+ size_t row_size = ggml_row_size(type, n_per_row);
18749
+ result = quantize_q5_K(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
18750
+ GGML_ASSERT(result == row_size * nrows);
18684
18751
  } break;
18685
18752
  case GGML_TYPE_Q6_K:
18686
18753
  {
18687
18754
  GGML_ASSERT(start % QK_K == 0);
18688
- block_q6_K * block = (block_q6_K*)dst + start / QK_K;
18689
- result = ggml_quantize_q6_K(src + start, block, n, n, hist);
18755
+ GGML_ASSERT(start % n_per_row == 0);
18756
+ size_t start_row = start / n_per_row;
18757
+ size_t row_size = ggml_row_size(type, n_per_row);
18758
+ result = quantize_q6_K(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
18759
+ GGML_ASSERT(result == row_size * nrows);
18690
18760
  } break;
18691
18761
  case GGML_TYPE_IQ2_XXS:
18692
18762
  {
18693
18763
  GGML_ASSERT(start % QK_K == 0);
18694
- block_iq2_xxs * block = (block_iq2_xxs*)dst + start / QK_K;
18695
- result = ggml_quantize_iq2_xxs(src + start, block, n, n, hist);
18764
+ GGML_ASSERT(start % n_per_row == 0);
18765
+ GGML_ASSERT(imatrix);
18766
+ size_t start_row = start / n_per_row;
18767
+ size_t row_size = ggml_row_size(type, n_per_row);
18768
+ result = quantize_iq2_xxs(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
18769
+ GGML_ASSERT(result == row_size * nrows);
18696
18770
  } break;
18697
18771
  case GGML_TYPE_IQ2_XS:
18698
18772
  {
18699
18773
  GGML_ASSERT(start % QK_K == 0);
18700
- block_iq2_xs * block = (block_iq2_xs*)dst + start / QK_K;
18701
- result = ggml_quantize_iq2_xs(src + start, block, n, n, hist);
18774
+ GGML_ASSERT(start % n_per_row == 0);
18775
+ GGML_ASSERT(imatrix);
18776
+ size_t start_row = start / n_per_row;
18777
+ size_t row_size = ggml_row_size(type, n_per_row);
18778
+ result = quantize_iq2_xs(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
18779
+ GGML_ASSERT(result == row_size * nrows);
18702
18780
  } break;
18703
18781
  case GGML_TYPE_F16:
18704
18782
  {
@@ -19162,7 +19240,7 @@ void gguf_free(struct gguf_context * ctx) {
19162
19240
 
19163
19241
  if (ctx->kv) {
19164
19242
  // free string memory - not great..
19165
- for (uint32_t i = 0; i < ctx->header.n_kv; ++i) {
19243
+ for (uint64_t i = 0; i < ctx->header.n_kv; ++i) {
19166
19244
  struct gguf_kv * kv = &ctx->kv[i];
19167
19245
 
19168
19246
  if (kv->key.data) {
@@ -19178,7 +19256,7 @@ void gguf_free(struct gguf_context * ctx) {
19178
19256
  if (kv->type == GGUF_TYPE_ARRAY) {
19179
19257
  if (kv->value.arr.data) {
19180
19258
  if (kv->value.arr.type == GGUF_TYPE_STRING) {
19181
- for (uint32_t j = 0; j < kv->value.arr.n; ++j) {
19259
+ for (uint64_t j = 0; j < kv->value.arr.n; ++j) {
19182
19260
  struct gguf_str * str = &((struct gguf_str *) kv->value.arr.data)[j];
19183
19261
  if (str->data) {
19184
19262
  free(str->data);
@@ -19194,7 +19272,7 @@ void gguf_free(struct gguf_context * ctx) {
19194
19272
  }
19195
19273
 
19196
19274
  if (ctx->infos) {
19197
- for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) {
19275
+ for (uint64_t i = 0; i < ctx->header.n_tensors; ++i) {
19198
19276
  struct gguf_tensor_info * info = &ctx->infos[i];
19199
19277
 
19200
19278
  if (info->name.data) {
@@ -187,6 +187,16 @@
187
187
  # define GGML_API
188
188
  #endif
189
189
 
190
+ #ifdef GGML_MULTIPLATFORM
191
+ # if defined(_WIN32)
192
+ # define GGML_CALL
193
+ # else
194
+ # define GGML_CALL __attribute__((__ms_abi__))
195
+ # endif
196
+ #else
197
+ # define GGML_CALL
198
+ #endif
199
+
190
200
  // TODO: support for clang
191
201
  #ifdef __GNUC__
192
202
  # define GGML_DEPRECATED(func, hint) func __attribute__((deprecated(hint)))
@@ -649,41 +659,41 @@ extern "C" {
649
659
  GGML_API void ggml_print_object (const struct ggml_object * obj);
650
660
  GGML_API void ggml_print_objects(const struct ggml_context * ctx);
651
661
 
652
- GGML_API int64_t ggml_nelements (const struct ggml_tensor * tensor);
653
- GGML_API int64_t ggml_nrows (const struct ggml_tensor * tensor);
654
- GGML_API size_t ggml_nbytes (const struct ggml_tensor * tensor);
655
- GGML_API size_t ggml_nbytes_pad (const struct ggml_tensor * tensor); // same as ggml_nbytes() but padded to GGML_MEM_ALIGN
662
+ GGML_API GGML_CALL int64_t ggml_nelements (const struct ggml_tensor * tensor);
663
+ GGML_API GGML_CALL int64_t ggml_nrows (const struct ggml_tensor * tensor);
664
+ GGML_API GGML_CALL size_t ggml_nbytes (const struct ggml_tensor * tensor);
665
+ GGML_API size_t ggml_nbytes_pad (const struct ggml_tensor * tensor); // same as ggml_nbytes() but padded to GGML_MEM_ALIGN
656
666
 
657
- GGML_API int ggml_blck_size(enum ggml_type type);
658
- GGML_API size_t ggml_type_size(enum ggml_type type); // size in bytes for all elements in a block
659
- GGML_API size_t ggml_row_size (enum ggml_type type, int64_t ne); // size in bytes for all elements in a row
667
+ GGML_API GGML_CALL int ggml_blck_size(enum ggml_type type);
668
+ GGML_API GGML_CALL size_t ggml_type_size(enum ggml_type type); // size in bytes for all elements in a block
669
+ GGML_API GGML_CALL size_t ggml_row_size (enum ggml_type type, int64_t ne); // size in bytes for all elements in a row
660
670
 
661
671
  GGML_DEPRECATED(
662
672
  GGML_API double ggml_type_sizef(enum ggml_type type), // ggml_type_size()/ggml_blck_size() as float
663
673
  "use ggml_row_size() instead");
664
674
 
665
- GGML_API const char * ggml_type_name(enum ggml_type type);
666
- GGML_API const char * ggml_op_name (enum ggml_op op);
667
- GGML_API const char * ggml_op_symbol(enum ggml_op op);
675
+ GGML_API GGML_CALL const char * ggml_type_name(enum ggml_type type);
676
+ GGML_API GGML_CALL const char * ggml_op_name (enum ggml_op op);
677
+ GGML_API const char * ggml_op_symbol(enum ggml_op op);
668
678
 
669
- GGML_API const char * ggml_unary_op_name(enum ggml_unary_op op);
670
- GGML_API const char * ggml_op_desc(const struct ggml_tensor * t); // unary or op name
679
+ GGML_API const char * ggml_unary_op_name(enum ggml_unary_op op);
680
+ GGML_API GGML_CALL const char * ggml_op_desc(const struct ggml_tensor * t); // unary or op name
671
681
 
672
- GGML_API size_t ggml_element_size(const struct ggml_tensor * tensor);
682
+ GGML_API GGML_CALL size_t ggml_element_size(const struct ggml_tensor * tensor);
673
683
 
674
- GGML_API bool ggml_is_quantized(enum ggml_type type);
684
+ GGML_API GGML_CALL bool ggml_is_quantized(enum ggml_type type);
675
685
 
676
686
  // TODO: temporary until model loading of ggml examples is refactored
677
687
  GGML_API enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype);
678
688
 
679
- GGML_API bool ggml_is_transposed(const struct ggml_tensor * tensor);
680
- GGML_API bool ggml_is_contiguous(const struct ggml_tensor * tensor);
681
- GGML_API bool ggml_is_permuted (const struct ggml_tensor * tensor);
682
- GGML_API bool ggml_is_scalar (const struct ggml_tensor * tensor);
683
- GGML_API bool ggml_is_vector (const struct ggml_tensor * tensor);
684
- GGML_API bool ggml_is_matrix (const struct ggml_tensor * tensor);
685
- GGML_API bool ggml_is_3d (const struct ggml_tensor * tensor);
686
- GGML_API int ggml_n_dims (const struct ggml_tensor * tensor); // returns 1 for scalars
689
+ GGML_API GGML_CALL bool ggml_is_transposed(const struct ggml_tensor * tensor);
690
+ GGML_API GGML_CALL bool ggml_is_contiguous(const struct ggml_tensor * tensor);
691
+ GGML_API GGML_CALL bool ggml_is_permuted (const struct ggml_tensor * tensor);
692
+ GGML_API bool ggml_is_scalar (const struct ggml_tensor * tensor);
693
+ GGML_API bool ggml_is_vector (const struct ggml_tensor * tensor);
694
+ GGML_API bool ggml_is_matrix (const struct ggml_tensor * tensor);
695
+ GGML_API bool ggml_is_3d (const struct ggml_tensor * tensor);
696
+ GGML_API int ggml_n_dims (const struct ggml_tensor * tensor); // returns 1 for scalars
687
697
 
688
698
  GGML_API bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor * t1);
689
699
 
@@ -770,7 +780,7 @@ extern "C" {
770
780
  GGML_API void * ggml_get_data (const struct ggml_tensor * tensor);
771
781
  GGML_API float * ggml_get_data_f32(const struct ggml_tensor * tensor);
772
782
 
773
- GGML_API enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor);
783
+ GGML_API GGML_CALL enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor);
774
784
 
775
785
  GGML_API const char * ggml_get_name (const struct ggml_tensor * tensor);
776
786
  GGML_API struct ggml_tensor * ggml_set_name ( struct ggml_tensor * tensor, const char * name);
@@ -1165,6 +1175,11 @@ extern "C" {
1165
1175
  struct ggml_tensor * a,
1166
1176
  struct ggml_tensor * b);
1167
1177
 
1178
+ GGML_API struct ggml_tensor * ggml_cast(
1179
+ struct ggml_context * ctx,
1180
+ struct ggml_tensor * a,
1181
+ enum ggml_type type);
1182
+
1168
1183
  // make contiguous
1169
1184
  GGML_API struct ggml_tensor * ggml_cont(
1170
1185
  struct ggml_context * ctx,
@@ -1408,7 +1423,7 @@ extern "C" {
1408
1423
  float beta_slow);
1409
1424
 
1410
1425
  // compute correction dims for YaRN RoPE scaling
1411
- void ggml_rope_yarn_corr_dims(
1426
+ GGML_CALL void ggml_rope_yarn_corr_dims(
1412
1427
  int n_dims, int n_orig_ctx, float freq_base, float beta_fast, float beta_slow, float dims[2]);
1413
1428
 
1414
1429
  // xPos RoPE, in-place, returns view(a)
@@ -1842,8 +1857,8 @@ extern "C" {
1842
1857
 
1843
1858
  // ggml_graph_plan() has to be called before ggml_graph_compute()
1844
1859
  // when plan.work_size > 0, caller must allocate memory for plan.work_data
1845
- GGML_API struct ggml_cplan ggml_graph_plan (struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/);
1846
- GGML_API int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
1860
+ GGML_API struct ggml_cplan ggml_graph_plan (const struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/);
1861
+ GGML_API int ggml_graph_compute( struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
1847
1862
 
1848
1863
  // same as ggml_graph_compute() but the work data is allocated as a part of the context
1849
1864
  // note: the drawback of this API is that you must have ensured that the context has enough memory for the work data
@@ -2062,10 +2077,13 @@ extern "C" {
2062
2077
  GGML_API size_t ggml_quantize_q4_K(const float * src, void * dst, int n, int k, int64_t * hist);
2063
2078
  GGML_API size_t ggml_quantize_q5_K(const float * src, void * dst, int n, int k, int64_t * hist);
2064
2079
  GGML_API size_t ggml_quantize_q6_K(const float * src, void * dst, int n, int k, int64_t * hist);
2065
- GGML_API size_t ggml_quantize_iq2_xxs(const float * src, void * dst, int n, int k, int64_t * hist);
2066
- GGML_API size_t ggml_quantize_iq2_xs (const float * src, void * dst, int n, int k, int64_t * hist);
2067
2080
 
2068
- GGML_API size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist);
2081
+ GGML_API size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst,
2082
+ int start, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
2083
+
2084
+ // These are needed for IQ2_XS and IQ2_XXS quantizations
2085
+ GGML_API void ggml_init_iq2_quantization(enum ggml_type type);
2086
+ GGML_API void ggml_deinit_iq2_quantization(enum ggml_type type);
2069
2087
 
2070
2088
  //
2071
2089
  // Importance matrix