llama_cpp 0.12.1 → 0.12.2

@@ -196,8 +196,6 @@ void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict
  void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict y, int k);
  void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict y, int k);
  void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict y, int k);
- void quantize_row_iq2_xxs_reference(const float * restrict x, block_iq2_xxs * restrict y, int k);
- void quantize_row_iq2_xs_reference (const float * restrict x, block_iq2_xs * restrict y, int k);

  void quantize_row_q4_0(const float * restrict x, void * restrict y, int k);
  void quantize_row_q4_1(const float * restrict x, void * restrict y, int k);
@@ -212,8 +210,6 @@ void quantize_row_q4_K(const float * restrict x, void * restrict y, int k);
  void quantize_row_q5_K(const float * restrict x, void * restrict y, int k);
  void quantize_row_q6_K(const float * restrict x, void * restrict y, int k);
  void quantize_row_q8_K(const float * restrict x, void * restrict y, int k);
- void quantize_row_iq2_xxs(const float * restrict x, void * restrict y, int k);
- void quantize_row_iq2_xs (const float * restrict x, void * restrict y, int k);

  // Dequantization
  void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict y, int k);
@@ -246,3 +242,18 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, const void * restrict vx,
  void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
  void ggml_vec_dot_iq2_xxs_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
  void ggml_vec_dot_iq2_xs_q8_K (int n, float * restrict s, const void * restrict vx, const void * restrict vy);
+
+ //
+ // Quantization utilizing an importance matrix (a.k.a. "Activation aWare Quantization")
+ //
+ size_t quantize_iq2_xxs(const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
+ size_t quantize_iq2_xs (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
+ size_t quantize_q2_K (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
+ size_t quantize_q3_K (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
+ size_t quantize_q4_K (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
+ size_t quantize_q5_K (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
+ size_t quantize_q6_K (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
+ size_t quantize_q4_0 (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
+ size_t quantize_q4_1 (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
+ size_t quantize_q5_0 (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
+ size_t quantize_q5_1 (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
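
Note: unlike the older ggml_quantize_* helpers (which take a flat element count), these new quantize_* entry points work per row (nrows rows of n_per_row elements each) and accept an optional importance matrix. A minimal caller sketch, assuming the prototypes live in the vendored ggml-quants.h and that a NULL imatrix is acceptable for the k-quants (the IQ2 formats require one):

    // Sketch only: quantize an nrows x n_per_row f32 matrix to Q4_K.
    #include <stdint.h>
    #include "ggml.h"          // ggml_row_size(), GGML_TYPE_Q4_K
    #include "ggml-quants.h"   // assumed header carrying the prototypes above

    static size_t quantize_matrix_q4_K(const float * src, void * dst,
                                       int nrows, int n_per_row, int64_t hist[16]) {
        // dst must hold ggml_row_size(GGML_TYPE_Q4_K, n_per_row) * nrows bytes.
        const float * imatrix = NULL;   // optional for k-quants, mandatory for IQ2_XXS/IQ2_XS
        return quantize_q4_K(src, dst, nrows, n_per_row, hist, imatrix);
    }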
@@ -585,8 +585,8 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
  .type_size = sizeof(block_iq2_xxs),
  .is_quantized = true,
  .to_float = (ggml_to_float_t) dequantize_row_iq2_xxs,
- .from_float = quantize_row_iq2_xxs,
- .from_float_reference = (ggml_from_float_t) quantize_row_iq2_xxs_reference,
+ .from_float = NULL,
+ .from_float_reference = NULL,
  .vec_dot = ggml_vec_dot_iq2_xxs_q8_K,
  .vec_dot_type = GGML_TYPE_Q8_K,
  },
@@ -596,8 +596,8 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
  .type_size = sizeof(block_iq2_xs),
  .is_quantized = true,
  .to_float = (ggml_to_float_t) dequantize_row_iq2_xs,
- .from_float = quantize_row_iq2_xs,
- .from_float_reference = (ggml_from_float_t) quantize_row_iq2_xs_reference,
+ .from_float = NULL,
+ .from_float_reference = NULL,
  .vec_dot = ggml_vec_dot_iq2_xs_q8_K,
  .vec_dot_type = GGML_TYPE_Q8_K,
  },
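
Note: with .from_float and .from_float_reference set to NULL for IQ2_XXS and IQ2_XS, these formats can no longer be produced through the generic type-traits conversion path; they must go through the imatrix-aware quantize_* functions declared above. Code that converts via the traits table should guard against the missing converter; a hedged sketch, assuming the public ggml_internal_get_type_traits() accessor:

    // Sketch only: detect types that no longer expose a direct f32 -> quantized converter.
    #include <stdbool.h>
    #include "ggml.h"

    static bool has_from_float(enum ggml_type type) {
        ggml_type_traits_t traits = ggml_internal_get_type_traits(type);
        return traits.from_float != NULL;   // false for GGML_TYPE_IQ2_XXS / GGML_TYPE_IQ2_XS after this change
    }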
@@ -1990,19 +1990,19 @@ void ggml_print_objects(const struct ggml_context * ctx) {
  GGML_PRINT("%s: --- end ---\n", __func__);
  }

- int64_t ggml_nelements(const struct ggml_tensor * tensor) {
+ GGML_CALL int64_t ggml_nelements(const struct ggml_tensor * tensor) {
  static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");

  return tensor->ne[0]*tensor->ne[1]*tensor->ne[2]*tensor->ne[3];
  }

- int64_t ggml_nrows(const struct ggml_tensor * tensor) {
+ GGML_CALL int64_t ggml_nrows(const struct ggml_tensor * tensor) {
  static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");

  return tensor->ne[1]*tensor->ne[2]*tensor->ne[3];
  }

- size_t ggml_nbytes(const struct ggml_tensor * tensor) {
+ GGML_CALL size_t ggml_nbytes(const struct ggml_tensor * tensor) {
  size_t nbytes;
  size_t blck_size = ggml_blck_size(tensor->type);
  if (blck_size == 1) {
@@ -2025,15 +2025,15 @@ size_t ggml_nbytes_pad(const struct ggml_tensor * tensor) {
  return GGML_PAD(ggml_nbytes(tensor), GGML_MEM_ALIGN);
  }

- int ggml_blck_size(enum ggml_type type) {
+ GGML_CALL int ggml_blck_size(enum ggml_type type) {
  return type_traits[type].blck_size;
  }

- size_t ggml_type_size(enum ggml_type type) {
+ GGML_CALL size_t ggml_type_size(enum ggml_type type) {
  return type_traits[type].type_size;
  }

- size_t ggml_row_size(enum ggml_type type, int64_t ne) {
+ GGML_CALL size_t ggml_row_size(enum ggml_type type, int64_t ne) {
  assert(ne % ggml_blck_size(type) == 0);
  return ggml_type_size(type)*ne/ggml_blck_size(type);
  }
@@ -2042,15 +2042,15 @@ double ggml_type_sizef(enum ggml_type type) {
  return ((double)(type_traits[type].type_size))/type_traits[type].blck_size;
  }

- const char * ggml_type_name(enum ggml_type type) {
+ GGML_CALL const char * ggml_type_name(enum ggml_type type) {
  return type_traits[type].type_name;
  }

- bool ggml_is_quantized(enum ggml_type type) {
+ GGML_CALL bool ggml_is_quantized(enum ggml_type type) {
  return type_traits[type].is_quantized;
  }

- const char * ggml_op_name(enum ggml_op op) {
+ GGML_CALL const char * ggml_op_name(enum ggml_op op) {
  return GGML_OP_NAME[op];
  }

@@ -2062,7 +2062,7 @@ const char * ggml_unary_op_name(enum ggml_unary_op op) {
  return GGML_UNARY_OP_NAME[op];
  }

- const char * ggml_op_desc(const struct ggml_tensor * t) {
+ GGML_CALL const char * ggml_op_desc(const struct ggml_tensor * t) {
  if (t->op == GGML_OP_UNARY) {
  enum ggml_unary_op uop = ggml_get_unary_op(t);
  return ggml_unary_op_name(uop);
@@ -2072,7 +2072,7 @@ const char * ggml_op_desc(const struct ggml_tensor * t) {
  }
  }

- size_t ggml_element_size(const struct ggml_tensor * tensor) {
+ GGML_CALL size_t ggml_element_size(const struct ggml_tensor * tensor) {
  return ggml_type_size(tensor->type);
  }

@@ -2154,11 +2154,11 @@ size_t ggml_tensor_overhead(void) {
  return GGML_OBJECT_SIZE + GGML_TENSOR_SIZE;
  }

- bool ggml_is_transposed(const struct ggml_tensor * tensor) {
+ GGML_CALL bool ggml_is_transposed(const struct ggml_tensor * tensor) {
  return tensor->nb[0] > tensor->nb[1];
  }

- bool ggml_is_contiguous(const struct ggml_tensor * tensor) {
+ GGML_CALL bool ggml_is_contiguous(const struct ggml_tensor * tensor) {
  static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");

  return
@@ -2177,7 +2177,7 @@ static inline bool ggml_is_contiguous_except_dim_1(const struct ggml_tensor * te
  tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
  }

- bool ggml_is_permuted(const struct ggml_tensor * tensor) {
+ GGML_CALL bool ggml_is_permuted(const struct ggml_tensor * tensor) {
  static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");

  return tensor->nb[0] > tensor->nb[1] || tensor->nb[1] > tensor->nb[2] || tensor->nb[2] > tensor->nb[3];
@@ -2354,6 +2354,10 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
  }

  void ggml_free(struct ggml_context * ctx) {
+ if (ctx == NULL) {
+ return;
+ }
+
  // make this function thread safe
  ggml_critical_section_start();

@@ -3075,7 +3079,7 @@ float * ggml_get_data_f32(const struct ggml_tensor * tensor) {
  return (float *)(tensor->data);
  }

- enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor) {
+ GGML_CALL enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor) {
  GGML_ASSERT(tensor->op == GGML_OP_UNARY);
  return (enum ggml_unary_op) ggml_get_op_params_i32(tensor, 0);
  }
@@ -4362,6 +4366,23 @@ struct ggml_tensor * ggml_cpy(
  return ggml_cpy_impl(ctx, a, b);
  }

+ struct ggml_tensor * ggml_cast(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ enum ggml_type type) {
+ bool is_node = false;
+
+ struct ggml_tensor * result = ggml_new_tensor(ctx, type, GGML_MAX_DIMS, a->ne);
+ ggml_format_name(result, "%s (copy)", a->name);
+
+ result->op = GGML_OP_CPY;
+ result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+ result->src[0] = a;
+ result->src[1] = result;
+
+ return result;
+ }
+
  // ggml_cont

  static struct ggml_tensor * ggml_cont_impl(
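
Note: the new ggml_cast() is a convenience over ggml_cpy(): it allocates the destination tensor itself, so a graph can change a tensor's type without the caller pre-creating a target. A hedged usage sketch (the surrounding graph setup is assumed):

    // Sketch only: down-cast an f32 activation to f16 inside a graph.
    #include "ggml.h"

    static struct ggml_tensor * to_f16(struct ggml_context * ctx, struct ggml_tensor * x) {
        // The result is a GGML_OP_CPY node with src[0] == x; the data is
        // converted to F16 when the graph is computed.
        return ggml_cast(ctx, x, GGML_TYPE_F16);
    }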
@@ -11617,7 +11638,22 @@ static float ggml_rope_yarn_corr_dim(int n_dims, int n_orig_ctx, float n_rot, fl
  return n_dims * logf(n_orig_ctx / (n_rot * 2 * (float)M_PI)) / (2 * logf(base));
  }

- void ggml_rope_yarn_corr_dims(
+ static void ggml_rope_cache_init(
+ float theta_base, float freq_scale, float corr_dims[2], int64_t ne0, float ext_factor, float mscale,
+ float * cache, float sin_sign, float theta_scale
+ ) {
+ float theta = theta_base;
+ for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
+ rope_yarn(
+ theta, freq_scale, corr_dims, i0, ext_factor, mscale, &cache[i0 + 0], &cache[i0 + 1]
+ );
+ cache[i0 + 1] *= sin_sign;
+
+ theta *= theta_scale;
+ }
+ }
+
+ GGML_CALL void ggml_rope_yarn_corr_dims(
  int n_dims, int n_orig_ctx, float freq_base, float beta_fast, float beta_slow, float dims[2]
  ) {
  // start and end correction dims
@@ -11699,6 +11735,12 @@ static void ggml_compute_forward_rope_f32(
  for (int64_t i3 = 0; i3 < ne3; i3++) {
  for (int64_t i2 = 0; i2 < ne2; i2++) {
  const int64_t p = pos[i2];
+
+ float * cache = (float *) params->wdata + (ne0 + CACHE_LINE_SIZE_F32)*ith;
+ if (!is_glm && !is_neox) { // TODO: cache sin/cos for glm, neox
+ ggml_rope_cache_init(p, freq_scale, corr_dims, ne0, ext_factor, attn_factor, cache, sin_sign, theta_scale);
+ }
+
  for (int64_t i1 = 0; i1 < ne1; i1++) {
  if (ir++ < ir0) continue;
  if (ir > ir1) break;
@@ -11732,18 +11774,13 @@ static void ggml_compute_forward_rope_f32(
  }
  } else if (!is_neox) {
  for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
- float cos_theta, sin_theta;
- rope_yarn(
- theta_base, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta
- );
- sin_theta *= sin_sign;
+ const float cos_theta = cache[i0 + 0];
+ const float sin_theta = cache[i0 + 1];

  // zeta scaling for xPos only:
  float zeta = xpos_base != 0.0f ? powf((i0 + 0.4f * ne0) / (1.4f * ne0), p / xpos_base) : 1.0f;
  if (xpos_down) zeta = 1.0f / zeta;

- theta_base *= theta_scale;
-
  const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
  float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);

@@ -11867,6 +11904,12 @@ static void ggml_compute_forward_rope_f16(
  for (int64_t i3 = 0; i3 < ne3; i3++) {
  for (int64_t i2 = 0; i2 < ne2; i2++) {
  const int64_t p = pos[i2];
+
+ float * cache = (float *) params->wdata + (ne0 + CACHE_LINE_SIZE_F32)*ith;
+ if (!is_glm && !is_neox) { // TODO: cache sin/cos for glm, neox
+ ggml_rope_cache_init(p, freq_scale, corr_dims, ne0, ext_factor, attn_factor, cache, sin_sign, theta_scale);
+ }
+
  for (int64_t i1 = 0; i1 < ne1; i1++) {
  if (ir++ < ir0) continue;
  if (ir > ir1) break;
@@ -11900,13 +11943,8 @@ static void ggml_compute_forward_rope_f16(
  }
  } else if (!is_neox) {
  for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
- float cos_theta, sin_theta;
- rope_yarn(
- theta_base, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta
- );
- sin_theta *= sin_sign;
-
- theta_base *= theta_scale;
+ const float cos_theta = cache[i0 + 0];
+ const float sin_theta = cache[i0 + 1];

  const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
  ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
@@ -14871,7 +14909,7 @@ size_t ggml_hash_find_or_insert(struct ggml_hash_set hash_set, struct ggml_tenso
  return i;
  }

- static struct ggml_hash_set ggml_hash_set_new(size_t size) {
+ struct ggml_hash_set ggml_hash_set_new(size_t size) {
  size = ggml_hash_size(size);
  struct ggml_hash_set result;
  result.size = size;
@@ -16620,7 +16658,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
  return GGML_EXIT_SUCCESS;
  }

- struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
+ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threads) {
  if (n_threads <= 0) {
  n_threads = GGML_DEFAULT_N_THREADS;
  }
@@ -16682,14 +16720,15 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
  } break;
  case GGML_OP_MUL_MAT_ID:
  {
+ cur = 0;
  const struct ggml_tensor * src0 = node->src[2];
  const struct ggml_tensor * src1 = node->src[1];
  const enum ggml_type vec_dot_type = type_traits[src0->type].vec_dot_type;
  if (src1->type != vec_dot_type) {
- cur = ggml_row_size(vec_dot_type, ggml_nelements(src1));
+ cur += ggml_row_size(vec_dot_type, ggml_nelements(src1));
  }
  const int n_as = ggml_get_op_params_i32(node, 1);
- cur = GGML_PAD(cur, sizeof(int64_t)); // align
+ cur += GGML_PAD(cur, sizeof(int64_t)); // align
  cur += n_as * sizeof(int64_t); // matrix_row_counts
  cur += n_as * src1->ne[1] * sizeof(int64_t); // matrix_rows
  } break;
@@ -16700,6 +16739,7 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
  }
  } break;
  case GGML_OP_SOFT_MAX:
+ case GGML_OP_ROPE:
  {
  cur = ggml_type_size(GGML_TYPE_F32) * node->ne[0] * n_tasks;
  } break;
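
Note: the new GGML_OP_ROPE case reuses the soft-max sizing, reserving ne[0] floats of scratch per task; this is the per-thread buffer that ggml_rope_cache_init() fills with interleaved cos/sin values. For example, with ne0 = 128 and n_tasks = 8 the plan adds 128 * 8 * sizeof(float) = 4096 bytes of work data.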
@@ -18625,32 +18665,47 @@ size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t *
  return (n/QK8_0*sizeof(block_q8_0));
  }

- size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist) {
+ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start,
+ int nrows, int n_per_row, int64_t * hist, const float * imatrix) {
+ (void)imatrix;
  size_t result = 0;
+ int n = nrows * n_per_row;
  switch (type) {
  case GGML_TYPE_Q4_0:
  {
  GGML_ASSERT(start % QK4_0 == 0);
- block_q4_0 * block = (block_q4_0*)dst + start / QK4_0;
- result = ggml_quantize_q4_0(src + start, block, n, n, hist);
+ GGML_ASSERT(start % n_per_row == 0);
+ size_t start_row = start / n_per_row;
+ size_t row_size = ggml_row_size(type, n_per_row);
+ result = quantize_q4_0(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
+ GGML_ASSERT(result == row_size * nrows);
  } break;
  case GGML_TYPE_Q4_1:
  {
  GGML_ASSERT(start % QK4_1 == 0);
- block_q4_1 * block = (block_q4_1*)dst + start / QK4_1;
- result = ggml_quantize_q4_1(src + start, block, n, n, hist);
+ GGML_ASSERT(start % n_per_row == 0);
+ size_t start_row = start / n_per_row;
+ size_t row_size = ggml_row_size(type, n_per_row);
+ result = quantize_q4_1(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
+ GGML_ASSERT(result == row_size * nrows);
  } break;
  case GGML_TYPE_Q5_0:
  {
  GGML_ASSERT(start % QK5_0 == 0);
- block_q5_0 * block = (block_q5_0*)dst + start / QK5_0;
- result = ggml_quantize_q5_0(src + start, block, n, n, hist);
+ GGML_ASSERT(start % n_per_row == 0);
+ size_t start_row = start / n_per_row;
+ size_t row_size = ggml_row_size(type, n_per_row);
+ result = quantize_q5_0(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
+ GGML_ASSERT(result == row_size * nrows);
  } break;
  case GGML_TYPE_Q5_1:
  {
  GGML_ASSERT(start % QK5_1 == 0);
- block_q5_1 * block = (block_q5_1*)dst + start / QK5_1;
- result = ggml_quantize_q5_1(src + start, block, n, n, hist);
+ GGML_ASSERT(start % n_per_row == 0);
+ size_t start_row = start / n_per_row;
+ size_t row_size = ggml_row_size(type, n_per_row);
+ result = quantize_q5_1(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
+ GGML_ASSERT(result == row_size * nrows);
  } break;
  case GGML_TYPE_Q8_0:
  {
@@ -18661,44 +18716,67 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i
  case GGML_TYPE_Q2_K:
  {
  GGML_ASSERT(start % QK_K == 0);
- block_q2_K * block = (block_q2_K*)dst + start / QK_K;
- result = ggml_quantize_q2_K(src + start, block, n, n, hist);
+ GGML_ASSERT(start % n_per_row == 0);
+ size_t start_row = start / n_per_row;
+ size_t row_size = ggml_row_size(type, n_per_row);
+ result = quantize_q2_K(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
+ GGML_ASSERT(result == row_size * nrows);
  } break;
  case GGML_TYPE_Q3_K:
  {
  GGML_ASSERT(start % QK_K == 0);
- block_q3_K * block = (block_q3_K*)dst + start / QK_K;
- result = ggml_quantize_q3_K(src + start, block, n, n, hist);
+ GGML_ASSERT(start % n_per_row == 0);
+ size_t start_row = start / n_per_row;
+ size_t row_size = ggml_row_size(type, n_per_row);
+ result = quantize_q3_K(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
+ GGML_ASSERT(result == row_size * nrows);
  } break;
  case GGML_TYPE_Q4_K:
  {
  GGML_ASSERT(start % QK_K == 0);
- block_q4_K * block = (block_q4_K*)dst + start / QK_K;
- result = ggml_quantize_q4_K(src + start, block, n, n, hist);
+ GGML_ASSERT(start % n_per_row == 0);
+ size_t start_row = start / n_per_row;
+ size_t row_size = ggml_row_size(type, n_per_row);
+ result = quantize_q4_K(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
+ GGML_ASSERT(result == row_size * nrows);
  } break;
  case GGML_TYPE_Q5_K:
  {
  GGML_ASSERT(start % QK_K == 0);
- block_q5_K * block = (block_q5_K*)dst + start / QK_K;
- result = ggml_quantize_q5_K(src + start, block, n, n, hist);
+ GGML_ASSERT(start % n_per_row == 0);
+ size_t start_row = start / n_per_row;
+ size_t row_size = ggml_row_size(type, n_per_row);
+ result = quantize_q5_K(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
+ GGML_ASSERT(result == row_size * nrows);
  } break;
  case GGML_TYPE_Q6_K:
  {
  GGML_ASSERT(start % QK_K == 0);
- block_q6_K * block = (block_q6_K*)dst + start / QK_K;
- result = ggml_quantize_q6_K(src + start, block, n, n, hist);
+ GGML_ASSERT(start % n_per_row == 0);
+ size_t start_row = start / n_per_row;
+ size_t row_size = ggml_row_size(type, n_per_row);
+ result = quantize_q6_K(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
+ GGML_ASSERT(result == row_size * nrows);
  } break;
  case GGML_TYPE_IQ2_XXS:
  {
  GGML_ASSERT(start % QK_K == 0);
- block_iq2_xxs * block = (block_iq2_xxs*)dst + start / QK_K;
- result = ggml_quantize_iq2_xxs(src + start, block, n, n, hist);
+ GGML_ASSERT(start % n_per_row == 0);
+ GGML_ASSERT(imatrix);
+ size_t start_row = start / n_per_row;
+ size_t row_size = ggml_row_size(type, n_per_row);
+ result = quantize_iq2_xxs(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
+ GGML_ASSERT(result == row_size * nrows);
  } break;
  case GGML_TYPE_IQ2_XS:
  {
  GGML_ASSERT(start % QK_K == 0);
- block_iq2_xs * block = (block_iq2_xs*)dst + start / QK_K;
- result = ggml_quantize_iq2_xs(src + start, block, n, n, hist);
+ GGML_ASSERT(start % n_per_row == 0);
+ GGML_ASSERT(imatrix);
+ size_t start_row = start / n_per_row;
+ size_t row_size = ggml_row_size(type, n_per_row);
+ result = quantize_iq2_xs(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
+ GGML_ASSERT(result == row_size * nrows);
  } break;
  case GGML_TYPE_F16:
  {
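
Note: every case in ggml_quantize_chunk() now derives the destination offset from the starting row instead of the starting block, and delegates to the row-based quantize_* functions. A hedged sketch of a call under the new signature (the chunk geometry and the 16-bucket histogram are assumptions taken from how llama.cpp drives this API; the IQ2 cases additionally assert a non-NULL imatrix):

    // Sketch only: quantize rows [first_row, first_row + nrows) of an f32 weight matrix to Q5_K.
    #include <stdint.h>
    #include "ggml.h"

    static size_t quantize_rows_q5_K(const float * src_f32, void * dst_buf,
                                     int first_row, int nrows, int n_per_row) {
        int64_t hist[16] = {0};                    // assumed bucket count
        const int start = first_row * n_per_row;   // must be a multiple of n_per_row
        return ggml_quantize_chunk(GGML_TYPE_Q5_K, src_f32, dst_buf,
                                   start, nrows, n_per_row, hist, /*imatrix=*/NULL);
    }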
@@ -19162,7 +19240,7 @@ void gguf_free(struct gguf_context * ctx) {

  if (ctx->kv) {
  // free string memory - not great..
- for (uint32_t i = 0; i < ctx->header.n_kv; ++i) {
+ for (uint64_t i = 0; i < ctx->header.n_kv; ++i) {
  struct gguf_kv * kv = &ctx->kv[i];

  if (kv->key.data) {
@@ -19178,7 +19256,7 @@ void gguf_free(struct gguf_context * ctx) {
  if (kv->type == GGUF_TYPE_ARRAY) {
  if (kv->value.arr.data) {
  if (kv->value.arr.type == GGUF_TYPE_STRING) {
- for (uint32_t j = 0; j < kv->value.arr.n; ++j) {
+ for (uint64_t j = 0; j < kv->value.arr.n; ++j) {
  struct gguf_str * str = &((struct gguf_str *) kv->value.arr.data)[j];
  if (str->data) {
  free(str->data);
@@ -19194,7 +19272,7 @@ void gguf_free(struct gguf_context * ctx) {
  }

  if (ctx->infos) {
- for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) {
+ for (uint64_t i = 0; i < ctx->header.n_tensors; ++i) {
  struct gguf_tensor_info * info = &ctx->infos[i];

  if (info->name.data) {
@@ -187,6 +187,16 @@
  # define GGML_API
  #endif

+ #ifdef GGML_MULTIPLATFORM
+ # if defined(_WIN32)
+ # define GGML_CALL
+ # else
+ # define GGML_CALL __attribute__((__ms_abi__))
+ # endif
+ #else
+ # define GGML_CALL
+ #endif
+
  // TODO: support for clang
  #ifdef __GNUC__
  # define GGML_DEPRECATED(func, hint) func __attribute__((deprecated(hint)))
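
Note: GGML_CALL pins a function to the Microsoft x64 calling convention when GGML_MULTIPLATFORM is defined on a non-Windows compiler, and expands to nothing otherwise, presumably so that functions invoked across separately compiled modules agree on one ABI. The effect on one of the declarations changed in the next hunk, as a sketch:

    /* With GGML_MULTIPLATFORM defined and _WIN32 not defined:
     *     GGML_API GGML_CALL size_t ggml_nbytes(const struct ggml_tensor * tensor);
     * preprocesses to roughly
     *     GGML_API size_t __attribute__((__ms_abi__)) ggml_nbytes(const struct ggml_tensor * tensor);
     * Without GGML_MULTIPLATFORM (or on Windows), GGML_CALL disappears entirely. */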
@@ -649,41 +659,41 @@ extern "C" {
  GGML_API void ggml_print_object (const struct ggml_object * obj);
  GGML_API void ggml_print_objects(const struct ggml_context * ctx);

- GGML_API int64_t ggml_nelements (const struct ggml_tensor * tensor);
- GGML_API int64_t ggml_nrows (const struct ggml_tensor * tensor);
- GGML_API size_t ggml_nbytes (const struct ggml_tensor * tensor);
- GGML_API size_t ggml_nbytes_pad (const struct ggml_tensor * tensor); // same as ggml_nbytes() but padded to GGML_MEM_ALIGN
+ GGML_API GGML_CALL int64_t ggml_nelements (const struct ggml_tensor * tensor);
+ GGML_API GGML_CALL int64_t ggml_nrows (const struct ggml_tensor * tensor);
+ GGML_API GGML_CALL size_t ggml_nbytes (const struct ggml_tensor * tensor);
+ GGML_API size_t ggml_nbytes_pad (const struct ggml_tensor * tensor); // same as ggml_nbytes() but padded to GGML_MEM_ALIGN

- GGML_API int ggml_blck_size(enum ggml_type type);
- GGML_API size_t ggml_type_size(enum ggml_type type); // size in bytes for all elements in a block
- GGML_API size_t ggml_row_size (enum ggml_type type, int64_t ne); // size in bytes for all elements in a row
+ GGML_API GGML_CALL int ggml_blck_size(enum ggml_type type);
+ GGML_API GGML_CALL size_t ggml_type_size(enum ggml_type type); // size in bytes for all elements in a block
+ GGML_API GGML_CALL size_t ggml_row_size (enum ggml_type type, int64_t ne); // size in bytes for all elements in a row

  GGML_DEPRECATED(
  GGML_API double ggml_type_sizef(enum ggml_type type), // ggml_type_size()/ggml_blck_size() as float
  "use ggml_row_size() instead");

- GGML_API const char * ggml_type_name(enum ggml_type type);
- GGML_API const char * ggml_op_name (enum ggml_op op);
- GGML_API const char * ggml_op_symbol(enum ggml_op op);
+ GGML_API GGML_CALL const char * ggml_type_name(enum ggml_type type);
+ GGML_API GGML_CALL const char * ggml_op_name (enum ggml_op op);
+ GGML_API const char * ggml_op_symbol(enum ggml_op op);

- GGML_API const char * ggml_unary_op_name(enum ggml_unary_op op);
- GGML_API const char * ggml_op_desc(const struct ggml_tensor * t); // unary or op name
+ GGML_API const char * ggml_unary_op_name(enum ggml_unary_op op);
+ GGML_API GGML_CALL const char * ggml_op_desc(const struct ggml_tensor * t); // unary or op name

- GGML_API size_t ggml_element_size(const struct ggml_tensor * tensor);
+ GGML_API GGML_CALL size_t ggml_element_size(const struct ggml_tensor * tensor);

- GGML_API bool ggml_is_quantized(enum ggml_type type);
+ GGML_API GGML_CALL bool ggml_is_quantized(enum ggml_type type);

  // TODO: temporary until model loading of ggml examples is refactored
  GGML_API enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype);

- GGML_API bool ggml_is_transposed(const struct ggml_tensor * tensor);
- GGML_API bool ggml_is_contiguous(const struct ggml_tensor * tensor);
- GGML_API bool ggml_is_permuted (const struct ggml_tensor * tensor);
- GGML_API bool ggml_is_scalar (const struct ggml_tensor * tensor);
- GGML_API bool ggml_is_vector (const struct ggml_tensor * tensor);
- GGML_API bool ggml_is_matrix (const struct ggml_tensor * tensor);
- GGML_API bool ggml_is_3d (const struct ggml_tensor * tensor);
- GGML_API int ggml_n_dims (const struct ggml_tensor * tensor); // returns 1 for scalars
+ GGML_API GGML_CALL bool ggml_is_transposed(const struct ggml_tensor * tensor);
+ GGML_API GGML_CALL bool ggml_is_contiguous(const struct ggml_tensor * tensor);
+ GGML_API GGML_CALL bool ggml_is_permuted (const struct ggml_tensor * tensor);
+ GGML_API bool ggml_is_scalar (const struct ggml_tensor * tensor);
+ GGML_API bool ggml_is_vector (const struct ggml_tensor * tensor);
+ GGML_API bool ggml_is_matrix (const struct ggml_tensor * tensor);
+ GGML_API bool ggml_is_3d (const struct ggml_tensor * tensor);
+ GGML_API int ggml_n_dims (const struct ggml_tensor * tensor); // returns 1 for scalars

  GGML_API bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor * t1);

@@ -770,7 +780,7 @@ extern "C" {
  GGML_API void * ggml_get_data (const struct ggml_tensor * tensor);
  GGML_API float * ggml_get_data_f32(const struct ggml_tensor * tensor);

- GGML_API enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor);
+ GGML_API GGML_CALL enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor);

  GGML_API const char * ggml_get_name (const struct ggml_tensor * tensor);
  GGML_API struct ggml_tensor * ggml_set_name ( struct ggml_tensor * tensor, const char * name);
@@ -1165,6 +1175,11 @@ extern "C" {
  struct ggml_tensor * a,
  struct ggml_tensor * b);

+ GGML_API struct ggml_tensor * ggml_cast(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ enum ggml_type type);
+
  // make contiguous
  GGML_API struct ggml_tensor * ggml_cont(
  struct ggml_context * ctx,
@@ -1408,7 +1423,7 @@ extern "C" {
  float beta_slow);

  // compute correction dims for YaRN RoPE scaling
- void ggml_rope_yarn_corr_dims(
+ GGML_CALL void ggml_rope_yarn_corr_dims(
  int n_dims, int n_orig_ctx, float freq_base, float beta_fast, float beta_slow, float dims[2]);

  // xPos RoPE, in-place, returns view(a)
@@ -1842,8 +1857,8 @@ extern "C" {

  // ggml_graph_plan() has to be called before ggml_graph_compute()
  // when plan.work_size > 0, caller must allocate memory for plan.work_data
- GGML_API struct ggml_cplan ggml_graph_plan (struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/);
- GGML_API int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
+ GGML_API struct ggml_cplan ggml_graph_plan (const struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/);
+ GGML_API int ggml_graph_compute( struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);

  // same as ggml_graph_compute() but the work data is allocated as a part of the context
  // note: the drawback of this API is that you must have ensured that the context has enough memory for the work data
@@ -2062,10 +2077,13 @@ extern "C" {
  GGML_API size_t ggml_quantize_q4_K(const float * src, void * dst, int n, int k, int64_t * hist);
  GGML_API size_t ggml_quantize_q5_K(const float * src, void * dst, int n, int k, int64_t * hist);
  GGML_API size_t ggml_quantize_q6_K(const float * src, void * dst, int n, int k, int64_t * hist);
- GGML_API size_t ggml_quantize_iq2_xxs(const float * src, void * dst, int n, int k, int64_t * hist);
- GGML_API size_t ggml_quantize_iq2_xs (const float * src, void * dst, int n, int k, int64_t * hist);

- GGML_API size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist);
+ GGML_API size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst,
+ int start, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
+
+ // These are needed for IQ2_XS and IQ2_XXS quantizations
+ GGML_API void ggml_init_iq2_quantization(enum ggml_type type);
+ GGML_API void ggml_deinit_iq2_quantization(enum ggml_type type);

  //
  // Importance matrix
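
Note: per the comment in the hunk above, the IQ2 formats need auxiliary state set up before they can be quantized. A hedged sketch of the expected call order around a quantization pass (the per-tensor loop and the imatrix source are assumptions):

    // Sketch only: bracket IQ2_XS quantization with the new init/deinit calls.
    ggml_init_iq2_quantization(GGML_TYPE_IQ2_XS);
    // ... for each tensor:
    //     ggml_quantize_chunk(GGML_TYPE_IQ2_XS, src, dst, start, nrows, n_per_row, hist, imatrix);
    ggml_deinit_iq2_quantization(GGML_TYPE_IQ2_XS);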