llama_cpp 0.12.0 → 0.12.2

This diff shows the content of publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
@@ -132,7 +132,7 @@ void ggml_print_backtrace(void) {
             "-ex", "bt -frame-info source-and-location",
             "-ex", "detach",
             "-ex", "quit",
-            NULL);
+            (char *) NULL);
     } else {
         waitpid(pid, NULL, 0);
     }
@@ -394,6 +394,12 @@ static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float);
 static void ggml_vec_dot_f32(const int n, float * restrict s, const float * restrict x, const float * restrict y);
 static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t * restrict x, ggml_fp16_t * restrict y);
 
+ggml_collect_imatrix_t g_imatrix_collect = NULL;
+
+void ggml_set_imatrix_collection(ggml_collect_imatrix_t imatrix_collect) {
+    g_imatrix_collect = imatrix_collect;
+}
+
 static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
     [GGML_TYPE_I8] = {
         .type_name = "i8",
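Note: the new g_imatrix_collect hook lets an external tool observe the activations fed into every matrix multiplication; this is how llama.cpp's importance-matrix ("imatrix") tooling gathers the per-column statistics used by the IQ2 quantization added below. A minimal sketch of a collector, assuming only what this diff implies (a callback taking the weight tensor src0 and activation tensor src1, and ggml.h in scope); the accumulation shown (per-column sums of squared activations) is typical for an importance matrix, not necessarily the exact upstream implementation:

    // hypothetical collector; assumes src1 holds contiguous F32 activations
    static float sums[4096]; // illustration only; real code sizes this per weight tensor
    static void my_collect(const struct ggml_tensor * src0, const struct ggml_tensor * src1) {
        (void) src0; // identifies which weight matrix these activations feed into
        const float * x = (const float *) src1->data;
        const int64_t ncols = src1->ne[0] < 4096 ? src1->ne[0] : 4096;
        for (int64_t r = 0; r < ggml_nrows(src1); ++r) {
            for (int64_t c = 0; c < ncols; ++c) {
                sums[c] += x[r*src1->ne[0] + c] * x[r*src1->ne[0] + c];
            }
        }
    }
    // registered once before graph evaluation:
    //     ggml_set_imatrix_collection(my_collect);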
@@ -573,6 +579,28 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .vec_dot = ggml_vec_dot_q6_K_q8_K,
         .vec_dot_type = GGML_TYPE_Q8_K,
     },
+    [GGML_TYPE_IQ2_XXS] = {
+        .type_name = "iq2_xxs",
+        .blck_size = QK_K,
+        .type_size = sizeof(block_iq2_xxs),
+        .is_quantized = true,
+        .to_float = (ggml_to_float_t) dequantize_row_iq2_xxs,
+        .from_float = NULL,
+        .from_float_reference = NULL,
+        .vec_dot = ggml_vec_dot_iq2_xxs_q8_K,
+        .vec_dot_type = GGML_TYPE_Q8_K,
+    },
+    [GGML_TYPE_IQ2_XS] = {
+        .type_name = "iq2_xs",
+        .blck_size = QK_K,
+        .type_size = sizeof(block_iq2_xs),
+        .is_quantized = true,
+        .to_float = (ggml_to_float_t) dequantize_row_iq2_xs,
+        .from_float = NULL,
+        .from_float_reference = NULL,
+        .vec_dot = ggml_vec_dot_iq2_xs_q8_K,
+        .vec_dot_type = GGML_TYPE_Q8_K,
+    },
     [GGML_TYPE_Q8_K] = {
         .type_name = "q8_K",
         .blck_size = QK_K,
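Note: both IQ2 entries leave .from_float and .from_float_reference NULL, so these 2-bit types cannot be produced through the generic per-row quantization path; they can only be dequantized (.to_float) and used in dot products. They are created exclusively via ggml_quantize_chunk later in this diff, which asserts that an importance matrix is supplied. A sketch of checking this at run time, assuming the public accessor ggml_internal_get_type_traits() present in ggml.h of this era (treat the accessor as an assumption):

    // sketch: detect types that lack a direct float -> quantized conversion
    ggml_type_traits_t tt = ggml_internal_get_type_traits(GGML_TYPE_IQ2_XXS);
    if (tt.from_float == NULL) {
        // must go through ggml_quantize_chunk(..., imatrix) instead
    }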
@@ -1962,19 +1990,19 @@ void ggml_print_objects(const struct ggml_context * ctx) {
     GGML_PRINT("%s: --- end ---\n", __func__);
 }
 
-int64_t ggml_nelements(const struct ggml_tensor * tensor) {
+GGML_CALL int64_t ggml_nelements(const struct ggml_tensor * tensor) {
     static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
 
     return tensor->ne[0]*tensor->ne[1]*tensor->ne[2]*tensor->ne[3];
 }
 
-int64_t ggml_nrows(const struct ggml_tensor * tensor) {
+GGML_CALL int64_t ggml_nrows(const struct ggml_tensor * tensor) {
    static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
 
    return tensor->ne[1]*tensor->ne[2]*tensor->ne[3];
 }
 
-size_t ggml_nbytes(const struct ggml_tensor * tensor) {
+GGML_CALL size_t ggml_nbytes(const struct ggml_tensor * tensor) {
     size_t nbytes;
     size_t blck_size = ggml_blck_size(tensor->type);
     if (blck_size == 1) {
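Note: the GGML_CALL annotation added throughout this diff pins these functions to a single calling convention so that ggml built with one compiler can be called from a backend built with another (the motivating case being an nvcc-compiled CUDA backend talking to MinGW-built ggml on Windows). A definition along these lines ships in ggml.h of this release; it is quoted from memory here, so treat GGML_MULTIPLATFORM and the __ms_abi__ attribute as assumptions to verify against the header:

    #ifdef GGML_MULTIPLATFORM
    #    if defined(_WIN32)
    #        define GGML_CALL
    #    else
    #        define GGML_CALL __attribute__((__ms_abi__))
    #    endif
    #else
    #    define GGML_CALL
    #endif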
@@ -1997,15 +2025,15 @@ size_t ggml_nbytes_pad(const struct ggml_tensor * tensor) {
     return GGML_PAD(ggml_nbytes(tensor), GGML_MEM_ALIGN);
 }
 
-int ggml_blck_size(enum ggml_type type) {
+GGML_CALL int ggml_blck_size(enum ggml_type type) {
     return type_traits[type].blck_size;
 }
 
-size_t ggml_type_size(enum ggml_type type) {
+GGML_CALL size_t ggml_type_size(enum ggml_type type) {
     return type_traits[type].type_size;
 }
 
-size_t ggml_row_size(enum ggml_type type, int64_t ne) {
+GGML_CALL size_t ggml_row_size(enum ggml_type type, int64_t ne) {
     assert(ne % ggml_blck_size(type) == 0);
     return ggml_type_size(type)*ne/ggml_blck_size(type);
 }
@@ -2014,15 +2042,15 @@ double ggml_type_sizef(enum ggml_type type) {
     return ((double)(type_traits[type].type_size))/type_traits[type].blck_size;
 }
 
-const char * ggml_type_name(enum ggml_type type) {
+GGML_CALL const char * ggml_type_name(enum ggml_type type) {
     return type_traits[type].type_name;
 }
 
-bool ggml_is_quantized(enum ggml_type type) {
+GGML_CALL bool ggml_is_quantized(enum ggml_type type) {
     return type_traits[type].is_quantized;
 }
 
-const char * ggml_op_name(enum ggml_op op) {
+GGML_CALL const char * ggml_op_name(enum ggml_op op) {
     return GGML_OP_NAME[op];
 }
 
@@ -2034,7 +2062,7 @@ const char * ggml_unary_op_name(enum ggml_unary_op op) {
     return GGML_UNARY_OP_NAME[op];
 }
 
-const char * ggml_op_desc(const struct ggml_tensor * t) {
+GGML_CALL const char * ggml_op_desc(const struct ggml_tensor * t) {
     if (t->op == GGML_OP_UNARY) {
         enum ggml_unary_op uop = ggml_get_unary_op(t);
         return ggml_unary_op_name(uop);
@@ -2044,7 +2072,7 @@ const char * ggml_op_desc(const struct ggml_tensor * t) {
     }
 }
 
-size_t ggml_element_size(const struct ggml_tensor * tensor) {
+GGML_CALL size_t ggml_element_size(const struct ggml_tensor * tensor) {
     return ggml_type_size(tensor->type);
 }
 
@@ -2111,6 +2139,8 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) {
         case GGML_FTYPE_MOSTLY_Q4_K: wtype = GGML_TYPE_Q4_K; break;
         case GGML_FTYPE_MOSTLY_Q5_K: wtype = GGML_TYPE_Q5_K; break;
         case GGML_FTYPE_MOSTLY_Q6_K: wtype = GGML_TYPE_Q6_K; break;
+        case GGML_FTYPE_MOSTLY_IQ2_XXS: wtype = GGML_TYPE_IQ2_XXS; break;
+        case GGML_FTYPE_MOSTLY_IQ2_XS: wtype = GGML_TYPE_IQ2_XS; break;
         case GGML_FTYPE_UNKNOWN: wtype = GGML_TYPE_COUNT; break;
         case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16: wtype = GGML_TYPE_COUNT; break;
     }
@@ -2124,11 +2154,11 @@ size_t ggml_tensor_overhead(void) {
     return GGML_OBJECT_SIZE + GGML_TENSOR_SIZE;
 }
 
-bool ggml_is_transposed(const struct ggml_tensor * tensor) {
+GGML_CALL bool ggml_is_transposed(const struct ggml_tensor * tensor) {
     return tensor->nb[0] > tensor->nb[1];
 }
 
-bool ggml_is_contiguous(const struct ggml_tensor * tensor) {
+GGML_CALL bool ggml_is_contiguous(const struct ggml_tensor * tensor) {
     static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
 
     return
@@ -2147,7 +2177,7 @@ static inline bool ggml_is_contiguous_except_dim_1(const struct ggml_tensor * te
         tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
 }
 
-bool ggml_is_permuted(const struct ggml_tensor * tensor) {
+GGML_CALL bool ggml_is_permuted(const struct ggml_tensor * tensor) {
     static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
 
     return tensor->nb[0] > tensor->nb[1] || tensor->nb[1] > tensor->nb[2] || tensor->nb[2] > tensor->nb[3];
@@ -2324,6 +2354,10 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
 }
 
 void ggml_free(struct ggml_context * ctx) {
+    if (ctx == NULL) {
+        return;
+    }
+
     // make this function thread safe
     ggml_critical_section_start();
 
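Note: ggml_free() now tolerates a NULL context, mirroring free(NULL) semantics, so cleanup paths can call it unconditionally. A minimal sketch:

    struct ggml_context * ctx = ggml_init(params); // may return NULL on failure
    /* ... work ... */
    ggml_free(ctx); // as of this version, a no-op when ctx is NULL, like free(NULL)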
@@ -3045,7 +3079,7 @@ float * ggml_get_data_f32(const struct ggml_tensor * tensor) {
     return (float *)(tensor->data);
 }
 
-enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor) {
+GGML_CALL enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor) {
     GGML_ASSERT(tensor->op == GGML_OP_UNARY);
     return (enum ggml_unary_op) ggml_get_op_params_i32(tensor, 0);
 }
@@ -4299,13 +4333,13 @@ struct ggml_tensor * ggml_set_2d_inplace(
 static struct ggml_tensor * ggml_cpy_impl(
         struct ggml_context * ctx,
         struct ggml_tensor * a,
-        struct ggml_tensor * b,
-        bool inplace) {
+        struct ggml_tensor * b) {
     GGML_ASSERT(ggml_nelements(a) == ggml_nelements(b));
 
     bool is_node = false;
 
-    if (!inplace && (a->grad || b->grad)) {
+    if (a->grad || b->grad) {
+        // inplace is false and either one have a grad
         is_node = true;
     }
 
@@ -4329,29 +4363,38 @@ struct ggml_tensor * ggml_cpy(
         struct ggml_context * ctx,
         struct ggml_tensor * a,
         struct ggml_tensor * b) {
-    return ggml_cpy_impl(ctx, a, b, false);
+    return ggml_cpy_impl(ctx, a, b);
 }
 
-struct ggml_tensor * ggml_cpy_inplace(
+struct ggml_tensor * ggml_cast(
         struct ggml_context * ctx,
-        struct ggml_tensor * a,
-        struct ggml_tensor * b) {
-    return ggml_cpy_impl(ctx, a, b, true);
+        struct ggml_tensor * a,
+        enum ggml_type type) {
+    bool is_node = false;
+
+    struct ggml_tensor * result = ggml_new_tensor(ctx, type, GGML_MAX_DIMS, a->ne);
+    ggml_format_name(result, "%s (copy)", a->name);
+
+    result->op = GGML_OP_CPY;
+    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->src[0] = a;
+    result->src[1] = result;
+
+    return result;
 }
 
 // ggml_cont
 
 static struct ggml_tensor * ggml_cont_impl(
         struct ggml_context * ctx,
-        struct ggml_tensor * a,
-        bool inplace) {
+        struct ggml_tensor * a) {
     bool is_node = false;
 
-    if (!inplace && a->grad) {
+    if (a->grad) {
         is_node = true;
     }
 
-    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
+    struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
     ggml_format_name(result, "%s (cont)", a->name);
 
     result->op = GGML_OP_CONT;
@@ -4364,13 +4407,7 @@ static struct ggml_tensor * ggml_cont_impl(
 struct ggml_tensor * ggml_cont(
         struct ggml_context * ctx,
         struct ggml_tensor * a) {
-    return ggml_cont_impl(ctx, a, false);
-}
-
-struct ggml_tensor * ggml_cont_inplace(
-        struct ggml_context * ctx,
-        struct ggml_tensor * a) {
-    return ggml_cont_impl(ctx, a, true);
+    return ggml_cont_impl(ctx, a);
 }
 
 // make contiguous, with new shape
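Note: ggml_cpy_inplace and ggml_cont_inplace are removed here, and ggml_cast is added: it builds a GGML_OP_CPY node whose destination (note src[1] pointing at the result itself) is a fresh tensor of the requested type, so a type conversion can be expressed directly in a graph. A hedged usage sketch; ctx, a, and gf are assumed to come from the caller:

    // convert an F32 tensor to F16 inside a graph
    struct ggml_tensor * a_f16 = ggml_cast(ctx, a, GGML_TYPE_F16);
    ggml_build_forward_expand(gf, a_f16); // evaluated as a CPY into the new tensor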
@@ -7436,6 +7473,8 @@ static void ggml_compute_forward_add(
         case GGML_TYPE_Q4_K:
         case GGML_TYPE_Q5_K:
         case GGML_TYPE_Q6_K:
+        case GGML_TYPE_IQ2_XXS:
+        case GGML_TYPE_IQ2_XS:
             {
                 ggml_compute_forward_add_q_f32(params, src0, src1, dst);
             } break;
@@ -7700,6 +7739,8 @@ static void ggml_compute_forward_add1(
         case GGML_TYPE_Q4_K:
         case GGML_TYPE_Q5_K:
         case GGML_TYPE_Q6_K:
+        case GGML_TYPE_IQ2_XXS:
+        case GGML_TYPE_IQ2_XS:
             {
                 ggml_compute_forward_add1_q_f32(params, src0, src1, dst);
             } break;
@@ -7814,6 +7855,8 @@ static void ggml_compute_forward_acc(
         case GGML_TYPE_Q4_K:
         case GGML_TYPE_Q5_K:
         case GGML_TYPE_Q6_K:
+        case GGML_TYPE_IQ2_XXS:
+        case GGML_TYPE_IQ2_XS:
         default:
             {
                 GGML_ASSERT(false);
@@ -9704,10 +9747,10 @@ static void ggml_compute_forward_group_norm(
 #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
 // helper function to determine if it is better to use BLAS or not
 // for large matrices, BLAS is faster
-static bool ggml_compute_forward_mul_mat_use_blas(
-        const struct ggml_tensor * src0,
-        const struct ggml_tensor * src1,
-        struct ggml_tensor * dst) {
+static bool ggml_compute_forward_mul_mat_use_blas(struct ggml_tensor * dst) {
+    const struct ggml_tensor * src0 = dst->src[0];
+    const struct ggml_tensor * src1 = dst->src[1];
+
     //const int64_t ne00 = src0->ne[0];
     //const int64_t ne01 = src0->ne[1];
 
@@ -9747,6 +9790,10 @@ static void ggml_compute_forward_mul_mat(
     const int ith = params->ith;
     const int nth = params->nth;
 
+    if (ith == 1 && g_imatrix_collect) {
+        g_imatrix_collect(src0, src1);
+    }
+
     const enum ggml_type type = src0->type;
 
     const bool src1_cont = ggml_is_contiguous(src1);
@@ -9787,7 +9834,7 @@ static void ggml_compute_forward_mul_mat(
 #endif
 
 #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
-    if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
+    if (ggml_compute_forward_mul_mat_use_blas(dst)) {
         if (params->ith != 0) {
             return;
         }
@@ -10050,6 +10097,10 @@ static void ggml_compute_forward_mul_mat_id(
 
         const struct ggml_tensor * src0_cur = dst->src[cur_a + 2];
 
+        if (ith == 1 && g_imatrix_collect) {
+            g_imatrix_collect(src0_cur, src1);
+        }
+
         const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
         const size_t row_size = ggml_row_size(vec_dot_type, ne10);
 
@@ -10455,6 +10506,8 @@ static void ggml_compute_forward_out_prod(
         case GGML_TYPE_Q4_K:
         case GGML_TYPE_Q5_K:
         case GGML_TYPE_Q6_K:
+        case GGML_TYPE_IQ2_XXS:
+        case GGML_TYPE_IQ2_XS:
             {
                 ggml_compute_forward_out_prod_q_f32(params, src0, src1, dst);
             } break;
@@ -10629,6 +10682,8 @@ static void ggml_compute_forward_set(
         case GGML_TYPE_Q4_K:
         case GGML_TYPE_Q5_K:
         case GGML_TYPE_Q6_K:
+        case GGML_TYPE_IQ2_XXS:
+        case GGML_TYPE_IQ2_XS:
         default:
             {
                 GGML_ASSERT(false);
@@ -10823,6 +10878,8 @@ static void ggml_compute_forward_get_rows(
         case GGML_TYPE_Q4_K:
         case GGML_TYPE_Q5_K:
         case GGML_TYPE_Q6_K:
+        case GGML_TYPE_IQ2_XXS:
+        case GGML_TYPE_IQ2_XS:
             {
                 ggml_compute_forward_get_rows_q(params, src0, src1, dst);
             } break;
@@ -11459,6 +11516,8 @@ static void ggml_compute_forward_alibi(
         case GGML_TYPE_Q4_K:
         case GGML_TYPE_Q5_K:
         case GGML_TYPE_Q6_K:
+        case GGML_TYPE_IQ2_XXS:
+        case GGML_TYPE_IQ2_XS:
         case GGML_TYPE_Q8_K:
         case GGML_TYPE_I8:
         case GGML_TYPE_I16:
@@ -11533,6 +11592,8 @@ static void ggml_compute_forward_clamp(
         case GGML_TYPE_Q4_K:
         case GGML_TYPE_Q5_K:
         case GGML_TYPE_Q6_K:
+        case GGML_TYPE_IQ2_XXS:
+        case GGML_TYPE_IQ2_XS:
         case GGML_TYPE_Q8_K:
         case GGML_TYPE_I8:
         case GGML_TYPE_I16:
@@ -11577,7 +11638,22 @@ static float ggml_rope_yarn_corr_dim(int n_dims, int n_orig_ctx, float n_rot, fl
     return n_dims * logf(n_orig_ctx / (n_rot * 2 * (float)M_PI)) / (2 * logf(base));
 }
 
-void ggml_rope_yarn_corr_dims(
+static void ggml_rope_cache_init(
+     float theta_base, float freq_scale, float corr_dims[2], int64_t ne0, float ext_factor, float mscale,
+     float * cache, float sin_sign, float theta_scale
+) {
+    float theta = theta_base;
+    for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
+        rope_yarn(
+            theta, freq_scale, corr_dims, i0, ext_factor, mscale, &cache[i0 + 0], &cache[i0 + 1]
+        );
+        cache[i0 + 1] *= sin_sign;
+
+        theta *= theta_scale;
+    }
+}
+
+GGML_CALL void ggml_rope_yarn_corr_dims(
     int n_dims, int n_orig_ctx, float freq_base, float beta_fast, float beta_slow, float dims[2]
 ) {
     // start and end correction dims
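Note: ggml_rope_cache_init precomputes the (cos, sin) pair for every even i0 once per (i3, i2) slice: theta starts at the row position and is multiplied by theta_scale each pair, so the values depend only on the position, not on the row index i1. The rope hunks below then read cache[i0]/cache[i0 + 1] instead of calling rope_yarn once per row. Each thread gets its own ne0 floats (padded by a cache line) in params->wdata, which is why GGML_OP_ROPE starts reserving work-buffer space in ggml_graph_plan later in this diff:

    // per-thread scratch layout used by the hunks below (taken from the diff):
    float * cache = (float *) params->wdata + (ne0 + CACHE_LINE_SIZE_F32)*ith;
    // and the matching plan-time reservation:
    //     cur = ggml_type_size(GGML_TYPE_F32) * node->ne[0] * n_tasks;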
@@ -11659,6 +11735,12 @@ static void ggml_compute_forward_rope_f32(
     for (int64_t i3 = 0; i3 < ne3; i3++) {
         for (int64_t i2 = 0; i2 < ne2; i2++) {
             const int64_t p = pos[i2];
+
+            float * cache = (float *) params->wdata + (ne0 + CACHE_LINE_SIZE_F32)*ith;
+            if (!is_glm && !is_neox) { // TODO: cache sin/cos for glm, neox
+                ggml_rope_cache_init(p, freq_scale, corr_dims, ne0, ext_factor, attn_factor, cache, sin_sign, theta_scale);
+            }
+
             for (int64_t i1 = 0; i1 < ne1; i1++) {
                 if (ir++ < ir0) continue;
                 if (ir > ir1) break;
11692
11774
  }
11693
11775
  } else if (!is_neox) {
11694
11776
  for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
11695
- float cos_theta, sin_theta;
11696
- rope_yarn(
11697
- theta_base, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta
11698
- );
11699
- sin_theta *= sin_sign;
11777
+ const float cos_theta = cache[i0 + 0];
11778
+ const float sin_theta = cache[i0 + 1];
11700
11779
 
11701
11780
  // zeta scaling for xPos only:
11702
11781
  float zeta = xpos_base != 0.0f ? powf((i0 + 0.4f * ne0) / (1.4f * ne0), p / xpos_base) : 1.0f;
11703
11782
  if (xpos_down) zeta = 1.0f / zeta;
11704
11783
 
11705
- theta_base *= theta_scale;
11706
-
11707
11784
  const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
11708
11785
  float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
11709
11786
 
@@ -11827,6 +11904,12 @@ static void ggml_compute_forward_rope_f16(
11827
11904
  for (int64_t i3 = 0; i3 < ne3; i3++) {
11828
11905
  for (int64_t i2 = 0; i2 < ne2; i2++) {
11829
11906
  const int64_t p = pos[i2];
11907
+
11908
+ float * cache = (float *) params->wdata + (ne0 + CACHE_LINE_SIZE_F32)*ith;
11909
+ if (!is_glm && !is_neox) { // TODO: cache sin/cos for glm, neox
11910
+ ggml_rope_cache_init(p, freq_scale, corr_dims, ne0, ext_factor, attn_factor, cache, sin_sign, theta_scale);
11911
+ }
11912
+
11830
11913
  for (int64_t i1 = 0; i1 < ne1; i1++) {
11831
11914
  if (ir++ < ir0) continue;
11832
11915
  if (ir > ir1) break;
@@ -11860,13 +11943,8 @@ static void ggml_compute_forward_rope_f16(
                     }
                 } else if (!is_neox) {
                     for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
-                        float cos_theta, sin_theta;
-                        rope_yarn(
-                            theta_base, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta
-                        );
-                        sin_theta *= sin_sign;
-
-                        theta_base *= theta_scale;
+                        const float cos_theta = cache[i0 + 0];
+                        const float sin_theta = cache[i0 + 1];
 
                         const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
                         ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
@@ -14831,7 +14909,7 @@ size_t ggml_hash_find_or_insert(struct ggml_hash_set hash_set, struct ggml_tenso
     return i;
 }
 
-static struct ggml_hash_set ggml_hash_set_new(size_t size) {
+struct ggml_hash_set ggml_hash_set_new(size_t size) {
     size = ggml_hash_size(size);
     struct ggml_hash_set result;
     result.size = size;
@@ -16301,24 +16379,6 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
 
                 //n_tasks = MIN(n_threads, MAX(1, nr0/128));
                 //printf("nr0 = %8d, nr1 = %8d, nr0*nr1 = %8d, n_tasks%d\n", nr0, nr1, nr0*nr1, n_tasks);
-
-#if defined(GGML_USE_CUBLAS)
-                if (ggml_cuda_can_mul_mat(node->src[0], node->src[1], node)) {
-                    n_tasks = 1; // TODO: this actually is doing nothing
-                                 //       the threads are still spinning
-                }
-#elif defined(GGML_USE_CLBLAST)
-                if (ggml_cl_can_mul_mat(node->src[0], node->src[1], node)) {
-                    n_tasks = 1; // TODO: this actually is doing nothing
-                                 //       the threads are still spinning
-                }
-#endif
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
-                if (ggml_compute_forward_mul_mat_use_blas(node->src[0], node->src[1], node)) {
-                    n_tasks = 1; // TODO: this actually is doing nothing
-                                 //       the threads are still spinning
-                }
-#endif
             } break;
         case GGML_OP_MUL_MAT_ID:
             {
@@ -16491,6 +16551,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
             state->shared->node_n += 1;
             return (thread_ret_t) GGML_EXIT_ABORTED;
         }
+
         if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) {
             // all other threads are finished and spinning
             // do finalize and init here so we don't have synchronize again
@@ -16556,14 +16617,18 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
         } else {
             // wait for other threads to finish
            const int last = node_n;
+
+            const bool do_yield = last < 0 || cgraph->nodes[last]->op == GGML_OP_MUL_MAT;
+
            while (true) {
                // TODO: this sched_yield can have significant impact on the performance - either positive or negative
                //       depending on the workload and the operating system.
                //       since it is not clear what is the best approach, it should potentially become user-configurable
                //       ref: https://github.com/ggerganov/ggml/issues/291
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
-                sched_yield();
-#endif
+                // UPD: adding the do_yield flag seems to resolve the issue universally
+                if (do_yield) {
+                    sched_yield();
+                }
 
                node_n = atomic_load(&state->shared->node_n);
                if (node_n != last) break;
@@ -16593,7 +16658,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
     return GGML_EXIT_SUCCESS;
 }
 
-struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
+struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threads) {
     if (n_threads <= 0) {
         n_threads = GGML_DEFAULT_N_THREADS;
     }
@@ -16642,7 +16707,7 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
                 } else
 #endif
 #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
-                if (ggml_compute_forward_mul_mat_use_blas(node->src[0], node->src[1], node)) {
+                if (ggml_compute_forward_mul_mat_use_blas(node)) {
                     if (node->src[0]->type != GGML_TYPE_F32) {
                         // here we need memory just for single 2D matrix from src0
                         cur = ggml_type_size(GGML_TYPE_F32)*(node->src[0]->ne[0]*node->src[0]->ne[1]);
@@ -16655,14 +16720,15 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
                 } break;
             case GGML_OP_MUL_MAT_ID:
                 {
+                    cur = 0;
                     const struct ggml_tensor * src0 = node->src[2];
                     const struct ggml_tensor * src1 = node->src[1];
                     const enum ggml_type vec_dot_type = type_traits[src0->type].vec_dot_type;
                     if (src1->type != vec_dot_type) {
-                        cur = ggml_row_size(vec_dot_type, ggml_nelements(src1));
+                        cur += ggml_row_size(vec_dot_type, ggml_nelements(src1));
                    }
                    const int n_as = ggml_get_op_params_i32(node, 1);
-                    cur = GGML_PAD(cur, sizeof(int64_t)); // align
+                    cur += GGML_PAD(cur, sizeof(int64_t)); // align
                    cur += n_as * sizeof(int64_t); // matrix_row_counts
                    cur += n_as * src1->ne[1] * sizeof(int64_t); // matrix_rows
                 } break;
@@ -16673,6 +16739,7 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
                     }
                 } break;
             case GGML_OP_SOFT_MAX:
+            case GGML_OP_ROPE:
                 {
                     cur = ggml_type_size(GGML_TYPE_F32) * node->ne[0] * n_tasks;
                 } break;
@@ -18598,32 +18665,47 @@ size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t *
     return (n/QK8_0*sizeof(block_q8_0));
 }
 
-size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist) {
+size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start,
+        int nrows, int n_per_row, int64_t * hist, const float * imatrix) {
+    (void)imatrix;
     size_t result = 0;
+    int n = nrows * n_per_row;
     switch (type) {
         case GGML_TYPE_Q4_0:
             {
                 GGML_ASSERT(start % QK4_0 == 0);
-                block_q4_0 * block = (block_q4_0*)dst + start / QK4_0;
-                result = ggml_quantize_q4_0(src + start, block, n, n, hist);
+                GGML_ASSERT(start % n_per_row == 0);
+                size_t start_row = start / n_per_row;
+                size_t row_size = ggml_row_size(type, n_per_row);
+                result = quantize_q4_0(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
+                GGML_ASSERT(result == row_size * nrows);
             } break;
         case GGML_TYPE_Q4_1:
             {
                 GGML_ASSERT(start % QK4_1 == 0);
-                block_q4_1 * block = (block_q4_1*)dst + start / QK4_1;
-                result = ggml_quantize_q4_1(src + start, block, n, n, hist);
+                GGML_ASSERT(start % n_per_row == 0);
+                size_t start_row = start / n_per_row;
+                size_t row_size = ggml_row_size(type, n_per_row);
+                result = quantize_q4_1(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
+                GGML_ASSERT(result == row_size * nrows);
             } break;
         case GGML_TYPE_Q5_0:
             {
                 GGML_ASSERT(start % QK5_0 == 0);
-                block_q5_0 * block = (block_q5_0*)dst + start / QK5_0;
-                result = ggml_quantize_q5_0(src + start, block, n, n, hist);
+                GGML_ASSERT(start % n_per_row == 0);
+                size_t start_row = start / n_per_row;
+                size_t row_size = ggml_row_size(type, n_per_row);
+                result = quantize_q5_0(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
+                GGML_ASSERT(result == row_size * nrows);
             } break;
         case GGML_TYPE_Q5_1:
             {
                 GGML_ASSERT(start % QK5_1 == 0);
-                block_q5_1 * block = (block_q5_1*)dst + start / QK5_1;
-                result = ggml_quantize_q5_1(src + start, block, n, n, hist);
+                GGML_ASSERT(start % n_per_row == 0);
+                size_t start_row = start / n_per_row;
+                size_t row_size = ggml_row_size(type, n_per_row);
+                result = quantize_q5_1(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
+                GGML_ASSERT(result == row_size * nrows);
             } break;
         case GGML_TYPE_Q8_0:
             {
@@ -18634,32 +18716,67 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i
         case GGML_TYPE_Q2_K:
             {
                 GGML_ASSERT(start % QK_K == 0);
-                block_q2_K * block = (block_q2_K*)dst + start / QK_K;
-                result = ggml_quantize_q2_K(src + start, block, n, n, hist);
+                GGML_ASSERT(start % n_per_row == 0);
+                size_t start_row = start / n_per_row;
+                size_t row_size = ggml_row_size(type, n_per_row);
+                result = quantize_q2_K(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
+                GGML_ASSERT(result == row_size * nrows);
             } break;
         case GGML_TYPE_Q3_K:
             {
                 GGML_ASSERT(start % QK_K == 0);
-                block_q3_K * block = (block_q3_K*)dst + start / QK_K;
-                result = ggml_quantize_q3_K(src + start, block, n, n, hist);
+                GGML_ASSERT(start % n_per_row == 0);
+                size_t start_row = start / n_per_row;
+                size_t row_size = ggml_row_size(type, n_per_row);
+                result = quantize_q3_K(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
+                GGML_ASSERT(result == row_size * nrows);
             } break;
         case GGML_TYPE_Q4_K:
             {
                 GGML_ASSERT(start % QK_K == 0);
-                block_q4_K * block = (block_q4_K*)dst + start / QK_K;
-                result = ggml_quantize_q4_K(src + start, block, n, n, hist);
+                GGML_ASSERT(start % n_per_row == 0);
+                size_t start_row = start / n_per_row;
+                size_t row_size = ggml_row_size(type, n_per_row);
+                result = quantize_q4_K(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
+                GGML_ASSERT(result == row_size * nrows);
             } break;
         case GGML_TYPE_Q5_K:
             {
                 GGML_ASSERT(start % QK_K == 0);
-                block_q5_K * block = (block_q5_K*)dst + start / QK_K;
-                result = ggml_quantize_q5_K(src + start, block, n, n, hist);
+                GGML_ASSERT(start % n_per_row == 0);
+                size_t start_row = start / n_per_row;
+                size_t row_size = ggml_row_size(type, n_per_row);
+                result = quantize_q5_K(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
+                GGML_ASSERT(result == row_size * nrows);
             } break;
         case GGML_TYPE_Q6_K:
             {
                 GGML_ASSERT(start % QK_K == 0);
-                block_q6_K * block = (block_q6_K*)dst + start / QK_K;
-                result = ggml_quantize_q6_K(src + start, block, n, n, hist);
+                GGML_ASSERT(start % n_per_row == 0);
+                size_t start_row = start / n_per_row;
+                size_t row_size = ggml_row_size(type, n_per_row);
+                result = quantize_q6_K(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
+                GGML_ASSERT(result == row_size * nrows);
+            } break;
+        case GGML_TYPE_IQ2_XXS:
+            {
+                GGML_ASSERT(start % QK_K == 0);
+                GGML_ASSERT(start % n_per_row == 0);
+                GGML_ASSERT(imatrix);
+                size_t start_row = start / n_per_row;
+                size_t row_size = ggml_row_size(type, n_per_row);
+                result = quantize_iq2_xxs(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
+                GGML_ASSERT(result == row_size * nrows);
+            } break;
+        case GGML_TYPE_IQ2_XS:
+            {
+                GGML_ASSERT(start % QK_K == 0);
+                GGML_ASSERT(start % n_per_row == 0);
+                GGML_ASSERT(imatrix);
+                size_t start_row = start / n_per_row;
+                size_t row_size = ggml_row_size(type, n_per_row);
+                result = quantize_iq2_xs(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
+                GGML_ASSERT(result == row_size * nrows);
             } break;
         case GGML_TYPE_F16:
             {
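Note: ggml_quantize_chunk changes signature here: the single element count n becomes nrows x n_per_row, and a per-column importance matrix can be passed (required for the IQ2 types, optional elsewhere). A hedged usage sketch; buffer names and sizes are illustrative:

    // quantize a whole 2D weight matrix; imatrix may be NULL except for IQ2_* types
    int64_t hist[16] = {0};
    const int nrows = 32, n_per_row = 4096; // illustration
    size_t written = ggml_quantize_chunk(GGML_TYPE_Q4_K, src_f32, dst_buf,
                                         /*start=*/0, nrows, n_per_row, hist, /*imatrix=*/NULL);
    // written == ggml_row_size(GGML_TYPE_Q4_K, n_per_row) * nrows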
@@ -19016,8 +19133,8 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
             (int64_t) info->ne[3];
 
         if (ne % ggml_blck_size(info->type) != 0) {
-            fprintf(stderr, "%s: tensor '%s' number of elements (%" PRId64 ") is not a multiple of block size (%d)\n",
-                    __func__, info->name.data, ne, ggml_blck_size(info->type));
+            fprintf(stderr, "%s: tensor '%s' of type %d (%s) number of elements (%" PRId64 ") is not a multiple of block size (%d)\n",
+                    __func__, info->name.data, (int)info->type, ggml_type_name(info->type), ne, ggml_blck_size(info->type));
             fclose(file);
             gguf_free(ctx);
             return NULL;
@@ -19123,7 +19240,7 @@ void gguf_free(struct gguf_context * ctx) {
 
     if (ctx->kv) {
         // free string memory - not great..
-        for (uint32_t i = 0; i < ctx->header.n_kv; ++i) {
+        for (uint64_t i = 0; i < ctx->header.n_kv; ++i) {
             struct gguf_kv * kv = &ctx->kv[i];
 
             if (kv->key.data) {
@@ -19139,7 +19256,7 @@ void gguf_free(struct gguf_context * ctx) {
             if (kv->type == GGUF_TYPE_ARRAY) {
                 if (kv->value.arr.data) {
                     if (kv->value.arr.type == GGUF_TYPE_STRING) {
-                        for (uint32_t j = 0; j < kv->value.arr.n; ++j) {
+                        for (uint64_t j = 0; j < kv->value.arr.n; ++j) {
                             struct gguf_str * str = &((struct gguf_str *) kv->value.arr.data)[j];
                             if (str->data) {
                                 free(str->data);
@@ -19155,7 +19272,7 @@ void gguf_free(struct gguf_context * ctx) {
     }
 
     if (ctx->infos) {
-        for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) {
+        for (uint64_t i = 0; i < ctx->header.n_tensors; ++i) {
             struct gguf_tensor_info * info = &ctx->infos[i];
 
             if (info->name.data) {
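Note: the loop counters in gguf_free widen from uint32_t to uint64_t to match the GGUF header, where the kv and tensor counts are 64-bit. With a 32-bit counter, a count above UINT32_MAX makes the comparison (the counter is promoted to uint64_t) never fail after the counter wraps, i.e. an infinite loop that revisits elements. A hazard sketch, assuming <stdint.h>:

    uint64_t n = (uint64_t) UINT32_MAX + 2;
    for (uint32_t i = 0; i < n; ++i) {
        // i wraps to 0 after UINT32_MAX, so the condition never fails: infinite loop
    }
    for (uint64_t i = 0; i < n; ++i) {
        // correct: terminates after n iterations
    }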