llama_cpp 0.1.3 → 0.2.0 — diff of the vendored ggml.c (k-quant types, Metal support, CLBLAST/CUDA dispatch changes, graph export/import)

@@ -3,6 +3,10 @@
3
3
 
4
4
  #include "ggml.h"
5
5
 
6
+ #ifdef GGML_USE_K_QUANTS
7
+ #include "k_quants.h"
8
+ #endif
9
+
6
10
  #if defined(_MSC_VER) || defined(__MINGW32__)
7
11
  #include <malloc.h> // using malloc.h with MSC/MINGW
8
12
  #elif !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__)
@@ -21,6 +25,10 @@
21
25
  #include <float.h>
22
26
  #include <limits.h>
23
27
 
28
+ #ifdef GGML_USE_METAL
29
+ #include <unistd.h>
30
+ #endif
31
+
24
32
  // if C99 - static_assert is noop
25
33
  // ref: https://stackoverflow.com/a/53923785/4039976
26
34
  #ifndef static_assert
@@ -121,7 +129,11 @@ typedef void* thread_ret_t;
121
129
  #else
122
130
  inline static void* ggml_aligned_malloc(size_t size) {
123
131
  void* aligned_memory = NULL;
132
+ #ifdef GGML_USE_METAL
133
+ int result = posix_memalign(&aligned_memory, getpagesize(), size);
134
+ #else
124
135
  int result = posix_memalign(&aligned_memory, GGML_MEM_ALIGN, size);
136
+ #endif
125
137
  if (result != 0) {
126
138
  // Handle allocation failure
127
139
  return NULL;
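
The Metal-specific branch above switches the context allocation from GGML_MEM_ALIGN to page alignment. The likely reason (not stated in the diff) is that the same buffer can later be wrapped in a no-copy Metal buffer, which requires a page-aligned base address. A minimal sketch of that allocation path, with an illustrative wrapper name that is not part of ggml:

    #include <stdlib.h>
    #include <unistd.h>

    // Illustrative stand-in for the GGML_USE_METAL branch above: allocate the context
    // buffer so its base address is page-aligned rather than GGML_MEM_ALIGN-aligned.
    static void * page_aligned_malloc(size_t size) {
        void * ptr = NULL;
        // getpagesize() is a power of two and a multiple of sizeof(void *), so it is a
        // valid posix_memalign alignment (typically 4096, or 16384 on Apple Silicon).
        if (posix_memalign(&ptr, (size_t) getpagesize(), size) != 0) {
            return NULL; // allocation failure, same contract as ggml_aligned_malloc
        }
        return ptr;
    }
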
@@ -186,10 +198,12 @@ typedef double ggml_float;
186
198
  #if defined(_MSC_VER) || defined(__MINGW32__)
187
199
  #include <intrin.h>
188
200
  #else
201
+ #if !defined(__riscv)
189
202
  #include <immintrin.h>
190
203
  #endif
191
204
  #endif
192
205
  #endif
206
+ #endif
193
207
 
194
208
  #ifdef __F16C__
195
209
 
@@ -401,21 +415,27 @@ void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, size_t n) {
401
415
  //
402
416
 
403
417
  #if defined(_MSC_VER) || defined(__MINGW32__)
404
- static int64_t timer_freq;
418
+ static int64_t timer_freq, timer_start;
405
419
  void ggml_time_init(void) {
406
- LARGE_INTEGER frequency;
407
- QueryPerformanceFrequency(&frequency);
408
- timer_freq = frequency.QuadPart;
420
+ LARGE_INTEGER t;
421
+ QueryPerformanceFrequency(&t);
422
+ timer_freq = t.QuadPart;
423
+
424
+ // The multiplication by 1000 or 1000000 below can cause an overflow if timer_freq
425
+ // and the uptime is high enough.
426
+ // We subtract the program start time to reduce the likelihood of that happening.
427
+ QueryPerformanceCounter(&t);
428
+ timer_start = t.QuadPart;
409
429
  }
410
430
  int64_t ggml_time_ms(void) {
411
431
  LARGE_INTEGER t;
412
432
  QueryPerformanceCounter(&t);
413
- return (t.QuadPart * 1000) / timer_freq;
433
+ return ((t.QuadPart-timer_start) * 1000) / timer_freq;
414
434
  }
415
435
  int64_t ggml_time_us(void) {
416
436
  LARGE_INTEGER t;
417
437
  QueryPerformanceCounter(&t);
418
- return (t.QuadPart * 1000000) / timer_freq;
438
+ return ((t.QuadPart-timer_start) * 1000000) / timer_freq;
419
439
  }
420
440
  #else
421
441
  void ggml_time_init(void) {}
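
The new timer_start subtraction guards the millisecond/microsecond conversions against int64 overflow. Assuming a typical 10 MHz QueryPerformanceCounter frequency (the real value comes from QueryPerformanceFrequency), the old t.QuadPart * 1000000 expression overflows once the counter exceeds INT64_MAX / 1000000 ticks, i.e. after roughly 10.7 days of machine uptime; measuring relative to program start makes that budget apply to process runtime instead. A quick back-of-envelope check:

    #include <stdint.h>
    #include <stdio.h>

    int main(void) {
        const int64_t timer_freq = 10000000;            // assumed 10 MHz QPC frequency
        const int64_t max_ticks  = INT64_MAX / 1000000; // largest tick count that survives "* 1000000"
        // seconds of uptime after which the old "t.QuadPart * 1000000" expression overflows
        printf("overflow after %.1f days\n", (double) max_ticks / timer_freq / 86400.0);
        // with the fix, the same budget applies to (now - timer_start), i.e. process runtime
        return 0;
    }
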
@@ -472,6 +492,8 @@ static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float);
472
492
  // quantization
473
493
  //
474
494
 
495
+ #define MM256_SET_M128I(a, b) _mm256_insertf128_si256(_mm256_castsi128_si256(b), (a), 1)
496
+
475
497
  #if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__)
476
498
  // multiply int8_t, add results pairwise twice
477
499
  static inline __m128i mul_sum_i8_pairs(const __m128i x, const __m128i y) {
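
MM256_SET_M128I(a, b) assembles a 256-bit vector with b in the low 128-bit lane and a in the high lane. It replaces _mm256_set_m128i, which some older compilers do not provide, with the universally available cast + insert pair. A small lane-order sanity check (standalone; needs an AVX-capable build flag such as -mavx):

    #include <immintrin.h>
    #include <stdio.h>

    #define MM256_SET_M128I(a, b) _mm256_insertf128_si256(_mm256_castsi128_si256(b), (a), 1)

    int main(void) {
        const __m128i lo = _mm_set1_epi32(1);
        const __m128i hi = _mm_set1_epi32(2);
        const __m256i v  = MM256_SET_M128I(hi, lo); // high half first, matching _mm256_set_m128i

        int out[8];
        _mm256_storeu_si256((__m256i *) out, v);
        printf("%d %d %d %d %d %d %d %d\n", out[0], out[1], out[2], out[3], out[4], out[5], out[6], out[7]);
        // expected: 1 1 1 1 2 2 2 2  (low lane = lo, high lane = hi)
        return 0;
    }
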
@@ -531,7 +553,7 @@ static inline __m256i bytes_from_bits_32(const uint8_t * x) {
531
553
  static inline __m256i bytes_from_nibbles_32(const uint8_t * rsi)
532
554
  {
533
555
  const __m128i tmp = _mm_loadu_si128((const __m128i *)rsi);
534
- const __m256i bytes = _mm256_set_m128i(_mm_srli_epi16(tmp, 4), tmp);
556
+ const __m256i bytes = MM256_SET_M128I(_mm_srli_epi16(tmp, 4), tmp);
535
557
  const __m256i lowMask = _mm256_set1_epi8( 0xF );
536
558
  return _mm256_and_si256(lowMask, bytes);
537
559
  }
@@ -604,7 +626,7 @@ static inline __m256i bytes_from_bits_32(const uint8_t * x) {
604
626
  bytesh = _mm_or_si128(bytesh, bit_mask);
605
627
  bytesl = _mm_cmpeq_epi8(bytesl, _mm_set1_epi64x(-1));
606
628
  bytesh = _mm_cmpeq_epi8(bytesh, _mm_set1_epi64x(-1));
607
- return _mm256_set_m128i(bytesh, bytesl);
629
+ return MM256_SET_M128I(bytesh, bytesl);
608
630
  }
609
631
 
610
632
  // Unpack 32 4-bit fields into 32 bytes
@@ -617,7 +639,7 @@ static inline __m256i bytes_from_nibbles_32(const uint8_t * rsi)
617
639
  const __m128i lowMask = _mm_set1_epi8(0xF);
618
640
  tmpl = _mm_and_si128(lowMask, tmpl);
619
641
  tmph = _mm_and_si128(lowMask, tmph);
620
- return _mm256_set_m128i(tmph, tmpl);
642
+ return MM256_SET_M128I(tmph, tmpl);
621
643
  }
622
644
 
623
645
  // add int16_t pairwise and return as float vector
@@ -625,7 +647,7 @@ static inline __m256 sum_i16_pairs_float(const __m128i xh, const __m128i xl) {
625
647
  const __m128i ones = _mm_set1_epi16(1);
626
648
  const __m128i summed_pairsl = _mm_madd_epi16(ones, xl);
627
649
  const __m128i summed_pairsh = _mm_madd_epi16(ones, xh);
628
- const __m256i summed_pairs = _mm256_set_m128i(summed_pairsh, summed_pairsl);
650
+ const __m256i summed_pairs = MM256_SET_M128I(summed_pairsh, summed_pairsl);
629
651
  return _mm256_cvtepi32_ps(summed_pairs);
630
652
  }
631
653
 
@@ -1563,6 +1585,48 @@ static const quantize_fns_t quantize_fns[GGML_TYPE_COUNT] = {
1563
1585
  .vec_dot_q = NULL, // TODO
1564
1586
  .vec_dot_type = GGML_TYPE_Q8_1,
1565
1587
  },
1588
+ #ifdef GGML_USE_K_QUANTS
1589
+ [GGML_TYPE_Q2_K] = {
1590
+ .dequantize_row_q = (dequantize_row_q_t) dequantize_row_q2_K,
1591
+ .quantize_row_q = quantize_row_q2_K,
1592
+ .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q2_K_reference,
1593
+ .quantize_row_q_dot = quantize_row_q8_K,
1594
+ .vec_dot_q = ggml_vec_dot_q2_K_q8_K,
1595
+ .vec_dot_type = GGML_TYPE_Q8_K,
1596
+ },
1597
+ [GGML_TYPE_Q3_K] = {
1598
+ .dequantize_row_q = (dequantize_row_q_t) dequantize_row_q3_K,
1599
+ .quantize_row_q = quantize_row_q3_K,
1600
+ .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q3_K_reference,
1601
+ .quantize_row_q_dot = quantize_row_q8_K,
1602
+ .vec_dot_q = ggml_vec_dot_q3_K_q8_K,
1603
+ .vec_dot_type = GGML_TYPE_Q8_K,
1604
+ },
1605
+ [GGML_TYPE_Q4_K] = {
1606
+ .dequantize_row_q = (dequantize_row_q_t) dequantize_row_q4_K,
1607
+ .quantize_row_q = quantize_row_q4_K,
1608
+ .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q4_K_reference,
1609
+ .quantize_row_q_dot = quantize_row_q8_K,
1610
+ .vec_dot_q = ggml_vec_dot_q4_K_q8_K,
1611
+ .vec_dot_type = GGML_TYPE_Q8_K,
1612
+ },
1613
+ [GGML_TYPE_Q5_K] = {
1614
+ .dequantize_row_q = (dequantize_row_q_t) dequantize_row_q5_K,
1615
+ .quantize_row_q = quantize_row_q5_K,
1616
+ .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q5_K_reference,
1617
+ .quantize_row_q_dot = quantize_row_q8_K,
1618
+ .vec_dot_q = ggml_vec_dot_q5_K_q8_K,
1619
+ .vec_dot_type = GGML_TYPE_Q8_K,
1620
+ },
1621
+ [GGML_TYPE_Q6_K] = {
1622
+ .dequantize_row_q = (dequantize_row_q_t) dequantize_row_q6_K,
1623
+ .quantize_row_q = quantize_row_q6_K,
1624
+ .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q6_K_reference,
1625
+ .quantize_row_q_dot = quantize_row_q8_K,
1626
+ .vec_dot_q = ggml_vec_dot_q6_K_q8_K,
1627
+ .vec_dot_type = GGML_TYPE_Q8_K,
1628
+ },
1629
+ #endif
1566
1630
  };
1567
1631
 
1568
1632
  // For internal test use
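
The entries added above plug the new k-quant types into ggml's per-type dispatch table: each type names its (de)quantization routines, its dot-product kernel, and the companion type (vec_dot_type, here GGML_TYPE_Q8_K) that activation rows are quantized to before the dot product. The sketch below reproduces only the shape of that mechanism with invented toy types, simplified signatures and stub bodies; it is not ggml's actual API:

    #include <stddef.h>

    // Toy analogue of ggml's quantize_fns table; names and signatures are invented
    // for illustration and deliberately simpler than the real dequantize_row_q_t & co.
    typedef void (*quantize_row_fn)(const float * x, void * y, int k);
    typedef void (*vec_dot_fn)(int n, float * s, const void * x, const void * y);

    enum toy_type { TOY_TYPE_Q4 = 0, TOY_TYPE_Q8, TOY_TYPE_COUNT };

    struct toy_quantize_fns {
        quantize_row_fn quantize_row_q;     // quantize a row of weights
        quantize_row_fn quantize_row_q_dot; // quantize a row of activations to vec_dot_type
        vec_dot_fn      vec_dot_q;          // dot product of the two quantized rows
        enum toy_type   vec_dot_type;       // type expected on the activation side
    };

    static void toy_quantize_q4(const float * x, void * y, int k) { (void) x; (void) y; (void) k; }
    static void toy_quantize_q8(const float * x, void * y, int k) { (void) x; (void) y; (void) k; }
    static void toy_vec_dot_q4_q8(int n, float * s, const void * x, const void * y) {
        (void) n; (void) x; (void) y; *s = 0.0f;
    }

    static const struct toy_quantize_fns toy_fns[TOY_TYPE_COUNT] = {
        [TOY_TYPE_Q4] = {
            .quantize_row_q     = toy_quantize_q4,
            .quantize_row_q_dot = toy_quantize_q8,
            .vec_dot_q          = toy_vec_dot_q4_q8,
            .vec_dot_type       = TOY_TYPE_Q8,
        },
    };

    // A quantized mat-vec then boils down to: quantize the activation row once with
    // quantize_row_q_dot, and call vec_dot_q against every (pre-quantized) weight row.
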
@@ -2288,7 +2352,7 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
2288
2352
  const __m128i i32_1 = mul_sum_i8_pairs(bx, by);
2289
2353
 
2290
2354
  // Convert int32_t to float
2291
- __m256 p = _mm256_cvtepi32_ps(_mm256_set_m128i(i32_0, i32_1));
2355
+ __m256 p = _mm256_cvtepi32_ps(MM256_SET_M128I(i32_0, i32_1));
2292
2356
 
2293
2357
  // Apply the scale, and accumulate
2294
2358
  acc = _mm256_add_ps(_mm256_mul_ps( d, p ), acc);
@@ -2764,7 +2828,7 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void *
2764
2828
  __m128i bxh = _mm256_extractf128_si256(bx, 1);
2765
2829
  bxl = _mm_or_si128(bxl, bxhil);
2766
2830
  bxh = _mm_or_si128(bxh, bxhih);
2767
- bx = _mm256_set_m128i(bxh, bxl);
2831
+ bx = MM256_SET_M128I(bxh, bxl);
2768
2832
 
2769
2833
  const __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs);
2770
2834
 
@@ -3020,7 +3084,7 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *
3020
3084
  __m128i bxh = _mm256_extractf128_si256(bx, 1);
3021
3085
  bxl = _mm_or_si128(bxl, bxhil);
3022
3086
  bxh = _mm_or_si128(bxh, bxhih);
3023
- bx = _mm256_set_m128i(bxh, bxl);
3087
+ bx = MM256_SET_M128I(bxh, bxl);
3024
3088
 
3025
3089
  const __m256 dy = _mm256_set1_ps(y[i].d);
3026
3090
  const __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs);
@@ -3442,11 +3506,19 @@ static const int GGML_BLCK_SIZE[GGML_TYPE_COUNT] = {
3442
3506
  [GGML_TYPE_Q5_1] = QK5_1,
3443
3507
  [GGML_TYPE_Q8_0] = QK8_0,
3444
3508
  [GGML_TYPE_Q8_1] = QK8_1,
3509
+ #ifdef GGML_USE_K_QUANTS
3510
+ [GGML_TYPE_Q2_K] = QK_K,
3511
+ [GGML_TYPE_Q3_K] = QK_K,
3512
+ [GGML_TYPE_Q4_K] = QK_K,
3513
+ [GGML_TYPE_Q5_K] = QK_K,
3514
+ [GGML_TYPE_Q6_K] = QK_K,
3515
+ [GGML_TYPE_Q8_K] = QK_K,
3516
+ #endif
3445
3517
  [GGML_TYPE_I8] = 1,
3446
3518
  [GGML_TYPE_I16] = 1,
3447
3519
  [GGML_TYPE_I32] = 1,
3448
3520
  };
3449
- static_assert(GGML_TYPE_COUNT == 13, "GGML_BLCK_SIZE is outdated");
3521
+ static_assert(GGML_TYPE_COUNT == 19, "GGML_BLCK_SIZE is outdated");
3450
3522
 
3451
3523
  static const size_t GGML_TYPE_SIZE[GGML_TYPE_COUNT] = {
3452
3524
  [GGML_TYPE_F32] = sizeof(float),
@@ -3457,11 +3529,19 @@ static const size_t GGML_TYPE_SIZE[GGML_TYPE_COUNT] = {
3457
3529
  [GGML_TYPE_Q5_1] = sizeof(block_q5_1),
3458
3530
  [GGML_TYPE_Q8_0] = sizeof(block_q8_0),
3459
3531
  [GGML_TYPE_Q8_1] = sizeof(block_q8_1),
3532
+ #ifdef GGML_USE_K_QUANTS
3533
+ [GGML_TYPE_Q2_K] = sizeof(block_q2_K),
3534
+ [GGML_TYPE_Q3_K] = sizeof(block_q3_K),
3535
+ [GGML_TYPE_Q4_K] = sizeof(block_q4_K),
3536
+ [GGML_TYPE_Q5_K] = sizeof(block_q5_K),
3537
+ [GGML_TYPE_Q6_K] = sizeof(block_q6_K),
3538
+ [GGML_TYPE_Q8_K] = sizeof(block_q8_K),
3539
+ #endif
3460
3540
  [GGML_TYPE_I8] = sizeof(int8_t),
3461
3541
  [GGML_TYPE_I16] = sizeof(int16_t),
3462
3542
  [GGML_TYPE_I32] = sizeof(int32_t),
3463
3543
  };
3464
- static_assert(GGML_TYPE_COUNT == 13, "GGML_TYPE_SIZE is outdated");
3544
+ static_assert(GGML_TYPE_COUNT == 19, "GGML_TYPE_SIZE is outdated");
3465
3545
 
3466
3546
 
3467
3547
  static const char * GGML_TYPE_NAME[GGML_TYPE_COUNT] = {
@@ -3473,11 +3553,17 @@ static const char * GGML_TYPE_NAME[GGML_TYPE_COUNT] = {
3473
3553
  [GGML_TYPE_Q5_1] = "q5_1",
3474
3554
  [GGML_TYPE_Q8_0] = "q8_0",
3475
3555
  [GGML_TYPE_Q8_1] = "q8_1",
3556
+ [GGML_TYPE_Q2_K] = "q2_K",
3557
+ [GGML_TYPE_Q3_K] = "q3_K",
3558
+ [GGML_TYPE_Q4_K] = "q4_K",
3559
+ [GGML_TYPE_Q5_K] = "q5_K",
3560
+ [GGML_TYPE_Q6_K] = "q6_K",
3561
+ [GGML_TYPE_Q8_K] = "q8_K",
3476
3562
  [GGML_TYPE_I8] = "i8",
3477
3563
  [GGML_TYPE_I16] = "i16",
3478
3564
  [GGML_TYPE_I32] = "i32",
3479
3565
  };
3480
- static_assert(GGML_TYPE_COUNT == 13, "GGML_TYPE_NAME is outdated");
3566
+ static_assert(GGML_TYPE_COUNT == 19, "GGML_TYPE_NAME is outdated");
3481
3567
 
3482
3568
  static bool GGML_IS_QUANTIZED[GGML_TYPE_COUNT] = {
3483
3569
  [GGML_TYPE_F32] = false,
@@ -3488,13 +3574,19 @@ static bool GGML_IS_QUANTIZED[GGML_TYPE_COUNT] = {
3488
3574
  [GGML_TYPE_Q5_1] = true,
3489
3575
  [GGML_TYPE_Q8_0] = true,
3490
3576
  [GGML_TYPE_Q8_1] = true,
3577
+ [GGML_TYPE_Q2_K] = true,
3578
+ [GGML_TYPE_Q3_K] = true,
3579
+ [GGML_TYPE_Q4_K] = true,
3580
+ [GGML_TYPE_Q5_K] = true,
3581
+ [GGML_TYPE_Q6_K] = true,
3582
+ [GGML_TYPE_Q8_K] = true,
3491
3583
  [GGML_TYPE_I8] = false,
3492
3584
  [GGML_TYPE_I16] = false,
3493
3585
  [GGML_TYPE_I32] = false,
3494
3586
  };
3495
- static_assert(GGML_TYPE_COUNT == 13, "GGML_IS_QUANTIZED is outdated");
3587
+ static_assert(GGML_TYPE_COUNT == 19, "GGML_IS_QUANTIZED is outdated");
3496
3588
 
3497
- static const char * GGML_OP_LABEL[GGML_OP_COUNT] = {
3589
+ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
3498
3590
  "NONE",
3499
3591
 
3500
3592
  "DUP",
@@ -3629,6 +3721,7 @@ struct ggml_context {
3629
3721
  void * mem_buffer;
3630
3722
  bool mem_buffer_owned;
3631
3723
  bool no_alloc;
3724
+ bool no_alloc_save; // this is used to save the no_alloc state when using scratch buffers
3632
3725
 
3633
3726
  int n_objects;
3634
3727
 
@@ -3645,26 +3738,6 @@ struct ggml_context_container {
3645
3738
  struct ggml_context context;
3646
3739
  };
3647
3740
 
3648
- //
3649
- // compute types
3650
- //
3651
-
3652
- enum ggml_task_type {
3653
- GGML_TASK_INIT = 0,
3654
- GGML_TASK_COMPUTE,
3655
- GGML_TASK_FINALIZE,
3656
- };
3657
-
3658
- struct ggml_compute_params {
3659
- enum ggml_task_type type;
3660
-
3661
- int ith, nth;
3662
-
3663
- // work buffer for all threads
3664
- size_t wsize;
3665
- void * wdata;
3666
- };
3667
-
3668
3741
  //
3669
3742
  // ggml state
3670
3743
  //
@@ -3721,7 +3794,7 @@ int64_t ggml_nelements(const struct ggml_tensor * tensor) {
3721
3794
  return tensor->ne[0]*tensor->ne[1]*tensor->ne[2]*tensor->ne[3];
3722
3795
  }
3723
3796
 
3724
- int ggml_nrows(const struct ggml_tensor * tensor) {
3797
+ int64_t ggml_nrows(const struct ggml_tensor * tensor) {
3725
3798
  static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
3726
3799
 
3727
3800
  return tensor->ne[1]*tensor->ne[2]*tensor->ne[3];
@@ -3730,7 +3803,20 @@ int ggml_nrows(const struct ggml_tensor * tensor) {
3730
3803
  size_t ggml_nbytes(const struct ggml_tensor * tensor) {
3731
3804
  static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
3732
3805
 
3733
- return (ggml_nelements(tensor)*GGML_TYPE_SIZE[tensor->type])/GGML_BLCK_SIZE[tensor->type];
3806
+ // this should handle cases where the tensor is not contiguous in memory
3807
+ // probaby just:
3808
+ //
3809
+ // return tensor->ne[3]*tensor->nb[3]
3810
+ //
3811
+ // is enough, but just in case, adding the second part
3812
+
3813
+ return MAX(tensor->ne[3]*tensor->nb[3], (ggml_nelements(tensor)*GGML_TYPE_SIZE[tensor->type])/GGML_BLCK_SIZE[tensor->type]);
3814
+ }
3815
+
3816
+ size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split) {
3817
+ static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
3818
+
3819
+ return (nrows_split*tensor->ne[0]*GGML_TYPE_SIZE[tensor->type])/GGML_BLCK_SIZE[tensor->type];
3734
3820
  }
3735
3821
 
3736
3822
  int ggml_blck_size(enum ggml_type type) {
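
The reworked ggml_nbytes takes the maximum of ne[3]*nb[3] (correct for non-contiguous layouts) and the old elements × type-size / block-size formula, and the new ggml_nbytes_split sizes a horizontal slice of nrows_split rows. For a plain contiguous quantized row the second term is the operative one; for example Q4_0 stores 32 weights per 18-byte block (one fp16 scale plus 16 packed nibbles), so a 4096-wide row takes 4096/32 × 18 = 2304 bytes. A small check of that arithmetic (the block layout is hard-coded here, not queried from ggml):

    #include <stdio.h>
    #include <stdint.h>

    int main(void) {
        // Q4_0: blocks of 32 weights, each block = 2-byte fp16 scale + 32/2 packed nibbles
        const int     qk         = 32;
        const size_t  block_size = 2 + qk/2;   // 18 bytes
        const int64_t ne0        = 4096;       // row length

        const size_t row_bytes = (size_t)(ne0 / qk) * block_size;
        printf("Q4_0 row of %lld weights: %zu bytes (%.2f bits/weight)\n",
               (long long) ne0, row_bytes, 8.0 * row_bytes / ne0);
        // expected: 2304 bytes, 4.50 bits per weight
        return 0;
    }
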
@@ -3749,6 +3835,9 @@ const char * ggml_type_name(enum ggml_type type) {
3749
3835
  return GGML_TYPE_NAME[type];
3750
3836
  }
3751
3837
 
3838
+ const char * ggml_op_name(enum ggml_op op) {
3839
+ return GGML_OP_NAME[op];
3840
+ }
3752
3841
 
3753
3842
  size_t ggml_element_size(const struct ggml_tensor * tensor) {
3754
3843
  return GGML_TYPE_SIZE[tensor->type];
@@ -3796,6 +3885,11 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) {
3796
3885
  case GGML_FTYPE_MOSTLY_Q5_0: wtype = GGML_TYPE_Q5_0; break;
3797
3886
  case GGML_FTYPE_MOSTLY_Q5_1: wtype = GGML_TYPE_Q5_1; break;
3798
3887
  case GGML_FTYPE_MOSTLY_Q8_0: wtype = GGML_TYPE_Q8_0; break;
3888
+ case GGML_FTYPE_MOSTLY_Q2_K: wtype = GGML_TYPE_Q2_K; break;
3889
+ case GGML_FTYPE_MOSTLY_Q3_K: wtype = GGML_TYPE_Q3_K; break;
3890
+ case GGML_FTYPE_MOSTLY_Q4_K: wtype = GGML_TYPE_Q4_K; break;
3891
+ case GGML_FTYPE_MOSTLY_Q5_K: wtype = GGML_TYPE_Q5_K; break;
3892
+ case GGML_FTYPE_MOSTLY_Q6_K: wtype = GGML_TYPE_Q6_K; break;
3799
3893
  case GGML_FTYPE_UNKNOWN: wtype = GGML_TYPE_COUNT; break;
3800
3894
  case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16: wtype = GGML_TYPE_COUNT; break;
3801
3895
  }
@@ -3805,11 +3899,15 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) {
3805
3899
  return wtype;
3806
3900
  }
3807
3901
 
3808
- static inline bool ggml_is_transposed(const struct ggml_tensor * tensor) {
3902
+ size_t ggml_tensor_overhead(void) {
3903
+ return GGML_OBJECT_SIZE + GGML_TENSOR_SIZE + 16;
3904
+ }
3905
+
3906
+ bool ggml_is_transposed(const struct ggml_tensor * tensor) {
3809
3907
  return tensor->nb[0] > tensor->nb[1];
3810
3908
  }
3811
3909
 
3812
- static inline bool ggml_is_contiguous(const struct ggml_tensor * tensor) {
3910
+ bool ggml_is_contiguous(const struct ggml_tensor * tensor) {
3813
3911
  static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
3814
3912
 
3815
3913
  return
@@ -3958,6 +4056,7 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
3958
4056
  /*.mem_buffer =*/ params.mem_buffer ? params.mem_buffer : GGML_ALIGNED_MALLOC(mem_size),
3959
4057
  /*.mem_buffer_owned =*/ params.mem_buffer ? false : true,
3960
4058
  /*.no_alloc =*/ params.no_alloc,
4059
+ /*.no_alloc_save =*/ params.no_alloc,
3961
4060
  /*.n_objects =*/ 0,
3962
4061
  /*.objects_begin =*/ NULL,
3963
4062
  /*.objects_end =*/ NULL,
@@ -4017,17 +4116,36 @@ size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch)
4017
4116
  return result;
4018
4117
  }
4019
4118
 
4119
+ void ggml_set_no_alloc(struct ggml_context * ctx, bool no_alloc) {
4120
+ ctx->no_alloc = no_alloc;
4121
+ }
4122
+
4123
+ void * ggml_get_mem_buffer(struct ggml_context * ctx) {
4124
+ return ctx->mem_buffer;
4125
+ }
4126
+
4127
+ size_t ggml_get_mem_size(struct ggml_context * ctx) {
4128
+ return ctx->mem_size;
4129
+ }
4130
+
4020
4131
  // IMPORTANT:
4021
4132
  // when creating "opt" tensors, always save and load the scratch buffer
4022
4133
  // this is an error prone process, but it is necessary to support inplace
4023
4134
  // operators when using scratch buffers
4024
4135
  // TODO: implement a better way
4025
4136
  void ggml_scratch_save(struct ggml_context * ctx) {
4137
+ // this is needed to allow opt tensors to store their data
4138
+ // TODO: again, need to find a better way
4139
+ ctx->no_alloc_save = ctx->no_alloc;
4140
+ ctx->no_alloc = false;
4141
+
4026
4142
  ctx->scratch_save = ctx->scratch;
4027
4143
  ctx->scratch.data = NULL;
4028
4144
  }
4029
4145
 
4030
4146
  void ggml_scratch_load(struct ggml_context * ctx) {
4147
+ ctx->no_alloc = ctx->no_alloc_save;
4148
+
4031
4149
  ctx->scratch = ctx->scratch_save;
4032
4150
  }
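
ggml_scratch_save/ggml_scratch_load now also stash and restore no_alloc, so the tiny "opt" parameter tensors created between the two calls (for instance the view-offset tensors added later in this diff) get real storage even while the caller is in no-alloc mode. The same hunk exposes ggml_set_no_alloc, ggml_get_mem_buffer and ggml_get_mem_size; a short sketch of using them for a measure-only pass (sizes and the tensor shape are illustrative):

    #include "ggml.h"
    #include <stdio.h>

    // Sketch: a "measure only" context where tensors are laid out but no data is
    // allocated, using the accessors added in this hunk.
    int main(void) {
        struct ggml_init_params params = {
            /*.mem_size   =*/ 16*1024*1024,
            /*.mem_buffer =*/ NULL,
            /*.no_alloc   =*/ false,
        };
        struct ggml_context * ctx = ggml_init(params);
        if (ctx == NULL) {
            return 1;
        }

        ggml_set_no_alloc(ctx, true); // metadata only from here on; tensor->data stays NULL

        struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 4096, 4096);
        (void) a;

        printf("buffer at %p, %zu bytes reserved\n", ggml_get_mem_buffer(ctx), ggml_get_mem_size(ctx));

        ggml_free(ctx);
        return 0;
    }
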
4033
4151
 
@@ -4061,7 +4179,7 @@ struct ggml_tensor * ggml_new_tensor_impl(
4061
4179
  struct ggml_object * const obj_new = (struct ggml_object *)(mem_buffer + cur_end);
4062
4180
 
4063
4181
  if (ctx->scratch.data == NULL || data != NULL) {
4064
- size_needed += sizeof(struct ggml_tensor);
4182
+ size_needed += GGML_TENSOR_SIZE;
4065
4183
 
4066
4184
  if (cur_end + size_needed + GGML_OBJECT_SIZE > ctx->mem_size) {
4067
4185
  GGML_PRINT("%s: not enough space in the context's memory pool (needed %zu, available %zu)\n",
@@ -4077,14 +4195,15 @@ struct ggml_tensor * ggml_new_tensor_impl(
4077
4195
  };
4078
4196
  } else {
4079
4197
  if (ctx->scratch.offs + size_needed > ctx->scratch.size) {
4080
- GGML_PRINT("%s: not enough space in the scratch memory\n", __func__);
4198
+ GGML_PRINT("%s: not enough space in the scratch memory pool (needed %zu, available %zu)\n",
4199
+ __func__, ctx->scratch.offs + size_needed, ctx->scratch.size);
4081
4200
  assert(false);
4082
4201
  return NULL;
4083
4202
  }
4084
4203
 
4085
- if (cur_end + sizeof(struct ggml_tensor) + GGML_OBJECT_SIZE > ctx->mem_size) {
4204
+ if (cur_end + GGML_TENSOR_SIZE + GGML_OBJECT_SIZE > ctx->mem_size) {
4086
4205
  GGML_PRINT("%s: not enough space in the context's memory pool (needed %zu, available %zu)\n",
4087
- __func__, cur_end + sizeof(struct ggml_tensor) + GGML_OBJECT_SIZE, ctx->mem_size);
4206
+ __func__, cur_end + GGML_TENSOR_SIZE + GGML_OBJECT_SIZE, ctx->mem_size);
4088
4207
  assert(false);
4089
4208
  return NULL;
4090
4209
  }
@@ -4093,7 +4212,7 @@ struct ggml_tensor * ggml_new_tensor_impl(
4093
4212
 
4094
4213
  *obj_new = (struct ggml_object) {
4095
4214
  .offs = cur_end + GGML_OBJECT_SIZE,
4096
- .size = sizeof(struct ggml_tensor),
4215
+ .size = GGML_TENSOR_SIZE,
4097
4216
  .next = NULL,
4098
4217
  };
4099
4218
 
@@ -4135,6 +4254,7 @@ struct ggml_tensor * ggml_new_tensor_impl(
4135
4254
  /*.perf_time_us =*/ 0,
4136
4255
  /*.data =*/ (data == NULL && !ctx->no_alloc) ? (void *)(result + 1) : data,
4137
4256
  /*.name =*/ { 0 },
4257
+ /*.extra =*/ NULL,
4138
4258
  /*.pad =*/ { 0 },
4139
4259
  };
4140
4260
 
@@ -4509,6 +4629,23 @@ struct ggml_tensor * ggml_view_tensor(
4509
4629
  return result;
4510
4630
  }
4511
4631
 
4632
+ struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name) {
4633
+ struct ggml_object * obj = ctx->objects_begin;
4634
+
4635
+ char * const mem_buffer = ctx->mem_buffer;
4636
+
4637
+ while (obj != NULL) {
4638
+ struct ggml_tensor * cur = (struct ggml_tensor *)(mem_buffer + obj->offs);
4639
+ if (strcmp(cur->name, name) == 0) {
4640
+ return cur;
4641
+ }
4642
+
4643
+ obj = obj->next;
4644
+ }
4645
+
4646
+ return NULL;
4647
+ }
4648
+
4512
4649
  ////////////////////////////////////////////////////////////////////////////////
4513
4650
 
4514
4651
  // ggml_dup
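
ggml_get_tensor scans the context's object list for a tensor with a matching name, complementing ggml_graph_get_tensor (added further down) and the automatic leaf_%d/node_%d naming in ggml_visit_parents. A hedged usage sketch; it assumes ggml_set_name, which ggml.h already provides, an existing context, and an illustrative tensor name:

    #include "ggml.h"
    #include <stdio.h>

    // Name a tensor when it is created, then recover it later by name instead of
    // threading the pointer through the code. Context setup is omitted.
    static void name_and_lookup(struct ggml_context * ctx) {
        struct ggml_tensor * embd = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 4096, 32000);
        ggml_set_name(embd, "tok_embeddings.weight");

        struct ggml_tensor * found = ggml_get_tensor(ctx, "tok_embeddings.weight");
        if (found == NULL) {
            fprintf(stderr, "tensor not found\n");
            return;
        }
        printf("found %s: %lld x %lld\n", found->name,
               (long long) found->ne[0], (long long) found->ne[1]);
    }
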
@@ -5763,10 +5900,18 @@ struct ggml_tensor * ggml_view_1d(
5763
5900
 
5764
5901
  struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 1, &ne0, (char *) a->data + offset);
5765
5902
 
5903
+ ggml_scratch_save(ctx);
5904
+
5905
+ struct ggml_tensor * offs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
5906
+ memcpy(offs->data, &offset, 2*sizeof(int32_t));
5907
+
5908
+ ggml_scratch_load(ctx);
5909
+
5766
5910
  result->op = GGML_OP_VIEW;
5767
5911
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5768
5912
  result->src0 = a;
5769
5913
  result->src1 = NULL;
5914
+ result->opt[0] = offs;
5770
5915
 
5771
5916
  if (is_node) {
5772
5917
  memcpy(result->padding, &offset, sizeof(offset));
@@ -5795,6 +5940,13 @@ struct ggml_tensor * ggml_view_2d(
5795
5940
 
5796
5941
  struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 2, ne, (char *) a->data + offset);
5797
5942
 
5943
+ ggml_scratch_save(ctx);
5944
+
5945
+ struct ggml_tensor * offs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
5946
+ memcpy(offs->data, &offset, 2*sizeof(int32_t));
5947
+
5948
+ ggml_scratch_load(ctx);
5949
+
5798
5950
  result->nb[1] = nb1;
5799
5951
  result->nb[2] = result->nb[1]*ne1;
5800
5952
  result->nb[3] = result->nb[2];
@@ -5803,6 +5955,7 @@ struct ggml_tensor * ggml_view_2d(
5803
5955
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5804
5956
  result->src0 = a;
5805
5957
  result->src1 = NULL;
5958
+ result->opt[0] = offs;
5806
5959
 
5807
5960
  if (is_node) {
5808
5961
  memcpy(result->padding, &offset, sizeof(offset));
@@ -5833,6 +5986,13 @@ struct ggml_tensor * ggml_view_3d(
5833
5986
 
5834
5987
  struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 3, ne, (char *) a->data + offset);
5835
5988
 
5989
+ ggml_scratch_save(ctx);
5990
+
5991
+ struct ggml_tensor * offs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
5992
+ memcpy(offs->data, &offset, 2*sizeof(int32_t));
5993
+
5994
+ ggml_scratch_load(ctx);
5995
+
5836
5996
  result->nb[1] = nb1;
5837
5997
  result->nb[2] = nb2;
5838
5998
  result->nb[3] = result->nb[2]*ne2;
@@ -5841,6 +6001,7 @@ struct ggml_tensor * ggml_view_3d(
5841
6001
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5842
6002
  result->src0 = a;
5843
6003
  result->src1 = NULL;
6004
+ result->opt[0] = offs;
5844
6005
 
5845
6006
  if (is_node) {
5846
6007
  memcpy(result->padding, &offset, sizeof(offset));
@@ -5873,6 +6034,13 @@ struct ggml_tensor * ggml_view_4d(
5873
6034
 
5874
6035
  struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 4, ne, (char *) a->data + offset);
5875
6036
 
6037
+ ggml_scratch_save(ctx);
6038
+
6039
+ struct ggml_tensor * offs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
6040
+ memcpy(offs->data, &offset, 2*sizeof(int32_t));
6041
+
6042
+ ggml_scratch_load(ctx);
6043
+
5876
6044
  result->nb[1] = nb1;
5877
6045
  result->nb[2] = nb2;
5878
6046
  result->nb[3] = nb3;
@@ -5881,6 +6049,7 @@ struct ggml_tensor * ggml_view_4d(
5881
6049
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5882
6050
  result->src0 = a;
5883
6051
  result->src1 = NULL;
6052
+ result->opt[0] = offs;
5884
6053
 
5885
6054
  if (is_node) {
5886
6055
  memcpy(result->padding, &offset, sizeof(offset));
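
Each ggml_view_1d/2d/3d/4d now records its byte offset in a two-element GGML_TYPE_I32 tensor attached as opt[0]; the 64-bit offset is memcpy'd across the two 32-bit slots so that ggml_graph_import (near the end of this diff) can rebuild the view at the right position. A tiny, ggml-independent round trip of that packing (it assumes 64-bit offsets and same-machine byte order, as the code itself does):

    #include <stdint.h>
    #include <string.h>
    #include <stdio.h>
    #include <assert.h>

    int main(void) {
        const uint64_t offset = 123456789012345ULL;  // byte offset of the view into its source

        int32_t packed[2];                           // stands in for the I32[2] "offs" tensor
        memcpy(packed, &offset, 2*sizeof(int32_t));  // same copy the ggml_view_* functions do

        uint64_t restored = 0;
        memcpy(&restored, packed, sizeof(restored)); // what ggml_graph_import does on load
        assert(restored == offset);

        printf("offset %llu survived the I32[2] round trip\n", (unsigned long long) restored);
        return 0;
    }
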
@@ -6303,7 +6472,7 @@ struct ggml_tensor * ggml_alibi(
6303
6472
 
6304
6473
  ggml_scratch_save(ctx);
6305
6474
 
6306
- struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
6475
+ struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 3);
6307
6476
 
6308
6477
  ((int32_t *) b->data)[0] = n_past;
6309
6478
  ((int32_t *) b->data)[1] = n_head;
@@ -7545,6 +7714,11 @@ static void ggml_compute_forward_add(
7545
7714
  case GGML_TYPE_Q5_0:
7546
7715
  case GGML_TYPE_Q5_1:
7547
7716
  case GGML_TYPE_Q8_0:
7717
+ case GGML_TYPE_Q2_K:
7718
+ case GGML_TYPE_Q3_K:
7719
+ case GGML_TYPE_Q4_K:
7720
+ case GGML_TYPE_Q5_K:
7721
+ case GGML_TYPE_Q6_K:
7548
7722
  {
7549
7723
  ggml_compute_forward_add_q_f32(params, src0, src1, dst);
7550
7724
  } break;
@@ -7848,6 +8022,11 @@ static void ggml_compute_forward_add1(
7848
8022
  case GGML_TYPE_Q5_1:
7849
8023
  case GGML_TYPE_Q8_0:
7850
8024
  case GGML_TYPE_Q8_1:
8025
+ case GGML_TYPE_Q2_K:
8026
+ case GGML_TYPE_Q3_K:
8027
+ case GGML_TYPE_Q4_K:
8028
+ case GGML_TYPE_Q5_K:
8029
+ case GGML_TYPE_Q6_K:
7851
8030
  {
7852
8031
  ggml_compute_forward_add1_q_f32(params, src0, src1, dst);
7853
8032
  } break;
@@ -7970,6 +8149,11 @@ static void ggml_compute_forward_acc(
7970
8149
  case GGML_TYPE_Q5_1:
7971
8150
  case GGML_TYPE_Q8_0:
7972
8151
  case GGML_TYPE_Q8_1:
8152
+ case GGML_TYPE_Q2_K:
8153
+ case GGML_TYPE_Q3_K:
8154
+ case GGML_TYPE_Q4_K:
8155
+ case GGML_TYPE_Q5_K:
8156
+ case GGML_TYPE_Q6_K:
7973
8157
  default:
7974
8158
  {
7975
8159
  GGML_ASSERT(false);
@@ -8088,10 +8272,10 @@ static void ggml_compute_forward_mul_f32(
8088
8272
  const int ith = params->ith;
8089
8273
  const int nth = params->nth;
8090
8274
 
8091
- #ifdef GGML_USE_CUBLAS
8092
- if (src1->backend == GGML_BACKEND_CUDA) {
8275
+ #ifdef GGML_USE_CLBLAST
8276
+ if (src1->backend == GGML_BACKEND_GPU) {
8093
8277
  if (ith == 0) {
8094
- ggml_cuda_mul(src0, src1, dst);
8278
+ ggml_cl_mul(src0, src1, dst);
8095
8279
  }
8096
8280
  return;
8097
8281
  }
@@ -9206,7 +9390,7 @@ static void ggml_compute_forward_rms_norm_f32(
9206
9390
  sum += (ggml_float)(x[i00] * x[i00]);
9207
9391
  }
9208
9392
 
9209
- float mean = sum/ne00;
9393
+ const float mean = sum/ne00;
9210
9394
 
9211
9395
  float * y = (float *) ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3);
9212
9396
 
@@ -9529,14 +9713,7 @@ static void ggml_compute_forward_mul_mat_f32(
9529
9713
  // nb01 >= nb00 - src0 is not transposed
9530
9714
  // compute by src0 rows
9531
9715
 
9532
- #if defined(GGML_USE_CUBLAS)
9533
- if (ggml_cuda_can_mul_mat(src0, src1, dst)) {
9534
- if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) {
9535
- ggml_cuda_mul_mat(src0, src1, dst, params->wdata, params->wsize);
9536
- }
9537
- return;
9538
- }
9539
- #elif defined(GGML_USE_CLBLAST)
9716
+ #if defined(GGML_USE_CLBLAST)
9540
9717
  if (ggml_cl_can_mul_mat(src0, src1, dst)) {
9541
9718
  if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) {
9542
9719
  ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize);
@@ -9701,14 +9878,7 @@ static void ggml_compute_forward_mul_mat_f16_f32(
9701
9878
  // nb01 >= nb00 - src0 is not transposed
9702
9879
  // compute by src0 rows
9703
9880
 
9704
- #if defined(GGML_USE_CUBLAS)
9705
- if (ggml_cuda_can_mul_mat(src0, src1, dst)) {
9706
- if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) {
9707
- ggml_cuda_mul_mat(src0, src1, dst, params->wdata, params->wsize);
9708
- }
9709
- return;
9710
- }
9711
- #elif defined(GGML_USE_CLBLAST)
9881
+ #if defined(GGML_USE_CLBLAST)
9712
9882
  if (ggml_cl_can_mul_mat(src0, src1, dst)) {
9713
9883
  if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) {
9714
9884
  ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize);
@@ -9913,14 +10083,7 @@ static void ggml_compute_forward_mul_mat_q_f32(
9913
10083
  // nb01 >= nb00 - src0 is not transposed
9914
10084
  // compute by src0 rows
9915
10085
 
9916
- #if defined(GGML_USE_CUBLAS)
9917
- if (ggml_cuda_can_mul_mat(src0, src1, dst)) {
9918
- if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) {
9919
- ggml_cuda_mul_mat(src0, src1, dst, params->wdata, params->wsize);
9920
- }
9921
- return;
9922
- }
9923
- #elif defined(GGML_USE_CLBLAST)
10086
+ #if defined(GGML_USE_CLBLAST)
9924
10087
  if (ggml_cl_can_mul_mat(src0, src1, dst)) {
9925
10088
  if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) {
9926
10089
  ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize);
@@ -10063,6 +10226,11 @@ static void ggml_compute_forward_mul_mat(
10063
10226
  case GGML_TYPE_Q5_1:
10064
10227
  case GGML_TYPE_Q8_0:
10065
10228
  case GGML_TYPE_Q8_1:
10229
+ case GGML_TYPE_Q2_K:
10230
+ case GGML_TYPE_Q3_K:
10231
+ case GGML_TYPE_Q4_K:
10232
+ case GGML_TYPE_Q5_K:
10233
+ case GGML_TYPE_Q6_K:
10066
10234
  {
10067
10235
  ggml_compute_forward_mul_mat_q_f32(params, src0, src1, dst);
10068
10236
  } break;
@@ -10246,6 +10414,11 @@ static void ggml_compute_forward_set(
10246
10414
  case GGML_TYPE_Q5_1:
10247
10415
  case GGML_TYPE_Q8_0:
10248
10416
  case GGML_TYPE_Q8_1:
10417
+ case GGML_TYPE_Q2_K:
10418
+ case GGML_TYPE_Q3_K:
10419
+ case GGML_TYPE_Q4_K:
10420
+ case GGML_TYPE_Q5_K:
10421
+ case GGML_TYPE_Q6_K:
10249
10422
  default:
10250
10423
  {
10251
10424
  GGML_ASSERT(false);
@@ -10411,6 +10584,11 @@ static void ggml_compute_forward_get_rows(
10411
10584
  case GGML_TYPE_Q5_1:
10412
10585
  case GGML_TYPE_Q8_0:
10413
10586
  case GGML_TYPE_Q8_1:
10587
+ case GGML_TYPE_Q2_K:
10588
+ case GGML_TYPE_Q3_K:
10589
+ case GGML_TYPE_Q4_K:
10590
+ case GGML_TYPE_Q5_K:
10591
+ case GGML_TYPE_Q6_K:
10414
10592
  {
10415
10593
  ggml_compute_forward_get_rows_q(params, src0, src1, dst);
10416
10594
  } break;
@@ -10957,6 +11135,12 @@ static void ggml_compute_forward_alibi(
10957
11135
  case GGML_TYPE_Q5_1:
10958
11136
  case GGML_TYPE_Q8_0:
10959
11137
  case GGML_TYPE_Q8_1:
11138
+ case GGML_TYPE_Q2_K:
11139
+ case GGML_TYPE_Q3_K:
11140
+ case GGML_TYPE_Q4_K:
11141
+ case GGML_TYPE_Q5_K:
11142
+ case GGML_TYPE_Q6_K:
11143
+ case GGML_TYPE_Q8_K:
10960
11144
  case GGML_TYPE_I8:
10961
11145
  case GGML_TYPE_I16:
10962
11146
  case GGML_TYPE_I32:
@@ -11028,6 +11212,12 @@ static void ggml_compute_forward_clamp(
11028
11212
  case GGML_TYPE_Q5_1:
11029
11213
  case GGML_TYPE_Q8_0:
11030
11214
  case GGML_TYPE_Q8_1:
11215
+ case GGML_TYPE_Q2_K:
11216
+ case GGML_TYPE_Q3_K:
11217
+ case GGML_TYPE_Q4_K:
11218
+ case GGML_TYPE_Q5_K:
11219
+ case GGML_TYPE_Q6_K:
11220
+ case GGML_TYPE_Q8_K:
11031
11221
  case GGML_TYPE_I8:
11032
11222
  case GGML_TYPE_I16:
11033
11223
  case GGML_TYPE_I32:
@@ -11117,7 +11307,7 @@ static void ggml_compute_forward_rope_f32(
11117
11307
  theta *= theta_scale;
11118
11308
 
11119
11309
  const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
11120
- float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
11310
+ float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
11121
11311
 
11122
11312
  const float x0 = src[0];
11123
11313
  const float x1 = src[1];
@@ -11138,7 +11328,7 @@ static void ggml_compute_forward_rope_f32(
11138
11328
  const int64_t i0 = ib*n_dims + ic/2;
11139
11329
 
11140
11330
  const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
11141
- float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
11331
+ float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
11142
11332
 
11143
11333
  const float x0 = src[0];
11144
11334
  const float x1 = src[n_dims/2];
@@ -12846,6 +13036,15 @@ static void ggml_compute_forward_map_binary(
12846
13036
  static void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
12847
13037
  GGML_ASSERT(params);
12848
13038
 
13039
+ #ifdef GGML_USE_CUBLAS
13040
+ bool skip_cpu = ggml_cuda_compute_forward(params, tensor);
13041
+ if (skip_cpu) {
13042
+ return;
13043
+ }
13044
+ GGML_ASSERT(tensor->src0->backend == GGML_BACKEND_CPU);
13045
+ GGML_ASSERT(tensor->src1 == NULL || tensor->src1->backend == GGML_BACKEND_CPU);
13046
+ #endif // GGML_USE_CUBLAS
13047
+
12849
13048
  switch (tensor->op) {
12850
13049
  case GGML_OP_DUP:
12851
13050
  {
@@ -13792,11 +13991,19 @@ static void ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor *
13792
13991
  // reached a leaf node, not part of the gradient graph (e.g. a constant)
13793
13992
  GGML_ASSERT(cgraph->n_leafs < GGML_MAX_NODES);
13794
13993
 
13994
+ if (strlen(node->name) == 0) {
13995
+ snprintf(node->name, sizeof(node->name), "leaf_%d", cgraph->n_leafs);
13996
+ }
13997
+
13795
13998
  cgraph->leafs[cgraph->n_leafs] = node;
13796
13999
  cgraph->n_leafs++;
13797
14000
  } else {
13798
14001
  GGML_ASSERT(cgraph->n_nodes < GGML_MAX_NODES);
13799
14002
 
14003
+ if (strlen(node->name) == 0) {
14004
+ snprintf(node->name, sizeof(node->name), "node_%d", cgraph->n_nodes);
14005
+ }
14006
+
13800
14007
  cgraph->nodes[cgraph->n_nodes] = node;
13801
14008
  cgraph->grads[cgraph->n_nodes] = node->grad;
13802
14009
  cgraph->n_nodes++;
@@ -14144,7 +14351,6 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
14144
14351
  if (ggml_cuda_can_mul_mat(node->src0, node->src1, node)) {
14145
14352
  node->n_tasks = 1; // TODO: this actually is doing nothing
14146
14353
  // the threads are still spinning
14147
- cur = ggml_cuda_mul_mat_get_wsize(node->src0, node->src1, node);
14148
14354
  }
14149
14355
  else
14150
14356
  #elif defined(GGML_USE_CLBLAST)
@@ -14510,6 +14716,521 @@ void ggml_graph_reset(struct ggml_cgraph * cgraph) {
14510
14716
  }
14511
14717
  }
14512
14718
 
14719
+ struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name) {
14720
+ for (int i = 0; i < cgraph->n_leafs; i++) {
14721
+ struct ggml_tensor * leaf = cgraph->leafs[i];
14722
+
14723
+ if (strcmp(leaf->name, name) == 0) {
14724
+ return leaf;
14725
+ }
14726
+ }
14727
+
14728
+ for (int i = 0; i < cgraph->n_nodes; i++) {
14729
+ struct ggml_tensor * node = cgraph->nodes[i];
14730
+
14731
+ if (strcmp(node->name, name) == 0) {
14732
+ return node;
14733
+ }
14734
+ }
14735
+
14736
+ return NULL;
14737
+ }
14738
+
14739
+ static void ggml_graph_export_leaf(const struct ggml_tensor * tensor, FILE * fout) {
14740
+ const int64_t * ne = tensor->ne;
14741
+ const size_t * nb = tensor->nb;
14742
+
14743
+ fprintf(fout, "%-6s %-12s %8d %" PRId64 " %" PRId64 " %" PRId64 " %" PRId64 " %16zu %16zu %16zu %16zu %16p %32s\n",
14744
+ ggml_type_name(tensor->type),
14745
+ ggml_op_name (tensor->op),
14746
+ tensor->n_dims,
14747
+ ne[0], ne[1], ne[2], ne[3],
14748
+ nb[0], nb[1], nb[2], nb[3],
14749
+ tensor->data,
14750
+ tensor->name);
14751
+ }
14752
+
14753
+ static void ggml_graph_export_node(const struct ggml_tensor * tensor, const char * arg, FILE * fout) {
14754
+ const int64_t * ne = tensor->ne;
14755
+ const size_t * nb = tensor->nb;
14756
+
14757
+ fprintf(fout, "%-6s %-6s %-12s %8d %" PRId64 " %" PRId64 " %" PRId64 " %" PRId64 " %16zu %16zu %16zu %16zu %8d %16p %32s\n",
14758
+ arg,
14759
+ ggml_type_name(tensor->type),
14760
+ ggml_op_name (tensor->op),
14761
+ tensor->n_dims,
14762
+ ne[0], ne[1], ne[2], ne[3],
14763
+ nb[0], nb[1], nb[2], nb[3],
14764
+ tensor->n_tasks,
14765
+ tensor->data,
14766
+ tensor->name);
14767
+ }
14768
+
14769
+ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) {
14770
+ //assert(cgraph->work == NULL);
14771
+ //assert(cgraph->work_size == 0);
14772
+
14773
+ uint64_t size_eval = 0;
14774
+
14775
+ // compute size of intermediate results
14776
+ // TODO: does not take into account scratch buffers !!!!
14777
+ for (int i = 0; i < cgraph->n_nodes; ++i) {
14778
+ size_eval += ggml_nbytes(cgraph->nodes[i]);
14779
+ }
14780
+
14781
+ // print
14782
+ {
14783
+ FILE * fout = stdout;
14784
+
14785
+ fprintf(fout, "\n");
14786
+ fprintf(fout, "%-16s %8x\n", "magic", GGML_FILE_MAGIC);
14787
+ fprintf(fout, "%-16s %8d\n", "version", GGML_FILE_VERSION);
14788
+ fprintf(fout, "%-16s %8d\n", "leafs", cgraph->n_leafs);
14789
+ fprintf(fout, "%-16s %8d\n", "nodes", cgraph->n_nodes);
14790
+ fprintf(fout, "%-16s %" PRIu64 "\n", "eval", size_eval);
14791
+
14792
+ // header
14793
+ fprintf(fout, "\n");
14794
+ fprintf(fout, "%-6s %-12s %8s %8s %8s %8s %8s %16s %16s %16s %16s %16s %16s\n",
14795
+ "TYPE", "OP", "NDIMS", "NE0", "NE1", "NE2", "NE3", "NB0", "NB1", "NB2", "NB3", "DATA", "NAME");
14796
+
14797
+ for (int i = 0; i < cgraph->n_leafs; ++i) {
14798
+ ggml_graph_export_leaf(cgraph->leafs[i], fout);
14799
+
14800
+ GGML_ASSERT(cgraph->leafs[i]->op == GGML_OP_NONE);
14801
+ GGML_ASSERT(cgraph->leafs[i]->src0 == NULL);
14802
+ GGML_ASSERT(cgraph->leafs[i]->src1 == NULL);
14803
+ }
14804
+
14805
+ // header
14806
+ fprintf(fout, "\n");
14807
+ fprintf(fout, "%-6s %-6s %-12s %8s %8s %8s %8s %8s %16s %16s %16s %16s %8s %16s %16s\n",
14808
+ "ARG", "TYPE", "OP", "NDIMS", "NE0", "NE1", "NE2", "NE3", "NB0", "NB1", "NB2", "NB3", "NTASKS", "DATA", "NAME");
14809
+
14810
+ for (int i = 0; i < cgraph->n_nodes; ++i) {
14811
+ ggml_graph_export_node(cgraph->nodes[i], "DST", fout);
14812
+
14813
+ if (cgraph->nodes[i]->src0) {
14814
+ ggml_graph_export_node(cgraph->nodes[i]->src0, "SRC0", fout);
14815
+ }
14816
+
14817
+ if (cgraph->nodes[i]->src1) {
14818
+ ggml_graph_export_node(cgraph->nodes[i]->src1, "SRC1", fout);
14819
+ }
14820
+
14821
+ for (int j = 0; j < GGML_MAX_OPT; ++j) {
14822
+ if (cgraph->nodes[i]->opt[j]) {
14823
+ ggml_graph_export_node(cgraph->nodes[i]->opt[j], "OPT", fout);
14824
+ }
14825
+ }
14826
+
14827
+ fprintf(fout, "\n");
14828
+ }
14829
+
14830
+ fprintf(fout, "\n");
14831
+ }
14832
+
14833
+ // write binary data
14834
+ {
14835
+ FILE * fout = fopen(fname, "wb");
14836
+
14837
+ if (!fout) {
14838
+ fprintf(stderr, "%s: failed to open %s\n", __func__, fname);
14839
+ return;
14840
+ }
14841
+
14842
+ // header
14843
+ {
14844
+ const uint32_t magic = GGML_FILE_MAGIC;
14845
+ const uint32_t version = GGML_FILE_VERSION;
14846
+ const uint32_t n_leafs = cgraph->n_leafs;
14847
+ const uint32_t nodes = cgraph->n_nodes;
14848
+
14849
+ fwrite(&magic, sizeof(uint32_t), 1, fout);
14850
+ fwrite(&version, sizeof(uint32_t), 1, fout);
14851
+ fwrite(&n_leafs, sizeof(uint32_t), 1, fout);
14852
+ fwrite(&nodes, sizeof(uint32_t), 1, fout);
14853
+ fwrite(&size_eval, sizeof(uint64_t), 1, fout);
14854
+ }
14855
+
14856
+ // leafs
14857
+ {
14858
+ for (int i = 0; i < cgraph->n_leafs; ++i) {
14859
+ const struct ggml_tensor * tensor = cgraph->leafs[i];
14860
+
14861
+ const uint32_t type = tensor->type;
14862
+ const uint32_t op = tensor->op;
14863
+ const uint32_t n_dims = tensor->n_dims;
14864
+
14865
+ fwrite(&type, sizeof(uint32_t), 1, fout);
14866
+ fwrite(&op, sizeof(uint32_t), 1, fout);
14867
+ fwrite(&n_dims, sizeof(uint32_t), 1, fout);
14868
+
14869
+ for (int j = 0; j < GGML_MAX_DIMS; ++j) {
14870
+ const uint64_t ne = tensor->ne[j];
14871
+ const uint64_t nb = tensor->nb[j];
14872
+
14873
+ fwrite(&ne, sizeof(uint64_t), 1, fout);
14874
+ fwrite(&nb, sizeof(uint64_t), 1, fout);
14875
+ }
14876
+
14877
+ // store the pointer address
14878
+ {
14879
+ const uint64_t ptr = (uint64_t) tensor->data;
14880
+
14881
+ fwrite(&ptr, sizeof(uint64_t), 1, fout);
14882
+ }
14883
+
14884
+ fwrite(tensor->name, sizeof(char), GGML_MAX_NAME, fout);
14885
+
14886
+ // dump the data
14887
+ // TODO: pad this to 32 byte boundary
14888
+ {
14889
+ const size_t size = ggml_nbytes(tensor);
14890
+
14891
+ fwrite(tensor->data, sizeof(char), size, fout);
14892
+ }
14893
+ }
14894
+ }
14895
+
14896
+ // nodes
14897
+ {
14898
+ for (int i = 0; i < cgraph->n_nodes; ++i) {
14899
+ const struct ggml_tensor * tensor = cgraph->nodes[i];
14900
+
14901
+ const uint32_t type = tensor->type;
14902
+ const uint32_t op = tensor->op;
14903
+ const uint32_t n_dims = tensor->n_dims;
14904
+
14905
+ fwrite(&type, sizeof(uint32_t), 1, fout);
14906
+ fwrite(&op, sizeof(uint32_t), 1, fout);
14907
+ fwrite(&n_dims, sizeof(uint32_t), 1, fout);
14908
+
14909
+ for (int j = 0; j < GGML_MAX_DIMS; ++j) {
14910
+ const uint64_t ne = tensor->ne[j];
14911
+ const uint64_t nb = tensor->nb[j];
14912
+
14913
+ fwrite(&ne, sizeof(uint64_t), 1, fout);
14914
+ fwrite(&nb, sizeof(uint64_t), 1, fout);
14915
+ }
14916
+
14917
+ // store the pointer address
14918
+ {
14919
+ const uint64_t ptr = (uint64_t) tensor->data;
14920
+
14921
+ fwrite(&ptr, sizeof(uint64_t), 1, fout);
14922
+ }
14923
+
14924
+ fwrite(tensor->name, sizeof(char), GGML_MAX_NAME, fout);
14925
+
14926
+ // output the op arguments
14927
+ {
14928
+ struct ggml_tensor * args[2 + GGML_MAX_OPT] = { NULL };
14929
+
14930
+ args[0] = tensor->src0;
14931
+ args[1] = tensor->src1;
14932
+
14933
+ for (int j = 0; j < GGML_MAX_OPT; ++j) {
14934
+ args[2 + j] = tensor->opt[j];
14935
+ }
14936
+
14937
+ for (int j = 0; j < 2 + GGML_MAX_OPT; ++j) {
14938
+ if (args[j]) {
14939
+ int32_t idx = -1;
14940
+
14941
+ // check if leaf
14942
+ {
14943
+ for (int k = 0; k < cgraph->n_leafs; ++k) {
14944
+ if (args[j] == cgraph->leafs[k]) {
14945
+ idx = k;
14946
+ break;
14947
+ }
14948
+ }
14949
+ }
14950
+
14951
+ // check if node
14952
+ if (idx == -1) {
14953
+ for (int k = 0; k < cgraph->n_nodes; ++k) {
14954
+ if (args[j] == cgraph->nodes[k]) {
14955
+ idx = GGML_MAX_NODES + k;
14956
+ break;
14957
+ }
14958
+ }
14959
+ }
14960
+
14961
+ if (idx == -1) {
14962
+ fprintf(stderr, "%s: failed to find tensor, arg = %d, node = %d\n", __func__, j, i);
14963
+ return;
14964
+ }
14965
+
14966
+ fwrite(&idx, sizeof(int32_t), 1, fout);
14967
+ } else {
14968
+ const int32_t nul = -1;
14969
+
14970
+ fwrite(&nul, sizeof(int32_t), 1, fout);
14971
+ }
14972
+ }
14973
+ }
14974
+ }
14975
+ }
14976
+
14977
+ fclose(fout);
14978
+ }
14979
+ }
14980
+
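
ggml_graph_export writes a fixed header (u32 magic, u32 version, u32 n_leafs, u32 n_nodes, u64 size_eval), then each leaf with its raw data, then each node with the indices of its arguments (leaves indexed 0..n_leafs-1, nodes offset by GGML_MAX_NODES). A sketch that reads just that header back, mirroring the fwrite sequence above (GGML_FILE_MAGIC and GGML_FILE_VERSION live in ggml.h):

    #include <stdint.h>
    #include <stdio.h>

    // Read the fixed-size header that ggml_graph_export writes first:
    // u32 magic, u32 version, u32 n_leafs, u32 n_nodes, u64 size_eval.
    static int read_ggml_graph_header(const char * fname) {
        FILE * fin = fopen(fname, "rb");
        if (!fin) {
            return -1;
        }

        uint32_t magic = 0, version = 0, n_leafs = 0, n_nodes = 0;
        uint64_t size_eval = 0;

        int ok = fread(&magic,     sizeof(magic),     1, fin) == 1
              && fread(&version,   sizeof(version),   1, fin) == 1
              && fread(&n_leafs,   sizeof(n_leafs),   1, fin) == 1
              && fread(&n_nodes,   sizeof(n_nodes),   1, fin) == 1
              && fread(&size_eval, sizeof(size_eval), 1, fin) == 1;

        fclose(fin);

        if (!ok) {
            return -1;
        }

        printf("magic %08x version %u: %u leafs, %u nodes, %llu eval bytes\n",
               magic, version, n_leafs, n_nodes, (unsigned long long) size_eval);
        return 0;
    }
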
14981
+ struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context ** ctx_data, struct ggml_context ** ctx_eval) {
14982
+ assert(*ctx_data == NULL);
14983
+ assert(*ctx_eval == NULL);
14984
+
14985
+ struct ggml_cgraph result = { 0 };
14986
+
14987
+ struct ggml_tensor * data = NULL;
14988
+
14989
+ // read file into data
14990
+ {
14991
+ FILE * fin = fopen(fname, "rb");
14992
+ if (!fin) {
14993
+ fprintf(stderr, "%s: failed to open %s\n", __func__, fname);
14994
+ return result;
14995
+ }
14996
+
14997
+ size_t fsize = 0;
14998
+
14999
+ fseek(fin, 0, SEEK_END);
15000
+ fsize = ftell(fin);
15001
+ fseek(fin, 0, SEEK_SET);
15002
+
15003
+ // create the data context
15004
+ {
15005
+ const size_t overhead = 1*ggml_tensor_overhead();
15006
+
15007
+ struct ggml_init_params params = {
15008
+ .mem_size = fsize + overhead,
15009
+ .mem_buffer = NULL,
15010
+ .no_alloc = false,
15011
+ };
15012
+
15013
+ *ctx_data = ggml_init(params);
15014
+
15015
+ if (!*ctx_data) {
15016
+ fprintf(stderr, "%s: failed to create ggml context\n", __func__);
15017
+ return result;
15018
+ }
15019
+ }
15020
+
15021
+ data = ggml_new_tensor_1d(*ctx_data, GGML_TYPE_I8, fsize);
15022
+
15023
+ const size_t ret = fread(data->data, sizeof(char), fsize, fin);
15024
+ if (ret != fsize) {
15025
+ fprintf(stderr, "%s: failed to read %s\n", __func__, fname);
15026
+ return result;
15027
+ }
15028
+
15029
+ fclose(fin);
15030
+ }
15031
+
15032
+ // populate result
15033
+ {
15034
+ char * ptr = (char *) data->data;
15035
+
15036
+ const uint32_t magic = *(const uint32_t *) ptr; ptr += sizeof(magic);
15037
+
15038
+ if (magic != GGML_FILE_MAGIC) {
15039
+ fprintf(stderr, "%s: invalid magic number, got %08x\n", __func__, magic);
15040
+ return result;
15041
+ }
15042
+
15043
+ const uint32_t version = *(const uint32_t *) ptr; ptr += sizeof(version);
15044
+
15045
+ if (version != GGML_FILE_VERSION) {
15046
+ fprintf(stderr, "%s: invalid version number\n", __func__);
15047
+ return result;
15048
+ }
15049
+
15050
+ const uint32_t n_leafs = *(const uint32_t *) ptr; ptr += sizeof(n_leafs);
15051
+ const uint32_t n_nodes = *(const uint32_t *) ptr; ptr += sizeof(n_nodes);
15052
+ const uint64_t size_eval = *(const uint64_t *) ptr; ptr += sizeof(size_eval);
15053
+
15054
+ result.n_leafs = n_leafs;
15055
+ result.n_nodes = n_nodes;
15056
+
15057
+ // create the data context
15058
+ {
15059
+ const size_t overhead = (n_leafs + n_nodes)*ggml_tensor_overhead();
15060
+
15061
+ struct ggml_init_params params = {
15062
+ .mem_size = size_eval + overhead,
15063
+ .mem_buffer = NULL,
15064
+ .no_alloc = true,
15065
+ };
15066
+
15067
+ *ctx_eval = ggml_init(params);
15068
+
15069
+ if (!*ctx_eval) {
15070
+ fprintf(stderr, "%s: failed to create ggml context\n", __func__);
15071
+ return result;
15072
+ }
15073
+ }
15074
+
15075
+ // leafs
15076
+ {
15077
+ uint32_t type;
15078
+ uint32_t op;
15079
+ uint32_t n_dims;
15080
+
15081
+ for (uint32_t i = 0; i < n_leafs; ++i) {
15082
+ type = *(const uint32_t *) ptr; ptr += sizeof(type);
15083
+ op = *(const uint32_t *) ptr; ptr += sizeof(op);
15084
+ n_dims = *(const uint32_t *) ptr; ptr += sizeof(n_dims);
15085
+
15086
+ int64_t ne[GGML_MAX_DIMS];
15087
+ size_t nb[GGML_MAX_DIMS];
15088
+
15089
+ for (int j = 0; j < GGML_MAX_DIMS; ++j) {
15090
+ uint64_t ne_cur;
15091
+ uint64_t nb_cur;
15092
+
15093
+ ne_cur = *(const uint64_t *) ptr; ptr += sizeof(ne_cur);
15094
+ nb_cur = *(const uint64_t *) ptr; ptr += sizeof(nb_cur);
15095
+
15096
+ ne[j] = ne_cur;
15097
+ nb[j] = nb_cur;
15098
+ }
15099
+
15100
+ struct ggml_tensor * tensor = ggml_new_tensor(*ctx_eval, (enum ggml_type) type, n_dims, ne);
15101
+
15102
+ tensor->op = (enum ggml_op) op;
15103
+
15104
+ uint64_t ptr_cur = *(const uint64_t *) ptr; ptr += sizeof(ptr_cur);
15105
+
15106
+ memcpy(tensor->name, ptr, GGML_MAX_NAME); ptr += GGML_MAX_NAME;
15107
+
15108
+ tensor->data = (void *) ptr;
15109
+
15110
+ for (int j = 0; j < GGML_MAX_DIMS; ++j) {
15111
+ tensor->nb[j] = nb[j];
15112
+ }
15113
+
15114
+ result.leafs[i] = tensor;
15115
+
15116
+ ptr += ggml_nbytes(tensor);
15117
+
15118
+ fprintf(stderr, "%s: loaded leaf %d: '%16s', %3d dims, %9zu bytes\n", __func__, i, tensor->name, n_dims, ggml_nbytes(tensor));
15119
+ }
15120
+ }
15121
+
15122
+ ggml_set_no_alloc(*ctx_eval, false);
15123
+
15124
+ // nodes
15125
+ {
15126
+ uint32_t type;
15127
+ uint32_t op;
15128
+ uint32_t n_dims;
15129
+
15130
+ for (uint32_t i = 0; i < n_nodes; ++i) {
15131
+ type = *(const uint32_t *) ptr; ptr += sizeof(type);
15132
+ op = *(const uint32_t *) ptr; ptr += sizeof(op);
15133
+ n_dims = *(const uint32_t *) ptr; ptr += sizeof(n_dims);
15134
+
15135
+ enum ggml_op eop = (enum ggml_op) op;
15136
+
15137
+ int64_t ne[GGML_MAX_DIMS];
15138
+ size_t nb[GGML_MAX_DIMS];
15139
+
15140
+ for (int j = 0; j < GGML_MAX_DIMS; ++j) {
15141
+ uint64_t ne_cur;
15142
+ uint64_t nb_cur;
15143
+
15144
+ ne_cur = *(const uint64_t *) ptr; ptr += sizeof(ne_cur);
15145
+ nb_cur = *(const uint64_t *) ptr; ptr += sizeof(nb_cur);
15146
+
15147
+ ne[j] = ne_cur;
15148
+ nb[j] = nb_cur;
15149
+ }
15150
+
15151
+ uint64_t ptr_cur = *(const uint64_t *) ptr; ptr += sizeof(ptr_cur); // TODO: not yet used
15152
+
15153
+ const char * ptr_name = ptr; ptr += GGML_MAX_NAME;
15154
+
15155
+ const int32_t * ptr_arg_idx = (const int32_t *) ptr; ptr += (2 + GGML_MAX_OPT)*sizeof(int32_t);
15156
+
15157
+ struct ggml_tensor * args[2 + GGML_MAX_OPT] = { NULL };
15158
+
15159
+ // parse args
15160
+ for (int j = 0; j < 2 + GGML_MAX_OPT; ++j) {
15161
+ const int32_t arg_idx = ptr_arg_idx[j];
15162
+
15163
+ if (arg_idx == -1) {
15164
+ continue;
15165
+ }
15166
+
15167
+ if (arg_idx < GGML_MAX_NODES) {
15168
+ args[j] = result.leafs[arg_idx];
15169
+ } else {
15170
+ args[j] = result.nodes[arg_idx - GGML_MAX_NODES];
15171
+ }
15172
+ }
15173
+
15174
+ // create the tensor
15175
+ // "view" operations are handled differently
15176
+ // TODO: handle inplace ops - currently a copy is always made
15177
+
15178
+ struct ggml_tensor * tensor = NULL;
15179
+
15180
+ switch (eop) {
15181
+ // TODO: implement other view ops
15182
+ case GGML_OP_RESHAPE:
15183
+ {
15184
+ tensor = ggml_reshape_4d(*ctx_eval, args[0], ne[0], ne[1], ne[2], ne[3]);
15185
+ } break;
15186
+ case GGML_OP_VIEW:
15187
+ {
15188
+ tensor = ggml_view_4d(*ctx_eval, args[0], ne[0], ne[1], ne[2], ne[3], 0, 0, 0, 0);
15189
+
15190
+ uint64_t offs;
15191
+ memcpy(&offs, args[2]->data, sizeof(offs));
15192
+
15193
+ tensor->data = ((char *) tensor->data) + offs;
15194
+ } break;
15195
+ case GGML_OP_TRANSPOSE:
15196
+ {
15197
+ tensor = ggml_transpose(*ctx_eval, args[0]);
15198
+ } break;
15199
+ case GGML_OP_PERMUTE:
15200
+ {
15201
+ tensor = ggml_view_4d(*ctx_eval, args[0], ne[0], ne[1], ne[2], ne[3], 0, 0, 0, 0);
15202
+ } break;
15203
+ default:
15204
+ {
15205
+ tensor = ggml_new_tensor(*ctx_eval, (enum ggml_type) type, n_dims, ne);
15206
+
15207
+ tensor->op = eop;
15208
+ } break;
15209
+ }
15210
+
15211
+ memcpy(tensor->name, ptr_name, GGML_MAX_NAME);
15212
+
15213
+ for (int j = 0; j < GGML_MAX_DIMS; ++j) {
15214
+ tensor->nb[j] = nb[j];
15215
+ }
15216
+
15217
+ tensor->src0 = args[0];
15218
+ tensor->src1 = args[1];
15219
+
15220
+ for (int j = 0; j < GGML_MAX_OPT; ++j) {
15221
+ tensor->opt[j] = args[2 + j];
15222
+ }
15223
+
15224
+ result.nodes[i] = tensor;
15225
+
15226
+ fprintf(stderr, "%s: loaded node %d: '%16s', %3d dims, %9zu bytes\n", __func__, i, tensor->name, n_dims, ggml_nbytes(tensor));
15227
+ }
15228
+ }
15229
+ }
15230
+
15231
+ return result;
15232
+ }
15233
+
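
Together the two functions let a finished graph be serialized and re-instantiated without the code that built it: ctx_data keeps the raw file bytes alive, while ctx_eval holds the reconstructed tensors. A minimal round-trip sketch using the signatures above (graph construction and error handling are elided; the file name and the looked-up tensor name are illustrative):

    #include "ggml.h"
    #include <stdio.h>

    // Assumes `gf` is a ggml_cgraph that has already been built elsewhere.
    static void save_and_reload(struct ggml_cgraph * gf) {
        ggml_graph_export(gf, "model.ggml-graph");

        struct ggml_context * ctx_data = NULL; // owns the raw bytes read from the file
        struct ggml_context * ctx_eval = NULL; // owns the reconstructed tensors

        struct ggml_cgraph gf2 = ggml_graph_import("model.ggml-graph", &ctx_data, &ctx_eval);

        printf("reloaded graph: %d leafs, %d nodes\n", gf2.n_leafs, gf2.n_nodes);

        // names like "node_0" are auto-assigned by ggml_visit_parents when a tensor
        // was not named explicitly; use your own names where possible
        struct ggml_tensor * out = ggml_graph_get_tensor(&gf2, "node_0");
        (void) out;

        ggml_free(ctx_eval);
        ggml_free(ctx_data);
    }
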
14513
15234
  void ggml_graph_print(const struct ggml_cgraph * cgraph) {
14514
15235
  int64_t perf_total_per_op_us[GGML_OP_COUNT] = {0};
14515
15236
 
@@ -14527,7 +15248,7 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) {
14527
15248
  GGML_PRINT(" - %3d: [ %5" PRId64 ", %5" PRId64 ", %5" PRId64 "] %16s %s (%3d) cpu = %7.3f / %7.3f ms, wall = %7.3f / %7.3f ms\n",
14528
15249
  i,
14529
15250
  node->ne[0], node->ne[1], node->ne[2],
14530
- GGML_OP_LABEL[node->op], node->is_param ? "x" : node->grad ? "g" : " ", node->perf_runs,
15251
+ GGML_OP_NAME[node->op], node->is_param ? "x" : node->grad ? "g" : " ", node->perf_runs,
14531
15252
  (double) node->perf_cycles / (double) ggml_cycles_per_ms(),
14532
15253
  (double) node->perf_cycles / (double) ggml_cycles_per_ms() / (double) node->perf_runs,
14533
15254
  (double) node->perf_time_us / 1000.0,
@@ -14541,7 +15262,7 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) {
14541
15262
  GGML_PRINT(" - %3d: [ %5" PRId64 ", %5" PRId64 "] %8s\n",
14542
15263
  i,
14543
15264
  node->ne[0], node->ne[1],
14544
- GGML_OP_LABEL[node->op]);
15265
+ GGML_OP_NAME[node->op]);
14545
15266
  }
14546
15267
 
14547
15268
  for (int i = 0; i < GGML_OP_COUNT; i++) {
@@ -14549,7 +15270,7 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) {
14549
15270
  continue;
14550
15271
  }
14551
15272
 
14552
- GGML_PRINT("perf_total_per_op_us[%16s] = %7.3f ms\n", GGML_OP_LABEL[i], (double) perf_total_per_op_us[i] / 1000.0);
15273
+ GGML_PRINT("perf_total_per_op_us[%16s] = %7.3f ms\n", GGML_OP_NAME[i], (double) perf_total_per_op_us[i] / 1000.0);
14553
15274
  }
14554
15275
 
14555
15276
  GGML_PRINT("========================================\n");
@@ -15548,6 +16269,38 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i
15548
16269
  block_q8_0 * block = (block_q8_0*)dst + start / QK8_0;
15549
16270
  result = ggml_quantize_q8_0(src + start, block, n, n, hist);
15550
16271
  } break;
16272
+ #ifdef GGML_USE_K_QUANTS
16273
+ case GGML_TYPE_Q2_K:
16274
+ {
16275
+ GGML_ASSERT(start % QK_K == 0);
16276
+ block_q2_K * block = (block_q2_K*)dst + start / QK_K;
16277
+ result = ggml_quantize_q2_K(src + start, block, n, n, hist);
16278
+ } break;
16279
+ case GGML_TYPE_Q3_K:
16280
+ {
16281
+ GGML_ASSERT(start % QK_K == 0);
16282
+ block_q3_K * block = (block_q3_K*)dst + start / QK_K;
16283
+ result = ggml_quantize_q3_K(src + start, block, n, n, hist);
16284
+ } break;
16285
+ case GGML_TYPE_Q4_K:
16286
+ {
16287
+ GGML_ASSERT(start % QK_K == 0);
16288
+ block_q4_K * block = (block_q4_K*)dst + start / QK_K;
16289
+ result = ggml_quantize_q4_K(src + start, block, n, n, hist);
16290
+ } break;
16291
+ case GGML_TYPE_Q5_K:
16292
+ {
16293
+ GGML_ASSERT(start % QK_K == 0);
16294
+ block_q5_K * block = (block_q5_K*)dst + start / QK_K;
16295
+ result = ggml_quantize_q5_K(src + start, block, n, n, hist);
16296
+ } break;
16297
+ case GGML_TYPE_Q6_K:
16298
+ {
16299
+ GGML_ASSERT(start % QK_K == 0);
16300
+ block_q6_K * block = (block_q6_K*)dst + start / QK_K;
16301
+ result = ggml_quantize_q6_K(src + start, block, n, n, hist);
16302
+ } break;
16303
+ #endif
15551
16304
  default:
15552
16305
  assert(false);
15553
16306
  }
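
With the cases above, ggml_quantize_chunk can emit the k-quant formats directly. A hedged wrapper sketch: the trailing parameters (start offset, element count, int64 histogram) are inferred from how the other cases use them, since the signature line is truncated in this hunk, and start must be a multiple of QK_K (256) per the new asserts:

    #include "ggml.h"
    #include <stdint.h>

    // Quantize a slice of an F32 weight row to Q4_K. The (type, src, dst, start, n, hist)
    // parameter order mirrors the calls dispatched above; hist collects per-bucket counts.
    static size_t quantize_chunk_q4_K(const float * src, void * dst, int start, int n) {
        int64_t hist[16] = {0};
        return ggml_quantize_chunk(GGML_TYPE_Q4_K, src, dst, start, n, hist);
    }
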