llama_cpp 0.1.3 → 0.2.0 (changes to the vendored ggml.c)

This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
@@ -3,6 +3,10 @@
3
3
 
4
4
  #include "ggml.h"
5
5
 
6
+ #ifdef GGML_USE_K_QUANTS
7
+ #include "k_quants.h"
8
+ #endif
9
+
6
10
  #if defined(_MSC_VER) || defined(__MINGW32__)
7
11
  #include <malloc.h> // using malloc.h with MSC/MINGW
8
12
  #elif !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__)
@@ -21,6 +25,10 @@
21
25
  #include <float.h>
22
26
  #include <limits.h>
23
27
 
28
+ #ifdef GGML_USE_METAL
29
+ #include <unistd.h>
30
+ #endif
31
+
24
32
  // if C99 - static_assert is noop
25
33
  // ref: https://stackoverflow.com/a/53923785/4039976
26
34
  #ifndef static_assert
@@ -121,7 +129,11 @@ typedef void* thread_ret_t;
121
129
  #else
122
130
  inline static void* ggml_aligned_malloc(size_t size) {
123
131
  void* aligned_memory = NULL;
132
+ #ifdef GGML_USE_METAL
133
+ int result = posix_memalign(&aligned_memory, getpagesize(), size);
134
+ #else
124
135
  int result = posix_memalign(&aligned_memory, GGML_MEM_ALIGN, size);
136
+ #endif
125
137
  if (result != 0) {
126
138
  // Handle allocation failure
127
139
  return NULL;
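
Note: with GGML_USE_METAL defined, ggml_aligned_malloc switches from GGML_MEM_ALIGN to page alignment via getpagesize(), since host allocations that may later be wrapped as Metal buffers need to start on a page boundary. A minimal sketch of that policy follows; aligned_alloc_for_backend is a hypothetical helper, not part of ggml.

    #include <stdlib.h>   /* posix_memalign */
    #include <unistd.h>   /* getpagesize    */

    /* Page-align when the buffer may back a Metal buffer, otherwise fall back
     * to a small fixed alignment. Hypothetical helper for illustration only. */
    static void * aligned_alloc_for_backend(size_t size, int use_metal) {
        void * ptr = NULL;
        const size_t align = use_metal ? (size_t) getpagesize() : 16;
        if (posix_memalign(&ptr, align, size) != 0) {
            return NULL;   /* allocation failed */
        }
        return ptr;
    }
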
@@ -186,10 +198,12 @@ typedef double ggml_float;
186
198
  #if defined(_MSC_VER) || defined(__MINGW32__)
187
199
  #include <intrin.h>
188
200
  #else
201
+ #if !defined(__riscv)
189
202
  #include <immintrin.h>
190
203
  #endif
191
204
  #endif
192
205
  #endif
206
+ #endif
193
207
 
194
208
  #ifdef __F16C__
195
209
 
@@ -401,21 +415,27 @@ void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, size_t n) {
401
415
  //
402
416
 
403
417
  #if defined(_MSC_VER) || defined(__MINGW32__)
404
- static int64_t timer_freq;
418
+ static int64_t timer_freq, timer_start;
405
419
  void ggml_time_init(void) {
406
- LARGE_INTEGER frequency;
407
- QueryPerformanceFrequency(&frequency);
408
- timer_freq = frequency.QuadPart;
420
+ LARGE_INTEGER t;
421
+ QueryPerformanceFrequency(&t);
422
+ timer_freq = t.QuadPart;
423
+
424
+ // The multiplication by 1000 or 1000000 below can cause an overflow if timer_freq
425
+ // and the uptime is high enough.
426
+ // We subtract the program start time to reduce the likelihood of that happening.
427
+ QueryPerformanceCounter(&t);
428
+ timer_start = t.QuadPart;
409
429
  }
410
430
  int64_t ggml_time_ms(void) {
411
431
  LARGE_INTEGER t;
412
432
  QueryPerformanceCounter(&t);
413
- return (t.QuadPart * 1000) / timer_freq;
433
+ return ((t.QuadPart-timer_start) * 1000) / timer_freq;
414
434
  }
415
435
  int64_t ggml_time_us(void) {
416
436
  LARGE_INTEGER t;
417
437
  QueryPerformanceCounter(&t);
418
- return (t.QuadPart * 1000000) / timer_freq;
438
+ return ((t.QuadPart-timer_start) * 1000000) / timer_freq;
419
439
  }
420
440
  #else
421
441
  void ggml_time_init(void) {}
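
Note: QueryPerformanceCounter ticks at timer_freq per second (often 10 MHz on recent Windows), so multiplying the raw counter by 1000 or 1000000 can overflow int64_t once the machine has been up long enough; subtracting the counter value captured in ggml_time_init keeps the product small. A rough bound on the failure point, assuming a 10 MHz frequency for illustration:

    #include <stdint.h>
    #include <stdio.h>

    int main(void) {
        const int64_t freq = 10000000;                     /* assumed 10 MHz tick rate */
        /* counter * 1000000 overflows once counter > INT64_MAX / 1000000 */
        const int64_t limit_ticks = INT64_MAX / 1000000;   /* ~9.2e12 ticks            */
        const double  limit_days  = (double) limit_ticks / freq / 86400.0;
        printf("overflow after ~%.1f days of uptime\n", limit_days);   /* ~10.7 */
        return 0;
    }
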
@@ -472,6 +492,8 @@ static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float);
472
492
  // quantization
473
493
  //
474
494
 
495
+ #define MM256_SET_M128I(a, b) _mm256_insertf128_si256(_mm256_castsi128_si256(b), (a), 1)
496
+
475
497
  #if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__)
476
498
  // multiply int8_t, add results pairwise twice
477
499
  static inline __m128i mul_sum_i8_pairs(const __m128i x, const __m128i y) {
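
Note: MM256_SET_M128I reproduces _mm256_set_m128i (put one 128-bit half in the low lane and one in the high lane of a 256-bit register) using only _mm256_castsi128_si256 and _mm256_insertf128_si256, which also works on older compilers that lack _mm256_set_m128i. A small sketch of what the macro computes, assuming the code is built with AVX enabled:

    #include <immintrin.h>

    #define MM256_SET_M128I(a, b) _mm256_insertf128_si256(_mm256_castsi128_si256(b), (a), 1)

    /* Build a 256-bit vector whose low half is `lo` and high half is `hi`;
     * equivalent to _mm256_set_m128i(hi, lo) where that intrinsic exists. */
    static inline __m256i combine_lanes(__m128i hi, __m128i lo) {
        return MM256_SET_M128I(hi, lo);
    }
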
@@ -531,7 +553,7 @@ static inline __m256i bytes_from_bits_32(const uint8_t * x) {
531
553
  static inline __m256i bytes_from_nibbles_32(const uint8_t * rsi)
532
554
  {
533
555
  const __m128i tmp = _mm_loadu_si128((const __m128i *)rsi);
534
- const __m256i bytes = _mm256_set_m128i(_mm_srli_epi16(tmp, 4), tmp);
556
+ const __m256i bytes = MM256_SET_M128I(_mm_srli_epi16(tmp, 4), tmp);
535
557
  const __m256i lowMask = _mm256_set1_epi8( 0xF );
536
558
  return _mm256_and_si256(lowMask, bytes);
537
559
  }
@@ -604,7 +626,7 @@ static inline __m256i bytes_from_bits_32(const uint8_t * x) {
604
626
  bytesh = _mm_or_si128(bytesh, bit_mask);
605
627
  bytesl = _mm_cmpeq_epi8(bytesl, _mm_set1_epi64x(-1));
606
628
  bytesh = _mm_cmpeq_epi8(bytesh, _mm_set1_epi64x(-1));
607
- return _mm256_set_m128i(bytesh, bytesl);
629
+ return MM256_SET_M128I(bytesh, bytesl);
608
630
  }
609
631
 
610
632
  // Unpack 32 4-bit fields into 32 bytes
@@ -617,7 +639,7 @@ static inline __m256i bytes_from_nibbles_32(const uint8_t * rsi)
617
639
  const __m128i lowMask = _mm_set1_epi8(0xF);
618
640
  tmpl = _mm_and_si128(lowMask, tmpl);
619
641
  tmph = _mm_and_si128(lowMask, tmph);
620
- return _mm256_set_m128i(tmph, tmpl);
642
+ return MM256_SET_M128I(tmph, tmpl);
621
643
  }
622
644
 
623
645
  // add int16_t pairwise and return as float vector
@@ -625,7 +647,7 @@ static inline __m256 sum_i16_pairs_float(const __m128i xh, const __m128i xl) {
625
647
  const __m128i ones = _mm_set1_epi16(1);
626
648
  const __m128i summed_pairsl = _mm_madd_epi16(ones, xl);
627
649
  const __m128i summed_pairsh = _mm_madd_epi16(ones, xh);
628
- const __m256i summed_pairs = _mm256_set_m128i(summed_pairsh, summed_pairsl);
650
+ const __m256i summed_pairs = MM256_SET_M128I(summed_pairsh, summed_pairsl);
629
651
  return _mm256_cvtepi32_ps(summed_pairs);
630
652
  }
631
653
 
@@ -1563,6 +1585,48 @@ static const quantize_fns_t quantize_fns[GGML_TYPE_COUNT] = {
1563
1585
  .vec_dot_q = NULL, // TODO
1564
1586
  .vec_dot_type = GGML_TYPE_Q8_1,
1565
1587
  },
1588
+ #ifdef GGML_USE_K_QUANTS
1589
+ [GGML_TYPE_Q2_K] = {
1590
+ .dequantize_row_q = (dequantize_row_q_t) dequantize_row_q2_K,
1591
+ .quantize_row_q = quantize_row_q2_K,
1592
+ .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q2_K_reference,
1593
+ .quantize_row_q_dot = quantize_row_q8_K,
1594
+ .vec_dot_q = ggml_vec_dot_q2_K_q8_K,
1595
+ .vec_dot_type = GGML_TYPE_Q8_K,
1596
+ },
1597
+ [GGML_TYPE_Q3_K] = {
1598
+ .dequantize_row_q = (dequantize_row_q_t) dequantize_row_q3_K,
1599
+ .quantize_row_q = quantize_row_q3_K,
1600
+ .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q3_K_reference,
1601
+ .quantize_row_q_dot = quantize_row_q8_K,
1602
+ .vec_dot_q = ggml_vec_dot_q3_K_q8_K,
1603
+ .vec_dot_type = GGML_TYPE_Q8_K,
1604
+ },
1605
+ [GGML_TYPE_Q4_K] = {
1606
+ .dequantize_row_q = (dequantize_row_q_t) dequantize_row_q4_K,
1607
+ .quantize_row_q = quantize_row_q4_K,
1608
+ .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q4_K_reference,
1609
+ .quantize_row_q_dot = quantize_row_q8_K,
1610
+ .vec_dot_q = ggml_vec_dot_q4_K_q8_K,
1611
+ .vec_dot_type = GGML_TYPE_Q8_K,
1612
+ },
1613
+ [GGML_TYPE_Q5_K] = {
1614
+ .dequantize_row_q = (dequantize_row_q_t) dequantize_row_q5_K,
1615
+ .quantize_row_q = quantize_row_q5_K,
1616
+ .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q5_K_reference,
1617
+ .quantize_row_q_dot = quantize_row_q8_K,
1618
+ .vec_dot_q = ggml_vec_dot_q5_K_q8_K,
1619
+ .vec_dot_type = GGML_TYPE_Q8_K,
1620
+ },
1621
+ [GGML_TYPE_Q6_K] = {
1622
+ .dequantize_row_q = (dequantize_row_q_t) dequantize_row_q6_K,
1623
+ .quantize_row_q = quantize_row_q6_K,
1624
+ .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q6_K_reference,
1625
+ .quantize_row_q_dot = quantize_row_q8_K,
1626
+ .vec_dot_q = ggml_vec_dot_q6_K_q8_K,
1627
+ .vec_dot_type = GGML_TYPE_Q8_K,
1628
+ },
1629
+ #endif
1566
1630
  };
1567
1631
 
1568
1632
  // For internal test use
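
Note: the GGML_TYPE_Q2_K through GGML_TYPE_Q6_K entries register the k-quant kernels in the same quantize_fns[] table the existing formats use, so the rest of ggml keeps dispatching on the tensor type instead of special-casing each scheme. A standalone sketch of that table pattern (all names here are illustrative, not ggml's):

    #include <assert.h>

    typedef void (*dequantize_row_fn)(const void * src, float * dst, int k);

    enum toy_type { TOY_Q_A = 0, TOY_Q_B, TOY_TYPE_COUNT };

    static void dequant_row_a(const void * src, float * dst, int k) { (void) src; for (int i = 0; i < k; ++i) dst[i] = 0.0f; }
    static void dequant_row_b(const void * src, float * dst, int k) { (void) src; for (int i = 0; i < k; ++i) dst[i] = 1.0f; }

    /* one entry per type, filled with designated initializers like quantize_fns[] */
    static const dequantize_row_fn dequant_fns[TOY_TYPE_COUNT] = {
        [TOY_Q_A] = dequant_row_a,
        [TOY_Q_B] = dequant_row_b,
    };

    static void dequantize_row(enum toy_type type, const void * src, float * dst, int k) {
        assert(dequant_fns[type] != NULL);   /* unregistered types are a bug */
        dequant_fns[type](src, dst, k);
    }
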
@@ -2288,7 +2352,7 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
2288
2352
  const __m128i i32_1 = mul_sum_i8_pairs(bx, by);
2289
2353
 
2290
2354
  // Convert int32_t to float
2291
- __m256 p = _mm256_cvtepi32_ps(_mm256_set_m128i(i32_0, i32_1));
2355
+ __m256 p = _mm256_cvtepi32_ps(MM256_SET_M128I(i32_0, i32_1));
2292
2356
 
2293
2357
  // Apply the scale, and accumulate
2294
2358
  acc = _mm256_add_ps(_mm256_mul_ps( d, p ), acc);
@@ -2764,7 +2828,7 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void *
2764
2828
  __m128i bxh = _mm256_extractf128_si256(bx, 1);
2765
2829
  bxl = _mm_or_si128(bxl, bxhil);
2766
2830
  bxh = _mm_or_si128(bxh, bxhih);
2767
- bx = _mm256_set_m128i(bxh, bxl);
2831
+ bx = MM256_SET_M128I(bxh, bxl);
2768
2832
 
2769
2833
  const __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs);
2770
2834
 
@@ -3020,7 +3084,7 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *
3020
3084
  __m128i bxh = _mm256_extractf128_si256(bx, 1);
3021
3085
  bxl = _mm_or_si128(bxl, bxhil);
3022
3086
  bxh = _mm_or_si128(bxh, bxhih);
3023
- bx = _mm256_set_m128i(bxh, bxl);
3087
+ bx = MM256_SET_M128I(bxh, bxl);
3024
3088
 
3025
3089
  const __m256 dy = _mm256_set1_ps(y[i].d);
3026
3090
  const __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs);
@@ -3442,11 +3506,19 @@ static const int GGML_BLCK_SIZE[GGML_TYPE_COUNT] = {
3442
3506
  [GGML_TYPE_Q5_1] = QK5_1,
3443
3507
  [GGML_TYPE_Q8_0] = QK8_0,
3444
3508
  [GGML_TYPE_Q8_1] = QK8_1,
3509
+ #ifdef GGML_USE_K_QUANTS
3510
+ [GGML_TYPE_Q2_K] = QK_K,
3511
+ [GGML_TYPE_Q3_K] = QK_K,
3512
+ [GGML_TYPE_Q4_K] = QK_K,
3513
+ [GGML_TYPE_Q5_K] = QK_K,
3514
+ [GGML_TYPE_Q6_K] = QK_K,
3515
+ [GGML_TYPE_Q8_K] = QK_K,
3516
+ #endif
3445
3517
  [GGML_TYPE_I8] = 1,
3446
3518
  [GGML_TYPE_I16] = 1,
3447
3519
  [GGML_TYPE_I32] = 1,
3448
3520
  };
3449
- static_assert(GGML_TYPE_COUNT == 13, "GGML_BLCK_SIZE is outdated");
3521
+ static_assert(GGML_TYPE_COUNT == 19, "GGML_BLCK_SIZE is outdated");
3450
3522
 
3451
3523
  static const size_t GGML_TYPE_SIZE[GGML_TYPE_COUNT] = {
3452
3524
  [GGML_TYPE_F32] = sizeof(float),
@@ -3457,11 +3529,19 @@ static const size_t GGML_TYPE_SIZE[GGML_TYPE_COUNT] = {
3457
3529
  [GGML_TYPE_Q5_1] = sizeof(block_q5_1),
3458
3530
  [GGML_TYPE_Q8_0] = sizeof(block_q8_0),
3459
3531
  [GGML_TYPE_Q8_1] = sizeof(block_q8_1),
3532
+ #ifdef GGML_USE_K_QUANTS
3533
+ [GGML_TYPE_Q2_K] = sizeof(block_q2_K),
3534
+ [GGML_TYPE_Q3_K] = sizeof(block_q3_K),
3535
+ [GGML_TYPE_Q4_K] = sizeof(block_q4_K),
3536
+ [GGML_TYPE_Q5_K] = sizeof(block_q5_K),
3537
+ [GGML_TYPE_Q6_K] = sizeof(block_q6_K),
3538
+ [GGML_TYPE_Q8_K] = sizeof(block_q8_K),
3539
+ #endif
3460
3540
  [GGML_TYPE_I8] = sizeof(int8_t),
3461
3541
  [GGML_TYPE_I16] = sizeof(int16_t),
3462
3542
  [GGML_TYPE_I32] = sizeof(int32_t),
3463
3543
  };
3464
- static_assert(GGML_TYPE_COUNT == 13, "GGML_TYPE_SIZE is outdated");
3544
+ static_assert(GGML_TYPE_COUNT == 19, "GGML_TYPE_SIZE is outdated");
3465
3545
 
3466
3546
 
3467
3547
  static const char * GGML_TYPE_NAME[GGML_TYPE_COUNT] = {
@@ -3473,11 +3553,17 @@ static const char * GGML_TYPE_NAME[GGML_TYPE_COUNT] = {
3473
3553
  [GGML_TYPE_Q5_1] = "q5_1",
3474
3554
  [GGML_TYPE_Q8_0] = "q8_0",
3475
3555
  [GGML_TYPE_Q8_1] = "q8_1",
3556
+ [GGML_TYPE_Q2_K] = "q2_K",
3557
+ [GGML_TYPE_Q3_K] = "q3_K",
3558
+ [GGML_TYPE_Q4_K] = "q4_K",
3559
+ [GGML_TYPE_Q5_K] = "q5_K",
3560
+ [GGML_TYPE_Q6_K] = "q6_K",
3561
+ [GGML_TYPE_Q8_K] = "q8_K",
3476
3562
  [GGML_TYPE_I8] = "i8",
3477
3563
  [GGML_TYPE_I16] = "i16",
3478
3564
  [GGML_TYPE_I32] = "i32",
3479
3565
  };
3480
- static_assert(GGML_TYPE_COUNT == 13, "GGML_TYPE_NAME is outdated");
3566
+ static_assert(GGML_TYPE_COUNT == 19, "GGML_TYPE_NAME is outdated");
3481
3567
 
3482
3568
  static bool GGML_IS_QUANTIZED[GGML_TYPE_COUNT] = {
3483
3569
  [GGML_TYPE_F32] = false,
@@ -3488,13 +3574,19 @@ static bool GGML_IS_QUANTIZED[GGML_TYPE_COUNT] = {
3488
3574
  [GGML_TYPE_Q5_1] = true,
3489
3575
  [GGML_TYPE_Q8_0] = true,
3490
3576
  [GGML_TYPE_Q8_1] = true,
3577
+ [GGML_TYPE_Q2_K] = true,
3578
+ [GGML_TYPE_Q3_K] = true,
3579
+ [GGML_TYPE_Q4_K] = true,
3580
+ [GGML_TYPE_Q5_K] = true,
3581
+ [GGML_TYPE_Q6_K] = true,
3582
+ [GGML_TYPE_Q8_K] = true,
3491
3583
  [GGML_TYPE_I8] = false,
3492
3584
  [GGML_TYPE_I16] = false,
3493
3585
  [GGML_TYPE_I32] = false,
3494
3586
  };
3495
- static_assert(GGML_TYPE_COUNT == 13, "GGML_IS_QUANTIZED is outdated");
3587
+ static_assert(GGML_TYPE_COUNT == 19, "GGML_IS_QUANTIZED is outdated");
3496
3588
 
3497
- static const char * GGML_OP_LABEL[GGML_OP_COUNT] = {
3589
+ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
3498
3590
  "NONE",
3499
3591
 
3500
3592
  "DUP",
@@ -3629,6 +3721,7 @@ struct ggml_context {
3629
3721
  void * mem_buffer;
3630
3722
  bool mem_buffer_owned;
3631
3723
  bool no_alloc;
3724
+ bool no_alloc_save; // this is used to save the no_alloc state when using scratch buffers
3632
3725
 
3633
3726
  int n_objects;
3634
3727
 
@@ -3645,26 +3738,6 @@ struct ggml_context_container {
3645
3738
  struct ggml_context context;
3646
3739
  };
3647
3740
 
3648
- //
3649
- // compute types
3650
- //
3651
-
3652
- enum ggml_task_type {
3653
- GGML_TASK_INIT = 0,
3654
- GGML_TASK_COMPUTE,
3655
- GGML_TASK_FINALIZE,
3656
- };
3657
-
3658
- struct ggml_compute_params {
3659
- enum ggml_task_type type;
3660
-
3661
- int ith, nth;
3662
-
3663
- // work buffer for all threads
3664
- size_t wsize;
3665
- void * wdata;
3666
- };
3667
-
3668
3741
  //
3669
3742
  // ggml state
3670
3743
  //
@@ -3721,7 +3794,7 @@ int64_t ggml_nelements(const struct ggml_tensor * tensor) {
3721
3794
  return tensor->ne[0]*tensor->ne[1]*tensor->ne[2]*tensor->ne[3];
3722
3795
  }
3723
3796
 
3724
- int ggml_nrows(const struct ggml_tensor * tensor) {
3797
+ int64_t ggml_nrows(const struct ggml_tensor * tensor) {
3725
3798
  static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
3726
3799
 
3727
3800
  return tensor->ne[1]*tensor->ne[2]*tensor->ne[3];
@@ -3730,7 +3803,20 @@ int ggml_nrows(const struct ggml_tensor * tensor) {
3730
3803
  size_t ggml_nbytes(const struct ggml_tensor * tensor) {
3731
3804
  static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
3732
3805
 
3733
- return (ggml_nelements(tensor)*GGML_TYPE_SIZE[tensor->type])/GGML_BLCK_SIZE[tensor->type];
3806
+ // this should handle cases where the tensor is not contiguous in memory
3807
+ // probaby just:
3808
+ //
3809
+ // return tensor->ne[3]*tensor->nb[3]
3810
+ //
3811
+ // is enough, but just in case, adding the second part
3812
+
3813
+ return MAX(tensor->ne[3]*tensor->nb[3], (ggml_nelements(tensor)*GGML_TYPE_SIZE[tensor->type])/GGML_BLCK_SIZE[tensor->type]);
3814
+ }
3815
+
3816
+ size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split) {
3817
+ static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
3818
+
3819
+ return (nrows_split*tensor->ne[0]*GGML_TYPE_SIZE[tensor->type])/GGML_BLCK_SIZE[tensor->type];
3734
3820
  }
3735
3821
 
3736
3822
  int ggml_blck_size(enum ggml_type type) {
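
Note: ggml_nbytes now returns the maximum of ne[3]*nb[3] (which covers non-contiguous tensors whose strides exceed the packed layout) and the packed size nelements * GGML_TYPE_SIZE / GGML_BLCK_SIZE, and the new ggml_nbytes_split applies the packed formula to a subset of rows. A worked example of the packed term, using the Q8_0 layout (blocks of 32 values, 34 bytes each: one fp16 scale plus 32 int8 quants):

    #include <stdio.h>
    #include <stddef.h>

    int main(void) {
        const size_t nelements = 4096;
        const size_t blck_size = 32;    /* QK8_0              */
        const size_t type_size = 34;    /* sizeof(block_q8_0) */
        const size_t nbytes    = nelements * type_size / blck_size;
        printf("%zu bytes\n", nbytes);  /* 4352               */
        return 0;
    }
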
@@ -3749,6 +3835,9 @@ const char * ggml_type_name(enum ggml_type type) {
3749
3835
  return GGML_TYPE_NAME[type];
3750
3836
  }
3751
3837
 
3838
+ const char * ggml_op_name(enum ggml_op op) {
3839
+ return GGML_OP_NAME[op];
3840
+ }
3752
3841
 
3753
3842
  size_t ggml_element_size(const struct ggml_tensor * tensor) {
3754
3843
  return GGML_TYPE_SIZE[tensor->type];
@@ -3796,6 +3885,11 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) {
3796
3885
  case GGML_FTYPE_MOSTLY_Q5_0: wtype = GGML_TYPE_Q5_0; break;
3797
3886
  case GGML_FTYPE_MOSTLY_Q5_1: wtype = GGML_TYPE_Q5_1; break;
3798
3887
  case GGML_FTYPE_MOSTLY_Q8_0: wtype = GGML_TYPE_Q8_0; break;
3888
+ case GGML_FTYPE_MOSTLY_Q2_K: wtype = GGML_TYPE_Q2_K; break;
3889
+ case GGML_FTYPE_MOSTLY_Q3_K: wtype = GGML_TYPE_Q3_K; break;
3890
+ case GGML_FTYPE_MOSTLY_Q4_K: wtype = GGML_TYPE_Q4_K; break;
3891
+ case GGML_FTYPE_MOSTLY_Q5_K: wtype = GGML_TYPE_Q5_K; break;
3892
+ case GGML_FTYPE_MOSTLY_Q6_K: wtype = GGML_TYPE_Q6_K; break;
3799
3893
  case GGML_FTYPE_UNKNOWN: wtype = GGML_TYPE_COUNT; break;
3800
3894
  case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16: wtype = GGML_TYPE_COUNT; break;
3801
3895
  }
@@ -3805,11 +3899,15 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) {
3805
3899
  return wtype;
3806
3900
  }
3807
3901
 
3808
- static inline bool ggml_is_transposed(const struct ggml_tensor * tensor) {
3902
+ size_t ggml_tensor_overhead(void) {
3903
+ return GGML_OBJECT_SIZE + GGML_TENSOR_SIZE + 16;
3904
+ }
3905
+
3906
+ bool ggml_is_transposed(const struct ggml_tensor * tensor) {
3809
3907
  return tensor->nb[0] > tensor->nb[1];
3810
3908
  }
3811
3909
 
3812
- static inline bool ggml_is_contiguous(const struct ggml_tensor * tensor) {
3910
+ bool ggml_is_contiguous(const struct ggml_tensor * tensor) {
3813
3911
  static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
3814
3912
 
3815
3913
  return
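
Note: ggml_tensor_overhead() exposes the fixed per-tensor cost (object header plus tensor struct plus alignment slack), which lets callers size a metadata-only context exactly; ggml_graph_import further down uses it for that purpose. A hedged sketch of the sizing pattern (make_meta_ctx is a hypothetical helper):

    #include "ggml.h"

    /* Create a context large enough for n_tensors tensor headers only;
     * with no_alloc set, tensor data must be provided separately. */
    static struct ggml_context * make_meta_ctx(int n_tensors) {
        struct ggml_init_params params = {
            .mem_size   = (size_t) n_tensors * ggml_tensor_overhead(),
            .mem_buffer = NULL,
            .no_alloc   = true,
        };
        return ggml_init(params);
    }
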
@@ -3958,6 +4056,7 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
3958
4056
  /*.mem_buffer =*/ params.mem_buffer ? params.mem_buffer : GGML_ALIGNED_MALLOC(mem_size),
3959
4057
  /*.mem_buffer_owned =*/ params.mem_buffer ? false : true,
3960
4058
  /*.no_alloc =*/ params.no_alloc,
4059
+ /*.no_alloc_save =*/ params.no_alloc,
3961
4060
  /*.n_objects =*/ 0,
3962
4061
  /*.objects_begin =*/ NULL,
3963
4062
  /*.objects_end =*/ NULL,
@@ -4017,17 +4116,36 @@ size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch)
4017
4116
  return result;
4018
4117
  }
4019
4118
 
4119
+ void ggml_set_no_alloc(struct ggml_context * ctx, bool no_alloc) {
4120
+ ctx->no_alloc = no_alloc;
4121
+ }
4122
+
4123
+ void * ggml_get_mem_buffer(struct ggml_context * ctx) {
4124
+ return ctx->mem_buffer;
4125
+ }
4126
+
4127
+ size_t ggml_get_mem_size(struct ggml_context * ctx) {
4128
+ return ctx->mem_size;
4129
+ }
4130
+
4020
4131
  // IMPORTANT:
4021
4132
  // when creating "opt" tensors, always save and load the scratch buffer
4022
4133
  // this is an error prone process, but it is necessary to support inplace
4023
4134
  // operators when using scratch buffers
4024
4135
  // TODO: implement a better way
4025
4136
  void ggml_scratch_save(struct ggml_context * ctx) {
4137
+ // this is needed to allow opt tensors to store their data
4138
+ // TODO: again, need to find a better way
4139
+ ctx->no_alloc_save = ctx->no_alloc;
4140
+ ctx->no_alloc = false;
4141
+
4026
4142
  ctx->scratch_save = ctx->scratch;
4027
4143
  ctx->scratch.data = NULL;
4028
4144
  }
4029
4145
 
4030
4146
  void ggml_scratch_load(struct ggml_context * ctx) {
4147
+ ctx->no_alloc = ctx->no_alloc_save;
4148
+
4031
4149
  ctx->scratch = ctx->scratch_save;
4032
4150
  }
4033
4151
 
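
Note: besides saving and restoring no_alloc around scratch use, this hunk adds three small public accessors: ggml_set_no_alloc, ggml_get_mem_buffer and ggml_get_mem_size. A brief usage sketch (describe_ctx is a hypothetical helper):

    #include "ggml.h"
    #include <stdio.h>

    /* Inspect a context's memory pool and switch it to metadata-only mode. */
    static void describe_ctx(struct ggml_context * ctx) {
        printf("pool at %p, %zu bytes\n", ggml_get_mem_buffer(ctx), ggml_get_mem_size(ctx));
        ggml_set_no_alloc(ctx, true);   /* later tensors get headers but no data */
    }
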
@@ -4061,7 +4179,7 @@ struct ggml_tensor * ggml_new_tensor_impl(
4061
4179
  struct ggml_object * const obj_new = (struct ggml_object *)(mem_buffer + cur_end);
4062
4180
 
4063
4181
  if (ctx->scratch.data == NULL || data != NULL) {
4064
- size_needed += sizeof(struct ggml_tensor);
4182
+ size_needed += GGML_TENSOR_SIZE;
4065
4183
 
4066
4184
  if (cur_end + size_needed + GGML_OBJECT_SIZE > ctx->mem_size) {
4067
4185
  GGML_PRINT("%s: not enough space in the context's memory pool (needed %zu, available %zu)\n",
@@ -4077,14 +4195,15 @@ struct ggml_tensor * ggml_new_tensor_impl(
4077
4195
  };
4078
4196
  } else {
4079
4197
  if (ctx->scratch.offs + size_needed > ctx->scratch.size) {
4080
- GGML_PRINT("%s: not enough space in the scratch memory\n", __func__);
4198
+ GGML_PRINT("%s: not enough space in the scratch memory pool (needed %zu, available %zu)\n",
4199
+ __func__, ctx->scratch.offs + size_needed, ctx->scratch.size);
4081
4200
  assert(false);
4082
4201
  return NULL;
4083
4202
  }
4084
4203
 
4085
- if (cur_end + sizeof(struct ggml_tensor) + GGML_OBJECT_SIZE > ctx->mem_size) {
4204
+ if (cur_end + GGML_TENSOR_SIZE + GGML_OBJECT_SIZE > ctx->mem_size) {
4086
4205
  GGML_PRINT("%s: not enough space in the context's memory pool (needed %zu, available %zu)\n",
4087
- __func__, cur_end + sizeof(struct ggml_tensor) + GGML_OBJECT_SIZE, ctx->mem_size);
4206
+ __func__, cur_end + GGML_TENSOR_SIZE + GGML_OBJECT_SIZE, ctx->mem_size);
4088
4207
  assert(false);
4089
4208
  return NULL;
4090
4209
  }
@@ -4093,7 +4212,7 @@ struct ggml_tensor * ggml_new_tensor_impl(
4093
4212
 
4094
4213
  *obj_new = (struct ggml_object) {
4095
4214
  .offs = cur_end + GGML_OBJECT_SIZE,
4096
- .size = sizeof(struct ggml_tensor),
4215
+ .size = GGML_TENSOR_SIZE,
4097
4216
  .next = NULL,
4098
4217
  };
4099
4218
 
@@ -4135,6 +4254,7 @@ struct ggml_tensor * ggml_new_tensor_impl(
4135
4254
  /*.perf_time_us =*/ 0,
4136
4255
  /*.data =*/ (data == NULL && !ctx->no_alloc) ? (void *)(result + 1) : data,
4137
4256
  /*.name =*/ { 0 },
4257
+ /*.extra =*/ NULL,
4138
4258
  /*.pad =*/ { 0 },
4139
4259
  };
4140
4260
 
@@ -4509,6 +4629,23 @@ struct ggml_tensor * ggml_view_tensor(
4509
4629
  return result;
4510
4630
  }
4511
4631
 
4632
+ struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name) {
4633
+ struct ggml_object * obj = ctx->objects_begin;
4634
+
4635
+ char * const mem_buffer = ctx->mem_buffer;
4636
+
4637
+ while (obj != NULL) {
4638
+ struct ggml_tensor * cur = (struct ggml_tensor *)(mem_buffer + obj->offs);
4639
+ if (strcmp(cur->name, name) == 0) {
4640
+ return cur;
4641
+ }
4642
+
4643
+ obj = obj->next;
4644
+ }
4645
+
4646
+ return NULL;
4647
+ }
4648
+
4512
4649
  ////////////////////////////////////////////////////////////////////////////////
4513
4650
 
4514
4651
  // ggml_dup
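
Note: ggml_get_tensor walks the context's object list and returns the first tensor whose name matches, which pairs naturally with ggml_set_name for retrieving tensors after a model or graph has been built. A hedged usage sketch (the tensor name and sizes below are only examples):

    #include "ggml.h"
    #include <stdio.h>

    static void lookup_example(struct ggml_context * ctx) {
        struct ggml_tensor * t = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 4096, 32000);
        ggml_set_name(t, "tok_embeddings.weight");

        struct ggml_tensor * found = ggml_get_tensor(ctx, "tok_embeddings.weight");
        if (found != NULL) {
            printf("found %s: %lld x %lld\n", found->name,
                   (long long) found->ne[0], (long long) found->ne[1]);
        }
    }
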
@@ -5763,10 +5900,18 @@ struct ggml_tensor * ggml_view_1d(
5763
5900
 
5764
5901
  struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 1, &ne0, (char *) a->data + offset);
5765
5902
 
5903
+ ggml_scratch_save(ctx);
5904
+
5905
+ struct ggml_tensor * offs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
5906
+ memcpy(offs->data, &offset, 2*sizeof(int32_t));
5907
+
5908
+ ggml_scratch_load(ctx);
5909
+
5766
5910
  result->op = GGML_OP_VIEW;
5767
5911
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5768
5912
  result->src0 = a;
5769
5913
  result->src1 = NULL;
5914
+ result->opt[0] = offs;
5770
5915
 
5771
5916
  if (is_node) {
5772
5917
  memcpy(result->padding, &offset, sizeof(offset));
@@ -5795,6 +5940,13 @@ struct ggml_tensor * ggml_view_2d(
5795
5940
 
5796
5941
  struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 2, ne, (char *) a->data + offset);
5797
5942
 
5943
+ ggml_scratch_save(ctx);
5944
+
5945
+ struct ggml_tensor * offs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
5946
+ memcpy(offs->data, &offset, 2*sizeof(int32_t));
5947
+
5948
+ ggml_scratch_load(ctx);
5949
+
5798
5950
  result->nb[1] = nb1;
5799
5951
  result->nb[2] = result->nb[1]*ne1;
5800
5952
  result->nb[3] = result->nb[2];
@@ -5803,6 +5955,7 @@ struct ggml_tensor * ggml_view_2d(
5803
5955
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5804
5956
  result->src0 = a;
5805
5957
  result->src1 = NULL;
5958
+ result->opt[0] = offs;
5806
5959
 
5807
5960
  if (is_node) {
5808
5961
  memcpy(result->padding, &offset, sizeof(offset));
@@ -5833,6 +5986,13 @@ struct ggml_tensor * ggml_view_3d(
5833
5986
 
5834
5987
  struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 3, ne, (char *) a->data + offset);
5835
5988
 
5989
+ ggml_scratch_save(ctx);
5990
+
5991
+ struct ggml_tensor * offs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
5992
+ memcpy(offs->data, &offset, 2*sizeof(int32_t));
5993
+
5994
+ ggml_scratch_load(ctx);
5995
+
5836
5996
  result->nb[1] = nb1;
5837
5997
  result->nb[2] = nb2;
5838
5998
  result->nb[3] = result->nb[2]*ne2;
@@ -5841,6 +6001,7 @@ struct ggml_tensor * ggml_view_3d(
5841
6001
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5842
6002
  result->src0 = a;
5843
6003
  result->src1 = NULL;
6004
+ result->opt[0] = offs;
5844
6005
 
5845
6006
  if (is_node) {
5846
6007
  memcpy(result->padding, &offset, sizeof(offset));
@@ -5873,6 +6034,13 @@ struct ggml_tensor * ggml_view_4d(
5873
6034
 
5874
6035
  struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 4, ne, (char *) a->data + offset);
5875
6036
 
6037
+ ggml_scratch_save(ctx);
6038
+
6039
+ struct ggml_tensor * offs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
6040
+ memcpy(offs->data, &offset, 2*sizeof(int32_t));
6041
+
6042
+ ggml_scratch_load(ctx);
6043
+
5876
6044
  result->nb[1] = nb1;
5877
6045
  result->nb[2] = nb2;
5878
6046
  result->nb[3] = nb3;
@@ -5881,6 +6049,7 @@ struct ggml_tensor * ggml_view_4d(
5881
6049
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5882
6050
  result->src0 = a;
5883
6051
  result->src1 = NULL;
6052
+ result->opt[0] = offs;
5884
6053
 
5885
6054
  if (is_node) {
5886
6055
  memcpy(result->padding, &offset, sizeof(offset));
@@ -6303,7 +6472,7 @@ struct ggml_tensor * ggml_alibi(
6303
6472
 
6304
6473
  ggml_scratch_save(ctx);
6305
6474
 
6306
- struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
6475
+ struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 3);
6307
6476
 
6308
6477
  ((int32_t *) b->data)[0] = n_past;
6309
6478
  ((int32_t *) b->data)[1] = n_head;
@@ -7545,6 +7714,11 @@ static void ggml_compute_forward_add(
7545
7714
  case GGML_TYPE_Q5_0:
7546
7715
  case GGML_TYPE_Q5_1:
7547
7716
  case GGML_TYPE_Q8_0:
7717
+ case GGML_TYPE_Q2_K:
7718
+ case GGML_TYPE_Q3_K:
7719
+ case GGML_TYPE_Q4_K:
7720
+ case GGML_TYPE_Q5_K:
7721
+ case GGML_TYPE_Q6_K:
7548
7722
  {
7549
7723
  ggml_compute_forward_add_q_f32(params, src0, src1, dst);
7550
7724
  } break;
@@ -7848,6 +8022,11 @@ static void ggml_compute_forward_add1(
7848
8022
  case GGML_TYPE_Q5_1:
7849
8023
  case GGML_TYPE_Q8_0:
7850
8024
  case GGML_TYPE_Q8_1:
8025
+ case GGML_TYPE_Q2_K:
8026
+ case GGML_TYPE_Q3_K:
8027
+ case GGML_TYPE_Q4_K:
8028
+ case GGML_TYPE_Q5_K:
8029
+ case GGML_TYPE_Q6_K:
7851
8030
  {
7852
8031
  ggml_compute_forward_add1_q_f32(params, src0, src1, dst);
7853
8032
  } break;
@@ -7970,6 +8149,11 @@ static void ggml_compute_forward_acc(
7970
8149
  case GGML_TYPE_Q5_1:
7971
8150
  case GGML_TYPE_Q8_0:
7972
8151
  case GGML_TYPE_Q8_1:
8152
+ case GGML_TYPE_Q2_K:
8153
+ case GGML_TYPE_Q3_K:
8154
+ case GGML_TYPE_Q4_K:
8155
+ case GGML_TYPE_Q5_K:
8156
+ case GGML_TYPE_Q6_K:
7973
8157
  default:
7974
8158
  {
7975
8159
  GGML_ASSERT(false);
@@ -8088,10 +8272,10 @@ static void ggml_compute_forward_mul_f32(
8088
8272
  const int ith = params->ith;
8089
8273
  const int nth = params->nth;
8090
8274
 
8091
- #ifdef GGML_USE_CUBLAS
8092
- if (src1->backend == GGML_BACKEND_CUDA) {
8275
+ #ifdef GGML_USE_CLBLAST
8276
+ if (src1->backend == GGML_BACKEND_GPU) {
8093
8277
  if (ith == 0) {
8094
- ggml_cuda_mul(src0, src1, dst);
8278
+ ggml_cl_mul(src0, src1, dst);
8095
8279
  }
8096
8280
  return;
8097
8281
  }
@@ -9206,7 +9390,7 @@ static void ggml_compute_forward_rms_norm_f32(
9206
9390
  sum += (ggml_float)(x[i00] * x[i00]);
9207
9391
  }
9208
9392
 
9209
- float mean = sum/ne00;
9393
+ const float mean = sum/ne00;
9210
9394
 
9211
9395
  float * y = (float *) ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3);
9212
9396
 
@@ -9529,14 +9713,7 @@ static void ggml_compute_forward_mul_mat_f32(
9529
9713
  // nb01 >= nb00 - src0 is not transposed
9530
9714
  // compute by src0 rows
9531
9715
 
9532
- #if defined(GGML_USE_CUBLAS)
9533
- if (ggml_cuda_can_mul_mat(src0, src1, dst)) {
9534
- if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) {
9535
- ggml_cuda_mul_mat(src0, src1, dst, params->wdata, params->wsize);
9536
- }
9537
- return;
9538
- }
9539
- #elif defined(GGML_USE_CLBLAST)
9716
+ #if defined(GGML_USE_CLBLAST)
9540
9717
  if (ggml_cl_can_mul_mat(src0, src1, dst)) {
9541
9718
  if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) {
9542
9719
  ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize);
@@ -9701,14 +9878,7 @@ static void ggml_compute_forward_mul_mat_f16_f32(
9701
9878
  // nb01 >= nb00 - src0 is not transposed
9702
9879
  // compute by src0 rows
9703
9880
 
9704
- #if defined(GGML_USE_CUBLAS)
9705
- if (ggml_cuda_can_mul_mat(src0, src1, dst)) {
9706
- if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) {
9707
- ggml_cuda_mul_mat(src0, src1, dst, params->wdata, params->wsize);
9708
- }
9709
- return;
9710
- }
9711
- #elif defined(GGML_USE_CLBLAST)
9881
+ #if defined(GGML_USE_CLBLAST)
9712
9882
  if (ggml_cl_can_mul_mat(src0, src1, dst)) {
9713
9883
  if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) {
9714
9884
  ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize);
@@ -9913,14 +10083,7 @@ static void ggml_compute_forward_mul_mat_q_f32(
9913
10083
  // nb01 >= nb00 - src0 is not transposed
9914
10084
  // compute by src0 rows
9915
10085
 
9916
- #if defined(GGML_USE_CUBLAS)
9917
- if (ggml_cuda_can_mul_mat(src0, src1, dst)) {
9918
- if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) {
9919
- ggml_cuda_mul_mat(src0, src1, dst, params->wdata, params->wsize);
9920
- }
9921
- return;
9922
- }
9923
- #elif defined(GGML_USE_CLBLAST)
10086
+ #if defined(GGML_USE_CLBLAST)
9924
10087
  if (ggml_cl_can_mul_mat(src0, src1, dst)) {
9925
10088
  if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) {
9926
10089
  ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize);
@@ -10063,6 +10226,11 @@ static void ggml_compute_forward_mul_mat(
10063
10226
  case GGML_TYPE_Q5_1:
10064
10227
  case GGML_TYPE_Q8_0:
10065
10228
  case GGML_TYPE_Q8_1:
10229
+ case GGML_TYPE_Q2_K:
10230
+ case GGML_TYPE_Q3_K:
10231
+ case GGML_TYPE_Q4_K:
10232
+ case GGML_TYPE_Q5_K:
10233
+ case GGML_TYPE_Q6_K:
10066
10234
  {
10067
10235
  ggml_compute_forward_mul_mat_q_f32(params, src0, src1, dst);
10068
10236
  } break;
@@ -10246,6 +10414,11 @@ static void ggml_compute_forward_set(
10246
10414
  case GGML_TYPE_Q5_1:
10247
10415
  case GGML_TYPE_Q8_0:
10248
10416
  case GGML_TYPE_Q8_1:
10417
+ case GGML_TYPE_Q2_K:
10418
+ case GGML_TYPE_Q3_K:
10419
+ case GGML_TYPE_Q4_K:
10420
+ case GGML_TYPE_Q5_K:
10421
+ case GGML_TYPE_Q6_K:
10249
10422
  default:
10250
10423
  {
10251
10424
  GGML_ASSERT(false);
@@ -10411,6 +10584,11 @@ static void ggml_compute_forward_get_rows(
10411
10584
  case GGML_TYPE_Q5_1:
10412
10585
  case GGML_TYPE_Q8_0:
10413
10586
  case GGML_TYPE_Q8_1:
10587
+ case GGML_TYPE_Q2_K:
10588
+ case GGML_TYPE_Q3_K:
10589
+ case GGML_TYPE_Q4_K:
10590
+ case GGML_TYPE_Q5_K:
10591
+ case GGML_TYPE_Q6_K:
10414
10592
  {
10415
10593
  ggml_compute_forward_get_rows_q(params, src0, src1, dst);
10416
10594
  } break;
@@ -10957,6 +11135,12 @@ static void ggml_compute_forward_alibi(
10957
11135
  case GGML_TYPE_Q5_1:
10958
11136
  case GGML_TYPE_Q8_0:
10959
11137
  case GGML_TYPE_Q8_1:
11138
+ case GGML_TYPE_Q2_K:
11139
+ case GGML_TYPE_Q3_K:
11140
+ case GGML_TYPE_Q4_K:
11141
+ case GGML_TYPE_Q5_K:
11142
+ case GGML_TYPE_Q6_K:
11143
+ case GGML_TYPE_Q8_K:
10960
11144
  case GGML_TYPE_I8:
10961
11145
  case GGML_TYPE_I16:
10962
11146
  case GGML_TYPE_I32:
@@ -11028,6 +11212,12 @@ static void ggml_compute_forward_clamp(
11028
11212
  case GGML_TYPE_Q5_1:
11029
11213
  case GGML_TYPE_Q8_0:
11030
11214
  case GGML_TYPE_Q8_1:
11215
+ case GGML_TYPE_Q2_K:
11216
+ case GGML_TYPE_Q3_K:
11217
+ case GGML_TYPE_Q4_K:
11218
+ case GGML_TYPE_Q5_K:
11219
+ case GGML_TYPE_Q6_K:
11220
+ case GGML_TYPE_Q8_K:
11031
11221
  case GGML_TYPE_I8:
11032
11222
  case GGML_TYPE_I16:
11033
11223
  case GGML_TYPE_I32:
@@ -11117,7 +11307,7 @@ static void ggml_compute_forward_rope_f32(
11117
11307
  theta *= theta_scale;
11118
11308
 
11119
11309
  const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
11120
- float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
11310
+ float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
11121
11311
 
11122
11312
  const float x0 = src[0];
11123
11313
  const float x1 = src[1];
@@ -11138,7 +11328,7 @@ static void ggml_compute_forward_rope_f32(
11138
11328
  const int64_t i0 = ib*n_dims + ic/2;
11139
11329
 
11140
11330
  const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
11141
- float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
11331
+ float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
11142
11332
 
11143
11333
  const float x0 = src[0];
11144
11334
  const float x1 = src[n_dims/2];
@@ -12846,6 +13036,15 @@ static void ggml_compute_forward_map_binary(
12846
13036
  static void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
12847
13037
  GGML_ASSERT(params);
12848
13038
 
13039
+ #ifdef GGML_USE_CUBLAS
13040
+ bool skip_cpu = ggml_cuda_compute_forward(params, tensor);
13041
+ if (skip_cpu) {
13042
+ return;
13043
+ }
13044
+ GGML_ASSERT(tensor->src0->backend == GGML_BACKEND_CPU);
13045
+ GGML_ASSERT(tensor->src1 == NULL || tensor->src1->backend == GGML_BACKEND_CPU);
13046
+ #endif // GGML_USE_CUBLAS
13047
+
12849
13048
  switch (tensor->op) {
12850
13049
  case GGML_OP_DUP:
12851
13050
  {
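
Note: with GGML_USE_CUBLAS, ggml_compute_forward now offers every op to ggml_cuda_compute_forward first; if the CUDA backend reports that it handled the op, the CPU path is skipped, otherwise the inputs are asserted to live on the CPU backend and execution falls through to the usual per-op switch. A sketch of that dispatch shape (compute_forward_cpu is a hypothetical stand-in for the big switch, and struct ggml_compute_params is assumed to be visible from the headers):

    #include "ggml.h"
    #ifdef GGML_USE_CUBLAS
    #include "ggml-cuda.h"
    #endif

    /* hypothetical stand-in for the per-op switch in ggml_compute_forward */
    static void compute_forward_cpu(struct ggml_compute_params * params, struct ggml_tensor * tensor);

    static void compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
    #ifdef GGML_USE_CUBLAS
        if (ggml_cuda_compute_forward(params, tensor)) {
            return;   /* handled entirely on the GPU */
        }
    #endif
        compute_forward_cpu(params, tensor);
    }
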
@@ -13792,11 +13991,19 @@ static void ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor *
13792
13991
  // reached a leaf node, not part of the gradient graph (e.g. a constant)
13793
13992
  GGML_ASSERT(cgraph->n_leafs < GGML_MAX_NODES);
13794
13993
 
13994
+ if (strlen(node->name) == 0) {
13995
+ snprintf(node->name, sizeof(node->name), "leaf_%d", cgraph->n_leafs);
13996
+ }
13997
+
13795
13998
  cgraph->leafs[cgraph->n_leafs] = node;
13796
13999
  cgraph->n_leafs++;
13797
14000
  } else {
13798
14001
  GGML_ASSERT(cgraph->n_nodes < GGML_MAX_NODES);
13799
14002
 
14003
+ if (strlen(node->name) == 0) {
14004
+ snprintf(node->name, sizeof(node->name), "node_%d", cgraph->n_nodes);
14005
+ }
14006
+
13800
14007
  cgraph->nodes[cgraph->n_nodes] = node;
13801
14008
  cgraph->grads[cgraph->n_nodes] = node->grad;
13802
14009
  cgraph->n_nodes++;
@@ -14144,7 +14351,6 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
14144
14351
  if (ggml_cuda_can_mul_mat(node->src0, node->src1, node)) {
14145
14352
  node->n_tasks = 1; // TODO: this actually is doing nothing
14146
14353
  // the threads are still spinning
14147
- cur = ggml_cuda_mul_mat_get_wsize(node->src0, node->src1, node);
14148
14354
  }
14149
14355
  else
14150
14356
  #elif defined(GGML_USE_CLBLAST)
@@ -14510,6 +14716,521 @@ void ggml_graph_reset(struct ggml_cgraph * cgraph) {
14510
14716
  }
14511
14717
  }
14512
14718
 
14719
+ struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name) {
14720
+ for (int i = 0; i < cgraph->n_leafs; i++) {
14721
+ struct ggml_tensor * leaf = cgraph->leafs[i];
14722
+
14723
+ if (strcmp(leaf->name, name) == 0) {
14724
+ return leaf;
14725
+ }
14726
+ }
14727
+
14728
+ for (int i = 0; i < cgraph->n_nodes; i++) {
14729
+ struct ggml_tensor * node = cgraph->nodes[i];
14730
+
14731
+ if (strcmp(node->name, name) == 0) {
14732
+ return node;
14733
+ }
14734
+ }
14735
+
14736
+ return NULL;
14737
+ }
14738
+
14739
+ static void ggml_graph_export_leaf(const struct ggml_tensor * tensor, FILE * fout) {
14740
+ const int64_t * ne = tensor->ne;
14741
+ const size_t * nb = tensor->nb;
14742
+
14743
+ fprintf(fout, "%-6s %-12s %8d %" PRId64 " %" PRId64 " %" PRId64 " %" PRId64 " %16zu %16zu %16zu %16zu %16p %32s\n",
14744
+ ggml_type_name(tensor->type),
14745
+ ggml_op_name (tensor->op),
14746
+ tensor->n_dims,
14747
+ ne[0], ne[1], ne[2], ne[3],
14748
+ nb[0], nb[1], nb[2], nb[3],
14749
+ tensor->data,
14750
+ tensor->name);
14751
+ }
14752
+
14753
+ static void ggml_graph_export_node(const struct ggml_tensor * tensor, const char * arg, FILE * fout) {
14754
+ const int64_t * ne = tensor->ne;
14755
+ const size_t * nb = tensor->nb;
14756
+
14757
+ fprintf(fout, "%-6s %-6s %-12s %8d %" PRId64 " %" PRId64 " %" PRId64 " %" PRId64 " %16zu %16zu %16zu %16zu %8d %16p %32s\n",
14758
+ arg,
14759
+ ggml_type_name(tensor->type),
14760
+ ggml_op_name (tensor->op),
14761
+ tensor->n_dims,
14762
+ ne[0], ne[1], ne[2], ne[3],
14763
+ nb[0], nb[1], nb[2], nb[3],
14764
+ tensor->n_tasks,
14765
+ tensor->data,
14766
+ tensor->name);
14767
+ }
14768
+
14769
+ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) {
14770
+ //assert(cgraph->work == NULL);
14771
+ //assert(cgraph->work_size == 0);
14772
+
14773
+ uint64_t size_eval = 0;
14774
+
14775
+ // compute size of intermediate results
14776
+ // TODO: does not take into account scratch buffers !!!!
14777
+ for (int i = 0; i < cgraph->n_nodes; ++i) {
14778
+ size_eval += ggml_nbytes(cgraph->nodes[i]);
14779
+ }
14780
+
14781
+ // print
14782
+ {
14783
+ FILE * fout = stdout;
14784
+
14785
+ fprintf(fout, "\n");
14786
+ fprintf(fout, "%-16s %8x\n", "magic", GGML_FILE_MAGIC);
14787
+ fprintf(fout, "%-16s %8d\n", "version", GGML_FILE_VERSION);
14788
+ fprintf(fout, "%-16s %8d\n", "leafs", cgraph->n_leafs);
14789
+ fprintf(fout, "%-16s %8d\n", "nodes", cgraph->n_nodes);
14790
+ fprintf(fout, "%-16s %" PRIu64 "\n", "eval", size_eval);
14791
+
14792
+ // header
14793
+ fprintf(fout, "\n");
14794
+ fprintf(fout, "%-6s %-12s %8s %8s %8s %8s %8s %16s %16s %16s %16s %16s %16s\n",
14795
+ "TYPE", "OP", "NDIMS", "NE0", "NE1", "NE2", "NE3", "NB0", "NB1", "NB2", "NB3", "DATA", "NAME");
14796
+
14797
+ for (int i = 0; i < cgraph->n_leafs; ++i) {
14798
+ ggml_graph_export_leaf(cgraph->leafs[i], fout);
14799
+
14800
+ GGML_ASSERT(cgraph->leafs[i]->op == GGML_OP_NONE);
14801
+ GGML_ASSERT(cgraph->leafs[i]->src0 == NULL);
14802
+ GGML_ASSERT(cgraph->leafs[i]->src1 == NULL);
14803
+ }
14804
+
14805
+ // header
14806
+ fprintf(fout, "\n");
14807
+ fprintf(fout, "%-6s %-6s %-12s %8s %8s %8s %8s %8s %16s %16s %16s %16s %8s %16s %16s\n",
14808
+ "ARG", "TYPE", "OP", "NDIMS", "NE0", "NE1", "NE2", "NE3", "NB0", "NB1", "NB2", "NB3", "NTASKS", "DATA", "NAME");
14809
+
14810
+ for (int i = 0; i < cgraph->n_nodes; ++i) {
14811
+ ggml_graph_export_node(cgraph->nodes[i], "DST", fout);
14812
+
14813
+ if (cgraph->nodes[i]->src0) {
14814
+ ggml_graph_export_node(cgraph->nodes[i]->src0, "SRC0", fout);
14815
+ }
14816
+
14817
+ if (cgraph->nodes[i]->src1) {
14818
+ ggml_graph_export_node(cgraph->nodes[i]->src1, "SRC1", fout);
14819
+ }
14820
+
14821
+ for (int j = 0; j < GGML_MAX_OPT; ++j) {
14822
+ if (cgraph->nodes[i]->opt[j]) {
14823
+ ggml_graph_export_node(cgraph->nodes[i]->opt[j], "OPT", fout);
14824
+ }
14825
+ }
14826
+
14827
+ fprintf(fout, "\n");
14828
+ }
14829
+
14830
+ fprintf(fout, "\n");
14831
+ }
14832
+
14833
+ // write binary data
14834
+ {
14835
+ FILE * fout = fopen(fname, "wb");
14836
+
14837
+ if (!fout) {
14838
+ fprintf(stderr, "%s: failed to open %s\n", __func__, fname);
14839
+ return;
14840
+ }
14841
+
14842
+ // header
14843
+ {
14844
+ const uint32_t magic = GGML_FILE_MAGIC;
14845
+ const uint32_t version = GGML_FILE_VERSION;
14846
+ const uint32_t n_leafs = cgraph->n_leafs;
14847
+ const uint32_t nodes = cgraph->n_nodes;
14848
+
14849
+ fwrite(&magic, sizeof(uint32_t), 1, fout);
14850
+ fwrite(&version, sizeof(uint32_t), 1, fout);
14851
+ fwrite(&n_leafs, sizeof(uint32_t), 1, fout);
14852
+ fwrite(&nodes, sizeof(uint32_t), 1, fout);
14853
+ fwrite(&size_eval, sizeof(uint64_t), 1, fout);
14854
+ }
14855
+
14856
+ // leafs
14857
+ {
14858
+ for (int i = 0; i < cgraph->n_leafs; ++i) {
14859
+ const struct ggml_tensor * tensor = cgraph->leafs[i];
14860
+
14861
+ const uint32_t type = tensor->type;
14862
+ const uint32_t op = tensor->op;
14863
+ const uint32_t n_dims = tensor->n_dims;
14864
+
14865
+ fwrite(&type, sizeof(uint32_t), 1, fout);
14866
+ fwrite(&op, sizeof(uint32_t), 1, fout);
14867
+ fwrite(&n_dims, sizeof(uint32_t), 1, fout);
14868
+
14869
+ for (int j = 0; j < GGML_MAX_DIMS; ++j) {
14870
+ const uint64_t ne = tensor->ne[j];
14871
+ const uint64_t nb = tensor->nb[j];
14872
+
14873
+ fwrite(&ne, sizeof(uint64_t), 1, fout);
14874
+ fwrite(&nb, sizeof(uint64_t), 1, fout);
14875
+ }
14876
+
14877
+ // store the pointer address
14878
+ {
14879
+ const uint64_t ptr = (uint64_t) tensor->data;
14880
+
14881
+ fwrite(&ptr, sizeof(uint64_t), 1, fout);
14882
+ }
14883
+
14884
+ fwrite(tensor->name, sizeof(char), GGML_MAX_NAME, fout);
14885
+
14886
+ // dump the data
14887
+ // TODO: pad this to 32 byte boundary
14888
+ {
14889
+ const size_t size = ggml_nbytes(tensor);
14890
+
14891
+ fwrite(tensor->data, sizeof(char), size, fout);
14892
+ }
14893
+ }
14894
+ }
14895
+
14896
+ // nodes
14897
+ {
14898
+ for (int i = 0; i < cgraph->n_nodes; ++i) {
14899
+ const struct ggml_tensor * tensor = cgraph->nodes[i];
14900
+
14901
+ const uint32_t type = tensor->type;
14902
+ const uint32_t op = tensor->op;
14903
+ const uint32_t n_dims = tensor->n_dims;
14904
+
14905
+ fwrite(&type, sizeof(uint32_t), 1, fout);
14906
+ fwrite(&op, sizeof(uint32_t), 1, fout);
14907
+ fwrite(&n_dims, sizeof(uint32_t), 1, fout);
14908
+
14909
+ for (int j = 0; j < GGML_MAX_DIMS; ++j) {
14910
+ const uint64_t ne = tensor->ne[j];
14911
+ const uint64_t nb = tensor->nb[j];
14912
+
14913
+ fwrite(&ne, sizeof(uint64_t), 1, fout);
14914
+ fwrite(&nb, sizeof(uint64_t), 1, fout);
14915
+ }
14916
+
14917
+ // store the pointer address
14918
+ {
14919
+ const uint64_t ptr = (uint64_t) tensor->data;
14920
+
14921
+ fwrite(&ptr, sizeof(uint64_t), 1, fout);
14922
+ }
14923
+
14924
+ fwrite(tensor->name, sizeof(char), GGML_MAX_NAME, fout);
14925
+
14926
+ // output the op arguments
14927
+ {
14928
+ struct ggml_tensor * args[2 + GGML_MAX_OPT] = { NULL };
14929
+
14930
+ args[0] = tensor->src0;
14931
+ args[1] = tensor->src1;
14932
+
14933
+ for (int j = 0; j < GGML_MAX_OPT; ++j) {
14934
+ args[2 + j] = tensor->opt[j];
14935
+ }
14936
+
14937
+ for (int j = 0; j < 2 + GGML_MAX_OPT; ++j) {
14938
+ if (args[j]) {
14939
+ int32_t idx = -1;
14940
+
14941
+ // check if leaf
14942
+ {
14943
+ for (int k = 0; k < cgraph->n_leafs; ++k) {
14944
+ if (args[j] == cgraph->leafs[k]) {
14945
+ idx = k;
14946
+ break;
14947
+ }
14948
+ }
14949
+ }
14950
+
14951
+ // check if node
14952
+ if (idx == -1) {
14953
+ for (int k = 0; k < cgraph->n_nodes; ++k) {
14954
+ if (args[j] == cgraph->nodes[k]) {
14955
+ idx = GGML_MAX_NODES + k;
14956
+ break;
14957
+ }
14958
+ }
14959
+ }
14960
+
14961
+ if (idx == -1) {
14962
+ fprintf(stderr, "%s: failed to find tensor, arg = %d, node = %d\n", __func__, j, i);
14963
+ return;
14964
+ }
14965
+
14966
+ fwrite(&idx, sizeof(int32_t), 1, fout);
14967
+ } else {
14968
+ const int32_t nul = -1;
14969
+
14970
+ fwrite(&nul, sizeof(int32_t), 1, fout);
14971
+ }
14972
+ }
14973
+ }
14974
+ }
14975
+ }
14976
+
14977
+ fclose(fout);
14978
+ }
14979
+ }
14980
+
14981
+ struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context ** ctx_data, struct ggml_context ** ctx_eval) {
14982
+ assert(*ctx_data == NULL);
14983
+ assert(*ctx_eval == NULL);
14984
+
14985
+ struct ggml_cgraph result = { 0 };
14986
+
14987
+ struct ggml_tensor * data = NULL;
14988
+
14989
+ // read file into data
14990
+ {
14991
+ FILE * fin = fopen(fname, "rb");
14992
+ if (!fin) {
14993
+ fprintf(stderr, "%s: failed to open %s\n", __func__, fname);
14994
+ return result;
14995
+ }
14996
+
14997
+ size_t fsize = 0;
14998
+
14999
+ fseek(fin, 0, SEEK_END);
15000
+ fsize = ftell(fin);
15001
+ fseek(fin, 0, SEEK_SET);
15002
+
15003
+ // create the data context
15004
+ {
15005
+ const size_t overhead = 1*ggml_tensor_overhead();
15006
+
15007
+ struct ggml_init_params params = {
15008
+ .mem_size = fsize + overhead,
15009
+ .mem_buffer = NULL,
15010
+ .no_alloc = false,
15011
+ };
15012
+
15013
+ *ctx_data = ggml_init(params);
15014
+
15015
+ if (!*ctx_data) {
15016
+ fprintf(stderr, "%s: failed to create ggml context\n", __func__);
15017
+ return result;
15018
+ }
15019
+ }
15020
+
15021
+ data = ggml_new_tensor_1d(*ctx_data, GGML_TYPE_I8, fsize);
15022
+
15023
+ const size_t ret = fread(data->data, sizeof(char), fsize, fin);
15024
+ if (ret != fsize) {
15025
+ fprintf(stderr, "%s: failed to read %s\n", __func__, fname);
15026
+ return result;
15027
+ }
15028
+
15029
+ fclose(fin);
15030
+ }
15031
+
15032
+ // populate result
15033
+ {
15034
+ char * ptr = (char *) data->data;
15035
+
15036
+ const uint32_t magic = *(const uint32_t *) ptr; ptr += sizeof(magic);
15037
+
15038
+ if (magic != GGML_FILE_MAGIC) {
15039
+ fprintf(stderr, "%s: invalid magic number, got %08x\n", __func__, magic);
15040
+ return result;
15041
+ }
15042
+
15043
+ const uint32_t version = *(const uint32_t *) ptr; ptr += sizeof(version);
15044
+
15045
+ if (version != GGML_FILE_VERSION) {
15046
+ fprintf(stderr, "%s: invalid version number\n", __func__);
15047
+ return result;
15048
+ }
15049
+
15050
+ const uint32_t n_leafs = *(const uint32_t *) ptr; ptr += sizeof(n_leafs);
15051
+ const uint32_t n_nodes = *(const uint32_t *) ptr; ptr += sizeof(n_nodes);
15052
+ const uint64_t size_eval = *(const uint64_t *) ptr; ptr += sizeof(size_eval);
15053
+
15054
+ result.n_leafs = n_leafs;
15055
+ result.n_nodes = n_nodes;
15056
+
15057
+ // create the data context
15058
+ {
15059
+ const size_t overhead = (n_leafs + n_nodes)*ggml_tensor_overhead();
15060
+
15061
+ struct ggml_init_params params = {
15062
+ .mem_size = size_eval + overhead,
15063
+ .mem_buffer = NULL,
15064
+ .no_alloc = true,
15065
+ };
15066
+
15067
+ *ctx_eval = ggml_init(params);
15068
+
15069
+ if (!*ctx_eval) {
15070
+ fprintf(stderr, "%s: failed to create ggml context\n", __func__);
15071
+ return result;
15072
+ }
15073
+ }
15074
+
15075
+ // leafs
15076
+ {
15077
+ uint32_t type;
15078
+ uint32_t op;
15079
+ uint32_t n_dims;
15080
+
15081
+ for (uint32_t i = 0; i < n_leafs; ++i) {
15082
+ type = *(const uint32_t *) ptr; ptr += sizeof(type);
15083
+ op = *(const uint32_t *) ptr; ptr += sizeof(op);
15084
+ n_dims = *(const uint32_t *) ptr; ptr += sizeof(n_dims);
15085
+
15086
+ int64_t ne[GGML_MAX_DIMS];
15087
+ size_t nb[GGML_MAX_DIMS];
15088
+
15089
+ for (int j = 0; j < GGML_MAX_DIMS; ++j) {
15090
+ uint64_t ne_cur;
15091
+ uint64_t nb_cur;
15092
+
15093
+ ne_cur = *(const uint64_t *) ptr; ptr += sizeof(ne_cur);
15094
+ nb_cur = *(const uint64_t *) ptr; ptr += sizeof(nb_cur);
15095
+
15096
+ ne[j] = ne_cur;
15097
+ nb[j] = nb_cur;
15098
+ }
15099
+
15100
+ struct ggml_tensor * tensor = ggml_new_tensor(*ctx_eval, (enum ggml_type) type, n_dims, ne);
15101
+
15102
+ tensor->op = (enum ggml_op) op;
15103
+
15104
+ uint64_t ptr_cur = *(const uint64_t *) ptr; ptr += sizeof(ptr_cur);
15105
+
15106
+ memcpy(tensor->name, ptr, GGML_MAX_NAME); ptr += GGML_MAX_NAME;
15107
+
15108
+ tensor->data = (void *) ptr;
15109
+
15110
+ for (int j = 0; j < GGML_MAX_DIMS; ++j) {
15111
+ tensor->nb[j] = nb[j];
15112
+ }
15113
+
15114
+ result.leafs[i] = tensor;
15115
+
15116
+ ptr += ggml_nbytes(tensor);
15117
+
15118
+ fprintf(stderr, "%s: loaded leaf %d: '%16s', %3d dims, %9zu bytes\n", __func__, i, tensor->name, n_dims, ggml_nbytes(tensor));
15119
+ }
15120
+ }
15121
+
15122
+ ggml_set_no_alloc(*ctx_eval, false);
15123
+
15124
+ // nodes
15125
+ {
15126
+ uint32_t type;
15127
+ uint32_t op;
15128
+ uint32_t n_dims;
15129
+
15130
+ for (uint32_t i = 0; i < n_nodes; ++i) {
15131
+ type = *(const uint32_t *) ptr; ptr += sizeof(type);
15132
+ op = *(const uint32_t *) ptr; ptr += sizeof(op);
15133
+ n_dims = *(const uint32_t *) ptr; ptr += sizeof(n_dims);
15134
+
15135
+ enum ggml_op eop = (enum ggml_op) op;
15136
+
15137
+ int64_t ne[GGML_MAX_DIMS];
15138
+ size_t nb[GGML_MAX_DIMS];
15139
+
15140
+ for (int j = 0; j < GGML_MAX_DIMS; ++j) {
15141
+ uint64_t ne_cur;
15142
+ uint64_t nb_cur;
15143
+
15144
+ ne_cur = *(const uint64_t *) ptr; ptr += sizeof(ne_cur);
15145
+ nb_cur = *(const uint64_t *) ptr; ptr += sizeof(nb_cur);
15146
+
15147
+ ne[j] = ne_cur;
15148
+ nb[j] = nb_cur;
15149
+ }
15150
+
15151
+ uint64_t ptr_cur = *(const uint64_t *) ptr; ptr += sizeof(ptr_cur); // TODO: not yet used
15152
+
15153
+ const char * ptr_name = ptr; ptr += GGML_MAX_NAME;
15154
+
15155
+ const int32_t * ptr_arg_idx = (const int32_t *) ptr; ptr += (2 + GGML_MAX_OPT)*sizeof(int32_t);
15156
+
15157
+ struct ggml_tensor * args[2 + GGML_MAX_OPT] = { NULL };
15158
+
15159
+ // parse args
15160
+ for (int j = 0; j < 2 + GGML_MAX_OPT; ++j) {
15161
+ const int32_t arg_idx = ptr_arg_idx[j];
15162
+
15163
+ if (arg_idx == -1) {
15164
+ continue;
15165
+ }
15166
+
15167
+ if (arg_idx < GGML_MAX_NODES) {
15168
+ args[j] = result.leafs[arg_idx];
15169
+ } else {
15170
+ args[j] = result.nodes[arg_idx - GGML_MAX_NODES];
15171
+ }
15172
+ }
15173
+
15174
+ // create the tensor
15175
+ // "view" operations are handled differently
15176
+ // TODO: handle inplace ops - currently a copy is always made
15177
+
15178
+ struct ggml_tensor * tensor = NULL;
15179
+
15180
+ switch (eop) {
15181
+ // TODO: implement other view ops
15182
+ case GGML_OP_RESHAPE:
15183
+ {
15184
+ tensor = ggml_reshape_4d(*ctx_eval, args[0], ne[0], ne[1], ne[2], ne[3]);
15185
+ } break;
15186
+ case GGML_OP_VIEW:
15187
+ {
15188
+ tensor = ggml_view_4d(*ctx_eval, args[0], ne[0], ne[1], ne[2], ne[3], 0, 0, 0, 0);
15189
+
15190
+ uint64_t offs;
15191
+ memcpy(&offs, args[2]->data, sizeof(offs));
15192
+
15193
+ tensor->data = ((char *) tensor->data) + offs;
15194
+ } break;
15195
+ case GGML_OP_TRANSPOSE:
15196
+ {
15197
+ tensor = ggml_transpose(*ctx_eval, args[0]);
15198
+ } break;
15199
+ case GGML_OP_PERMUTE:
15200
+ {
15201
+ tensor = ggml_view_4d(*ctx_eval, args[0], ne[0], ne[1], ne[2], ne[3], 0, 0, 0, 0);
15202
+ } break;
15203
+ default:
15204
+ {
15205
+ tensor = ggml_new_tensor(*ctx_eval, (enum ggml_type) type, n_dims, ne);
15206
+
15207
+ tensor->op = eop;
15208
+ } break;
15209
+ }
15210
+
15211
+ memcpy(tensor->name, ptr_name, GGML_MAX_NAME);
15212
+
15213
+ for (int j = 0; j < GGML_MAX_DIMS; ++j) {
15214
+ tensor->nb[j] = nb[j];
15215
+ }
15216
+
15217
+ tensor->src0 = args[0];
15218
+ tensor->src1 = args[1];
15219
+
15220
+ for (int j = 0; j < GGML_MAX_OPT; ++j) {
15221
+ tensor->opt[j] = args[2 + j];
15222
+ }
15223
+
15224
+ result.nodes[i] = tensor;
15225
+
15226
+ fprintf(stderr, "%s: loaded node %d: '%16s', %3d dims, %9zu bytes\n", __func__, i, tensor->name, n_dims, ggml_nbytes(tensor));
15227
+ }
15228
+ }
15229
+ }
15230
+
15231
+ return result;
15232
+ }
15233
+
14513
15234
  void ggml_graph_print(const struct ggml_cgraph * cgraph) {
14514
15235
  int64_t perf_total_per_op_us[GGML_OP_COUNT] = {0};
14515
15236
 
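
Note: ggml_graph_export prints a human-readable summary to stdout and then writes a binary file (magic, version, leaf/node counts, every leaf tensor with its data, and every node with the indices of its arguments); ggml_graph_import reads that file back into a data context plus an eval context. A hedged round-trip sketch (the file name is only an example, and cgraph is assumed to come from ggml_build_forward):

    #include "ggml.h"
    #include <stdio.h>

    static void roundtrip(struct ggml_cgraph * cgraph) {
        ggml_graph_export(cgraph, "model.ggml");

        struct ggml_context * ctx_data = NULL;
        struct ggml_context * ctx_eval = NULL;
        struct ggml_cgraph imported = ggml_graph_import("model.ggml", &ctx_data, &ctx_eval);

        printf("imported %d nodes, %d leafs\n", imported.n_nodes, imported.n_leafs);

        ggml_free(ctx_eval);
        ggml_free(ctx_data);
    }
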
@@ -14527,7 +15248,7 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) {
14527
15248
  GGML_PRINT(" - %3d: [ %5" PRId64 ", %5" PRId64 ", %5" PRId64 "] %16s %s (%3d) cpu = %7.3f / %7.3f ms, wall = %7.3f / %7.3f ms\n",
14528
15249
  i,
14529
15250
  node->ne[0], node->ne[1], node->ne[2],
14530
- GGML_OP_LABEL[node->op], node->is_param ? "x" : node->grad ? "g" : " ", node->perf_runs,
15251
+ GGML_OP_NAME[node->op], node->is_param ? "x" : node->grad ? "g" : " ", node->perf_runs,
14531
15252
  (double) node->perf_cycles / (double) ggml_cycles_per_ms(),
14532
15253
  (double) node->perf_cycles / (double) ggml_cycles_per_ms() / (double) node->perf_runs,
14533
15254
  (double) node->perf_time_us / 1000.0,
@@ -14541,7 +15262,7 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) {
14541
15262
  GGML_PRINT(" - %3d: [ %5" PRId64 ", %5" PRId64 "] %8s\n",
14542
15263
  i,
14543
15264
  node->ne[0], node->ne[1],
14544
- GGML_OP_LABEL[node->op]);
15265
+ GGML_OP_NAME[node->op]);
14545
15266
  }
14546
15267
 
14547
15268
  for (int i = 0; i < GGML_OP_COUNT; i++) {
@@ -14549,7 +15270,7 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) {
14549
15270
  continue;
14550
15271
  }
14551
15272
 
14552
- GGML_PRINT("perf_total_per_op_us[%16s] = %7.3f ms\n", GGML_OP_LABEL[i], (double) perf_total_per_op_us[i] / 1000.0);
15273
+ GGML_PRINT("perf_total_per_op_us[%16s] = %7.3f ms\n", GGML_OP_NAME[i], (double) perf_total_per_op_us[i] / 1000.0);
14553
15274
  }
14554
15275
 
14555
15276
  GGML_PRINT("========================================\n");
@@ -15548,6 +16269,38 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i
15548
16269
  block_q8_0 * block = (block_q8_0*)dst + start / QK8_0;
15549
16270
  result = ggml_quantize_q8_0(src + start, block, n, n, hist);
15550
16271
  } break;
16272
+ #ifdef GGML_USE_K_QUANTS
16273
+ case GGML_TYPE_Q2_K:
16274
+ {
16275
+ GGML_ASSERT(start % QK_K == 0);
16276
+ block_q2_K * block = (block_q2_K*)dst + start / QK_K;
16277
+ result = ggml_quantize_q2_K(src + start, block, n, n, hist);
16278
+ } break;
16279
+ case GGML_TYPE_Q3_K:
16280
+ {
16281
+ GGML_ASSERT(start % QK_K == 0);
16282
+ block_q3_K * block = (block_q3_K*)dst + start / QK_K;
16283
+ result = ggml_quantize_q3_K(src + start, block, n, n, hist);
16284
+ } break;
16285
+ case GGML_TYPE_Q4_K:
16286
+ {
16287
+ GGML_ASSERT(start % QK_K == 0);
16288
+ block_q4_K * block = (block_q4_K*)dst + start / QK_K;
16289
+ result = ggml_quantize_q4_K(src + start, block, n, n, hist);
16290
+ } break;
16291
+ case GGML_TYPE_Q5_K:
16292
+ {
16293
+ GGML_ASSERT(start % QK_K == 0);
16294
+ block_q5_K * block = (block_q5_K*)dst + start / QK_K;
16295
+ result = ggml_quantize_q5_K(src + start, block, n, n, hist);
16296
+ } break;
16297
+ case GGML_TYPE_Q6_K:
16298
+ {
16299
+ GGML_ASSERT(start % QK_K == 0);
16300
+ block_q6_K * block = (block_q6_K*)dst + start / QK_K;
16301
+ result = ggml_quantize_q6_K(src + start, block, n, n, hist);
16302
+ } break;
16303
+ #endif
15551
16304
  default:
15552
16305
  assert(false);
15553
16306
  }
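
Note: ggml_quantize_chunk now routes the k-quant types to their ggml_quantize_qX_K helpers, each of which requires the chunk start to be a multiple of QK_K (256 by default). A hedged usage sketch for quantizing a float buffer to Q4_K (sizes are illustrative; n is assumed to be a multiple of QK_K, and the Q4_K output needs roughly 144 bytes per 256 values):

    #include "ggml.h"
    #include <stdint.h>

    static size_t quantize_to_q4_k(const float * src, void * dst, int n) {
        int64_t hist[16] = { 0 };   /* per-bucket counts filled by the quantizer */
        return ggml_quantize_chunk(GGML_TYPE_Q4_K, src, dst, /*start=*/0, n, hist);
    }
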