llama_cpp 0.1.4 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -3,6 +3,10 @@
 
  #include "ggml.h"
 
+ #ifdef GGML_USE_K_QUANTS
+ #include "k_quants.h"
+ #endif
+
  #if defined(_MSC_VER) || defined(__MINGW32__)
  #include <malloc.h> // using malloc.h with MSC/MINGW
  #elif !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__)
@@ -21,6 +25,10 @@
  #include <float.h>
  #include <limits.h>
 
+ #ifdef GGML_USE_METAL
+ #include <unistd.h>
+ #endif
+
  // if C99 - static_assert is noop
  // ref: https://stackoverflow.com/a/53923785/4039976
  #ifndef static_assert
@@ -121,7 +129,11 @@ typedef void* thread_ret_t;
  #else
  inline static void* ggml_aligned_malloc(size_t size) {
  void* aligned_memory = NULL;
+ #ifdef GGML_USE_METAL
+ int result = posix_memalign(&aligned_memory, getpagesize(), size);
+ #else
  int result = posix_memalign(&aligned_memory, GGML_MEM_ALIGN, size);
+ #endif
  if (result != 0) {
  // Handle allocation failure
  return NULL;
@@ -403,21 +415,27 @@ void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, size_t n) {
  //
 
  #if defined(_MSC_VER) || defined(__MINGW32__)
- static int64_t timer_freq;
+ static int64_t timer_freq, timer_start;
  void ggml_time_init(void) {
- LARGE_INTEGER frequency;
- QueryPerformanceFrequency(&frequency);
- timer_freq = frequency.QuadPart;
+ LARGE_INTEGER t;
+ QueryPerformanceFrequency(&t);
+ timer_freq = t.QuadPart;
+
+ // The multiplication by 1000 or 1000000 below can cause an overflow if timer_freq
+ // and the uptime is high enough.
+ // We subtract the program start time to reduce the likelihood of that happening.
+ QueryPerformanceCounter(&t);
+ timer_start = t.QuadPart;
  }
  int64_t ggml_time_ms(void) {
  LARGE_INTEGER t;
  QueryPerformanceCounter(&t);
- return (t.QuadPart * 1000) / timer_freq;
+ return ((t.QuadPart-timer_start) * 1000) / timer_freq;
  }
  int64_t ggml_time_us(void) {
  LARGE_INTEGER t;
  QueryPerformanceCounter(&t);
- return (t.QuadPart * 1000000) / timer_freq;
+ return ((t.QuadPart-timer_start) * 1000000) / timer_freq;
  }
  #else
  void ggml_time_init(void) {}
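The comment added above is about 64-bit overflow in the ms/us conversions. A rough back-of-the-envelope check (not from the diff; it assumes the 10 MHz QueryPerformanceCounter frequency common on recent Windows) shows how soon t.QuadPart * 1000000 would overflow if the counter were left relative to boot:

```c
#include <stdint.h>
#include <stdio.h>

int main(void) {
    // Assumed counter frequency: 10 MHz is typical on modern Windows.
    const int64_t timer_freq = 10000000;

    // t.QuadPart * 1000000 overflows int64_t once the raw counter exceeds this:
    const int64_t max_ticks = INT64_MAX / 1000000;

    // ... which corresponds to roughly 10.7 days of machine uptime.
    printf("overflow after %lld ticks (~%.1f days)\n",
           (long long) max_ticks, (double) max_ticks / timer_freq / 86400.0);
    return 0;
}
```

Measuring from timer_start instead of from boot keeps t.QuadPart - timer_start small, so the scaled value stays well inside int64_t for any realistic run.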
@@ -474,6 +492,8 @@ static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float);
  // quantization
  //
 
+ #define MM256_SET_M128I(a, b) _mm256_insertf128_si256(_mm256_castsi128_si256(b), (a), 1)
+
  #if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__)
  // multiply int8_t, add results pairwise twice
  static inline __m128i mul_sum_i8_pairs(const __m128i x, const __m128i y) {
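MM256_SET_M128I(a, b) packs two 128-bit halves into one 256-bit register, with b in the low lane and a in the high lane; it reads like a drop-in replacement for _mm256_set_m128i, which some older compilers do not provide (an assumption, the diff does not state the motivation). A minimal sketch of what it produces:

```c
#include <immintrin.h>
#include <stdio.h>

#define MM256_SET_M128I(a, b) _mm256_insertf128_si256(_mm256_castsi128_si256(b), (a), 1)

int main(void) { // build with AVX enabled, e.g. gcc -mavx
    const __m128i lo = _mm_set1_epi32(1);
    const __m128i hi = _mm_set1_epi32(2);

    // low 128 bits come from the second argument, high 128 bits from the first
    const __m256i v = MM256_SET_M128I(hi, lo);

    int out[8];
    _mm256_storeu_si256((__m256i *) out, v);
    for (int i = 0; i < 8; ++i) printf("%d ", out[i]); // prints: 1 1 1 1 2 2 2 2
    printf("\n");
    return 0;
}
```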
@@ -533,7 +553,7 @@ static inline __m256i bytes_from_bits_32(const uint8_t * x) {
  static inline __m256i bytes_from_nibbles_32(const uint8_t * rsi)
  {
  const __m128i tmp = _mm_loadu_si128((const __m128i *)rsi);
- const __m256i bytes = _mm256_set_m128i(_mm_srli_epi16(tmp, 4), tmp);
+ const __m256i bytes = MM256_SET_M128I(_mm_srli_epi16(tmp, 4), tmp);
  const __m256i lowMask = _mm256_set1_epi8( 0xF );
  return _mm256_and_si256(lowMask, bytes);
  }
@@ -606,7 +626,7 @@ static inline __m256i bytes_from_bits_32(const uint8_t * x) {
  bytesh = _mm_or_si128(bytesh, bit_mask);
  bytesl = _mm_cmpeq_epi8(bytesl, _mm_set1_epi64x(-1));
  bytesh = _mm_cmpeq_epi8(bytesh, _mm_set1_epi64x(-1));
- return _mm256_set_m128i(bytesh, bytesl);
+ return MM256_SET_M128I(bytesh, bytesl);
  }
 
  // Unpack 32 4-bit fields into 32 bytes
@@ -619,7 +639,7 @@ static inline __m256i bytes_from_nibbles_32(const uint8_t * rsi)
  const __m128i lowMask = _mm_set1_epi8(0xF);
  tmpl = _mm_and_si128(lowMask, tmpl);
  tmph = _mm_and_si128(lowMask, tmph);
- return _mm256_set_m128i(tmph, tmpl);
+ return MM256_SET_M128I(tmph, tmpl);
  }
 
  // add int16_t pairwise and return as float vector
@@ -627,7 +647,7 @@ static inline __m256 sum_i16_pairs_float(const __m128i xh, const __m128i xl) {
  const __m128i ones = _mm_set1_epi16(1);
  const __m128i summed_pairsl = _mm_madd_epi16(ones, xl);
  const __m128i summed_pairsh = _mm_madd_epi16(ones, xh);
- const __m256i summed_pairs = _mm256_set_m128i(summed_pairsh, summed_pairsl);
+ const __m256i summed_pairs = MM256_SET_M128I(summed_pairsh, summed_pairsl);
  return _mm256_cvtepi32_ps(summed_pairs);
  }
 
@@ -1565,6 +1585,48 @@ static const quantize_fns_t quantize_fns[GGML_TYPE_COUNT] = {
  .vec_dot_q = NULL, // TODO
  .vec_dot_type = GGML_TYPE_Q8_1,
  },
+ #ifdef GGML_USE_K_QUANTS
+ [GGML_TYPE_Q2_K] = {
+ .dequantize_row_q = (dequantize_row_q_t) dequantize_row_q2_K,
+ .quantize_row_q = quantize_row_q2_K,
+ .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q2_K_reference,
+ .quantize_row_q_dot = quantize_row_q8_K,
+ .vec_dot_q = ggml_vec_dot_q2_K_q8_K,
+ .vec_dot_type = GGML_TYPE_Q8_K,
+ },
+ [GGML_TYPE_Q3_K] = {
+ .dequantize_row_q = (dequantize_row_q_t) dequantize_row_q3_K,
+ .quantize_row_q = quantize_row_q3_K,
+ .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q3_K_reference,
+ .quantize_row_q_dot = quantize_row_q8_K,
+ .vec_dot_q = ggml_vec_dot_q3_K_q8_K,
+ .vec_dot_type = GGML_TYPE_Q8_K,
+ },
+ [GGML_TYPE_Q4_K] = {
+ .dequantize_row_q = (dequantize_row_q_t) dequantize_row_q4_K,
+ .quantize_row_q = quantize_row_q4_K,
+ .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q4_K_reference,
+ .quantize_row_q_dot = quantize_row_q8_K,
+ .vec_dot_q = ggml_vec_dot_q4_K_q8_K,
+ .vec_dot_type = GGML_TYPE_Q8_K,
+ },
+ [GGML_TYPE_Q5_K] = {
+ .dequantize_row_q = (dequantize_row_q_t) dequantize_row_q5_K,
+ .quantize_row_q = quantize_row_q5_K,
+ .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q5_K_reference,
+ .quantize_row_q_dot = quantize_row_q8_K,
+ .vec_dot_q = ggml_vec_dot_q5_K_q8_K,
+ .vec_dot_type = GGML_TYPE_Q8_K,
+ },
+ [GGML_TYPE_Q6_K] = {
+ .dequantize_row_q = (dequantize_row_q_t) dequantize_row_q6_K,
+ .quantize_row_q = quantize_row_q6_K,
+ .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q6_K_reference,
+ .quantize_row_q_dot = quantize_row_q8_K,
+ .vec_dot_q = ggml_vec_dot_q6_K_q8_K,
+ .vec_dot_type = GGML_TYPE_Q8_K,
+ },
+ #endif
  };
 
  // For internal test use
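The new entries extend ggml's per-type dispatch table: each k-quant type registers its (de)quantization kernels, declares Q8_K as the format activations are requantized to (vec_dot_type), and points vec_dot_q at the matching Q*_K × Q8_K kernel. A minimal, self-contained sketch of the same designated-initializer dispatch pattern (toy names and a stub kernel, not the real ggml code):

```c
#include <stdio.h>

enum toy_type { TOY_Q4_K, TOY_Q8_K, TOY_TYPE_COUNT };

typedef void (*vec_dot_fn)(int n, float * s);

// stub standing in for ggml_vec_dot_q4_K_q8_K
static void toy_vec_dot_q4_K_q8_K(int n, float * s) { *s = (float) n; }

static const struct {
    vec_dot_fn    vec_dot_q;     // kernel for this type paired with vec_dot_type
    enum toy_type vec_dot_type;  // format the other operand must be quantized to
} toy_fns[TOY_TYPE_COUNT] = {
    [TOY_Q4_K] = { .vec_dot_q = toy_vec_dot_q4_K_q8_K, .vec_dot_type = TOY_Q8_K },
};

int main(void) {
    float s = 0.0f;
    toy_fns[TOY_Q4_K].vec_dot_q(256, &s); // dispatch through the table by type enum
    printf("dot type %d, result %.0f\n", toy_fns[TOY_Q4_K].vec_dot_type, s);
    return 0;
}
```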
@@ -2290,7 +2352,7 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
  const __m128i i32_1 = mul_sum_i8_pairs(bx, by);
 
  // Convert int32_t to float
- __m256 p = _mm256_cvtepi32_ps(_mm256_set_m128i(i32_0, i32_1));
+ __m256 p = _mm256_cvtepi32_ps(MM256_SET_M128I(i32_0, i32_1));
 
  // Apply the scale, and accumulate
  acc = _mm256_add_ps(_mm256_mul_ps( d, p ), acc);
@@ -2766,7 +2828,7 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void *
  __m128i bxh = _mm256_extractf128_si256(bx, 1);
  bxl = _mm_or_si128(bxl, bxhil);
  bxh = _mm_or_si128(bxh, bxhih);
- bx = _mm256_set_m128i(bxh, bxl);
+ bx = MM256_SET_M128I(bxh, bxl);
 
  const __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs);
 
@@ -3022,7 +3084,7 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *
  __m128i bxh = _mm256_extractf128_si256(bx, 1);
  bxl = _mm_or_si128(bxl, bxhil);
  bxh = _mm_or_si128(bxh, bxhih);
- bx = _mm256_set_m128i(bxh, bxl);
+ bx = MM256_SET_M128I(bxh, bxl);
 
  const __m256 dy = _mm256_set1_ps(y[i].d);
  const __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs);
@@ -3444,11 +3506,19 @@ static const int GGML_BLCK_SIZE[GGML_TYPE_COUNT] = {
  [GGML_TYPE_Q5_1] = QK5_1,
  [GGML_TYPE_Q8_0] = QK8_0,
  [GGML_TYPE_Q8_1] = QK8_1,
+ #ifdef GGML_USE_K_QUANTS
+ [GGML_TYPE_Q2_K] = QK_K,
+ [GGML_TYPE_Q3_K] = QK_K,
+ [GGML_TYPE_Q4_K] = QK_K,
+ [GGML_TYPE_Q5_K] = QK_K,
+ [GGML_TYPE_Q6_K] = QK_K,
+ [GGML_TYPE_Q8_K] = QK_K,
+ #endif
  [GGML_TYPE_I8] = 1,
  [GGML_TYPE_I16] = 1,
  [GGML_TYPE_I32] = 1,
  };
- static_assert(GGML_TYPE_COUNT == 13, "GGML_BLCK_SIZE is outdated");
+ static_assert(GGML_TYPE_COUNT == 19, "GGML_BLCK_SIZE is outdated");
 
  static const size_t GGML_TYPE_SIZE[GGML_TYPE_COUNT] = {
  [GGML_TYPE_F32] = sizeof(float),
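All of the new types use the k-quant super-block length QK_K as their block size. Together with the per-block byte sizes registered in GGML_TYPE_SIZE just below, that fixes the effective bits per weight. A quick back-of-the-envelope check, assuming the default QK_K = 256 build in which block_q4_K is 144 bytes (values taken from k_quants.h, not from this diff):

```c
#include <stdio.h>

int main(void) {
    // Assumed sizes for the default QK_K = 256 build; they follow from the
    // structs in k_quants.h, not from anything shown in this diff.
    const int QK_K        = 256;
    const int sizeof_q4_K = 144; // 2 fp16 scales + 12 scale bytes + 128 nibble bytes
    const int QK8_0       = 32;
    const int sizeof_q8_0 = 34;  // for comparison: 1 fp16 scale + 32 int8

    printf("q4_K: %.2f bits/weight\n", 8.0 * sizeof_q4_K / QK_K);  // 4.50
    printf("q8_0: %.2f bits/weight\n", 8.0 * sizeof_q8_0 / QK8_0); // 8.50
    return 0;
}
```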
@@ -3459,11 +3529,19 @@ static const size_t GGML_TYPE_SIZE[GGML_TYPE_COUNT] = {
  [GGML_TYPE_Q5_1] = sizeof(block_q5_1),
  [GGML_TYPE_Q8_0] = sizeof(block_q8_0),
  [GGML_TYPE_Q8_1] = sizeof(block_q8_1),
+ #ifdef GGML_USE_K_QUANTS
+ [GGML_TYPE_Q2_K] = sizeof(block_q2_K),
+ [GGML_TYPE_Q3_K] = sizeof(block_q3_K),
+ [GGML_TYPE_Q4_K] = sizeof(block_q4_K),
+ [GGML_TYPE_Q5_K] = sizeof(block_q5_K),
+ [GGML_TYPE_Q6_K] = sizeof(block_q6_K),
+ [GGML_TYPE_Q8_K] = sizeof(block_q8_K),
+ #endif
  [GGML_TYPE_I8] = sizeof(int8_t),
  [GGML_TYPE_I16] = sizeof(int16_t),
  [GGML_TYPE_I32] = sizeof(int32_t),
  };
- static_assert(GGML_TYPE_COUNT == 13, "GGML_TYPE_SIZE is outdated");
+ static_assert(GGML_TYPE_COUNT == 19, "GGML_TYPE_SIZE is outdated");
 
 
  static const char * GGML_TYPE_NAME[GGML_TYPE_COUNT] = {
@@ -3475,11 +3553,17 @@ static const char * GGML_TYPE_NAME[GGML_TYPE_COUNT] = {
  [GGML_TYPE_Q5_1] = "q5_1",
  [GGML_TYPE_Q8_0] = "q8_0",
  [GGML_TYPE_Q8_1] = "q8_1",
+ [GGML_TYPE_Q2_K] = "q2_K",
+ [GGML_TYPE_Q3_K] = "q3_K",
+ [GGML_TYPE_Q4_K] = "q4_K",
+ [GGML_TYPE_Q5_K] = "q5_K",
+ [GGML_TYPE_Q6_K] = "q6_K",
+ [GGML_TYPE_Q8_K] = "q8_K",
  [GGML_TYPE_I8] = "i8",
  [GGML_TYPE_I16] = "i16",
  [GGML_TYPE_I32] = "i32",
  };
- static_assert(GGML_TYPE_COUNT == 13, "GGML_TYPE_NAME is outdated");
+ static_assert(GGML_TYPE_COUNT == 19, "GGML_TYPE_NAME is outdated");
 
  static bool GGML_IS_QUANTIZED[GGML_TYPE_COUNT] = {
  [GGML_TYPE_F32] = false,
@@ -3490,11 +3574,17 @@ static bool GGML_IS_QUANTIZED[GGML_TYPE_COUNT] = {
  [GGML_TYPE_Q5_1] = true,
  [GGML_TYPE_Q8_0] = true,
  [GGML_TYPE_Q8_1] = true,
+ [GGML_TYPE_Q2_K] = true,
+ [GGML_TYPE_Q3_K] = true,
+ [GGML_TYPE_Q4_K] = true,
+ [GGML_TYPE_Q5_K] = true,
+ [GGML_TYPE_Q6_K] = true,
+ [GGML_TYPE_Q8_K] = true,
  [GGML_TYPE_I8] = false,
  [GGML_TYPE_I16] = false,
  [GGML_TYPE_I32] = false,
  };
- static_assert(GGML_TYPE_COUNT == 13, "GGML_IS_QUANTIZED is outdated");
+ static_assert(GGML_TYPE_COUNT == 19, "GGML_IS_QUANTIZED is outdated");
 
  static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
  "NONE",
@@ -3631,6 +3721,7 @@ struct ggml_context {
  void * mem_buffer;
  bool mem_buffer_owned;
  bool no_alloc;
+ bool no_alloc_save; // this is used to save the no_alloc state when using scratch buffers
 
  int n_objects;
 
@@ -3647,26 +3738,6 @@ struct ggml_context_container {
  struct ggml_context context;
  };
 
- //
- // compute types
- //
-
- enum ggml_task_type {
- GGML_TASK_INIT = 0,
- GGML_TASK_COMPUTE,
- GGML_TASK_FINALIZE,
- };
-
- struct ggml_compute_params {
- enum ggml_task_type type;
-
- int ith, nth;
-
- // work buffer for all threads
- size_t wsize;
- void * wdata;
- };
-
  //
  // ggml state
  //
@@ -3723,7 +3794,7 @@ int64_t ggml_nelements(const struct ggml_tensor * tensor) {
  return tensor->ne[0]*tensor->ne[1]*tensor->ne[2]*tensor->ne[3];
  }
 
- int ggml_nrows(const struct ggml_tensor * tensor) {
+ int64_t ggml_nrows(const struct ggml_tensor * tensor) {
  static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
 
  return tensor->ne[1]*tensor->ne[2]*tensor->ne[3];
@@ -3732,7 +3803,20 @@ int ggml_nrows(const struct ggml_tensor * tensor) {
  size_t ggml_nbytes(const struct ggml_tensor * tensor) {
  static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
 
- return (ggml_nelements(tensor)*GGML_TYPE_SIZE[tensor->type])/GGML_BLCK_SIZE[tensor->type];
+ // this should handle cases where the tensor is not contiguous in memory
+ // probaby just:
+ //
+ // return tensor->ne[3]*tensor->nb[3]
+ //
+ // is enough, but just in case, adding the second part
+
+ return MAX(tensor->ne[3]*tensor->nb[3], (ggml_nelements(tensor)*GGML_TYPE_SIZE[tensor->type])/GGML_BLCK_SIZE[tensor->type]);
+ }
+
+ size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split) {
+ static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
+
+ return (nrows_split*tensor->ne[0]*GGML_TYPE_SIZE[tensor->type])/GGML_BLCK_SIZE[tensor->type];
  }
 
  int ggml_blck_size(enum ggml_type type) {
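ggml_nbytes now takes the larger of the stride-based size (ne[3]*nb[3]) and the element-count size, so non-contiguous tensors are not under-counted, and the new ggml_nbytes_split sizes a slice of nrows_split rows. A small worked example of the latter formula (the 18-byte / 32-element Q4_0 block layout is assumed, it is not stated in this diff):

```c
#include <stdio.h>

int main(void) {
    // Assumed Q4_0 layout: blocks of 32 weights stored in 18 bytes each.
    const int QK4_0       = 32;
    const int sizeof_q4_0 = 18;

    const long ne0         = 4096; // row length, hypothetical
    const int  nrows_split = 2;    // rows assigned to one split

    // mirrors: (nrows_split * ne[0] * GGML_TYPE_SIZE[type]) / GGML_BLCK_SIZE[type]
    const long nbytes = nrows_split * ne0 * sizeof_q4_0 / QK4_0;
    printf("%ld bytes\n", nbytes); // 4608
    return 0;
}
```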
@@ -3801,6 +3885,11 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) {
  case GGML_FTYPE_MOSTLY_Q5_0: wtype = GGML_TYPE_Q5_0; break;
  case GGML_FTYPE_MOSTLY_Q5_1: wtype = GGML_TYPE_Q5_1; break;
  case GGML_FTYPE_MOSTLY_Q8_0: wtype = GGML_TYPE_Q8_0; break;
+ case GGML_FTYPE_MOSTLY_Q2_K: wtype = GGML_TYPE_Q2_K; break;
+ case GGML_FTYPE_MOSTLY_Q3_K: wtype = GGML_TYPE_Q3_K; break;
+ case GGML_FTYPE_MOSTLY_Q4_K: wtype = GGML_TYPE_Q4_K; break;
+ case GGML_FTYPE_MOSTLY_Q5_K: wtype = GGML_TYPE_Q5_K; break;
+ case GGML_FTYPE_MOSTLY_Q6_K: wtype = GGML_TYPE_Q6_K; break;
  case GGML_FTYPE_UNKNOWN: wtype = GGML_TYPE_COUNT; break;
  case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16: wtype = GGML_TYPE_COUNT; break;
  }
@@ -3814,11 +3903,11 @@ size_t ggml_tensor_overhead(void) {
  return GGML_OBJECT_SIZE + GGML_TENSOR_SIZE + 16;
  }
 
- static inline bool ggml_is_transposed(const struct ggml_tensor * tensor) {
+ bool ggml_is_transposed(const struct ggml_tensor * tensor) {
  return tensor->nb[0] > tensor->nb[1];
  }
 
- static inline bool ggml_is_contiguous(const struct ggml_tensor * tensor) {
+ bool ggml_is_contiguous(const struct ggml_tensor * tensor) {
  static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
 
  return
@@ -3967,6 +4056,7 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
  /*.mem_buffer =*/ params.mem_buffer ? params.mem_buffer : GGML_ALIGNED_MALLOC(mem_size),
  /*.mem_buffer_owned =*/ params.mem_buffer ? false : true,
  /*.no_alloc =*/ params.no_alloc,
+ /*.no_alloc_save =*/ params.no_alloc,
  /*.n_objects =*/ 0,
  /*.objects_begin =*/ NULL,
  /*.objects_end =*/ NULL,
@@ -4044,11 +4134,18 @@ size_t ggml_get_mem_size(struct ggml_context * ctx) {
  // operators when using scratch buffers
  // TODO: implement a better way
  void ggml_scratch_save(struct ggml_context * ctx) {
+ // this is needed to allow opt tensors to store their data
+ // TODO: again, need to find a better way
+ ctx->no_alloc_save = ctx->no_alloc;
+ ctx->no_alloc = false;
+
  ctx->scratch_save = ctx->scratch;
  ctx->scratch.data = NULL;
  }
 
  void ggml_scratch_load(struct ggml_context * ctx) {
+ ctx->no_alloc = ctx->no_alloc_save;
+
  ctx->scratch = ctx->scratch_save;
  }
 
@@ -4157,6 +4254,7 @@ struct ggml_tensor * ggml_new_tensor_impl(
  /*.perf_time_us =*/ 0,
  /*.data =*/ (data == NULL && !ctx->no_alloc) ? (void *)(result + 1) : data,
  /*.name =*/ { 0 },
+ /*.extra =*/ NULL,
  /*.pad =*/ { 0 },
  };
 
@@ -5802,10 +5900,18 @@ struct ggml_tensor * ggml_view_1d(
 
  struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 1, &ne0, (char *) a->data + offset);
 
+ ggml_scratch_save(ctx);
+
+ struct ggml_tensor * offs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
+ memcpy(offs->data, &offset, 2*sizeof(int32_t));
+
+ ggml_scratch_load(ctx);
+
  result->op = GGML_OP_VIEW;
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
  result->src0 = a;
  result->src1 = NULL;
+ result->opt[0] = offs;
 
  if (is_node) {
  memcpy(result->padding, &offset, sizeof(offset));
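ggml_view_1d (and the 2d/3d/4d variants below) now stashes the byte offset of the view in a tiny 2-element I32 tensor hung off result->opt[0]; the scratch save/load around the allocation temporarily clears no_alloc so the data is actually stored. The reworked ggml_graph_import further down reads those 8 bytes back to re-apply the offset. A minimal round-trip sketch of that encoding (it assumes a 64-bit size_t, which is what copying 2*sizeof(int32_t) bytes effectively relies on):

```c
#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main(void) {
    // The view offset is a size_t; it is stored into a 2-element I32 tensor
    // simply as 8 raw bytes, and recovered the same way on import.
    const size_t offset = (size_t) 0x12345678abcdULL;

    int32_t stored[2];                              // stands in for offs->data
    memcpy(stored, &offset, 2*sizeof(int32_t));     // write path (ggml_view_*)

    uint64_t recovered = 0;
    memcpy(&recovered, stored, sizeof(recovered));  // read path (ggml_graph_import)

    printf("%d\n", recovered == (uint64_t) offset); // 1
    return 0;
}
```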
@@ -5834,6 +5940,13 @@ struct ggml_tensor * ggml_view_2d(
 
  struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 2, ne, (char *) a->data + offset);
 
+ ggml_scratch_save(ctx);
+
+ struct ggml_tensor * offs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
+ memcpy(offs->data, &offset, 2*sizeof(int32_t));
+
+ ggml_scratch_load(ctx);
+
  result->nb[1] = nb1;
  result->nb[2] = result->nb[1]*ne1;
  result->nb[3] = result->nb[2];
@@ -5842,6 +5955,7 @@ struct ggml_tensor * ggml_view_2d(
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
  result->src0 = a;
  result->src1 = NULL;
+ result->opt[0] = offs;
 
  if (is_node) {
  memcpy(result->padding, &offset, sizeof(offset));
@@ -5872,6 +5986,13 @@ struct ggml_tensor * ggml_view_3d(
 
  struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 3, ne, (char *) a->data + offset);
 
+ ggml_scratch_save(ctx);
+
+ struct ggml_tensor * offs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
+ memcpy(offs->data, &offset, 2*sizeof(int32_t));
+
+ ggml_scratch_load(ctx);
+
  result->nb[1] = nb1;
  result->nb[2] = nb2;
  result->nb[3] = result->nb[2]*ne2;
@@ -5880,6 +6001,7 @@ struct ggml_tensor * ggml_view_3d(
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
  result->src0 = a;
  result->src1 = NULL;
+ result->opt[0] = offs;
 
  if (is_node) {
  memcpy(result->padding, &offset, sizeof(offset));
@@ -5912,6 +6034,13 @@ struct ggml_tensor * ggml_view_4d(
 
  struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 4, ne, (char *) a->data + offset);
 
+ ggml_scratch_save(ctx);
+
+ struct ggml_tensor * offs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
+ memcpy(offs->data, &offset, 2*sizeof(int32_t));
+
+ ggml_scratch_load(ctx);
+
  result->nb[1] = nb1;
  result->nb[2] = nb2;
  result->nb[3] = nb3;
@@ -5920,6 +6049,7 @@ struct ggml_tensor * ggml_view_4d(
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
  result->src0 = a;
  result->src1 = NULL;
+ result->opt[0] = offs;
 
  if (is_node) {
  memcpy(result->padding, &offset, sizeof(offset));
@@ -7584,6 +7714,11 @@ static void ggml_compute_forward_add(
  case GGML_TYPE_Q5_0:
  case GGML_TYPE_Q5_1:
  case GGML_TYPE_Q8_0:
+ case GGML_TYPE_Q2_K:
+ case GGML_TYPE_Q3_K:
+ case GGML_TYPE_Q4_K:
+ case GGML_TYPE_Q5_K:
+ case GGML_TYPE_Q6_K:
  {
  ggml_compute_forward_add_q_f32(params, src0, src1, dst);
  } break;
@@ -7887,6 +8022,11 @@ static void ggml_compute_forward_add1(
  case GGML_TYPE_Q5_1:
  case GGML_TYPE_Q8_0:
  case GGML_TYPE_Q8_1:
+ case GGML_TYPE_Q2_K:
+ case GGML_TYPE_Q3_K:
+ case GGML_TYPE_Q4_K:
+ case GGML_TYPE_Q5_K:
+ case GGML_TYPE_Q6_K:
  {
  ggml_compute_forward_add1_q_f32(params, src0, src1, dst);
  } break;
@@ -8009,6 +8149,11 @@ static void ggml_compute_forward_acc(
  case GGML_TYPE_Q5_1:
  case GGML_TYPE_Q8_0:
  case GGML_TYPE_Q8_1:
+ case GGML_TYPE_Q2_K:
+ case GGML_TYPE_Q3_K:
+ case GGML_TYPE_Q4_K:
+ case GGML_TYPE_Q5_K:
+ case GGML_TYPE_Q6_K:
  default:
  {
  GGML_ASSERT(false);
@@ -8127,10 +8272,10 @@ static void ggml_compute_forward_mul_f32(
  const int ith = params->ith;
  const int nth = params->nth;
 
- #ifdef GGML_USE_CUBLAS
- if (src1->backend == GGML_BACKEND_CUDA) {
+ #ifdef GGML_USE_CLBLAST
+ if (src1->backend == GGML_BACKEND_GPU) {
  if (ith == 0) {
- ggml_cuda_mul(src0, src1, dst);
+ ggml_cl_mul(src0, src1, dst);
  }
  return;
  }
@@ -9245,7 +9390,7 @@ static void ggml_compute_forward_rms_norm_f32(
  sum += (ggml_float)(x[i00] * x[i00]);
  }
 
- float mean = sum/ne00;
+ const float mean = sum/ne00;
 
  float * y = (float *) ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3);
 
@@ -9568,14 +9713,7 @@ static void ggml_compute_forward_mul_mat_f32(
  // nb01 >= nb00 - src0 is not transposed
  // compute by src0 rows
 
- #if defined(GGML_USE_CUBLAS)
- if (ggml_cuda_can_mul_mat(src0, src1, dst)) {
- if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) {
- ggml_cuda_mul_mat(src0, src1, dst, params->wdata, params->wsize);
- }
- return;
- }
- #elif defined(GGML_USE_CLBLAST)
+ #if defined(GGML_USE_CLBLAST)
  if (ggml_cl_can_mul_mat(src0, src1, dst)) {
  if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) {
  ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize);
@@ -9740,14 +9878,7 @@ static void ggml_compute_forward_mul_mat_f16_f32(
  // nb01 >= nb00 - src0 is not transposed
  // compute by src0 rows
 
- #if defined(GGML_USE_CUBLAS)
- if (ggml_cuda_can_mul_mat(src0, src1, dst)) {
- if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) {
- ggml_cuda_mul_mat(src0, src1, dst, params->wdata, params->wsize);
- }
- return;
- }
- #elif defined(GGML_USE_CLBLAST)
+ #if defined(GGML_USE_CLBLAST)
  if (ggml_cl_can_mul_mat(src0, src1, dst)) {
  if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) {
  ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize);
@@ -9952,14 +10083,7 @@ static void ggml_compute_forward_mul_mat_q_f32(
  // nb01 >= nb00 - src0 is not transposed
  // compute by src0 rows
 
- #if defined(GGML_USE_CUBLAS)
- if (ggml_cuda_can_mul_mat(src0, src1, dst)) {
- if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) {
- ggml_cuda_mul_mat(src0, src1, dst, params->wdata, params->wsize);
- }
- return;
- }
- #elif defined(GGML_USE_CLBLAST)
+ #if defined(GGML_USE_CLBLAST)
  if (ggml_cl_can_mul_mat(src0, src1, dst)) {
  if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) {
  ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize);
@@ -10102,6 +10226,11 @@ static void ggml_compute_forward_mul_mat(
  case GGML_TYPE_Q5_1:
  case GGML_TYPE_Q8_0:
  case GGML_TYPE_Q8_1:
+ case GGML_TYPE_Q2_K:
+ case GGML_TYPE_Q3_K:
+ case GGML_TYPE_Q4_K:
+ case GGML_TYPE_Q5_K:
+ case GGML_TYPE_Q6_K:
  {
  ggml_compute_forward_mul_mat_q_f32(params, src0, src1, dst);
  } break;
@@ -10285,6 +10414,11 @@ static void ggml_compute_forward_set(
  case GGML_TYPE_Q5_1:
  case GGML_TYPE_Q8_0:
  case GGML_TYPE_Q8_1:
+ case GGML_TYPE_Q2_K:
+ case GGML_TYPE_Q3_K:
+ case GGML_TYPE_Q4_K:
+ case GGML_TYPE_Q5_K:
+ case GGML_TYPE_Q6_K:
  default:
  {
  GGML_ASSERT(false);
@@ -10450,6 +10584,11 @@ static void ggml_compute_forward_get_rows(
  case GGML_TYPE_Q5_1:
  case GGML_TYPE_Q8_0:
  case GGML_TYPE_Q8_1:
+ case GGML_TYPE_Q2_K:
+ case GGML_TYPE_Q3_K:
+ case GGML_TYPE_Q4_K:
+ case GGML_TYPE_Q5_K:
+ case GGML_TYPE_Q6_K:
  {
  ggml_compute_forward_get_rows_q(params, src0, src1, dst);
  } break;
@@ -10996,6 +11135,12 @@ static void ggml_compute_forward_alibi(
  case GGML_TYPE_Q5_1:
  case GGML_TYPE_Q8_0:
  case GGML_TYPE_Q8_1:
+ case GGML_TYPE_Q2_K:
+ case GGML_TYPE_Q3_K:
+ case GGML_TYPE_Q4_K:
+ case GGML_TYPE_Q5_K:
+ case GGML_TYPE_Q6_K:
+ case GGML_TYPE_Q8_K:
  case GGML_TYPE_I8:
  case GGML_TYPE_I16:
  case GGML_TYPE_I32:
@@ -11067,6 +11212,12 @@ static void ggml_compute_forward_clamp(
  case GGML_TYPE_Q5_1:
  case GGML_TYPE_Q8_0:
  case GGML_TYPE_Q8_1:
+ case GGML_TYPE_Q2_K:
+ case GGML_TYPE_Q3_K:
+ case GGML_TYPE_Q4_K:
+ case GGML_TYPE_Q5_K:
+ case GGML_TYPE_Q6_K:
+ case GGML_TYPE_Q8_K:
  case GGML_TYPE_I8:
  case GGML_TYPE_I16:
  case GGML_TYPE_I32:
@@ -11156,7 +11307,7 @@ static void ggml_compute_forward_rope_f32(
  theta *= theta_scale;
 
  const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
- float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
+ float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
 
  const float x0 = src[0];
  const float x1 = src[1];
@@ -11177,7 +11328,7 @@ static void ggml_compute_forward_rope_f32(
  const int64_t i0 = ib*n_dims + ic/2;
 
  const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
- float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
+ float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
 
  const float x0 = src[0];
  const float x1 = src[n_dims/2];
@@ -12885,6 +13036,15 @@ static void ggml_compute_forward_map_binary(
  static void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
  GGML_ASSERT(params);
 
+ #ifdef GGML_USE_CUBLAS
+ bool skip_cpu = ggml_cuda_compute_forward(params, tensor);
+ if (skip_cpu) {
+ return;
+ }
+ GGML_ASSERT(tensor->src0->backend == GGML_BACKEND_CPU);
+ GGML_ASSERT(tensor->src1 == NULL || tensor->src1->backend == GGML_BACKEND_CPU);
+ #endif // GGML_USE_CUBLAS
+
  switch (tensor->op) {
  case GGML_OP_DUP:
  {
@@ -14191,7 +14351,6 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
  if (ggml_cuda_can_mul_mat(node->src0, node->src1, node)) {
  node->n_tasks = 1; // TODO: this actually is doing nothing
  // the threads are still spinning
- cur = ggml_cuda_mul_mat_get_wsize(node->src0, node->src1, node);
  }
  else
  #elif defined(GGML_USE_CLBLAST)
@@ -14581,7 +14740,7 @@ static void ggml_graph_export_leaf(const struct ggml_tensor * tensor, FILE * fou
  const int64_t * ne = tensor->ne;
  const size_t * nb = tensor->nb;
 
- fprintf(fout, "%-6s %-12s %8d %8lld %8lld %8lld %8lld %16zu %16zu %16zu %16zu %16p %16s\n",
+ fprintf(fout, "%-6s %-12s %8d %" PRId64 " %" PRId64 " %" PRId64 " %" PRId64 " %16zu %16zu %16zu %16zu %16p %32s\n",
  ggml_type_name(tensor->type),
  ggml_op_name (tensor->op),
  tensor->n_dims,
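The format-string changes here (and in ggml_graph_export_node below) are a portability fix: tensor->ne[] is int64_t, and %lld / %llu do not match int64_t / uint64_t on every platform, whereas the PRId64 / PRIu64 macros from <inttypes.h> always expand to the correct conversion specifier. A minimal illustration:

```c
#include <inttypes.h>
#include <stdio.h>

int main(void) {
    const int64_t  ne   = 4096;
    const uint64_t eval = 123456789ULL;

    // PRId64 / PRIu64 expand to whatever this platform's int64_t/uint64_t need
    printf("%8" PRId64 " %" PRIu64 "\n", ne, eval);
    return 0;
}
```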
@@ -14595,7 +14754,7 @@ static void ggml_graph_export_node(const struct ggml_tensor * tensor, const char
  const int64_t * ne = tensor->ne;
  const size_t * nb = tensor->nb;
 
- fprintf(fout, "%-6s %-6s %-12s %8d %8lld %8lld %8lld %8lld %16zu %16zu %16zu %16zu %8d %16p %16s\n",
+ fprintf(fout, "%-6s %-6s %-12s %8d %" PRId64 " %" PRId64 " %" PRId64 " %" PRId64 " %16zu %16zu %16zu %16zu %8d %16p %32s\n",
  arg,
  ggml_type_name(tensor->type),
  ggml_op_name (tensor->op),
@@ -14608,8 +14767,8 @@ static void ggml_graph_export_node(const struct ggml_tensor * tensor, const char
  }
 
  void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) {
- assert(cgraph->work == NULL);
- assert(cgraph->work_size == 0);
+ //assert(cgraph->work == NULL);
+ //assert(cgraph->work_size == 0);
 
  uint64_t size_eval = 0;
 
@@ -14624,11 +14783,11 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) {
  FILE * fout = stdout;
 
  fprintf(fout, "\n");
- fprintf(fout, "%-16s %8x\n", "magic", GGML_FILE_MAGIC);
- fprintf(fout, "%-16s %8d\n", "version", GGML_FILE_VERSION);
- fprintf(fout, "%-16s %8d\n", "leafs", cgraph->n_leafs);
- fprintf(fout, "%-16s %8d\n", "nodes", cgraph->n_nodes);
- fprintf(fout, "%-16s %8llu\n", "eval", size_eval);
+ fprintf(fout, "%-16s %8x\n", "magic", GGML_FILE_MAGIC);
+ fprintf(fout, "%-16s %8d\n", "version", GGML_FILE_VERSION);
+ fprintf(fout, "%-16s %8d\n", "leafs", cgraph->n_leafs);
+ fprintf(fout, "%-16s %8d\n", "nodes", cgraph->n_nodes);
+ fprintf(fout, "%-16s %" PRIu64 "\n", "eval", size_eval);
 
  // header
  fprintf(fout, "\n");
@@ -14830,7 +14989,6 @@ struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context **
  // read file into data
  {
  FILE * fin = fopen(fname, "rb");
-
  if (!fin) {
  fprintf(stderr, "%s: failed to open %s\n", __func__, fname);
  return result;
@@ -14862,7 +15020,11 @@ struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context **
 
  data = ggml_new_tensor_1d(*ctx_data, GGML_TYPE_I8, fsize);
 
- fread(data->data, sizeof(char), fsize, fin);
+ const size_t ret = fread(data->data, sizeof(char), fsize, fin);
+ if (ret != fsize) {
+ fprintf(stderr, "%s: failed to read %s\n", __func__, fname);
+ return result;
+ }
 
  fclose(fin);
  }
@@ -14970,6 +15132,8 @@ struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context **
  op = *(const uint32_t *) ptr; ptr += sizeof(op);
  n_dims = *(const uint32_t *) ptr; ptr += sizeof(n_dims);
 
+ enum ggml_op eop = (enum ggml_op) op;
+
  int64_t ne[GGML_MAX_DIMS];
  size_t nb[GGML_MAX_DIMS];
 
@@ -14984,42 +15148,77 @@ struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context **
  nb[j] = nb_cur;
  }
 
- struct ggml_tensor * tensor = ggml_new_tensor(*ctx_eval, (enum ggml_type) type, n_dims, ne);
+ uint64_t ptr_cur = *(const uint64_t *) ptr; ptr += sizeof(ptr_cur); // TODO: not yet used
 
- tensor->op = (enum ggml_op) op;
+ const char * ptr_name = ptr; ptr += GGML_MAX_NAME;
 
- uint64_t ptr_cur = *(const uint64_t *) ptr; ptr += sizeof(ptr_cur);
+ const int32_t * ptr_arg_idx = (const int32_t *) ptr; ptr += (2 + GGML_MAX_OPT)*sizeof(int32_t);
 
- memcpy(tensor->name, ptr, GGML_MAX_NAME); ptr += GGML_MAX_NAME;
-
- for (int j = 0; j < GGML_MAX_DIMS; ++j) {
- tensor->nb[j] = nb[j];
- }
+ struct ggml_tensor * args[2 + GGML_MAX_OPT] = { NULL };
 
  // parse args
- {
- struct ggml_tensor ** args[2 + GGML_MAX_OPT] = {
- &tensor->src0,
- &tensor->src1,
- };
+ for (int j = 0; j < 2 + GGML_MAX_OPT; ++j) {
+ const int32_t arg_idx = ptr_arg_idx[j];
 
- for (int j = 0; j < GGML_MAX_OPT; ++j) {
- args[2 + j] = &tensor->opt[j];
+ if (arg_idx == -1) {
+ continue;
  }
 
- for (int j = 0; j < 2 + GGML_MAX_OPT; ++j) {
- const int32_t arg_idx = *(const int32_t *) ptr; ptr += sizeof(arg_idx);
+ if (arg_idx < GGML_MAX_NODES) {
+ args[j] = result.leafs[arg_idx];
+ } else {
+ args[j] = result.nodes[arg_idx - GGML_MAX_NODES];
+ }
+ }
 
- if (arg_idx == -1) {
- continue;
- }
+ // create the tensor
+ // "view" operations are handled differently
+ // TODO: handle inplace ops - currently a copy is always made
+
+ struct ggml_tensor * tensor = NULL;
+
+ switch (eop) {
+ // TODO: implement other view ops
+ case GGML_OP_RESHAPE:
+ {
+ tensor = ggml_reshape_4d(*ctx_eval, args[0], ne[0], ne[1], ne[2], ne[3]);
+ } break;
+ case GGML_OP_VIEW:
+ {
+ tensor = ggml_view_4d(*ctx_eval, args[0], ne[0], ne[1], ne[2], ne[3], 0, 0, 0, 0);
+
+ uint64_t offs;
+ memcpy(&offs, args[2]->data, sizeof(offs));
+
+ tensor->data = ((char *) tensor->data) + offs;
+ } break;
+ case GGML_OP_TRANSPOSE:
+ {
+ tensor = ggml_transpose(*ctx_eval, args[0]);
+ } break;
+ case GGML_OP_PERMUTE:
+ {
+ tensor = ggml_view_4d(*ctx_eval, args[0], ne[0], ne[1], ne[2], ne[3], 0, 0, 0, 0);
+ } break;
+ default:
+ {
+ tensor = ggml_new_tensor(*ctx_eval, (enum ggml_type) type, n_dims, ne);
+
+ tensor->op = eop;
+ } break;
+ }
 
- if (arg_idx < GGML_MAX_NODES) {
- *args[j] = result.leafs[arg_idx];
- } else {
- *args[j] = result.nodes[arg_idx - GGML_MAX_NODES];
- }
- }
+ memcpy(tensor->name, ptr_name, GGML_MAX_NAME);
+
+ for (int j = 0; j < GGML_MAX_DIMS; ++j) {
+ tensor->nb[j] = nb[j];
+ }
+
+ tensor->src0 = args[0];
+ tensor->src1 = args[1];
+
+ for (int j = 0; j < GGML_MAX_OPT; ++j) {
+ tensor->opt[j] = args[2 + j];
  }
 
  result.nodes[i] = tensor;
@@ -16070,6 +16269,38 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i
  block_q8_0 * block = (block_q8_0*)dst + start / QK8_0;
  result = ggml_quantize_q8_0(src + start, block, n, n, hist);
  } break;
+ #ifdef GGML_USE_K_QUANTS
+ case GGML_TYPE_Q2_K:
+ {
+ GGML_ASSERT(start % QK_K == 0);
+ block_q2_K * block = (block_q2_K*)dst + start / QK_K;
+ result = ggml_quantize_q2_K(src + start, block, n, n, hist);
+ } break;
+ case GGML_TYPE_Q3_K:
+ {
+ GGML_ASSERT(start % QK_K == 0);
+ block_q3_K * block = (block_q3_K*)dst + start / QK_K;
+ result = ggml_quantize_q3_K(src + start, block, n, n, hist);
+ } break;
+ case GGML_TYPE_Q4_K:
+ {
+ GGML_ASSERT(start % QK_K == 0);
+ block_q4_K * block = (block_q4_K*)dst + start / QK_K;
+ result = ggml_quantize_q4_K(src + start, block, n, n, hist);
+ } break;
+ case GGML_TYPE_Q5_K:
+ {
+ GGML_ASSERT(start % QK_K == 0);
+ block_q5_K * block = (block_q5_K*)dst + start / QK_K;
+ result = ggml_quantize_q5_K(src + start, block, n, n, hist);
+ } break;
+ case GGML_TYPE_Q6_K:
+ {
+ GGML_ASSERT(start % QK_K == 0);
+ block_q6_K * block = (block_q6_K*)dst + start / QK_K;
+ result = ggml_quantize_q6_K(src + start, block, n, n, hist);
+ } break;
+ #endif
  default:
  assert(false);
  }
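With these cases added, the same ggml_quantize_chunk entry point used for the older formats now covers the k-quants as well. A hedged usage sketch; it assumes the ggml.h prototype size_t ggml_quantize_chunk(enum ggml_type, const float *, void *, int start, int n, int64_t * hist) (the header shown above is truncated) and the 16-bucket histogram that llama.cpp passes:

```c
// Usage sketch, not from the diff: quantize a whole f32 buffer to Q4_K in one
// chunk. Assumes linking against this ggml build, n a multiple of QK_K, and
// dst sized via ggml_quantize_chunk's return value or an upper-bound estimate.
#include <stddef.h>
#include <stdint.h>
#include "ggml.h"

size_t quantize_buffer_q4_K(const float * src, void * dst, int n) {
    int64_t hist[16] = {0};                 // assumed histogram size (1 << 4)
    // start = 0: process the entire buffer as a single chunk
    return ggml_quantize_chunk(GGML_TYPE_Q4_K, src, dst, 0, n, hist);
}
```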