llama_cpp 0.1.4 → 0.2.0

@@ -3,6 +3,10 @@

  #include "ggml.h"

+ #ifdef GGML_USE_K_QUANTS
+ #include "k_quants.h"
+ #endif
+
  #if defined(_MSC_VER) || defined(__MINGW32__)
  #include <malloc.h> // using malloc.h with MSC/MINGW
  #elif !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__)
@@ -21,6 +25,10 @@
  #include <float.h>
  #include <limits.h>

+ #ifdef GGML_USE_METAL
+ #include <unistd.h>
+ #endif
+
  // if C99 - static_assert is noop
  // ref: https://stackoverflow.com/a/53923785/4039976
  #ifndef static_assert
@@ -121,7 +129,11 @@ typedef void* thread_ret_t;
  #else
  inline static void* ggml_aligned_malloc(size_t size) {
  void* aligned_memory = NULL;
+ #ifdef GGML_USE_METAL
+ int result = posix_memalign(&aligned_memory, getpagesize(), size);
+ #else
  int result = posix_memalign(&aligned_memory, GGML_MEM_ALIGN, size);
+ #endif
  if (result != 0) {
  // Handle allocation failure
  return NULL;
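The Metal branch above switches the allocator's alignment from GGML_MEM_ALIGN to the OS page size, presumably so these buffers can later be shared with Metal, which wants page-aligned memory. A minimal standalone sketch of the same pattern (the helper name is hypothetical, not part of the patch):

    #include <stdlib.h>   // posix_memalign
    #include <unistd.h>   // getpagesize

    // Allocate `size` bytes aligned to the OS page size, mirroring the GGML_USE_METAL path.
    static void * page_aligned_alloc(size_t size) {
        void * mem = NULL;
        if (posix_memalign(&mem, (size_t) getpagesize(), size) != 0) {
            return NULL; // allocation failure
        }
        return mem;
    }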
@@ -403,21 +415,27 @@ void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, size_t n) {
  //

  #if defined(_MSC_VER) || defined(__MINGW32__)
- static int64_t timer_freq;
+ static int64_t timer_freq, timer_start;
  void ggml_time_init(void) {
- LARGE_INTEGER frequency;
- QueryPerformanceFrequency(&frequency);
- timer_freq = frequency.QuadPart;
+ LARGE_INTEGER t;
+ QueryPerformanceFrequency(&t);
+ timer_freq = t.QuadPart;
+
+ // The multiplication by 1000 or 1000000 below can cause an overflow if timer_freq
+ // and the uptime is high enough.
+ // We subtract the program start time to reduce the likelihood of that happening.
+ QueryPerformanceCounter(&t);
+ timer_start = t.QuadPart;
  }
  int64_t ggml_time_ms(void) {
  LARGE_INTEGER t;
  QueryPerformanceCounter(&t);
- return (t.QuadPart * 1000) / timer_freq;
+ return ((t.QuadPart-timer_start) * 1000) / timer_freq;
  }
  int64_t ggml_time_us(void) {
  LARGE_INTEGER t;
  QueryPerformanceCounter(&t);
- return (t.QuadPart * 1000000) / timer_freq;
+ return ((t.QuadPart-timer_start) * 1000000) / timer_freq;
  }
  #else
  void ggml_time_init(void) {}
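A rough sense of why the rebasing above matters, assuming the common 10 MHz QueryPerformanceFrequency on recent Windows (an assumption, not stated in the patch): the raw counter grows by about 1e7 ticks per second, so multiplying it by 1,000,000 overflows int64 after roughly 9.2e18 / (1e7 * 1e6) ≈ 9.2e5 s, i.e. about 10.7 days of machine uptime. Subtracting the counter value captured in ggml_time_init() makes the product depend on program runtime instead:

    #include <stdint.h>

    // Sketch of the rebased conversion used above: safe as long as the *program*
    // (not the machine) has been running for less than ~10.7 days at a 10 MHz frequency.
    static int64_t elapsed_us(int64_t now, int64_t start, int64_t freq) {
        return ((now - start) * 1000000) / freq;
    }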
@@ -474,6 +492,8 @@ static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float);
  // quantization
  //

+ #define MM256_SET_M128I(a, b) _mm256_insertf128_si256(_mm256_castsi128_si256(b), (a), 1)
+
  #if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__)
  // multiply int8_t, add results pairwise twice
  static inline __m128i mul_sum_i8_pairs(const __m128i x, const __m128i y) {
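The new MM256_SET_M128I macro is a drop-in replacement for _mm256_set_m128i, which is reportedly missing from some older GCC releases; both build a 256-bit integer vector with the second argument in the low 128-bit lane and the first in the high lane. A small sketch of the equivalence (the wrapper function is illustrative only):

    #include <immintrin.h>

    #define MM256_SET_M128I(a, b) _mm256_insertf128_si256(_mm256_castsi128_si256(b), (a), 1)

    // combine(hi, lo) behaves like _mm256_set_m128i(hi, lo): low 128 bits = lo, high 128 bits = hi
    static inline __m256i combine(__m128i hi, __m128i lo) {
        return MM256_SET_M128I(hi, lo);
    }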
@@ -533,7 +553,7 @@ static inline __m256i bytes_from_bits_32(const uint8_t * x) {
  static inline __m256i bytes_from_nibbles_32(const uint8_t * rsi)
  {
  const __m128i tmp = _mm_loadu_si128((const __m128i *)rsi);
- const __m256i bytes = _mm256_set_m128i(_mm_srli_epi16(tmp, 4), tmp);
+ const __m256i bytes = MM256_SET_M128I(_mm_srli_epi16(tmp, 4), tmp);
  const __m256i lowMask = _mm256_set1_epi8( 0xF );
  return _mm256_and_si256(lowMask, bytes);
  }
@@ -606,7 +626,7 @@ static inline __m256i bytes_from_bits_32(const uint8_t * x) {
  bytesh = _mm_or_si128(bytesh, bit_mask);
  bytesl = _mm_cmpeq_epi8(bytesl, _mm_set1_epi64x(-1));
  bytesh = _mm_cmpeq_epi8(bytesh, _mm_set1_epi64x(-1));
- return _mm256_set_m128i(bytesh, bytesl);
+ return MM256_SET_M128I(bytesh, bytesl);
  }

  // Unpack 32 4-bit fields into 32 bytes
@@ -619,7 +639,7 @@ static inline __m256i bytes_from_nibbles_32(const uint8_t * rsi)
  const __m128i lowMask = _mm_set1_epi8(0xF);
  tmpl = _mm_and_si128(lowMask, tmpl);
  tmph = _mm_and_si128(lowMask, tmph);
- return _mm256_set_m128i(tmph, tmpl);
+ return MM256_SET_M128I(tmph, tmpl);
  }

  // add int16_t pairwise and return as float vector
@@ -627,7 +647,7 @@ static inline __m256 sum_i16_pairs_float(const __m128i xh, const __m128i xl) {
  const __m128i ones = _mm_set1_epi16(1);
  const __m128i summed_pairsl = _mm_madd_epi16(ones, xl);
  const __m128i summed_pairsh = _mm_madd_epi16(ones, xh);
- const __m256i summed_pairs = _mm256_set_m128i(summed_pairsh, summed_pairsl);
+ const __m256i summed_pairs = MM256_SET_M128I(summed_pairsh, summed_pairsl);
  return _mm256_cvtepi32_ps(summed_pairs);
  }

@@ -1565,6 +1585,48 @@ static const quantize_fns_t quantize_fns[GGML_TYPE_COUNT] = {
  .vec_dot_q = NULL, // TODO
  .vec_dot_type = GGML_TYPE_Q8_1,
  },
+ #ifdef GGML_USE_K_QUANTS
+ [GGML_TYPE_Q2_K] = {
+ .dequantize_row_q = (dequantize_row_q_t) dequantize_row_q2_K,
+ .quantize_row_q = quantize_row_q2_K,
+ .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q2_K_reference,
+ .quantize_row_q_dot = quantize_row_q8_K,
+ .vec_dot_q = ggml_vec_dot_q2_K_q8_K,
+ .vec_dot_type = GGML_TYPE_Q8_K,
+ },
+ [GGML_TYPE_Q3_K] = {
+ .dequantize_row_q = (dequantize_row_q_t) dequantize_row_q3_K,
+ .quantize_row_q = quantize_row_q3_K,
+ .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q3_K_reference,
+ .quantize_row_q_dot = quantize_row_q8_K,
+ .vec_dot_q = ggml_vec_dot_q3_K_q8_K,
+ .vec_dot_type = GGML_TYPE_Q8_K,
+ },
+ [GGML_TYPE_Q4_K] = {
+ .dequantize_row_q = (dequantize_row_q_t) dequantize_row_q4_K,
+ .quantize_row_q = quantize_row_q4_K,
+ .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q4_K_reference,
+ .quantize_row_q_dot = quantize_row_q8_K,
+ .vec_dot_q = ggml_vec_dot_q4_K_q8_K,
+ .vec_dot_type = GGML_TYPE_Q8_K,
+ },
+ [GGML_TYPE_Q5_K] = {
+ .dequantize_row_q = (dequantize_row_q_t) dequantize_row_q5_K,
+ .quantize_row_q = quantize_row_q5_K,
+ .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q5_K_reference,
+ .quantize_row_q_dot = quantize_row_q8_K,
+ .vec_dot_q = ggml_vec_dot_q5_K_q8_K,
+ .vec_dot_type = GGML_TYPE_Q8_K,
+ },
+ [GGML_TYPE_Q6_K] = {
+ .dequantize_row_q = (dequantize_row_q_t) dequantize_row_q6_K,
+ .quantize_row_q = quantize_row_q6_K,
+ .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q6_K_reference,
+ .quantize_row_q_dot = quantize_row_q8_K,
+ .vec_dot_q = ggml_vec_dot_q6_K_q8_K,
+ .vec_dot_type = GGML_TYPE_Q8_K,
+ },
+ #endif
  };

  // For internal test use
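For orientation, a heavily hedged sketch of how this dispatch table is consulted elsewhere in ggml for a quantized dot product: the activation row is first converted with quantize_row_q_dot into the type named by vec_dot_type (Q8_K for all k-quants), and vec_dot_q then multiplies it against the stored k-quant row. The function-pointer signatures below are assumptions inferred from their uses in this file, not taken from this hunk:

    // `row_a` is a row already stored in a k-quant format, `row_b` is fp32, `n` elements.
    // `scratch` must be large enough to hold row_b re-quantized to fns.vec_dot_type.
    static float dot_one_row(enum ggml_type type_a, const void * row_a,
                             const float * row_b, void * scratch, int n) {
        quantize_fns_t fns = quantize_fns[type_a];
        fns.quantize_row_q_dot(row_b, scratch, n);  // e.g. quantize_row_q8_K
        float result = 0.0f;
        fns.vec_dot_q(n, &result, row_a, scratch);  // e.g. ggml_vec_dot_q4_K_q8_K
        return result;
    }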
@@ -2290,7 +2352,7 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
  const __m128i i32_1 = mul_sum_i8_pairs(bx, by);

  // Convert int32_t to float
- __m256 p = _mm256_cvtepi32_ps(_mm256_set_m128i(i32_0, i32_1));
+ __m256 p = _mm256_cvtepi32_ps(MM256_SET_M128I(i32_0, i32_1));

  // Apply the scale, and accumulate
  acc = _mm256_add_ps(_mm256_mul_ps( d, p ), acc);
@@ -2766,7 +2828,7 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void *
  __m128i bxh = _mm256_extractf128_si256(bx, 1);
  bxl = _mm_or_si128(bxl, bxhil);
  bxh = _mm_or_si128(bxh, bxhih);
- bx = _mm256_set_m128i(bxh, bxl);
+ bx = MM256_SET_M128I(bxh, bxl);

  const __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs);

@@ -3022,7 +3084,7 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *
  __m128i bxh = _mm256_extractf128_si256(bx, 1);
  bxl = _mm_or_si128(bxl, bxhil);
  bxh = _mm_or_si128(bxh, bxhih);
- bx = _mm256_set_m128i(bxh, bxl);
+ bx = MM256_SET_M128I(bxh, bxl);

  const __m256 dy = _mm256_set1_ps(y[i].d);
  const __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs);
@@ -3444,11 +3506,19 @@ static const int GGML_BLCK_SIZE[GGML_TYPE_COUNT] = {
  [GGML_TYPE_Q5_1] = QK5_1,
  [GGML_TYPE_Q8_0] = QK8_0,
  [GGML_TYPE_Q8_1] = QK8_1,
+ #ifdef GGML_USE_K_QUANTS
+ [GGML_TYPE_Q2_K] = QK_K,
+ [GGML_TYPE_Q3_K] = QK_K,
+ [GGML_TYPE_Q4_K] = QK_K,
+ [GGML_TYPE_Q5_K] = QK_K,
+ [GGML_TYPE_Q6_K] = QK_K,
+ [GGML_TYPE_Q8_K] = QK_K,
+ #endif
  [GGML_TYPE_I8] = 1,
  [GGML_TYPE_I16] = 1,
  [GGML_TYPE_I32] = 1,
  };
- static_assert(GGML_TYPE_COUNT == 13, "GGML_BLCK_SIZE is outdated");
+ static_assert(GGML_TYPE_COUNT == 19, "GGML_BLCK_SIZE is outdated");

  static const size_t GGML_TYPE_SIZE[GGML_TYPE_COUNT] = {
  [GGML_TYPE_F32] = sizeof(float),
@@ -3459,11 +3529,19 @@ static const size_t GGML_TYPE_SIZE[GGML_TYPE_COUNT] = {
  [GGML_TYPE_Q5_1] = sizeof(block_q5_1),
  [GGML_TYPE_Q8_0] = sizeof(block_q8_0),
  [GGML_TYPE_Q8_1] = sizeof(block_q8_1),
+ #ifdef GGML_USE_K_QUANTS
+ [GGML_TYPE_Q2_K] = sizeof(block_q2_K),
+ [GGML_TYPE_Q3_K] = sizeof(block_q3_K),
+ [GGML_TYPE_Q4_K] = sizeof(block_q4_K),
+ [GGML_TYPE_Q5_K] = sizeof(block_q5_K),
+ [GGML_TYPE_Q6_K] = sizeof(block_q6_K),
+ [GGML_TYPE_Q8_K] = sizeof(block_q8_K),
+ #endif
  [GGML_TYPE_I8] = sizeof(int8_t),
  [GGML_TYPE_I16] = sizeof(int16_t),
  [GGML_TYPE_I32] = sizeof(int32_t),
  };
- static_assert(GGML_TYPE_COUNT == 13, "GGML_TYPE_SIZE is outdated");
+ static_assert(GGML_TYPE_COUNT == 19, "GGML_TYPE_SIZE is outdated");


  static const char * GGML_TYPE_NAME[GGML_TYPE_COUNT] = {
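Taken together, the two tables above determine the storage cost of each new type. A small sketch of the usual row-size arithmetic, assuming QK_K is 256 and sizeof(block_q4_K) is 144 bytes in this release (both values come from k_quants.h, not from this diff):

    // Bytes needed for one row of `ne0` elements of a given type.
    static size_t row_size(enum ggml_type type, int64_t ne0) {
        return (size_t) ne0 / GGML_BLCK_SIZE[type] * GGML_TYPE_SIZE[type];
    }
    // Example: a 4096-wide Q4_K row -> 4096 / 256 * 144 = 2304 bytes, i.e. about 4.5 bits per weight.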
@@ -3475,11 +3553,17 @@ static const char * GGML_TYPE_NAME[GGML_TYPE_COUNT] = {
  [GGML_TYPE_Q5_1] = "q5_1",
  [GGML_TYPE_Q8_0] = "q8_0",
  [GGML_TYPE_Q8_1] = "q8_1",
+ [GGML_TYPE_Q2_K] = "q2_K",
+ [GGML_TYPE_Q3_K] = "q3_K",
+ [GGML_TYPE_Q4_K] = "q4_K",
+ [GGML_TYPE_Q5_K] = "q5_K",
+ [GGML_TYPE_Q6_K] = "q6_K",
+ [GGML_TYPE_Q8_K] = "q8_K",
  [GGML_TYPE_I8] = "i8",
  [GGML_TYPE_I16] = "i16",
  [GGML_TYPE_I32] = "i32",
  };
- static_assert(GGML_TYPE_COUNT == 13, "GGML_TYPE_NAME is outdated");
+ static_assert(GGML_TYPE_COUNT == 19, "GGML_TYPE_NAME is outdated");

  static bool GGML_IS_QUANTIZED[GGML_TYPE_COUNT] = {
  [GGML_TYPE_F32] = false,
@@ -3490,11 +3574,17 @@ static bool GGML_IS_QUANTIZED[GGML_TYPE_COUNT] = {
  [GGML_TYPE_Q5_1] = true,
  [GGML_TYPE_Q8_0] = true,
  [GGML_TYPE_Q8_1] = true,
+ [GGML_TYPE_Q2_K] = true,
+ [GGML_TYPE_Q3_K] = true,
+ [GGML_TYPE_Q4_K] = true,
+ [GGML_TYPE_Q5_K] = true,
+ [GGML_TYPE_Q6_K] = true,
+ [GGML_TYPE_Q8_K] = true,
  [GGML_TYPE_I8] = false,
  [GGML_TYPE_I16] = false,
  [GGML_TYPE_I32] = false,
  };
- static_assert(GGML_TYPE_COUNT == 13, "GGML_IS_QUANTIZED is outdated");
+ static_assert(GGML_TYPE_COUNT == 19, "GGML_IS_QUANTIZED is outdated");

  static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
  "NONE",
@@ -3631,6 +3721,7 @@ struct ggml_context {
  void * mem_buffer;
  bool mem_buffer_owned;
  bool no_alloc;
+ bool no_alloc_save; // this is used to save the no_alloc state when using scratch buffers

  int n_objects;

@@ -3647,26 +3738,6 @@ struct ggml_context_container {
  struct ggml_context context;
  };

- //
- // compute types
- //
-
- enum ggml_task_type {
- GGML_TASK_INIT = 0,
- GGML_TASK_COMPUTE,
- GGML_TASK_FINALIZE,
- };
-
- struct ggml_compute_params {
- enum ggml_task_type type;
-
- int ith, nth;
-
- // work buffer for all threads
- size_t wsize;
- void * wdata;
- };
-
  //
  // ggml state
  //
@@ -3723,7 +3794,7 @@ int64_t ggml_nelements(const struct ggml_tensor * tensor) {
  return tensor->ne[0]*tensor->ne[1]*tensor->ne[2]*tensor->ne[3];
  }

- int ggml_nrows(const struct ggml_tensor * tensor) {
+ int64_t ggml_nrows(const struct ggml_tensor * tensor) {
  static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");

  return tensor->ne[1]*tensor->ne[2]*tensor->ne[3];
@@ -3732,7 +3803,20 @@ int ggml_nrows(const struct ggml_tensor * tensor) {
  size_t ggml_nbytes(const struct ggml_tensor * tensor) {
  static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");

- return (ggml_nelements(tensor)*GGML_TYPE_SIZE[tensor->type])/GGML_BLCK_SIZE[tensor->type];
+ // this should handle cases where the tensor is not contiguous in memory
+ // probaby just:
+ //
+ // return tensor->ne[3]*tensor->nb[3]
+ //
+ // is enough, but just in case, adding the second part
+
+ return MAX(tensor->ne[3]*tensor->nb[3], (ggml_nelements(tensor)*GGML_TYPE_SIZE[tensor->type])/GGML_BLCK_SIZE[tensor->type]);
+ }
+
+ size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split) {
+ static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
+
+ return (nrows_split*tensor->ne[0]*GGML_TYPE_SIZE[tensor->type])/GGML_BLCK_SIZE[tensor->type];
  }

  int ggml_blck_size(enum ggml_type type) {
@@ -3801,6 +3885,11 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) {
  case GGML_FTYPE_MOSTLY_Q5_0: wtype = GGML_TYPE_Q5_0; break;
  case GGML_FTYPE_MOSTLY_Q5_1: wtype = GGML_TYPE_Q5_1; break;
  case GGML_FTYPE_MOSTLY_Q8_0: wtype = GGML_TYPE_Q8_0; break;
+ case GGML_FTYPE_MOSTLY_Q2_K: wtype = GGML_TYPE_Q2_K; break;
+ case GGML_FTYPE_MOSTLY_Q3_K: wtype = GGML_TYPE_Q3_K; break;
+ case GGML_FTYPE_MOSTLY_Q4_K: wtype = GGML_TYPE_Q4_K; break;
+ case GGML_FTYPE_MOSTLY_Q5_K: wtype = GGML_TYPE_Q5_K; break;
+ case GGML_FTYPE_MOSTLY_Q6_K: wtype = GGML_TYPE_Q6_K; break;
  case GGML_FTYPE_UNKNOWN: wtype = GGML_TYPE_COUNT; break;
  case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16: wtype = GGML_TYPE_COUNT; break;
  }
@@ -3814,11 +3903,11 @@ size_t ggml_tensor_overhead(void) {
  return GGML_OBJECT_SIZE + GGML_TENSOR_SIZE + 16;
  }

- static inline bool ggml_is_transposed(const struct ggml_tensor * tensor) {
+ bool ggml_is_transposed(const struct ggml_tensor * tensor) {
  return tensor->nb[0] > tensor->nb[1];
  }

- static inline bool ggml_is_contiguous(const struct ggml_tensor * tensor) {
+ bool ggml_is_contiguous(const struct ggml_tensor * tensor) {
  static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");

  return
@@ -3967,6 +4056,7 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
  /*.mem_buffer =*/ params.mem_buffer ? params.mem_buffer : GGML_ALIGNED_MALLOC(mem_size),
  /*.mem_buffer_owned =*/ params.mem_buffer ? false : true,
  /*.no_alloc =*/ params.no_alloc,
+ /*.no_alloc_save =*/ params.no_alloc,
  /*.n_objects =*/ 0,
  /*.objects_begin =*/ NULL,
  /*.objects_end =*/ NULL,
@@ -4044,11 +4134,18 @@ size_t ggml_get_mem_size(struct ggml_context * ctx) {
  // operators when using scratch buffers
  // TODO: implement a better way
  void ggml_scratch_save(struct ggml_context * ctx) {
+ // this is needed to allow opt tensors to store their data
+ // TODO: again, need to find a better way
+ ctx->no_alloc_save = ctx->no_alloc;
+ ctx->no_alloc = false;
+
  ctx->scratch_save = ctx->scratch;
  ctx->scratch.data = NULL;
  }

  void ggml_scratch_load(struct ggml_context * ctx) {
+ ctx->no_alloc = ctx->no_alloc_save;
+
  ctx->scratch = ctx->scratch_save;
  }
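The no_alloc bookkeeping added here exists so that the small helper tensors created between a save/load pair always get real storage, even when the caller has no_alloc enabled. The intended calling pattern, assembled from this diff rather than new API:

    ggml_scratch_save(ctx);   // park the scratch buffer and force allocation on
    struct ggml_tensor * offs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
    memcpy(offs->data, &offset, 2*sizeof(int32_t));
    ggml_scratch_load(ctx);   // restore the scratch buffer and the caller's no_alloc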
@@ -4157,6 +4254,7 @@ struct ggml_tensor * ggml_new_tensor_impl(
  /*.perf_time_us =*/ 0,
  /*.data =*/ (data == NULL && !ctx->no_alloc) ? (void *)(result + 1) : data,
  /*.name =*/ { 0 },
+ /*.extra =*/ NULL,
  /*.pad =*/ { 0 },
  };

@@ -5802,10 +5900,18 @@ struct ggml_tensor * ggml_view_1d(

  struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 1, &ne0, (char *) a->data + offset);

+ ggml_scratch_save(ctx);
+
+ struct ggml_tensor * offs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
+ memcpy(offs->data, &offset, 2*sizeof(int32_t));
+
+ ggml_scratch_load(ctx);
+
  result->op = GGML_OP_VIEW;
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
  result->src0 = a;
  result->src1 = NULL;
+ result->opt[0] = offs;

  if (is_node) {
  memcpy(result->padding, &offset, sizeof(offset));
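The offs tensor created above stashes the byte offset of the view so that it survives graph export and import (data pointers are not serialized). The offset is a size_t written as two 32-bit words and read back into a 64-bit integer; a round-trip sketch with a hypothetical offset value:

    size_t offset = 4096;  // hypothetical byte offset into the parent tensor
    struct ggml_tensor * offs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
    memcpy(offs->data, &offset, 2*sizeof(int32_t));   // store, as ggml_view_1d does

    uint64_t restored = 0;
    memcpy(&restored, offs->data, sizeof(restored));  // recover, as ggml_graph_import does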
@@ -5834,6 +5940,13 @@ struct ggml_tensor * ggml_view_2d(

  struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 2, ne, (char *) a->data + offset);

+ ggml_scratch_save(ctx);
+
+ struct ggml_tensor * offs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
+ memcpy(offs->data, &offset, 2*sizeof(int32_t));
+
+ ggml_scratch_load(ctx);
+
  result->nb[1] = nb1;
  result->nb[2] = result->nb[1]*ne1;
  result->nb[3] = result->nb[2];
@@ -5842,6 +5955,7 @@ struct ggml_tensor * ggml_view_2d(
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
  result->src0 = a;
  result->src1 = NULL;
+ result->opt[0] = offs;

  if (is_node) {
  memcpy(result->padding, &offset, sizeof(offset));
@@ -5872,6 +5986,13 @@ struct ggml_tensor * ggml_view_3d(

  struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 3, ne, (char *) a->data + offset);

+ ggml_scratch_save(ctx);
+
+ struct ggml_tensor * offs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
+ memcpy(offs->data, &offset, 2*sizeof(int32_t));
+
+ ggml_scratch_load(ctx);
+
  result->nb[1] = nb1;
  result->nb[2] = nb2;
  result->nb[3] = result->nb[2]*ne2;
@@ -5880,6 +6001,7 @@ struct ggml_tensor * ggml_view_3d(
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
  result->src0 = a;
  result->src1 = NULL;
+ result->opt[0] = offs;

  if (is_node) {
  memcpy(result->padding, &offset, sizeof(offset));
@@ -5912,6 +6034,13 @@ struct ggml_tensor * ggml_view_4d(

  struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 4, ne, (char *) a->data + offset);

+ ggml_scratch_save(ctx);
+
+ struct ggml_tensor * offs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
+ memcpy(offs->data, &offset, 2*sizeof(int32_t));
+
+ ggml_scratch_load(ctx);
+
  result->nb[1] = nb1;
  result->nb[2] = nb2;
  result->nb[3] = nb3;
@@ -5920,6 +6049,7 @@ struct ggml_tensor * ggml_view_4d(
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
  result->src0 = a;
  result->src1 = NULL;
+ result->opt[0] = offs;

  if (is_node) {
  memcpy(result->padding, &offset, sizeof(offset));
@@ -7584,6 +7714,11 @@ static void ggml_compute_forward_add(
  case GGML_TYPE_Q5_0:
  case GGML_TYPE_Q5_1:
  case GGML_TYPE_Q8_0:
+ case GGML_TYPE_Q2_K:
+ case GGML_TYPE_Q3_K:
+ case GGML_TYPE_Q4_K:
+ case GGML_TYPE_Q5_K:
+ case GGML_TYPE_Q6_K:
  {
  ggml_compute_forward_add_q_f32(params, src0, src1, dst);
  } break;
@@ -7887,6 +8022,11 @@ static void ggml_compute_forward_add1(
  case GGML_TYPE_Q5_1:
  case GGML_TYPE_Q8_0:
  case GGML_TYPE_Q8_1:
+ case GGML_TYPE_Q2_K:
+ case GGML_TYPE_Q3_K:
+ case GGML_TYPE_Q4_K:
+ case GGML_TYPE_Q5_K:
+ case GGML_TYPE_Q6_K:
  {
  ggml_compute_forward_add1_q_f32(params, src0, src1, dst);
  } break;
@@ -8009,6 +8149,11 @@ static void ggml_compute_forward_acc(
  case GGML_TYPE_Q5_1:
  case GGML_TYPE_Q8_0:
  case GGML_TYPE_Q8_1:
+ case GGML_TYPE_Q2_K:
+ case GGML_TYPE_Q3_K:
+ case GGML_TYPE_Q4_K:
+ case GGML_TYPE_Q5_K:
+ case GGML_TYPE_Q6_K:
  default:
  {
  GGML_ASSERT(false);
@@ -8127,10 +8272,10 @@ static void ggml_compute_forward_mul_f32(
  const int ith = params->ith;
  const int nth = params->nth;

- #ifdef GGML_USE_CUBLAS
- if (src1->backend == GGML_BACKEND_CUDA) {
+ #ifdef GGML_USE_CLBLAST
+ if (src1->backend == GGML_BACKEND_GPU) {
  if (ith == 0) {
- ggml_cuda_mul(src0, src1, dst);
+ ggml_cl_mul(src0, src1, dst);
  }
  return;
  }
@@ -9245,7 +9390,7 @@ static void ggml_compute_forward_rms_norm_f32(
  sum += (ggml_float)(x[i00] * x[i00]);
  }

- float mean = sum/ne00;
+ const float mean = sum/ne00;

  float * y = (float *) ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3);
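For reference, the surrounding function computes a per-row RMS normalization: the accumulated sum of squares above is divided by ne00 to get the mean, and each element is then scaled by 1/sqrt(mean + eps). A compact sketch of the whole row computation; the eps value of 1e-6 is an assumption, not visible in this hunk:

    #include <math.h>

    static void rms_norm_row(const float * x, float * y, int n) {
        double sum = 0.0;
        for (int i = 0; i < n; ++i) sum += (double) x[i] * x[i];
        const float mean  = (float)(sum / n);            // mean of squares, as in the diff
        const float scale = 1.0f / sqrtf(mean + 1e-6f);  // eps assumed
        for (int i = 0; i < n; ++i) y[i] = x[i] * scale;
    }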
9251
9396
 
@@ -9568,14 +9713,7 @@ static void ggml_compute_forward_mul_mat_f32(
9568
9713
  // nb01 >= nb00 - src0 is not transposed
9569
9714
  // compute by src0 rows
9570
9715
 
9571
- #if defined(GGML_USE_CUBLAS)
9572
- if (ggml_cuda_can_mul_mat(src0, src1, dst)) {
9573
- if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) {
9574
- ggml_cuda_mul_mat(src0, src1, dst, params->wdata, params->wsize);
9575
- }
9576
- return;
9577
- }
9578
- #elif defined(GGML_USE_CLBLAST)
9716
+ #if defined(GGML_USE_CLBLAST)
9579
9717
  if (ggml_cl_can_mul_mat(src0, src1, dst)) {
9580
9718
  if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) {
9581
9719
  ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize);
@@ -9740,14 +9878,7 @@ static void ggml_compute_forward_mul_mat_f16_f32(
9740
9878
  // nb01 >= nb00 - src0 is not transposed
9741
9879
  // compute by src0 rows
9742
9880
 
9743
- #if defined(GGML_USE_CUBLAS)
9744
- if (ggml_cuda_can_mul_mat(src0, src1, dst)) {
9745
- if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) {
9746
- ggml_cuda_mul_mat(src0, src1, dst, params->wdata, params->wsize);
9747
- }
9748
- return;
9749
- }
9750
- #elif defined(GGML_USE_CLBLAST)
9881
+ #if defined(GGML_USE_CLBLAST)
9751
9882
  if (ggml_cl_can_mul_mat(src0, src1, dst)) {
9752
9883
  if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) {
9753
9884
  ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize);
@@ -9952,14 +10083,7 @@ static void ggml_compute_forward_mul_mat_q_f32(
9952
10083
  // nb01 >= nb00 - src0 is not transposed
9953
10084
  // compute by src0 rows
9954
10085
 
9955
- #if defined(GGML_USE_CUBLAS)
9956
- if (ggml_cuda_can_mul_mat(src0, src1, dst)) {
9957
- if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) {
9958
- ggml_cuda_mul_mat(src0, src1, dst, params->wdata, params->wsize);
9959
- }
9960
- return;
9961
- }
9962
- #elif defined(GGML_USE_CLBLAST)
10086
+ #if defined(GGML_USE_CLBLAST)
9963
10087
  if (ggml_cl_can_mul_mat(src0, src1, dst)) {
9964
10088
  if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) {
9965
10089
  ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize);
@@ -10102,6 +10226,11 @@ static void ggml_compute_forward_mul_mat(
10102
10226
  case GGML_TYPE_Q5_1:
10103
10227
  case GGML_TYPE_Q8_0:
10104
10228
  case GGML_TYPE_Q8_1:
10229
+ case GGML_TYPE_Q2_K:
10230
+ case GGML_TYPE_Q3_K:
10231
+ case GGML_TYPE_Q4_K:
10232
+ case GGML_TYPE_Q5_K:
10233
+ case GGML_TYPE_Q6_K:
10105
10234
  {
10106
10235
  ggml_compute_forward_mul_mat_q_f32(params, src0, src1, dst);
10107
10236
  } break;
@@ -10285,6 +10414,11 @@ static void ggml_compute_forward_set(
10285
10414
  case GGML_TYPE_Q5_1:
10286
10415
  case GGML_TYPE_Q8_0:
10287
10416
  case GGML_TYPE_Q8_1:
10417
+ case GGML_TYPE_Q2_K:
10418
+ case GGML_TYPE_Q3_K:
10419
+ case GGML_TYPE_Q4_K:
10420
+ case GGML_TYPE_Q5_K:
10421
+ case GGML_TYPE_Q6_K:
10288
10422
  default:
10289
10423
  {
10290
10424
  GGML_ASSERT(false);
@@ -10450,6 +10584,11 @@ static void ggml_compute_forward_get_rows(
10450
10584
  case GGML_TYPE_Q5_1:
10451
10585
  case GGML_TYPE_Q8_0:
10452
10586
  case GGML_TYPE_Q8_1:
10587
+ case GGML_TYPE_Q2_K:
10588
+ case GGML_TYPE_Q3_K:
10589
+ case GGML_TYPE_Q4_K:
10590
+ case GGML_TYPE_Q5_K:
10591
+ case GGML_TYPE_Q6_K:
10453
10592
  {
10454
10593
  ggml_compute_forward_get_rows_q(params, src0, src1, dst);
10455
10594
  } break;
@@ -10996,6 +11135,12 @@ static void ggml_compute_forward_alibi(
10996
11135
  case GGML_TYPE_Q5_1:
10997
11136
  case GGML_TYPE_Q8_0:
10998
11137
  case GGML_TYPE_Q8_1:
11138
+ case GGML_TYPE_Q2_K:
11139
+ case GGML_TYPE_Q3_K:
11140
+ case GGML_TYPE_Q4_K:
11141
+ case GGML_TYPE_Q5_K:
11142
+ case GGML_TYPE_Q6_K:
11143
+ case GGML_TYPE_Q8_K:
10999
11144
  case GGML_TYPE_I8:
11000
11145
  case GGML_TYPE_I16:
11001
11146
  case GGML_TYPE_I32:
@@ -11067,6 +11212,12 @@ static void ggml_compute_forward_clamp(
11067
11212
  case GGML_TYPE_Q5_1:
11068
11213
  case GGML_TYPE_Q8_0:
11069
11214
  case GGML_TYPE_Q8_1:
11215
+ case GGML_TYPE_Q2_K:
11216
+ case GGML_TYPE_Q3_K:
11217
+ case GGML_TYPE_Q4_K:
11218
+ case GGML_TYPE_Q5_K:
11219
+ case GGML_TYPE_Q6_K:
11220
+ case GGML_TYPE_Q8_K:
11070
11221
  case GGML_TYPE_I8:
11071
11222
  case GGML_TYPE_I16:
11072
11223
  case GGML_TYPE_I32:
@@ -11156,7 +11307,7 @@ static void ggml_compute_forward_rope_f32(
11156
11307
  theta *= theta_scale;
11157
11308
 
11158
11309
  const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
11159
- float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
11310
+ float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
11160
11311
 
11161
11312
  const float x0 = src[0];
11162
11313
  const float x1 = src[1];
@@ -11177,7 +11328,7 @@ static void ggml_compute_forward_rope_f32(
11177
11328
  const int64_t i0 = ib*n_dims + ic/2;
11178
11329
 
11179
11330
  const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
11180
- float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
11331
+ float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
11181
11332
 
11182
11333
  const float x0 = src[0];
11183
11334
  const float x1 = src[n_dims/2];
@@ -12885,6 +13036,15 @@ static void ggml_compute_forward_map_binary(
12885
13036
  static void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
12886
13037
  GGML_ASSERT(params);
12887
13038
 
13039
+ #ifdef GGML_USE_CUBLAS
13040
+ bool skip_cpu = ggml_cuda_compute_forward(params, tensor);
13041
+ if (skip_cpu) {
13042
+ return;
13043
+ }
13044
+ GGML_ASSERT(tensor->src0->backend == GGML_BACKEND_CPU);
13045
+ GGML_ASSERT(tensor->src1 == NULL || tensor->src1->backend == GGML_BACKEND_CPU);
13046
+ #endif // GGML_USE_CUBLAS
13047
+
12888
13048
  switch (tensor->op) {
12889
13049
  case GGML_OP_DUP:
12890
13050
  {
@@ -14191,7 +14351,6 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
  if (ggml_cuda_can_mul_mat(node->src0, node->src1, node)) {
  node->n_tasks = 1; // TODO: this actually is doing nothing
  // the threads are still spinning
- cur = ggml_cuda_mul_mat_get_wsize(node->src0, node->src1, node);
  }
  else
  #elif defined(GGML_USE_CLBLAST)
@@ -14581,7 +14740,7 @@ static void ggml_graph_export_leaf(const struct ggml_tensor * tensor, FILE * fou
  const int64_t * ne = tensor->ne;
  const size_t * nb = tensor->nb;

- fprintf(fout, "%-6s %-12s %8d %8lld %8lld %8lld %8lld %16zu %16zu %16zu %16zu %16p %16s\n",
+ fprintf(fout, "%-6s %-12s %8d %" PRId64 " %" PRId64 " %" PRId64 " %" PRId64 " %16zu %16zu %16zu %16zu %16p %32s\n",
  ggml_type_name(tensor->type),
  ggml_op_name (tensor->op),
  tensor->n_dims,
@@ -14595,7 +14754,7 @@ static void ggml_graph_export_node(const struct ggml_tensor * tensor, const char
  const int64_t * ne = tensor->ne;
  const size_t * nb = tensor->nb;

- fprintf(fout, "%-6s %-6s %-12s %8d %8lld %8lld %8lld %8lld %16zu %16zu %16zu %16zu %8d %16p %16s\n",
+ fprintf(fout, "%-6s %-6s %-12s %8d %" PRId64 " %" PRId64 " %" PRId64 " %" PRId64 " %16zu %16zu %16zu %16zu %8d %16p %32s\n",
  arg,
  ggml_type_name(tensor->type),
  ggml_op_name (tensor->op),
@@ -14608,8 +14767,8 @@ static void ggml_graph_export_node(const struct ggml_tensor * tensor, const char
  }

  void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) {
- assert(cgraph->work == NULL);
- assert(cgraph->work_size == 0);
+ //assert(cgraph->work == NULL);
+ //assert(cgraph->work_size == 0);

  uint64_t size_eval = 0;

@@ -14624,11 +14783,11 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) {
  FILE * fout = stdout;

  fprintf(fout, "\n");
- fprintf(fout, "%-16s %8x\n", "magic", GGML_FILE_MAGIC);
- fprintf(fout, "%-16s %8d\n", "version", GGML_FILE_VERSION);
- fprintf(fout, "%-16s %8d\n", "leafs", cgraph->n_leafs);
- fprintf(fout, "%-16s %8d\n", "nodes", cgraph->n_nodes);
- fprintf(fout, "%-16s %8llu\n", "eval", size_eval);
+ fprintf(fout, "%-16s %8x\n", "magic", GGML_FILE_MAGIC);
+ fprintf(fout, "%-16s %8d\n", "version", GGML_FILE_VERSION);
+ fprintf(fout, "%-16s %8d\n", "leafs", cgraph->n_leafs);
+ fprintf(fout, "%-16s %8d\n", "nodes", cgraph->n_nodes);
+ fprintf(fout, "%-16s %" PRIu64 "\n", "eval", size_eval);

  // header
  fprintf(fout, "\n");
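The format-string changes in the export helpers swap hard-coded %lld/%llu specifiers for the <inttypes.h> macros, which expand to the correct length modifier on MSVC, MinGW, and glibc alike. A minimal illustration:

    #include <inttypes.h>
    #include <stdint.h>
    #include <stdio.h>

    int main(void) {
        int64_t  ne   = 4096;
        uint64_t eval = 123456;
        printf("%" PRId64 " %" PRIu64 "\n", ne, eval);  // portable 64-bit printing
        return 0;
    }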
@@ -14830,7 +14989,6 @@ struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context **
  // read file into data
  {
  FILE * fin = fopen(fname, "rb");
-
  if (!fin) {
  fprintf(stderr, "%s: failed to open %s\n", __func__, fname);
  return result;
@@ -14862,7 +15020,11 @@ struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context **

  data = ggml_new_tensor_1d(*ctx_data, GGML_TYPE_I8, fsize);

- fread(data->data, sizeof(char), fsize, fin);
+ const size_t ret = fread(data->data, sizeof(char), fsize, fin);
+ if (ret != fsize) {
+ fprintf(stderr, "%s: failed to read %s\n", __func__, fname);
+ return result;
+ }

  fclose(fin);
  }
@@ -14970,6 +15132,8 @@ struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context **
  op = *(const uint32_t *) ptr; ptr += sizeof(op);
  n_dims = *(const uint32_t *) ptr; ptr += sizeof(n_dims);

+ enum ggml_op eop = (enum ggml_op) op;
+
  int64_t ne[GGML_MAX_DIMS];
  size_t nb[GGML_MAX_DIMS];
@@ -14984,42 +15148,77 @@ struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context **
  nb[j] = nb_cur;
  }

- struct ggml_tensor * tensor = ggml_new_tensor(*ctx_eval, (enum ggml_type) type, n_dims, ne);
+ uint64_t ptr_cur = *(const uint64_t *) ptr; ptr += sizeof(ptr_cur); // TODO: not yet used

- tensor->op = (enum ggml_op) op;
+ const char * ptr_name = ptr; ptr += GGML_MAX_NAME;

- uint64_t ptr_cur = *(const uint64_t *) ptr; ptr += sizeof(ptr_cur);
+ const int32_t * ptr_arg_idx = (const int32_t *) ptr; ptr += (2 + GGML_MAX_OPT)*sizeof(int32_t);

- memcpy(tensor->name, ptr, GGML_MAX_NAME); ptr += GGML_MAX_NAME;
-
- for (int j = 0; j < GGML_MAX_DIMS; ++j) {
- tensor->nb[j] = nb[j];
- }
+ struct ggml_tensor * args[2 + GGML_MAX_OPT] = { NULL };

  // parse args
- {
- struct ggml_tensor ** args[2 + GGML_MAX_OPT] = {
- &tensor->src0,
- &tensor->src1,
- };
+ for (int j = 0; j < 2 + GGML_MAX_OPT; ++j) {
+ const int32_t arg_idx = ptr_arg_idx[j];

- for (int j = 0; j < GGML_MAX_OPT; ++j) {
- args[2 + j] = &tensor->opt[j];
+ if (arg_idx == -1) {
+ continue;
  }

- for (int j = 0; j < 2 + GGML_MAX_OPT; ++j) {
- const int32_t arg_idx = *(const int32_t *) ptr; ptr += sizeof(arg_idx);
+ if (arg_idx < GGML_MAX_NODES) {
+ args[j] = result.leafs[arg_idx];
+ } else {
+ args[j] = result.nodes[arg_idx - GGML_MAX_NODES];
+ }
+ }

- if (arg_idx == -1) {
- continue;
- }
+ // create the tensor
+ // "view" operations are handled differently
+ // TODO: handle inplace ops - currently a copy is always made
+
+ struct ggml_tensor * tensor = NULL;
+
+ switch (eop) {
+ // TODO: implement other view ops
+ case GGML_OP_RESHAPE:
+ {
+ tensor = ggml_reshape_4d(*ctx_eval, args[0], ne[0], ne[1], ne[2], ne[3]);
+ } break;
+ case GGML_OP_VIEW:
+ {
+ tensor = ggml_view_4d(*ctx_eval, args[0], ne[0], ne[1], ne[2], ne[3], 0, 0, 0, 0);
+
+ uint64_t offs;
+ memcpy(&offs, args[2]->data, sizeof(offs));
+
+ tensor->data = ((char *) tensor->data) + offs;
+ } break;
+ case GGML_OP_TRANSPOSE:
+ {
+ tensor = ggml_transpose(*ctx_eval, args[0]);
+ } break;
+ case GGML_OP_PERMUTE:
+ {
+ tensor = ggml_view_4d(*ctx_eval, args[0], ne[0], ne[1], ne[2], ne[3], 0, 0, 0, 0);
+ } break;
+ default:
+ {
+ tensor = ggml_new_tensor(*ctx_eval, (enum ggml_type) type, n_dims, ne);
+
+ tensor->op = eop;
+ } break;
+ }

- if (arg_idx < GGML_MAX_NODES) {
- *args[j] = result.leafs[arg_idx];
- } else {
- *args[j] = result.nodes[arg_idx - GGML_MAX_NODES];
- }
- }
+ memcpy(tensor->name, ptr_name, GGML_MAX_NAME);
+
+ for (int j = 0; j < GGML_MAX_DIMS; ++j) {
+ tensor->nb[j] = nb[j];
+ }
+
+ tensor->src0 = args[0];
+ tensor->src1 = args[1];
+
+ for (int j = 0; j < GGML_MAX_OPT; ++j) {
+ tensor->opt[j] = args[2 + j];
  }

  result.nodes[i] = tensor;
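A note on why PERMUTE (and VIEW) can be re-imported as plain views here: the serialized nb[] strides are copied back onto the node right after it is created, so the original memory layout is reproduced by the strides rather than by replaying the op. A sketch of that step, under the same assumptions as the surrounding code:

    struct ggml_tensor * t = ggml_view_4d(*ctx_eval, args[0], ne[0], ne[1], ne[2], ne[3], 0, 0, 0, 0);
    for (int j = 0; j < GGML_MAX_DIMS; ++j) {
        t->nb[j] = nb[j];  // restored strides encode the original permutation/view layout
    }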
@@ -16070,6 +16269,38 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i
  block_q8_0 * block = (block_q8_0*)dst + start / QK8_0;
  result = ggml_quantize_q8_0(src + start, block, n, n, hist);
  } break;
+ #ifdef GGML_USE_K_QUANTS
+ case GGML_TYPE_Q2_K:
+ {
+ GGML_ASSERT(start % QK_K == 0);
+ block_q2_K * block = (block_q2_K*)dst + start / QK_K;
+ result = ggml_quantize_q2_K(src + start, block, n, n, hist);
+ } break;
+ case GGML_TYPE_Q3_K:
+ {
+ GGML_ASSERT(start % QK_K == 0);
+ block_q3_K * block = (block_q3_K*)dst + start / QK_K;
+ result = ggml_quantize_q3_K(src + start, block, n, n, hist);
+ } break;
+ case GGML_TYPE_Q4_K:
+ {
+ GGML_ASSERT(start % QK_K == 0);
+ block_q4_K * block = (block_q4_K*)dst + start / QK_K;
+ result = ggml_quantize_q4_K(src + start, block, n, n, hist);
+ } break;
+ case GGML_TYPE_Q5_K:
+ {
+ GGML_ASSERT(start % QK_K == 0);
+ block_q5_K * block = (block_q5_K*)dst + start / QK_K;
+ result = ggml_quantize_q5_K(src + start, block, n, n, hist);
+ } break;
+ case GGML_TYPE_Q6_K:
+ {
+ GGML_ASSERT(start % QK_K == 0);
+ block_q6_K * block = (block_q6_K*)dst + start / QK_K;
+ result = ggml_quantize_q6_K(src + start, block, n, n, hist);
+ } break;
+ #endif
  default:
  assert(false);
  }
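A hedged usage sketch for the new k-quant cases: the hunk header truncates the signature of ggml_quantize_chunk, so the trailing parameters are assumed here to be (int start, int n, int64_t * hist), matching how start and n are used above; block_q4_K and QK_K come from k_quants.h.

    float   src[QK_K * 4];                 // 4 super-blocks of fp32 weights (fill before calling)
    uint8_t dst[4 * sizeof(block_q4_K)];   // destination for the quantized blocks
    int64_t hist[16] = { 0 };              // histogram buckets filled by the quantizer
    size_t  bytes = ggml_quantize_chunk(GGML_TYPE_Q4_K, src, dst, /*start=*/0, /*n=*/QK_K * 4, hist);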