llama_cpp 0.15.0 → 0.15.2

@@ -79,6 +79,11 @@ extern "C" {
         LLAMA_VOCAB_PRE_TYPE_MPT        = 5,
         LLAMA_VOCAB_PRE_TYPE_STARCODER  = 6,
         LLAMA_VOCAB_PRE_TYPE_GPT2       = 7,
+        LLAMA_VOCAB_PRE_TYPE_REFACT     = 8,
+        LLAMA_VOCAB_PRE_TYPE_COMMAND_R  = 9,
+        LLAMA_VOCAB_PRE_TYPE_QWEN2      = 10,
+        LLAMA_VOCAB_PRE_TYPE_OLMO       = 11,
+        LLAMA_VOCAB_PRE_TYPE_DBRX       = 12,
     };
 
     // note: these values should be synchronized with ggml_rope
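llama.cpp selects these pre-tokenizer types from the tokenizer.ggml.pre metadata string in a GGUF file; the exact strings are per-model and best checked against llama.cpp's convert scripts. A minimal sketch of how client code might branch on the new values; pre_type_name is a hypothetical helper, not part of the API:

    // Hypothetical helper, not part of llama.h: map the new enum values
    // to readable names for logging.
    static const char * pre_type_name(enum llama_vocab_pre_type t) {
        switch (t) {
            case LLAMA_VOCAB_PRE_TYPE_REFACT:    return "refact";
            case LLAMA_VOCAB_PRE_TYPE_COMMAND_R: return "command-r";
            case LLAMA_VOCAB_PRE_TYPE_QWEN2:     return "qwen2";
            case LLAMA_VOCAB_PRE_TYPE_OLMO:      return "olmo";
            case LLAMA_VOCAB_PRE_TYPE_DBRX:      return "dbrx";
            default:                             return "other";
        }
    }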
@@ -134,6 +139,7 @@ extern "C" {
         LLAMA_FTYPE_MOSTLY_IQ2_M   = 29, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_IQ4_XS  = 30, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_IQ1_M   = 31, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_BF16    = 32, // except 1d tensors
 
         LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
     };
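The new BF16 file type slots into the existing quantization entry points in llama.h. A hedged sketch of producing a BF16 model; the file paths are placeholders:

    // Sketch: requantize a model to the new BF16 file type using the
    // public llama.h API. Paths are placeholders.
    llama_model_quantize_params qparams = llama_model_quantize_default_params();
    qparams.ftype = LLAMA_FTYPE_MOSTLY_BF16;
    llama_model_quantize("model-f32.gguf", "model-bf16.gguf", &qparams);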
@@ -171,7 +177,7 @@ extern "C" {
         bool sorted;
     } llama_token_data_array;
 
-    typedef bool (*llama_progress_callback)(float progress, void *ctx);
+    typedef bool (*llama_progress_callback)(float progress, void * user_data);
 
     // Input data for llama_decode
     // A llama_batch object can contain input about one or many sequences
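Renaming the callback parameter from ctx to user_data changes nothing at the ABI level (parameter names in a typedef are documentation only), but it makes clear the pointer is the caller's opaque data rather than a llama_context. A minimal callback matching the typedef, wired into llama_model_params:

    #include <stdio.h>

    // Returning false from the callback aborts model loading early.
    static bool on_progress(float progress, void * user_data) {
        (void) user_data;
        fprintf(stderr, "\rloading: %3.0f%%", progress * 100.0f);
        return true;
    }

    // wiring it up:
    llama_model_params mparams = llama_model_default_params();
    mparams.progress_callback           = on_progress;
    mparams.progress_callback_user_data = NULL;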
@@ -236,6 +242,9 @@ extern "C" {
         // proportion of the model (layers or rows) to offload to each GPU, size: llama_max_devices()
         const float * tensor_split;
 
+        // comma separated list of RPC servers to use for offloading
+        const char * rpc_servers;
+
         // Called with a progress value between 0.0 and 1.0. Pass NULL to disable.
         // If the provided progress_callback returns true, model loading continues.
        // If it returns false, model loading is immediately aborted.
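rpc_servers pairs with llama.cpp's RPC backend: layers that would go to a local GPU can instead be offloaded to remote ggml RPC endpoints. A hedged sketch; the addresses are made up:

    // Sketch: offload across two RPC servers (addresses are made up).
    llama_model_params mparams = llama_model_default_params();
    mparams.rpc_servers  = "192.168.1.10:50052,192.168.1.11:50052";
    mparams.n_gpu_layers = 99; // offload as many layers as possible
    llama_model * model  = llama_load_model_from_file("model.gguf", mparams);

The remaining hunks are in the vendored llamafile sgemm source, whose tinyBLAS quantized kernels gain an AVX (non-AVX2) fallback.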
@@ -1,6 +1,3 @@
-// -*- mode:c++;indent-tabs-mode:nil;c-basic-offset:4;coding:utf-8 -*-
-// vi: set et ft=c++ ts=4 sts=4 sw=4 fenc=utf-8 :vi
-//
 // Copyright 2024 Mozilla Foundation
 //
 // Permission is hereby granted, free of charge, to any person obtaining
@@ -585,15 +582,15 @@ class tinyBLAS_Q0_ARM {
 };
 #endif // __ARM_FEATURE_DOTPROD
 
-#if defined(__AVX2__) || defined(__AVX512F__)
+#if defined(__AVX2__) || defined(__AVX512F__) || defined(__AVX__)
 template <typename TA, typename TB, typename TC>
-class tinyBLAS_Q0_AVX2 {
+class tinyBLAS_Q0_AVX {
   public:
-    tinyBLAS_Q0_AVX2(int64_t k,
-                     const TA *A, int64_t lda,
-                     const TB *B, int64_t ldb,
-                     TC *C, int64_t ldc,
-                     int ith, int nth)
+    tinyBLAS_Q0_AVX(int64_t k,
+                    const TA *A, int64_t lda,
+                    const TB *B, int64_t ldb,
+                    TC *C, int64_t ldc,
+                    int ith, int nth)
         : A(A), B(B), C(C), k(k), lda(lda), ldb(ldb), ldc(ldc), ith(ith), nth(nth) {
     }
 
@@ -728,14 +725,34 @@ class tinyBLAS_Q0_AVX2 {
         __m256 Cv[RN][RM] = {};
         for (int64_t l = 0; l < k; ++l)
             for (int64_t j = 0; j < RN; ++j)
-                for (int64_t i = 0; i < RM; ++i)
+                for (int64_t i = 0; i < RM; ++i) {
+#if defined(__AVX2__)
+                    __m256 udTmp = updot(_mm256_sign_epi8(load(A + lda * (ii + i) + l),
+                                                          load(A + lda * (ii + i) + l)),
+                                         _mm256_sign_epi8(load(B + ldb * (jj + j) + l),
+                                                          load(A + lda * (ii + i) + l)));
+#else
+                    __m128i ali0 = load0(A + lda * (ii + i) + l);
+                    __m128i ali1 = load1(A + lda * (ii + i) + l);
+                    __m128i blj0 = load0(B + ldb * (jj + j) + l);
+                    __m128i blj1 = load1(B + ldb * (jj + j) + l);
+
+                    __m128i sepAA0 = _mm_sign_epi8(ali0, ali0);
+                    __m128i sepAA1 = _mm_sign_epi8(ali1, ali1);
+                    __m128i sepBA0 = _mm_sign_epi8(blj0, ali0);
+                    __m128i sepBA1 = _mm_sign_epi8(blj1, ali1);
+
+                    // updot
+                    const __m128i oneFill = _mm_set1_epi16(1);
+                    __m128i mad0 = _mm_maddubs_epi16(sepAA0, sepBA0);
+                    __m128i mad1 = _mm_maddubs_epi16(sepAA1, sepBA1);
+                    __m256 udTmp = _mm256_cvtepi32_ps(MM256_SET_M128I(_mm_madd_epi16(oneFill, mad1), _mm_madd_epi16(oneFill, mad0)));
+#endif
                     Cv[j][i] = madd(_mm256_set1_ps(unhalf(A[lda * (ii + i) + l].d) *
                                                    unhalf(B[ldb * (jj + j) + l].d)),
-                                    updot(_mm256_sign_epi8(load(A + lda * (ii + i) + l),
-                                                           load(A + lda * (ii + i) + l)),
-                                          _mm256_sign_epi8(load(B + ldb * (jj + j) + l),
-                                                           load(A + lda * (ii + i) + l))),
-                                    Cv[j][i]);
+                                    udTmp,
+                                    Cv[j][i]);
+                }
         for (int64_t j = 0; j < RN; ++j)
             for (int64_t i = 0; i < RM; ++i)
                 C[ldc * (jj + j) + (ii + i)] = hsum(Cv[j][i]);
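The AVX fallback above splits each 32-byte quantized block into two 128-bit halves and reproduces updot with SSSE3-era intrinsics: _mm_sign_epi8(a, a) yields |a| (safe as the unsigned operand of maddubs), _mm_sign_epi8(b, a) moves a's sign onto b so each product is unchanged, _mm_maddubs_epi16 multiplies and pairwise-adds bytes into 16-bit lanes, and _mm_madd_epi16 against a vector of ones widens the sums to 32-bit. A self-contained sketch of the same trick (dot32_i8 is an illustrative name, not code from the diff):

    #include <immintrin.h>
    #include <stdint.h>

    // Signed 32-element int8 dot product using only SSE/SSSE3 ops,
    // mirroring the #else branch of the kernel above.
    static inline int32_t dot32_i8(const int8_t *a, const int8_t *b) {
        __m128i a0 = _mm_loadu_si128((const __m128i *)a);
        __m128i a1 = _mm_loadu_si128((const __m128i *)(a + 16));
        __m128i b0 = _mm_loadu_si128((const __m128i *)b);
        __m128i b1 = _mm_loadu_si128((const __m128i *)(b + 16));
        // |a| feeds the unsigned operand of maddubs; sign(b, a) moves a's
        // sign onto b, so the per-byte products equal a*b.
        __m128i abs0 = _mm_sign_epi8(a0, a0), abs1 = _mm_sign_epi8(a1, a1);
        __m128i sgn0 = _mm_sign_epi8(b0, a0), sgn1 = _mm_sign_epi8(b1, a1);
        const __m128i ones = _mm_set1_epi16(1);
        __m128i mad0 = _mm_madd_epi16(ones, _mm_maddubs_epi16(abs0, sgn0));
        __m128i mad1 = _mm_madd_epi16(ones, _mm_maddubs_epi16(abs1, sgn1));
        __m128i sum  = _mm_add_epi32(mad0, mad1);
        // horizontal add of the four 32-bit lanes
        sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, _MM_SHUFFLE(1, 0, 3, 2)));
        sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, _MM_SHUFFLE(2, 3, 0, 1)));
        return _mm_cvtsi128_si32(sum);
    }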
@@ -746,10 +763,28 @@ class tinyBLAS_Q0_AVX2 {
         return _mm256_loadu_si256((const __m256i *)b->qs);
     }
 
+    inline __m128i load0(const block_q8_0 *b) {
+        return _mm_loadu_si128((const __m128i *)b->qs);
+    }
+
+    inline __m128i load1(const block_q8_0 *b) {
+        return _mm_loadu_si128(((const __m128i *)b->qs) + 1);
+    }
+
     inline __m256i load(const block_q4_0 *b) {
         return _mm256_sub_epi8(denibble(b->qs), _mm256_set1_epi8(8));
     }
 
+    inline __m128i load0(const block_q4_0 *b) {
+        const __m128i x = _mm_loadu_si128((const __m128i *)(b->qs));
+        return _mm_sub_epi8(_mm_and_si128(_mm_set1_epi8(15), x), _mm_set1_epi8(8));
+    }
+
+    inline __m128i load1(const block_q4_0 *b) {
+        const __m128i x = _mm_loadu_si128((const __m128i *)(b->qs));
+        return _mm_sub_epi8(_mm_and_si128(_mm_set1_epi8(15), _mm_srli_epi16(x, 4)), _mm_set1_epi8(8));
+    }
+
     inline __m256 updot(__m256i u, __m256i s) {
         __m256i res;
 #if defined(__AVXVNNI__) || (defined(__AVX512VNNI__) && defined(__AVX512VL__))
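The new load0/load1 overloads give the 128-bit path the same views the 256-bit load produced: for Q8_0, the two halves of the 32-byte block; for Q4_0, the low and high nibbles of each packed byte, recentered from [0, 15] to [-8, 7]. A scalar rendering of the Q4_0 case for reference (unpack_q4_0 is an illustrative name):

    #include <stdint.h>

    // Scalar equivalent of the Q4_0 load0/load1 pair above. Each of the
    // 16 bytes packs two 4-bit weights; subtracting 8 recenters them.
    static void unpack_q4_0(const uint8_t qs[16], int8_t lo[16], int8_t hi[16]) {
        for (int i = 0; i < 16; ++i) {
            lo[i] = (int8_t) ((qs[i] & 0x0F) - 8); // low nibbles  -> weights 0..15
            hi[i] = (int8_t) ((qs[i] >>   4) - 8); // high nibbles -> weights 16..31
        }
    }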
@@ -777,7 +812,7 @@ class tinyBLAS_Q0_AVX2 {
     const int ith;
     const int nth;
 };
-#endif // __AVX2__
+#endif // __AVX__
 
 } // namespace
 
@@ -928,8 +963,8 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
     case GGML_TYPE_Q8_0: {
         if (Btype != GGML_TYPE_Q8_0)
             return false;
-#if defined(__AVX2__) || defined(__AVX512F__)
-        tinyBLAS_Q0_AVX2<block_q8_0, block_q8_0, float> tb{
+#if defined(__AVX2__) || defined(__AVX512F__) || defined(__AVX__)
+        tinyBLAS_Q0_AVX<block_q8_0, block_q8_0, float> tb{
             k, (const block_q8_0 *)A, lda,
             (const block_q8_0 *)B, ldb,
             (float *)C, ldc,
@@ -952,8 +987,8 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
     case GGML_TYPE_Q4_0: {
         if (Btype != GGML_TYPE_Q8_0)
             return false;
-#if defined(__AVX2__) || defined(__AVX512F__)
-        tinyBLAS_Q0_AVX2<block_q4_0, block_q8_0, float> tb{
+#if defined(__AVX2__) || defined(__AVX512F__) || defined(__AVX__)
+        tinyBLAS_Q0_AVX<block_q4_0, block_q8_0, float> tb{
             k, (const block_q4_0 *)A, lda,
             (const block_q8_0 *)B, ldb,
             (float *)C, ldc,
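Net effect of the relaxed guards: CPUs with AVX but not AVX2 (Sandy Bridge and Ivy Bridge era) now take the tinyBLAS path for the Q8_0×Q8_0 and Q4_0×Q8_0 cases instead of returning false and leaving the work to ggml's generic kernels. A diagnostic-only probe of which branch a given build selects:

    // Diagnostic sketch: report at compile time which quantized kernel
    // the guards above enable for this build.
    #if defined(__AVX2__) || defined(__AVX512F__)
    #  pragma message("tinyBLAS Q0: 256-bit AVX2 dot-product path")
    #elif defined(__AVX__)
    #  pragma message("tinyBLAS Q0: 128-bit AVX/SSSE3 fallback path")
    #else
    #  pragma message("tinyBLAS Q0: disabled; llamafile_sgemm returns false")
    #endif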