llama_cpp 0.15.0 → 0.15.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -79,6 +79,11 @@ extern "C" {
         LLAMA_VOCAB_PRE_TYPE_MPT        = 5,
         LLAMA_VOCAB_PRE_TYPE_STARCODER  = 6,
         LLAMA_VOCAB_PRE_TYPE_GPT2       = 7,
+        LLAMA_VOCAB_PRE_TYPE_REFACT     = 8,
+        LLAMA_VOCAB_PRE_TYPE_COMMAND_R  = 9,
+        LLAMA_VOCAB_PRE_TYPE_QWEN2      = 10,
+        LLAMA_VOCAB_PRE_TYPE_OLMO       = 11,
+        LLAMA_VOCAB_PRE_TYPE_DBRX       = 12,
     };
 
     // note: these values should be synchronized with ggml_rope
@@ -134,6 +139,7 @@ extern "C" {
         LLAMA_FTYPE_MOSTLY_IQ2_M  = 29, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_IQ4_XS = 30, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_IQ1_M  = 31, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_BF16   = 32, // except 1d tensors
 
         LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
     };
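The new LLAMA_FTYPE_MOSTLY_BF16 value can be requested when re-quantizing a model. A minimal sketch, assuming the existing llama_model_quantize API; the file paths are hypothetical:

    #include "llama.h"

    int main(void) {
        // Request BF16 output; 1d tensors stay unquantized, as the comment above notes.
        llama_model_quantize_params qparams = llama_model_quantize_default_params();
        qparams.ftype = LLAMA_FTYPE_MOSTLY_BF16;
        // Hypothetical input/output paths. Returns 0 on success.
        return llama_model_quantize("model-f32.gguf", "model-bf16.gguf", &qparams);
    }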
@@ -171,7 +177,7 @@ extern "C" {
         bool sorted;
     } llama_token_data_array;
 
-    typedef bool (*llama_progress_callback)(float progress, void *ctx);
+    typedef bool (*llama_progress_callback)(float progress, void * user_data);
 
     // Input data for llama_decode
     // A llama_batch object can contain input about one or many sequences
@@ -236,6 +242,9 @@ extern "C" {
         // proportion of the model (layers or rows) to offload to each GPU, size: llama_max_devices()
         const float * tensor_split;
 
+        // comma separated list of RPC servers to use for offloading
+        const char * rpc_servers;
+
         // Called with a progress value between 0.0 and 1.0. Pass NULL to disable.
         // If the provided progress_callback returns true, model loading continues.
        // If it returns false, model loading is immediately aborted.
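A minimal sketch of how the new rpc_servers field and the renamed user_data callback parameter fit into model loading; the server addresses, label string, and stderr reporting are illustrative assumptions, not part of the package:

    #include <stdio.h>
    #include "llama.h"

    // Matches the renamed typedef: the second argument is the opaque user_data pointer.
    static bool report_progress(float progress, void * user_data) {
        fprintf(stderr, "%s: %.0f%%\n", (const char *) user_data, progress * 100.0f);
        return true; // returning false would abort loading
    }

    int main(void) {
        llama_model_params mparams = llama_model_default_params();
        mparams.rpc_servers = "192.168.1.10:50052,192.168.1.11:50052"; // assumed example addresses
        mparams.progress_callback = report_progress;
        mparams.progress_callback_user_data = (void *) "loading";
        llama_model * model = llama_load_model_from_file("model.gguf", mparams);
        if (model) llama_free_model(model);
        return model ? 0 : 1;
    }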
@@ -1,6 +1,3 @@
-// -*- mode:c++;indent-tabs-mode:nil;c-basic-offset:4;coding:utf-8 -*-
-// vi: set et ft=c++ ts=4 sts=4 sw=4 fenc=utf-8 :vi
-//
 // Copyright 2024 Mozilla Foundation
 //
 // Permission is hereby granted, free of charge, to any person obtaining
@@ -585,15 +582,15 @@ class tinyBLAS_Q0_ARM {
 };
 #endif // __ARM_FEATURE_DOTPROD
 
-#if defined(__AVX2__) || defined(__AVX512F__)
+#if defined(__AVX2__) || defined(__AVX512F__) || defined(__AVX__)
 template <typename TA, typename TB, typename TC>
-class tinyBLAS_Q0_AVX2 {
+class tinyBLAS_Q0_AVX {
   public:
-    tinyBLAS_Q0_AVX2(int64_t k,
-                     const TA *A, int64_t lda,
-                     const TB *B, int64_t ldb,
-                     TC *C, int64_t ldc,
-                     int ith, int nth)
+    tinyBLAS_Q0_AVX(int64_t k,
+                    const TA *A, int64_t lda,
+                    const TB *B, int64_t ldb,
+                    TC *C, int64_t ldc,
+                    int ith, int nth)
         : A(A), B(B), C(C), k(k), lda(lda), ldb(ldb), ldc(ldc), ith(ith), nth(nth) {
     }
 
@@ -728,14 +725,34 @@ class tinyBLAS_Q0_AVX2 {
         __m256 Cv[RN][RM] = {};
         for (int64_t l = 0; l < k; ++l)
             for (int64_t j = 0; j < RN; ++j)
-                for (int64_t i = 0; i < RM; ++i)
+                for (int64_t i = 0; i < RM; ++i) {
+#if defined(__AVX2__)
+                    __m256 udTmp = updot(_mm256_sign_epi8(load(A + lda * (ii + i) + l),
+                                                          load(A + lda * (ii + i) + l)),
+                                         _mm256_sign_epi8(load(B + ldb * (jj + j) + l),
+                                                          load(A + lda * (ii + i) + l)));
+#else
+                    __m128i ali0 = load0(A + lda * (ii + i) + l);
+                    __m128i ali1 = load1(A + lda * (ii + i) + l);
+                    __m128i blj0 = load0(B + ldb * (jj + j) + l);
+                    __m128i blj1 = load1(B + ldb * (jj + j) + l);
+
+                    __m128i sepAA0 = _mm_sign_epi8(ali0, ali0);
+                    __m128i sepAA1 = _mm_sign_epi8(ali1, ali1);
+                    __m128i sepBA0 = _mm_sign_epi8(blj0, ali0);
+                    __m128i sepBA1 = _mm_sign_epi8(blj1, ali1);
+
+                    // updot
+                    const __m128i oneFill = _mm_set1_epi16(1);
+                    __m128i mad0 = _mm_maddubs_epi16(sepAA0, sepBA0);
+                    __m128i mad1 = _mm_maddubs_epi16(sepAA1, sepBA1);
+                    __m256 udTmp = _mm256_cvtepi32_ps(MM256_SET_M128I(_mm_madd_epi16(oneFill, mad1), _mm_madd_epi16(oneFill, mad0)));
+#endif
                     Cv[j][i] = madd(_mm256_set1_ps(unhalf(A[lda * (ii + i) + l].d) *
                                                    unhalf(B[ldb * (jj + j) + l].d)),
-                                    updot(_mm256_sign_epi8(load(A + lda * (ii + i) + l),
-                                                           load(A + lda * (ii + i) + l)),
-                                          _mm256_sign_epi8(load(B + ldb * (jj + j) + l),
-                                                           load(A + lda * (ii + i) + l))),
-                                    Cv[j][i]);
+                                    udTmp,
+                                    Cv[j][i]);
+                }
         for (int64_t j = 0; j < RN; ++j)
             for (int64_t i = 0; i < RM; ++i)
                C[ldc * (jj + j) + (ii + i)] = hsum(Cv[j][i]);
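For readers following the #else branch above: an AVX-only build has no 256-bit integer instructions, so the 32-byte dot product is done as two 128-bit halves and only widened to 256-bit floats at the end. A standalone sketch of the same idea (the helper name and raw int8 inputs are illustrative, not the package's API):

    #include <immintrin.h>
    #include <stdint.h>

    // Dot products of 32 signed bytes of a with 32 signed bytes of b, widened to
    // 8 floats (one per group of 4 byte-products), using only SSE/SSSE3/AVX intrinsics.
    static inline __m256 dot32_sketch(const int8_t * a, const int8_t * b) {
        __m128i a0 = _mm_loadu_si128((const __m128i *) a);
        __m128i a1 = _mm_loadu_si128((const __m128i *) (a + 16));
        __m128i b0 = _mm_loadu_si128((const __m128i *) b);
        __m128i b1 = _mm_loadu_si128((const __m128i *) (b + 16));
        // _mm_maddubs_epi16 treats its first operand as unsigned, so feed it |a| and
        // move a's sign onto b: |a_i| * sign(a_i)*b_i == a_i * b_i.
        __m128i mad0 = _mm_maddubs_epi16(_mm_sign_epi8(a0, a0), _mm_sign_epi8(b0, a0));
        __m128i mad1 = _mm_maddubs_epi16(_mm_sign_epi8(a1, a1), _mm_sign_epi8(b1, a1));
        const __m128i ones = _mm_set1_epi16(1);
        __m128i sum0 = _mm_madd_epi16(ones, mad0); // 4 x int32 partial sums (low half)
        __m128i sum1 = _mm_madd_epi16(ones, mad1); // 4 x int32 partial sums (high half)
        __m256i sums = _mm256_insertf128_si256(_mm256_castsi128_si256(sum0), sum1, 1);
        return _mm256_cvtepi32_ps(sums);
    }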
@@ -746,10 +763,28 @@ class tinyBLAS_Q0_AVX2 {
         return _mm256_loadu_si256((const __m256i *)b->qs);
     }
 
+    inline __m128i load0(const block_q8_0 *b) {
+        return _mm_loadu_si128((const __m128i *)b->qs);
+    }
+
+    inline __m128i load1(const block_q8_0 *b) {
+        return _mm_loadu_si128(((const __m128i *)b->qs) + 1);
+    }
+
     inline __m256i load(const block_q4_0 *b) {
         return _mm256_sub_epi8(denibble(b->qs), _mm256_set1_epi8(8));
     }
 
+    inline __m128i load0(const block_q4_0 *b) {
+        const __m128i x = _mm_loadu_si128((const __m128i *)(b->qs));
+        return _mm_sub_epi8(_mm_and_si128(_mm_set1_epi8(15), x), _mm_set1_epi8(8));
+    }
+
+    inline __m128i load1(const block_q4_0 *b) {
+        const __m128i x = _mm_loadu_si128((const __m128i *)(b->qs));
+        return _mm_sub_epi8(_mm_and_si128(_mm_set1_epi8(15), _mm_srli_epi16(x, 4)), _mm_set1_epi8(8));
+    }
+
     inline __m256 updot(__m256i u, __m256i s) {
         __m256i res;
 #if defined(__AVXVNNI__) || (defined(__AVX512VNNI__) && defined(__AVX512VL__))
@@ -777,7 +812,7 @@ class tinyBLAS_Q0_AVX2 {
     const int ith;
     const int nth;
 };
-#endif // __AVX2__
+#endif // __AVX__
 
 } // namespace
 
@@ -928,8 +963,8 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
     case GGML_TYPE_Q8_0: {
         if (Btype != GGML_TYPE_Q8_0)
             return false;
-#if defined(__AVX2__) || defined(__AVX512F__)
-        tinyBLAS_Q0_AVX2<block_q8_0, block_q8_0, float> tb{
+#if defined(__AVX2__) || defined(__AVX512F__) || defined(__AVX__)
+        tinyBLAS_Q0_AVX<block_q8_0, block_q8_0, float> tb{
             k, (const block_q8_0 *)A, lda,
             (const block_q8_0 *)B, ldb,
             (float *)C, ldc,
@@ -952,8 +987,8 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
     case GGML_TYPE_Q4_0: {
         if (Btype != GGML_TYPE_Q8_0)
             return false;
-#if defined(__AVX2__) || defined(__AVX512F__)
-        tinyBLAS_Q0_AVX2<block_q4_0, block_q8_0, float> tb{
+#if defined(__AVX2__) || defined(__AVX512F__) || defined(__AVX__)
+        tinyBLAS_Q0_AVX<block_q4_0, block_q8_0, float> tb{
             k, (const block_q4_0 *)A, lda,
             (const block_q8_0 *)B, ldb,
             (float *)C, ldc,