llama_cpp 0.15.0 → 0.15.2
This diff covers publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in that registry.
- checksums.yaml +4 -4
- data/CHANGELOG.md +12 -0
- data/ext/llama_cpp/llama_cpp.cpp +6 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +6 -0
- data/vendor/tmp/llama.cpp/Makefile +6 -7
- data/vendor/tmp/llama.cpp/ggml-backend.c +2 -3
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +303 -23
- data/vendor/tmp/llama.cpp/ggml-impl.h +84 -0
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +9 -3
- data/vendor/tmp/llama.cpp/ggml-metal.m +137 -133
- data/vendor/tmp/llama.cpp/ggml-metal.metal +87 -110
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +1 -0
- data/vendor/tmp/llama.cpp/ggml-quants.c +2220 -28
- data/vendor/tmp/llama.cpp/ggml-rpc.cpp +1032 -0
- data/vendor/tmp/llama.cpp/ggml-rpc.h +24 -0
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +35 -152
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +46843 -39205
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +953 -268
- data/vendor/tmp/llama.cpp/ggml.c +1762 -681
- data/vendor/tmp/llama.cpp/ggml.h +43 -24
- data/vendor/tmp/llama.cpp/llama.cpp +533 -296
- data/vendor/tmp/llama.cpp/llama.h +10 -1
- data/vendor/tmp/llama.cpp/sgemm.cpp +56 -21
- data/vendor/tmp/llama.cpp/unicode-data.cpp +6969 -1637
- data/vendor/tmp/llama.cpp/unicode-data.h +15 -11
- data/vendor/tmp/llama.cpp/unicode.cpp +286 -176
- data/vendor/tmp/llama.cpp/unicode.h +44 -10
- metadata +4 -2
data/vendor/tmp/llama.cpp/llama.h

```diff
@@ -79,6 +79,11 @@ extern "C" {
         LLAMA_VOCAB_PRE_TYPE_MPT        = 5,
         LLAMA_VOCAB_PRE_TYPE_STARCODER  = 6,
         LLAMA_VOCAB_PRE_TYPE_GPT2       = 7,
+        LLAMA_VOCAB_PRE_TYPE_REFACT     = 8,
+        LLAMA_VOCAB_PRE_TYPE_COMMAND_R  = 9,
+        LLAMA_VOCAB_PRE_TYPE_QWEN2      = 10,
+        LLAMA_VOCAB_PRE_TYPE_OLMO       = 11,
+        LLAMA_VOCAB_PRE_TYPE_DBRX       = 12,
     };
 
     // note: these values should be synchronized with ggml_rope
```
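The five new entries extend the pre-tokenizer families a converted vocabulary can be matched against. A minimal sketch that only uses the constants above (the helper itself is hypothetical and not part of llama.h):

```cpp
#include <cstdio>
#include "llama.h"

// Hypothetical helper: names the pre-tokenizer types added in this release.
// Only the enum constants themselves come from llama.h.
static const char * pre_type_name(enum llama_vocab_pre_type t) {
    switch (t) {
        case LLAMA_VOCAB_PRE_TYPE_REFACT:    return "refact";
        case LLAMA_VOCAB_PRE_TYPE_COMMAND_R: return "command-r";
        case LLAMA_VOCAB_PRE_TYPE_QWEN2:     return "qwen2";
        case LLAMA_VOCAB_PRE_TYPE_OLMO:      return "olmo";
        case LLAMA_VOCAB_PRE_TYPE_DBRX:      return "dbrx";
        default:                             return "other";
    }
}

int main() {
    std::printf("%s\n", pre_type_name(LLAMA_VOCAB_PRE_TYPE_QWEN2)); // prints "qwen2"
    return 0;
}
```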
```diff
@@ -134,6 +139,7 @@ extern "C" {
         LLAMA_FTYPE_MOSTLY_IQ2_M         = 29, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_IQ4_XS        = 30, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_IQ1_M         = 31, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_BF16          = 32, // except 1d tensors
 
         LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
     };
```
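The new LLAMA_FTYPE_MOSTLY_BF16 can be requested through the existing C quantization entry points. A minimal sketch, assuming placeholder file names:

```cpp
#include "llama.h"

int main() {
    llama_model_quantize_params qparams = llama_model_quantize_default_params();
    qparams.ftype = LLAMA_FTYPE_MOSTLY_BF16; // new conversion target; 1d tensors are kept as-is

    // Placeholder input/output paths.
    const uint32_t rc = llama_model_quantize("model-f16.gguf", "model-bf16.gguf", &qparams);
    return rc == 0 ? 0 : 1;
}
```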
```diff
@@ -171,7 +177,7 @@ extern "C" {
         bool sorted;
     } llama_token_data_array;
 
-    typedef bool (*llama_progress_callback)(float progress, void * ctx);
+    typedef bool (*llama_progress_callback)(float progress, void * user_data);
 
     // Input data for llama_decode
     // A llama_batch object can contain input about one or many sequences
```
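The typedef only renames the opaque second parameter; the contract is unchanged: return true to continue loading, false to abort. A minimal sketch of wiring a callback into llama_model_params (the model path is a placeholder):

```cpp
#include <cstdio>
#include "llama.h"

// Matches the updated typedef; returning false aborts model loading.
static bool on_progress(float progress, void * user_data) {
    const char * tag = static_cast<const char *>(user_data);
    std::fprintf(stderr, "%s: %3.0f%%\n", tag, progress * 100.0f);
    return true; // keep loading
}

int main() {
    llama_backend_init();

    llama_model_params mparams = llama_model_default_params();
    mparams.progress_callback           = on_progress;
    mparams.progress_callback_user_data = (void *) "loading model"; // handed back as user_data

    llama_model * model = llama_load_model_from_file("model.gguf", mparams); // placeholder path
    if (model) {
        llama_free_model(model);
    }
    llama_backend_free();
    return 0;
}
```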
```diff
@@ -236,6 +242,9 @@ extern "C" {
         // proportion of the model (layers or rows) to offload to each GPU, size: llama_max_devices()
         const float * tensor_split;
 
+        // comma separated list of RPC servers to use for offloading
+        const char * rpc_servers;
+
         // Called with a progress value between 0.0 and 1.0. Pass NULL to disable.
         // If the provided progress_callback returns true, model loading continues.
         // If it returns false, model loading is immediately aborted.
```
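The new rpc_servers field pairs with the ggml-rpc backend vendored in this release (ggml-rpc.cpp / ggml-rpc.h in the file list above). A minimal sketch of pointing a model load at RPC workers, with placeholder addresses and path:

```cpp
#include "llama.h"

int main() {
    llama_backend_init();

    llama_model_params mparams = llama_model_default_params();
    mparams.n_gpu_layers = 99;                                       // offload layers to the remote devices
    mparams.rpc_servers  = "192.168.1.10:50052,192.168.1.11:50052";  // placeholder worker addresses, comma separated

    llama_model * model = llama_load_model_from_file("model.gguf", mparams); // placeholder path
    if (model) {
        llama_free_model(model);
    }
    llama_backend_free();
    return 0;
}
```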
data/vendor/tmp/llama.cpp/sgemm.cpp

```diff
@@ -1,6 +1,3 @@
-// -*- mode:c++;indent-tabs-mode:nil;c-basic-offset:4;coding:utf-8 -*-
-// vi: set et ft=c++ ts=4 sts=4 sw=4 fenc=utf-8 :vi
-//
 // Copyright 2024 Mozilla Foundation
 //
 // Permission is hereby granted, free of charge, to any person obtaining
```
```diff
@@ -585,15 +582,15 @@ class tinyBLAS_Q0_ARM {
 };
 #endif // __ARM_FEATURE_DOTPROD
 
-#if defined(__AVX2__) || defined(__AVX512F__)
+#if defined(__AVX2__) || defined(__AVX512F__) || defined(__AVX__)
 template <typename TA, typename TB, typename TC>
-class tinyBLAS_Q0_AVX2 {
+class tinyBLAS_Q0_AVX {
   public:
-    tinyBLAS_Q0_AVX2(int64_t k,
-                     const TA *A, int64_t lda,
-                     const TB *B, int64_t ldb,
-                     TC *C, int64_t ldc,
-                     int ith, int nth)
+    tinyBLAS_Q0_AVX(int64_t k,
+                    const TA *A, int64_t lda,
+                    const TB *B, int64_t ldb,
+                    TC *C, int64_t ldc,
+                    int ith, int nth)
         : A(A), B(B), C(C), k(k), lda(lda), ldb(ldb), ldc(ldc), ith(ith), nth(nth) {
     }
 
```
```diff
@@ -728,14 +725,34 @@ class tinyBLAS_Q0_AVX2 {
         __m256 Cv[RN][RM] = {};
         for (int64_t l = 0; l < k; ++l)
             for (int64_t j = 0; j < RN; ++j)
-                for (int64_t i = 0; i < RM; ++i)
+                for (int64_t i = 0; i < RM; ++i) {
+#if defined(__AVX2__)
+                    __m256 udTmp = updot(_mm256_sign_epi8(load(A + lda * (ii + i) + l),
+                                                          load(A + lda * (ii + i) + l)),
+                                         _mm256_sign_epi8(load(B + ldb * (jj + j) + l),
+                                                          load(A + lda * (ii + i) + l)));
+#else
+                    __m128i ali0 = load0(A + lda * (ii + i) + l);
+                    __m128i ali1 = load1(A + lda * (ii + i) + l);
+                    __m128i blj0 = load0(B + ldb * (jj + j) + l);
+                    __m128i blj1 = load1(B + ldb * (jj + j) + l);
+
+                    __m128i sepAA0 = _mm_sign_epi8(ali0, ali0);
+                    __m128i sepAA1 = _mm_sign_epi8(ali1, ali1);
+                    __m128i sepBA0 = _mm_sign_epi8(blj0, ali0);
+                    __m128i sepBA1 = _mm_sign_epi8(blj1, ali1);
+
+                    // updot
+                    const __m128i oneFill = _mm_set1_epi16(1);
+                    __m128i mad0 = _mm_maddubs_epi16(sepAA0, sepBA0);
+                    __m128i mad1 = _mm_maddubs_epi16(sepAA1, sepBA1);
+                    __m256 udTmp = _mm256_cvtepi32_ps(MM256_SET_M128I(_mm_madd_epi16(oneFill, mad1), _mm_madd_epi16(oneFill, mad0)));
+#endif
                     Cv[j][i] = madd(_mm256_set1_ps(unhalf(A[lda * (ii + i) + l].d) *
                                                    unhalf(B[ldb * (jj + j) + l].d)),
-                                    updot(_mm256_sign_epi8(load(A + lda * (ii + i) + l),
-                                                           load(A + lda * (ii + i) + l)),
-                                          _mm256_sign_epi8(load(B + ldb * (jj + j) + l),
-                                                           load(A + lda * (ii + i) + l))),
-                                    Cv[j][i]);
+                                    udTmp,
+                                    Cv[j][i]);
+                }
         for (int64_t j = 0; j < RN; ++j)
             for (int64_t i = 0; i < RM; ++i)
                 C[ldc * (jj + j) + (ii + i)] = hsum(Cv[j][i]);
```
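The #else branch is the new plain-AVX fallback: without 256-bit integer instructions it splits each 32-byte block into two 128-bit halves, uses _mm_sign_epi8 to form |a| and sign(a)*b, multiplies with _mm_maddubs_epi16 (unsigned times signed, pairwise 16-bit sums), widens with _mm_madd_epi16 against a vector of ones, and converts the 32-bit sums to floats. A scalar sketch of the identity this relies on (plain int8 arrays, no llama.cpp types):

```cpp
#include <cstdint>

// Scalar model of the signum trick used above:
//     dot(a, b) == dot(|a|, sign(a) * b)
// |a| is what _mm_sign_epi8(ali, ali) produces, sign(a) * b is _mm_sign_epi8(blj, ali),
// and the maddubs/madd pair accumulates the unsigned-times-signed products.
int32_t dot_q8_scalar(const int8_t * a, const int8_t * b, int n) {
    int32_t sum = 0;
    for (int i = 0; i < n; ++i) {
        int32_t abs_a    = a[i] < 0 ? -a[i] : a[i];                  // _mm_sign_epi8(ali, ali)
        int32_t signed_b = a[i] > 0 ? b[i] : (a[i] < 0 ? -b[i] : 0); // _mm_sign_epi8(blj, ali)
        sum += abs_a * signed_b;                                     // _mm_maddubs_epi16 + _mm_madd_epi16
    }
    return sum;
}
```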
```diff
@@ -746,10 +763,28 @@ class tinyBLAS_Q0_AVX2 {
         return _mm256_loadu_si256((const __m256i *)b->qs);
     }
 
+    inline __m128i load0(const block_q8_0 *b) {
+        return _mm_loadu_si128((const __m128i *)b->qs);
+    }
+
+    inline __m128i load1(const block_q8_0 *b) {
+        return _mm_loadu_si128(((const __m128i *)b->qs) + 1);
+    }
+
     inline __m256i load(const block_q4_0 *b) {
         return _mm256_sub_epi8(denibble(b->qs), _mm256_set1_epi8(8));
     }
 
+    inline __m128i load0(const block_q4_0 *b) {
+        const __m128i x = _mm_loadu_si128((const __m128i *)(b->qs));
+        return _mm_sub_epi8(_mm_and_si128(_mm_set1_epi8(15), x), _mm_set1_epi8(8));
+    }
+
+    inline __m128i load1(const block_q4_0 *b) {
+        const __m128i x = _mm_loadu_si128((const __m128i *)(b->qs));
+        return _mm_sub_epi8(_mm_and_si128(_mm_set1_epi8(15), _mm_srli_epi16(x, 4)), _mm_set1_epi8(8));
+    }
+
     inline __m256 updot(__m256i u, __m256i s) {
         __m256i res;
 #if defined(__AVXVNNI__) || (defined(__AVX512VNNI__) && defined(__AVX512VL__))
```
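These load0/load1 overloads feed the 128-bit fallback: for block_q8_0 they read the two 16-byte halves, and for block_q4_0 they unpack the 16 qs bytes into 32 signed values, low nibbles first, then high nibbles, each rebased by subtracting 8 (the 16-bit shift is safe because the mask discards bits carried in from the neighbouring byte). A scalar sketch of the same decode (plain byte array instead of the real block type):

```cpp
#include <cstdint>

// Scalar model of load0/load1 for a q4_0 block: 16 bytes of qs hold 32 nibbles,
// each an unsigned value 0..15 that decodes to the stored value minus 8.
void decode_q4_0_nibbles(const uint8_t qs[16], int8_t out[32]) {
    for (int i = 0; i < 16; ++i) {
        out[i]      = (int8_t)((qs[i] & 0x0F) - 8); // low nibbles  -> load0
        out[i + 16] = (int8_t)((qs[i] >> 4)   - 8); // high nibbles -> load1
    }
}
```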
```diff
@@ -777,7 +812,7 @@ class tinyBLAS_Q0_AVX2 {
     const int ith;
     const int nth;
 };
-#endif // __AVX2__
+#endif // __AVX__
 
 } // namespace
 
```
```diff
@@ -928,8 +963,8 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
     case GGML_TYPE_Q8_0: {
         if (Btype != GGML_TYPE_Q8_0)
             return false;
-#if defined(__AVX2__) || defined(__AVX512F__)
-        tinyBLAS_Q0_AVX2<block_q8_0, block_q8_0, float> tb{
+#if defined(__AVX2__) || defined(__AVX512F__) || defined(__AVX__)
+        tinyBLAS_Q0_AVX<block_q8_0, block_q8_0, float> tb{
             k, (const block_q8_0 *)A, lda,
             (const block_q8_0 *)B, ldb,
             (float *)C, ldc,
```
```diff
@@ -952,8 +987,8 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
     case GGML_TYPE_Q4_0: {
         if (Btype != GGML_TYPE_Q8_0)
             return false;
-#if defined(__AVX2__) || defined(__AVX512F__)
-        tinyBLAS_Q0_AVX2<block_q4_0, block_q8_0, float> tb{
+#if defined(__AVX2__) || defined(__AVX512F__) || defined(__AVX__)
+        tinyBLAS_Q0_AVX<block_q4_0, block_q8_0, float> tb{
             k, (const block_q4_0 *)A, lda,
             (const block_q8_0 *)B, ldb,
             (float *)C, ldc,
```