llama_cpp 0.15.0 → 0.15.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +12 -0
- data/ext/llama_cpp/llama_cpp.cpp +6 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +6 -0
- data/vendor/tmp/llama.cpp/Makefile +6 -7
- data/vendor/tmp/llama.cpp/ggml-backend.c +2 -3
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +303 -23
- data/vendor/tmp/llama.cpp/ggml-impl.h +84 -0
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +9 -3
- data/vendor/tmp/llama.cpp/ggml-metal.m +137 -133
- data/vendor/tmp/llama.cpp/ggml-metal.metal +87 -110
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +1 -0
- data/vendor/tmp/llama.cpp/ggml-quants.c +2220 -28
- data/vendor/tmp/llama.cpp/ggml-rpc.cpp +1032 -0
- data/vendor/tmp/llama.cpp/ggml-rpc.h +24 -0
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +35 -152
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +46843 -39205
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +953 -268
- data/vendor/tmp/llama.cpp/ggml.c +1762 -681
- data/vendor/tmp/llama.cpp/ggml.h +43 -24
- data/vendor/tmp/llama.cpp/llama.cpp +533 -296
- data/vendor/tmp/llama.cpp/llama.h +10 -1
- data/vendor/tmp/llama.cpp/sgemm.cpp +56 -21
- data/vendor/tmp/llama.cpp/unicode-data.cpp +6969 -1637
- data/vendor/tmp/llama.cpp/unicode-data.h +15 -11
- data/vendor/tmp/llama.cpp/unicode.cpp +286 -176
- data/vendor/tmp/llama.cpp/unicode.h +44 -10
- metadata +4 -2
@@ -79,6 +79,11 @@ extern "C" {
|
|
79
79
|
LLAMA_VOCAB_PRE_TYPE_MPT = 5,
|
80
80
|
LLAMA_VOCAB_PRE_TYPE_STARCODER = 6,
|
81
81
|
LLAMA_VOCAB_PRE_TYPE_GPT2 = 7,
|
82
|
+
LLAMA_VOCAB_PRE_TYPE_REFACT = 8,
|
83
|
+
LLAMA_VOCAB_PRE_TYPE_COMMAND_R = 9,
|
84
|
+
LLAMA_VOCAB_PRE_TYPE_QWEN2 = 10,
|
85
|
+
LLAMA_VOCAB_PRE_TYPE_OLMO = 11,
|
86
|
+
LLAMA_VOCAB_PRE_TYPE_DBRX = 12,
|
82
87
|
};
|
83
88
|
|
84
89
|
// note: these values should be synchronized with ggml_rope
|
@@ -134,6 +139,7 @@ extern "C" {
|
|
134
139
|
LLAMA_FTYPE_MOSTLY_IQ2_M = 29, // except 1d tensors
|
135
140
|
LLAMA_FTYPE_MOSTLY_IQ4_XS = 30, // except 1d tensors
|
136
141
|
LLAMA_FTYPE_MOSTLY_IQ1_M = 31, // except 1d tensors
|
142
|
+
LLAMA_FTYPE_MOSTLY_BF16 = 32, // except 1d tensors
|
137
143
|
|
138
144
|
LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
|
139
145
|
};
|
@@ -171,7 +177,7 @@ extern "C" {
|
|
171
177
|
bool sorted;
|
172
178
|
} llama_token_data_array;
|
173
179
|
|
174
|
-
typedef bool (*llama_progress_callback)(float progress, void *ctx);
|
180
|
+
typedef bool (*llama_progress_callback)(float progress, void * user_data);
|
175
181
|
|
176
182
|
// Input data for llama_decode
|
177
183
|
// A llama_batch object can contain input about one or many sequences
|
@@ -236,6 +242,9 @@ extern "C" {
|
|
236
242
|
// proportion of the model (layers or rows) to offload to each GPU, size: llama_max_devices()
|
237
243
|
const float * tensor_split;
|
238
244
|
|
245
|
+
// comma separated list of RPC servers to use for offloading
|
246
|
+
const char * rpc_servers;
|
247
|
+
|
239
248
|
// Called with a progress value between 0.0 and 1.0. Pass NULL to disable.
|
240
249
|
// If the provided progress_callback returns true, model loading continues.
|
241
250
|
// If it returns false, model loading is immediately aborted.
|
@@ -1,6 +1,3 @@
|
|
1
|
-
// -*- mode:c++;indent-tabs-mode:nil;c-basic-offset:4;coding:utf-8 -*-
|
2
|
-
// vi: set et ft=c++ ts=4 sts=4 sw=4 fenc=utf-8 :vi
|
3
|
-
//
|
4
1
|
// Copyright 2024 Mozilla Foundation
|
5
2
|
//
|
6
3
|
// Permission is hereby granted, free of charge, to any person obtaining
|
@@ -585,15 +582,15 @@ class tinyBLAS_Q0_ARM {
|
|
585
582
|
};
|
586
583
|
#endif // __ARM_FEATURE_DOTPROD
|
587
584
|
|
588
|
-
#if defined(__AVX2__) || defined(__AVX512F__)
|
585
|
+
#if defined(__AVX2__) || defined(__AVX512F__) || defined(__AVX__)
|
589
586
|
template <typename TA, typename TB, typename TC>
|
590
|
-
class tinyBLAS_Q0_AVX2 {
|
587
|
+
class tinyBLAS_Q0_AVX {
|
591
588
|
public:
|
592
|
-
|
593
|
-
|
594
|
-
|
595
|
-
|
596
|
-
|
589
|
+
tinyBLAS_Q0_AVX(int64_t k,
|
590
|
+
const TA *A, int64_t lda,
|
591
|
+
const TB *B, int64_t ldb,
|
592
|
+
TC *C, int64_t ldc,
|
593
|
+
int ith, int nth)
|
597
594
|
: A(A), B(B), C(C), k(k), lda(lda), ldb(ldb), ldc(ldc), ith(ith), nth(nth) {
|
598
595
|
}
|
599
596
|
|
@@ -728,14 +725,34 @@ class tinyBLAS_Q0_AVX2 {
|
|
728
725
|
__m256 Cv[RN][RM] = {};
|
729
726
|
for (int64_t l = 0; l < k; ++l)
|
730
727
|
for (int64_t j = 0; j < RN; ++j)
|
731
|
-
for (int64_t i = 0; i < RM; ++i)
|
728
|
+
for (int64_t i = 0; i < RM; ++i) {
|
729
|
+
#if defined(__AVX2__)
|
730
|
+
__m256 udTmp = updot(_mm256_sign_epi8(load(A + lda * (ii + i) + l),
|
731
|
+
load(A + lda * (ii + i) + l)),
|
732
|
+
_mm256_sign_epi8(load(B + ldb * (jj + j) + l),
|
733
|
+
load(A + lda * (ii + i) + l)));
|
734
|
+
#else
|
735
|
+
__m128i ali0 = load0(A + lda * (ii + i) + l);
|
736
|
+
__m128i ali1 = load1(A + lda * (ii + i) + l);
|
737
|
+
__m128i blj0 = load0(B + ldb * (jj + j) + l);
|
738
|
+
__m128i blj1 = load1(B + ldb * (jj + j) + l);
|
739
|
+
|
740
|
+
__m128i sepAA0 = _mm_sign_epi8(ali0, ali0);
|
741
|
+
__m128i sepAA1 = _mm_sign_epi8(ali1, ali1);
|
742
|
+
__m128i sepBA0 = _mm_sign_epi8(blj0, ali0);
|
743
|
+
__m128i sepBA1 = _mm_sign_epi8(blj1, ali1);
|
744
|
+
|
745
|
+
// updot
|
746
|
+
const __m128i oneFill = _mm_set1_epi16(1);
|
747
|
+
__m128i mad0 = _mm_maddubs_epi16(sepAA0, sepBA0);
|
748
|
+
__m128i mad1 = _mm_maddubs_epi16(sepAA1, sepBA1);
|
749
|
+
__m256 udTmp = _mm256_cvtepi32_ps(MM256_SET_M128I(_mm_madd_epi16(oneFill, mad1), _mm_madd_epi16(oneFill, mad0)));
|
750
|
+
#endif
|
732
751
|
Cv[j][i] = madd(_mm256_set1_ps(unhalf(A[lda * (ii + i) + l].d) *
|
733
752
|
unhalf(B[ldb * (jj + j) + l].d)),
|
734
|
-
|
735
|
-
|
736
|
-
|
737
|
-
load(A + lda * (ii + i) + l))),
|
738
|
-
Cv[j][i]);
|
753
|
+
udTmp,
|
754
|
+
Cv[j][i]);
|
755
|
+
}
|
739
756
|
for (int64_t j = 0; j < RN; ++j)
|
740
757
|
for (int64_t i = 0; i < RM; ++i)
|
741
758
|
C[ldc * (jj + j) + (ii + i)] = hsum(Cv[j][i]);
|
@@ -746,10 +763,28 @@ class tinyBLAS_Q0_AVX2 {
|
|
746
763
|
return _mm256_loadu_si256((const __m256i *)b->qs);
|
747
764
|
}
|
748
765
|
|
766
|
+
inline __m128i load0(const block_q8_0 *b) {
|
767
|
+
return _mm_loadu_si128((const __m128i *)b->qs);
|
768
|
+
}
|
769
|
+
|
770
|
+
inline __m128i load1(const block_q8_0 *b) {
|
771
|
+
return _mm_loadu_si128(((const __m128i *)b->qs) + 1);
|
772
|
+
}
|
773
|
+
|
749
774
|
inline __m256i load(const block_q4_0 *b) {
|
750
775
|
return _mm256_sub_epi8(denibble(b->qs), _mm256_set1_epi8(8));
|
751
776
|
}
|
752
777
|
|
778
|
+
inline __m128i load0(const block_q4_0 *b) {
|
779
|
+
const __m128i x = _mm_loadu_si128((const __m128i *)(b->qs));
|
780
|
+
return _mm_sub_epi8(_mm_and_si128(_mm_set1_epi8(15), x), _mm_set1_epi8(8));
|
781
|
+
}
|
782
|
+
|
783
|
+
inline __m128i load1(const block_q4_0 *b) {
|
784
|
+
const __m128i x = _mm_loadu_si128((const __m128i *)(b->qs));
|
785
|
+
return _mm_sub_epi8(_mm_and_si128(_mm_set1_epi8(15), _mm_srli_epi16(x, 4)), _mm_set1_epi8(8));
|
786
|
+
}
|
787
|
+
|
753
788
|
inline __m256 updot(__m256i u, __m256i s) {
|
754
789
|
__m256i res;
|
755
790
|
#if defined(__AVXVNNI__) || (defined(__AVX512VNNI__) && defined(__AVX512VL__))
|
@@ -777,7 +812,7 @@ class tinyBLAS_Q0_AVX2 {
|
|
777
812
|
const int ith;
|
778
813
|
const int nth;
|
779
814
|
};
|
780
|
-
#endif // __AVX2__
|
815
|
+
#endif // __AVX__
|
781
816
|
|
782
817
|
} // namespace
|
783
818
|
|
@@ -928,8 +963,8 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
|
|
928
963
|
case GGML_TYPE_Q8_0: {
|
929
964
|
if (Btype != GGML_TYPE_Q8_0)
|
930
965
|
return false;
|
931
|
-
#if defined(__AVX2__) || defined(__AVX512F__)
|
932
|
-
|
966
|
+
#if defined(__AVX2__) || defined(__AVX512F__) || defined(__AVX__)
|
967
|
+
tinyBLAS_Q0_AVX<block_q8_0, block_q8_0, float> tb{
|
933
968
|
k, (const block_q8_0 *)A, lda,
|
934
969
|
(const block_q8_0 *)B, ldb,
|
935
970
|
(float *)C, ldc,
|
@@ -952,8 +987,8 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
|
|
952
987
|
case GGML_TYPE_Q4_0: {
|
953
988
|
if (Btype != GGML_TYPE_Q8_0)
|
954
989
|
return false;
|
955
|
-
#if defined(__AVX2__) || defined(__AVX512F__)
|
956
|
-
|
990
|
+
#if defined(__AVX2__) || defined(__AVX512F__) || defined(__AVX__)
|
991
|
+
tinyBLAS_Q0_AVX<block_q4_0, block_q8_0, float> tb{
|
957
992
|
k, (const block_q4_0 *)A, lda,
|
958
993
|
(const block_q8_0 *)B, ldb,
|
959
994
|
(float *)C, ldc,
|