llama_cpp 0.14.7 → 0.15.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +13 -0
- data/README.md +2 -2
- data/ext/llama_cpp/extconf.rb +2 -1
- data/ext/llama_cpp/llama_cpp.cpp +53 -9
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +18 -3
- data/vendor/tmp/llama.cpp/Makefile +41 -16
- data/vendor/tmp/llama.cpp/ggml-backend.c +7 -5
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +6 -0
- data/vendor/tmp/llama.cpp/ggml-impl.h +1 -1
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +7 -0
- data/vendor/tmp/llama.cpp/ggml-metal.m +376 -176
- data/vendor/tmp/llama.cpp/ggml-metal.metal +654 -18
- data/vendor/tmp/llama.cpp/ggml-quants.c +284 -0
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +17 -7
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +5 -0
- data/vendor/tmp/llama.cpp/ggml.c +391 -27
- data/vendor/tmp/llama.cpp/ggml.h +22 -0
- data/vendor/tmp/llama.cpp/llama.cpp +623 -395
- data/vendor/tmp/llama.cpp/llama.h +27 -9
- data/vendor/tmp/llama.cpp/sgemm.cpp +83 -87
- data/vendor/tmp/llama.cpp/sgemm.h +4 -2
- data/vendor/tmp/llama.cpp/unicode-data.cpp +1 -1
- data/vendor/tmp/llama.cpp/unicode-data.h +2 -2
- data/vendor/tmp/llama.cpp/unicode.cpp +448 -39
- data/vendor/tmp/llama.cpp/unicode.h +2 -1
- metadata +3 -3
data/vendor/tmp/llama.cpp/llama.h

@@ -40,7 +40,7 @@
 #define LLAMA_FILE_MAGIC_GGSQ 0x67677371u // 'ggsq'

 #define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN
-#define LLAMA_SESSION_VERSION
+#define LLAMA_SESSION_VERSION 6

 #define LLAMA_STATE_SEQ_MAGIC LLAMA_FILE_MAGIC_GGSQ
 #define LLAMA_STATE_SEQ_VERSION 1
@@ -69,6 +69,18 @@ extern "C" {
         LLAMA_VOCAB_TYPE_WPM = 3, // BERT tokenizer based on WordPiece
     };

+    // pre-tokenization types
+    enum llama_vocab_pre_type {
+        LLAMA_VOCAB_PRE_TYPE_DEFAULT        = 0,
+        LLAMA_VOCAB_PRE_TYPE_LLAMA3         = 1,
+        LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM   = 2,
+        LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER = 3,
+        LLAMA_VOCAB_PRE_TYPE_FALCON         = 4,
+        LLAMA_VOCAB_PRE_TYPE_MPT            = 5,
+        LLAMA_VOCAB_PRE_TYPE_STARCODER      = 6,
+        LLAMA_VOCAB_PRE_TYPE_GPT2           = 7,
+    };
+
     // note: these values should be synchronized with ggml_rope
     // TODO: maybe move this enum to ggml.h (ggml_rope_type)
     enum llama_rope_type {
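The new `llama_vocab_pre_type` values let a model select a specific pre-tokenizer instead of the single default used before. Below is a minimal sketch of how loading code might map a pre-tokenizer name from model metadata onto this enum; the helper function and the name strings are illustrative assumptions, not part of the public API:

```c
#include <string.h>
#include "llama.h"

// Hypothetical mapping from a pre-tokenizer name to the new enum.
// The strings used here are examples; the real loader keys may differ.
static enum llama_vocab_pre_type pre_type_from_name(const char * name) {
    if (name == NULL)                         return LLAMA_VOCAB_PRE_TYPE_DEFAULT;
    if (strcmp(name, "llama3")         == 0)  return LLAMA_VOCAB_PRE_TYPE_LLAMA3;
    if (strcmp(name, "deepseek-llm")   == 0)  return LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM;
    if (strcmp(name, "deepseek-coder") == 0)  return LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER;
    if (strcmp(name, "falcon")         == 0)  return LLAMA_VOCAB_PRE_TYPE_FALCON;
    if (strcmp(name, "mpt")            == 0)  return LLAMA_VOCAB_PRE_TYPE_MPT;
    if (strcmp(name, "starcoder")      == 0)  return LLAMA_VOCAB_PRE_TYPE_STARCODER;
    if (strcmp(name, "gpt-2")          == 0)  return LLAMA_VOCAB_PRE_TYPE_GPT2;
    return LLAMA_VOCAB_PRE_TYPE_DEFAULT;      // unknown names fall back to default
}
```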
@@ -195,15 +207,19 @@ extern "C" {
         LLAMA_KV_OVERRIDE_TYPE_INT,
         LLAMA_KV_OVERRIDE_TYPE_FLOAT,
         LLAMA_KV_OVERRIDE_TYPE_BOOL,
+        LLAMA_KV_OVERRIDE_TYPE_STR,
     };

     struct llama_model_kv_override {
-        char key[128];
         enum llama_model_kv_override_type tag;
+
+        char key[128];
+
         union {
-            int64_t
-            double
-            bool
+            int64_t val_i64;
+            double  val_f64;
+            bool    val_bool;
+            char    val_str[128];
         };
     };

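The override union now has named members and a string variant paired with `LLAMA_KV_OVERRIDE_TYPE_STR`. A minimal sketch of building an override list with the renamed fields follows; the metadata keys are placeholders, and it assumes the array passed via `llama_model_params.kv_overrides` is terminated by a zeroed entry:

```c
#include <string.h>
#include "llama.h"

// Sketch: build a small override list using the renamed union members.
// The keys are examples only.
static void fill_overrides(struct llama_model_kv_override ov[3]) {
    memset(ov, 0, 3 * sizeof(*ov));

    strncpy(ov[0].key, "example.context_length", sizeof(ov[0].key) - 1);
    ov[0].tag     = LLAMA_KV_OVERRIDE_TYPE_INT;
    ov[0].val_i64 = 8192;                       // integer override

    strncpy(ov[1].key, "example.chat_template", sizeof(ov[1].key) - 1);
    ov[1].tag = LLAMA_KV_OVERRIDE_TYPE_STR;     // new string override type
    strncpy(ov[1].val_str, "chatml", sizeof(ov[1].val_str) - 1);

    // ov[2] stays zeroed; an empty key marks the end of the list.
}
```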
@@ -232,9 +248,10 @@
         const struct llama_model_kv_override * kv_overrides;

         // Keep the booleans together to avoid misalignment during copy-by-value.
-        bool vocab_only;
-        bool use_mmap;
-        bool use_mlock;
+        bool vocab_only;    // only load the vocabulary, no weights
+        bool use_mmap;      // use mmap if possible
+        bool use_mlock;     // force system to keep model in RAM
+        bool check_tensors; // validate model tensor data
     };

     struct llama_context_params {
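`llama_model_params` gains a `check_tensors` flag alongside the newly documented booleans. A minimal sketch of loading a model with validation enabled, using the standard default-params helper; the model path is a placeholder:

```c
#include "llama.h"

// Sketch: request tensor-data validation while loading a model.
static struct llama_model * load_checked(const char * path) {
    struct llama_model_params mparams = llama_model_default_params();
    mparams.use_mmap      = true;   // use mmap if possible
    mparams.check_tensors = true;   // new in 0.15.0: validate model tensor data
    return llama_load_model_from_file(path, mparams);
}
```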
@@ -270,6 +287,7 @@
         bool logits_all;  // the llama_decode() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
         bool embeddings;  // if true, extract embeddings (together with logits)
         bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
+        bool flash_attn;  // whether to use flash attention

         // Abort callback
         // if it returns true, execution of llama_decode() will be aborted
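`llama_context_params` gains a `flash_attn` switch. A minimal sketch of enabling it when creating a context; whether it actually takes effect depends on the backend the library was built against:

```c
#include "llama.h"

// Sketch: create a context with the new flash attention flag enabled.
static struct llama_context * make_ctx(struct llama_model * model) {
    struct llama_context_params cparams = llama_context_default_params();
    cparams.flash_attn = true;   // new in this release
    return llama_new_context_with_model(model, cparams);
}
```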
@@ -525,7 +543,7 @@
     // Returns the number of used KV cells (i.e. have at least one sequence assigned to them)
     LLAMA_API int32_t llama_get_kv_cache_used_cells(const struct llama_context * ctx);

-    // Clear the KV cache
+    // Clear the KV cache - both cell info is erased and KV data is zeroed
     LLAMA_API void llama_kv_cache_clear(
             struct llama_context * ctx);

data/vendor/tmp/llama.cpp/sgemm.cpp

@@ -50,7 +50,6 @@
 #pragma GCC diagnostic ignored "-Wignored-attributes"

 #include "sgemm.h"
-#include <algorithm>
 #include "ggml-impl.h"
 #include "ggml-quants.h"

@@ -243,23 +242,23 @@ template <> inline __m512 load(const ggml_fp16_t *p) {
 template <int KN, typename D, typename V, typename TA, typename TB, typename TC>
 class tinyBLAS {
   public:
-    tinyBLAS(
-        const TA *A,
-        const TB *B,
-        TC *C,
+    tinyBLAS(int64_t k,
+             const TA *A, int64_t lda,
+             const TB *B, int64_t ldb,
+             TC *C, int64_t ldc,
              int ith, int nth)
         : A(A), B(B), C(C), k(k), lda(lda), ldb(ldb), ldc(ldc), ith(ith), nth(nth) {
     }

-    void matmul(
+    void matmul(int64_t m, int64_t n, int task) {
         if (task == GGML_TASK_TYPE_COMPUTE)
             mnpack(0, m, 0, n);
     }

   private:
-    NOINLINE void mnpack(
-
-        switch ((
+    NOINLINE void mnpack(int64_t m0, int64_t m, int64_t n0, int64_t n) {
+        int64_t mc, nc, mp, np;
+        switch ((MIN(m - m0, 5) << 4) | MIN(n - n0, 5)) {
 #if VECTOR_REGISTERS == 32
         case 0x55:
             mc = 5;
@@ -409,27 +408,27 @@ class tinyBLAS {
     }

     template <int RM, int RN>
-    NOINLINE void gemm(
-
-
-
-
-
-
+    NOINLINE void gemm(int64_t m0, int64_t m, int64_t n0, int64_t n) {
+        int64_t ytiles = (m - m0) / RM;
+        int64_t xtiles = (n - n0) / RN;
+        int64_t tiles = xtiles * ytiles;
+        int64_t duty = (tiles + nth - 1) / nth;
+        int64_t start = duty * ith;
+        int64_t end = start + duty;
         if (end > tiles)
             end = tiles;
-        for (
-
-
+        for (int64_t job = start; job < end; ++job) {
+            int64_t ii = m0 + job / xtiles * RM;
+            int64_t jj = n0 + job % xtiles * RN;
             D Cv[RN][RM] = {};
-            for (
-            for (
-            for (
+            for (int64_t l = 0; l < k; l += KN)
+                for (int64_t j = 0; j < RN; ++j)
+                    for (int64_t i = 0; i < RM; ++i)
                         Cv[j][i] = madd(load<V>(A + lda * (ii + i) + l),
                                         load<V>(B + ldb * (jj + j) + l),
                                         Cv[j][i]);
-            for (
-            for (
+            for (int64_t j = 0; j < RN; ++j)
+                for (int64_t i = 0; i < RM; ++i)
                     C[ldc * (jj + j) + (ii + i)] = hsum(Cv[j][i]);
         }
     }
@@ -437,10 +436,10 @@ class tinyBLAS {
     const TA *const A;
     const TB *const B;
     TC *const C;
-    const
-    const
-    const
-    const
+    const int64_t k;
+    const int64_t lda;
+    const int64_t ldb;
+    const int64_t ldc;
     const int ith;
     const int nth;
 };
@@ -452,23 +451,23 @@ class tinyBLAS {
 template <typename TA>
 class tinyBLAS_Q0_ARM {
   public:
-    tinyBLAS_Q0_ARM(
-        const TA *A,
-        const block_q8_0 *B,
-        float *C,
+    tinyBLAS_Q0_ARM(int64_t k,
+                    const TA *A, int64_t lda,
+                    const block_q8_0 *B, int64_t ldb,
+                    float *C, int64_t ldc,
                     int ith, int nth)
         : A(A), B(B), C(C), k(k), lda(lda), ldb(ldb), ldc(ldc), ith(ith), nth(nth) {
     }

-    void matmul(
+    void matmul(int64_t m, int64_t n, int task) {
         if (task == GGML_TASK_TYPE_COMPUTE)
             mnpack(0, m, 0, n);
     }

   private:
-    NOINLINE void mnpack(
-
-        switch ((
+    NOINLINE void mnpack(int64_t m0, int64_t m, int64_t n0, int64_t n) {
+        int64_t mc, nc, mp, np;
+        switch ((MIN(m - m0, 3) << 4) | MIN(n - n0, 3ll)) {
         case 0x33:
             mc = 3;
             nc = 3;
@@ -524,22 +523,22 @@ class tinyBLAS_Q0_ARM {
     }

     template <int RM, int RN>
-    NOINLINE void gemm(
-
-
-
-
-
-
+    NOINLINE void gemm(int64_t m0, int64_t m, int64_t n0, int64_t n) {
+        int64_t ytiles = (m - m0) / RM;
+        int64_t xtiles = (n - n0) / RN;
+        int64_t tiles = xtiles * ytiles;
+        int64_t duty = (tiles + nth - 1) / nth;
+        int64_t start = duty * ith;
+        int64_t end = start + duty;
         if (end > tiles)
             end = tiles;
-        for (
-
-
+        for (int64_t job = start; job < end; ++job) {
+            int64_t ii = m0 + job / xtiles * RM;
+            int64_t jj = n0 + job % xtiles * RN;
             float32x4_t Cv[RN][RM] = {};
-            for (
-            for (
-            for (
+            for (int64_t l = 0; l < k; ++l)
+                for (int64_t j = 0; j < RN; ++j)
+                    for (int64_t i = 0; i < RM; ++i)
                         Cv[j][i] = vmlaq_n_f32(Cv[j][i],
                                                vcvtq_f32_s32(vdotq_s32(
                                                    vdotq_s32(vdupq_n_s32(0),
@@ -549,8 +548,8 @@ class tinyBLAS_Q0_ARM {
                                                              load_hi(B + ldb * (jj + j) + l))),
                                                unhalf(A[lda * (ii + i) + l].d) *
                                                unhalf(B[ldb * (jj + j) + l].d));
-            for (
-            for (
+            for (int64_t j = 0; j < RN; ++j)
+                for (int64_t i = 0; i < RM; ++i)
                     C[ldc * (jj + j) + (ii + i)] = hsum(Cv[j][i]);
         }
     }
@@ -577,10 +576,10 @@ class tinyBLAS_Q0_ARM {
     const TA *const A;
     const block_q8_0 *const B;
     float *const C;
-    const
-    const
-    const
-    const
+    const int64_t k;
+    const int64_t lda;
+    const int64_t ldb;
+    const int64_t ldc;
     const int ith;
     const int nth;
 };
@@ -590,23 +589,23 @@ class tinyBLAS_Q0_ARM {
 template <typename TA, typename TB, typename TC>
 class tinyBLAS_Q0_AVX2 {
   public:
-    tinyBLAS_Q0_AVX2(
-        const TA *A,
-        const TB *B,
-        TC *C,
+    tinyBLAS_Q0_AVX2(int64_t k,
+                     const TA *A, int64_t lda,
+                     const TB *B, int64_t ldb,
+                     TC *C, int64_t ldc,
                      int ith, int nth)
         : A(A), B(B), C(C), k(k), lda(lda), ldb(ldb), ldc(ldc), ith(ith), nth(nth) {
     }

-    void matmul(
+    void matmul(int64_t m, int64_t n, int task) {
         if (task == GGML_TASK_TYPE_COMPUTE)
             mnpack(0, m, 0, n);
     }

   private:
-    void mnpack(
-
-        switch ((
+    void mnpack(int64_t m0, int64_t m, int64_t n0, int64_t n) {
+        int64_t mc, nc, mp, np;
+        switch ((MIN(m - m0, 4) << 4) | MIN(n - n0, 4)) {
 #if VECTOR_REGISTERS == 32
         case 0x44:
             mc = 4;
@@ -714,22 +713,22 @@ class tinyBLAS_Q0_AVX2 {
     }

     template <int RM, int RN>
-    NOINLINE void gemm(
-
-
-
-
-
-
+    NOINLINE void gemm(int64_t m0, int64_t m, int64_t n0, int64_t n) {
+        int64_t ytiles = (m - m0) / RM;
+        int64_t xtiles = (n - n0) / RN;
+        int64_t tiles = xtiles * ytiles;
+        int64_t duty = (tiles + nth - 1) / nth;
+        int64_t start = duty * ith;
+        int64_t end = start + duty;
         if (end > tiles)
             end = tiles;
-        for (
-
-
+        for (int64_t job = start; job < end; ++job) {
+            int64_t ii = m0 + job / xtiles * RM;
+            int64_t jj = n0 + job % xtiles * RN;
             __m256 Cv[RN][RM] = {};
-            for (
-            for (
-            for (
+            for (int64_t l = 0; l < k; ++l)
+                for (int64_t j = 0; j < RN; ++j)
+                    for (int64_t i = 0; i < RM; ++i)
                         Cv[j][i] = madd(_mm256_set1_ps(unhalf(A[lda * (ii + i) + l].d) *
                                                        unhalf(B[ldb * (jj + j) + l].d)),
                                         updot(_mm256_sign_epi8(load(A + lda * (ii + i) + l),
@@ -737,8 +736,8 @@ class tinyBLAS_Q0_AVX2 {
                                                _mm256_sign_epi8(load(B + ldb * (jj + j) + l),
                                                                 load(A + lda * (ii + i) + l))),
                                         Cv[j][i]);
-            for (
-            for (
+            for (int64_t j = 0; j < RN; ++j)
+                for (int64_t i = 0; i < RM; ++i)
                     C[ldc * (jj + j) + (ii + i)] = hsum(Cv[j][i]);
         }
     }
@@ -771,10 +770,10 @@ class tinyBLAS_Q0_AVX2 {
     const TA *const A;
     const TB *const B;
     TC *const C;
-    const
-    const
-    const
-    const
+    const int64_t k;
+    const int64_t lda;
+    const int64_t ldb;
+    const int64_t ldc;
     const int ith;
     const int nth;
 };
@@ -813,8 +812,8 @@ class tinyBLAS_Q0_AVX2 {
  * @param Ctype is GGML data type of `C`
  * @return true if this function was able to service the matmul request
  */
-bool llamafile_sgemm(
-
+bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda, const void *B, int64_t ldb, void *C,
+                     int64_t ldc, int ith, int nth, int task, int Atype, int Btype, int Ctype) {

     assert(m >= 0);
     assert(n >= 0);
@@ -824,9 +823,6 @@ bool llamafile_sgemm(int m, int n, int k, const void *A, int lda, const void *B,
     assert(ldc >= m);
     assert(nth > 0);
     assert(ith < nth);
-    assert(1ll * lda * m <= 0x7fffffff);
-    assert(1ll * ldb * n <= 0x7fffffff);
-    assert(1ll * ldc * n <= 0x7fffffff);

     if (Ctype != GGML_TYPE_F32)
         return false;
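Widening the `llamafile_sgemm` dimensions and leading strides to `int64_t` is what lets these 2 GiB guards go away: index arithmetic no longer wraps once a stride times a dimension exceeds `INT32_MAX`. A small standalone illustration of the limit the removed asserts enforced; the dimensions are arbitrary examples:

```c
#include <stdint.h>
#include <stdio.h>

// Illustrative only: show where 32-bit index math would have overflowed.
int main(void) {
    int64_t ldc = 65536, n = 40000;   // example leading dimension and column count
    int64_t elems = ldc * n;          // 2,621,440,000 elements
    printf("ldc*n = %lld, INT32_MAX = %d\n", (long long) elems, INT32_MAX);
    // The removed asserts required lda*m, ldb*n and ldc*n to stay <= 0x7fffffff;
    // with int64_t strides, indices like ldc * (jj + j) + (ii + i) stay exact.
    return 0;
}
```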
data/vendor/tmp/llama.cpp/sgemm.h

@@ -1,11 +1,13 @@
 #pragma once
+#include <stdint.h>
 #include <stdbool.h>
 #ifdef __cplusplus
 extern "C" {
 #endif

-bool llamafile_sgemm(
-                     void *,
+bool llamafile_sgemm(int64_t, int64_t, int64_t, const void *, int64_t,
+                     const void *, int64_t, void *, int64_t, int, int,
+                     int, int, int, int);

 #ifdef __cplusplus
 }
data/vendor/tmp/llama.cpp/unicode-data.h

@@ -12,5 +12,5 @@ extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_accent_ma
 extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_punctuation;
 extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_symbol;
 extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_control;
-extern const std::multimap<uint32_t, uint32_t>
-extern const std::map<char32_t, char32_t>
+extern const std::multimap<uint32_t, uint32_t> unicode_map_nfd;
+extern const std::map<char32_t, char32_t> unicode_map_lowercase;