llama_cpp 0.14.7 → 0.15.1
- checksums.yaml +4 -4
- data/CHANGELOG.md +19 -0
- data/README.md +2 -2
- data/ext/llama_cpp/extconf.rb +2 -1
- data/ext/llama_cpp/llama_cpp.cpp +59 -9
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +24 -3
- data/vendor/tmp/llama.cpp/Makefile +42 -18
- data/vendor/tmp/llama.cpp/ggml-backend.c +7 -5
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +295 -17
- data/vendor/tmp/llama.cpp/ggml-impl.h +78 -1
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +7 -0
- data/vendor/tmp/llama.cpp/ggml-metal.m +399 -184
- data/vendor/tmp/llama.cpp/ggml-metal.metal +654 -18
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +1 -0
- data/vendor/tmp/llama.cpp/ggml-quants.c +302 -0
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +28 -16
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +46843 -39205
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +951 -263
- data/vendor/tmp/llama.cpp/ggml.c +1457 -92
- data/vendor/tmp/llama.cpp/ggml.h +37 -7
- data/vendor/tmp/llama.cpp/llama.cpp +671 -403
- data/vendor/tmp/llama.cpp/llama.h +34 -10
- data/vendor/tmp/llama.cpp/sgemm.cpp +134 -103
- data/vendor/tmp/llama.cpp/sgemm.h +4 -2
- data/vendor/tmp/llama.cpp/unicode-data.cpp +1188 -656
- data/vendor/tmp/llama.cpp/unicode-data.h +4 -3
- data/vendor/tmp/llama.cpp/unicode.cpp +590 -49
- data/vendor/tmp/llama.cpp/unicode.h +6 -3
- metadata +3 -3
data/vendor/tmp/llama.cpp/ggml.h
CHANGED
@@ -326,14 +326,20 @@ extern "C" {
     // get ggml_status name string
     GGML_API GGML_CALL const char * ggml_status_to_string(enum ggml_status status);
 
+    // ieee 754-2008 half-precision float16
+    // todo: make this not an integral type
     typedef uint16_t ggml_fp16_t;
-
-    // convert FP16 <-> FP32
-    GGML_API float       ggml_fp16_to_fp32(ggml_fp16_t x);
-    GGML_API ggml_fp16_t ggml_fp32_to_fp16(float x);
-
-    GGML_API void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, int64_t n);
-    GGML_API void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int64_t n);
+    GGML_API float       ggml_fp16_to_fp32(ggml_fp16_t);
+    GGML_API ggml_fp16_t ggml_fp32_to_fp16(float);
+    GGML_API void        ggml_fp16_to_fp32_row(const ggml_fp16_t *, float *, int64_t);
+    GGML_API void        ggml_fp32_to_fp16_row(const float *, ggml_fp16_t *, int64_t);
+
+    // google brain half-precision bfloat16
+    typedef struct { uint16_t bits; } ggml_bf16_t;
+    GGML_API ggml_bf16_t ggml_fp32_to_bf16(float);
+    GGML_API float       ggml_bf16_to_fp32(ggml_bf16_t);  // consider just doing << 16
+    GGML_API void        ggml_bf16_to_fp32_row(const ggml_bf16_t *, float *, int64_t);
+    GGML_API void        ggml_fp32_to_bf16_row(const float *, ggml_bf16_t *, int64_t);
 
     struct ggml_object;
     struct ggml_context;
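The new bfloat16 helpers mirror the existing fp16 converters, and ggml_bf16_t exposes its raw encoding through the bits field. A minimal round-trip sketch against the bundled ggml.h (values and buffer size are illustrative):

#include <stdio.h>
#include "ggml.h"

int main(void) {
    // scalar round trip: fp32 -> bf16 -> fp32 (bf16 keeps fp32's 8 exponent bits, 7 mantissa bits)
    float x = 3.14159f;
    ggml_bf16_t h = ggml_fp32_to_bf16(x);
    printf("%f -> 0x%04x -> %f\n", x, h.bits, ggml_bf16_to_fp32(h));

    // the row converters handle whole buffers in one call
    float       src[4] = { 1.0f, 0.5f, -2.0f, 1024.0f };
    ggml_bf16_t bf16[4];
    float       back[4];
    ggml_fp32_to_bf16_row(src, bf16, 4);
    ggml_bf16_to_fp32_row(bf16, back, 4);
    for (int i = 0; i < 4; ++i) {
        printf("src = %f  back = %f\n", src[i], back[i]);
    }
    return 0;
}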
@@ -370,6 +376,7 @@ extern "C" {
         GGML_TYPE_I64     = 27,
         GGML_TYPE_F64     = 28,
         GGML_TYPE_IQ1_M   = 29,
+        GGML_TYPE_BF16    = 30,
         GGML_TYPE_COUNT,
     };
 
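With GGML_TYPE_BF16 in the type enum, tensors can be allocated in bfloat16 like any other ggml type. A minimal sketch, assuming the standard ggml_init / ggml_new_tensor_1d API from the bundled header (the arena size and tensor length are illustrative):

#include <stddef.h>
#include "ggml.h"

int main(void) {
    // scratch context backed by a small arena
    struct ggml_init_params params = {
        /*.mem_size   =*/ 16 * 1024 * 1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx = ggml_init(params);

    // a 1-d tensor stored directly in bfloat16
    struct ggml_tensor * t = ggml_new_tensor_1d(ctx, GGML_TYPE_BF16, 1024);
    (void) t;

    ggml_free(ctx);
    return 0;
}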
@@ -410,6 +417,7 @@ extern "C" {
         GGML_FTYPE_MOSTLY_IQ2_S  = 21, // except 1d tensors
         GGML_FTYPE_MOSTLY_IQ4_XS = 22, // except 1d tensors
         GGML_FTYPE_MOSTLY_IQ1_M  = 23, // except 1d tensors
+        GGML_FTYPE_MOSTLY_BF16   = 24, // except 1d tensors
     };
 
     // available tensor operations:
@@ -475,6 +483,7 @@ extern "C" {
         GGML_OP_LEAKY_RELU,
 
         GGML_OP_FLASH_ATTN,
+        GGML_OP_FLASH_ATTN_EXT,
         GGML_OP_FLASH_FF,
         GGML_OP_FLASH_ATTN_BACK,
         GGML_OP_SSM_CONV,
@@ -762,6 +771,8 @@ extern "C" {
     // use this to compute the memory overhead of a tensor
     GGML_API size_t ggml_tensor_overhead(void);
 
+    GGML_API bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbytes);
+
     // main
 
     GGML_API struct ggml_context * ggml_init(struct ggml_init_params params);
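ggml_validate_row_data lets a loader check a tensor's raw bytes for invalid encodings for the given type before using them. A hedged sketch of such a check (loading the tensor itself is assumed to have happened elsewhere; the helper name is illustrative):

#include <stdbool.h>
#include <stdio.h>
#include "ggml.h"

// Hedged sketch: reject a loaded tensor whose raw data fails validation.
static bool tensor_data_ok(const struct ggml_tensor * tensor) {
    if (!ggml_validate_row_data(tensor->type, tensor->data, ggml_nbytes(tensor))) {
        fprintf(stderr, "invalid data in tensor '%s'\n", tensor->name);
        return false;
    }
    return true;
}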
@@ -1720,6 +1731,25 @@ extern "C" {
             struct ggml_tensor  * v,
             bool                  masked);
 
+#define GGML_KQ_MASK_PAD 32
+
+    // q:    [n_embd, n_batch,     n_head,    1]
+    // k:    [n_embd, n_kv,        n_head_kv, 1]
+    // v:    [n_embd, n_kv,        n_head_kv, 1] !! not transposed !!
+    // mask: [n_kv,   n_batch_pad, 1,         1] !! n_batch_pad = GGML_PAD(n_batch, GGML_KQ_MASK_PAD) !!
+    // res:  [n_embd, n_head,      n_batch,   1] !! permuted !!
+    GGML_API struct ggml_tensor * ggml_flash_attn_ext(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * q,
+            struct ggml_tensor  * k,
+            struct ggml_tensor  * v,
+            struct ggml_tensor  * mask,
+            float                 scale);
+
+    GGML_API void ggml_flash_attn_ext_set_prec(
+            struct ggml_tensor * a,
+            enum ggml_prec       prec);
+
     GGML_API struct ggml_tensor * ggml_flash_attn_back(
             struct ggml_context * ctx,
             struct ggml_tensor  * q,
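ggml_flash_attn_ext fuses the scaled QK^T softmax and the V product into a single graph op, with the tensor layouts documented in the comments above. A hedged sketch of wiring it into a graph (the helper name and the 1/sqrt(head_dim) scaling are illustrative; GGML_PREC_F32 is ggml's existing precision enum value):

#include <math.h>
#include "ggml.h"

// Hedged sketch: build one fused flash-attention node.
// Shapes follow the header comments: q [n_embd_head, n_batch, n_head, 1],
// k/v [n_embd_head, n_kv, n_head_kv, 1], mask [n_kv, GGML_PAD(n_batch, GGML_KQ_MASK_PAD), 1, 1].
static struct ggml_tensor * build_attn(
        struct ggml_context * ctx,
        struct ggml_tensor  * q,
        struct ggml_tensor  * k,
        struct ggml_tensor  * v,
        struct ggml_tensor  * mask,
        int                   n_embd_head) {
    const float scale = 1.0f / sqrtf((float) n_embd_head);

    struct ggml_tensor * cur = ggml_flash_attn_ext(ctx, q, k, v, mask, scale);

    // optionally request f32 accumulation for this op
    ggml_flash_attn_ext_set_prec(cur, GGML_PREC_F32);

    return cur; // result: [n_embd_head, n_head, n_batch, 1], permuted
}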