llama_cpp 0.14.7 → 0.15.1
This diff covers publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +19 -0
- data/README.md +2 -2
- data/ext/llama_cpp/extconf.rb +2 -1
- data/ext/llama_cpp/llama_cpp.cpp +59 -9
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +24 -3
- data/vendor/tmp/llama.cpp/Makefile +42 -18
- data/vendor/tmp/llama.cpp/ggml-backend.c +7 -5
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +295 -17
- data/vendor/tmp/llama.cpp/ggml-impl.h +78 -1
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +7 -0
- data/vendor/tmp/llama.cpp/ggml-metal.m +399 -184
- data/vendor/tmp/llama.cpp/ggml-metal.metal +654 -18
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +1 -0
- data/vendor/tmp/llama.cpp/ggml-quants.c +302 -0
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +28 -16
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +46843 -39205
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +951 -263
- data/vendor/tmp/llama.cpp/ggml.c +1457 -92
- data/vendor/tmp/llama.cpp/ggml.h +37 -7
- data/vendor/tmp/llama.cpp/llama.cpp +671 -403
- data/vendor/tmp/llama.cpp/llama.h +34 -10
- data/vendor/tmp/llama.cpp/sgemm.cpp +134 -103
- data/vendor/tmp/llama.cpp/sgemm.h +4 -2
- data/vendor/tmp/llama.cpp/unicode-data.cpp +1188 -656
- data/vendor/tmp/llama.cpp/unicode-data.h +4 -3
- data/vendor/tmp/llama.cpp/unicode.cpp +590 -49
- data/vendor/tmp/llama.cpp/unicode.h +6 -3
- metadata +3 -3
data/vendor/tmp/llama.cpp/ggml.h
CHANGED
@@ -326,14 +326,20 @@ extern "C" {
     // get ggml_status name string
     GGML_API GGML_CALL const char * ggml_status_to_string(enum ggml_status status);

+    // ieee 754-2008 half-precision float16
+    // todo: make this not an integral type
     typedef uint16_t ggml_fp16_t;
-
-
-    GGML_API
-    GGML_API
-
-
-
+    GGML_API float       ggml_fp16_to_fp32(ggml_fp16_t);
+    GGML_API ggml_fp16_t ggml_fp32_to_fp16(float);
+    GGML_API void        ggml_fp16_to_fp32_row(const ggml_fp16_t *, float *, int64_t);
+    GGML_API void        ggml_fp32_to_fp16_row(const float *, ggml_fp16_t *, int64_t);
+
+    // google brain half-precision bfloat16
+    typedef struct { uint16_t bits; } ggml_bf16_t;
+    GGML_API ggml_bf16_t ggml_fp32_to_bf16(float);
+    GGML_API float       ggml_bf16_to_fp32(ggml_bf16_t); // consider just doing << 16
+    GGML_API void        ggml_bf16_to_fp32_row(const ggml_bf16_t *, float *, int64_t);
+    GGML_API void        ggml_fp32_to_bf16_row(const float *, ggml_bf16_t *, int64_t);

     struct ggml_object;
     struct ggml_context;
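The new `ggml_bf16_t` type is Google Brain bfloat16: the upper 16 bits of an IEEE 754 binary32 value, which is what the `// consider just doing << 16` note alludes to. Below is a minimal, self-contained sketch of that bit-level idea, assuming plain truncation for the fp32→bf16 direction; the names `bf16_t`, `bf16_to_fp32`, and `fp32_to_bf16` are local stand-ins, not ggml's implementation, which also handles rounding and NaN.

```c
#include <stdint.h>
#include <stdio.h>
#include <string.h>

// Stand-in mirroring the ggml_bf16_t declaration above.
typedef struct { uint16_t bits; } bf16_t;

// bf16 -> fp32: place the stored 16 bits into the high half of a binary32 (the "<< 16" note).
static float bf16_to_fp32(bf16_t h) {
    uint32_t u = (uint32_t) h.bits << 16;
    float f;
    memcpy(&f, &u, sizeof f);
    return f;
}

// fp32 -> bf16: keep the high 16 bits (simple truncation; real converters may round to nearest).
static bf16_t fp32_to_bf16(float f) {
    uint32_t u;
    memcpy(&u, &f, sizeof u);
    return (bf16_t) { .bits = (uint16_t) (u >> 16) };
}

int main(void) {
    float x = 3.14159f;
    bf16_t h = fp32_to_bf16(x);
    printf("%f -> 0x%04x -> %f\n", x, h.bits, bf16_to_fp32(h));
    return 0;
}
```

The round trip drops the low 16 mantissa bits, which is why bf16 keeps fp32's exponent range but only about three decimal digits of precision.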
@@ -370,6 +376,7 @@ extern "C" {
         GGML_TYPE_I64     = 27,
         GGML_TYPE_F64     = 28,
         GGML_TYPE_IQ1_M   = 29,
+        GGML_TYPE_BF16    = 30,
         GGML_TYPE_COUNT,
     };

@@ -410,6 +417,7 @@ extern "C" {
         GGML_FTYPE_MOSTLY_IQ2_S  = 21, // except 1d tensors
         GGML_FTYPE_MOSTLY_IQ4_XS = 22, // except 1d tensors
         GGML_FTYPE_MOSTLY_IQ1_M  = 23, // except 1d tensors
+        GGML_FTYPE_MOSTLY_BF16   = 24, // except 1d tensors
     };

     // available tensor operations:
@@ -475,6 +483,7 @@ extern "C" {
         GGML_OP_LEAKY_RELU,

         GGML_OP_FLASH_ATTN,
+        GGML_OP_FLASH_ATTN_EXT,
         GGML_OP_FLASH_FF,
         GGML_OP_FLASH_ATTN_BACK,
         GGML_OP_SSM_CONV,
@@ -762,6 +771,8 @@ extern "C" {
     // use this to compute the memory overhead of a tensor
     GGML_API size_t ggml_tensor_overhead(void);

+    GGML_API bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbytes);
+
     // main

     GGML_API struct ggml_context * ggml_init(struct ggml_init_params params);
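`ggml_validate_row_data` is also new in this release. Judging from the signature, it takes a tensor type, a raw data pointer, and a byte count, and reports whether the buffer holds well-formed rows for that type. A hedged usage sketch, assuming a `true` return means the data passed validation; the `check_tensor` helper and its arguments are illustrative, not part of ggml or llama.cpp.

```c
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

#include "ggml.h"

// Illustrative guard before trusting a tensor buffer loaded from disk;
// `data`/`nbytes` would come from wherever the model bytes were read.
static bool check_tensor(enum ggml_type type, const void * data, size_t nbytes) {
    if (!ggml_validate_row_data(type, data, nbytes)) {
        fprintf(stderr, "tensor data failed validation for type %s\n", ggml_type_name(type));
        return false;
    }
    return true;
}
```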
@@ -1720,6 +1731,25 @@ extern "C" {
             struct ggml_tensor  * v,
             bool                  masked);

+#define GGML_KQ_MASK_PAD 32
+
+    // q:    [n_embd, n_batch, n_head, 1]
+    // k:    [n_embd, n_kv, n_head_kv, 1]
+    // v:    [n_embd, n_kv, n_head_kv, 1] !! not transposed !!
+    // mask: [n_kv, n_batch_pad, 1, 1] !! n_batch_pad = GGML_PAD(n_batch, GGML_KQ_MASK_PAD) !!
+    // res:  [n_embd, n_head, n_batch, 1] !! permuted !!
+    GGML_API struct ggml_tensor * ggml_flash_attn_ext(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * q,
+            struct ggml_tensor  * k,
+            struct ggml_tensor  * v,
+            struct ggml_tensor  * mask,
+            float                 scale);
+
+    GGML_API void ggml_flash_attn_ext_set_prec(
+            struct ggml_tensor * a,
+            enum ggml_prec       prec);
+
     GGML_API struct ggml_tensor * ggml_flash_attn_back(
             struct ggml_context * ctx,
             struct ggml_tensor  * q,
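The shape comments above describe the layouts the new `ggml_flash_attn_ext` op expects. Here is a hedged sketch of wiring it up, assuming F32 queries, F16 keys/values, and an F16 mask padded with `GGML_PAD(n_batch, GGML_KQ_MASK_PAD)`; the sizes are illustrative, and the exact mask type and padding requirements ultimately depend on the backend.

```c
#include <math.h>
#include <stdint.h>

#include "ggml.h"

int main(void) {
    // illustrative sizes, not taken from any llama.cpp model
    const int64_t n_embd_head = 128, n_head = 32, n_head_kv = 32;
    const int64_t n_batch = 8, n_kv = 256;

    struct ggml_init_params params = {
        .mem_size   = 64u*1024*1024, // enough scratch for the tensors below
        .mem_buffer = NULL,
        .no_alloc   = false,
    };
    struct ggml_context * ctx = ggml_init(params);

    // shapes follow the header comments: q [n_embd, n_batch, n_head], k/v [n_embd, n_kv, n_head_kv]
    struct ggml_tensor * q = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_embd_head, n_batch, n_head);
    struct ggml_tensor * k = ggml_new_tensor_3d(ctx, GGML_TYPE_F16, n_embd_head, n_kv, n_head_kv);
    struct ggml_tensor * v = ggml_new_tensor_3d(ctx, GGML_TYPE_F16, n_embd_head, n_kv, n_head_kv);

    // mask: [n_kv, n_batch_pad] with n_batch_pad = GGML_PAD(n_batch, GGML_KQ_MASK_PAD)
    struct ggml_tensor * mask = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, n_kv, GGML_PAD(n_batch, GGML_KQ_MASK_PAD));

    struct ggml_tensor * out = ggml_flash_attn_ext(ctx, q, k, v, mask, 1.0f/sqrtf((float) n_embd_head));
    ggml_flash_attn_ext_set_prec(out, GGML_PREC_F32); // optionally request fp32 accumulation

    // (a real program would now build a graph containing `out` and run it on a backend)
    ggml_free(ctx);
    return 0;
}
```

The `scale` argument is the usual 1/sqrt(head_dim) attention scaling.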