llama_cpp 0.14.0 → 0.14.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +7 -0
- data/ext/llama_cpp/llama_cpp.cpp +71 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +9 -0
- data/vendor/tmp/llama.cpp/Makefile +28 -12
- data/vendor/tmp/llama.cpp/ggml-alloc.c +45 -64
- data/vendor/tmp/llama.cpp/ggml-alloc.h +13 -5
- data/vendor/tmp/llama.cpp/ggml-backend-impl.h +14 -3
- data/vendor/tmp/llama.cpp/ggml-backend.c +358 -135
- data/vendor/tmp/llama.cpp/ggml-backend.h +41 -17
- data/vendor/tmp/llama.cpp/ggml-common.h +1830 -0
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +187 -1033
- data/vendor/tmp/llama.cpp/ggml-impl.h +6 -2
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +5 -0
- data/vendor/tmp/llama.cpp/ggml-metal.m +42 -20
- data/vendor/tmp/llama.cpp/ggml-metal.metal +44 -910
- data/vendor/tmp/llama.cpp/ggml-quants.c +457 -1074
- data/vendor/tmp/llama.cpp/ggml-quants.h +27 -259
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +388 -565
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +6 -39
- data/vendor/tmp/llama.cpp/ggml.c +509 -343
- data/vendor/tmp/llama.cpp/ggml.h +61 -47
- data/vendor/tmp/llama.cpp/llama.cpp +1446 -687
- data/vendor/tmp/llama.cpp/llama.h +25 -11
- data/vendor/tmp/llama.cpp/unicode.cpp +1672 -0
- data/vendor/tmp/llama.cpp/unicode.h +16 -774
- metadata +4 -2
data/vendor/tmp/llama.cpp/ggml.h
CHANGED
```diff
@@ -337,24 +337,24 @@ extern "C" {
     struct ggml_object;
     struct ggml_context;
 
+    // NOTE: always add types at the end of the enum to keep backward compatibility
     enum ggml_type {
-        GGML_TYPE_F32  = 0,
-        GGML_TYPE_F16  = 1,
-        GGML_TYPE_Q4_0 = 2,
-        GGML_TYPE_Q4_1 = 3,
+        GGML_TYPE_F32     = 0,
+        GGML_TYPE_F16     = 1,
+        GGML_TYPE_Q4_0    = 2,
+        GGML_TYPE_Q4_1    = 3,
         // GGML_TYPE_Q4_2 = 4, support has been removed
-        // GGML_TYPE_Q4_3 (5) support has been removed
-        GGML_TYPE_Q5_0 = 6,
-        GGML_TYPE_Q5_1 = 7,
-        GGML_TYPE_Q8_0 = 8,
-        GGML_TYPE_Q8_1 = 9,
-        // k-quantizations
-        GGML_TYPE_Q2_K = 10,
-        GGML_TYPE_Q3_K = 11,
-        GGML_TYPE_Q4_K = 12,
-        GGML_TYPE_Q5_K = 13,
-        GGML_TYPE_Q6_K = 14,
-        GGML_TYPE_Q8_K = 15,
+        // GGML_TYPE_Q4_3 = 5, support has been removed
+        GGML_TYPE_Q5_0    = 6,
+        GGML_TYPE_Q5_1    = 7,
+        GGML_TYPE_Q8_0    = 8,
+        GGML_TYPE_Q8_1    = 9,
+        GGML_TYPE_Q2_K    = 10,
+        GGML_TYPE_Q3_K    = 11,
+        GGML_TYPE_Q4_K    = 12,
+        GGML_TYPE_Q5_K    = 13,
+        GGML_TYPE_Q6_K    = 14,
+        GGML_TYPE_Q8_K    = 15,
         GGML_TYPE_IQ2_XXS = 16,
         GGML_TYPE_IQ2_XS  = 17,
         GGML_TYPE_IQ3_XXS = 18,
```
```diff
@@ -363,9 +363,11 @@ extern "C" {
         GGML_TYPE_IQ3_S   = 21,
         GGML_TYPE_IQ2_S   = 22,
         GGML_TYPE_IQ4_XS  = 23,
-        GGML_TYPE_I8,
-        GGML_TYPE_I16,
-        GGML_TYPE_I32,
+        GGML_TYPE_I8      = 24,
+        GGML_TYPE_I16     = 25,
+        GGML_TYPE_I32     = 26,
+        GGML_TYPE_I64     = 27,
+        GGML_TYPE_F64     = 28,
         GGML_TYPE_COUNT,
     };
 
```
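The new NOTE comment is there because these numeric values are serialized: GGUF model files store each tensor's type as a raw integer, so existing members must keep their values and new ones (here `GGML_TYPE_I64 = 27` and `GGML_TYPE_F64 = 28`) may only be appended. Below is a minimal sketch of the compatibility check this enables; the file-reading part is hypothetical, and only `GGML_TYPE_COUNT` and `ggml_type_name()` come from `ggml.h`:

```c
#include <stdio.h>
#include "ggml.h"

// A type id read from a model file stays meaningful across library versions
// only if existing enum values never change; appending new members preserves
// every id written by older versions of the library.
static int is_known_ggml_type(int stored_id) {
    return stored_id >= 0 && stored_id < GGML_TYPE_COUNT;
}

int main(void) {
    int stored_id = 28; // hypothetical id read from a file header (GGML_TYPE_F64)
    if (is_known_ggml_type(stored_id)) {
        printf("type %d -> %s\n", stored_id, ggml_type_name((enum ggml_type) stored_id));
    } else {
        printf("type %d was written by a newer ggml\n", stored_id);
    }
    return 0;
}
```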
```diff
@@ -383,20 +385,20 @@ extern "C" {
 
     // model file types
     enum ggml_ftype {
-        GGML_FTYPE_UNKNOWN     = -1,
-        GGML_FTYPE_ALL_F32     = 0,
-        GGML_FTYPE_MOSTLY_F16  = 1,  // except 1d tensors
-        GGML_FTYPE_MOSTLY_Q4_0 = 2,  // except 1d tensors
-        GGML_FTYPE_MOSTLY_Q4_1 = 3,  // except 1d tensors
+        GGML_FTYPE_UNKNOWN        = -1,
+        GGML_FTYPE_ALL_F32        = 0,
+        GGML_FTYPE_MOSTLY_F16     = 1,  // except 1d tensors
+        GGML_FTYPE_MOSTLY_Q4_0    = 2,  // except 1d tensors
+        GGML_FTYPE_MOSTLY_Q4_1    = 3,  // except 1d tensors
         GGML_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
-        GGML_FTYPE_MOSTLY_Q8_0 = 7,  // except 1d tensors
-        GGML_FTYPE_MOSTLY_Q5_0 = 8,  // except 1d tensors
-        GGML_FTYPE_MOSTLY_Q5_1 = 9,  // except 1d tensors
-        GGML_FTYPE_MOSTLY_Q2_K = 10, // except 1d tensors
-        GGML_FTYPE_MOSTLY_Q3_K = 11, // except 1d tensors
-        GGML_FTYPE_MOSTLY_Q4_K = 12, // except 1d tensors
-        GGML_FTYPE_MOSTLY_Q5_K = 13, // except 1d tensors
-        GGML_FTYPE_MOSTLY_Q6_K = 14, // except 1d tensors
+        GGML_FTYPE_MOSTLY_Q8_0    = 7,  // except 1d tensors
+        GGML_FTYPE_MOSTLY_Q5_0    = 8,  // except 1d tensors
+        GGML_FTYPE_MOSTLY_Q5_1    = 9,  // except 1d tensors
+        GGML_FTYPE_MOSTLY_Q2_K    = 10, // except 1d tensors
+        GGML_FTYPE_MOSTLY_Q3_K    = 11, // except 1d tensors
+        GGML_FTYPE_MOSTLY_Q4_K    = 12, // except 1d tensors
+        GGML_FTYPE_MOSTLY_Q5_K    = 13, // except 1d tensors
+        GGML_FTYPE_MOSTLY_Q6_K    = 14, // except 1d tensors
         GGML_FTYPE_MOSTLY_IQ2_XXS = 15, // except 1d tensors
         GGML_FTYPE_MOSTLY_IQ2_XS  = 16, // except 1d tensors
         GGML_FTYPE_MOSTLY_IQ3_XXS = 17, // except 1d tensors
```
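These ids describe whole model files rather than individual tensors. Not part of this hunk, but `ggml.h` also declares `ggml_ftype_to_ggml_type()`, which maps a file-level id back to the dominant per-tensor type from the enum above; a one-function sketch (the wrapper name is hypothetical):

```c
#include "ggml.h"

// Recover the dominant per-tensor type for a model file type,
// e.g. GGML_FTYPE_MOSTLY_Q4_0 -> GGML_TYPE_Q4_0.
enum ggml_type dominant_tensor_type(enum ggml_ftype ftype) {
    return ggml_ftype_to_ggml_type(ftype);
}
```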
```diff
@@ -472,6 +474,8 @@ extern "C" {
         GGML_OP_FLASH_ATTN,
         GGML_OP_FLASH_FF,
         GGML_OP_FLASH_ATTN_BACK,
+        GGML_OP_SSM_CONV,
+        GGML_OP_SSM_SCAN,
         GGML_OP_WIN_PART,
         GGML_OP_WIN_UNPART,
         GGML_OP_GET_REL_POS,
```
```diff
@@ -1728,6 +1732,23 @@ extern "C" {
             struct ggml_tensor  * c0,
             struct ggml_tensor  * c1);
 
+    GGML_API struct ggml_tensor * ggml_ssm_conv(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * s,
+            struct ggml_tensor  * x,
+            struct ggml_tensor  * c,
+            struct ggml_tensor  * sq);
+
+    GGML_API struct ggml_tensor * ggml_ssm_scan(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * s,
+            struct ggml_tensor  * x,
+            struct ggml_tensor  * dt,
+            struct ggml_tensor  * A,
+            struct ggml_tensor  * B,
+            struct ggml_tensor  * C,
+            struct ggml_tensor  * sq);
+
     // partition into non-overlapping windows with padding if needed
     // example:
     // a: 768 64 64 1
```
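For background, not stated anywhere in this diff: these declarations back the two new `GGML_OP_SSM_*` ops from the earlier hunk and come from the Mamba support work in upstream llama.cpp. The argument names line up with the Mamba-style selective state-space formulation (`s` the recurrent state, `x` the input, `dt`, `A`, `B`, `C` the SSM parameters, `sq` the sequence ids). Read through that lens, with the simplified Euler-style discretization used in Mamba reference implementations, each scan step computes roughly:

$$\bar{A}_t = \exp(\Delta_t A), \qquad \bar{B}_t = \Delta_t B_t, \qquad h_t = \bar{A}_t \odot h_{t-1} + \bar{B}_t x_t, \qquad y_t = C_t h_t$$

This is an interpretation from the parameter names and the upstream Mamba work, not something the header itself guarantees.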
```diff
@@ -2175,25 +2196,18 @@ extern "C" {
     GGML_API void ggml_quantize_init(enum ggml_type type);
     GGML_API void ggml_quantize_free(void);
 
-    // TODO: these would probably get removed in favor of the more general ggml_quantize_chunk
-    GGML_API size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist);
-    GGML_API size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist);
-    GGML_API size_t ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t * hist);
-    GGML_API size_t ggml_quantize_q5_1(const float * src, void * dst, int n, int k, int64_t * hist);
-    GGML_API size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t * hist);
-
-    GGML_API size_t ggml_quantize_q2_K(const float * src, void * dst, int n, int k, int64_t * hist);
-    GGML_API size_t ggml_quantize_q3_K(const float * src, void * dst, int n, int k, int64_t * hist);
-    GGML_API size_t ggml_quantize_q4_K(const float * src, void * dst, int n, int k, int64_t * hist);
-    GGML_API size_t ggml_quantize_q5_K(const float * src, void * dst, int n, int k, int64_t * hist);
-    GGML_API size_t ggml_quantize_q6_K(const float * src, void * dst, int n, int k, int64_t * hist);
-
     // some quantization type cannot be used without an importance matrix
     GGML_API bool ggml_quantize_requires_imatrix(enum ggml_type type);
 
     // calls ggml_quantize_init internally (i.e. can allocate memory)
-    GGML_API size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst,
-            int start, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
+    GGML_API size_t ggml_quantize_chunk(
+            enum ggml_type   type,
+            const float    * src,
+            void           * dst,
+            int              start,
+            int              nrows,
+            int              n_per_row,
+            const float    * imatrix);
 
     //
     // gguf
```
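With the per-type helpers removed, callers migrate to the general entry point. A hedged sketch of that migration; the wrapper name is illustrative, and only the new `ggml_quantize_chunk` signature above comes from this diff:

```c
#include "ggml.h"

// Illustrative replacement for the removed ggml_quantize_q4_0():
// quantize nrows rows of n_per_row floats each, starting at row 0.
// Q4_0 needs no importance matrix, so imatrix may be NULL
// (ggml_quantize_requires_imatrix(GGML_TYPE_Q4_0) returns false).
// Returns the size of the quantized data in bytes.
static size_t quantize_rows_q4_0(const float * src, void * dst,
                                 int nrows, int n_per_row) {
    return ggml_quantize_chunk(GGML_TYPE_Q4_0, src, dst,
                               /*start     =*/ 0,
                               /*nrows     =*/ nrows,
                               /*n_per_row =*/ n_per_row,
                               /*imatrix   =*/ NULL);
}
```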