llama_cpp 0.12.0 → 0.12.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +14 -0
- data/ext/llama_cpp/llama_cpp.cpp +78 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +11 -0
- data/vendor/tmp/llama.cpp/Makefile +7 -10
- data/vendor/tmp/llama.cpp/ggml-alloc.c +28 -6
- data/vendor/tmp/llama.cpp/ggml-alloc.h +3 -1
- data/vendor/tmp/llama.cpp/ggml-backend-impl.h +36 -36
- data/vendor/tmp/llama.cpp/ggml-backend.c +512 -261
- data/vendor/tmp/llama.cpp/ggml-backend.h +43 -33
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +1494 -559
- data/vendor/tmp/llama.cpp/ggml-cuda.h +18 -30
- data/vendor/tmp/llama.cpp/ggml-impl.h +2 -0
- data/vendor/tmp/llama.cpp/ggml-metal.h +4 -56
- data/vendor/tmp/llama.cpp/ggml-metal.m +1868 -2002
- data/vendor/tmp/llama.cpp/ggml-metal.metal +692 -8
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +321 -14
- data/vendor/tmp/llama.cpp/ggml-opencl.h +13 -3
- data/vendor/tmp/llama.cpp/ggml-quants.c +2182 -44
- data/vendor/tmp/llama.cpp/ggml-quants.h +36 -1
- data/vendor/tmp/llama.cpp/ggml.c +222 -105
- data/vendor/tmp/llama.cpp/ggml.h +56 -35
- data/vendor/tmp/llama.cpp/llama.cpp +1271 -1618
- data/vendor/tmp/llama.cpp/llama.h +44 -8
- metadata +2 -2
data/vendor/tmp/llama.cpp/ggml.h
CHANGED
|
@@ -187,6 +187,16 @@
|
|
|
187
187
|
# define GGML_API
|
|
188
188
|
#endif
|
|
189
189
|
|
|
190
|
+
#ifdef GGML_MULTIPLATFORM
|
|
191
|
+
# if defined(_WIN32)
|
|
192
|
+
# define GGML_CALL
|
|
193
|
+
# else
|
|
194
|
+
# define GGML_CALL __attribute__((__ms_abi__))
|
|
195
|
+
# endif
|
|
196
|
+
#else
|
|
197
|
+
# define GGML_CALL
|
|
198
|
+
#endif
|
|
199
|
+
|
|
190
200
|
// TODO: support for clang
|
|
191
201
|
#ifdef __GNUC__
|
|
192
202
|
# define GGML_DEPRECATED(func, hint) func __attribute__((deprecated(hint)))
|
|
@@ -218,7 +228,9 @@
|
|
|
218
228
|
#define GGML_MAX_PARAMS 2048
|
|
219
229
|
#define GGML_MAX_CONTEXTS 64
|
|
220
230
|
#define GGML_MAX_SRC 10
|
|
231
|
+
#ifndef GGML_MAX_NAME
|
|
221
232
|
#define GGML_MAX_NAME 64
|
|
233
|
+
#endif
|
|
222
234
|
#define GGML_MAX_OP_PARAMS 64
|
|
223
235
|
#define GGML_DEFAULT_N_THREADS 4
|
|
224
236
|
#define GGML_DEFAULT_GRAPH_SIZE 2048
|
|
@@ -339,6 +351,8 @@ extern "C" {
|
|
|
339
351
|
GGML_TYPE_Q5_K = 13,
|
|
340
352
|
GGML_TYPE_Q6_K = 14,
|
|
341
353
|
GGML_TYPE_Q8_K = 15,
|
|
354
|
+
GGML_TYPE_IQ2_XXS = 16,
|
|
355
|
+
GGML_TYPE_IQ2_XS = 17,
|
|
342
356
|
GGML_TYPE_I8,
|
|
343
357
|
GGML_TYPE_I16,
|
|
344
358
|
GGML_TYPE_I32,
|
|
@@ -373,6 +387,8 @@ extern "C" {
|
|
|
373
387
|
GGML_FTYPE_MOSTLY_Q4_K = 12, // except 1d tensors
|
|
374
388
|
GGML_FTYPE_MOSTLY_Q5_K = 13, // except 1d tensors
|
|
375
389
|
GGML_FTYPE_MOSTLY_Q6_K = 14, // except 1d tensors
|
|
390
|
+
GGML_FTYPE_MOSTLY_IQ2_XXS = 15, // except 1d tensors
|
|
391
|
+
GGML_FTYPE_MOSTLY_IQ2_XS = 16, // except 1d tensors
|
|
376
392
|
};
|
|
377
393
|
|
|
378
394
|
// available tensor operations:
|
|
@@ -643,41 +659,41 @@ extern "C" {
|
|
|
643
659
|
GGML_API void ggml_print_object (const struct ggml_object * obj);
|
|
644
660
|
GGML_API void ggml_print_objects(const struct ggml_context * ctx);
|
|
645
661
|
|
|
646
|
-
GGML_API int64_t ggml_nelements (const struct ggml_tensor * tensor);
|
|
647
|
-
GGML_API int64_t ggml_nrows (const struct ggml_tensor * tensor);
|
|
648
|
-
GGML_API size_t ggml_nbytes (const struct ggml_tensor * tensor);
|
|
649
|
-
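GGML_API size_t ggml_nbytes_pad (const struct ggml_tensor * tensor); // same as ggml_nbytes() but padded to GGML_MEM_ALIGN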
|
|
662
|
+
GGML_API GGML_CALL int64_t ggml_nelements (const struct ggml_tensor * tensor);
|
|
663
|
+
GGML_API GGML_CALL int64_t ggml_nrows (const struct ggml_tensor * tensor);
|
|
664
|
+
GGML_API GGML_CALL size_t ggml_nbytes (const struct ggml_tensor * tensor);
|
|
665
|
+
GGML_API size_t ggml_nbytes_pad (const struct ggml_tensor * tensor); // same as ggml_nbytes() but padded to GGML_MEM_ALIGN
|
|
650
666
|
|
|
651
|
-
GGML_API int ggml_blck_size(enum ggml_type type);
|
|
652
|
-
GGML_API size_t ggml_type_size(enum ggml_type type); // size in bytes for all elements in a block
|
|
653
|
-
GGML_API size_t ggml_row_size (enum ggml_type type, int64_t ne); // size in bytes for all elements in a row
|
|
667
|
+
GGML_API GGML_CALL int ggml_blck_size(enum ggml_type type);
|
|
668
|
+
GGML_API GGML_CALL size_t ggml_type_size(enum ggml_type type); // size in bytes for all elements in a block
|
|
669
|
+
GGML_API GGML_CALL size_t ggml_row_size (enum ggml_type type, int64_t ne); // size in bytes for all elements in a row
|
|
654
670
|
|
|
655
671
|
GGML_DEPRECATED(
|
|
656
672
|
GGML_API double ggml_type_sizef(enum ggml_type type), // ggml_type_size()/ggml_blck_size() as float
|
|
657
673
|
"use ggml_row_size() instead");
|
|
658
674
|
|
|
659
|
-
GGML_API const char * ggml_type_name(enum ggml_type type);
|
|
660
|
-
GGML_API const char * ggml_op_name (enum ggml_op op);
|
|
661
|
-
GGML_API const char * ggml_op_symbol(enum ggml_op op);
|
|
675
|
+
GGML_API GGML_CALL const char * ggml_type_name(enum ggml_type type);
|
|
676
|
+
GGML_API GGML_CALL const char * ggml_op_name (enum ggml_op op);
|
|
677
|
+
GGML_API const char * ggml_op_symbol(enum ggml_op op);
|
|
662
678
|
|
|
663
|
-
GGML_API const char * ggml_unary_op_name(enum ggml_unary_op op);
|
|
664
|
-
GGML_API const char * ggml_op_desc(const struct ggml_tensor * t); // unary or op name
|
|
679
|
+
GGML_API const char * ggml_unary_op_name(enum ggml_unary_op op);
|
|
680
|
+
GGML_API GGML_CALL const char * ggml_op_desc(const struct ggml_tensor * t); // unary or op name
|
|
665
681
|
|
|
666
|
-
GGML_API size_t ggml_element_size(const struct ggml_tensor * tensor);
|
|
682
|
+
GGML_API GGML_CALL size_t ggml_element_size(const struct ggml_tensor * tensor);
|
|
667
683
|
|
|
668
|
-
GGML_API bool ggml_is_quantized(enum ggml_type type);
|
|
684
|
+
GGML_API GGML_CALL bool ggml_is_quantized(enum ggml_type type);
|
|
669
685
|
|
|
670
686
|
// TODO: temporary until model loading of ggml examples is refactored
|
|
671
687
|
GGML_API enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype);
|
|
672
688
|
|
|
673
|
-
GGML_API bool ggml_is_transposed(const struct ggml_tensor * tensor);
|
|
674
|
-
GGML_API bool ggml_is_contiguous(const struct ggml_tensor * tensor);
|
|
675
|
-
GGML_API bool ggml_is_permuted (const struct ggml_tensor * tensor);
|
|
676
|
-
GGML_API bool ggml_is_scalar (const struct ggml_tensor * tensor);
|
|
677
|
-
GGML_API bool ggml_is_vector (const struct ggml_tensor * tensor);
|
|
678
|
-
GGML_API bool ggml_is_matrix (const struct ggml_tensor * tensor);
|
|
679
|
-
GGML_API bool ggml_is_3d (const struct ggml_tensor * tensor);
|
|
680
|
-
GGML_API int ggml_n_dims (const struct ggml_tensor * tensor); // returns 1 for scalars
|
|
689
|
+
GGML_API GGML_CALL bool ggml_is_transposed(const struct ggml_tensor * tensor);
|
|
690
|
+
GGML_API GGML_CALL bool ggml_is_contiguous(const struct ggml_tensor * tensor);
|
|
691
|
+
GGML_API GGML_CALL bool ggml_is_permuted (const struct ggml_tensor * tensor);
|
|
692
|
+
GGML_API bool ggml_is_scalar (const struct ggml_tensor * tensor);
|
|
693
|
+
GGML_API bool ggml_is_vector (const struct ggml_tensor * tensor);
|
|
694
|
+
GGML_API bool ggml_is_matrix (const struct ggml_tensor * tensor);
|
|
695
|
+
GGML_API bool ggml_is_3d (const struct ggml_tensor * tensor);
|
|
696
|
+
GGML_API int ggml_n_dims (const struct ggml_tensor * tensor); // returns 1 for scalars
|
|
681
697
|
|
|
682
698
|
GGML_API bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor * t1);
|
|
683
699
|
|
|
@@ -764,7 +780,7 @@ extern "C" {
|
|
|
764
780
|
GGML_API void * ggml_get_data (const struct ggml_tensor * tensor);
|
|
765
781
|
GGML_API float * ggml_get_data_f32(const struct ggml_tensor * tensor);
|
|
766
782
|
|
|
767
|
-
GGML_API enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor);
|
|
783
|
+
GGML_API GGML_CALL enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor);
|
|
768
784
|
|
|
769
785
|
GGML_API const char * ggml_get_name (const struct ggml_tensor * tensor);
|
|
770
786
|
GGML_API struct ggml_tensor * ggml_set_name ( struct ggml_tensor * tensor, const char * name);
|
|
@@ -1159,22 +1175,16 @@ extern "C" {
|
|
|
1159
1175
|
struct ggml_tensor * a,
|
|
1160
1176
|
struct ggml_tensor * b);
|
|
1161
1177
|
|
|
1162
|
-
|
|
1163
|
-
GGML_API struct ggml_tensor * ggml_cpy_inplace(
|
|
1178
|
+
GGML_API struct ggml_tensor * ggml_cast(
|
|
1164
1179
|
struct ggml_context * ctx,
|
|
1165
1180
|
struct ggml_tensor * a,
|
|
1166
|
-
|
|
1181
|
+
enum ggml_type type);
|
|
1167
1182
|
|
|
1168
1183
|
// make contiguous
|
|
1169
1184
|
GGML_API struct ggml_tensor * ggml_cont(
|
|
1170
1185
|
struct ggml_context * ctx,
|
|
1171
1186
|
struct ggml_tensor * a);
|
|
1172
1187
|
|
|
1173
|
-
// make contiguous, in-place
|
|
1174
|
-
GGML_API struct ggml_tensor * ggml_cont_inplace(
|
|
1175
|
-
struct ggml_context * ctx,
|
|
1176
|
-
struct ggml_tensor * a);
|
|
1177
|
-
|
|
1178
1188
|
// make contiguous, with new shape
|
|
1179
1189
|
GGML_API struct ggml_tensor * ggml_cont_1d(
|
|
1180
1190
|
struct ggml_context * ctx,
|
|
@@ -1413,7 +1423,7 @@ extern "C" {
|
|
|
1413
1423
|
float beta_slow);
|
|
1414
1424
|
|
|
1415
1425
|
// compute correction dims for YaRN RoPE scaling
|
|
1416
|
-
void ggml_rope_yarn_corr_dims(
|
|
1426
|
+
GGML_CALL void ggml_rope_yarn_corr_dims(
|
|
1417
1427
|
int n_dims, int n_orig_ctx, float freq_base, float beta_fast, float beta_slow, float dims[2]);
|
|
1418
1428
|
|
|
1419
1429
|
// xPos RoPE, in-place, returns view(a)
|
|
@@ -1847,8 +1857,8 @@ extern "C" {
|
|
|
1847
1857
|
|
|
1848
1858
|
// ggml_graph_plan() has to be called before ggml_graph_compute()
|
|
1849
1859
|
// when plan.work_size > 0, caller must allocate memory for plan.work_data
|
|
1850
|
-
GGML_API struct ggml_cplan ggml_graph_plan (struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/);
|
|
1851
|
-
GGML_API int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
|
|
1860
|
+
GGML_API struct ggml_cplan ggml_graph_plan (const struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/);
|
|
1861
|
+
GGML_API int ggml_graph_compute( struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
|
|
1852
1862
|
|
|
1853
1863
|
// same as ggml_graph_compute() but the work data is allocated as a part of the context
|
|
1854
1864
|
// note: the drawback of this API is that you must have ensured that the context has enough memory for the work data
|
|
@@ -2068,7 +2078,18 @@ extern "C" {
|
|
|
2068
2078
|
GGML_API size_t ggml_quantize_q5_K(const float * src, void * dst, int n, int k, int64_t * hist);
|
|
2069
2079
|
GGML_API size_t ggml_quantize_q6_K(const float * src, void * dst, int n, int k, int64_t * hist);
|
|
2070
2080
|
|
|
2071
|
-
GGML_API size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst,
|
|
2081
|
+
GGML_API size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst,
|
|
2082
|
+
int start, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
|
2083
|
+
|
|
2084
|
+
// These are needed for IQ2_XS and IQ2_XXS quantizations
|
|
2085
|
+
GGML_API void ggml_init_iq2_quantization(enum ggml_type type);
|
|
2086
|
+
GGML_API void ggml_deinit_iq2_quantization(enum ggml_type type);
|
|
2087
|
+
|
|
2088
|
+
//
|
|
2089
|
+
// Importance matrix
|
|
2090
|
+
//
|
|
2091
|
+
typedef void(*ggml_collect_imatrix_t)(const struct ggml_tensor * src0, const struct ggml_tensor * src1);
|
|
2092
|
+
GGML_API void ggml_set_imatrix_collection(ggml_collect_imatrix_t imatrix_collect);
|
|
2072
2093
|
|
|
2073
2094
|
//
|
|
2074
2095
|
// gguf
|