llama_cpp 0.12.2 → 0.12.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +15 -0
- data/README.md +2 -2
- data/ext/llama_cpp/extconf.rb +1 -0
- data/ext/llama_cpp/llama_cpp.cpp +68 -6
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +6 -2
- data/vendor/tmp/llama.cpp/Makefile +25 -3
- data/vendor/tmp/llama.cpp/ggml-alloc.c +87 -27
- data/vendor/tmp/llama.cpp/ggml-backend-impl.h +6 -0
- data/vendor/tmp/llama.cpp/ggml-backend.c +176 -18
- data/vendor/tmp/llama.cpp/ggml-backend.h +14 -0
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +1990 -0
- data/vendor/tmp/llama.cpp/ggml-kompute.h +46 -0
- data/vendor/tmp/llama.cpp/ggml-metal.h +3 -0
- data/vendor/tmp/llama.cpp/ggml-metal.m +144 -113
- data/vendor/tmp/llama.cpp/ggml-metal.metal +303 -4
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +95 -3
- data/vendor/tmp/llama.cpp/ggml-opencl.h +1 -0
- data/vendor/tmp/llama.cpp/ggml-quants.c +736 -59
- data/vendor/tmp/llama.cpp/ggml-quants.h +20 -1
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +15255 -0
- data/vendor/tmp/llama.cpp/ggml-sycl.h +29 -0
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +60854 -0
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +5270 -0
- data/vendor/tmp/llama.cpp/ggml-vulkan.h +34 -0
- data/vendor/tmp/llama.cpp/ggml.c +664 -117
- data/vendor/tmp/llama.cpp/ggml.h +46 -11
- data/vendor/tmp/llama.cpp/llama.cpp +1426 -341
- data/vendor/tmp/llama.cpp/llama.h +24 -15
- data/vendor/tmp/llama.cpp/unicode.h +2 -1
- metadata +10 -3
data/vendor/tmp/llama.cpp/ggml.h
CHANGED
@@ -353,6 +353,7 @@ extern "C" {
|
|
353
353
|
GGML_TYPE_Q8_K = 15,
|
354
354
|
GGML_TYPE_IQ2_XXS = 16,
|
355
355
|
GGML_TYPE_IQ2_XS = 17,
|
356
|
+
GGML_TYPE_IQ3_XXS = 18,
|
356
357
|
GGML_TYPE_I8,
|
357
358
|
GGML_TYPE_I16,
|
358
359
|
GGML_TYPE_I32,
|
@@ -389,6 +390,7 @@ extern "C" {
|
|
389
390
|
GGML_FTYPE_MOSTLY_Q6_K = 14, // except 1d tensors
|
390
391
|
GGML_FTYPE_MOSTLY_IQ2_XXS = 15, // except 1d tensors
|
391
392
|
GGML_FTYPE_MOSTLY_IQ2_XS = 16, // except 1d tensors
|
393
|
+
GGML_FTYPE_MOSTLY_IQ3_XXS = 17, // except 1d tensors
|
392
394
|
};
|
393
395
|
|
394
396
|
// available tensor operations:
|
@@ -489,6 +491,8 @@ extern "C" {
|
|
489
491
|
GGML_UNARY_OP_GELU,
|
490
492
|
GGML_UNARY_OP_GELU_QUICK,
|
491
493
|
GGML_UNARY_OP_SILU,
|
494
|
+
GGML_UNARY_OP_HARDSWISH,
|
495
|
+
GGML_UNARY_OP_HARDSIGMOID,
|
492
496
|
|
493
497
|
GGML_UNARY_OP_COUNT,
|
494
498
|
};
|
@@ -1032,6 +1036,16 @@ extern "C" {
|
|
1032
1036
|
struct ggml_tensor * a,
|
1033
1037
|
struct ggml_tensor * b);
|
1034
1038
|
|
1039
|
+
// hardswish(x) = x * relu6(x + 3) / 6
|
1040
|
+
GGML_API struct ggml_tensor * ggml_hardswish(
|
1041
|
+
struct ggml_context * ctx,
|
1042
|
+
struct ggml_tensor * a);
|
1043
|
+
|
1044
|
+
// hardsigmoid(x) = relu6(x + 3) / 6
|
1045
|
+
GGML_API struct ggml_tensor * ggml_hardsigmoid(
|
1046
|
+
struct ggml_context * ctx,
|
1047
|
+
struct ggml_tensor * a);
|
1048
|
+
|
1035
1049
|
// normalize along rows
|
1036
1050
|
GGML_API struct ggml_tensor * ggml_norm(
|
1037
1051
|
struct ggml_context * ctx,
|
@@ -1481,7 +1495,19 @@ extern "C" {
|
|
1481
1495
|
int p1,
|
1482
1496
|
int d0,
|
1483
1497
|
int d1,
|
1484
|
-
bool is_2D
|
1498
|
+
bool is_2D,
|
1499
|
+
enum ggml_type dst_type);
|
1500
|
+
|
1501
|
+
GGML_API struct ggml_tensor * ggml_conv_depthwise_2d(
|
1502
|
+
struct ggml_context * ctx,
|
1503
|
+
struct ggml_tensor * a,
|
1504
|
+
struct ggml_tensor * b,
|
1505
|
+
int s0,
|
1506
|
+
int s1,
|
1507
|
+
int p0,
|
1508
|
+
int p1,
|
1509
|
+
int d0,
|
1510
|
+
int d1);
|
1485
1511
|
|
1486
1512
|
GGML_API struct ggml_tensor * ggml_conv_1d(
|
1487
1513
|
struct ggml_context * ctx,
|
@@ -2065,6 +2091,18 @@ extern "C" {
|
|
2065
2091
|
// quantization
|
2066
2092
|
//
|
2067
2093
|
|
2094
|
+
// - ggml_quantize_init can be called multiple times with the same type
|
2095
|
+
// it will only initialize the quantization tables for the first call or after ggml_quantize_free
|
2096
|
+
// automatically called by ggml_quantize_chunk for convenience
|
2097
|
+
//
|
2098
|
+
// - ggml_quantize_free will free any memory allocated by ggml_quantize_init
|
2099
|
+
// call this at the end of the program to avoid memory leaks
|
2100
|
+
//
|
2101
|
+
// note: these are thread-safe
|
2102
|
+
//
|
2103
|
+
GGML_API void ggml_quantize_init(enum ggml_type type);
|
2104
|
+
GGML_API void ggml_quantize_free(void);
|
2105
|
+
|
2068
2106
|
// TODO: these would probably get removed in favor of the more general ggml_quantize_chunk
|
2069
2107
|
GGML_API size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist);
|
2070
2108
|
GGML_API size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist);
|
@@ -2078,19 +2116,13 @@ extern "C" {
|
|
2078
2116
|
GGML_API size_t ggml_quantize_q5_K(const float * src, void * dst, int n, int k, int64_t * hist);
|
2079
2117
|
GGML_API size_t ggml_quantize_q6_K(const float * src, void * dst, int n, int k, int64_t * hist);
|
2080
2118
|
|
2119
|
+
// some quantization types cannot be used without an importance matrix
|
2120
|
+
GGML_API bool ggml_quantize_requires_imatrix(enum ggml_type type);
|
2121
|
+
|
2122
|
+
// calls ggml_quantize_init internally (i.e. can allocate memory)
|
2081
2123
|
GGML_API size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst,
|
2082
2124
|
int start, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
2083
2125
|
|
2084
|
-
// These are needed for IQ2_XS and IQ2_XXS quantizations
|
2085
|
-
GGML_API void ggml_init_iq2_quantization(enum ggml_type type);
|
2086
|
-
GGML_API void ggml_deinit_iq2_quantization(enum ggml_type type);
|
2087
|
-
|
2088
|
-
//
|
2089
|
-
// Importance matrix
|
2090
|
-
//
|
2091
|
-
typedef void(*ggml_collect_imatrix_t)(const struct ggml_tensor * src0, const struct ggml_tensor * src1);
|
2092
|
-
GGML_API void ggml_set_imatrix_collection(ggml_collect_imatrix_t imatrix_collect);
|
2093
|
-
|
2094
2126
|
//
|
2095
2127
|
// gguf
|
2096
2128
|
//
|
@@ -2234,9 +2266,12 @@ extern "C" {
|
|
2234
2266
|
GGML_API int ggml_cpu_has_blas (void);
|
2235
2267
|
GGML_API int ggml_cpu_has_cublas (void);
|
2236
2268
|
GGML_API int ggml_cpu_has_clblast (void);
|
2269
|
+
GGML_API int ggml_cpu_has_vulkan (void);
|
2270
|
+
GGML_API int ggml_cpu_has_kompute (void);
|
2237
2271
|
GGML_API int ggml_cpu_has_gpublas (void);
|
2238
2272
|
GGML_API int ggml_cpu_has_sse3 (void);
|
2239
2273
|
GGML_API int ggml_cpu_has_ssse3 (void);
|
2274
|
+
GGML_API int ggml_cpu_has_sycl (void);
|
2240
2275
|
GGML_API int ggml_cpu_has_vsx (void);
|
2241
2276
|
|
2242
2277
|
//
|