llama_cpp 0.9.0 → 0.9.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +9 -0
- data/ext/llama_cpp/extconf.rb +3 -11
- data/ext/llama_cpp/llama_cpp.cpp +147 -3
- data/ext/llama_cpp/src/ggml-cuda.cu +288 -92
- data/ext/llama_cpp/src/ggml-impl.h +237 -0
- data/ext/llama_cpp/src/ggml-metal.m +58 -37
- data/ext/llama_cpp/src/ggml-metal.metal +162 -34
- data/ext/llama_cpp/src/{k_quants.c → ggml-quants.c} +3329 -1099
- data/ext/llama_cpp/src/{k_quants.h → ggml-quants.h} +81 -22
- data/ext/llama_cpp/src/ggml.c +939 -3333
- data/ext/llama_cpp/src/ggml.h +25 -4
- data/ext/llama_cpp/src/llama.cpp +1819 -2554
- data/ext/llama_cpp/src/llama.h +32 -12
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +23 -2
- metadata +5 -4
data/ext/llama_cpp/src/ggml.h
CHANGED
@@ -219,7 +219,7 @@
|
|
219
219
|
#define GGML_MAX_CONTEXTS 64
|
220
220
|
#define GGML_MAX_SRC 6
|
221
221
|
#define GGML_MAX_NAME 64
|
222
|
-
#define GGML_MAX_OP_PARAMS 32
|
222
|
+
#define GGML_MAX_OP_PARAMS 64
|
223
223
|
#define GGML_DEFAULT_N_THREADS 4
|
224
224
|
|
225
225
|
#if UINTPTR_MAX == 0xFFFFFFFF
|
@@ -709,7 +709,7 @@ extern "C" {
|
|
709
709
|
// Context tensor enumeration and lookup
|
710
710
|
GGML_API struct ggml_tensor * ggml_get_first_tensor(struct ggml_context * ctx);
|
711
711
|
GGML_API struct ggml_tensor * ggml_get_next_tensor (struct ggml_context * ctx, struct ggml_tensor * tensor);
|
712
|
-
GGML_API struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name);
|
712
|
+
GGML_API struct ggml_tensor * ggml_get_tensor (struct ggml_context * ctx, const char * name);
|
713
713
|
|
714
714
|
GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor);
|
715
715
|
GGML_API struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value);
|
@@ -1326,8 +1326,13 @@ extern "C" {
|
|
1326
1326
|
int n_dims,
|
1327
1327
|
int mode,
|
1328
1328
|
int n_ctx,
|
1329
|
+
int n_orig_ctx,
|
1329
1330
|
float freq_base,
|
1330
|
-
float freq_scale);
|
1331
|
+
float freq_scale,
|
1332
|
+
float ext_factor,
|
1333
|
+
float attn_factor,
|
1334
|
+
float beta_fast,
|
1335
|
+
float beta_slow);
|
1331
1336
|
|
1332
1337
|
// in-place, returns view(a)
|
1333
1338
|
GGML_API struct ggml_tensor * ggml_rope_custom_inplace(
|
@@ -1337,8 +1342,17 @@ extern "C" {
|
|
1337
1342
|
int n_dims,
|
1338
1343
|
int mode,
|
1339
1344
|
int n_ctx,
|
1345
|
+
int n_orig_ctx,
|
1340
1346
|
float freq_base,
|
1341
|
-
float freq_scale);
|
1347
|
+
float freq_scale,
|
1348
|
+
float ext_factor,
|
1349
|
+
float attn_factor,
|
1350
|
+
float beta_fast,
|
1351
|
+
float beta_slow);
|
1352
|
+
|
1353
|
+
// compute correction dims for YaRN RoPE scaling
|
1354
|
+
void ggml_rope_yarn_corr_dims(
|
1355
|
+
int n_dims, int n_orig_ctx, float freq_base, float beta_fast, float beta_slow, float dims[2]);
|
1342
1356
|
|
1343
1357
|
// xPos RoPE, in-place, returns view(a)
|
1344
1358
|
GGML_API struct ggml_tensor * ggml_rope_xpos_inplace(
|
@@ -1930,12 +1944,19 @@ extern "C" {
|
|
1930
1944
|
// quantization
|
1931
1945
|
//
|
1932
1946
|
|
1947
|
+
// TODO: these would probably get removed in favor of the more general ggml_quantize_chunk
|
1933
1948
|
GGML_API size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist);
|
1934
1949
|
GGML_API size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist);
|
1935
1950
|
GGML_API size_t ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t * hist);
|
1936
1951
|
GGML_API size_t ggml_quantize_q5_1(const float * src, void * dst, int n, int k, int64_t * hist);
|
1937
1952
|
GGML_API size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t * hist);
|
1938
1953
|
|
1954
|
+
GGML_API size_t ggml_quantize_q2_K(const float * src, void * dst, int n, int k, int64_t * hist);
|
1955
|
+
GGML_API size_t ggml_quantize_q3_K(const float * src, void * dst, int n, int k, int64_t * hist);
|
1956
|
+
GGML_API size_t ggml_quantize_q4_K(const float * src, void * dst, int n, int k, int64_t * hist);
|
1957
|
+
GGML_API size_t ggml_quantize_q5_K(const float * src, void * dst, int n, int k, int64_t * hist);
|
1958
|
+
GGML_API size_t ggml_quantize_q6_K(const float * src, void * dst, int n, int k, int64_t * hist);
|
1959
|
+
|
1939
1960
|
GGML_API size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist);
|
1940
1961
|
|
1941
1962
|
//
|