llama_cpp 0.9.0 → 0.9.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +9 -0
- data/ext/llama_cpp/extconf.rb +3 -11
- data/ext/llama_cpp/llama_cpp.cpp +147 -3
- data/ext/llama_cpp/src/ggml-cuda.cu +288 -92
- data/ext/llama_cpp/src/ggml-impl.h +237 -0
- data/ext/llama_cpp/src/ggml-metal.m +58 -37
- data/ext/llama_cpp/src/ggml-metal.metal +162 -34
- data/ext/llama_cpp/src/{k_quants.c → ggml-quants.c} +3329 -1099
- data/ext/llama_cpp/src/{k_quants.h → ggml-quants.h} +81 -22
- data/ext/llama_cpp/src/ggml.c +939 -3333
- data/ext/llama_cpp/src/ggml.h +25 -4
- data/ext/llama_cpp/src/llama.cpp +1819 -2554
- data/ext/llama_cpp/src/llama.h +32 -12
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +23 -2
- metadata +5 -4
data/ext/llama_cpp/src/ggml.h
CHANGED
@@ -219,7 +219,7 @@
|
|
219
219
|
#define GGML_MAX_CONTEXTS 64
|
220
220
|
#define GGML_MAX_SRC 6
|
221
221
|
#define GGML_MAX_NAME 64
|
222
|
-
#define GGML_MAX_OP_PARAMS 32
|
222
|
+
#define GGML_MAX_OP_PARAMS 64
|
223
223
|
#define GGML_DEFAULT_N_THREADS 4
|
224
224
|
|
225
225
|
#if UINTPTR_MAX == 0xFFFFFFFF
|
@@ -709,7 +709,7 @@ extern "C" {
|
|
709
709
|
// Context tensor enumeration and lookup
|
710
710
|
GGML_API struct ggml_tensor * ggml_get_first_tensor(struct ggml_context * ctx);
|
711
711
|
GGML_API struct ggml_tensor * ggml_get_next_tensor (struct ggml_context * ctx, struct ggml_tensor * tensor);
|
712
|
-
GGML_API struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name);
|
712
|
+
GGML_API struct ggml_tensor * ggml_get_tensor (struct ggml_context * ctx, const char * name);
|
713
713
|
|
714
714
|
GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor);
|
715
715
|
GGML_API struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value);
|
@@ -1326,8 +1326,13 @@ extern "C" {
|
|
1326
1326
|
int n_dims,
|
1327
1327
|
int mode,
|
1328
1328
|
int n_ctx,
|
1329
|
+
int n_orig_ctx,
|
1329
1330
|
float freq_base,
|
1330
|
-
float freq_scale);
|
1331
|
+
float freq_scale,
|
1332
|
+
float ext_factor,
|
1333
|
+
float attn_factor,
|
1334
|
+
float beta_fast,
|
1335
|
+
float beta_slow);
|
1331
1336
|
|
1332
1337
|
// in-place, returns view(a)
|
1333
1338
|
GGML_API struct ggml_tensor * ggml_rope_custom_inplace(
|
@@ -1337,8 +1342,17 @@ extern "C" {
|
|
1337
1342
|
int n_dims,
|
1338
1343
|
int mode,
|
1339
1344
|
int n_ctx,
|
1345
|
+
int n_orig_ctx,
|
1340
1346
|
float freq_base,
|
1341
|
-
float freq_scale);
|
1347
|
+
float freq_scale,
|
1348
|
+
float ext_factor,
|
1349
|
+
float attn_factor,
|
1350
|
+
float beta_fast,
|
1351
|
+
float beta_slow);
|
1352
|
+
|
1353
|
+
// compute correction dims for YaRN RoPE scaling
|
1354
|
+
void ggml_rope_yarn_corr_dims(
|
1355
|
+
int n_dims, int n_orig_ctx, float freq_base, float beta_fast, float beta_slow, float dims[2]);
|
1342
1356
|
|
1343
1357
|
// xPos RoPE, in-place, returns view(a)
|
1344
1358
|
GGML_API struct ggml_tensor * ggml_rope_xpos_inplace(
|
@@ -1930,12 +1944,19 @@ extern "C" {
|
|
1930
1944
|
// quantization
|
1931
1945
|
//
|
1932
1946
|
|
1947
|
+
// TODO: these would probably get removed in favor of the more general ggml_quantize_chunk
|
1933
1948
|
GGML_API size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist);
|
1934
1949
|
GGML_API size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist);
|
1935
1950
|
GGML_API size_t ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t * hist);
|
1936
1951
|
GGML_API size_t ggml_quantize_q5_1(const float * src, void * dst, int n, int k, int64_t * hist);
|
1937
1952
|
GGML_API size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t * hist);
|
1938
1953
|
|
1954
|
+
GGML_API size_t ggml_quantize_q2_K(const float * src, void * dst, int n, int k, int64_t * hist);
|
1955
|
+
GGML_API size_t ggml_quantize_q3_K(const float * src, void * dst, int n, int k, int64_t * hist);
|
1956
|
+
GGML_API size_t ggml_quantize_q4_K(const float * src, void * dst, int n, int k, int64_t * hist);
|
1957
|
+
GGML_API size_t ggml_quantize_q5_K(const float * src, void * dst, int n, int k, int64_t * hist);
|
1958
|
+
GGML_API size_t ggml_quantize_q6_K(const float * src, void * dst, int n, int k, int64_t * hist);
|
1959
|
+
|
1939
1960
|
GGML_API size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist);
|
1940
1961
|
|
1941
1962
|
//
|