llama_cpp 0.8.0 → 0.9.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +19 -0
- data/examples/chat.rb +8 -6
- data/ext/llama_cpp/extconf.rb +3 -11
- data/ext/llama_cpp/llama_cpp.cpp +228 -165
- data/ext/llama_cpp/src/ggml-cuda.cu +441 -77
- data/ext/llama_cpp/src/ggml-impl.h +237 -0
- data/ext/llama_cpp/src/ggml-metal.m +71 -42
- data/ext/llama_cpp/src/ggml-metal.metal +171 -35
- data/ext/llama_cpp/src/ggml-opencl.cpp +161 -169
- data/ext/llama_cpp/src/{k_quants.c → ggml-quants.c} +3329 -1099
- data/ext/llama_cpp/src/{k_quants.h → ggml-quants.h} +81 -22
- data/ext/llama_cpp/src/ggml.c +1303 -3419
- data/ext/llama_cpp/src/ggml.h +33 -11
- data/ext/llama_cpp/src/llama.cpp +1925 -2655
- data/ext/llama_cpp/src/llama.h +48 -33
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +4 -4
- data/sig/llama_cpp.rbs +34 -14
- metadata +5 -4
data/ext/llama_cpp/src/ggml.h
CHANGED
@@ -219,7 +219,7 @@
|
|
219
219
|
#define GGML_MAX_CONTEXTS 64
|
220
220
|
#define GGML_MAX_SRC 6
|
221
221
|
#define GGML_MAX_NAME 64
|
222
|
-
#define GGML_MAX_OP_PARAMS 32
|
222
|
+
#define GGML_MAX_OP_PARAMS 64
|
223
223
|
#define GGML_DEFAULT_N_THREADS 4
|
224
224
|
|
225
225
|
#if UINTPTR_MAX == 0xFFFFFFFF
|
@@ -401,15 +401,16 @@ extern "C" {
|
|
401
401
|
GGML_OP_ALIBI,
|
402
402
|
GGML_OP_CLAMP,
|
403
403
|
GGML_OP_CONV_1D,
|
404
|
-
|
404
|
+
GGML_OP_CONV_1D_STAGE_0, // internal
|
405
|
+
GGML_OP_CONV_1D_STAGE_1, // internal
|
405
406
|
GGML_OP_CONV_TRANSPOSE_1D,
|
407
|
+
GGML_OP_CONV_2D,
|
408
|
+
GGML_OP_CONV_2D_STAGE_0, // internal
|
409
|
+
GGML_OP_CONV_2D_STAGE_1, // internal
|
406
410
|
GGML_OP_CONV_TRANSPOSE_2D,
|
407
411
|
GGML_OP_POOL_1D,
|
408
412
|
GGML_OP_POOL_2D,
|
409
413
|
|
410
|
-
GGML_OP_CONV_1D_STAGE_0, // internal
|
411
|
-
GGML_OP_CONV_1D_STAGE_1, // internal
|
412
|
-
|
413
414
|
GGML_OP_UPSCALE, // nearest interpolate
|
414
415
|
|
415
416
|
GGML_OP_FLASH_ATTN,
|
@@ -708,7 +709,7 @@ extern "C" {
|
|
708
709
|
// Context tensor enumeration and lookup
|
709
710
|
GGML_API struct ggml_tensor * ggml_get_first_tensor(struct ggml_context * ctx);
|
710
711
|
GGML_API struct ggml_tensor * ggml_get_next_tensor (struct ggml_context * ctx, struct ggml_tensor * tensor);
|
711
|
-
GGML_API struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name);
|
712
|
+
GGML_API struct ggml_tensor * ggml_get_tensor (struct ggml_context * ctx, const char * name);
|
712
713
|
|
713
714
|
GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor);
|
714
715
|
GGML_API struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value);
|
@@ -1020,9 +1021,9 @@ extern "C" {
|
|
1020
1021
|
struct ggml_tensor * b,
|
1021
1022
|
float eps);
|
1022
1023
|
|
1023
|
-
// A:
|
1024
|
-
// B:
|
1025
|
-
// result is
|
1024
|
+
// A: k columns, n rows => [ne03, ne02, n, k]
|
1025
|
+
// B: k columns, m rows (i.e. we transpose it internally) => [ne03 * x, ne02 * y, m, k]
|
1026
|
+
// result is n columns, m rows => [ne03 * x, ne02 * y, m, n]
|
1026
1027
|
GGML_API struct ggml_tensor * ggml_mul_mat(
|
1027
1028
|
struct ggml_context * ctx,
|
1028
1029
|
struct ggml_tensor * a,
|
@@ -1325,8 +1326,13 @@ extern "C" {
|
|
1325
1326
|
int n_dims,
|
1326
1327
|
int mode,
|
1327
1328
|
int n_ctx,
|
1329
|
+
int n_orig_ctx,
|
1328
1330
|
float freq_base,
|
1329
|
-
float freq_scale);
|
1331
|
+
float freq_scale,
|
1332
|
+
float ext_factor,
|
1333
|
+
float attn_factor,
|
1334
|
+
float beta_fast,
|
1335
|
+
float beta_slow);
|
1330
1336
|
|
1331
1337
|
// in-place, returns view(a)
|
1332
1338
|
GGML_API struct ggml_tensor * ggml_rope_custom_inplace(
|
@@ -1336,8 +1342,17 @@ extern "C" {
|
|
1336
1342
|
int n_dims,
|
1337
1343
|
int mode,
|
1338
1344
|
int n_ctx,
|
1345
|
+
int n_orig_ctx,
|
1339
1346
|
float freq_base,
|
1340
|
-
float freq_scale);
|
1347
|
+
float freq_scale,
|
1348
|
+
float ext_factor,
|
1349
|
+
float attn_factor,
|
1350
|
+
float beta_fast,
|
1351
|
+
float beta_slow);
|
1352
|
+
|
1353
|
+
// compute correction dims for YaRN RoPE scaling
|
1354
|
+
void ggml_rope_yarn_corr_dims(
|
1355
|
+
int n_dims, int n_orig_ctx, float freq_base, float beta_fast, float beta_slow, float dims[2]);
|
1341
1356
|
|
1342
1357
|
// xPos RoPE, in-place, returns view(a)
|
1343
1358
|
GGML_API struct ggml_tensor * ggml_rope_xpos_inplace(
|
@@ -1929,12 +1944,19 @@ extern "C" {
|
|
1929
1944
|
// quantization
|
1930
1945
|
//
|
1931
1946
|
|
1947
|
+
// TODO: these would probably get removed in favor of the more general ggml_quantize_chunk
|
1932
1948
|
GGML_API size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist);
|
1933
1949
|
GGML_API size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist);
|
1934
1950
|
GGML_API size_t ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t * hist);
|
1935
1951
|
GGML_API size_t ggml_quantize_q5_1(const float * src, void * dst, int n, int k, int64_t * hist);
|
1936
1952
|
GGML_API size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t * hist);
|
1937
1953
|
|
1954
|
+
GGML_API size_t ggml_quantize_q2_K(const float * src, void * dst, int n, int k, int64_t * hist);
|
1955
|
+
GGML_API size_t ggml_quantize_q3_K(const float * src, void * dst, int n, int k, int64_t * hist);
|
1956
|
+
GGML_API size_t ggml_quantize_q4_K(const float * src, void * dst, int n, int k, int64_t * hist);
|
1957
|
+
GGML_API size_t ggml_quantize_q5_K(const float * src, void * dst, int n, int k, int64_t * hist);
|
1958
|
+
GGML_API size_t ggml_quantize_q6_K(const float * src, void * dst, int n, int k, int64_t * hist);
|
1959
|
+
|
1938
1960
|
GGML_API size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist);
|
1939
1961
|
|
1940
1962
|
//
|