llama_cpp 0.8.0 → 0.9.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +19 -0
- data/examples/chat.rb +8 -6
- data/ext/llama_cpp/extconf.rb +3 -11
- data/ext/llama_cpp/llama_cpp.cpp +228 -165
- data/ext/llama_cpp/src/ggml-cuda.cu +441 -77
- data/ext/llama_cpp/src/ggml-impl.h +237 -0
- data/ext/llama_cpp/src/ggml-metal.m +71 -42
- data/ext/llama_cpp/src/ggml-metal.metal +171 -35
- data/ext/llama_cpp/src/ggml-opencl.cpp +161 -169
- data/ext/llama_cpp/src/{k_quants.c → ggml-quants.c} +3329 -1099
- data/ext/llama_cpp/src/{k_quants.h → ggml-quants.h} +81 -22
- data/ext/llama_cpp/src/ggml.c +1303 -3419
- data/ext/llama_cpp/src/ggml.h +33 -11
- data/ext/llama_cpp/src/llama.cpp +1925 -2655
- data/ext/llama_cpp/src/llama.h +48 -33
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +4 -4
- data/sig/llama_cpp.rbs +34 -14
- metadata +5 -4
data/ext/llama_cpp/src/ggml.h
CHANGED
@@ -219,7 +219,7 @@
 #define GGML_MAX_CONTEXTS      64
 #define GGML_MAX_SRC           6
 #define GGML_MAX_NAME          64
-#define GGML_MAX_OP_PARAMS     32
+#define GGML_MAX_OP_PARAMS     64
 #define GGML_DEFAULT_N_THREADS 4

 #if UINTPTR_MAX == 0xFFFFFFFF
@@ -401,15 +401,16 @@ extern "C" {
         GGML_OP_ALIBI,
         GGML_OP_CLAMP,
         GGML_OP_CONV_1D,
-        GGML_OP_CONV_2D,
+        GGML_OP_CONV_1D_STAGE_0,  // internal
+        GGML_OP_CONV_1D_STAGE_1,  // internal
         GGML_OP_CONV_TRANSPOSE_1D,
+        GGML_OP_CONV_2D,
+        GGML_OP_CONV_2D_STAGE_0,  // internal
+        GGML_OP_CONV_2D_STAGE_1,  // internal
         GGML_OP_CONV_TRANSPOSE_2D,
         GGML_OP_POOL_1D,
         GGML_OP_POOL_2D,

-        GGML_OP_CONV_1D_STAGE_0,  // internal
-        GGML_OP_CONV_1D_STAGE_1,  // internal
-
         GGML_OP_UPSCALE,  // nearest interpolate

         GGML_OP_FLASH_ATTN,
@@ -708,7 +709,7 @@ extern "C" {
     // Context tensor enumeration and lookup
     GGML_API struct ggml_tensor * ggml_get_first_tensor(struct ggml_context * ctx);
     GGML_API struct ggml_tensor * ggml_get_next_tensor (struct ggml_context * ctx, struct ggml_tensor * tensor);
-    GGML_API struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name);
+    GGML_API struct ggml_tensor * ggml_get_tensor (struct ggml_context * ctx, const char * name);

     GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor);
     GGML_API struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value);
@@ -1020,9 +1021,9 @@ extern "C" {
             struct ggml_tensor  * b,
             float                 eps);

-    // A: n columns, m rows
-    // B: n columns, p rows (i.e. we transpose it internally)
-    // result is m columns, p rows
+    // A: k columns, n rows => [ne03, ne02, n, k]
+    // B: k columns, m rows (i.e. we transpose it internally) => [ne03 * x, ne02 * y, m, k]
+    // result is n columns, m rows => [ne03 * x, ne02 * y, m, n]
     GGML_API struct ggml_tensor * ggml_mul_mat(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
@@ -1325,8 +1326,13 @@ extern "C" {
             int                   n_dims,
             int                   mode,
             int                   n_ctx,
+            int                   n_orig_ctx,
             float                 freq_base,
-            float                 freq_scale);
+            float                 freq_scale,
+            float                 ext_factor,
+            float                 attn_factor,
+            float                 beta_fast,
+            float                 beta_slow);

     // in-place, returns view(a)
     GGML_API struct ggml_tensor * ggml_rope_custom_inplace(
@@ -1336,8 +1342,17 @@ extern "C" {
             int                   n_dims,
             int                   mode,
             int                   n_ctx,
+            int                   n_orig_ctx,
             float                 freq_base,
-            float                 freq_scale);
+            float                 freq_scale,
+            float                 ext_factor,
+            float                 attn_factor,
+            float                 beta_fast,
+            float                 beta_slow);
+
+    // compute correction dims for YaRN RoPE scaling
+    void ggml_rope_yarn_corr_dims(
+        int n_dims, int n_orig_ctx, float freq_base, float beta_fast, float beta_slow, float dims[2]);

     // xPos RoPE, in-place, returns view(a)
     GGML_API struct ggml_tensor * ggml_rope_xpos_inplace(
@@ -1929,12 +1944,19 @@ extern "C" {
     // quantization
     //

+    // TODO: these would probably get removed in favor of the more general ggml_quantize_chunk
     GGML_API size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist);
     GGML_API size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist);
     GGML_API size_t ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t * hist);
     GGML_API size_t ggml_quantize_q5_1(const float * src, void * dst, int n, int k, int64_t * hist);
     GGML_API size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t * hist);

+    GGML_API size_t ggml_quantize_q2_K(const float * src, void * dst, int n, int k, int64_t * hist);
+    GGML_API size_t ggml_quantize_q3_K(const float * src, void * dst, int n, int k, int64_t * hist);
+    GGML_API size_t ggml_quantize_q4_K(const float * src, void * dst, int n, int k, int64_t * hist);
+    GGML_API size_t ggml_quantize_q5_K(const float * src, void * dst, int n, int k, int64_t * hist);
+    GGML_API size_t ggml_quantize_q6_K(const float * src, void * dst, int n, int k, int64_t * hist);
+
     GGML_API size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist);

     //