llama_cpp 0.5.3 → 0.6.0
This diff shows the changes between two publicly released package versions as they appear in their respective public registries, and is provided for informational purposes only.
- checksums.yaml +4 -4
- data/CHANGELOG.md +9 -0
- data/README.md +6 -5
- data/examples/chat.rb +13 -13
- data/examples/embedding.rb +9 -9
- data/ext/llama_cpp/llama_cpp.cpp +547 -272
- data/ext/llama_cpp/src/ggml-alloc.c +8 -2
- data/ext/llama_cpp/src/ggml-alloc.h +1 -0
- data/ext/llama_cpp/src/ggml-cuda.cu +209 -82
- data/ext/llama_cpp/src/ggml-cuda.h +1 -0
- data/ext/llama_cpp/src/ggml-metal.h +4 -0
- data/ext/llama_cpp/src/ggml-metal.m +163 -84
- data/ext/llama_cpp/src/ggml-metal.metal +121 -38
- data/ext/llama_cpp/src/ggml.c +1596 -842
- data/ext/llama_cpp/src/ggml.h +116 -35
- data/ext/llama_cpp/src/llama.cpp +1015 -586
- data/ext/llama_cpp/src/llama.h +304 -119
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +5 -9
- data/sig/llama_cpp.rbs +65 -34
- metadata +3 -3
data/ext/llama_cpp/src/ggml.h CHANGED

```diff
@@ -214,8 +214,8 @@
 #define GGML_QNT_VERSION_FACTOR 1000 // do not change this
 
 #define GGML_MAX_DIMS 4
-#define GGML_MAX_NODES
-#define GGML_MAX_PARAMS
+#define GGML_MAX_NODES 16384
+#define GGML_MAX_PARAMS 1024
 #define GGML_MAX_CONTEXTS 64
 #define GGML_MAX_SRC 6
 #define GGML_MAX_NAME 64
@@ -248,6 +248,14 @@
     } \
 } while (0)
 
+#ifndef NDEBUG
+#define GGML_UNREACHABLE() GGML_ASSERT(!"statement should not be reached")
+#elif defined(__GNUC__)
+#define GGML_UNREACHABLE() __builtin_unreachable()
+#else
+#define GGML_UNREACHABLE() ((void) 0)
+#endif
+
 // used to copy the number of elements and stride in bytes of tensors into local variables.
 // main purpose is to reduce code duplication and improve readability.
 //
```
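The new `GGML_UNREACHABLE()` macro gives ggml a portable way to mark code paths that must never execute: it asserts in debug builds, expands to `__builtin_unreachable()` under GCC/Clang release builds, and is a no-op otherwise. A minimal, hedged sketch of the usual idiom; the function and its values are made up for illustration and are not part of the package:

```c
#include "ggml.h"

// Illustrative sketch only: mark an impossible default branch.
static const char * sign_name(int sign) {
    switch (sign) {
        case -1: return "negative";
        case  0: return "zero";
        case +1: return "positive";
        default: GGML_UNREACHABLE(); // callers must pass -1, 0 or +1
    }
}
```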
```diff
@@ -445,6 +453,12 @@ extern "C" {
     GGML_OBJECT_WORK_BUFFER
 };
 
+enum ggml_log_level {
+    GGML_LOG_LEVEL_ERROR = 2,
+    GGML_LOG_LEVEL_WARN = 3,
+    GGML_LOG_LEVEL_INFO = 4
+};
+
 // ggml object
 struct ggml_object {
     size_t offs;
@@ -467,8 +481,8 @@ extern "C" {
     int n_dims;
     int64_t ne[GGML_MAX_DIMS]; // number of elements
     size_t nb[GGML_MAX_DIMS]; // stride in bytes:
-    // nb[0] =
-    // nb[1] = nb[0] * ne[0] + padding
+    // nb[0] = ggml_type_size(type)
+    // nb[1] = nb[0] * (ne[0] / ggml_blck_size(type)) + padding
     // nb[i] = nb[i-1] * ne[i-1]
 
     // compute data
@@ -520,7 +534,15 @@ extern "C" {
 // next prime after GGML_MAX_NODES
 // #define GGML_GRAPH_HASHTABLE_SIZE 4099
 // next prime after GGML_MAX_NODES * 2 (nodes + leafs)
-#define GGML_GRAPH_HASHTABLE_SIZE 8273
+// #define GGML_GRAPH_HASHTABLE_SIZE 8273
+// #define GGML_GRAPH_HASHTABLE_SIZE 16411
+#define GGML_GRAPH_HASHTABLE_SIZE 32771
+
+enum ggml_cgraph_eval_order {
+    GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT = 0,
+    GGML_CGRAPH_EVAL_ORDER_RIGHT_TO_LEFT,
+    GGML_CGRAPH_EVAL_ORDER_COUNT
+};
 
 // computation graph
 struct ggml_cgraph {
@@ -533,6 +555,8 @@ extern "C" {
 
     void * visited_hash_table[GGML_GRAPH_HASHTABLE_SIZE];
 
+    enum ggml_cgraph_eval_order order;
+
     // performance
     int perf_runs;
     int64_t perf_cycles;
@@ -680,12 +704,21 @@ extern "C" {
 GGML_API struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value);
 GGML_API struct ggml_tensor * ggml_set_f32 (struct ggml_tensor * tensor, float value);
 
+// Converts a flat index into coordinates
+GGML_API void ggml_unravel_index(const struct ggml_tensor * tensor, int64_t i, int64_t * i0, int64_t * i1, int64_t * i2, int64_t * i3);
+
 GGML_API int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i);
 GGML_API void ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value);
 
+GGML_API int32_t ggml_get_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3);
+GGML_API void ggml_set_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, int32_t value);
+
 GGML_API float ggml_get_f32_1d(const struct ggml_tensor * tensor, int i);
 GGML_API void ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value);
 
+GGML_API float ggml_get_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3);
+GGML_API void ggml_set_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, float value);
+
 GGML_API void * ggml_get_data (const struct ggml_tensor * tensor);
 GGML_API float * ggml_get_data_f32(const struct ggml_tensor * tensor);
 
```
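The new `*_nd` accessors read and write a single element by explicit (i0, i1, i2, i3) coordinates instead of a flat index, and `ggml_unravel_index` converts a flat index into those coordinates. A hedged sketch, assuming `ctx` is an already-initialized context that allocates tensor data; shapes and values are illustrative:

```c
#include "ggml.h"

// Illustrative sketch: fill a 2-D F32 tensor and read it back with the new
// multi-dimensional accessors (assumes ctx was created with no_alloc == false).
void demo_nd_accessors(struct ggml_context * ctx) {
    struct ggml_tensor * t = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 4, 3); // ne0 = 4, ne1 = 3

    for (int i1 = 0; i1 < 3; ++i1) {
        for (int i0 = 0; i0 < 4; ++i0) {
            ggml_set_f32_nd(t, i0, i1, 0, 0, (float)(i1 * 4 + i0));
        }
    }

    // Convert the flat index 7 back into coordinates (expected: i0 = 3, i1 = 1).
    int64_t i0, i1, i2, i3;
    ggml_unravel_index(t, 7, &i0, &i1, &i2, &i3);

    float v = ggml_get_f32_nd(t, (int)i0, (int)i1, (int)i2, (int)i3); // 7.0f
    (void) v;
}
```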
```diff
@@ -719,6 +752,12 @@ extern "C" {
         struct ggml_tensor * a,
         struct ggml_tensor * b);
 
+GGML_API struct ggml_tensor * ggml_add_cast(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        struct ggml_tensor * b,
+        enum ggml_type type);
+
 GGML_API struct ggml_tensor * ggml_add1(
         struct ggml_context * ctx,
         struct ggml_tensor * a,
```
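`ggml_add_cast` behaves like `ggml_add` but lets the caller choose the type of the result, which is useful when mixing reduced-precision weights with F32 updates. A hedged one-call sketch; the tensor names are made up and the exact casting semantics should be checked against ggml.c:

```c
#include "ggml.h"

// Illustrative sketch: add an F16 weight tensor and an F32 delta tensor,
// requesting an F32 result node via the new ggml_add_cast().
struct ggml_tensor * add_mixed_precision(struct ggml_context * ctx,
                                         struct ggml_tensor  * w_f16,
                                         struct ggml_tensor  * delta_f32) {
    return ggml_add_cast(ctx, w_f16, delta_f32, GGML_TYPE_F32);
}
```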
```diff
@@ -828,6 +867,7 @@ extern "C" {
         struct ggml_tensor * a,
         struct ggml_tensor * b);
 
+// sums repetitions in a into shape of b
 GGML_API struct ggml_tensor * ggml_repeat_back(
         struct ggml_context * ctx,
         struct ggml_tensor * a,
@@ -1049,7 +1089,6 @@ extern "C" {
         size_t nb1,
         size_t offset);
 
-
 // a -> b, return view(b)
 GGML_API struct ggml_tensor * ggml_cpy(
         struct ggml_context * ctx,
@@ -1072,6 +1111,33 @@ extern "C" {
         struct ggml_context * ctx,
         struct ggml_tensor * a);
 
+// make contiguous, with new shape
+GGML_API struct ggml_tensor * ggml_cont_1d(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        int64_t ne0);
+
+GGML_API struct ggml_tensor * ggml_cont_2d(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        int64_t ne0,
+        int64_t ne1);
+
+GGML_API struct ggml_tensor * ggml_cont_3d(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        int64_t ne0,
+        int64_t ne1,
+        int64_t ne2);
+
+GGML_API struct ggml_tensor * ggml_cont_4d(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        int64_t ne0,
+        int64_t ne1,
+        int64_t ne2,
+        int64_t ne3);
+
 // return view(a), b specifies the new shape
 // TODO: when we start computing gradient, make a copy instead of view
 GGML_API struct ggml_tensor * ggml_reshape(
```
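`ggml_cont_1d` through `ggml_cont_4d` combine `ggml_cont` with a reshape: they produce a contiguous tensor with the requested shape in one node. A hedged sketch of a common pattern, assuming `x` is a 2-D tensor; the helper name is illustrative:

```c
#include "ggml.h"

// Illustrative sketch: make a transposed 2-D view contiguous with an explicit
// shape in one call, instead of ggml_cont() followed by ggml_reshape_2d().
struct ggml_tensor * transpose_contiguous(struct ggml_context * ctx,
                                          struct ggml_tensor  * x /* assumed 2-D */) {
    struct ggml_tensor * xt = ggml_transpose(ctx, x);      // non-contiguous view with swapped dims
    return ggml_cont_2d(ctx, xt, xt->ne[0], xt->ne[1]);    // contiguous copy, same logical shape
}
```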
```diff
@@ -1219,14 +1285,15 @@ extern "C" {
         struct ggml_tensor * b);
 
 // rotary position embedding
-// if mode & 1 == 1, skip n_past elements
+// if mode & 1 == 1, skip n_past elements (DEPRECATED)
 // if mode & 2 == 1, GPT-NeoX style
 // if mode & 4 == 1, ChatGLM style
-//
+//
+// b is an int32 vector with size a->ne[2], it contains the positions
 GGML_API struct ggml_tensor * ggml_rope(
         struct ggml_context * ctx,
         struct ggml_tensor * a,
-
+        struct ggml_tensor * b,
         int n_dims,
         int mode,
         int n_ctx);
@@ -1235,7 +1302,7 @@ extern "C" {
 GGML_API struct ggml_tensor * ggml_rope_inplace(
         struct ggml_context * ctx,
         struct ggml_tensor * a,
-
+        struct ggml_tensor * b,
         int n_dims,
         int mode,
         int n_ctx);
@@ -1244,7 +1311,7 @@ extern "C" {
 GGML_API struct ggml_tensor * ggml_rope_custom(
         struct ggml_context * ctx,
         struct ggml_tensor * a,
-
+        struct ggml_tensor * b,
         int n_dims,
         int mode,
         int n_ctx,
@@ -1255,7 +1322,7 @@ extern "C" {
 GGML_API struct ggml_tensor * ggml_rope_custom_inplace(
         struct ggml_context * ctx,
         struct ggml_tensor * a,
-
+        struct ggml_tensor * b,
         int n_dims,
         int mode,
         int n_ctx,
@@ -1266,7 +1333,7 @@ extern "C" {
 GGML_API struct ggml_tensor * ggml_rope_xpos_inplace(
         struct ggml_context * ctx,
         struct ggml_tensor * a,
-
+        struct ggml_tensor * b,
         int n_dims,
         float base,
         bool down);
@@ -1276,7 +1343,7 @@ extern "C" {
 GGML_API struct ggml_tensor * ggml_rope_back(
         struct ggml_context * ctx,
         struct ggml_tensor * a,
-
+        struct ggml_tensor * b,
         int n_dims,
         int mode,
         int n_ctx,
```
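All of the rope variants now take a tensor `b` of token positions instead of an `n_past` scalar: per the header comment, `b` is an int32 vector with `a->ne[2]` entries, one position per token. A hedged sketch, assuming `ctx` allocates tensor data and `q` has layout [head_dim, n_head, n_tokens]; the names and the choice of `mode` are illustrative:

```c
#include "ggml.h"

// Illustrative sketch: build the positions tensor explicitly and pass it to ggml_rope.
struct ggml_tensor * rope_with_positions(struct ggml_context * ctx,
                                         struct ggml_tensor  * q,      // [head_dim, n_head, n_tokens]
                                         int n_past, int n_ctx) {
    const int64_t n_tokens = q->ne[2];

    // b: int32 vector with one position per token (size a->ne[2]).
    struct ggml_tensor * pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_tokens);
    for (int64_t i = 0; i < n_tokens; ++i) {
        ggml_set_i32_1d(pos, (int)i, n_past + (int)i);
    }

    const int n_dims = (int) q->ne[0]; // rotate the full head dimension
    const int mode   = 0;              // original RoPE (not NeoX/ChatGLM)
    return ggml_rope(ctx, q, pos, n_dims, mode, n_ctx);
}
```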
```diff
@@ -1656,6 +1723,16 @@ extern "C" {
 // dump the graph into a file using the dot format
 GGML_API void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph * gf, const char * filename);
 
+// build gradient checkpointing backward graph gb for gf using provided checkpoints
+// gb_tmp will contain original backward graph with rewritten backward process nodes,
+// but without the second forward pass nodes.
+GGML_API void ggml_build_backward_gradient_checkpointing(
+        struct ggml_context * ctx,
+        struct ggml_cgraph * gf,
+        struct ggml_cgraph * gb,
+        struct ggml_cgraph * gb_tmp,
+        struct ggml_tensor * * checkpoints,
+        int n_checkpoints);
 //
 // optimization
 //
```
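Only the declaration above comes from the header; a hedged sketch of how it might be wired up, where the forward graph, the two backward graphs, and the checkpoint tensors (typically per-layer inputs) are all assumed to be created elsewhere:

```c
#include "ggml.h"

// Illustrative wrapper only: rewrite the backward pass so that activations
// between `checkpoints` are recomputed instead of stored.
void build_checkpointed_backward(struct ggml_context * ctx,
                                 struct ggml_cgraph  * gf,       // existing forward graph
                                 struct ggml_cgraph  * gb,       // receives the checkpointed backward graph
                                 struct ggml_cgraph  * gb_tmp,   // scratch backward graph
                                 struct ggml_tensor ** checkpoints,
                                 int                   n_checkpoints) {
    ggml_build_backward_gradient_checkpointing(ctx, gf, gb, gb_tmp, checkpoints, n_checkpoints);
}
```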
```diff
@@ -1690,7 +1767,8 @@ extern "C" {
     GGML_LINESEARCH_INVALID_PARAMETERS,
 };
 
-typedef void (*ggml_opt_callback)(void * data, float * sched);
+typedef void (*ggml_opt_callback)(void * data, int accum_step, float * sched, bool * cancel);
+typedef void (*ggml_log_callback)(enum ggml_log_level level, const char * text, void * user_data);
 
 // optimization parameters
 //
```
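The optimizer callback gains a gradient-accumulation step index and a cancel flag, and there is a new log-callback type built on the `ggml_log_level` enum added above. A sketch of functions matching the two typedefs; only the signatures come from the header, the bodies are illustrative:

```c
#include <stdbool.h>
#include <stdio.h>
#include "ggml.h"

// Illustrative optimizer callback: adjust the schedule once per optimizer step
// and leave the cancel flag unset.
static void my_opt_callback(void * data, int accum_step, float * sched, bool * cancel) {
    (void) data;
    if (accum_step == 0) {
        *sched = 1.0f;    // learning-rate schedule multiplier for this step
    }
    *cancel = false;      // set to true to stop the optimizer early
}

// Illustrative log callback: route errors/warnings to stderr, the rest to stdout.
static void my_log_callback(enum ggml_log_level level, const char * text, void * user_data) {
    (void) user_data;
    if (level <= GGML_LOG_LEVEL_WARN) {
        fputs(text, stderr);
    } else {
        fputs(text, stdout);
    }
}
```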
```diff
@@ -1721,6 +1799,8 @@ extern "C" {
     bool print_forward_graph;
     bool print_backward_graph;
 
+    int n_gradient_accumulation;
+
     // ADAM parameters
     struct {
         int n_iter;
@@ -1766,6 +1846,7 @@ extern "C" {
     float loss_after;
 
     struct {
+        struct ggml_tensor * g; // current gradient
         struct ggml_tensor * m; // first moment
        struct ggml_tensor * v; // second moment
         struct ggml_tensor * pf; // past function values
@@ -1882,26 +1963,26 @@ extern "C" {
 
 GGML_API int gguf_get_n_kv(const struct gguf_context * ctx);
 GGML_API int gguf_find_key(const struct gguf_context * ctx, const char * key);
-GGML_API const char * gguf_get_key (const struct gguf_context * ctx, int
-
-GGML_API enum gguf_type gguf_get_kv_type (const struct gguf_context * ctx, int
-GGML_API enum gguf_type gguf_get_arr_type(const struct gguf_context * ctx, int
-
-//
-GGML_API uint8_t gguf_get_val_u8 (const struct gguf_context * ctx, int
-GGML_API int8_t gguf_get_val_i8 (const struct gguf_context * ctx, int
-GGML_API uint16_t gguf_get_val_u16 (const struct gguf_context * ctx, int
-GGML_API int16_t gguf_get_val_i16 (const struct gguf_context * ctx, int
-GGML_API uint32_t gguf_get_val_u32 (const struct gguf_context * ctx, int
-GGML_API int32_t gguf_get_val_i32 (const struct gguf_context * ctx, int
-GGML_API float gguf_get_val_f32 (const struct gguf_context * ctx, int
-GGML_API uint64_t gguf_get_val_u64 (const struct gguf_context * ctx, int
-GGML_API int64_t gguf_get_val_i64 (const struct gguf_context * ctx, int
-GGML_API double gguf_get_val_f64 (const struct gguf_context * ctx, int
-GGML_API bool gguf_get_val_bool(const struct gguf_context * ctx, int
-GGML_API const char * gguf_get_val_str (const struct gguf_context * ctx, int
-GGML_API int gguf_get_arr_n (const struct gguf_context * ctx, int
-GGML_API const void * gguf_get_arr_data(const struct gguf_context * ctx, int
+GGML_API const char * gguf_get_key (const struct gguf_context * ctx, int key_id);
+
+GGML_API enum gguf_type gguf_get_kv_type (const struct gguf_context * ctx, int key_id);
+GGML_API enum gguf_type gguf_get_arr_type(const struct gguf_context * ctx, int key_id);
+
+// will abort if the wrong type is used for the key
+GGML_API uint8_t gguf_get_val_u8 (const struct gguf_context * ctx, int key_id);
+GGML_API int8_t gguf_get_val_i8 (const struct gguf_context * ctx, int key_id);
+GGML_API uint16_t gguf_get_val_u16 (const struct gguf_context * ctx, int key_id);
+GGML_API int16_t gguf_get_val_i16 (const struct gguf_context * ctx, int key_id);
+GGML_API uint32_t gguf_get_val_u32 (const struct gguf_context * ctx, int key_id);
+GGML_API int32_t gguf_get_val_i32 (const struct gguf_context * ctx, int key_id);
+GGML_API float gguf_get_val_f32 (const struct gguf_context * ctx, int key_id);
+GGML_API uint64_t gguf_get_val_u64 (const struct gguf_context * ctx, int key_id);
+GGML_API int64_t gguf_get_val_i64 (const struct gguf_context * ctx, int key_id);
+GGML_API double gguf_get_val_f64 (const struct gguf_context * ctx, int key_id);
+GGML_API bool gguf_get_val_bool(const struct gguf_context * ctx, int key_id);
+GGML_API const char * gguf_get_val_str (const struct gguf_context * ctx, int key_id);
+GGML_API int gguf_get_arr_n (const struct gguf_context * ctx, int key_id);
+GGML_API const void * gguf_get_arr_data(const struct gguf_context * ctx, int key_id);
 GGML_API const char * gguf_get_arr_str (const struct gguf_context * ctx, int key_id, int i);
 
 GGML_API int gguf_get_n_tensors (const struct gguf_context * ctx);
```
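The gguf getters now take a `key_id` and abort if called with the wrong type, so callers should check the key's type before using a typed getter. A hedged sketch, assuming a `gguf_context` obtained elsewhere; the key name "general.name" is a common GGUF key used only as an example:

```c
#include <stdio.h>
#include "ggml.h"

// Illustrative sketch: look up a key by name, verify its type, then read it,
// since the typed getters abort on a type mismatch.
void print_model_name(const struct gguf_context * ctx) {
    const int key_id = gguf_find_key(ctx, "general.name");
    if (key_id < 0) {
        return; // key not present
    }
    if (gguf_get_kv_type(ctx, key_id) == GGUF_TYPE_STRING) {
        printf("model name: %s\n", gguf_get_val_str(ctx, key_id));
    }
}
```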