llama_cpp 0.5.3 → 0.6.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +9 -0
- data/README.md +6 -5
- data/examples/chat.rb +13 -13
- data/examples/embedding.rb +9 -9
- data/ext/llama_cpp/llama_cpp.cpp +547 -272
- data/ext/llama_cpp/src/ggml-alloc.c +8 -2
- data/ext/llama_cpp/src/ggml-alloc.h +1 -0
- data/ext/llama_cpp/src/ggml-cuda.cu +209 -82
- data/ext/llama_cpp/src/ggml-cuda.h +1 -0
- data/ext/llama_cpp/src/ggml-metal.h +4 -0
- data/ext/llama_cpp/src/ggml-metal.m +163 -84
- data/ext/llama_cpp/src/ggml-metal.metal +121 -38
- data/ext/llama_cpp/src/ggml.c +1596 -842
- data/ext/llama_cpp/src/ggml.h +116 -35
- data/ext/llama_cpp/src/llama.cpp +1015 -586
- data/ext/llama_cpp/src/llama.h +304 -119
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +5 -9
- data/sig/llama_cpp.rbs +65 -34
- metadata +3 -3
data/ext/llama_cpp/src/ggml.h
CHANGED
@@ -214,8 +214,8 @@
|
|
214
214
|
#define GGML_QNT_VERSION_FACTOR 1000 // do not change this
|
215
215
|
|
216
216
|
#define GGML_MAX_DIMS 4
|
217
|
-
#define GGML_MAX_NODES 4096
|
218
|
-
#define GGML_MAX_PARAMS 256
|
217
|
+
#define GGML_MAX_NODES 16384
|
218
|
+
#define GGML_MAX_PARAMS 1024
|
219
219
|
#define GGML_MAX_CONTEXTS 64
|
220
220
|
#define GGML_MAX_SRC 6
|
221
221
|
#define GGML_MAX_NAME 64
|
@@ -248,6 +248,14 @@
|
|
248
248
|
} \
|
249
249
|
} while (0)
|
250
250
|
|
251
|
+
#ifndef NDEBUG
|
252
|
+
#define GGML_UNREACHABLE() GGML_ASSERT(!"statement should not be reached")
|
253
|
+
#elif defined(__GNUC__)
|
254
|
+
#define GGML_UNREACHABLE() __builtin_unreachable()
|
255
|
+
#else
|
256
|
+
#define GGML_UNREACHABLE() ((void) 0)
|
257
|
+
#endif
|
258
|
+
|
251
259
|
// used to copy the number of elements and stride in bytes of tensors into local variables.
|
252
260
|
// main purpose is to reduce code duplication and improve readability.
|
253
261
|
//
|
@@ -445,6 +453,12 @@ extern "C" {
|
|
445
453
|
GGML_OBJECT_WORK_BUFFER
|
446
454
|
};
|
447
455
|
|
456
|
+
enum ggml_log_level {
|
457
|
+
GGML_LOG_LEVEL_ERROR = 2,
|
458
|
+
GGML_LOG_LEVEL_WARN = 3,
|
459
|
+
GGML_LOG_LEVEL_INFO = 4
|
460
|
+
};
|
461
|
+
|
448
462
|
// ggml object
|
449
463
|
struct ggml_object {
|
450
464
|
size_t offs;
|
@@ -467,8 +481,8 @@ extern "C" {
|
|
467
481
|
int n_dims;
|
468
482
|
int64_t ne[GGML_MAX_DIMS]; // number of elements
|
469
483
|
size_t nb[GGML_MAX_DIMS]; // stride in bytes:
|
470
|
-
// nb[0] = sizeof(type)
|
471
|
-
// nb[1] = nb[0] * ne[0] + padding
|
484
|
+
// nb[0] = ggml_type_size(type)
|
485
|
+
// nb[1] = nb[0] * (ne[0] / ggml_blck_size(type)) + padding
|
472
486
|
// nb[i] = nb[i-1] * ne[i-1]
|
473
487
|
|
474
488
|
// compute data
|
@@ -520,7 +534,15 @@ extern "C" {
|
|
520
534
|
// next prime after GGML_MAX_NODES
|
521
535
|
// #define GGML_GRAPH_HASHTABLE_SIZE 4099
|
522
536
|
// next prime after GGML_MAX_NODES * 2 (nodes + leafs)
|
523
|
-
#define GGML_GRAPH_HASHTABLE_SIZE 8273
|
537
|
+
// #define GGML_GRAPH_HASHTABLE_SIZE 8273
|
538
|
+
// #define GGML_GRAPH_HASHTABLE_SIZE 16411
|
539
|
+
#define GGML_GRAPH_HASHTABLE_SIZE 32771
|
540
|
+
|
541
|
+
enum ggml_cgraph_eval_order {
|
542
|
+
GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT = 0,
|
543
|
+
GGML_CGRAPH_EVAL_ORDER_RIGHT_TO_LEFT,
|
544
|
+
GGML_CGRAPH_EVAL_ORDER_COUNT
|
545
|
+
};
|
524
546
|
|
525
547
|
// computation graph
|
526
548
|
struct ggml_cgraph {
|
@@ -533,6 +555,8 @@ extern "C" {
|
|
533
555
|
|
534
556
|
void * visited_hash_table[GGML_GRAPH_HASHTABLE_SIZE];
|
535
557
|
|
558
|
+
enum ggml_cgraph_eval_order order;
|
559
|
+
|
536
560
|
// performance
|
537
561
|
int perf_runs;
|
538
562
|
int64_t perf_cycles;
|
@@ -680,12 +704,21 @@ extern "C" {
|
|
680
704
|
GGML_API struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value);
|
681
705
|
GGML_API struct ggml_tensor * ggml_set_f32 (struct ggml_tensor * tensor, float value);
|
682
706
|
|
707
|
+
// Converts a flat index into coordinates
|
708
|
+
GGML_API void ggml_unravel_index(const struct ggml_tensor * tensor, int64_t i, int64_t * i0, int64_t * i1, int64_t * i2, int64_t * i3);
|
709
|
+
|
683
710
|
GGML_API int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i);
|
684
711
|
GGML_API void ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value);
|
685
712
|
|
713
|
+
GGML_API int32_t ggml_get_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3);
|
714
|
+
GGML_API void ggml_set_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, int32_t value);
|
715
|
+
|
686
716
|
GGML_API float ggml_get_f32_1d(const struct ggml_tensor * tensor, int i);
|
687
717
|
GGML_API void ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value);
|
688
718
|
|
719
|
+
GGML_API float ggml_get_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3);
|
720
|
+
GGML_API void ggml_set_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, float value);
|
721
|
+
|
689
722
|
GGML_API void * ggml_get_data (const struct ggml_tensor * tensor);
|
690
723
|
GGML_API float * ggml_get_data_f32(const struct ggml_tensor * tensor);
|
691
724
|
|
@@ -719,6 +752,12 @@ extern "C" {
|
|
719
752
|
struct ggml_tensor * a,
|
720
753
|
struct ggml_tensor * b);
|
721
754
|
|
755
|
+
GGML_API struct ggml_tensor * ggml_add_cast(
|
756
|
+
struct ggml_context * ctx,
|
757
|
+
struct ggml_tensor * a,
|
758
|
+
struct ggml_tensor * b,
|
759
|
+
enum ggml_type type);
|
760
|
+
|
722
761
|
GGML_API struct ggml_tensor * ggml_add1(
|
723
762
|
struct ggml_context * ctx,
|
724
763
|
struct ggml_tensor * a,
|
@@ -828,6 +867,7 @@ extern "C" {
|
|
828
867
|
struct ggml_tensor * a,
|
829
868
|
struct ggml_tensor * b);
|
830
869
|
|
870
|
+
// sums repetitions in a into shape of b
|
831
871
|
GGML_API struct ggml_tensor * ggml_repeat_back(
|
832
872
|
struct ggml_context * ctx,
|
833
873
|
struct ggml_tensor * a,
|
@@ -1049,7 +1089,6 @@ extern "C" {
|
|
1049
1089
|
size_t nb1,
|
1050
1090
|
size_t offset);
|
1051
1091
|
|
1052
|
-
|
1053
1092
|
// a -> b, return view(b)
|
1054
1093
|
GGML_API struct ggml_tensor * ggml_cpy(
|
1055
1094
|
struct ggml_context * ctx,
|
@@ -1072,6 +1111,33 @@ extern "C" {
|
|
1072
1111
|
struct ggml_context * ctx,
|
1073
1112
|
struct ggml_tensor * a);
|
1074
1113
|
|
1114
|
+
// make contiguous, with new shape
|
1115
|
+
GGML_API struct ggml_tensor * ggml_cont_1d(
|
1116
|
+
struct ggml_context * ctx,
|
1117
|
+
struct ggml_tensor * a,
|
1118
|
+
int64_t ne0);
|
1119
|
+
|
1120
|
+
GGML_API struct ggml_tensor * ggml_cont_2d(
|
1121
|
+
struct ggml_context * ctx,
|
1122
|
+
struct ggml_tensor * a,
|
1123
|
+
int64_t ne0,
|
1124
|
+
int64_t ne1);
|
1125
|
+
|
1126
|
+
GGML_API struct ggml_tensor * ggml_cont_3d(
|
1127
|
+
struct ggml_context * ctx,
|
1128
|
+
struct ggml_tensor * a,
|
1129
|
+
int64_t ne0,
|
1130
|
+
int64_t ne1,
|
1131
|
+
int64_t ne2);
|
1132
|
+
|
1133
|
+
GGML_API struct ggml_tensor * ggml_cont_4d(
|
1134
|
+
struct ggml_context * ctx,
|
1135
|
+
struct ggml_tensor * a,
|
1136
|
+
int64_t ne0,
|
1137
|
+
int64_t ne1,
|
1138
|
+
int64_t ne2,
|
1139
|
+
int64_t ne3);
|
1140
|
+
|
1075
1141
|
// return view(a), b specifies the new shape
|
1076
1142
|
// TODO: when we start computing gradient, make a copy instead of view
|
1077
1143
|
GGML_API struct ggml_tensor * ggml_reshape(
|
@@ -1219,14 +1285,15 @@ extern "C" {
|
|
1219
1285
|
struct ggml_tensor * b);
|
1220
1286
|
|
1221
1287
|
// rotary position embedding
|
1222
|
-
// if mode & 1 == 1, skip n_past elements
|
1288
|
+
// if mode & 1 == 1, skip n_past elements (DEPRECATED)
|
1223
1289
|
// if mode & 2 == 1, GPT-NeoX style
|
1224
1290
|
// if mode & 4 == 1, ChatGLM style
|
1225
|
-
//
|
1291
|
+
//
|
1292
|
+
// b is an int32 vector with size a->ne[2], it contains the positions
|
1226
1293
|
GGML_API struct ggml_tensor * ggml_rope(
|
1227
1294
|
struct ggml_context * ctx,
|
1228
1295
|
struct ggml_tensor * a,
|
1229
|
-
|
1296
|
+
struct ggml_tensor * b,
|
1230
1297
|
int n_dims,
|
1231
1298
|
int mode,
|
1232
1299
|
int n_ctx);
|
@@ -1235,7 +1302,7 @@ extern "C" {
|
|
1235
1302
|
GGML_API struct ggml_tensor * ggml_rope_inplace(
|
1236
1303
|
struct ggml_context * ctx,
|
1237
1304
|
struct ggml_tensor * a,
|
1238
|
-
|
1305
|
+
struct ggml_tensor * b,
|
1239
1306
|
int n_dims,
|
1240
1307
|
int mode,
|
1241
1308
|
int n_ctx);
|
@@ -1244,7 +1311,7 @@ extern "C" {
|
|
1244
1311
|
GGML_API struct ggml_tensor * ggml_rope_custom(
|
1245
1312
|
struct ggml_context * ctx,
|
1246
1313
|
struct ggml_tensor * a,
|
1247
|
-
|
1314
|
+
struct ggml_tensor * b,
|
1248
1315
|
int n_dims,
|
1249
1316
|
int mode,
|
1250
1317
|
int n_ctx,
|
@@ -1255,7 +1322,7 @@ extern "C" {
|
|
1255
1322
|
GGML_API struct ggml_tensor * ggml_rope_custom_inplace(
|
1256
1323
|
struct ggml_context * ctx,
|
1257
1324
|
struct ggml_tensor * a,
|
1258
|
-
|
1325
|
+
struct ggml_tensor * b,
|
1259
1326
|
int n_dims,
|
1260
1327
|
int mode,
|
1261
1328
|
int n_ctx,
|
@@ -1266,7 +1333,7 @@ extern "C" {
|
|
1266
1333
|
GGML_API struct ggml_tensor * ggml_rope_xpos_inplace(
|
1267
1334
|
struct ggml_context * ctx,
|
1268
1335
|
struct ggml_tensor * a,
|
1269
|
-
|
1336
|
+
struct ggml_tensor * b,
|
1270
1337
|
int n_dims,
|
1271
1338
|
float base,
|
1272
1339
|
bool down);
|
@@ -1276,7 +1343,7 @@ extern "C" {
|
|
1276
1343
|
GGML_API struct ggml_tensor * ggml_rope_back(
|
1277
1344
|
struct ggml_context * ctx,
|
1278
1345
|
struct ggml_tensor * a,
|
1279
|
-
|
1346
|
+
struct ggml_tensor * b,
|
1280
1347
|
int n_dims,
|
1281
1348
|
int mode,
|
1282
1349
|
int n_ctx,
|
@@ -1656,6 +1723,16 @@ extern "C" {
|
|
1656
1723
|
// dump the graph into a file using the dot format
|
1657
1724
|
GGML_API void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph * gf, const char * filename);
|
1658
1725
|
|
1726
|
+
// build gradient checkpointing backward graph gb for gf using provided checkpoints
|
1727
|
+
// gb_tmp will contain original backward graph with rewritten backward process nodes,
|
1728
|
+
// but without the second forward pass nodes.
|
1729
|
+
GGML_API void ggml_build_backward_gradient_checkpointing(
|
1730
|
+
struct ggml_context * ctx,
|
1731
|
+
struct ggml_cgraph * gf,
|
1732
|
+
struct ggml_cgraph * gb,
|
1733
|
+
struct ggml_cgraph * gb_tmp,
|
1734
|
+
struct ggml_tensor * * checkpoints,
|
1735
|
+
int n_checkpoints);
|
1659
1736
|
//
|
1660
1737
|
// optimization
|
1661
1738
|
//
|
@@ -1690,7 +1767,8 @@ extern "C" {
|
|
1690
1767
|
GGML_LINESEARCH_INVALID_PARAMETERS,
|
1691
1768
|
};
|
1692
1769
|
|
1693
|
-
typedef void (*ggml_opt_callback)(void * data, float * sched);
|
1770
|
+
typedef void (*ggml_opt_callback)(void * data, int accum_step, float * sched, bool * cancel);
|
1771
|
+
typedef void (*ggml_log_callback)(enum ggml_log_level level, const char * text, void * user_data);
|
1694
1772
|
|
1695
1773
|
// optimization parameters
|
1696
1774
|
//
|
@@ -1721,6 +1799,8 @@ extern "C" {
|
|
1721
1799
|
bool print_forward_graph;
|
1722
1800
|
bool print_backward_graph;
|
1723
1801
|
|
1802
|
+
int n_gradient_accumulation;
|
1803
|
+
|
1724
1804
|
// ADAM parameters
|
1725
1805
|
struct {
|
1726
1806
|
int n_iter;
|
@@ -1766,6 +1846,7 @@ extern "C" {
|
|
1766
1846
|
float loss_after;
|
1767
1847
|
|
1768
1848
|
struct {
|
1849
|
+
struct ggml_tensor * g; // current gradient
|
1769
1850
|
struct ggml_tensor * m; // first moment
|
1770
1851
|
struct ggml_tensor * v; // second moment
|
1771
1852
|
struct ggml_tensor * pf; // past function values
|
@@ -1882,26 +1963,26 @@ extern "C" {
|
|
1882
1963
|
|
1883
1964
|
GGML_API int gguf_get_n_kv(const struct gguf_context * ctx);
|
1884
1965
|
GGML_API int gguf_find_key(const struct gguf_context * ctx, const char * key);
|
1885
|
-
GGML_API const char * gguf_get_key (const struct gguf_context * ctx, int i);
|
1886
|
-
|
1887
|
-
GGML_API enum gguf_type gguf_get_kv_type (const struct gguf_context * ctx, int i);
|
1888
|
-
GGML_API enum gguf_type gguf_get_arr_type(const struct gguf_context * ctx, int i);
|
1889
|
-
|
1890
|
-
// results are undefined if the wrong type is used for the key
|
1891
|
-
GGML_API uint8_t gguf_get_val_u8 (const struct gguf_context * ctx, int i);
|
1892
|
-
GGML_API int8_t gguf_get_val_i8 (const struct gguf_context * ctx, int i);
|
1893
|
-
GGML_API uint16_t gguf_get_val_u16 (const struct gguf_context * ctx, int i);
|
1894
|
-
GGML_API int16_t gguf_get_val_i16 (const struct gguf_context * ctx, int i);
|
1895
|
-
GGML_API uint32_t gguf_get_val_u32 (const struct gguf_context * ctx, int i);
|
1896
|
-
GGML_API int32_t gguf_get_val_i32 (const struct gguf_context * ctx, int i);
|
1897
|
-
GGML_API float gguf_get_val_f32 (const struct gguf_context * ctx, int i);
|
1898
|
-
GGML_API uint64_t gguf_get_val_u64 (const struct gguf_context * ctx, int i);
|
1899
|
-
GGML_API int64_t gguf_get_val_i64 (const struct gguf_context * ctx, int i);
|
1900
|
-
GGML_API double gguf_get_val_f64 (const struct gguf_context * ctx, int i);
|
1901
|
-
GGML_API bool gguf_get_val_bool(const struct gguf_context * ctx, int i);
|
1902
|
-
GGML_API const char * gguf_get_val_str (const struct gguf_context * ctx, int i);
|
1903
|
-
GGML_API int gguf_get_arr_n (const struct gguf_context * ctx, int i);
|
1904
|
-
GGML_API const void * gguf_get_arr_data(const struct gguf_context * ctx, int i);
|
1966
|
+
GGML_API const char * gguf_get_key (const struct gguf_context * ctx, int key_id);
|
1967
|
+
|
1968
|
+
GGML_API enum gguf_type gguf_get_kv_type (const struct gguf_context * ctx, int key_id);
|
1969
|
+
GGML_API enum gguf_type gguf_get_arr_type(const struct gguf_context * ctx, int key_id);
|
1970
|
+
|
1971
|
+
// will abort if the wrong type is used for the key
|
1972
|
+
GGML_API uint8_t gguf_get_val_u8 (const struct gguf_context * ctx, int key_id);
|
1973
|
+
GGML_API int8_t gguf_get_val_i8 (const struct gguf_context * ctx, int key_id);
|
1974
|
+
GGML_API uint16_t gguf_get_val_u16 (const struct gguf_context * ctx, int key_id);
|
1975
|
+
GGML_API int16_t gguf_get_val_i16 (const struct gguf_context * ctx, int key_id);
|
1976
|
+
GGML_API uint32_t gguf_get_val_u32 (const struct gguf_context * ctx, int key_id);
|
1977
|
+
GGML_API int32_t gguf_get_val_i32 (const struct gguf_context * ctx, int key_id);
|
1978
|
+
GGML_API float gguf_get_val_f32 (const struct gguf_context * ctx, int key_id);
|
1979
|
+
GGML_API uint64_t gguf_get_val_u64 (const struct gguf_context * ctx, int key_id);
|
1980
|
+
GGML_API int64_t gguf_get_val_i64 (const struct gguf_context * ctx, int key_id);
|
1981
|
+
GGML_API double gguf_get_val_f64 (const struct gguf_context * ctx, int key_id);
|
1982
|
+
GGML_API bool gguf_get_val_bool(const struct gguf_context * ctx, int key_id);
|
1983
|
+
GGML_API const char * gguf_get_val_str (const struct gguf_context * ctx, int key_id);
|
1984
|
+
GGML_API int gguf_get_arr_n (const struct gguf_context * ctx, int key_id);
|
1985
|
+
GGML_API const void * gguf_get_arr_data(const struct gguf_context * ctx, int key_id);
|
1905
1986
|
GGML_API const char * gguf_get_arr_str (const struct gguf_context * ctx, int key_id, int i);
|
1906
1987
|
|
1907
1988
|
GGML_API int gguf_get_n_tensors (const struct gguf_context * ctx);
|