llama_cpp 0.5.3 → 0.7.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +17 -0
- data/README.md +6 -5
- data/examples/chat.rb +13 -13
- data/examples/embedding.rb +9 -9
- data/ext/llama_cpp/llama_cpp.cpp +583 -262
- data/ext/llama_cpp/src/ggml-alloc.c +8 -2
- data/ext/llama_cpp/src/ggml-alloc.h +1 -0
- data/ext/llama_cpp/src/ggml-cuda.cu +326 -149
- data/ext/llama_cpp/src/ggml-cuda.h +1 -0
- data/ext/llama_cpp/src/ggml-metal.h +4 -0
- data/ext/llama_cpp/src/ggml-metal.m +167 -89
- data/ext/llama_cpp/src/ggml-metal.metal +130 -40
- data/ext/llama_cpp/src/ggml-opencl.cpp +119 -53
- data/ext/llama_cpp/src/ggml.c +2355 -1166
- data/ext/llama_cpp/src/ggml.h +129 -35
- data/ext/llama_cpp/src/k_quants.c +744 -2
- data/ext/llama_cpp/src/llama.cpp +1766 -671
- data/ext/llama_cpp/src/llama.h +321 -120
- data/ext/llama_cpp/src/unicode.h +462 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +6 -10
- data/sig/llama_cpp.rbs +70 -34
- metadata +4 -3
data/ext/llama_cpp/src/ggml.h
CHANGED
@@ -214,8 +214,8 @@
|
|
214
214
|
#define GGML_QNT_VERSION_FACTOR 1000 // do not change this
|
215
215
|
|
216
216
|
#define GGML_MAX_DIMS 4
|
217
|
-
#define GGML_MAX_NODES 4096
|
218
|
-
#define GGML_MAX_PARAMS 256
|
217
|
+
#define GGML_MAX_NODES 16384
|
218
|
+
#define GGML_MAX_PARAMS 1024
|
219
219
|
#define GGML_MAX_CONTEXTS 64
|
220
220
|
#define GGML_MAX_SRC 6
|
221
221
|
#define GGML_MAX_NAME 64
|
@@ -248,6 +248,14 @@
|
|
248
248
|
} \
|
249
249
|
} while (0)
|
250
250
|
|
251
|
+
#ifndef NDEBUG
|
252
|
+
#define GGML_UNREACHABLE() GGML_ASSERT(!"statement should not be reached")
|
253
|
+
#elif defined(__GNUC__)
|
254
|
+
#define GGML_UNREACHABLE() __builtin_unreachable()
|
255
|
+
#else
|
256
|
+
#define GGML_UNREACHABLE() ((void) 0)
|
257
|
+
#endif
|
258
|
+
|
251
259
|
// used to copy the number of elements and stride in bytes of tensors into local variables.
|
252
260
|
// main purpose is to reduce code duplication and improve readability.
|
253
261
|
//
|
@@ -393,10 +401,14 @@ extern "C" {
|
|
393
401
|
GGML_OP_CLAMP,
|
394
402
|
GGML_OP_CONV_1D,
|
395
403
|
GGML_OP_CONV_2D,
|
404
|
+
GGML_OP_CONV_TRANSPOSE_1D,
|
396
405
|
GGML_OP_CONV_TRANSPOSE_2D,
|
397
406
|
GGML_OP_POOL_1D,
|
398
407
|
GGML_OP_POOL_2D,
|
399
408
|
|
409
|
+
GGML_OP_CONV_1D_STAGE_0, // internal
|
410
|
+
GGML_OP_CONV_1D_STAGE_1, // internal
|
411
|
+
|
400
412
|
GGML_OP_UPSCALE, // nearest interpolate
|
401
413
|
|
402
414
|
GGML_OP_FLASH_ATTN,
|
@@ -445,6 +457,12 @@ extern "C" {
|
|
445
457
|
GGML_OBJECT_WORK_BUFFER
|
446
458
|
};
|
447
459
|
|
460
|
+
enum ggml_log_level {
|
461
|
+
GGML_LOG_LEVEL_ERROR = 2,
|
462
|
+
GGML_LOG_LEVEL_WARN = 3,
|
463
|
+
GGML_LOG_LEVEL_INFO = 4
|
464
|
+
};
|
465
|
+
|
448
466
|
// ggml object
|
449
467
|
struct ggml_object {
|
450
468
|
size_t offs;
|
@@ -467,8 +485,8 @@ extern "C" {
|
|
467
485
|
int n_dims;
|
468
486
|
int64_t ne[GGML_MAX_DIMS]; // number of elements
|
469
487
|
size_t nb[GGML_MAX_DIMS]; // stride in bytes:
|
470
|
-
// nb[0] = sizeof(type)
|
471
|
-
// nb[1] = nb[0] * ne[0] + padding
|
488
|
+
// nb[0] = ggml_type_size(type)
|
489
|
+
// nb[1] = nb[0] * (ne[0] / ggml_blck_size(type)) + padding
|
472
490
|
// nb[i] = nb[i-1] * ne[i-1]
|
473
491
|
|
474
492
|
// compute data
|
@@ -520,7 +538,15 @@ extern "C" {
|
|
520
538
|
// next prime after GGML_MAX_NODES
|
521
539
|
// #define GGML_GRAPH_HASHTABLE_SIZE 4099
|
522
540
|
// next prime after GGML_MAX_NODES * 2 (nodes + leafs)
|
523
|
-
#define GGML_GRAPH_HASHTABLE_SIZE 8273
|
541
|
+
// #define GGML_GRAPH_HASHTABLE_SIZE 8273
|
542
|
+
// #define GGML_GRAPH_HASHTABLE_SIZE 16411
|
543
|
+
#define GGML_GRAPH_HASHTABLE_SIZE 32771
|
544
|
+
|
545
|
+
enum ggml_cgraph_eval_order {
|
546
|
+
GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT = 0,
|
547
|
+
GGML_CGRAPH_EVAL_ORDER_RIGHT_TO_LEFT,
|
548
|
+
GGML_CGRAPH_EVAL_ORDER_COUNT
|
549
|
+
};
|
524
550
|
|
525
551
|
// computation graph
|
526
552
|
struct ggml_cgraph {
|
@@ -533,6 +559,8 @@ extern "C" {
|
|
533
559
|
|
534
560
|
void * visited_hash_table[GGML_GRAPH_HASHTABLE_SIZE];
|
535
561
|
|
562
|
+
enum ggml_cgraph_eval_order order;
|
563
|
+
|
536
564
|
// performance
|
537
565
|
int perf_runs;
|
538
566
|
int64_t perf_cycles;
|
@@ -680,12 +708,21 @@ extern "C" {
|
|
680
708
|
GGML_API struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value);
|
681
709
|
GGML_API struct ggml_tensor * ggml_set_f32 (struct ggml_tensor * tensor, float value);
|
682
710
|
|
711
|
+
// Converts a flat index into coordinates
|
712
|
+
GGML_API void ggml_unravel_index(const struct ggml_tensor * tensor, int64_t i, int64_t * i0, int64_t * i1, int64_t * i2, int64_t * i3);
|
713
|
+
|
683
714
|
GGML_API int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i);
|
684
715
|
GGML_API void ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value);
|
685
716
|
|
717
|
+
GGML_API int32_t ggml_get_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3);
|
718
|
+
GGML_API void ggml_set_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, int32_t value);
|
719
|
+
|
686
720
|
GGML_API float ggml_get_f32_1d(const struct ggml_tensor * tensor, int i);
|
687
721
|
GGML_API void ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value);
|
688
722
|
|
723
|
+
GGML_API float ggml_get_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3);
|
724
|
+
GGML_API void ggml_set_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, float value);
|
725
|
+
|
689
726
|
GGML_API void * ggml_get_data (const struct ggml_tensor * tensor);
|
690
727
|
GGML_API float * ggml_get_data_f32(const struct ggml_tensor * tensor);
|
691
728
|
|
@@ -719,6 +756,12 @@ extern "C" {
|
|
719
756
|
struct ggml_tensor * a,
|
720
757
|
struct ggml_tensor * b);
|
721
758
|
|
759
|
+
GGML_API struct ggml_tensor * ggml_add_cast(
|
760
|
+
struct ggml_context * ctx,
|
761
|
+
struct ggml_tensor * a,
|
762
|
+
struct ggml_tensor * b,
|
763
|
+
enum ggml_type type);
|
764
|
+
|
722
765
|
GGML_API struct ggml_tensor * ggml_add1(
|
723
766
|
struct ggml_context * ctx,
|
724
767
|
struct ggml_tensor * a,
|
@@ -828,6 +871,7 @@ extern "C" {
|
|
828
871
|
struct ggml_tensor * a,
|
829
872
|
struct ggml_tensor * b);
|
830
873
|
|
874
|
+
// sums repetitions in a into shape of b
|
831
875
|
GGML_API struct ggml_tensor * ggml_repeat_back(
|
832
876
|
struct ggml_context * ctx,
|
833
877
|
struct ggml_tensor * a,
|
@@ -1049,7 +1093,6 @@ extern "C" {
|
|
1049
1093
|
size_t nb1,
|
1050
1094
|
size_t offset);
|
1051
1095
|
|
1052
|
-
|
1053
1096
|
// a -> b, return view(b)
|
1054
1097
|
GGML_API struct ggml_tensor * ggml_cpy(
|
1055
1098
|
struct ggml_context * ctx,
|
@@ -1072,6 +1115,33 @@ extern "C" {
|
|
1072
1115
|
struct ggml_context * ctx,
|
1073
1116
|
struct ggml_tensor * a);
|
1074
1117
|
|
1118
|
+
// make contiguous, with new shape
|
1119
|
+
GGML_API struct ggml_tensor * ggml_cont_1d(
|
1120
|
+
struct ggml_context * ctx,
|
1121
|
+
struct ggml_tensor * a,
|
1122
|
+
int64_t ne0);
|
1123
|
+
|
1124
|
+
GGML_API struct ggml_tensor * ggml_cont_2d(
|
1125
|
+
struct ggml_context * ctx,
|
1126
|
+
struct ggml_tensor * a,
|
1127
|
+
int64_t ne0,
|
1128
|
+
int64_t ne1);
|
1129
|
+
|
1130
|
+
GGML_API struct ggml_tensor * ggml_cont_3d(
|
1131
|
+
struct ggml_context * ctx,
|
1132
|
+
struct ggml_tensor * a,
|
1133
|
+
int64_t ne0,
|
1134
|
+
int64_t ne1,
|
1135
|
+
int64_t ne2);
|
1136
|
+
|
1137
|
+
GGML_API struct ggml_tensor * ggml_cont_4d(
|
1138
|
+
struct ggml_context * ctx,
|
1139
|
+
struct ggml_tensor * a,
|
1140
|
+
int64_t ne0,
|
1141
|
+
int64_t ne1,
|
1142
|
+
int64_t ne2,
|
1143
|
+
int64_t ne3);
|
1144
|
+
|
1075
1145
|
// return view(a), b specifies the new shape
|
1076
1146
|
// TODO: when we start computing gradient, make a copy instead of view
|
1077
1147
|
GGML_API struct ggml_tensor * ggml_reshape(
|
@@ -1219,14 +1289,15 @@ extern "C" {
|
|
1219
1289
|
struct ggml_tensor * b);
|
1220
1290
|
|
1221
1291
|
// rotary position embedding
|
1222
|
-
// if mode & 1 == 1, skip n_past elements
|
1292
|
+
// if mode & 1 == 1, skip n_past elements (DEPRECATED)
|
1223
1293
|
// if mode & 2 == 1, GPT-NeoX style
|
1224
1294
|
// if mode & 4 == 1, ChatGLM style
|
1225
|
-
//
|
1295
|
+
//
|
1296
|
+
// b is an int32 vector with size a->ne[2], it contains the positions
|
1226
1297
|
GGML_API struct ggml_tensor * ggml_rope(
|
1227
1298
|
struct ggml_context * ctx,
|
1228
1299
|
struct ggml_tensor * a,
|
1229
|
-
int n_past,
|
1300
|
+
struct ggml_tensor * b,
|
1230
1301
|
int n_dims,
|
1231
1302
|
int mode,
|
1232
1303
|
int n_ctx);
|
@@ -1235,7 +1306,7 @@ extern "C" {
|
|
1235
1306
|
GGML_API struct ggml_tensor * ggml_rope_inplace(
|
1236
1307
|
struct ggml_context * ctx,
|
1237
1308
|
struct ggml_tensor * a,
|
1238
|
-
int n_past,
|
1309
|
+
struct ggml_tensor * b,
|
1239
1310
|
int n_dims,
|
1240
1311
|
int mode,
|
1241
1312
|
int n_ctx);
|
@@ -1244,7 +1315,7 @@ extern "C" {
|
|
1244
1315
|
GGML_API struct ggml_tensor * ggml_rope_custom(
|
1245
1316
|
struct ggml_context * ctx,
|
1246
1317
|
struct ggml_tensor * a,
|
1247
|
-
int n_past,
|
1318
|
+
struct ggml_tensor * b,
|
1248
1319
|
int n_dims,
|
1249
1320
|
int mode,
|
1250
1321
|
int n_ctx,
|
@@ -1255,7 +1326,7 @@ extern "C" {
|
|
1255
1326
|
GGML_API struct ggml_tensor * ggml_rope_custom_inplace(
|
1256
1327
|
struct ggml_context * ctx,
|
1257
1328
|
struct ggml_tensor * a,
|
1258
|
-
int n_past,
|
1329
|
+
struct ggml_tensor * b,
|
1259
1330
|
int n_dims,
|
1260
1331
|
int mode,
|
1261
1332
|
int n_ctx,
|
@@ -1266,7 +1337,7 @@ extern "C" {
|
|
1266
1337
|
GGML_API struct ggml_tensor * ggml_rope_xpos_inplace(
|
1267
1338
|
struct ggml_context * ctx,
|
1268
1339
|
struct ggml_tensor * a,
|
1269
|
-
int n_past,
|
1340
|
+
struct ggml_tensor * b,
|
1270
1341
|
int n_dims,
|
1271
1342
|
float base,
|
1272
1343
|
bool down);
|
@@ -1276,7 +1347,7 @@ extern "C" {
|
|
1276
1347
|
GGML_API struct ggml_tensor * ggml_rope_back(
|
1277
1348
|
struct ggml_context * ctx,
|
1278
1349
|
struct ggml_tensor * a,
|
1279
|
-
int n_past,
|
1350
|
+
struct ggml_tensor * b,
|
1280
1351
|
int n_dims,
|
1281
1352
|
int mode,
|
1282
1353
|
int n_ctx,
|
@@ -1319,6 +1390,14 @@ extern "C" {
|
|
1319
1390
|
int s,
|
1320
1391
|
int d);
|
1321
1392
|
|
1393
|
+
GGML_API struct ggml_tensor * ggml_conv_transpose_1d(
|
1394
|
+
struct ggml_context * ctx,
|
1395
|
+
struct ggml_tensor * a,
|
1396
|
+
struct ggml_tensor * b,
|
1397
|
+
int s0,
|
1398
|
+
int p0,
|
1399
|
+
int d0);
|
1400
|
+
|
1322
1401
|
GGML_API struct ggml_tensor * ggml_conv_2d(
|
1323
1402
|
struct ggml_context * ctx,
|
1324
1403
|
struct ggml_tensor * a,
|
@@ -1656,6 +1735,16 @@ extern "C" {
|
|
1656
1735
|
// dump the graph into a file using the dot format
|
1657
1736
|
GGML_API void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph * gf, const char * filename);
|
1658
1737
|
|
1738
|
+
// build gradient checkpointing backward graph gb for gf using provided checkpoints
|
1739
|
+
// gb_tmp will contain original backward graph with rewritten backward process nodes,
|
1740
|
+
// but without the second forward pass nodes.
|
1741
|
+
GGML_API void ggml_build_backward_gradient_checkpointing(
|
1742
|
+
struct ggml_context * ctx,
|
1743
|
+
struct ggml_cgraph * gf,
|
1744
|
+
struct ggml_cgraph * gb,
|
1745
|
+
struct ggml_cgraph * gb_tmp,
|
1746
|
+
struct ggml_tensor * * checkpoints,
|
1747
|
+
int n_checkpoints);
|
1659
1748
|
//
|
1660
1749
|
// optimization
|
1661
1750
|
//
|
@@ -1682,6 +1771,7 @@ extern "C" {
|
|
1682
1771
|
GGML_OPT_NO_CONTEXT,
|
1683
1772
|
GGML_OPT_INVALID_WOLFE,
|
1684
1773
|
GGML_OPT_FAIL,
|
1774
|
+
GGML_OPT_CANCEL,
|
1685
1775
|
|
1686
1776
|
GGML_LINESEARCH_FAIL = -128,
|
1687
1777
|
GGML_LINESEARCH_MINIMUM_STEP,
|
@@ -1690,7 +1780,8 @@ extern "C" {
|
|
1690
1780
|
GGML_LINESEARCH_INVALID_PARAMETERS,
|
1691
1781
|
};
|
1692
1782
|
|
1693
|
-
typedef void (*ggml_opt_callback)(void * data, float * sched);
|
1783
|
+
typedef void (*ggml_opt_callback)(void * data, int accum_step, float * sched, bool * cancel);
|
1784
|
+
typedef void (*ggml_log_callback)(enum ggml_log_level level, const char * text, void * user_data);
|
1694
1785
|
|
1695
1786
|
// optimization parameters
|
1696
1787
|
//
|
@@ -1721,6 +1812,8 @@ extern "C" {
|
|
1721
1812
|
bool print_forward_graph;
|
1722
1813
|
bool print_backward_graph;
|
1723
1814
|
|
1815
|
+
int n_gradient_accumulation;
|
1816
|
+
|
1724
1817
|
// ADAM parameters
|
1725
1818
|
struct {
|
1726
1819
|
int n_iter;
|
@@ -1766,6 +1859,7 @@ extern "C" {
|
|
1766
1859
|
float loss_after;
|
1767
1860
|
|
1768
1861
|
struct {
|
1862
|
+
struct ggml_tensor * g; // current gradient
|
1769
1863
|
struct ggml_tensor * m; // first moment
|
1770
1864
|
struct ggml_tensor * v; // second moment
|
1771
1865
|
struct ggml_tensor * pf; // past function values
|
@@ -1882,26 +1976,26 @@ extern "C" {
|
|
1882
1976
|
|
1883
1977
|
GGML_API int gguf_get_n_kv(const struct gguf_context * ctx);
|
1884
1978
|
GGML_API int gguf_find_key(const struct gguf_context * ctx, const char * key);
|
1885
|
-
GGML_API const char * gguf_get_key (const struct gguf_context * ctx, int i);
|
1886
|
-
|
1887
|
-
GGML_API enum gguf_type gguf_get_kv_type (const struct gguf_context * ctx, int i);
|
1888
|
-
GGML_API enum gguf_type gguf_get_arr_type(const struct gguf_context * ctx, int i);
|
1889
|
-
|
1890
|
-
// results are undefined if the wrong type is used for the key
|
1891
|
-
GGML_API uint8_t gguf_get_val_u8 (const struct gguf_context * ctx, int i);
|
1892
|
-
GGML_API int8_t gguf_get_val_i8 (const struct gguf_context * ctx, int i);
|
1893
|
-
GGML_API uint16_t gguf_get_val_u16 (const struct gguf_context * ctx, int i);
|
1894
|
-
GGML_API int16_t gguf_get_val_i16 (const struct gguf_context * ctx, int i);
|
1895
|
-
GGML_API uint32_t gguf_get_val_u32 (const struct gguf_context * ctx, int i);
|
1896
|
-
GGML_API int32_t gguf_get_val_i32 (const struct gguf_context * ctx, int i);
|
1897
|
-
GGML_API float gguf_get_val_f32 (const struct gguf_context * ctx, int i);
|
1898
|
-
GGML_API uint64_t gguf_get_val_u64 (const struct gguf_context * ctx, int i);
|
1899
|
-
GGML_API int64_t gguf_get_val_i64 (const struct gguf_context * ctx, int i);
|
1900
|
-
GGML_API double gguf_get_val_f64 (const struct gguf_context * ctx, int i);
|
1901
|
-
GGML_API bool gguf_get_val_bool(const struct gguf_context * ctx, int i);
|
1902
|
-
GGML_API const char * gguf_get_val_str (const struct gguf_context * ctx, int i);
|
1903
|
-
GGML_API int gguf_get_arr_n (const struct gguf_context * ctx, int i);
|
1904
|
-
GGML_API const void * gguf_get_arr_data(const struct gguf_context * ctx, int i);
|
1979
|
+
GGML_API const char * gguf_get_key (const struct gguf_context * ctx, int key_id);
|
1980
|
+
|
1981
|
+
GGML_API enum gguf_type gguf_get_kv_type (const struct gguf_context * ctx, int key_id);
|
1982
|
+
GGML_API enum gguf_type gguf_get_arr_type(const struct gguf_context * ctx, int key_id);
|
1983
|
+
|
1984
|
+
// will abort if the wrong type is used for the key
|
1985
|
+
GGML_API uint8_t gguf_get_val_u8 (const struct gguf_context * ctx, int key_id);
|
1986
|
+
GGML_API int8_t gguf_get_val_i8 (const struct gguf_context * ctx, int key_id);
|
1987
|
+
GGML_API uint16_t gguf_get_val_u16 (const struct gguf_context * ctx, int key_id);
|
1988
|
+
GGML_API int16_t gguf_get_val_i16 (const struct gguf_context * ctx, int key_id);
|
1989
|
+
GGML_API uint32_t gguf_get_val_u32 (const struct gguf_context * ctx, int key_id);
|
1990
|
+
GGML_API int32_t gguf_get_val_i32 (const struct gguf_context * ctx, int key_id);
|
1991
|
+
GGML_API float gguf_get_val_f32 (const struct gguf_context * ctx, int key_id);
|
1992
|
+
GGML_API uint64_t gguf_get_val_u64 (const struct gguf_context * ctx, int key_id);
|
1993
|
+
GGML_API int64_t gguf_get_val_i64 (const struct gguf_context * ctx, int key_id);
|
1994
|
+
GGML_API double gguf_get_val_f64 (const struct gguf_context * ctx, int key_id);
|
1995
|
+
GGML_API bool gguf_get_val_bool(const struct gguf_context * ctx, int key_id);
|
1996
|
+
GGML_API const char * gguf_get_val_str (const struct gguf_context * ctx, int key_id);
|
1997
|
+
GGML_API int gguf_get_arr_n (const struct gguf_context * ctx, int key_id);
|
1998
|
+
GGML_API const void * gguf_get_arr_data(const struct gguf_context * ctx, int key_id);
|
1905
1999
|
GGML_API const char * gguf_get_arr_str (const struct gguf_context * ctx, int key_id, int i);
|
1906
2000
|
|
1907
2001
|
GGML_API int gguf_get_n_tensors (const struct gguf_context * ctx);
|