llama_cpp 0.5.3 → 0.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +17 -0
- data/README.md +6 -5
- data/examples/chat.rb +13 -13
- data/examples/embedding.rb +9 -9
- data/ext/llama_cpp/llama_cpp.cpp +583 -262
- data/ext/llama_cpp/src/ggml-alloc.c +8 -2
- data/ext/llama_cpp/src/ggml-alloc.h +1 -0
- data/ext/llama_cpp/src/ggml-cuda.cu +326 -149
- data/ext/llama_cpp/src/ggml-cuda.h +1 -0
- data/ext/llama_cpp/src/ggml-metal.h +4 -0
- data/ext/llama_cpp/src/ggml-metal.m +167 -89
- data/ext/llama_cpp/src/ggml-metal.metal +130 -40
- data/ext/llama_cpp/src/ggml-opencl.cpp +119 -53
- data/ext/llama_cpp/src/ggml.c +2355 -1166
- data/ext/llama_cpp/src/ggml.h +129 -35
- data/ext/llama_cpp/src/k_quants.c +744 -2
- data/ext/llama_cpp/src/llama.cpp +1766 -671
- data/ext/llama_cpp/src/llama.h +321 -120
- data/ext/llama_cpp/src/unicode.h +462 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +6 -10
- data/sig/llama_cpp.rbs +70 -34
- metadata +4 -3
data/ext/llama_cpp/src/ggml.h
CHANGED
@@ -214,8 +214,8 @@
|
|
214
214
|
#define GGML_QNT_VERSION_FACTOR 1000 // do not change this
|
215
215
|
|
216
216
|
#define GGML_MAX_DIMS 4
|
217
|
-
#define GGML_MAX_NODES 4096
|
218
|
-
#define GGML_MAX_PARAMS 256
|
217
|
+
#define GGML_MAX_NODES 16384
|
218
|
+
#define GGML_MAX_PARAMS 1024
|
219
219
|
#define GGML_MAX_CONTEXTS 64
|
220
220
|
#define GGML_MAX_SRC 6
|
221
221
|
#define GGML_MAX_NAME 64
|
@@ -248,6 +248,14 @@
|
|
248
248
|
} \
|
249
249
|
} while (0)
|
250
250
|
|
251
|
+
#ifndef NDEBUG
|
252
|
+
#define GGML_UNREACHABLE() GGML_ASSERT(!"statement should not be reached")
|
253
|
+
#elif defined(__GNUC__)
|
254
|
+
#define GGML_UNREACHABLE() __builtin_unreachable()
|
255
|
+
#else
|
256
|
+
#define GGML_UNREACHABLE() ((void) 0)
|
257
|
+
#endif
|
258
|
+
|
251
259
|
// used to copy the number of elements and stride in bytes of tensors into local variables.
|
252
260
|
// main purpose is to reduce code duplication and improve readability.
|
253
261
|
//
|
@@ -393,10 +401,14 @@ extern "C" {
|
|
393
401
|
GGML_OP_CLAMP,
|
394
402
|
GGML_OP_CONV_1D,
|
395
403
|
GGML_OP_CONV_2D,
|
404
|
+
GGML_OP_CONV_TRANSPOSE_1D,
|
396
405
|
GGML_OP_CONV_TRANSPOSE_2D,
|
397
406
|
GGML_OP_POOL_1D,
|
398
407
|
GGML_OP_POOL_2D,
|
399
408
|
|
409
|
+
GGML_OP_CONV_1D_STAGE_0, // internal
|
410
|
+
GGML_OP_CONV_1D_STAGE_1, // internal
|
411
|
+
|
400
412
|
GGML_OP_UPSCALE, // nearest interpolate
|
401
413
|
|
402
414
|
GGML_OP_FLASH_ATTN,
|
@@ -445,6 +457,12 @@ extern "C" {
|
|
445
457
|
GGML_OBJECT_WORK_BUFFER
|
446
458
|
};
|
447
459
|
|
460
|
+
enum ggml_log_level {
|
461
|
+
GGML_LOG_LEVEL_ERROR = 2,
|
462
|
+
GGML_LOG_LEVEL_WARN = 3,
|
463
|
+
GGML_LOG_LEVEL_INFO = 4
|
464
|
+
};
|
465
|
+
|
448
466
|
// ggml object
|
449
467
|
struct ggml_object {
|
450
468
|
size_t offs;
|
@@ -467,8 +485,8 @@ extern "C" {
|
|
467
485
|
int n_dims;
|
468
486
|
int64_t ne[GGML_MAX_DIMS]; // number of elements
|
469
487
|
size_t nb[GGML_MAX_DIMS]; // stride in bytes:
|
470
|
-
// nb[0] = sizeof(type)
|
471
|
-
// nb[1] = nb[0] * ne[0] + padding
|
488
|
+
// nb[0] = ggml_type_size(type)
|
489
|
+
// nb[1] = nb[0] * (ne[0] / ggml_blck_size(type)) + padding
|
472
490
|
// nb[i] = nb[i-1] * ne[i-1]
|
473
491
|
|
474
492
|
// compute data
|
@@ -520,7 +538,15 @@ extern "C" {
|
|
520
538
|
// next prime after GGML_MAX_NODES
|
521
539
|
// #define GGML_GRAPH_HASHTABLE_SIZE 4099
|
522
540
|
// next prime after GGML_MAX_NODES * 2 (nodes + leafs)
|
523
|
-
#define GGML_GRAPH_HASHTABLE_SIZE 8273
|
541
|
+
// #define GGML_GRAPH_HASHTABLE_SIZE 8273
|
542
|
+
// #define GGML_GRAPH_HASHTABLE_SIZE 16411
|
543
|
+
#define GGML_GRAPH_HASHTABLE_SIZE 32771
|
544
|
+
|
545
|
+
enum ggml_cgraph_eval_order {
|
546
|
+
GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT = 0,
|
547
|
+
GGML_CGRAPH_EVAL_ORDER_RIGHT_TO_LEFT,
|
548
|
+
GGML_CGRAPH_EVAL_ORDER_COUNT
|
549
|
+
};
|
524
550
|
|
525
551
|
// computation graph
|
526
552
|
struct ggml_cgraph {
|
@@ -533,6 +559,8 @@ extern "C" {
|
|
533
559
|
|
534
560
|
void * visited_hash_table[GGML_GRAPH_HASHTABLE_SIZE];
|
535
561
|
|
562
|
+
enum ggml_cgraph_eval_order order;
|
563
|
+
|
536
564
|
// performance
|
537
565
|
int perf_runs;
|
538
566
|
int64_t perf_cycles;
|
@@ -680,12 +708,21 @@ extern "C" {
|
|
680
708
|
GGML_API struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value);
|
681
709
|
GGML_API struct ggml_tensor * ggml_set_f32 (struct ggml_tensor * tensor, float value);
|
682
710
|
|
711
|
+
// Converts a flat index into coordinates
|
712
|
+
GGML_API void ggml_unravel_index(const struct ggml_tensor * tensor, int64_t i, int64_t * i0, int64_t * i1, int64_t * i2, int64_t * i3);
|
713
|
+
|
683
714
|
GGML_API int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i);
|
684
715
|
GGML_API void ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value);
|
685
716
|
|
717
|
+
GGML_API int32_t ggml_get_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3);
|
718
|
+
GGML_API void ggml_set_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, int32_t value);
|
719
|
+
|
686
720
|
GGML_API float ggml_get_f32_1d(const struct ggml_tensor * tensor, int i);
|
687
721
|
GGML_API void ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value);
|
688
722
|
|
723
|
+
GGML_API float ggml_get_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3);
|
724
|
+
GGML_API void ggml_set_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, float value);
|
725
|
+
|
689
726
|
GGML_API void * ggml_get_data (const struct ggml_tensor * tensor);
|
690
727
|
GGML_API float * ggml_get_data_f32(const struct ggml_tensor * tensor);
|
691
728
|
|
@@ -719,6 +756,12 @@ extern "C" {
|
|
719
756
|
struct ggml_tensor * a,
|
720
757
|
struct ggml_tensor * b);
|
721
758
|
|
759
|
+
GGML_API struct ggml_tensor * ggml_add_cast(
|
760
|
+
struct ggml_context * ctx,
|
761
|
+
struct ggml_tensor * a,
|
762
|
+
struct ggml_tensor * b,
|
763
|
+
enum ggml_type type);
|
764
|
+
|
722
765
|
GGML_API struct ggml_tensor * ggml_add1(
|
723
766
|
struct ggml_context * ctx,
|
724
767
|
struct ggml_tensor * a,
|
@@ -828,6 +871,7 @@ extern "C" {
|
|
828
871
|
struct ggml_tensor * a,
|
829
872
|
struct ggml_tensor * b);
|
830
873
|
|
874
|
+
// sums repetitions in a into shape of b
|
831
875
|
GGML_API struct ggml_tensor * ggml_repeat_back(
|
832
876
|
struct ggml_context * ctx,
|
833
877
|
struct ggml_tensor * a,
|
@@ -1049,7 +1093,6 @@ extern "C" {
|
|
1049
1093
|
size_t nb1,
|
1050
1094
|
size_t offset);
|
1051
1095
|
|
1052
|
-
|
1053
1096
|
// a -> b, return view(b)
|
1054
1097
|
GGML_API struct ggml_tensor * ggml_cpy(
|
1055
1098
|
struct ggml_context * ctx,
|
@@ -1072,6 +1115,33 @@ extern "C" {
|
|
1072
1115
|
struct ggml_context * ctx,
|
1073
1116
|
struct ggml_tensor * a);
|
1074
1117
|
|
1118
|
+
// make contiguous, with new shape
|
1119
|
+
GGML_API struct ggml_tensor * ggml_cont_1d(
|
1120
|
+
struct ggml_context * ctx,
|
1121
|
+
struct ggml_tensor * a,
|
1122
|
+
int64_t ne0);
|
1123
|
+
|
1124
|
+
GGML_API struct ggml_tensor * ggml_cont_2d(
|
1125
|
+
struct ggml_context * ctx,
|
1126
|
+
struct ggml_tensor * a,
|
1127
|
+
int64_t ne0,
|
1128
|
+
int64_t ne1);
|
1129
|
+
|
1130
|
+
GGML_API struct ggml_tensor * ggml_cont_3d(
|
1131
|
+
struct ggml_context * ctx,
|
1132
|
+
struct ggml_tensor * a,
|
1133
|
+
int64_t ne0,
|
1134
|
+
int64_t ne1,
|
1135
|
+
int64_t ne2);
|
1136
|
+
|
1137
|
+
GGML_API struct ggml_tensor * ggml_cont_4d(
|
1138
|
+
struct ggml_context * ctx,
|
1139
|
+
struct ggml_tensor * a,
|
1140
|
+
int64_t ne0,
|
1141
|
+
int64_t ne1,
|
1142
|
+
int64_t ne2,
|
1143
|
+
int64_t ne3);
|
1144
|
+
|
1075
1145
|
// return view(a), b specifies the new shape
|
1076
1146
|
// TODO: when we start computing gradient, make a copy instead of view
|
1077
1147
|
GGML_API struct ggml_tensor * ggml_reshape(
|
@@ -1219,14 +1289,15 @@ extern "C" {
|
|
1219
1289
|
struct ggml_tensor * b);
|
1220
1290
|
|
1221
1291
|
// rotary position embedding
|
1222
|
-
// if mode & 1 == 1, skip n_past elements
|
1292
|
+
// if mode & 1 == 1, skip n_past elements (DEPRECATED)
|
1223
1293
|
// if mode & 2 == 1, GPT-NeoX style
|
1224
1294
|
// if mode & 4 == 1, ChatGLM style
|
1225
|
-
//
|
1295
|
+
//
|
1296
|
+
// b is an int32 vector with size a->ne[2], it contains the positions
|
1226
1297
|
GGML_API struct ggml_tensor * ggml_rope(
|
1227
1298
|
struct ggml_context * ctx,
|
1228
1299
|
struct ggml_tensor * a,
|
1229
|
-
|
1300
|
+
struct ggml_tensor * b,
|
1230
1301
|
int n_dims,
|
1231
1302
|
int mode,
|
1232
1303
|
int n_ctx);
|
@@ -1235,7 +1306,7 @@ extern "C" {
|
|
1235
1306
|
GGML_API struct ggml_tensor * ggml_rope_inplace(
|
1236
1307
|
struct ggml_context * ctx,
|
1237
1308
|
struct ggml_tensor * a,
|
1238
|
-
|
1309
|
+
struct ggml_tensor * b,
|
1239
1310
|
int n_dims,
|
1240
1311
|
int mode,
|
1241
1312
|
int n_ctx);
|
@@ -1244,7 +1315,7 @@ extern "C" {
|
|
1244
1315
|
GGML_API struct ggml_tensor * ggml_rope_custom(
|
1245
1316
|
struct ggml_context * ctx,
|
1246
1317
|
struct ggml_tensor * a,
|
1247
|
-
|
1318
|
+
struct ggml_tensor * b,
|
1248
1319
|
int n_dims,
|
1249
1320
|
int mode,
|
1250
1321
|
int n_ctx,
|
@@ -1255,7 +1326,7 @@ extern "C" {
|
|
1255
1326
|
GGML_API struct ggml_tensor * ggml_rope_custom_inplace(
|
1256
1327
|
struct ggml_context * ctx,
|
1257
1328
|
struct ggml_tensor * a,
|
1258
|
-
|
1329
|
+
struct ggml_tensor * b,
|
1259
1330
|
int n_dims,
|
1260
1331
|
int mode,
|
1261
1332
|
int n_ctx,
|
@@ -1266,7 +1337,7 @@ extern "C" {
|
|
1266
1337
|
GGML_API struct ggml_tensor * ggml_rope_xpos_inplace(
|
1267
1338
|
struct ggml_context * ctx,
|
1268
1339
|
struct ggml_tensor * a,
|
1269
|
-
|
1340
|
+
struct ggml_tensor * b,
|
1270
1341
|
int n_dims,
|
1271
1342
|
float base,
|
1272
1343
|
bool down);
|
@@ -1276,7 +1347,7 @@ extern "C" {
|
|
1276
1347
|
GGML_API struct ggml_tensor * ggml_rope_back(
|
1277
1348
|
struct ggml_context * ctx,
|
1278
1349
|
struct ggml_tensor * a,
|
1279
|
-
|
1350
|
+
struct ggml_tensor * b,
|
1280
1351
|
int n_dims,
|
1281
1352
|
int mode,
|
1282
1353
|
int n_ctx,
|
@@ -1319,6 +1390,14 @@ extern "C" {
|
|
1319
1390
|
int s,
|
1320
1391
|
int d);
|
1321
1392
|
|
1393
|
+
GGML_API struct ggml_tensor * ggml_conv_transpose_1d(
|
1394
|
+
struct ggml_context * ctx,
|
1395
|
+
struct ggml_tensor * a,
|
1396
|
+
struct ggml_tensor * b,
|
1397
|
+
int s0,
|
1398
|
+
int p0,
|
1399
|
+
int d0);
|
1400
|
+
|
1322
1401
|
GGML_API struct ggml_tensor * ggml_conv_2d(
|
1323
1402
|
struct ggml_context * ctx,
|
1324
1403
|
struct ggml_tensor * a,
|
@@ -1656,6 +1735,16 @@ extern "C" {
|
|
1656
1735
|
// dump the graph into a file using the dot format
|
1657
1736
|
GGML_API void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph * gf, const char * filename);
|
1658
1737
|
|
1738
|
+
// build gradient checkpointing backward graph gb for gf using provided checkpoints
|
1739
|
+
// gb_tmp will contain original backward graph with rewritten backward process nodes,
|
1740
|
+
// but without the second forward pass nodes.
|
1741
|
+
GGML_API void ggml_build_backward_gradient_checkpointing(
|
1742
|
+
struct ggml_context * ctx,
|
1743
|
+
struct ggml_cgraph * gf,
|
1744
|
+
struct ggml_cgraph * gb,
|
1745
|
+
struct ggml_cgraph * gb_tmp,
|
1746
|
+
struct ggml_tensor * * checkpoints,
|
1747
|
+
int n_checkpoints);
|
1659
1748
|
//
|
1660
1749
|
// optimization
|
1661
1750
|
//
|
@@ -1682,6 +1771,7 @@ extern "C" {
|
|
1682
1771
|
GGML_OPT_NO_CONTEXT,
|
1683
1772
|
GGML_OPT_INVALID_WOLFE,
|
1684
1773
|
GGML_OPT_FAIL,
|
1774
|
+
GGML_OPT_CANCEL,
|
1685
1775
|
|
1686
1776
|
GGML_LINESEARCH_FAIL = -128,
|
1687
1777
|
GGML_LINESEARCH_MINIMUM_STEP,
|
@@ -1690,7 +1780,8 @@ extern "C" {
|
|
1690
1780
|
GGML_LINESEARCH_INVALID_PARAMETERS,
|
1691
1781
|
};
|
1692
1782
|
|
1693
|
-
typedef void (*ggml_opt_callback)(void * data, float * sched);
|
1783
|
+
typedef void (*ggml_opt_callback)(void * data, int accum_step, float * sched, bool * cancel);
|
1784
|
+
typedef void (*ggml_log_callback)(enum ggml_log_level level, const char * text, void * user_data);
|
1694
1785
|
|
1695
1786
|
// optimization parameters
|
1696
1787
|
//
|
@@ -1721,6 +1812,8 @@ extern "C" {
|
|
1721
1812
|
bool print_forward_graph;
|
1722
1813
|
bool print_backward_graph;
|
1723
1814
|
|
1815
|
+
int n_gradient_accumulation;
|
1816
|
+
|
1724
1817
|
// ADAM parameters
|
1725
1818
|
struct {
|
1726
1819
|
int n_iter;
|
@@ -1766,6 +1859,7 @@ extern "C" {
|
|
1766
1859
|
float loss_after;
|
1767
1860
|
|
1768
1861
|
struct {
|
1862
|
+
struct ggml_tensor * g; // current gradient
|
1769
1863
|
struct ggml_tensor * m; // first moment
|
1770
1864
|
struct ggml_tensor * v; // second moment
|
1771
1865
|
struct ggml_tensor * pf; // past function values
|
@@ -1882,26 +1976,26 @@ extern "C" {
|
|
1882
1976
|
|
1883
1977
|
GGML_API int gguf_get_n_kv(const struct gguf_context * ctx);
|
1884
1978
|
GGML_API int gguf_find_key(const struct gguf_context * ctx, const char * key);
|
1885
|
-
GGML_API const char * gguf_get_key (const struct gguf_context * ctx, int i);
|
1886
|
-
|
1887
|
-
GGML_API enum gguf_type gguf_get_kv_type (const struct gguf_context * ctx, int i);
|
1888
|
-
GGML_API enum gguf_type gguf_get_arr_type(const struct gguf_context * ctx, int i);
|
1889
|
-
|
1890
|
-
// results are undefined if the wrong type is used for the key
|
1891
|
-
GGML_API uint8_t gguf_get_val_u8 (const struct gguf_context * ctx, int i);
|
1892
|
-
GGML_API int8_t gguf_get_val_i8 (const struct gguf_context * ctx, int i);
|
1893
|
-
GGML_API uint16_t gguf_get_val_u16 (const struct gguf_context * ctx, int i);
|
1894
|
-
GGML_API int16_t gguf_get_val_i16 (const struct gguf_context * ctx, int i);
|
1895
|
-
GGML_API uint32_t gguf_get_val_u32 (const struct gguf_context * ctx, int i);
|
1896
|
-
GGML_API int32_t gguf_get_val_i32 (const struct gguf_context * ctx, int i);
|
1897
|
-
GGML_API float gguf_get_val_f32 (const struct gguf_context * ctx, int i);
|
1898
|
-
GGML_API uint64_t gguf_get_val_u64 (const struct gguf_context * ctx, int i);
|
1899
|
-
GGML_API int64_t gguf_get_val_i64 (const struct gguf_context * ctx, int i);
|
1900
|
-
GGML_API double gguf_get_val_f64 (const struct gguf_context * ctx, int i);
|
1901
|
-
GGML_API bool gguf_get_val_bool(const struct gguf_context * ctx, int i);
|
1902
|
-
GGML_API const char * gguf_get_val_str (const struct gguf_context * ctx, int i);
|
1903
|
-
GGML_API int gguf_get_arr_n (const struct gguf_context * ctx, int i);
|
1904
|
-
GGML_API const void * gguf_get_arr_data(const struct gguf_context * ctx, int i);
|
1979
|
+
GGML_API const char * gguf_get_key (const struct gguf_context * ctx, int key_id);
|
1980
|
+
|
1981
|
+
GGML_API enum gguf_type gguf_get_kv_type (const struct gguf_context * ctx, int key_id);
|
1982
|
+
GGML_API enum gguf_type gguf_get_arr_type(const struct gguf_context * ctx, int key_id);
|
1983
|
+
|
1984
|
+
// will abort if the wrong type is used for the key
|
1985
|
+
GGML_API uint8_t gguf_get_val_u8 (const struct gguf_context * ctx, int key_id);
|
1986
|
+
GGML_API int8_t gguf_get_val_i8 (const struct gguf_context * ctx, int key_id);
|
1987
|
+
GGML_API uint16_t gguf_get_val_u16 (const struct gguf_context * ctx, int key_id);
|
1988
|
+
GGML_API int16_t gguf_get_val_i16 (const struct gguf_context * ctx, int key_id);
|
1989
|
+
GGML_API uint32_t gguf_get_val_u32 (const struct gguf_context * ctx, int key_id);
|
1990
|
+
GGML_API int32_t gguf_get_val_i32 (const struct gguf_context * ctx, int key_id);
|
1991
|
+
GGML_API float gguf_get_val_f32 (const struct gguf_context * ctx, int key_id);
|
1992
|
+
GGML_API uint64_t gguf_get_val_u64 (const struct gguf_context * ctx, int key_id);
|
1993
|
+
GGML_API int64_t gguf_get_val_i64 (const struct gguf_context * ctx, int key_id);
|
1994
|
+
GGML_API double gguf_get_val_f64 (const struct gguf_context * ctx, int key_id);
|
1995
|
+
GGML_API bool gguf_get_val_bool(const struct gguf_context * ctx, int key_id);
|
1996
|
+
GGML_API const char * gguf_get_val_str (const struct gguf_context * ctx, int key_id);
|
1997
|
+
GGML_API int gguf_get_arr_n (const struct gguf_context * ctx, int key_id);
|
1998
|
+
GGML_API const void * gguf_get_arr_data(const struct gguf_context * ctx, int key_id);
|
1905
1999
|
GGML_API const char * gguf_get_arr_str (const struct gguf_context * ctx, int key_id, int i);
|
1906
2000
|
|
1907
2001
|
GGML_API int gguf_get_n_tensors (const struct gguf_context * ctx);
|