llama_cpp 0.12.5 → 0.12.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +17 -0
- data/ext/llama_cpp/llama_cpp.cpp +67 -10
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +15 -1
- data/vendor/tmp/llama.cpp/Makefile +51 -12
- data/vendor/tmp/llama.cpp/ggml-alloc.c +595 -492
- data/vendor/tmp/llama.cpp/ggml-alloc.h +39 -65
- data/vendor/tmp/llama.cpp/ggml-backend.c +268 -271
- data/vendor/tmp/llama.cpp/ggml-backend.h +8 -12
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +560 -346
- data/vendor/tmp/llama.cpp/ggml-impl.h +20 -7
- data/vendor/tmp/llama.cpp/ggml-metal.m +101 -11
- data/vendor/tmp/llama.cpp/ggml-metal.metal +608 -9
- data/vendor/tmp/llama.cpp/ggml-quants.c +1255 -94
- data/vendor/tmp/llama.cpp/ggml-quants.h +39 -16
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +95 -264
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +213 -58
- data/vendor/tmp/llama.cpp/ggml.c +1082 -564
- data/vendor/tmp/llama.cpp/ggml.h +50 -17
- data/vendor/tmp/llama.cpp/llama.cpp +1329 -280
- data/vendor/tmp/llama.cpp/llama.h +43 -1
- data/vendor/tmp/llama.cpp/scripts/get-flags.mk +1 -1
- data/vendor/tmp/llama.cpp/unicode.h +42 -30
- metadata +2 -2
data/vendor/tmp/llama.cpp/ggml.h
CHANGED
```diff
@@ -315,13 +315,7 @@
 extern "C" {
 #endif
 
-#if defined(__ARM_NEON) && defined(__CUDACC__)
-    typedef half ggml_fp16_t;
-#elif defined(__ARM_NEON) && !defined(_MSC_VER)
-    typedef __fp16 ggml_fp16_t;
-#else
     typedef uint16_t ggml_fp16_t;
-#endif
 
     // convert FP16 <-> FP32
     GGML_API float       ggml_fp16_to_fp32(ggml_fp16_t x);
```
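With this change, `ggml_fp16_t` is always a plain `uint16_t` at the API boundary, so conversions must go through the public helpers. A minimal sketch, assuming only `ggml_fp16_to_fp32` (shown above) and its counterpart `ggml_fp32_to_fp16` from the same header:

```c
// Round-tripping a float through the public FP16 helpers now that
// ggml_fp16_t is an opaque uint16_t on every platform.
#include <stdio.h>
#include "ggml.h"

int main(void) {
    ggml_fp16_t h = ggml_fp32_to_fp16(3.14159f); // rounded to nearest representable half
    float       f = ggml_fp16_to_fp32(h);
    printf("fp32 -> fp16 -> fp32: %f\n", f);     // slightly off due to fp16 precision
    return 0;
}
```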
```diff
@@ -354,6 +348,8 @@ extern "C" {
         GGML_TYPE_IQ2_XXS = 16,
         GGML_TYPE_IQ2_XS  = 17,
         GGML_TYPE_IQ3_XXS = 18,
+        GGML_TYPE_IQ1_S   = 19,
+        GGML_TYPE_IQ4_NL  = 20,
         GGML_TYPE_I8,
         GGML_TYPE_I16,
         GGML_TYPE_I32,
```
```diff
@@ -391,6 +387,8 @@ extern "C" {
         GGML_FTYPE_MOSTLY_IQ2_XXS = 15, // except 1d tensors
         GGML_FTYPE_MOSTLY_IQ2_XS  = 16, // except 1d tensors
         GGML_FTYPE_MOSTLY_IQ3_XXS = 17, // except 1d tensors
+        GGML_FTYPE_MOSTLY_IQ1_S   = 18, // except 1d tensors
+        GGML_FTYPE_MOSTLY_IQ4_NL  = 19, // except 1d tensors
     };
 
     // available tensor operations:
```
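The two hunks above register the new IQ1_S and IQ4_NL quant types and their file-type counterparts. A hedged sketch of inspecting one of them through the type-traits table declared at the bottom of this header (field names follow `ggml_type_traits_t`):

```c
// Query the traits of the newly added IQ4_NL type.
#include <stdio.h>
#include "ggml.h"

int main(void) {
    ggml_type_traits_t t = ggml_internal_get_type_traits(GGML_TYPE_IQ4_NL);
    printf("%s: block size %d, quantized: %s\n",
           t.type_name, t.blck_size, t.is_quantized ? "yes" : "no");
    return 0;
}
```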
```diff
@@ -505,11 +503,17 @@ extern "C" {
 
     enum ggml_log_level {
         GGML_LOG_LEVEL_ERROR = 2,
-        GGML_LOG_LEVEL_WARN = 3,
-        GGML_LOG_LEVEL_INFO = 4,
+        GGML_LOG_LEVEL_WARN  = 3,
+        GGML_LOG_LEVEL_INFO  = 4,
         GGML_LOG_LEVEL_DEBUG = 5
     };
 
+    enum ggml_tensor_flag {
+        GGML_TENSOR_FLAG_INPUT  = 1,
+        GGML_TENSOR_FLAG_OUTPUT = 2,
+        GGML_TENSOR_FLAG_PARAM  = 4,
+    };
+
     // ggml object
     struct ggml_object {
         size_t offs;
```
```diff
@@ -543,7 +547,7 @@ extern "C" {
         // op params - allocated as int32_t for alignment
         int32_t op_params[GGML_MAX_OP_PARAMS / sizeof(int32_t)];
 
-        bool is_param;
+        int32_t flags;
 
         struct ggml_tensor * grad;
         struct ggml_tensor * src[GGML_MAX_SRC];
```
```diff
@@ -567,6 +571,11 @@ extern "C" {
 
     static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
 
+    // Abort callback
+    // If not NULL, called before ggml computation
+    // If it returns true, the computation is aborted
+    typedef bool (*ggml_abort_callback)(void * data);
+
     // the compute plan that needs to be prepared for ggml_graph_compute()
     // since https://github.com/ggerganov/ggml/issues/287
     struct ggml_cplan {
```
```diff
@@ -576,8 +585,8 @@ extern "C" {
         int n_threads;
 
         // abort ggml_graph_compute when true
-        bool (*abort_callback)(void * data);
-        void * abort_callback_data;
+        ggml_abort_callback abort_callback;
+        void *              abort_callback_data;
     };
 
     enum ggml_cgraph_eval_order {
```
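The ad-hoc function pointer in `ggml_cplan` is replaced by the named `ggml_abort_callback` typedef introduced above. A minimal sketch of wiring it up, assuming `ggml_graph_plan`/`ggml_graph_compute` as declared elsewhere in this header and a caller-built graph `gf`:

```c
// Cancel an in-flight graph computation via the new abort callback.
#include <stdbool.h>
#include "ggml.h"

static volatile bool g_stop = false; // e.g. flipped from a signal handler

// returning true aborts the computation
static bool my_abort_cb(void * data) {
    (void) data;
    return g_stop;
}

void compute_with_abort(struct ggml_cgraph * gf) {
    struct ggml_cplan plan = ggml_graph_plan(gf, /*n_threads =*/ 4);
    plan.abort_callback      = my_abort_cb;
    plan.abort_callback_data = NULL;
    // work buffer setup (plan.work_data) omitted for brevity
    ggml_graph_compute(gf, &plan);
}
```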
```diff
@@ -647,6 +656,16 @@ extern "C" {
         void * wdata;
     };
 
+    // numa strategies
+    enum ggml_numa_strategy {
+        GGML_NUMA_STRATEGY_DISABLED   = 0,
+        GGML_NUMA_STRATEGY_DISTRIBUTE = 1,
+        GGML_NUMA_STRATEGY_ISOLATE    = 2,
+        GGML_NUMA_STRATEGY_NUMACTL    = 3,
+        GGML_NUMA_STRATEGY_MIRROR     = 4,
+        GGML_NUMA_STRATEGY_COUNT
+    };
+
     // misc
 
     GGML_API void    ggml_time_init(void); // call this once at the beginning of the program
```
```diff
@@ -657,7 +676,7 @@ extern "C" {
 
     GGML_API void    ggml_print_backtrace(void);
 
-    GGML_API void    ggml_numa_init(void); // call once for better performance on NUMA systems
+    GGML_API void    ggml_numa_init(enum ggml_numa_strategy numa); // call once for better performance on NUMA systems
     GGML_API bool    ggml_is_numa(void); // true if init detected that system has >1 NUMA node
 
     GGML_API void    ggml_print_object (const struct ggml_object * obj);
```
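`ggml_numa_init` now takes one of the strategies defined in the previous hunk. A minimal sketch of opting in at startup, using only the declarations added here:

```c
// Enable a NUMA thread-placement strategy before any computation.
#include <stdio.h>
#include "ggml.h"

int main(void) {
    ggml_numa_init(GGML_NUMA_STRATEGY_DISTRIBUTE); // spread threads across nodes
    if (ggml_is_numa()) {
        printf("more than one NUMA node detected\n");
    }
    return 0;
}
```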
```diff
@@ -1362,13 +1381,17 @@ extern "C" {
             struct ggml_context * ctx,
             struct ggml_tensor  * a);
 
-    // fused soft_max(a*scale + mask)
+    // fused soft_max(a*scale + mask + pos[i]*(ALiBi slope))
     // mask is optional
+    // pos is required when max_bias > 0.0f
+    // max_bias = 0.0f for no ALiBi
     GGML_API struct ggml_tensor * ggml_soft_max_ext(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
             struct ggml_tensor  * mask,
-            float                 scale);
+            struct ggml_tensor  * pos,
+            float                 scale,
+            float                 max_bias);
 
     GGML_API struct ggml_tensor * ggml_soft_max_back(
             struct ggml_context * ctx,
```
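The fused softmax gains the `pos` tensor and `max_bias` parameter so ALiBi can be folded into the same kernel. A hedged sketch of a call with the new signature; shapes and constants are illustrative:

```c
// Fused scale + mask + ALiBi softmax via the extended API.
#include "ggml.h"

struct ggml_tensor * masked_alibi_softmax(struct ggml_context * ctx,
                                          struct ggml_tensor  * scores, // e.g. [n_kv, n_tokens]
                                          struct ggml_tensor  * mask,   // optional, may be NULL
                                          struct ggml_tensor  * pos) {  // required when max_bias > 0
    const float scale    = 1.0f / 8.0f; // e.g. 1/sqrt(head_dim) for head_dim = 64
    const float max_bias = 8.0f;        // > 0.0f enables the ALiBi term
    return ggml_soft_max_ext(ctx, scores, mask, pos, scale, max_bias);
}
```

This is also the replacement path for the standalone `ggml_alibi` op, which the next hunk deprecates.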
```diff
@@ -1470,12 +1493,13 @@ extern "C" {
 
     // alibi position embedding
     // in-place, returns view(a)
-    GGML_API struct ggml_tensor * ggml_alibi(
+    GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_alibi(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
             int                   n_past,
             int                   n_head,
-            float                 bias_max);
+            float                 bias_max),
+        "use ggml_soft_max_ext instead (will be removed in Mar 2024)");
 
     // clamp
     // in-place, returns view(a)
```
```diff
@@ -2087,6 +2111,12 @@ extern "C" {
             ggml_opt_callback callback,
             void * callback_data);
 
+    //
+    // tensor flags
+    //
+    GGML_API void ggml_set_input(struct ggml_tensor * tensor);
+    GGML_API void ggml_set_output(struct ggml_tensor * tensor);
+
     //
     // quantization
     //
```
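These setters are the public face of the `ggml_tensor_flag` bits and the new `flags` field added earlier in this diff. A minimal sketch of marking graph boundary tensors, assuming a context `ctx` prepared by the caller:

```c
// Flag the entry and exit tensors of a small graph.
#include "ggml.h"

struct ggml_tensor * build_example(struct ggml_context * ctx) {
    struct ggml_tensor * inp = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 16);
    ggml_set_input(inp);                           // sets GGML_TENSOR_FLAG_INPUT

    struct ggml_tensor * out = ggml_sqr(ctx, inp); // any op; square as a stand-in
    ggml_set_output(out);                          // sets GGML_TENSOR_FLAG_OUTPUT
    return out;
}
```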
```diff
@@ -2273,6 +2303,7 @@ extern "C" {
     GGML_API int ggml_cpu_has_ssse3     (void);
     GGML_API int ggml_cpu_has_sycl     (void);
     GGML_API int ggml_cpu_has_vsx      (void);
+    GGML_API int ggml_cpu_has_matmul_int8(void);
 
     //
     // Internal types and functions exposed for tests and benchmarks
```
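The new query mirrors the other `ggml_cpu_has_*` helpers and reports int8 matrix-multiply support (ARM I8MM on AArch64 builds). A tiny probe:

```c
// Report whether this build/CPU supports int8 matmul.
#include <stdio.h>
#include "ggml.h"

int main(void) {
    printf("MATMUL_INT8 = %d\n", ggml_cpu_has_matmul_int8());
    return 0;
}
```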
```diff
@@ -2286,7 +2317,8 @@ extern "C" {
 #endif
     typedef void (*ggml_to_float_t)  (const void  * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
     typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void  * GGML_RESTRICT y, int k);
-    typedef void (*ggml_vec_dot_t)   (const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT x, const void * GGML_RESTRICT y);
+    typedef void (*ggml_vec_dot_t)   (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x, size_t bx,
+                                      const void * GGML_RESTRICT y, size_t by, int nrc);
 
     typedef struct {
         const char * type_name;
```
```diff
@@ -2298,6 +2330,7 @@ extern "C" {
         ggml_from_float_t    from_float_reference;
         ggml_vec_dot_t       vec_dot;
         enum ggml_type       vec_dot_type;
+        int64_t              nrows; // number of rows to process simultaneously;
     } ggml_type_traits_t;
 
     GGML_API ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type);
```
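The widened `ggml_vec_dot_t` signature adds stride arguments (`bs`, `bx`, `by`) and `nrc`, the number of row pairs reduced per call, which is what the new `nrows` trait advertises. A hypothetical scalar F32 kernel matching the typedef's shape, under one plausible reading of the stride parameters (result stride in elements, input strides in bytes):

```c
// Hypothetical dot-product kernel with the new multi-row signature.
#include <stddef.h>

static void vec_dot_f32_example(int n, float * s, size_t bs,
                                const void * x, size_t bx,
                                const void * y, size_t by, int nrc) {
    for (int r = 0; r < nrc; ++r) {
        // step to the r-th input row using the byte strides
        const float * xr = (const float *)((const char *) x + r*bx);
        const float * yr = (const float *)((const char *) y + r*by);
        float sum = 0.0f;
        for (int i = 0; i < n; ++i) {
            sum += xr[i] * yr[i];
        }
        s[r*bs] = sum; // one accumulated result per processed row
    }
}
```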