llama_cpp 0.12.6 → 0.13.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +21 -0
- data/ext/llama_cpp/llama_cpp.cpp +90 -269
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +28 -23
- data/vendor/tmp/llama.cpp/Makefile +51 -15
- data/vendor/tmp/llama.cpp/ggml-alloc.c +73 -43
- data/vendor/tmp/llama.cpp/ggml-backend-impl.h +2 -0
- data/vendor/tmp/llama.cpp/ggml-backend.c +32 -11
- data/vendor/tmp/llama.cpp/ggml-backend.h +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +560 -346
- data/vendor/tmp/llama.cpp/ggml-impl.h +20 -7
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +7 -1
- data/vendor/tmp/llama.cpp/ggml-metal.m +191 -22
- data/vendor/tmp/llama.cpp/ggml-metal.metal +2472 -862
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +25 -25
- data/vendor/tmp/llama.cpp/ggml-quants.c +3176 -667
- data/vendor/tmp/llama.cpp/ggml-quants.h +77 -2
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +373 -424
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +186 -102
- data/vendor/tmp/llama.cpp/ggml.c +1266 -699
- data/vendor/tmp/llama.cpp/ggml.h +59 -30
- data/vendor/tmp/llama.cpp/llama.cpp +1517 -717
- data/vendor/tmp/llama.cpp/llama.h +87 -63
- data/vendor/tmp/llama.cpp/scripts/get-flags.mk +1 -1
- data/vendor/tmp/llama.cpp/unicode.h +310 -1
- metadata +2 -2
data/vendor/tmp/llama.cpp/ggml.h
CHANGED
|
@@ -315,13 +315,7 @@
|
|
|
315
315
|
extern "C" {
|
|
316
316
|
#endif
|
|
317
317
|
|
|
318
|
-
#if defined(__ARM_NEON) && defined(__CUDACC__)
|
|
319
|
-
typedef half ggml_fp16_t;
|
|
320
|
-
#elif defined(__ARM_NEON) && !defined(_MSC_VER)
|
|
321
|
-
typedef __fp16 ggml_fp16_t;
|
|
322
|
-
#else
|
|
323
318
|
typedef uint16_t ggml_fp16_t;
|
|
324
|
-
#endif
|
|
325
319
|
|
|
326
320
|
// convert FP16 <-> FP32
|
|
327
321
|
GGML_API float ggml_fp16_to_fp32(ggml_fp16_t x);
|
|
@@ -354,6 +348,11 @@ extern "C" {
|
|
|
354
348
|
GGML_TYPE_IQ2_XXS = 16,
|
|
355
349
|
GGML_TYPE_IQ2_XS = 17,
|
|
356
350
|
GGML_TYPE_IQ3_XXS = 18,
|
|
351
|
+
GGML_TYPE_IQ1_S = 19,
|
|
352
|
+
GGML_TYPE_IQ4_NL = 20,
|
|
353
|
+
GGML_TYPE_IQ3_S = 21,
|
|
354
|
+
GGML_TYPE_IQ2_S = 22,
|
|
355
|
+
GGML_TYPE_IQ4_XS = 23,
|
|
357
356
|
GGML_TYPE_I8,
|
|
358
357
|
GGML_TYPE_I16,
|
|
359
358
|
GGML_TYPE_I32,
|
|
@@ -367,9 +366,9 @@ extern "C" {
|
|
|
367
366
|
};
|
|
368
367
|
|
|
369
368
|
enum ggml_backend_type {
|
|
370
|
-
|
|
371
|
-
|
|
372
|
-
|
|
369
|
+
GGML_BACKEND_TYPE_CPU = 0,
|
|
370
|
+
GGML_BACKEND_TYPE_GPU = 10,
|
|
371
|
+
GGML_BACKEND_TYPE_GPU_SPLIT = 20,
|
|
373
372
|
};
|
|
374
373
|
|
|
375
374
|
// model file types
|
|
@@ -391,6 +390,11 @@ extern "C" {
|
|
|
391
390
|
GGML_FTYPE_MOSTLY_IQ2_XXS = 15, // except 1d tensors
|
|
392
391
|
GGML_FTYPE_MOSTLY_IQ2_XS = 16, // except 1d tensors
|
|
393
392
|
GGML_FTYPE_MOSTLY_IQ3_XXS = 17, // except 1d tensors
|
|
393
|
+
GGML_FTYPE_MOSTLY_IQ1_S = 18, // except 1d tensors
|
|
394
|
+
GGML_FTYPE_MOSTLY_IQ4_NL = 19, // except 1d tensors
|
|
395
|
+
GGML_FTYPE_MOSTLY_IQ3_S = 20, // except 1d tensors
|
|
396
|
+
GGML_FTYPE_MOSTLY_IQ2_S = 21, // except 1d tensors
|
|
397
|
+
GGML_FTYPE_MOSTLY_IQ4_XS = 22, // except 1d tensors
|
|
394
398
|
};
|
|
395
399
|
|
|
396
400
|
// available tensor operations:
|
|
@@ -498,9 +502,9 @@ extern "C" {
|
|
|
498
502
|
};
|
|
499
503
|
|
|
500
504
|
enum ggml_object_type {
|
|
501
|
-
|
|
502
|
-
|
|
503
|
-
|
|
505
|
+
GGML_OBJECT_TYPE_TENSOR,
|
|
506
|
+
GGML_OBJECT_TYPE_GRAPH,
|
|
507
|
+
GGML_OBJECT_TYPE_WORK_BUFFER
|
|
504
508
|
};
|
|
505
509
|
|
|
506
510
|
enum ggml_log_level {
|
|
@@ -642,9 +646,9 @@ extern "C" {
|
|
|
642
646
|
// NOTE: the INIT or FINALIZE pass is not scheduled unless explicitly enabled.
|
|
643
647
|
// This behavior was changed since https://github.com/ggerganov/llama.cpp/pull/1995.
|
|
644
648
|
enum ggml_task_type {
|
|
645
|
-
|
|
646
|
-
|
|
647
|
-
|
|
649
|
+
GGML_TASK_TYPE_INIT = 0,
|
|
650
|
+
GGML_TASK_TYPE_COMPUTE,
|
|
651
|
+
GGML_TASK_TYPE_FINALIZE,
|
|
648
652
|
};
|
|
649
653
|
|
|
650
654
|
struct ggml_compute_params {
|
|
@@ -658,6 +662,26 @@ extern "C" {
|
|
|
658
662
|
void * wdata;
|
|
659
663
|
};
|
|
660
664
|
|
|
665
|
+
// numa strategies
|
|
666
|
+
enum ggml_numa_strategy {
|
|
667
|
+
GGML_NUMA_STRATEGY_DISABLED = 0,
|
|
668
|
+
GGML_NUMA_STRATEGY_DISTRIBUTE = 1,
|
|
669
|
+
GGML_NUMA_STRATEGY_ISOLATE = 2,
|
|
670
|
+
GGML_NUMA_STRATEGY_NUMACTL = 3,
|
|
671
|
+
GGML_NUMA_STRATEGY_MIRROR = 4,
|
|
672
|
+
GGML_NUMA_STRATEGY_COUNT
|
|
673
|
+
};
|
|
674
|
+
|
|
675
|
+
//
|
|
676
|
+
// GUID
|
|
677
|
+
//
|
|
678
|
+
|
|
679
|
+
// GUID types
|
|
680
|
+
typedef uint8_t ggml_guid[16];
|
|
681
|
+
typedef ggml_guid * ggml_guid_t;
|
|
682
|
+
|
|
683
|
+
GGML_API bool ggml_guid_matches(ggml_guid_t guid_a, ggml_guid_t guid_b);
|
|
684
|
+
|
|
661
685
|
// misc
|
|
662
686
|
|
|
663
687
|
GGML_API void ggml_time_init(void); // call this once at the beginning of the program
|
|
@@ -668,7 +692,7 @@ extern "C" {
|
|
|
668
692
|
|
|
669
693
|
GGML_API void ggml_print_backtrace(void);
|
|
670
694
|
|
|
671
|
-
GGML_API void ggml_numa_init(void); // call once for better performance on NUMA systems
|
|
695
|
+
GGML_API void ggml_numa_init(enum ggml_numa_strategy numa); // call once for better performance on NUMA systems
|
|
672
696
|
GGML_API bool ggml_is_numa(void); // true if init detected that system has >1 NUMA node
|
|
673
697
|
|
|
674
698
|
GGML_API void ggml_print_object (const struct ggml_object * obj);
|
|
@@ -1373,13 +1397,17 @@ extern "C" {
|
|
|
1373
1397
|
struct ggml_context * ctx,
|
|
1374
1398
|
struct ggml_tensor * a);
|
|
1375
1399
|
|
|
1376
|
-
// fused soft_max(a*scale + mask)
|
|
1400
|
+
// fused soft_max(a*scale + mask + pos[i]*(ALiBi slope))
|
|
1377
1401
|
// mask is optional
|
|
1402
|
+
// pos is required when max_bias > 0.0f
|
|
1403
|
+
// max_bias = 0.0f for no ALiBi
|
|
1378
1404
|
GGML_API struct ggml_tensor * ggml_soft_max_ext(
|
|
1379
1405
|
struct ggml_context * ctx,
|
|
1380
1406
|
struct ggml_tensor * a,
|
|
1381
1407
|
struct ggml_tensor * mask,
|
|
1382
|
-
|
|
1408
|
+
struct ggml_tensor * pos,
|
|
1409
|
+
float scale,
|
|
1410
|
+
float max_bias);
|
|
1383
1411
|
|
|
1384
1412
|
GGML_API struct ggml_tensor * ggml_soft_max_back(
|
|
1385
1413
|
struct ggml_context * ctx,
|
|
@@ -1481,12 +1509,13 @@ extern "C" {
|
|
|
1481
1509
|
|
|
1482
1510
|
// alibi position embedding
|
|
1483
1511
|
// in-place, returns view(a)
|
|
1484
|
-
GGML_API struct ggml_tensor * ggml_alibi(
|
|
1512
|
+
GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_alibi(
|
|
1485
1513
|
struct ggml_context * ctx,
|
|
1486
1514
|
struct ggml_tensor * a,
|
|
1487
1515
|
int n_past,
|
|
1488
1516
|
int n_head,
|
|
1489
|
-
float bias_max)
|
|
1517
|
+
float bias_max),
|
|
1518
|
+
"use ggml_soft_max_ext instead (will be removed in Mar 2024)");
|
|
1490
1519
|
|
|
1491
1520
|
// clamp
|
|
1492
1521
|
// in-place, returns view(a)
|
|
@@ -1634,8 +1663,8 @@ extern "C" {
|
|
|
1634
1663
|
|
|
1635
1664
|
// sort rows
|
|
1636
1665
|
enum ggml_sort_order {
|
|
1637
|
-
|
|
1638
|
-
|
|
1666
|
+
GGML_SORT_ORDER_ASC,
|
|
1667
|
+
GGML_SORT_ORDER_DESC,
|
|
1639
1668
|
};
|
|
1640
1669
|
|
|
1641
1670
|
GGML_API struct ggml_tensor * ggml_argsort(
|
|
@@ -1928,8 +1957,8 @@ extern "C" {
|
|
|
1928
1957
|
|
|
1929
1958
|
// optimization methods
|
|
1930
1959
|
enum ggml_opt_type {
|
|
1931
|
-
|
|
1932
|
-
|
|
1960
|
+
GGML_OPT_TYPE_ADAM,
|
|
1961
|
+
GGML_OPT_TYPE_LBFGS,
|
|
1933
1962
|
};
|
|
1934
1963
|
|
|
1935
1964
|
// linesearch methods
|
|
@@ -1943,12 +1972,12 @@ extern "C" {
|
|
|
1943
1972
|
|
|
1944
1973
|
// optimization return values
|
|
1945
1974
|
enum ggml_opt_result {
|
|
1946
|
-
|
|
1947
|
-
|
|
1948
|
-
|
|
1949
|
-
|
|
1950
|
-
|
|
1951
|
-
|
|
1975
|
+
GGML_OPT_RESULT_OK = 0,
|
|
1976
|
+
GGML_OPT_RESULT_DID_NOT_CONVERGE,
|
|
1977
|
+
GGML_OPT_RESULT_NO_CONTEXT,
|
|
1978
|
+
GGML_OPT_RESULT_INVALID_WOLFE,
|
|
1979
|
+
GGML_OPT_RESULT_FAIL,
|
|
1980
|
+
GGML_OPT_RESULT_CANCEL,
|
|
1952
1981
|
|
|
1953
1982
|
GGML_LINESEARCH_FAIL = -128,
|
|
1954
1983
|
GGML_LINESEARCH_MINIMUM_STEP,
|