llama_cpp 0.12.6 → 0.12.7
This diff covers publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
- checksums.yaml +4 -4
- data/CHANGELOG.md +10 -0
- data/ext/llama_cpp/llama_cpp.cpp +21 -10
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +8 -1
- data/vendor/tmp/llama.cpp/Makefile +43 -12
- data/vendor/tmp/llama.cpp/ggml-alloc.c +73 -43
- data/vendor/tmp/llama.cpp/ggml-backend.c +18 -9
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +560 -346
- data/vendor/tmp/llama.cpp/ggml-impl.h +20 -7
- data/vendor/tmp/llama.cpp/ggml-metal.m +99 -11
- data/vendor/tmp/llama.cpp/ggml-metal.metal +608 -9
- data/vendor/tmp/llama.cpp/ggml-quants.c +908 -54
- data/vendor/tmp/llama.cpp/ggml-quants.h +25 -2
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +81 -203
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +124 -52
- data/vendor/tmp/llama.cpp/ggml.c +948 -504
- data/vendor/tmp/llama.cpp/ggml.h +24 -11
- data/vendor/tmp/llama.cpp/llama.cpp +688 -163
- data/vendor/tmp/llama.cpp/llama.h +37 -1
- data/vendor/tmp/llama.cpp/scripts/get-flags.mk +1 -1
- metadata +2 -2
data/vendor/tmp/llama.cpp/ggml.h
CHANGED
@@ -315,13 +315,7 @@
 extern "C" {
 #endif
 
-#if defined(__ARM_NEON) && defined(__CUDACC__)
-typedef half ggml_fp16_t;
-#elif defined(__ARM_NEON) && !defined(_MSC_VER)
-typedef __fp16 ggml_fp16_t;
-#else
 typedef uint16_t ggml_fp16_t;
-#endif
 
 // convert FP16 <-> FP32
 GGML_API float ggml_fp16_to_fp32(ggml_fp16_t x);
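With the platform-specific `half`/`__fp16` branches removed, `ggml_fp16_t` is plain 16-bit storage on every platform, and callers go through the conversion API rather than casting to a native half type. A minimal round-trip sketch (the value is arbitrary):

```c
#include <stdio.h>
#include "ggml.h"

int main(void) {
    // ggml_fp16_t is now uint16_t everywhere, so convert explicitly.
    float x = 3.14159f;
    ggml_fp16_t h = ggml_fp32_to_fp16(x); // encode to IEEE-754 half bits
    float y = ggml_fp16_to_fp32(h);       // decode back to float
    printf("%f -> 0x%04x -> %f\n", x, (unsigned) h, y);
    return 0;
}
```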
@@ -354,6 +348,8 @@ extern "C" {
         GGML_TYPE_IQ2_XXS = 16,
         GGML_TYPE_IQ2_XS  = 17,
         GGML_TYPE_IQ3_XXS = 18,
+        GGML_TYPE_IQ1_S   = 19,
+        GGML_TYPE_IQ4_NL  = 20,
         GGML_TYPE_I8,
         GGML_TYPE_I16,
         GGML_TYPE_I32,
@@ -391,6 +387,8 @@ extern "C" {
         GGML_FTYPE_MOSTLY_IQ2_XXS = 15, // except 1d tensors
         GGML_FTYPE_MOSTLY_IQ2_XS  = 16, // except 1d tensors
         GGML_FTYPE_MOSTLY_IQ3_XXS = 17, // except 1d tensors
+        GGML_FTYPE_MOSTLY_IQ1_S   = 18, // except 1d tensors
+        GGML_FTYPE_MOSTLY_IQ4_NL  = 19, // except 1d tensors
     };
 
     // available tensor operations:
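The tensor-type and file-type enums move in lockstep: each new quantization gains both a `GGML_TYPE_*` entry and a `GGML_FTYPE_MOSTLY_*` entry. A hypothetical helper (not part of ggml.h) illustrating the pairing for the two new quantizations:

```c
#include "ggml.h"

// Hypothetical helper, not part of the ggml API: map the two
// quantization types added in this release to their model-level
// ftypes; everything else falls through to GGML_FTYPE_UNKNOWN.
static enum ggml_ftype iq_type_to_ftype(enum ggml_type type) {
    switch (type) {
        case GGML_TYPE_IQ1_S:  return GGML_FTYPE_MOSTLY_IQ1_S;
        case GGML_TYPE_IQ4_NL: return GGML_FTYPE_MOSTLY_IQ4_NL;
        default:               return GGML_FTYPE_UNKNOWN;
    }
}
```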
@@ -658,6 +656,16 @@ extern "C" {
         void * wdata;
     };
 
+    // numa strategies
+    enum ggml_numa_strategy {
+        GGML_NUMA_STRATEGY_DISABLED   = 0,
+        GGML_NUMA_STRATEGY_DISTRIBUTE = 1,
+        GGML_NUMA_STRATEGY_ISOLATE    = 2,
+        GGML_NUMA_STRATEGY_NUMACTL    = 3,
+        GGML_NUMA_STRATEGY_MIRROR     = 4,
+        GGML_NUMA_STRATEGY_COUNT
+    };
+
     // misc
 
     GGML_API void ggml_time_init(void); // call this once at the beginning of the program
@@ -668,7 +676,7 @@ extern "C" {
 
     GGML_API void ggml_print_backtrace(void);
 
-    GGML_API void ggml_numa_init(void); // call once for better performance on NUMA systems
+    GGML_API void ggml_numa_init(enum ggml_numa_strategy numa); // call once for better performance on NUMA systems
     GGML_API bool ggml_is_numa(void); // true if init detected that system has >1 NUMA node
 
     GGML_API void ggml_print_object (const struct ggml_object * obj);
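Together with the new `ggml_numa_strategy` enum above, initialization now names a placement policy explicitly instead of taking no arguments. A minimal sketch, assuming a multi-socket host where distributing pages across nodes is the desired policy:

```c
#include <stdio.h>
#include "ggml.h"

int main(void) {
    // The no-argument form is gone; pass a strategy explicitly.
    // GGML_NUMA_STRATEGY_DISABLED opts out of NUMA handling entirely.
    ggml_numa_init(GGML_NUMA_STRATEGY_DISTRIBUTE);

    if (ggml_is_numa()) {
        printf("more than one NUMA node detected\n");
    }
    return 0;
}
```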
@@ -1373,13 +1381,17 @@ extern "C" {
             struct ggml_context * ctx,
             struct ggml_tensor  * a);
 
-    // fused soft_max(a*scale + mask)
+    // fused soft_max(a*scale + mask + pos[i]*(ALiBi slope))
     // mask is optional
+    // pos is required when max_bias > 0.0f
+    // max_bias = 0.0f for no ALiBi
     GGML_API struct ggml_tensor * ggml_soft_max_ext(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
             struct ggml_tensor  * mask,
-            float                 scale);
+            struct ggml_tensor  * pos,
+            float                 scale,
+            float                 max_bias);
 
     GGML_API struct ggml_tensor * ggml_soft_max_back(
             struct ggml_context * ctx,
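A sketch of the extended call under assumed names: `kq` stands for an attention-score tensor, and `mask` and `pos` come from the caller; none of these identifiers appear in the diff itself. The comments restate the contract from the header:

```c
#include "ggml.h"

// Wrap the extended fused softmax. mask stays optional; pos may be
// NULL only when max_bias == 0.0f (i.e. ALiBi disabled).
static struct ggml_tensor * soft_max_with_alibi(
        struct ggml_context * ctx,
        struct ggml_tensor  * kq,
        struct ggml_tensor  * mask,
        struct ggml_tensor  * pos,
        float                 scale,
        float                 max_bias) {
    return ggml_soft_max_ext(ctx, kq, mask, pos, scale, max_bias);
}
```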
@@ -1481,12 +1493,13 @@ extern "C" {
 
     // alibi position embedding
     // in-place, returns view(a)
-    GGML_API struct ggml_tensor * ggml_alibi(
+    GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_alibi(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
             int                   n_past,
             int                   n_head,
-            float                 bias_max);
+            float                 bias_max),
+        "use ggml_soft_max_ext instead (will be removed in Mar 2024)");
 
     // clamp
     // in-place, returns view(a)
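For callers migrating off the deprecated `ggml_alibi`, the deprecation message points at `ggml_soft_max_ext`: the positional bias is folded into the fused softmax. A hedged before/after sketch; the variable names are illustrative, not taken from the diff:

```c
// Before (deprecated, removal announced for Mar 2024):
//   kq = ggml_alibi(ctx, kq, n_past, n_head, bias_max);
//   kq = ggml_soft_max(ctx, kq);
//
// After: one fused call; pos supplies the positions that ggml_alibi
// previously derived internally, and max_bias takes the role of bias_max.
//   kq = ggml_soft_max_ext(ctx, kq, mask, pos, scale, bias_max);
```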