llama_cpp 0.12.6 → 0.12.7
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +10 -0
- data/ext/llama_cpp/llama_cpp.cpp +21 -10
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +8 -1
- data/vendor/tmp/llama.cpp/Makefile +43 -12
- data/vendor/tmp/llama.cpp/ggml-alloc.c +73 -43
- data/vendor/tmp/llama.cpp/ggml-backend.c +18 -9
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +560 -346
- data/vendor/tmp/llama.cpp/ggml-impl.h +20 -7
- data/vendor/tmp/llama.cpp/ggml-metal.m +99 -11
- data/vendor/tmp/llama.cpp/ggml-metal.metal +608 -9
- data/vendor/tmp/llama.cpp/ggml-quants.c +908 -54
- data/vendor/tmp/llama.cpp/ggml-quants.h +25 -2
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +81 -203
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +124 -52
- data/vendor/tmp/llama.cpp/ggml.c +948 -504
- data/vendor/tmp/llama.cpp/ggml.h +24 -11
- data/vendor/tmp/llama.cpp/llama.cpp +688 -163
- data/vendor/tmp/llama.cpp/llama.h +37 -1
- data/vendor/tmp/llama.cpp/scripts/get-flags.mk +1 -1
- metadata +2 -2
data/vendor/tmp/llama.cpp/ggml.h
CHANGED
@@ -315,13 +315,7 @@
|
|
315
315
|
extern "C" {
|
316
316
|
#endif
|
317
317
|
|
318
|
-
#if defined(__ARM_NEON) && defined(__CUDACC__)
|
319
|
-
typedef half ggml_fp16_t;
|
320
|
-
#elif defined(__ARM_NEON) && !defined(_MSC_VER)
|
321
|
-
typedef __fp16 ggml_fp16_t;
|
322
|
-
#else
|
323
318
|
typedef uint16_t ggml_fp16_t;
|
324
|
-
#endif
|
325
319
|
|
326
320
|
// convert FP16 <-> FP32
|
327
321
|
GGML_API float ggml_fp16_to_fp32(ggml_fp16_t x);
|
@@ -354,6 +348,8 @@ extern "C" {
|
|
354
348
|
GGML_TYPE_IQ2_XXS = 16,
|
355
349
|
GGML_TYPE_IQ2_XS = 17,
|
356
350
|
GGML_TYPE_IQ3_XXS = 18,
|
351
|
+
GGML_TYPE_IQ1_S = 19,
|
352
|
+
GGML_TYPE_IQ4_NL = 20,
|
357
353
|
GGML_TYPE_I8,
|
358
354
|
GGML_TYPE_I16,
|
359
355
|
GGML_TYPE_I32,
|
@@ -391,6 +387,8 @@ extern "C" {
|
|
391
387
|
GGML_FTYPE_MOSTLY_IQ2_XXS = 15, // except 1d tensors
|
392
388
|
GGML_FTYPE_MOSTLY_IQ2_XS = 16, // except 1d tensors
|
393
389
|
GGML_FTYPE_MOSTLY_IQ3_XXS = 17, // except 1d tensors
|
390
|
+
GGML_FTYPE_MOSTLY_IQ1_S = 18, // except 1d tensors
|
391
|
+
GGML_FTYPE_MOSTLY_IQ4_NL = 19, // except 1d tensors
|
394
392
|
};
|
395
393
|
|
396
394
|
// available tensor operations:
|
@@ -658,6 +656,16 @@ extern "C" {
|
|
658
656
|
void * wdata;
|
659
657
|
};
|
660
658
|
|
659
|
+
// numa strategies
|
660
|
+
enum ggml_numa_strategy {
|
661
|
+
GGML_NUMA_STRATEGY_DISABLED = 0,
|
662
|
+
GGML_NUMA_STRATEGY_DISTRIBUTE = 1,
|
663
|
+
GGML_NUMA_STRATEGY_ISOLATE = 2,
|
664
|
+
GGML_NUMA_STRATEGY_NUMACTL = 3,
|
665
|
+
GGML_NUMA_STRATEGY_MIRROR = 4,
|
666
|
+
GGML_NUMA_STRATEGY_COUNT
|
667
|
+
};
|
668
|
+
|
661
669
|
// misc
|
662
670
|
|
663
671
|
GGML_API void ggml_time_init(void); // call this once at the beginning of the program
|
@@ -668,7 +676,7 @@ extern "C" {
|
|
668
676
|
|
669
677
|
GGML_API void ggml_print_backtrace(void);
|
670
678
|
|
671
|
-
GGML_API void ggml_numa_init(void); // call once for better performance on NUMA systems
|
679
|
+
GGML_API void ggml_numa_init(enum ggml_numa_strategy numa); // call once for better performance on NUMA systems
|
672
680
|
GGML_API bool ggml_is_numa(void); // true if init detected that system has >1 NUMA node
|
673
681
|
|
674
682
|
GGML_API void ggml_print_object (const struct ggml_object * obj);
|
@@ -1373,13 +1381,17 @@ extern "C" {
|
|
1373
1381
|
struct ggml_context * ctx,
|
1374
1382
|
struct ggml_tensor * a);
|
1375
1383
|
|
1376
|
-
// fused soft_max(a*scale + mask)
|
1384
|
+
// fused soft_max(a*scale + mask + pos[i]*(ALiBi slope))
|
1377
1385
|
// mask is optional
|
1386
|
+
// pos is required when max_bias > 0.0f
|
1387
|
+
// max_bias = 0.0f for no ALiBi
|
1378
1388
|
GGML_API struct ggml_tensor * ggml_soft_max_ext(
|
1379
1389
|
struct ggml_context * ctx,
|
1380
1390
|
struct ggml_tensor * a,
|
1381
1391
|
struct ggml_tensor * mask,
|
1382
|
-
float scale);
|
1392
|
+
struct ggml_tensor * pos,
|
1393
|
+
float scale,
|
1394
|
+
float max_bias);
|
1383
1395
|
|
1384
1396
|
GGML_API struct ggml_tensor * ggml_soft_max_back(
|
1385
1397
|
struct ggml_context * ctx,
|
@@ -1481,12 +1493,13 @@ extern "C" {
|
|
1481
1493
|
|
1482
1494
|
// alibi position embedding
|
1483
1495
|
// in-place, returns view(a)
|
1484
|
-
GGML_API struct ggml_tensor * ggml_alibi(
|
1496
|
+
GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_alibi(
|
1485
1497
|
struct ggml_context * ctx,
|
1486
1498
|
struct ggml_tensor * a,
|
1487
1499
|
int n_past,
|
1488
1500
|
int n_head,
|
1489
|
-
float bias_max)
|
1501
|
+
float bias_max),
|
1502
|
+
"use ggml_soft_max_ext instead (will be removed in Mar 2024)");
|
1490
1503
|
|
1491
1504
|
// clamp
|
1492
1505
|
// in-place, returns view(a)
|