llama_cpp 0.14.2 → 0.14.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +14 -0
- data/ext/llama_cpp/llama_cpp.cpp +64 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +6 -0
- data/vendor/tmp/llama.cpp/Makefile +91 -21
- data/vendor/tmp/llama.cpp/ggml-alloc.c +14 -5
- data/vendor/tmp/llama.cpp/ggml-backend-impl.h +5 -0
- data/vendor/tmp/llama.cpp/ggml-backend.c +155 -125
- data/vendor/tmp/llama.cpp/ggml-backend.h +4 -4
- data/vendor/tmp/llama.cpp/ggml-common.h +25 -2
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +1779 -10762
- data/vendor/tmp/llama.cpp/ggml-cuda.h +6 -15
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +5 -0
- data/vendor/tmp/llama.cpp/ggml-metal.m +167 -124
- data/vendor/tmp/llama.cpp/ggml-metal.metal +603 -303
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +5 -0
- data/vendor/tmp/llama.cpp/ggml-quants.c +663 -56
- data/vendor/tmp/llama.cpp/ggml-quants.h +3 -0
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +341 -469
- data/vendor/tmp/llama.cpp/ggml-sycl.h +19 -4
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +37199 -14939
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +335 -307
- data/vendor/tmp/llama.cpp/ggml-vulkan.h +0 -11
- data/vendor/tmp/llama.cpp/ggml.c +229 -107
- data/vendor/tmp/llama.cpp/ggml.h +11 -5
- data/vendor/tmp/llama.cpp/llama.cpp +2136 -464
- data/vendor/tmp/llama.cpp/llama.h +86 -23
- data/vendor/tmp/llama.cpp/unicode-data.cpp +1651 -0
- data/vendor/tmp/llama.cpp/unicode-data.h +16 -0
- data/vendor/tmp/llama.cpp/unicode.cpp +8 -1403
- data/vendor/tmp/llama.cpp/unicode.h +2 -0
- metadata +5 -3
data/vendor/tmp/llama.cpp/ggml-vulkan.h
CHANGED
@@ -11,17 +11,6 @@ extern "C" {
 #define GGML_VK_MAX_DEVICES 16
 
 GGML_API void ggml_vk_instance_init(void);
-GGML_API void ggml_vk_init_cpu_assist(void);
-
-GGML_API void ggml_vk_preallocate_buffers_graph_cpu_assist(struct ggml_tensor * node);
-GGML_API void ggml_vk_preallocate_buffers_cpu_assist(void);
-GGML_API void ggml_vk_build_graph_cpu_assist(struct ggml_tensor * node, bool last_node);
-GGML_API bool ggml_vk_compute_forward_cpu_assist(struct ggml_compute_params * params, struct ggml_tensor * tensor);
-#ifdef GGML_VULKAN_CHECK_RESULTS
-void ggml_vk_check_results_1_cpu_assist(struct ggml_compute_params * params, struct ggml_tensor * tensor);
-#endif
-GGML_API void ggml_vk_graph_cleanup_cpu_assist(void);
-GGML_API void ggml_vk_free_cpu_assist(void);
 
 // backend API
 GGML_API GGML_CALL ggml_backend_t ggml_backend_vk_init(size_t dev_num);
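Note: this release drops the Vulkan "cpu assist" entry points; the Vulkan backend is now driven entirely through the generic ggml-backend API. A minimal sketch of that path, assuming the vendored ggml-backend.h and ggml-vulkan.h headers:

    #include "ggml-backend.h"
    #include "ggml-vulkan.h"

    void example_vk_backend(void) {
        ggml_backend_t backend = ggml_backend_vk_init(0); // device 0
        if (backend != NULL) {
            // ... build a graph, then ggml_backend_graph_compute(backend, graph) ...
            ggml_backend_free(backend);
        }
    }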
data/vendor/tmp/llama.cpp/ggml.c
CHANGED
@@ -3,6 +3,7 @@
 
 #include "ggml-impl.h"
 #include "ggml-quants.h"
+#include "ggml.h"
 
 #if defined(_MSC_VER) || defined(__MINGW32__)
 #include <malloc.h> // using malloc.h with MSC/MINGW
@@ -43,6 +44,10 @@
 
 #if defined(_WIN32)
 
+#define WIN32_LEAN_AND_MEAN
+#ifndef NOMINMAX
+    #define NOMINMAX
+#endif
 #include <windows.h>
 
 typedef volatile LONG atomic_int;
@@ -273,8 +278,6 @@ inline static void * ggml_calloc(size_t num, size_t size) {
 #include <Accelerate/Accelerate.h>
 #if defined(GGML_USE_CLBLAST) // allow usage of CLBlast alongside Accelerate functions
 #include "ggml-opencl.h"
-#elif defined(GGML_USE_VULKAN)
-#include "ggml-vulkan.h"
 #endif
 #elif defined(GGML_USE_OPENBLAS)
 #if defined(GGML_BLAS_USE_MKL)
@@ -282,14 +285,8 @@ inline static void * ggml_calloc(size_t num, size_t size) {
 #else
 #include <cblas.h>
 #endif
-#elif defined(GGML_USE_CUBLAS)
-#include "ggml-cuda.h"
 #elif defined(GGML_USE_CLBLAST)
 #include "ggml-opencl.h"
-#elif defined(GGML_USE_VULKAN)
-#include "ggml-vulkan.h"
-#elif defined(GGML_USE_SYCL)
-#include "ggml-sycl.h"
 #endif
 
 // floating point type used to accumulate sums
@@ -432,6 +429,57 @@ int64_t ggml_cycles_per_ms(void) {
 #define ggml_perf_cycles_per_ms() 0
 #endif
 
+//
+// cross-platform UTF-8 file paths
+//
+
+#ifdef _WIN32
+static wchar_t * ggml_mbstowcs(const char * mbs) {
+    int wlen = MultiByteToWideChar(CP_UTF8, 0, mbs, -1, NULL, 0);
+    if (!wlen) {
+        errno = EINVAL;
+        return NULL;
+    }
+
+    wchar_t * wbuf = GGML_MALLOC(wlen * sizeof(wchar_t));
+    wlen = MultiByteToWideChar(CP_UTF8, 0, mbs, -1, wbuf, wlen);
+    if (!wlen) {
+        GGML_FREE(wbuf);
+        errno = EINVAL;
+        return NULL;
+    }
+
+    return wbuf;
+}
+#endif
+
+FILE * ggml_fopen(const char * fname, const char * mode) {
+#ifdef _WIN32
+    FILE * file = NULL;
+
+    // convert fname (UTF-8)
+    wchar_t * wfname = ggml_mbstowcs(fname);
+    if (wfname) {
+        // convert mode (ANSI)
+        wchar_t * wmode = GGML_MALLOC((strlen(mode) + 1) * sizeof(wchar_t));
+        wchar_t * wmode_p = wmode;
+        do {
+            *wmode_p++ = (wchar_t)*mode;
+        } while (*mode++);
+
+        // open file
+        file = _wfopen(wfname, wmode);
+
+        GGML_FREE(wfname);
+        GGML_FREE(wmode);
+    }
+
+    return file;
+#else
+    return fopen(fname, mode);
+#endif
+}
+
 //
 // cache line
 //
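Note: ggml_fopen is a new cross-platform wrapper: on Windows it converts the UTF-8 path to UTF-16 and calls _wfopen, elsewhere it falls back to plain fopen. A usage sketch (the path below is a made-up example, not from this diff):

    #include <stdio.h>
    #include "ggml.h"

    void example_open(void) {
        FILE * f = ggml_fopen("models/日本語モデル.gguf", "rb"); // hypothetical UTF-8 path
        if (f != NULL) {
            fclose(f);
        }
    }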
@@ -742,6 +790,18 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .vec_dot_type             = GGML_TYPE_Q8_K,
         .nrows                    = 1,
     },
+    [GGML_TYPE_IQ1_M] = {
+        .type_name                = "iq1_m",
+        .blck_size                = QK_K,
+        .type_size                = sizeof(block_iq1_m),
+        .is_quantized             = true,
+        .to_float                 = (ggml_to_float_t) dequantize_row_iq1_m,
+        .from_float               = NULL,
+        .from_float_reference     = NULL,
+        .vec_dot                  = ggml_vec_dot_iq1_m_q8_K,
+        .vec_dot_type             = GGML_TYPE_Q8_K,
+        .nrows                    = 1,
+    },
     [GGML_TYPE_IQ4_NL] = {
         .type_name                = "iq4_nl",
         .blck_size                = QK4_NL,
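Note: the new IQ1_M entry plugs the type into every generic code path that consults type_traits. A sketch of inspecting the entry through the public helper (assuming ggml_internal_get_type_traits from the vendored ggml.h):

    #include <stdio.h>
    #include "ggml.h"

    void example_iq1_m_traits(void) {
        ggml_type_traits_t t = ggml_internal_get_type_traits(GGML_TYPE_IQ1_M);
        printf("%s: block size %d, quantized: %d\n",
               t.type_name, (int) t.blck_size, (int) t.is_quantized);
        // from_float is NULL: IQ1_M data is produced via quantize_iq1_m,
        // not through the generic from_float path
    }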
@@ -931,6 +991,101 @@ inline static float vaddvq_f32(float32x4_t v) {
 #define GGML_F16_VEC_REDUCE GGML_F32Cx4_REDUCE
 #endif
 
+#elif defined(__AVX512F__)
+
+#define GGML_SIMD
+
+// F32 AVX512
+
+#define GGML_F32_STEP 64
+#define GGML_F32_EPR  16
+
+#define GGML_F32x16         __m512
+#define GGML_F32x16_ZERO    _mm512_setzero_ps()
+#define GGML_F32x16_SET1(x) _mm512_set1_ps(x)
+#define GGML_F32x16_LOAD    _mm512_loadu_ps
+#define GGML_F32x16_STORE   _mm512_storeu_ps
+// _mm512_fmadd_ps is defined in AVX512F so no guard is required
+#define GGML_F32x16_FMA(a, b, c) _mm512_fmadd_ps(b, c, a)
+#define GGML_F32x16_ADD     _mm512_add_ps
+#define GGML_F32x16_MUL     _mm512_mul_ps
+#define GGML_F32x16_REDUCE(res, x)                \
+do {                                              \
+    int offset = GGML_F32_ARR >> 1;               \
+    for (int i = 0; i < offset; ++i) {            \
+        x[i] = _mm512_add_ps(x[i], x[offset+i]);  \
+    }                                             \
+    offset >>= 1;                                 \
+    for (int i = 0; i < offset; ++i) {            \
+        x[i] = _mm512_add_ps(x[i], x[offset+i]);  \
+    }                                             \
+    offset >>= 1;                                 \
+    for (int i = 0; i < offset; ++i) {            \
+        x[i] = _mm512_add_ps(x[i], x[offset+i]);  \
+    }                                             \
+    res = _mm512_reduce_add_ps(x[0]);             \
+} while (0)
+
+// TODO: is this optimal ?
+
+#define GGML_F32_VEC        GGML_F32x16
+#define GGML_F32_VEC_ZERO   GGML_F32x16_ZERO
+#define GGML_F32_VEC_SET1   GGML_F32x16_SET1
+#define GGML_F32_VEC_LOAD   GGML_F32x16_LOAD
+#define GGML_F32_VEC_STORE  GGML_F32x16_STORE
+#define GGML_F32_VEC_FMA    GGML_F32x16_FMA
+#define GGML_F32_VEC_ADD    GGML_F32x16_ADD
+#define GGML_F32_VEC_MUL    GGML_F32x16_MUL
+#define GGML_F32_VEC_REDUCE GGML_F32x16_REDUCE
+
+// F16 AVX512
+
+// F16 AVX
+
+#define GGML_F16_STEP 64
+#define GGML_F16_EPR  16
+
+// AVX512 has FP16 extension (AVX512_FP16) but I don't have it on my machine so I use FP32 instead
+
+#define GGML_F32Cx16             __m512
+#define GGML_F32Cx16_ZERO        _mm512_setzero_ps()
+#define GGML_F32Cx16_SET1(x)     _mm512_set1_ps(x)
+
+// unlike _mm256_cvt intrinsics that require F16C, _mm512_cvt is defined in AVX512F
+// so F16C guard isn't required
+#define GGML_F32Cx16_LOAD(x)     _mm512_cvtph_ps(_mm256_loadu_si256((__m256i *)(x)))
+#define GGML_F32Cx16_STORE(x, y) _mm256_storeu_si256((__m256i *)(x), _mm512_cvtps_ph(y, 0))
+
+#define GGML_F32Cx16_FMA(a, b, c) _mm512_fmadd_ps(b, c, a)
+#define GGML_F32Cx16_ADD         _mm512_add_ps
+#define GGML_F32Cx16_MUL         _mm512_mul_ps
+#define GGML_F32Cx16_REDUCE(res, x)               \
+do {                                              \
+    int offset = GGML_F32_ARR >> 1;               \
+    for (int i = 0; i < offset; ++i) {            \
+        x[i] = _mm512_add_ps(x[i], x[offset+i]);  \
+    }                                             \
+    offset >>= 1;                                 \
+    for (int i = 0; i < offset; ++i) {            \
+        x[i] = _mm512_add_ps(x[i], x[offset+i]);  \
+    }                                             \
+    offset >>= 1;                                 \
+    for (int i = 0; i < offset; ++i) {            \
+        x[i] = _mm512_add_ps(x[i], x[offset+i]);  \
+    }                                             \
+    res = _mm512_reduce_add_ps(x[0]);             \
+} while (0)
+
+#define GGML_F16_VEC                GGML_F32Cx16
+#define GGML_F16_VEC_ZERO           GGML_F32Cx16_ZERO
+#define GGML_F16_VEC_SET1           GGML_F32Cx16_SET1
+#define GGML_F16_VEC_LOAD(p, i)     GGML_F32Cx16_LOAD(p)
+#define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx16_STORE(p, r[i])
+#define GGML_F16_VEC_FMA            GGML_F32Cx16_FMA
+#define GGML_F16_VEC_ADD            GGML_F32Cx16_ADD
+#define GGML_F16_VEC_MUL            GGML_F32Cx16_MUL
+#define GGML_F16_VEC_REDUCE         GGML_F32Cx16_REDUCE
+
 #elif defined(__AVX__)
 
 #define GGML_SIMD
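Note: the AVX512 path follows the same strip-mining scheme as the other SIMD branches: GGML_F32_EPR elements per register (16), GGML_F32_STEP elements per loop iteration (64), hence GGML_F32_ARR = 64/16 = 4 accumulators that REDUCE folds pairwise before one horizontal add. A plain-C analogue of what the macros expand to (illustrative only, not code from this diff):

    #include <stddef.h>

    float dot_f32_sketch(const float * x, const float * y, size_t n) {
        float acc[4] = { 0.0f, 0.0f, 0.0f, 0.0f }; // stands in for 4 __m512 registers
        size_t i = 0;
        for (; i + 4 <= n; i += 4) {
            for (int k = 0; k < 4; ++k) {
                acc[k] += x[i + k] * y[i + k]; // one _mm512_fmadd_ps per register
            }
        }
        float sum = (acc[0] + acc[1]) + (acc[2] + acc[3]); // the pairwise REDUCE
        for (; i < n; ++i) {
            sum += x[i] * y[i]; // scalar tail
        }
        return sum;
    }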
@@ -2392,6 +2547,7 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) {
         case GGML_FTYPE_MOSTLY_IQ2_XS:  wtype = GGML_TYPE_IQ2_XS;  break;
         case GGML_FTYPE_MOSTLY_IQ3_XXS: wtype = GGML_TYPE_IQ3_XXS; break;
         case GGML_FTYPE_MOSTLY_IQ1_S:   wtype = GGML_TYPE_IQ1_S;   break;
+        case GGML_FTYPE_MOSTLY_IQ1_M:   wtype = GGML_TYPE_IQ1_M;   break;
         case GGML_FTYPE_MOSTLY_IQ4_NL:  wtype = GGML_TYPE_IQ4_NL;  break;
         case GGML_FTYPE_MOSTLY_IQ4_XS:  wtype = GGML_TYPE_IQ4_XS;  break;
         case GGML_FTYPE_MOSTLY_IQ3_S:   wtype = GGML_TYPE_IQ3_S;   break;
@@ -2447,6 +2603,16 @@ static inline bool ggml_is_padded_1d(const struct ggml_tensor * tensor) {
         tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
 }
 
+GGML_CALL bool ggml_is_empty(const struct ggml_tensor * tensor) {
+    for (int i = 0; i < GGML_MAX_DIMS; ++i) {
+        if (tensor->ne[i] == 0) {
+            // empty if any dimension has no elements
+            return true;
+        }
+    }
+    return false;
+}
+
 bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
     static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
 
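Note: ggml_is_empty reports tensors with a zero-sized dimension; later hunks use it to skip no-op graph nodes. A small sketch of the semantics, assuming a valid ggml_context:

    #include <assert.h>
    #include "ggml.h"

    void example_empty(struct ggml_context * ctx) {
        struct ggml_tensor * t = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 0);
        assert(ggml_is_empty(t));
        assert(ggml_nelements(t) == 0);
    }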
@@ -2461,7 +2627,7 @@ bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
 static inline bool ggml_can_repeat(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
     static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
 
-    return
+    return ggml_is_empty(t0) ? ggml_is_empty(t1) :
         (t1->ne[0]%t0->ne[0] == 0) &&
         (t1->ne[1]%t0->ne[1] == 0) &&
         (t1->ne[2]%t0->ne[2] == 0) &&
@@ -2545,14 +2711,8 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
         GGML_PRINT_DEBUG("%s: g_state initialized in %f ms\n", __func__, (t_end - t_start)/1000.0f);
     }
 
-#if defined(GGML_USE_CUBLAS)
-        ggml_init_cublas();
-#elif defined(GGML_USE_CLBLAST)
+#if defined(GGML_USE_CLBLAST)
         ggml_cl_init();
-#elif defined(GGML_USE_VULKAN)
-        ggml_vk_init_cpu_assist();
-#elif defined(GGML_USE_SYCL)
-        ggml_init_sycl();
 #endif
 
     ggml_setup_op_has_task_pass();
@@ -2772,7 +2932,7 @@ static struct ggml_tensor * ggml_new_tensor_impl(
         data_size *= ne[i];
     }
 
-    GGML_ASSERT(view_src == NULL || data_size + view_offs <= ggml_nbytes(view_src));
+    GGML_ASSERT(view_src == NULL || data_size == 0 || data_size + view_offs <= ggml_nbytes(view_src));
 
     void * data = view_src != NULL ? view_src->data : NULL;
     if (data != NULL) {
@@ -4413,45 +4573,38 @@ void ggml_mul_mat_set_prec(
 
 // ggml_mul_mat_id
 
+// NOTE: id will be removed in the future and instead all the experts listed in ids will be computed
+//       this will allow computing all the used experts in a single matrix multiplication
 struct ggml_tensor * ggml_mul_mat_id(
         struct ggml_context * ctx,
-        struct ggml_tensor * const as[],
-        int n_as,
+        struct ggml_tensor * as,
         struct ggml_tensor * ids,
         int id,
         struct ggml_tensor * b) {
 
     GGML_ASSERT(ids->type == GGML_TYPE_I32);
-    GGML_ASSERT(ids->ne[2] == 1 && ids->ne[3] == 1);
-    GGML_ASSERT(ids->ne[1] == b->ne[1]);
+    GGML_ASSERT(ids->ne[2] == 1 && ids->ne[3] == 1); // ids is 2d
+    GGML_ASSERT(ids->ne[1] == b->ne[1]); // must have an expert per b row
     GGML_ASSERT(ids->ne[2] == b->ne[2] && ids->ne[3] == b->ne[3]);
-    GGML_ASSERT(n_as > 0 && n_as <= GGML_MAX_SRC - 2);
-    GGML_ASSERT(id >= 0 && id < ids->ne[0]);
+    GGML_ASSERT(id >= 0 && id < ids->ne[0]); // valid id
+    GGML_ASSERT(as->ne[0] == b->ne[0]); // can_mul_mat
 
     bool is_node = false;
 
-    if (as[0]->grad || b->grad) {
+    if (as->grad || b->grad) {
         is_node = true;
     }
 
-    const int64_t ne[4] = { as[0]->ne[1], b->ne[1], b->ne[2], b->ne[3] };
+    const int64_t ne[4] = { as->ne[1], b->ne[1], b->ne[2], b->ne[3] };
     struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
 
     ggml_set_op_params_i32(result, 0, id);
-    ggml_set_op_params_i32(result, 1, n_as);
 
     result->op   = GGML_OP_MUL_MAT_ID;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = ids;
+    result->src[0] = as;
     result->src[1] = b;
-
-    for (int i = 0; i < n_as; i++) {
-        struct ggml_tensor * a = as[i];
-        GGML_ASSERT(ggml_are_same_shape(as[0], a));
-        GGML_ASSERT(ggml_can_mul_mat(a, b));
-        GGML_ASSERT(!ggml_is_transposed(a));
-        result->src[i + 2] = a;
-    }
+    result->src[2] = ids;
 
     return result;
 }
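Note: ggml_mul_mat_id now takes the experts as a single 3-D tensor stacked along dimension 2, instead of an array of per-expert tensors plus n_as. A calling sketch (the n_embd/n_ff/n_expert shapes in the comments are illustrative, not from this diff):

    #include "ggml.h"

    struct ggml_tensor * route_expert(struct ggml_context * ctx,
                                      struct ggml_tensor * as,   // [n_embd, n_ff, n_expert]
                                      struct ggml_tensor * ids,  // I32 [n_expert_used, n_tokens]
                                      struct ggml_tensor * b) {  // [n_embd, n_tokens]
        return ggml_mul_mat_id(ctx, as, ids, /*id=*/0, b);
    }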
@@ -7992,6 +8145,7 @@ static void ggml_compute_forward_add(
         case GGML_TYPE_IQ2_XS:
         case GGML_TYPE_IQ3_XXS:
         case GGML_TYPE_IQ1_S:
+        case GGML_TYPE_IQ1_M:
         case GGML_TYPE_IQ4_NL:
         case GGML_TYPE_IQ4_XS:
         case GGML_TYPE_IQ3_S:
@@ -8274,6 +8428,7 @@ static void ggml_compute_forward_add1(
         case GGML_TYPE_IQ2_XS:
         case GGML_TYPE_IQ3_XXS:
         case GGML_TYPE_IQ1_S:
+        case GGML_TYPE_IQ1_M:
         case GGML_TYPE_IQ4_NL:
         case GGML_TYPE_IQ4_XS:
         case GGML_TYPE_IQ3_S:
@@ -8401,6 +8556,7 @@ static void ggml_compute_forward_acc(
         case GGML_TYPE_IQ2_XS:
         case GGML_TYPE_IQ3_XXS:
         case GGML_TYPE_IQ1_S:
+        case GGML_TYPE_IQ1_M:
         case GGML_TYPE_IQ4_NL:
         case GGML_TYPE_IQ4_XS:
         case GGML_TYPE_IQ3_S:
@@ -10785,10 +10941,9 @@ static void ggml_compute_forward_mul_mat_id(
         const struct ggml_compute_params * params,
               struct ggml_tensor * dst) {
 
-    const struct ggml_tensor * ids = dst->src[0];
+    const struct ggml_tensor * src0 = dst->src[0];
     const struct ggml_tensor * src1 = dst->src[1];
-
-    const struct ggml_tensor * src0 = dst->src[2]; // only for GGML_TENSOR_BINARY_OP_LOCALS
+    const struct ggml_tensor * ids = dst->src[2];
 
     GGML_TENSOR_BINARY_OP_LOCALS
 
@@ -10818,13 +10973,13 @@ static void ggml_compute_forward_mul_mat_id(
     GGML_ASSERT(nb1 <= nb2);
     GGML_ASSERT(nb2 <= nb3);
 
-    // broadcast factors
-    const int64_t r2 = ne12/ne02;
-    const int64_t r3 = ne13/ne03;
+    // broadcast is not supported with mmid
+    assert(ne12 == 1);
+    assert(ne13 == 1);
 
     // row groups
     const int id   = ggml_get_op_params_i32(dst, 0);
-    const int n_as = ggml_get_op_params_i32(dst, 1);
+    const int n_as = src0->ne[2];
 
     char * wdata_src1_end = (src1->type == vec_dot_type) ?
             (char *) params->wdata :
@@ -10884,7 +11039,7 @@ static void ggml_compute_forward_mul_mat_id(
             continue;
         }
 
-        const struct ggml_tensor * src0_cur = dst->src[cur_a + 2];
+        size_t src0_offset = cur_a*src0->nb[2];
 
         const void * wdata    = (src1->type == vec_dot_type) ? src1->data : params->wdata;
         const size_t row_size = ggml_row_size(vec_dot_type, ne10);
@@ -10919,9 +11074,6 @@ static void ggml_compute_forward_mul_mat_id(
             continue;
         }
 
-        assert(ne12 % ne02 == 0);
-        assert(ne13 % ne03 == 0);
-
         // block-tiling attempt
         const int64_t blck_0 = 16;
         const int64_t blck_1 = 16;
@@ -10938,14 +11090,14 @@ static void ggml_compute_forward_mul_mat_id(
         const int64_t i11 = MMID_MATRIX_ROW(cur_a, _i11);
 
         // broadcast src0 into src1
-        const int64_t i03 = i13/r3;
-        const int64_t i02 = i12/r2;
+        //const int64_t i03 = i13/r3;
+        //const int64_t i02 = i12/r2;
 
         const int64_t i1 = i11;
         const int64_t i2 = i12;
         const int64_t i3 = i13;
 
-        const char * src0_row = (const char *) src0_cur->data + (0 + i02*nb02 + i03*nb03);
+        const char * src0_row = (const char *) src0->data + src0_offset;
 
         // desc: when src1 is not a contiguous memory block we have to calculate the offset using the strides
         // if it is, then we have either copied the data to params->wdata and made it contiguous or we are using
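Note: with the experts stacked in one tensor, selecting expert cur_a in the compute path reduces to a byte offset along dimension 2. As a sketch:

    #include "ggml.h"

    static const char * expert_row_base(const struct ggml_tensor * src0, int cur_a) {
        // equivalent to the src0_offset computation introduced above
        return (const char *) src0->data + (size_t) cur_a * src0->nb[2];
    }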
@@ -11010,7 +11162,6 @@ static void ggml_compute_forward_out_prod_f32(
     // nb01 >= nb00 - src0 is not transposed
     //   compute by src0 rows
 
-    // TODO: #if defined(GGML_USE_CUBLAS) ggml_cuda_out_prod
     // TODO: #if defined(GGML_USE_CLBLAST)
 
 #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
@@ -11210,7 +11361,6 @@ static void ggml_compute_forward_out_prod_q_f32(
     // nb01 >= nb00 - src0 is not transposed
     //   compute by src0 rows
 
-    // TODO: #if defined(GGML_USE_CUBLAS) ggml_cuda_out_prod
     // TODO: #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
 
     if (params->type == GGML_TASK_TYPE_INIT) {
@@ -11306,6 +11456,7 @@ static void ggml_compute_forward_out_prod(
         case GGML_TYPE_IQ2_XS:
         case GGML_TYPE_IQ3_XXS:
         case GGML_TYPE_IQ1_S:
+        case GGML_TYPE_IQ1_M:
         case GGML_TYPE_IQ4_NL:
         case GGML_TYPE_IQ4_XS:
         case GGML_TYPE_IQ3_S:
@@ -11497,6 +11648,7 @@ static void ggml_compute_forward_set(
         case GGML_TYPE_IQ2_XS:
         case GGML_TYPE_IQ3_XXS:
         case GGML_TYPE_IQ1_S:
+        case GGML_TYPE_IQ1_M:
         case GGML_TYPE_IQ4_NL:
         case GGML_TYPE_IQ4_XS:
         case GGML_TYPE_IQ3_S:
@@ -11720,6 +11872,7 @@ static void ggml_compute_forward_get_rows(
         case GGML_TYPE_IQ2_XS:
         case GGML_TYPE_IQ3_XXS:
         case GGML_TYPE_IQ1_S:
+        case GGML_TYPE_IQ1_M:
         case GGML_TYPE_IQ4_NL:
         case GGML_TYPE_IQ4_XS:
         case GGML_TYPE_IQ3_S:
@@ -12423,6 +12576,7 @@ static void ggml_compute_forward_alibi(
         case GGML_TYPE_IQ2_XS:
         case GGML_TYPE_IQ3_XXS:
         case GGML_TYPE_IQ1_S:
+        case GGML_TYPE_IQ1_M:
         case GGML_TYPE_IQ4_NL:
         case GGML_TYPE_IQ4_XS:
         case GGML_TYPE_IQ3_S:
@@ -12511,6 +12665,7 @@ static void ggml_compute_forward_clamp(
         case GGML_TYPE_IQ2_XS:
         case GGML_TYPE_IQ3_XXS:
         case GGML_TYPE_IQ1_S:
+        case GGML_TYPE_IQ1_M:
         case GGML_TYPE_IQ4_NL:
         case GGML_TYPE_IQ4_XS:
         case GGML_TYPE_IQ3_S:
@@ -15952,37 +16107,10 @@ static void ggml_compute_forward_cross_entropy_loss_back(
 static void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
     GGML_ASSERT(params);
 
-    if (tensor->op == GGML_OP_NONE) {
-        return;
-    }
-
-#ifdef GGML_USE_CUBLAS
-    bool skip_cpu = ggml_cuda_compute_forward(params, tensor);
-    if (skip_cpu) {
+    if (tensor->op == GGML_OP_NONE || ggml_is_empty(tensor)) {
         return;
     }
-    GGML_ASSERT(tensor->src[0] == NULL || tensor->src[0]->backend == GGML_BACKEND_TYPE_CPU);
-    GGML_ASSERT(tensor->src[1] == NULL || tensor->src[1]->backend == GGML_BACKEND_TYPE_CPU);
-#elif defined(GGML_USE_VULKAN)
-    const bool skip_cpu = ggml_vk_compute_forward_cpu_assist(params, tensor);
-#ifdef GGML_VULKAN_CHECK_RESULTS
-    if (skip_cpu) {
-        ggml_vk_check_results_1_cpu_assist(params, tensor);
-    }
-#endif
-    if (skip_cpu) {
-        return;
-    }
-    GGML_ASSERT(tensor->src[0] == NULL || tensor->src[0]->backend == GGML_BACKEND_TYPE_CPU);
-    GGML_ASSERT(tensor->src[1] == NULL || tensor->src[1]->backend == GGML_BACKEND_TYPE_CPU);
-#endif // GGML_USE_CUBLAS
 
-#ifdef GGML_USE_SYCL
-    bool skip_cpu = ggml_sycl_compute_forward(params, tensor);
-    if (skip_cpu) {
-        return;
-    }
-#endif // GGML_USE_SYCL
     switch (tensor->op) {
         case GGML_OP_DUP:
             {
@@ -17834,6 +17962,12 @@ static void ggml_graph_compute_perf_stats_node(struct ggml_tensor * node, const
 static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads, int n_cur_threads) {
     int n_tasks = 0;
 
+    if (ggml_is_empty(node)) {
+        // no need to multi-thread a no-op
+        n_tasks = 1;
+        return n_tasks;
+    }
+
     switch (node->op) {
         case GGML_OP_CPY:
         case GGML_OP_DUP:
@@ -18319,13 +18453,13 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threads) {
             case GGML_OP_MUL_MAT_ID:
                 {
                     cur = 0;
-                    const struct ggml_tensor * src0 = node->src[2];
+                    const struct ggml_tensor * src0 = node->src[0];
                     const struct ggml_tensor * src1 = node->src[1];
                     const enum ggml_type vec_dot_type = type_traits[src0->type].vec_dot_type;
                     if (src1->type != vec_dot_type) {
                         cur += ggml_row_size(vec_dot_type, ggml_nelements(src1));
                     }
-                    const int n_as = ggml_get_op_params_i32(node, 1);
+                    const int n_as = src0->ne[2];
                     cur += GGML_PAD(cur, sizeof(int64_t));       // align
                     cur += n_as * sizeof(int64_t);               // matrix_row_counts
                     cur += n_as * src1->ne[1] * sizeof(int64_t); // matrix_rows
@@ -18452,17 +18586,6 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
         }
     }
 
-#ifdef GGML_USE_VULKAN
-    for (int i = 0; i < cgraph->n_nodes; i++) {
-        ggml_vk_preallocate_buffers_graph_cpu_assist(cgraph->nodes[i]);
-    }
-    ggml_vk_preallocate_buffers_cpu_assist();
-
-    for (int i = 0; i < cgraph->n_nodes; i++) {
-        ggml_vk_build_graph_cpu_assist(cgraph->nodes[i], i == cgraph->n_nodes - 1);
-    }
-#endif
-
     const int n_threads = cplan->n_threads;
 
     struct ggml_compute_state_shared state_shared = {
@@ -18519,10 +18642,6 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
         }
     }
 
-#ifdef GGML_USE_VULKAN
-    ggml_vk_graph_cleanup_cpu_assist();
-#endif
-
     // performance stats (graph)
     {
         int64_t perf_cycles_cur = ggml_perf_cycles() - perf_start_cycles;
@@ -18657,7 +18776,7 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) {
 
     // write binary data
     {
-        FILE * fout = fopen(fname, "wb");
+        FILE * fout = ggml_fopen(fname, "wb");
 
         if (!fout) {
             fprintf(stderr, "%s: failed to open %s\n", __func__, fname);
@@ -18795,7 +18914,7 @@ struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context * ctx) {
 
     // read file into data
     {
-        FILE * fin = fopen(fname, "rb");
+        FILE * fin = ggml_fopen(fname, "rb");
         if (!fin) {
             fprintf(stderr, "%s: failed to open %s\n", __func__, fname);
             return result;
@@ -19131,7 +19250,7 @@ static void ggml_graph_dump_dot_leaf_edge(FILE * fp, struct ggml_tensor * node, ...) {
 void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph * gf, const char * filename) {
     char color[16];
 
-    FILE * fp = fopen(filename, "w");
+    FILE * fp = ggml_fopen(filename, "w");
     GGML_ASSERT(fp);
 
     fprintf(fp, "digraph G {\n");
@@ -20178,7 +20297,8 @@ void ggml_quantize_init(enum ggml_type type) {
         case GGML_TYPE_IQ2_XXS:
         case GGML_TYPE_IQ2_XS:
         case GGML_TYPE_IQ2_S:
-        case GGML_TYPE_IQ1_S:   iq2xs_init_impl(type); break;
+        case GGML_TYPE_IQ1_S:
+        case GGML_TYPE_IQ1_M:   iq2xs_init_impl(type); break;
         case GGML_TYPE_IQ3_XXS: iq3xs_init_impl(256); break;
         case GGML_TYPE_IQ3_S:   iq3xs_init_impl(512); break;
         default: // nothing
@@ -20203,7 +20323,8 @@ bool ggml_quantize_requires_imatrix(enum ggml_type type) {
     return
         type == GGML_TYPE_IQ2_XXS ||
         type == GGML_TYPE_IQ2_XS  ||
-        type == GGML_TYPE_IQ1_S;
+        type == GGML_TYPE_IQ1_S;//   ||
+        //type == GGML_TYPE_IQ1_M;
 }
 
 size_t ggml_quantize_chunk(
@@ -20247,6 +20368,7 @@ size_t ggml_quantize_chunk(
         case GGML_TYPE_IQ3_S:  result = quantize_iq3_s  (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
         case GGML_TYPE_IQ2_S:  result = quantize_iq2_s  (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
         case GGML_TYPE_IQ1_S:  result = quantize_iq1_s  (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
+        case GGML_TYPE_IQ1_M:  result = quantize_iq1_m  (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
         case GGML_TYPE_IQ4_NL: result = quantize_iq4_nl (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
 #if QK_K == 64
         case GGML_TYPE_IQ4_XS: result = quantize_iq4_nl (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
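Note: IQ1_M is wired into ggml_quantize_chunk; like IQ1_S it is initialized through iq2xs_init_impl, and the requires-imatrix check above leaves the importance matrix optional for now. A quantization sketch (row counts are illustrative, not from this diff):

    #include "ggml.h"

    size_t quantize_rows_iq1_m(const float * src, void * dst,
                               int64_t nrows, int64_t n_per_row,
                               const float * imatrix) {
        ggml_quantize_init(GGML_TYPE_IQ1_M); // builds the shared iq2xs tables
        return ggml_quantize_chunk(GGML_TYPE_IQ1_M, src, dst,
                                   /*start=*/0, nrows, n_per_row, imatrix);
    }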
@@ -20449,7 +20571,7 @@ struct gguf_context * gguf_init_empty(void) {
 }
 
 struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params) {
-    FILE * file = fopen(fname, "rb");
+    FILE * file = ggml_fopen(fname, "rb");
     if (!file) {
         return NULL;
     }
@@ -21404,7 +21526,7 @@ static void gguf_write_to_buf(const struct gguf_context * ctx, struct gguf_buf * buf) {
 }
 
 void gguf_write_to_file(const struct gguf_context * ctx, const char * fname, bool only_meta) {
-    FILE * file = fopen(fname, "wb");
+    FILE * file = ggml_fopen(fname, "wb");
     if (!file) {
         GGML_ASSERT(false && "failed to open file for writing");
     }
@@ -21546,15 +21668,15 @@ int ggml_cpu_has_wasm_simd(void) {
 }
 
 int ggml_cpu_has_blas(void) {
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_VULKAN) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_SYCL)
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUDA) || defined(GGML_USE_VULKAN) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_SYCL)
     return 1;
 #else
     return 0;
 #endif
 }
 
-int ggml_cpu_has_cublas(void) {
-#if defined(GGML_USE_CUBLAS)
+int ggml_cpu_has_cuda(void) {
+#if defined(GGML_USE_CUDA)
     return 1;
 #else
     return 0;
@@ -21594,7 +21716,7 @@ int ggml_cpu_has_sycl(void) {
 }
 
 int ggml_cpu_has_gpublas(void) {
-    return ggml_cpu_has_cublas() || ggml_cpu_has_clblast() || ggml_cpu_has_vulkan() || ggml_cpu_has_kompute() ||
+    return ggml_cpu_has_cuda() || ggml_cpu_has_clblast() || ggml_cpu_has_vulkan() || ggml_cpu_has_kompute() ||
            ggml_cpu_has_sycl();
 }
 
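Note: the GGML_USE_CUBLAS define is renamed to GGML_USE_CUDA, and ggml_cpu_has_cublas() becomes ggml_cpu_has_cuda(); downstream callers need the same rename. A feature-probe sketch:

    #include <stdio.h>
    #include "ggml.h"

    void print_gpu_features(void) {
        printf("cuda: %d, vulkan: %d, gpublas: %d\n",
               ggml_cpu_has_cuda(), ggml_cpu_has_vulkan(), ggml_cpu_has_gpublas());
    }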