llama_cpp 0.14.2 → 0.14.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +14 -0
- data/ext/llama_cpp/llama_cpp.cpp +64 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +6 -0
- data/vendor/tmp/llama.cpp/Makefile +91 -21
- data/vendor/tmp/llama.cpp/ggml-alloc.c +14 -5
- data/vendor/tmp/llama.cpp/ggml-backend-impl.h +5 -0
- data/vendor/tmp/llama.cpp/ggml-backend.c +155 -125
- data/vendor/tmp/llama.cpp/ggml-backend.h +4 -4
- data/vendor/tmp/llama.cpp/ggml-common.h +25 -2
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +1779 -10762
- data/vendor/tmp/llama.cpp/ggml-cuda.h +6 -15
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +5 -0
- data/vendor/tmp/llama.cpp/ggml-metal.m +167 -124
- data/vendor/tmp/llama.cpp/ggml-metal.metal +603 -303
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +5 -0
- data/vendor/tmp/llama.cpp/ggml-quants.c +663 -56
- data/vendor/tmp/llama.cpp/ggml-quants.h +3 -0
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +341 -469
- data/vendor/tmp/llama.cpp/ggml-sycl.h +19 -4
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +37199 -14939
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +335 -307
- data/vendor/tmp/llama.cpp/ggml-vulkan.h +0 -11
- data/vendor/tmp/llama.cpp/ggml.c +229 -107
- data/vendor/tmp/llama.cpp/ggml.h +11 -5
- data/vendor/tmp/llama.cpp/llama.cpp +2136 -464
- data/vendor/tmp/llama.cpp/llama.h +86 -23
- data/vendor/tmp/llama.cpp/unicode-data.cpp +1651 -0
- data/vendor/tmp/llama.cpp/unicode-data.h +16 -0
- data/vendor/tmp/llama.cpp/unicode.cpp +8 -1403
- data/vendor/tmp/llama.cpp/unicode.h +2 -0
- metadata +5 -3
@@ -11,17 +11,6 @@ extern "C" {
|
|
11
11
|
#define GGML_VK_MAX_DEVICES 16
|
12
12
|
|
13
13
|
GGML_API void ggml_vk_instance_init(void);
|
14
|
-
GGML_API void ggml_vk_init_cpu_assist(void);
|
15
|
-
|
16
|
-
GGML_API void ggml_vk_preallocate_buffers_graph_cpu_assist(struct ggml_tensor * node);
|
17
|
-
GGML_API void ggml_vk_preallocate_buffers_cpu_assist(void);
|
18
|
-
GGML_API void ggml_vk_build_graph_cpu_assist(struct ggml_tensor * node, bool last_node);
|
19
|
-
GGML_API bool ggml_vk_compute_forward_cpu_assist(struct ggml_compute_params * params, struct ggml_tensor * tensor);
|
20
|
-
#ifdef GGML_VULKAN_CHECK_RESULTS
|
21
|
-
void ggml_vk_check_results_1_cpu_assist(struct ggml_compute_params * params, struct ggml_tensor * tensor);
|
22
|
-
#endif
|
23
|
-
GGML_API void ggml_vk_graph_cleanup_cpu_assist(void);
|
24
|
-
GGML_API void ggml_vk_free_cpu_assist(void);
|
25
14
|
|
26
15
|
// backend API
|
27
16
|
GGML_API GGML_CALL ggml_backend_t ggml_backend_vk_init(size_t dev_num);
|
data/vendor/tmp/llama.cpp/ggml.c
CHANGED
@@ -3,6 +3,7 @@
|
|
3
3
|
|
4
4
|
#include "ggml-impl.h"
|
5
5
|
#include "ggml-quants.h"
|
6
|
+
#include "ggml.h"
|
6
7
|
|
7
8
|
#if defined(_MSC_VER) || defined(__MINGW32__)
|
8
9
|
#include <malloc.h> // using malloc.h with MSC/MINGW
|
@@ -43,6 +44,10 @@
|
|
43
44
|
|
44
45
|
#if defined(_WIN32)
|
45
46
|
|
47
|
+
#define WIN32_LEAN_AND_MEAN
|
48
|
+
#ifndef NOMINMAX
|
49
|
+
#define NOMINMAX
|
50
|
+
#endif
|
46
51
|
#include <windows.h>
|
47
52
|
|
48
53
|
typedef volatile LONG atomic_int;
|
@@ -273,8 +278,6 @@ inline static void * ggml_calloc(size_t num, size_t size) {
|
|
273
278
|
#include <Accelerate/Accelerate.h>
|
274
279
|
#if defined(GGML_USE_CLBLAST) // allow usage of CLBlast alongside Accelerate functions
|
275
280
|
#include "ggml-opencl.h"
|
276
|
-
#elif defined(GGML_USE_VULKAN)
|
277
|
-
#include "ggml-vulkan.h"
|
278
281
|
#endif
|
279
282
|
#elif defined(GGML_USE_OPENBLAS)
|
280
283
|
#if defined(GGML_BLAS_USE_MKL)
|
@@ -282,14 +285,8 @@ inline static void * ggml_calloc(size_t num, size_t size) {
|
|
282
285
|
#else
|
283
286
|
#include <cblas.h>
|
284
287
|
#endif
|
285
|
-
#elif defined(GGML_USE_CUBLAS)
|
286
|
-
#include "ggml-cuda.h"
|
287
288
|
#elif defined(GGML_USE_CLBLAST)
|
288
289
|
#include "ggml-opencl.h"
|
289
|
-
#elif defined(GGML_USE_VULKAN)
|
290
|
-
#include "ggml-vulkan.h"
|
291
|
-
#elif defined(GGML_USE_SYCL)
|
292
|
-
#include "ggml-sycl.h"
|
293
290
|
#endif
|
294
291
|
|
295
292
|
// floating point type used to accumulate sums
|
@@ -432,6 +429,57 @@ int64_t ggml_cycles_per_ms(void) {
|
|
432
429
|
#define ggml_perf_cycles_per_ms() 0
|
433
430
|
#endif
|
434
431
|
|
432
|
+
//
|
433
|
+
// cross-platform UTF-8 file paths
|
434
|
+
//
|
435
|
+
|
436
|
+
#ifdef _WIN32
|
437
|
+
static wchar_t * ggml_mbstowcs(const char * mbs) {
|
438
|
+
int wlen = MultiByteToWideChar(CP_UTF8, 0, mbs, -1, NULL, 0);
|
439
|
+
if (!wlen) {
|
440
|
+
errno = EINVAL;
|
441
|
+
return NULL;
|
442
|
+
}
|
443
|
+
|
444
|
+
wchar_t * wbuf = GGML_MALLOC(wlen * sizeof(wchar_t));
|
445
|
+
wlen = MultiByteToWideChar(CP_UTF8, 0, mbs, -1, wbuf, wlen);
|
446
|
+
if (!wlen) {
|
447
|
+
GGML_FREE(wbuf);
|
448
|
+
errno = EINVAL;
|
449
|
+
return NULL;
|
450
|
+
}
|
451
|
+
|
452
|
+
return wbuf;
|
453
|
+
}
|
454
|
+
#endif
|
455
|
+
|
456
|
+
FILE * ggml_fopen(const char * fname, const char * mode) {
|
457
|
+
#ifdef _WIN32
|
458
|
+
FILE * file = NULL;
|
459
|
+
|
460
|
+
// convert fname (UTF-8)
|
461
|
+
wchar_t * wfname = ggml_mbstowcs(fname);
|
462
|
+
if (wfname) {
|
463
|
+
// convert mode (ANSI)
|
464
|
+
wchar_t * wmode = GGML_MALLOC((strlen(mode) + 1) * sizeof(wchar_t));
|
465
|
+
wchar_t * wmode_p = wmode;
|
466
|
+
do {
|
467
|
+
*wmode_p++ = (wchar_t)*mode;
|
468
|
+
} while (*mode++);
|
469
|
+
|
470
|
+
// open file
|
471
|
+
file = _wfopen(wfname, wmode);
|
472
|
+
|
473
|
+
GGML_FREE(wfname);
|
474
|
+
GGML_FREE(wmode);
|
475
|
+
}
|
476
|
+
|
477
|
+
return file;
|
478
|
+
#else
|
479
|
+
return fopen(fname, mode);
|
480
|
+
#endif
|
481
|
+
}
|
482
|
+
|
435
483
|
//
|
436
484
|
// cache line
|
437
485
|
//
|
@@ -742,6 +790,18 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
|
742
790
|
.vec_dot_type = GGML_TYPE_Q8_K,
|
743
791
|
.nrows = 1,
|
744
792
|
},
|
793
|
+
[GGML_TYPE_IQ1_M] = {
|
794
|
+
.type_name = "iq1_m",
|
795
|
+
.blck_size = QK_K,
|
796
|
+
.type_size = sizeof(block_iq1_m),
|
797
|
+
.is_quantized = true,
|
798
|
+
.to_float = (ggml_to_float_t) dequantize_row_iq1_m,
|
799
|
+
.from_float = NULL,
|
800
|
+
.from_float_reference = NULL,
|
801
|
+
.vec_dot = ggml_vec_dot_iq1_m_q8_K,
|
802
|
+
.vec_dot_type = GGML_TYPE_Q8_K,
|
803
|
+
.nrows = 1,
|
804
|
+
},
|
745
805
|
[GGML_TYPE_IQ4_NL] = {
|
746
806
|
.type_name = "iq4_nl",
|
747
807
|
.blck_size = QK4_NL,
|
@@ -931,6 +991,101 @@ inline static float vaddvq_f32(float32x4_t v) {
|
|
931
991
|
#define GGML_F16_VEC_REDUCE GGML_F32Cx4_REDUCE
|
932
992
|
#endif
|
933
993
|
|
994
|
+
#elif defined(__AVX512F__)
|
995
|
+
|
996
|
+
#define GGML_SIMD
|
997
|
+
|
998
|
+
// F32 AVX512
|
999
|
+
|
1000
|
+
#define GGML_F32_STEP 64
|
1001
|
+
#define GGML_F32_EPR 16
|
1002
|
+
|
1003
|
+
#define GGML_F32x16 __m512
|
1004
|
+
#define GGML_F32x16_ZERO _mm512_setzero_ps()
|
1005
|
+
#define GGML_F32x16_SET1(x) _mm512_set1_ps(x)
|
1006
|
+
#define GGML_F32x16_LOAD _mm512_loadu_ps
|
1007
|
+
#define GGML_F32x16_STORE _mm512_storeu_ps
|
1008
|
+
// _mm512_fmadd_ps is defined in AVX512F so no guard is required
|
1009
|
+
#define GGML_F32x16_FMA(a, b, c) _mm512_fmadd_ps(b, c, a)
|
1010
|
+
#define GGML_F32x16_ADD _mm512_add_ps
|
1011
|
+
#define GGML_F32x16_MUL _mm512_mul_ps
|
1012
|
+
#define GGML_F32x16_REDUCE(res, x) \
|
1013
|
+
do { \
|
1014
|
+
int offset = GGML_F32_ARR >> 1; \
|
1015
|
+
for (int i = 0; i < offset; ++i) { \
|
1016
|
+
x[i] = _mm512_add_ps(x[i], x[offset+i]); \
|
1017
|
+
} \
|
1018
|
+
offset >>= 1; \
|
1019
|
+
for (int i = 0; i < offset; ++i) { \
|
1020
|
+
x[i] = _mm512_add_ps(x[i], x[offset+i]); \
|
1021
|
+
} \
|
1022
|
+
offset >>= 1; \
|
1023
|
+
for (int i = 0; i < offset; ++i) { \
|
1024
|
+
x[i] = _mm512_add_ps(x[i], x[offset+i]); \
|
1025
|
+
} \
|
1026
|
+
res = _mm512_reduce_add_ps(x[0]); \
|
1027
|
+
} while (0)
|
1028
|
+
|
1029
|
+
// TODO: is this optimal ?
|
1030
|
+
|
1031
|
+
#define GGML_F32_VEC GGML_F32x16
|
1032
|
+
#define GGML_F32_VEC_ZERO GGML_F32x16_ZERO
|
1033
|
+
#define GGML_F32_VEC_SET1 GGML_F32x16_SET1
|
1034
|
+
#define GGML_F32_VEC_LOAD GGML_F32x16_LOAD
|
1035
|
+
#define GGML_F32_VEC_STORE GGML_F32x16_STORE
|
1036
|
+
#define GGML_F32_VEC_FMA GGML_F32x16_FMA
|
1037
|
+
#define GGML_F32_VEC_ADD GGML_F32x16_ADD
|
1038
|
+
#define GGML_F32_VEC_MUL GGML_F32x16_MUL
|
1039
|
+
#define GGML_F32_VEC_REDUCE GGML_F32x16_REDUCE
|
1040
|
+
|
1041
|
+
// F16 AVX512
|
1042
|
+
|
1043
|
+
// F16 AVX
|
1044
|
+
|
1045
|
+
#define GGML_F16_STEP 64
|
1046
|
+
#define GGML_F16_EPR 16
|
1047
|
+
|
1048
|
+
// AVX512 has FP16 extension (AVX512_FP16) but I don't have it on my machine so I use FP32 instead
|
1049
|
+
|
1050
|
+
#define GGML_F32Cx16 __m512
|
1051
|
+
#define GGML_F32Cx16_ZERO _mm512_setzero_ps()
|
1052
|
+
#define GGML_F32Cx16_SET1(x) _mm512_set1_ps(x)
|
1053
|
+
|
1054
|
+
// unlike _mm256_cvt intrinsics that require F16C, _mm512_cvt is defined in AVX512F
|
1055
|
+
// so F16C guard isn't required
|
1056
|
+
#define GGML_F32Cx16_LOAD(x) _mm512_cvtph_ps(_mm256_loadu_si256((__m256i *)(x)))
|
1057
|
+
#define GGML_F32Cx16_STORE(x, y) _mm256_storeu_si256((__m256i *)(x), _mm512_cvtps_ph(y, 0))
|
1058
|
+
|
1059
|
+
#define GGML_F32Cx16_FMA(a, b, c) _mm512_fmadd_ps(b, c, a)
|
1060
|
+
#define GGML_F32Cx16_ADD _mm512_add_ps
|
1061
|
+
#define GGML_F32Cx16_MUL _mm512_mul_ps
|
1062
|
+
#define GGML_F32Cx16_REDUCE(res, x) \
|
1063
|
+
do { \
|
1064
|
+
int offset = GGML_F32_ARR >> 1; \
|
1065
|
+
for (int i = 0; i < offset; ++i) { \
|
1066
|
+
x[i] = _mm512_add_ps(x[i], x[offset+i]); \
|
1067
|
+
} \
|
1068
|
+
offset >>= 1; \
|
1069
|
+
for (int i = 0; i < offset; ++i) { \
|
1070
|
+
x[i] = _mm512_add_ps(x[i], x[offset+i]); \
|
1071
|
+
} \
|
1072
|
+
offset >>= 1; \
|
1073
|
+
for (int i = 0; i < offset; ++i) { \
|
1074
|
+
x[i] = _mm512_add_ps(x[i], x[offset+i]); \
|
1075
|
+
} \
|
1076
|
+
res = _mm512_reduce_add_ps(x[0]); \
|
1077
|
+
} while (0)
|
1078
|
+
|
1079
|
+
#define GGML_F16_VEC GGML_F32Cx16
|
1080
|
+
#define GGML_F16_VEC_ZERO GGML_F32Cx16_ZERO
|
1081
|
+
#define GGML_F16_VEC_SET1 GGML_F32Cx16_SET1
|
1082
|
+
#define GGML_F16_VEC_LOAD(p, i) GGML_F32Cx16_LOAD(p)
|
1083
|
+
#define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx16_STORE(p, r[i])
|
1084
|
+
#define GGML_F16_VEC_FMA GGML_F32Cx16_FMA
|
1085
|
+
#define GGML_F16_VEC_ADD GGML_F32Cx16_ADD
|
1086
|
+
#define GGML_F16_VEC_MUL GGML_F32Cx16_MUL
|
1087
|
+
#define GGML_F16_VEC_REDUCE GGML_F32Cx16_REDUCE
|
1088
|
+
|
934
1089
|
#elif defined(__AVX__)
|
935
1090
|
|
936
1091
|
#define GGML_SIMD
|
@@ -2392,6 +2547,7 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) {
|
|
2392
2547
|
case GGML_FTYPE_MOSTLY_IQ2_XS: wtype = GGML_TYPE_IQ2_XS; break;
|
2393
2548
|
case GGML_FTYPE_MOSTLY_IQ3_XXS: wtype = GGML_TYPE_IQ3_XXS; break;
|
2394
2549
|
case GGML_FTYPE_MOSTLY_IQ1_S: wtype = GGML_TYPE_IQ1_S; break;
|
2550
|
+
case GGML_FTYPE_MOSTLY_IQ1_M: wtype = GGML_TYPE_IQ1_M; break;
|
2395
2551
|
case GGML_FTYPE_MOSTLY_IQ4_NL: wtype = GGML_TYPE_IQ4_NL; break;
|
2396
2552
|
case GGML_FTYPE_MOSTLY_IQ4_XS: wtype = GGML_TYPE_IQ4_XS; break;
|
2397
2553
|
case GGML_FTYPE_MOSTLY_IQ3_S: wtype = GGML_TYPE_IQ3_S; break;
|
@@ -2447,6 +2603,16 @@ static inline bool ggml_is_padded_1d(const struct ggml_tensor * tensor) {
|
|
2447
2603
|
tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
|
2448
2604
|
}
|
2449
2605
|
|
2606
|
+
GGML_CALL bool ggml_is_empty(const struct ggml_tensor * tensor) {
|
2607
|
+
for (int i = 0; i < GGML_MAX_DIMS; ++i) {
|
2608
|
+
if (tensor->ne[i] == 0) {
|
2609
|
+
// empty if any dimension has no elements
|
2610
|
+
return true;
|
2611
|
+
}
|
2612
|
+
}
|
2613
|
+
return false;
|
2614
|
+
}
|
2615
|
+
|
2450
2616
|
bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
|
2451
2617
|
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
|
2452
2618
|
|
@@ -2461,7 +2627,7 @@ bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor
|
|
2461
2627
|
static inline bool ggml_can_repeat(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
|
2462
2628
|
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
|
2463
2629
|
|
2464
|
-
return
|
2630
|
+
return ggml_is_empty(t0) ? ggml_is_empty(t1) :
|
2465
2631
|
(t1->ne[0]%t0->ne[0] == 0) &&
|
2466
2632
|
(t1->ne[1]%t0->ne[1] == 0) &&
|
2467
2633
|
(t1->ne[2]%t0->ne[2] == 0) &&
|
@@ -2545,14 +2711,8 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
|
|
2545
2711
|
GGML_PRINT_DEBUG("%s: g_state initialized in %f ms\n", __func__, (t_end - t_start)/1000.0f);
|
2546
2712
|
}
|
2547
2713
|
|
2548
|
-
#if defined(GGML_USE_CUBLAS)
|
2549
|
-
ggml_init_cublas();
|
2550
|
-
#elif defined(GGML_USE_CLBLAST)
|
2714
|
+
#if defined(GGML_USE_CLBLAST)
|
2551
2715
|
ggml_cl_init();
|
2552
|
-
#elif defined(GGML_USE_VULKAN)
|
2553
|
-
ggml_vk_init_cpu_assist();
|
2554
|
-
#elif defined(GGML_USE_SYCL)
|
2555
|
-
ggml_init_sycl();
|
2556
2716
|
#endif
|
2557
2717
|
|
2558
2718
|
ggml_setup_op_has_task_pass();
|
@@ -2772,7 +2932,7 @@ static struct ggml_tensor * ggml_new_tensor_impl(
|
|
2772
2932
|
data_size *= ne[i];
|
2773
2933
|
}
|
2774
2934
|
|
2775
|
-
GGML_ASSERT(view_src == NULL || data_size + view_offs <= ggml_nbytes(view_src));
|
2935
|
+
GGML_ASSERT(view_src == NULL || data_size == 0 || data_size + view_offs <= ggml_nbytes(view_src));
|
2776
2936
|
|
2777
2937
|
void * data = view_src != NULL ? view_src->data : NULL;
|
2778
2938
|
if (data != NULL) {
|
@@ -4413,45 +4573,38 @@ void ggml_mul_mat_set_prec(
|
|
4413
4573
|
|
4414
4574
|
// ggml_mul_mat_id
|
4415
4575
|
|
4576
|
+
// NOTE: id will be removed in the future and instead all the experts listed in ids will be computed
|
4577
|
+
// this will allow computing all the used experts in a single matrix multiplication
|
4416
4578
|
struct ggml_tensor * ggml_mul_mat_id(
|
4417
4579
|
struct ggml_context * ctx,
|
4418
|
-
struct ggml_tensor * const as[],
|
4419
|
-
int n_as,
|
4580
|
+
struct ggml_tensor * as,
|
4420
4581
|
struct ggml_tensor * ids,
|
4421
4582
|
int id,
|
4422
4583
|
struct ggml_tensor * b) {
|
4423
4584
|
|
4424
4585
|
GGML_ASSERT(ids->type == GGML_TYPE_I32);
|
4425
|
-
GGML_ASSERT(ids->ne[2] == 1 && ids->ne[3] == 1);
|
4426
|
-
GGML_ASSERT(ids->ne[1] == b->ne[1]);
|
4586
|
+
GGML_ASSERT(ids->ne[2] == 1 && ids->ne[3] == 1); // ids is 2d
|
4587
|
+
GGML_ASSERT(ids->ne[1] == b->ne[1]); // must have an expert per b row
|
4427
4588
|
GGML_ASSERT(ids->ne[2] == b->ne[2] && ids->ne[3] == b->ne[3]);
|
4428
|
-
GGML_ASSERT(
|
4429
|
-
GGML_ASSERT(
|
4589
|
+
GGML_ASSERT(id >= 0 && id < ids->ne[0]); // valid id
|
4590
|
+
GGML_ASSERT(as->ne[0] == b->ne[0]); // can_mul_mat
|
4430
4591
|
|
4431
4592
|
bool is_node = false;
|
4432
4593
|
|
4433
|
-
if (as[0]->grad || b->grad) {
|
4594
|
+
if (as->grad || b->grad) {
|
4434
4595
|
is_node = true;
|
4435
4596
|
}
|
4436
4597
|
|
4437
|
-
const int64_t ne[4] = { as[0]->ne[1], b->ne[1], b->ne[2], b->ne[3] };
|
4598
|
+
const int64_t ne[4] = { as->ne[1], b->ne[1], b->ne[2], b->ne[3] };
|
4438
4599
|
struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
|
4439
4600
|
|
4440
4601
|
ggml_set_op_params_i32(result, 0, id);
|
4441
|
-
ggml_set_op_params_i32(result, 1, n_as);
|
4442
4602
|
|
4443
4603
|
result->op = GGML_OP_MUL_MAT_ID;
|
4444
4604
|
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
4445
|
-
result->src[0] = ids;
|
4605
|
+
result->src[0] = as;
|
4446
4606
|
result->src[1] = b;
|
4447
|
-
|
4448
|
-
for (int i = 0; i < n_as; i++) {
|
4449
|
-
struct ggml_tensor * a = as[i];
|
4450
|
-
GGML_ASSERT(ggml_are_same_shape(as[0], a));
|
4451
|
-
GGML_ASSERT(ggml_can_mul_mat(a, b));
|
4452
|
-
GGML_ASSERT(!ggml_is_transposed(a));
|
4453
|
-
result->src[i + 2] = a;
|
4454
|
-
}
|
4607
|
+
result->src[2] = ids;
|
4455
4608
|
|
4456
4609
|
return result;
|
4457
4610
|
}
|
@@ -7992,6 +8145,7 @@ static void ggml_compute_forward_add(
|
|
7992
8145
|
case GGML_TYPE_IQ2_XS:
|
7993
8146
|
case GGML_TYPE_IQ3_XXS:
|
7994
8147
|
case GGML_TYPE_IQ1_S:
|
8148
|
+
case GGML_TYPE_IQ1_M:
|
7995
8149
|
case GGML_TYPE_IQ4_NL:
|
7996
8150
|
case GGML_TYPE_IQ4_XS:
|
7997
8151
|
case GGML_TYPE_IQ3_S:
|
@@ -8274,6 +8428,7 @@ static void ggml_compute_forward_add1(
|
|
8274
8428
|
case GGML_TYPE_IQ2_XS:
|
8275
8429
|
case GGML_TYPE_IQ3_XXS:
|
8276
8430
|
case GGML_TYPE_IQ1_S:
|
8431
|
+
case GGML_TYPE_IQ1_M:
|
8277
8432
|
case GGML_TYPE_IQ4_NL:
|
8278
8433
|
case GGML_TYPE_IQ4_XS:
|
8279
8434
|
case GGML_TYPE_IQ3_S:
|
@@ -8401,6 +8556,7 @@ static void ggml_compute_forward_acc(
|
|
8401
8556
|
case GGML_TYPE_IQ2_XS:
|
8402
8557
|
case GGML_TYPE_IQ3_XXS:
|
8403
8558
|
case GGML_TYPE_IQ1_S:
|
8559
|
+
case GGML_TYPE_IQ1_M:
|
8404
8560
|
case GGML_TYPE_IQ4_NL:
|
8405
8561
|
case GGML_TYPE_IQ4_XS:
|
8406
8562
|
case GGML_TYPE_IQ3_S:
|
@@ -10785,10 +10941,9 @@ static void ggml_compute_forward_mul_mat_id(
|
|
10785
10941
|
const struct ggml_compute_params * params,
|
10786
10942
|
struct ggml_tensor * dst) {
|
10787
10943
|
|
10788
|
-
const struct ggml_tensor * ids = dst->src[0];
|
10944
|
+
const struct ggml_tensor * src0 = dst->src[0];
|
10789
10945
|
const struct ggml_tensor * src1 = dst->src[1];
|
10790
|
-
|
10791
|
-
const struct ggml_tensor * src0 = dst->src[2]; // only for GGML_TENSOR_BINARY_OP_LOCALS
|
10946
|
+
const struct ggml_tensor * ids = dst->src[2];
|
10792
10947
|
|
10793
10948
|
GGML_TENSOR_BINARY_OP_LOCALS
|
10794
10949
|
|
@@ -10818,13 +10973,13 @@ static void ggml_compute_forward_mul_mat_id(
|
|
10818
10973
|
GGML_ASSERT(nb1 <= nb2);
|
10819
10974
|
GGML_ASSERT(nb2 <= nb3);
|
10820
10975
|
|
10821
|
-
// broadcast
|
10822
|
-
|
10823
|
-
|
10976
|
+
// broadcast is not supported with mmid
|
10977
|
+
assert(ne12 == 1);
|
10978
|
+
assert(ne13 == 1);
|
10824
10979
|
|
10825
10980
|
// row groups
|
10826
10981
|
const int id = ggml_get_op_params_i32(dst, 0);
|
10827
|
-
const int n_as = ggml_get_op_params_i32(dst, 1);
|
10982
|
+
const int n_as = src0->ne[2];
|
10828
10983
|
|
10829
10984
|
char * wdata_src1_end = (src1->type == vec_dot_type) ?
|
10830
10985
|
(char *) params->wdata :
|
@@ -10884,7 +11039,7 @@ static void ggml_compute_forward_mul_mat_id(
|
|
10884
11039
|
continue;
|
10885
11040
|
}
|
10886
11041
|
|
10887
|
-
|
11042
|
+
size_t src0_offset = cur_a*src0->nb[2];
|
10888
11043
|
|
10889
11044
|
const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
|
10890
11045
|
const size_t row_size = ggml_row_size(vec_dot_type, ne10);
|
@@ -10919,9 +11074,6 @@ static void ggml_compute_forward_mul_mat_id(
|
|
10919
11074
|
continue;
|
10920
11075
|
}
|
10921
11076
|
|
10922
|
-
assert(ne12 % ne02 == 0);
|
10923
|
-
assert(ne13 % ne03 == 0);
|
10924
|
-
|
10925
11077
|
// block-tiling attempt
|
10926
11078
|
const int64_t blck_0 = 16;
|
10927
11079
|
const int64_t blck_1 = 16;
|
@@ -10938,14 +11090,14 @@ static void ggml_compute_forward_mul_mat_id(
|
|
10938
11090
|
const int64_t i11 = MMID_MATRIX_ROW(cur_a, _i11);
|
10939
11091
|
|
10940
11092
|
// broadcast src0 into src1
|
10941
|
-
const int64_t i03 = i13/r3;
|
10942
|
-
const int64_t i02 = i12/r2;
|
11093
|
+
//const int64_t i03 = i13/r3;
|
11094
|
+
//const int64_t i02 = i12/r2;
|
10943
11095
|
|
10944
11096
|
const int64_t i1 = i11;
|
10945
11097
|
const int64_t i2 = i12;
|
10946
11098
|
const int64_t i3 = i13;
|
10947
11099
|
|
10948
|
-
const char * src0_row = (const char *)
|
11100
|
+
const char * src0_row = (const char *) src0->data + src0_offset;
|
10949
11101
|
|
10950
11102
|
// desc: when src1 is not a contiguous memory block we have to calculate the offset using the strides
|
10951
11103
|
// if it is, then we have either copied the data to params->wdata and made it contiguous or we are using
|
@@ -11010,7 +11162,6 @@ static void ggml_compute_forward_out_prod_f32(
|
|
11010
11162
|
// nb01 >= nb00 - src0 is not transposed
|
11011
11163
|
// compute by src0 rows
|
11012
11164
|
|
11013
|
-
// TODO: #if defined(GGML_USE_CUBLAS) ggml_cuda_out_prod
|
11014
11165
|
// TODO: #if defined(GGML_USE_CLBLAST)
|
11015
11166
|
|
11016
11167
|
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
|
@@ -11210,7 +11361,6 @@ static void ggml_compute_forward_out_prod_q_f32(
|
|
11210
11361
|
// nb01 >= nb00 - src0 is not transposed
|
11211
11362
|
// compute by src0 rows
|
11212
11363
|
|
11213
|
-
// TODO: #if defined(GGML_USE_CUBLAS) ggml_cuda_out_prod
|
11214
11364
|
// TODO: #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
|
11215
11365
|
|
11216
11366
|
if (params->type == GGML_TASK_TYPE_INIT) {
|
@@ -11306,6 +11456,7 @@ static void ggml_compute_forward_out_prod(
|
|
11306
11456
|
case GGML_TYPE_IQ2_XS:
|
11307
11457
|
case GGML_TYPE_IQ3_XXS:
|
11308
11458
|
case GGML_TYPE_IQ1_S:
|
11459
|
+
case GGML_TYPE_IQ1_M:
|
11309
11460
|
case GGML_TYPE_IQ4_NL:
|
11310
11461
|
case GGML_TYPE_IQ4_XS:
|
11311
11462
|
case GGML_TYPE_IQ3_S:
|
@@ -11497,6 +11648,7 @@ static void ggml_compute_forward_set(
|
|
11497
11648
|
case GGML_TYPE_IQ2_XS:
|
11498
11649
|
case GGML_TYPE_IQ3_XXS:
|
11499
11650
|
case GGML_TYPE_IQ1_S:
|
11651
|
+
case GGML_TYPE_IQ1_M:
|
11500
11652
|
case GGML_TYPE_IQ4_NL:
|
11501
11653
|
case GGML_TYPE_IQ4_XS:
|
11502
11654
|
case GGML_TYPE_IQ3_S:
|
@@ -11720,6 +11872,7 @@ static void ggml_compute_forward_get_rows(
|
|
11720
11872
|
case GGML_TYPE_IQ2_XS:
|
11721
11873
|
case GGML_TYPE_IQ3_XXS:
|
11722
11874
|
case GGML_TYPE_IQ1_S:
|
11875
|
+
case GGML_TYPE_IQ1_M:
|
11723
11876
|
case GGML_TYPE_IQ4_NL:
|
11724
11877
|
case GGML_TYPE_IQ4_XS:
|
11725
11878
|
case GGML_TYPE_IQ3_S:
|
@@ -12423,6 +12576,7 @@ static void ggml_compute_forward_alibi(
|
|
12423
12576
|
case GGML_TYPE_IQ2_XS:
|
12424
12577
|
case GGML_TYPE_IQ3_XXS:
|
12425
12578
|
case GGML_TYPE_IQ1_S:
|
12579
|
+
case GGML_TYPE_IQ1_M:
|
12426
12580
|
case GGML_TYPE_IQ4_NL:
|
12427
12581
|
case GGML_TYPE_IQ4_XS:
|
12428
12582
|
case GGML_TYPE_IQ3_S:
|
@@ -12511,6 +12665,7 @@ static void ggml_compute_forward_clamp(
|
|
12511
12665
|
case GGML_TYPE_IQ2_XS:
|
12512
12666
|
case GGML_TYPE_IQ3_XXS:
|
12513
12667
|
case GGML_TYPE_IQ1_S:
|
12668
|
+
case GGML_TYPE_IQ1_M:
|
12514
12669
|
case GGML_TYPE_IQ4_NL:
|
12515
12670
|
case GGML_TYPE_IQ4_XS:
|
12516
12671
|
case GGML_TYPE_IQ3_S:
|
@@ -15952,37 +16107,10 @@ static void ggml_compute_forward_cross_entropy_loss_back(
|
|
15952
16107
|
static void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
|
15953
16108
|
GGML_ASSERT(params);
|
15954
16109
|
|
15955
|
-
if (tensor->op == GGML_OP_NONE) {
|
15956
|
-
return;
|
15957
|
-
}
|
15958
|
-
|
15959
|
-
#ifdef GGML_USE_CUBLAS
|
15960
|
-
bool skip_cpu = ggml_cuda_compute_forward(params, tensor);
|
15961
|
-
if (skip_cpu) {
|
16110
|
+
if (tensor->op == GGML_OP_NONE || ggml_is_empty(tensor)) {
|
15962
16111
|
return;
|
15963
16112
|
}
|
15964
|
-
GGML_ASSERT(tensor->src[0] == NULL || tensor->src[0]->backend == GGML_BACKEND_TYPE_CPU);
|
15965
|
-
GGML_ASSERT(tensor->src[1] == NULL || tensor->src[1]->backend == GGML_BACKEND_TYPE_CPU);
|
15966
|
-
#elif defined(GGML_USE_VULKAN)
|
15967
|
-
const bool skip_cpu = ggml_vk_compute_forward_cpu_assist(params, tensor);
|
15968
|
-
#ifdef GGML_VULKAN_CHECK_RESULTS
|
15969
|
-
if (skip_cpu) {
|
15970
|
-
ggml_vk_check_results_1_cpu_assist(params, tensor);
|
15971
|
-
}
|
15972
|
-
#endif
|
15973
|
-
if (skip_cpu) {
|
15974
|
-
return;
|
15975
|
-
}
|
15976
|
-
GGML_ASSERT(tensor->src[0] == NULL || tensor->src[0]->backend == GGML_BACKEND_TYPE_CPU);
|
15977
|
-
GGML_ASSERT(tensor->src[1] == NULL || tensor->src[1]->backend == GGML_BACKEND_TYPE_CPU);
|
15978
|
-
#endif // GGML_USE_CUBLAS
|
15979
16113
|
|
15980
|
-
#ifdef GGML_USE_SYCL
|
15981
|
-
bool skip_cpu = ggml_sycl_compute_forward(params, tensor);
|
15982
|
-
if (skip_cpu) {
|
15983
|
-
return;
|
15984
|
-
}
|
15985
|
-
#endif // GGML_USE_SYCL
|
15986
16114
|
switch (tensor->op) {
|
15987
16115
|
case GGML_OP_DUP:
|
15988
16116
|
{
|
@@ -17834,6 +17962,12 @@ static void ggml_graph_compute_perf_stats_node(struct ggml_tensor * node, const
|
|
17834
17962
|
static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads, int n_cur_threads) {
|
17835
17963
|
int n_tasks = 0;
|
17836
17964
|
|
17965
|
+
if (ggml_is_empty(node)) {
|
17966
|
+
// no need to multi-thread a no-op
|
17967
|
+
n_tasks = 1;
|
17968
|
+
return n_tasks;
|
17969
|
+
}
|
17970
|
+
|
17837
17971
|
switch (node->op) {
|
17838
17972
|
case GGML_OP_CPY:
|
17839
17973
|
case GGML_OP_DUP:
|
@@ -18319,13 +18453,13 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
|
|
18319
18453
|
case GGML_OP_MUL_MAT_ID:
|
18320
18454
|
{
|
18321
18455
|
cur = 0;
|
18322
|
-
const struct ggml_tensor * src0 = node->src[2];
|
18456
|
+
const struct ggml_tensor * src0 = node->src[0];
|
18323
18457
|
const struct ggml_tensor * src1 = node->src[1];
|
18324
18458
|
const enum ggml_type vec_dot_type = type_traits[src0->type].vec_dot_type;
|
18325
18459
|
if (src1->type != vec_dot_type) {
|
18326
18460
|
cur += ggml_row_size(vec_dot_type, ggml_nelements(src1));
|
18327
18461
|
}
|
18328
|
-
const int n_as = ggml_get_op_params_i32(node, 1);
|
18462
|
+
const int n_as = src0->ne[2];
|
18329
18463
|
cur += GGML_PAD(cur, sizeof(int64_t)); // align
|
18330
18464
|
cur += n_as * sizeof(int64_t); // matrix_row_counts
|
18331
18465
|
cur += n_as * src1->ne[1] * sizeof(int64_t); // matrix_rows
|
@@ -18452,17 +18586,6 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
|
|
18452
18586
|
}
|
18453
18587
|
}
|
18454
18588
|
|
18455
|
-
#ifdef GGML_USE_VULKAN
|
18456
|
-
for (int i = 0; i < cgraph->n_nodes; i++) {
|
18457
|
-
ggml_vk_preallocate_buffers_graph_cpu_assist(cgraph->nodes[i]);
|
18458
|
-
}
|
18459
|
-
ggml_vk_preallocate_buffers_cpu_assist();
|
18460
|
-
|
18461
|
-
for (int i = 0; i < cgraph->n_nodes; i++) {
|
18462
|
-
ggml_vk_build_graph_cpu_assist(cgraph->nodes[i], i == cgraph->n_nodes - 1);
|
18463
|
-
}
|
18464
|
-
#endif
|
18465
|
-
|
18466
18589
|
const int n_threads = cplan->n_threads;
|
18467
18590
|
|
18468
18591
|
struct ggml_compute_state_shared state_shared = {
|
@@ -18519,10 +18642,6 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
|
|
18519
18642
|
}
|
18520
18643
|
}
|
18521
18644
|
|
18522
|
-
#ifdef GGML_USE_VULKAN
|
18523
|
-
ggml_vk_graph_cleanup_cpu_assist();
|
18524
|
-
#endif
|
18525
|
-
|
18526
18645
|
// performance stats (graph)
|
18527
18646
|
{
|
18528
18647
|
int64_t perf_cycles_cur = ggml_perf_cycles() - perf_start_cycles;
|
@@ -18657,7 +18776,7 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) {
|
|
18657
18776
|
|
18658
18777
|
// write binary data
|
18659
18778
|
{
|
18660
|
-
FILE * fout = fopen(fname, "wb");
|
18779
|
+
FILE * fout = ggml_fopen(fname, "wb");
|
18661
18780
|
|
18662
18781
|
if (!fout) {
|
18663
18782
|
fprintf(stderr, "%s: failed to open %s\n", __func__, fname);
|
@@ -18795,7 +18914,7 @@ struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context *
|
|
18795
18914
|
|
18796
18915
|
// read file into data
|
18797
18916
|
{
|
18798
|
-
FILE * fin = fopen(fname, "rb");
|
18917
|
+
FILE * fin = ggml_fopen(fname, "rb");
|
18799
18918
|
if (!fin) {
|
18800
18919
|
fprintf(stderr, "%s: failed to open %s\n", __func__, fname);
|
18801
18920
|
return result;
|
@@ -19131,7 +19250,7 @@ static void ggml_graph_dump_dot_leaf_edge(FILE * fp, struct ggml_tensor * node,
|
|
19131
19250
|
void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph * gf, const char * filename) {
|
19132
19251
|
char color[16];
|
19133
19252
|
|
19134
|
-
FILE * fp = fopen(filename, "w");
|
19253
|
+
FILE * fp = ggml_fopen(filename, "w");
|
19135
19254
|
GGML_ASSERT(fp);
|
19136
19255
|
|
19137
19256
|
fprintf(fp, "digraph G {\n");
|
@@ -20178,7 +20297,8 @@ void ggml_quantize_init(enum ggml_type type) {
|
|
20178
20297
|
case GGML_TYPE_IQ2_XXS:
|
20179
20298
|
case GGML_TYPE_IQ2_XS:
|
20180
20299
|
case GGML_TYPE_IQ2_S:
|
20181
|
-
case GGML_TYPE_IQ1_S:
|
20300
|
+
case GGML_TYPE_IQ1_S:
|
20301
|
+
case GGML_TYPE_IQ1_M: iq2xs_init_impl(type); break;
|
20182
20302
|
case GGML_TYPE_IQ3_XXS: iq3xs_init_impl(256); break;
|
20183
20303
|
case GGML_TYPE_IQ3_S: iq3xs_init_impl(512); break;
|
20184
20304
|
default: // nothing
|
@@ -20203,7 +20323,8 @@ bool ggml_quantize_requires_imatrix(enum ggml_type type) {
|
|
20203
20323
|
return
|
20204
20324
|
type == GGML_TYPE_IQ2_XXS ||
|
20205
20325
|
type == GGML_TYPE_IQ2_XS ||
|
20206
|
-
type == GGML_TYPE_IQ1_S
|
20326
|
+
type == GGML_TYPE_IQ1_S;// ||
|
20327
|
+
//type == GGML_TYPE_IQ1_M;
|
20207
20328
|
}
|
20208
20329
|
|
20209
20330
|
size_t ggml_quantize_chunk(
|
@@ -20247,6 +20368,7 @@ size_t ggml_quantize_chunk(
|
|
20247
20368
|
case GGML_TYPE_IQ3_S: result = quantize_iq3_s (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
|
20248
20369
|
case GGML_TYPE_IQ2_S: result = quantize_iq2_s (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
|
20249
20370
|
case GGML_TYPE_IQ1_S: result = quantize_iq1_s (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
|
20371
|
+
case GGML_TYPE_IQ1_M: result = quantize_iq1_m (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
|
20250
20372
|
case GGML_TYPE_IQ4_NL: result = quantize_iq4_nl (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
|
20251
20373
|
#if QK_K == 64
|
20252
20374
|
case GGML_TYPE_IQ4_XS: result = quantize_iq4_nl (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
|
@@ -20449,7 +20571,7 @@ struct gguf_context * gguf_init_empty(void) {
|
|
20449
20571
|
}
|
20450
20572
|
|
20451
20573
|
struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params) {
|
20452
|
-
FILE * file = fopen(fname, "rb");
|
20574
|
+
FILE * file = ggml_fopen(fname, "rb");
|
20453
20575
|
if (!file) {
|
20454
20576
|
return NULL;
|
20455
20577
|
}
|
@@ -21404,7 +21526,7 @@ static void gguf_write_to_buf(const struct gguf_context * ctx, struct gguf_buf *
|
|
21404
21526
|
}
|
21405
21527
|
|
21406
21528
|
void gguf_write_to_file(const struct gguf_context * ctx, const char * fname, bool only_meta) {
|
21407
|
-
FILE * file = fopen(fname, "wb");
|
21529
|
+
FILE * file = ggml_fopen(fname, "wb");
|
21408
21530
|
if (!file) {
|
21409
21531
|
GGML_ASSERT(false && "failed to open file for writing");
|
21410
21532
|
}
|
@@ -21546,15 +21668,15 @@ int ggml_cpu_has_wasm_simd(void) {
|
|
21546
21668
|
}
|
21547
21669
|
|
21548
21670
|
int ggml_cpu_has_blas(void) {
|
21549
|
-
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_VULKAN) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_SYCL)
|
21671
|
+
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUDA) || defined(GGML_USE_VULKAN) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_SYCL)
|
21550
21672
|
return 1;
|
21551
21673
|
#else
|
21552
21674
|
return 0;
|
21553
21675
|
#endif
|
21554
21676
|
}
|
21555
21677
|
|
21556
|
-
int ggml_cpu_has_cublas(void) {
|
21557
|
-
#if defined(GGML_USE_CUBLAS)
|
21678
|
+
int ggml_cpu_has_cuda(void) {
|
21679
|
+
#if defined(GGML_USE_CUDA)
|
21558
21680
|
return 1;
|
21559
21681
|
#else
|
21560
21682
|
return 0;
|
@@ -21594,7 +21716,7 @@ int ggml_cpu_has_sycl(void) {
|
|
21594
21716
|
}
|
21595
21717
|
|
21596
21718
|
int ggml_cpu_has_gpublas(void) {
|
21597
|
-
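return ggml_cpu_has_cublas() || ggml_cpu_has_clblast() || ggml_cpu_has_vulkan() || ggml_cpu_has_kompute() ||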
|
21719
|
+
return ggml_cpu_has_cuda() || ggml_cpu_has_clblast() || ggml_cpu_has_vulkan() || ggml_cpu_has_kompute() ||
|
21598
21720
|
ggml_cpu_has_sycl();
|
21599
21721
|
}
|
21600
21722
|
|