llama_cpp 0.14.3 → 0.14.4
- checksums.yaml +4 -4
- data/CHANGELOG.md +8 -0
- data/ext/llama_cpp/llama_cpp.cpp +4 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +4 -0
- data/vendor/tmp/llama.cpp/Makefile +71 -18
- data/vendor/tmp/llama.cpp/ggml-alloc.c +7 -2
- data/vendor/tmp/llama.cpp/ggml-backend.c +1 -1
- data/vendor/tmp/llama.cpp/ggml-common.h +25 -2
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +300 -9333
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +4 -0
- data/vendor/tmp/llama.cpp/ggml-metal.m +133 -113
- data/vendor/tmp/llama.cpp/ggml-metal.metal +344 -276
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +5 -0
- data/vendor/tmp/llama.cpp/ggml-quants.c +638 -43
- data/vendor/tmp/llama.cpp/ggml-quants.h +3 -0
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +106 -393
- data/vendor/tmp/llama.cpp/ggml-sycl.h +13 -3
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +37199 -14939
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +329 -308
- data/vendor/tmp/llama.cpp/ggml-vulkan.h +0 -11
- data/vendor/tmp/llama.cpp/ggml.c +133 -93
- data/vendor/tmp/llama.cpp/ggml.h +11 -5
- data/vendor/tmp/llama.cpp/llama.cpp +1763 -431
- data/vendor/tmp/llama.cpp/llama.h +67 -19
- data/vendor/tmp/llama.cpp/unicode-data.cpp +1651 -0
- data/vendor/tmp/llama.cpp/unicode-data.h +16 -0
- data/vendor/tmp/llama.cpp/unicode.cpp +8 -1403
- data/vendor/tmp/llama.cpp/unicode.h +2 -0
- metadata +5 -3
data/vendor/tmp/llama.cpp/ggml-vulkan.h
CHANGED
@@ -11,17 +11,6 @@ extern "C" {
 #define GGML_VK_MAX_DEVICES 16
 
 GGML_API void ggml_vk_instance_init(void);
-GGML_API void ggml_vk_init_cpu_assist(void);
-
-GGML_API void ggml_vk_preallocate_buffers_graph_cpu_assist(struct ggml_tensor * node);
-GGML_API void ggml_vk_preallocate_buffers_cpu_assist(void);
-GGML_API void ggml_vk_build_graph_cpu_assist(struct ggml_tensor * node, bool last_node);
-GGML_API bool ggml_vk_compute_forward_cpu_assist(struct ggml_compute_params * params, struct ggml_tensor * tensor);
-#ifdef GGML_VULKAN_CHECK_RESULTS
-void ggml_vk_check_results_1_cpu_assist(struct ggml_compute_params * params, struct ggml_tensor * tensor);
-#endif
-GGML_API void ggml_vk_graph_cleanup_cpu_assist(void);
-GGML_API void ggml_vk_free_cpu_assist(void);
 
 // backend API
 GGML_API GGML_CALL ggml_backend_t ggml_backend_vk_init(size_t dev_num);
data/vendor/tmp/llama.cpp/ggml.c
CHANGED
@@ -3,6 +3,7 @@
 
 #include "ggml-impl.h"
 #include "ggml-quants.h"
+#include "ggml.h"
 
 #if defined(_MSC_VER) || defined(__MINGW32__)
 #include <malloc.h> // using malloc.h with MSC/MINGW
@@ -43,6 +44,10 @@
 
 #if defined(_WIN32)
 
+#define WIN32_LEAN_AND_MEAN
+#ifndef NOMINMAX
+    #define NOMINMAX
+#endif
 #include <windows.h>
 
 typedef volatile LONG atomic_int;
@@ -273,8 +278,6 @@ inline static void * ggml_calloc(size_t num, size_t size) {
 #include <Accelerate/Accelerate.h>
 #if defined(GGML_USE_CLBLAST) // allow usage of CLBlast alongside Accelerate functions
 #include "ggml-opencl.h"
-#elif defined(GGML_USE_VULKAN)
-#include "ggml-vulkan.h"
 #endif
 #elif defined(GGML_USE_OPENBLAS)
 #if defined(GGML_BLAS_USE_MKL)
@@ -284,10 +287,6 @@ inline static void * ggml_calloc(size_t num, size_t size) {
 #endif
 #elif defined(GGML_USE_CLBLAST)
 #include "ggml-opencl.h"
-#elif defined(GGML_USE_VULKAN)
-#include "ggml-vulkan.h"
-#elif defined(GGML_USE_SYCL)
-#include "ggml-sycl.h"
 #endif
 
 // floating point type used to accumulate sums
@@ -430,6 +429,57 @@ int64_t ggml_cycles_per_ms(void) {
 #define ggml_perf_cycles_per_ms() 0
 #endif
 
+//
+// cross-platform UTF-8 file paths
+//
+
+#ifdef _WIN32
+static wchar_t * ggml_mbstowcs(const char * mbs) {
+    int wlen = MultiByteToWideChar(CP_UTF8, 0, mbs, -1, NULL, 0);
+    if (!wlen) {
+        errno = EINVAL;
+        return NULL;
+    }
+
+    wchar_t * wbuf = GGML_MALLOC(wlen * sizeof(wchar_t));
+    wlen = MultiByteToWideChar(CP_UTF8, 0, mbs, -1, wbuf, wlen);
+    if (!wlen) {
+        GGML_FREE(wbuf);
+        errno = EINVAL;
+        return NULL;
+    }
+
+    return wbuf;
+}
+#endif
+
+FILE * ggml_fopen(const char * fname, const char * mode) {
+#ifdef _WIN32
+    FILE * file = NULL;
+
+    // convert fname (UTF-8)
+    wchar_t * wfname = ggml_mbstowcs(fname);
+    if (wfname) {
+        // convert mode (ANSI)
+        wchar_t * wmode = GGML_MALLOC((strlen(mode) + 1) * sizeof(wchar_t));
+        wchar_t * wmode_p = wmode;
+        do {
+            *wmode_p++ = (wchar_t)*mode;
+        } while (*mode++);
+
+        // open file
+        file = _wfopen(wfname, wmode);
+
+        GGML_FREE(wfname);
+        GGML_FREE(wmode);
+    }
+
+    return file;
+#else
+    return fopen(fname, mode);
+#endif
+}
+
 //
 // cache line
 //
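The hunk above adds ggml_fopen: on Windows it widens the UTF-8 path with MultiByteToWideChar and opens it via _wfopen, so paths outside the active ANSI code page now work; on other platforms it is a plain fopen. A minimal usage sketch (the model path below is illustrative, not from this diff):

    // Open a GGUF file whose name contains non-ASCII characters.
    // Behaves like fopen everywhere; on Windows the UTF-8 path is
    // converted to UTF-16 before the file is opened.
    FILE * f = ggml_fopen("models/modèle-7b.gguf", "rb");
    if (f != NULL) {
        // ... read the file ...
        fclose(f);
    }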
@@ -740,6 +790,18 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .vec_dot_type = GGML_TYPE_Q8_K,
         .nrows = 1,
     },
+    [GGML_TYPE_IQ1_M] = {
+        .type_name = "iq1_m",
+        .blck_size = QK_K,
+        .type_size = sizeof(block_iq1_m),
+        .is_quantized = true,
+        .to_float = (ggml_to_float_t) dequantize_row_iq1_m,
+        .from_float = NULL,
+        .from_float_reference = NULL,
+        .vec_dot = ggml_vec_dot_iq1_m_q8_K,
+        .vec_dot_type = GGML_TYPE_Q8_K,
+        .nrows = 1,
+    },
     [GGML_TYPE_IQ4_NL] = {
         .type_name = "iq4_nl",
         .blck_size = QK4_NL,
@@ -2485,6 +2547,7 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) {
         case GGML_FTYPE_MOSTLY_IQ2_XS:  wtype = GGML_TYPE_IQ2_XS;  break;
         case GGML_FTYPE_MOSTLY_IQ3_XXS: wtype = GGML_TYPE_IQ3_XXS; break;
         case GGML_FTYPE_MOSTLY_IQ1_S:   wtype = GGML_TYPE_IQ1_S;   break;
+        case GGML_FTYPE_MOSTLY_IQ1_M:   wtype = GGML_TYPE_IQ1_M;   break;
         case GGML_FTYPE_MOSTLY_IQ4_NL:  wtype = GGML_TYPE_IQ4_NL;  break;
         case GGML_FTYPE_MOSTLY_IQ4_XS:  wtype = GGML_TYPE_IQ4_XS;  break;
         case GGML_FTYPE_MOSTLY_IQ3_S:   wtype = GGML_TYPE_IQ3_S;   break;
@@ -2540,6 +2603,16 @@ static inline bool ggml_is_padded_1d(const struct ggml_tensor * tensor) {
         tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
 }
 
+GGML_CALL bool ggml_is_empty(const struct ggml_tensor * tensor) {
+    for (int i = 0; i < GGML_MAX_DIMS; ++i) {
+        if (tensor->ne[i] == 0) {
+            // empty if any dimension has no elements
+            return true;
+        }
+    }
+    return false;
+}
+
 bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
     static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
 
@@ -2554,7 +2627,7 @@ bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
 static inline bool ggml_can_repeat(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
     static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
 
-    return
+    return ggml_is_empty(t0) ? ggml_is_empty(t1) :
         (t1->ne[0]%t0->ne[0] == 0) &&
         (t1->ne[1]%t0->ne[1] == 0) &&
         (t1->ne[2]%t0->ne[2] == 0) &&
@@ -2640,10 +2713,6 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
 
 #if defined(GGML_USE_CLBLAST)
         ggml_cl_init();
-#elif defined(GGML_USE_VULKAN)
-        ggml_vk_init_cpu_assist();
-#elif defined(GGML_USE_SYCL)
-        ggml_init_sycl();
 #endif
 
         ggml_setup_op_has_task_pass();
@@ -2863,7 +2932,7 @@ static struct ggml_tensor * ggml_new_tensor_impl(
         data_size *= ne[i];
     }
 
-    GGML_ASSERT(view_src == NULL || data_size + view_offs <= ggml_nbytes(view_src));
+    GGML_ASSERT(view_src == NULL || data_size == 0 || data_size + view_offs <= ggml_nbytes(view_src));
 
     void * data = view_src != NULL ? view_src->data : NULL;
     if (data != NULL) {
@@ -4504,45 +4573,38 @@ void ggml_mul_mat_set_prec(
 
 // ggml_mul_mat_id
 
+// NOTE: id will be removed in the future and instead all the experts listed in ids will be computed
+// this will allow computing all the used experts in a single matrix multiplication
 struct ggml_tensor * ggml_mul_mat_id(
         struct ggml_context * ctx,
-        struct ggml_tensor * const as[],
-        int n_as,
+        struct ggml_tensor * as,
         struct ggml_tensor * ids,
         int id,
         struct ggml_tensor * b) {
 
     GGML_ASSERT(ids->type == GGML_TYPE_I32);
-    GGML_ASSERT(ids->ne[2] == 1 && ids->ne[3] == 1);
-    GGML_ASSERT(ids->ne[1] == b->ne[1]);
+    GGML_ASSERT(ids->ne[2] == 1 && ids->ne[3] == 1); // ids is 2d
+    GGML_ASSERT(ids->ne[1] == b->ne[1]); // must have an expert per b row
     GGML_ASSERT(ids->ne[2] == b->ne[2] && ids->ne[3] == b->ne[3]);
-    GGML_ASSERT(n_as > 0 && n_as <= GGML_MAX_SRC - 2);
-    GGML_ASSERT(id >= 0 && id < ids->ne[0]);
+    GGML_ASSERT(id >= 0 && id < ids->ne[0]); // valid id
+    GGML_ASSERT(as->ne[0] == b->ne[0]); // can_mul_mat
 
     bool is_node = false;
 
-    if (as[0]->grad || b->grad) {
+    if (as->grad || b->grad) {
         is_node = true;
     }
 
-    const int64_t ne[4] = { as[0]->ne[1], b->ne[1], b->ne[2], b->ne[3] };
+    const int64_t ne[4] = { as->ne[1], b->ne[1], b->ne[2], b->ne[3] };
     struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
 
     ggml_set_op_params_i32(result, 0, id);
-    ggml_set_op_params_i32(result, 1, n_as);
 
     result->op   = GGML_OP_MUL_MAT_ID;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = ids;
+    result->src[0] = as;
     result->src[1] = b;
-
-    for (int i = 0; i < n_as; i++) {
-        struct ggml_tensor * a = as[i];
-        GGML_ASSERT(ggml_are_same_shape(as[0], a));
-        GGML_ASSERT(ggml_can_mul_mat(a, b));
-        GGML_ASSERT(!ggml_is_transposed(a));
-        result->src[i + 2] = a;
-    }
+    result->src[2] = ids;
 
     return result;
 }
@@ -8083,6 +8145,7 @@ static void ggml_compute_forward_add(
         case GGML_TYPE_IQ2_XS:
         case GGML_TYPE_IQ3_XXS:
         case GGML_TYPE_IQ1_S:
+        case GGML_TYPE_IQ1_M:
         case GGML_TYPE_IQ4_NL:
         case GGML_TYPE_IQ4_XS:
         case GGML_TYPE_IQ3_S:
@@ -8365,6 +8428,7 @@ static void ggml_compute_forward_add1(
         case GGML_TYPE_IQ2_XS:
         case GGML_TYPE_IQ3_XXS:
         case GGML_TYPE_IQ1_S:
+        case GGML_TYPE_IQ1_M:
         case GGML_TYPE_IQ4_NL:
         case GGML_TYPE_IQ4_XS:
         case GGML_TYPE_IQ3_S:
@@ -8492,6 +8556,7 @@ static void ggml_compute_forward_acc(
         case GGML_TYPE_IQ2_XS:
         case GGML_TYPE_IQ3_XXS:
         case GGML_TYPE_IQ1_S:
+        case GGML_TYPE_IQ1_M:
         case GGML_TYPE_IQ4_NL:
         case GGML_TYPE_IQ4_XS:
         case GGML_TYPE_IQ3_S:
@@ -10876,10 +10941,9 @@ static void ggml_compute_forward_mul_mat_id(
         const struct ggml_compute_params * params,
               struct ggml_tensor * dst) {
 
-    const struct ggml_tensor * ids = dst->src[0];
+    const struct ggml_tensor * src0 = dst->src[0];
     const struct ggml_tensor * src1 = dst->src[1];
-
-    const struct ggml_tensor * src0 = dst->src[2]; // only for GGML_TENSOR_BINARY_OP_LOCALS
+    const struct ggml_tensor * ids = dst->src[2];
 
     GGML_TENSOR_BINARY_OP_LOCALS
 
@@ -10909,13 +10973,13 @@ static void ggml_compute_forward_mul_mat_id(
     GGML_ASSERT(nb1 <= nb2);
     GGML_ASSERT(nb2 <= nb3);
 
-    // broadcast factors
-    const int64_t r2 = ne12/ne02;
-    const int64_t r3 = ne13/ne03;
+    // broadcast is not supported with mmid
+    assert(ne12 == 1);
+    assert(ne13 == 1);
 
     // row groups
     const int id   = ggml_get_op_params_i32(dst, 0);
-    const int n_as = ggml_get_op_params_i32(dst, 1);
+    const int n_as = src0->ne[2];
 
     char * wdata_src1_end = (src1->type == vec_dot_type) ?
             (char *) params->wdata :
@@ -10975,7 +11039,7 @@ static void ggml_compute_forward_mul_mat_id(
             continue;
         }
 
-        const struct ggml_tensor * src0_cur = dst->src[cur_a + 2];
+        size_t src0_offset = cur_a*src0->nb[2];
 
         const void * wdata    = (src1->type == vec_dot_type) ? src1->data : params->wdata;
         const size_t row_size = ggml_row_size(vec_dot_type, ne10);
@@ -11010,9 +11074,6 @@ static void ggml_compute_forward_mul_mat_id(
                 continue;
             }
 
-            assert(ne12 % ne02 == 0);
-            assert(ne13 % ne03 == 0);
-
             // block-tiling attempt
             const int64_t blck_0 = 16;
             const int64_t blck_1 = 16;
@@ -11029,14 +11090,14 @@ static void ggml_compute_forward_mul_mat_id(
                     const int64_t i11 = MMID_MATRIX_ROW(cur_a, _i11);
 
                     // broadcast src0 into src1
-                    const int64_t i03 = i13/r3;
-                    const int64_t i02 = i12/r2;
+                    //const int64_t i03 = i13/r3;
+                    //const int64_t i02 = i12/r2;
 
                     const int64_t i1 = i11;
                     const int64_t i2 = i12;
                     const int64_t i3 = i13;
 
-                    const char * src0_row = (const char *) src0_cur->data + (0 + i02*nb02 + i03*nb03);
+                    const char * src0_row = (const char *) src0->data + src0_offset;
 
                     // desc: when src1 is not a contiguous memory block we have to calculate the offset using the strides
                     // if it is, then we have either copied the data to params->wdata and made it contiguous or we are using
@@ -11395,6 +11456,7 @@ static void ggml_compute_forward_out_prod(
         case GGML_TYPE_IQ2_XS:
         case GGML_TYPE_IQ3_XXS:
         case GGML_TYPE_IQ1_S:
+        case GGML_TYPE_IQ1_M:
         case GGML_TYPE_IQ4_NL:
         case GGML_TYPE_IQ4_XS:
         case GGML_TYPE_IQ3_S:
@@ -11586,6 +11648,7 @@ static void ggml_compute_forward_set(
         case GGML_TYPE_IQ2_XS:
         case GGML_TYPE_IQ3_XXS:
         case GGML_TYPE_IQ1_S:
+        case GGML_TYPE_IQ1_M:
         case GGML_TYPE_IQ4_NL:
         case GGML_TYPE_IQ4_XS:
         case GGML_TYPE_IQ3_S:
@@ -11809,6 +11872,7 @@ static void ggml_compute_forward_get_rows(
         case GGML_TYPE_IQ2_XS:
         case GGML_TYPE_IQ3_XXS:
         case GGML_TYPE_IQ1_S:
+        case GGML_TYPE_IQ1_M:
         case GGML_TYPE_IQ4_NL:
         case GGML_TYPE_IQ4_XS:
         case GGML_TYPE_IQ3_S:
@@ -12512,6 +12576,7 @@ static void ggml_compute_forward_alibi(
         case GGML_TYPE_IQ2_XS:
         case GGML_TYPE_IQ3_XXS:
         case GGML_TYPE_IQ1_S:
+        case GGML_TYPE_IQ1_M:
         case GGML_TYPE_IQ4_NL:
         case GGML_TYPE_IQ4_XS:
         case GGML_TYPE_IQ3_S:
@@ -12600,6 +12665,7 @@ static void ggml_compute_forward_clamp(
         case GGML_TYPE_IQ2_XS:
         case GGML_TYPE_IQ3_XXS:
         case GGML_TYPE_IQ1_S:
+        case GGML_TYPE_IQ1_M:
         case GGML_TYPE_IQ4_NL:
         case GGML_TYPE_IQ4_XS:
         case GGML_TYPE_IQ3_S:
@@ -16041,30 +16107,10 @@ static void ggml_compute_forward_cross_entropy_loss_back(
 static void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
     GGML_ASSERT(params);
 
-    if (tensor->op == GGML_OP_NONE) {
-        return;
-    }
-
-#if defined(GGML_USE_VULKAN)
-    const bool skip_cpu = ggml_vk_compute_forward_cpu_assist(params, tensor);
-#ifdef GGML_VULKAN_CHECK_RESULTS
-    if (skip_cpu) {
-        ggml_vk_check_results_1_cpu_assist(params, tensor);
-    }
-#endif
-    if (skip_cpu) {
+    if (tensor->op == GGML_OP_NONE || ggml_is_empty(tensor)) {
         return;
     }
-    GGML_ASSERT(tensor->src[0] == NULL || tensor->src[0]->backend == GGML_BACKEND_TYPE_CPU);
-    GGML_ASSERT(tensor->src[1] == NULL || tensor->src[1]->backend == GGML_BACKEND_TYPE_CPU);
-#endif // GGML_USE_VULKAN
 
-#ifdef GGML_USE_SYCL
-    bool skip_cpu = ggml_sycl_compute_forward(params, tensor);
-    if (skip_cpu) {
-        return;
-    }
-#endif // GGML_USE_SYCL
     switch (tensor->op) {
         case GGML_OP_DUP:
             {
@@ -17916,6 +17962,12 @@ static void ggml_graph_compute_perf_stats_node(struct ggml_tensor * node, const
 static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads, int n_cur_threads) {
     int n_tasks = 0;
 
+    if (ggml_is_empty(node)) {
+        // no need to multi-thread a no-op
+        n_tasks = 1;
+        return n_tasks;
+    }
+
     switch (node->op) {
         case GGML_OP_CPY:
         case GGML_OP_DUP:
@@ -18401,13 +18453,13 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threads) {
             case GGML_OP_MUL_MAT_ID:
                 {
                     cur = 0;
-                    const struct ggml_tensor * src0 = node->src[2];
+                    const struct ggml_tensor * src0 = node->src[0];
                     const struct ggml_tensor * src1 = node->src[1];
                     const enum ggml_type vec_dot_type = type_traits[src0->type].vec_dot_type;
                     if (src1->type != vec_dot_type) {
                         cur += ggml_row_size(vec_dot_type, ggml_nelements(src1));
                     }
-                    const int n_as = ggml_get_op_params_i32(node, 1);
+                    const int n_as = src0->ne[2];
                     cur += GGML_PAD(cur, sizeof(int64_t)); // align
                     cur += n_as * sizeof(int64_t); // matrix_row_counts
                     cur += n_as * src1->ne[1] * sizeof(int64_t); // matrix_rows
@@ -18534,17 +18586,6 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
         }
     }
 
-#ifdef GGML_USE_VULKAN
-    for (int i = 0; i < cgraph->n_nodes; i++) {
-        ggml_vk_preallocate_buffers_graph_cpu_assist(cgraph->nodes[i]);
-    }
-    ggml_vk_preallocate_buffers_cpu_assist();
-
-    for (int i = 0; i < cgraph->n_nodes; i++) {
-        ggml_vk_build_graph_cpu_assist(cgraph->nodes[i], i == cgraph->n_nodes - 1);
-    }
-#endif
-
     const int n_threads = cplan->n_threads;
 
     struct ggml_compute_state_shared state_shared = {
@@ -18601,10 +18642,6 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
         }
     }
 
-#ifdef GGML_USE_VULKAN
-    ggml_vk_graph_cleanup_cpu_assist();
-#endif
-
     // performance stats (graph)
     {
         int64_t perf_cycles_cur = ggml_perf_cycles() - perf_start_cycles;
@@ -18739,7 +18776,7 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) {
 
     // write binary data
     {
-        FILE * fout = fopen(fname, "wb");
+        FILE * fout = ggml_fopen(fname, "wb");
 
         if (!fout) {
             fprintf(stderr, "%s: failed to open %s\n", __func__, fname);
@@ -18877,7 +18914,7 @@ struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context ** ctx_data, struct ggml_context ** ctx_eval) {
 
     // read file into data
     {
-        FILE * fin = fopen(fname, "rb");
+        FILE * fin = ggml_fopen(fname, "rb");
         if (!fin) {
             fprintf(stderr, "%s: failed to open %s\n", __func__, fname);
             return result;
@@ -19213,7 +19250,7 @@ static void ggml_graph_dump_dot_leaf_edge(FILE * fp, struct ggml_tensor * node,
 void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph * gf, const char * filename) {
     char color[16];
 
-    FILE * fp = fopen(filename, "w");
+    FILE * fp = ggml_fopen(filename, "w");
     GGML_ASSERT(fp);
 
     fprintf(fp, "digraph G {\n");
@@ -20260,7 +20297,8 @@ void ggml_quantize_init(enum ggml_type type) {
         case GGML_TYPE_IQ2_XXS:
         case GGML_TYPE_IQ2_XS:
         case GGML_TYPE_IQ2_S:
-        case GGML_TYPE_IQ1_S:   iq2xs_init_impl(type); break;
+        case GGML_TYPE_IQ1_S:
+        case GGML_TYPE_IQ1_M:   iq2xs_init_impl(type); break;
         case GGML_TYPE_IQ3_XXS: iq3xs_init_impl(256); break;
         case GGML_TYPE_IQ3_S:   iq3xs_init_impl(512); break;
         default: // nothing
@@ -20285,7 +20323,8 @@ bool ggml_quantize_requires_imatrix(enum ggml_type type) {
     return
         type == GGML_TYPE_IQ2_XXS ||
         type == GGML_TYPE_IQ2_XS  ||
-        type == GGML_TYPE_IQ1_S;
+        type == GGML_TYPE_IQ1_S;// ||
+        //type == GGML_TYPE_IQ1_M;
 }
 
 size_t ggml_quantize_chunk(
@@ -20329,6 +20368,7 @@ size_t ggml_quantize_chunk(
         case GGML_TYPE_IQ3_S:   result = quantize_iq3_s  (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
         case GGML_TYPE_IQ2_S:   result = quantize_iq2_s  (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
         case GGML_TYPE_IQ1_S:   result = quantize_iq1_s  (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
+        case GGML_TYPE_IQ1_M:   result = quantize_iq1_m  (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
         case GGML_TYPE_IQ4_NL:  result = quantize_iq4_nl (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
 #if QK_K == 64
         case GGML_TYPE_IQ4_XS:  result = quantize_iq4_nl (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
@@ -20531,7 +20571,7 @@ struct gguf_context * gguf_init_empty(void) {
 }
 
 struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params) {
-    FILE * file = fopen(fname, "rb");
+    FILE * file = ggml_fopen(fname, "rb");
     if (!file) {
         return NULL;
     }
@@ -21486,7 +21526,7 @@ static void gguf_write_to_buf(const struct gguf_context * ctx, struct gguf_buf * buf, bool only_meta) {
 }
 
 void gguf_write_to_file(const struct gguf_context * ctx, const char * fname, bool only_meta) {
-    FILE * file = fopen(fname, "wb");
+    FILE * file = ggml_fopen(fname, "wb");
     if (!file) {
         GGML_ASSERT(false && "failed to open file for writing");
     }
@@ -21628,15 +21668,15 @@ int ggml_cpu_has_wasm_simd(void) {
 }
 
 int ggml_cpu_has_blas(void) {
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_VULKAN) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_SYCL)
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUDA) || defined(GGML_USE_VULKAN) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_SYCL)
     return 1;
 #else
     return 0;
 #endif
 }
 
-int ggml_cpu_has_cublas(void) {
-#if defined(GGML_USE_CUBLAS)
+int ggml_cpu_has_cuda(void) {
+#if defined(GGML_USE_CUDA)
     return 1;
 #else
     return 0;
@@ -21676,7 +21716,7 @@ int ggml_cpu_has_sycl(void) {
 }
 
 int ggml_cpu_has_gpublas(void) {
-    return ggml_cpu_has_cublas() || ggml_cpu_has_clblast() || ggml_cpu_has_vulkan() || ggml_cpu_has_kompute() ||
+    return ggml_cpu_has_cuda() || ggml_cpu_has_clblast() || ggml_cpu_has_vulkan() || ggml_cpu_has_kompute() ||
            ggml_cpu_has_sycl();
 }
data/vendor/tmp/llama.cpp/ggml.h
CHANGED
@@ -214,9 +214,10 @@
 #    define GGML_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
 #endif
 
-#include <stdint.h>
-#include <stddef.h>
 #include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <stdio.h>
 
 #define GGML_FILE_MAGIC   0x67676d6c // "ggml"
 #define GGML_FILE_VERSION 1
@@ -368,6 +369,7 @@ extern "C" {
         GGML_TYPE_I32   = 26,
         GGML_TYPE_I64   = 27,
         GGML_TYPE_F64   = 28,
+        GGML_TYPE_IQ1_M = 29,
         GGML_TYPE_COUNT,
     };
 
@@ -407,6 +409,7 @@ extern "C" {
         GGML_FTYPE_MOSTLY_IQ3_S  = 20, // except 1d tensors
         GGML_FTYPE_MOSTLY_IQ2_S  = 21, // except 1d tensors
         GGML_FTYPE_MOSTLY_IQ4_XS = 22, // except 1d tensors
+        GGML_FTYPE_MOSTLY_IQ1_M  = 23, // except 1d tensors
     };
 
     // available tensor operations:
@@ -708,6 +711,9 @@ extern "C" {
 
     GGML_API void ggml_print_backtrace(void);
 
+    // accepts a UTF-8 path, even on Windows
+    GGML_API FILE * ggml_fopen(const char * fname, const char * mode);
+
     GGML_API void ggml_numa_init(enum ggml_numa_strategy numa); // call once for better performance on NUMA systems
     GGML_API bool ggml_is_numa(void); // true if init detected that system has >1 NUMA node
 
|
|
744
750
|
GGML_API GGML_CALL bool ggml_is_transposed(const struct ggml_tensor * tensor);
|
745
751
|
GGML_API GGML_CALL bool ggml_is_contiguous(const struct ggml_tensor * tensor);
|
746
752
|
GGML_API GGML_CALL bool ggml_is_permuted (const struct ggml_tensor * tensor);
|
753
|
+
GGML_API GGML_CALL bool ggml_is_empty (const struct ggml_tensor * tensor);
|
747
754
|
GGML_API bool ggml_is_scalar (const struct ggml_tensor * tensor);
|
748
755
|
GGML_API bool ggml_is_vector (const struct ggml_tensor * tensor);
|
749
756
|
GGML_API bool ggml_is_matrix (const struct ggml_tensor * tensor);
|
@@ -1157,8 +1164,7 @@ extern "C" {
     // ggml_mul_mat_id(ctx, as, ids, id, b) ~= ggml_mul_mat(as[ids[id]], b)
     GGML_API struct ggml_tensor * ggml_mul_mat_id(
             struct ggml_context * ctx,
-            struct ggml_tensor * const as[],
-            int n_as,
+            struct ggml_tensor * as,
             struct ggml_tensor * ids,
            int id,
             struct ggml_tensor * b);
@@ -2350,7 +2356,7 @@ extern "C" {
     GGML_API int ggml_cpu_has_fp16_va  (void);
     GGML_API int ggml_cpu_has_wasm_simd(void);
     GGML_API int ggml_cpu_has_blas     (void);
-    GGML_API int ggml_cpu_has_cublas   (void);
+    GGML_API int ggml_cpu_has_cuda     (void);
     GGML_API int ggml_cpu_has_clblast  (void);
     GGML_API int ggml_cpu_has_vulkan   (void);
     GGML_API int ggml_cpu_has_kompute  (void);