llama_cpp 0.14.3 → 0.14.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +16 -0
- data/examples/chat.rb +2 -4
- data/ext/llama_cpp/extconf.rb +1 -0
- data/ext/llama_cpp/llama_cpp.cpp +27 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +14 -0
- data/vendor/tmp/llama.cpp/LICENSE +1 -1
- data/vendor/tmp/llama.cpp/Makefile +81 -20
- data/vendor/tmp/llama.cpp/ggml-alloc.c +7 -2
- data/vendor/tmp/llama.cpp/ggml-backend.c +1 -1
- data/vendor/tmp/llama.cpp/ggml-backend.h +1 -1
- data/vendor/tmp/llama.cpp/ggml-common.h +25 -2
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +295 -9324
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +4 -0
- data/vendor/tmp/llama.cpp/ggml-metal.m +133 -113
- data/vendor/tmp/llama.cpp/ggml-metal.metal +344 -276
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +5 -0
- data/vendor/tmp/llama.cpp/ggml-quants.c +785 -190
- data/vendor/tmp/llama.cpp/ggml-quants.h +83 -80
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +963 -588
- data/vendor/tmp/llama.cpp/ggml-sycl.h +13 -3
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +37199 -14939
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +329 -308
- data/vendor/tmp/llama.cpp/ggml-vulkan.h +0 -11
- data/vendor/tmp/llama.cpp/ggml.c +141 -101
- data/vendor/tmp/llama.cpp/ggml.h +18 -12
- data/vendor/tmp/llama.cpp/llama.cpp +2519 -625
- data/vendor/tmp/llama.cpp/llama.h +145 -29
- data/vendor/tmp/llama.cpp/unicode-data.cpp +1651 -0
- data/vendor/tmp/llama.cpp/unicode-data.h +16 -0
- data/vendor/tmp/llama.cpp/unicode.cpp +8 -1403
- data/vendor/tmp/llama.cpp/unicode.h +2 -0
- metadata +5 -3
@@ -11,17 +11,6 @@ extern "C" {
|
|
11
11
|
#define GGML_VK_MAX_DEVICES 16
|
12
12
|
|
13
13
|
GGML_API void ggml_vk_instance_init(void);
|
14
|
-
GGML_API void ggml_vk_init_cpu_assist(void);
|
15
|
-
|
16
|
-
GGML_API void ggml_vk_preallocate_buffers_graph_cpu_assist(struct ggml_tensor * node);
|
17
|
-
GGML_API void ggml_vk_preallocate_buffers_cpu_assist(void);
|
18
|
-
GGML_API void ggml_vk_build_graph_cpu_assist(struct ggml_tensor * node, bool last_node);
|
19
|
-
GGML_API bool ggml_vk_compute_forward_cpu_assist(struct ggml_compute_params * params, struct ggml_tensor * tensor);
|
20
|
-
#ifdef GGML_VULKAN_CHECK_RESULTS
|
21
|
-
void ggml_vk_check_results_1_cpu_assist(struct ggml_compute_params * params, struct ggml_tensor * tensor);
|
22
|
-
#endif
|
23
|
-
GGML_API void ggml_vk_graph_cleanup_cpu_assist(void);
|
24
|
-
GGML_API void ggml_vk_free_cpu_assist(void);
|
25
14
|
|
26
15
|
// backend API
|
27
16
|
GGML_API GGML_CALL ggml_backend_t ggml_backend_vk_init(size_t dev_num);
|
data/vendor/tmp/llama.cpp/ggml.c
CHANGED
@@ -3,6 +3,7 @@
|
|
3
3
|
|
4
4
|
#include "ggml-impl.h"
|
5
5
|
#include "ggml-quants.h"
|
6
|
+
#include "ggml.h"
|
6
7
|
|
7
8
|
#if defined(_MSC_VER) || defined(__MINGW32__)
|
8
9
|
#include <malloc.h> // using malloc.h with MSC/MINGW
|
@@ -43,6 +44,10 @@
|
|
43
44
|
|
44
45
|
#if defined(_WIN32)
|
45
46
|
|
47
|
+
#define WIN32_LEAN_AND_MEAN
|
48
|
+
#ifndef NOMINMAX
|
49
|
+
#define NOMINMAX
|
50
|
+
#endif
|
46
51
|
#include <windows.h>
|
47
52
|
|
48
53
|
typedef volatile LONG atomic_int;
|
@@ -273,8 +278,6 @@ inline static void * ggml_calloc(size_t num, size_t size) {
|
|
273
278
|
#include <Accelerate/Accelerate.h>
|
274
279
|
#if defined(GGML_USE_CLBLAST) // allow usage of CLBlast alongside Accelerate functions
|
275
280
|
#include "ggml-opencl.h"
|
276
|
-
#elif defined(GGML_USE_VULKAN)
|
277
|
-
#include "ggml-vulkan.h"
|
278
281
|
#endif
|
279
282
|
#elif defined(GGML_USE_OPENBLAS)
|
280
283
|
#if defined(GGML_BLAS_USE_MKL)
|
@@ -284,10 +287,6 @@ inline static void * ggml_calloc(size_t num, size_t size) {
|
|
284
287
|
#endif
|
285
288
|
#elif defined(GGML_USE_CLBLAST)
|
286
289
|
#include "ggml-opencl.h"
|
287
|
-
#elif defined(GGML_USE_VULKAN)
|
288
|
-
#include "ggml-vulkan.h"
|
289
|
-
#elif defined(GGML_USE_SYCL)
|
290
|
-
#include "ggml-sycl.h"
|
291
290
|
#endif
|
292
291
|
|
293
292
|
// floating point type used to accumulate sums
|
@@ -339,14 +338,14 @@ ggml_fp16_t ggml_fp32_to_fp16(float x) {
|
|
339
338
|
return GGML_FP32_TO_FP16(x);
|
340
339
|
}
|
341
340
|
|
342
|
-
void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y,
|
343
|
-
for (
|
341
|
+
void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, int64_t n) {
|
342
|
+
for (int64_t i = 0; i < n; i++) {
|
344
343
|
y[i] = GGML_FP16_TO_FP32(x[i]);
|
345
344
|
}
|
346
345
|
}
|
347
346
|
|
348
|
-
void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y,
|
349
|
-
|
347
|
+
void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int64_t n) {
|
348
|
+
int64_t i = 0;
|
350
349
|
#if defined(__F16C__)
|
351
350
|
for (; i + 7 < n; i += 8) {
|
352
351
|
__m256 x_vec = _mm256_loadu_ps(x + i);
|
@@ -430,6 +429,57 @@ int64_t ggml_cycles_per_ms(void) {
|
|
430
429
|
#define ggml_perf_cycles_per_ms() 0
|
431
430
|
#endif
|
432
431
|
|
432
|
+
//
|
433
|
+
// cross-platform UTF-8 file paths
|
434
|
+
//
|
435
|
+
|
436
|
+
#ifdef _WIN32
|
437
|
+
static wchar_t * ggml_mbstowcs(const char * mbs) {
|
438
|
+
int wlen = MultiByteToWideChar(CP_UTF8, 0, mbs, -1, NULL, 0);
|
439
|
+
if (!wlen) {
|
440
|
+
errno = EINVAL;
|
441
|
+
return NULL;
|
442
|
+
}
|
443
|
+
|
444
|
+
wchar_t * wbuf = GGML_MALLOC(wlen * sizeof(wchar_t));
|
445
|
+
wlen = MultiByteToWideChar(CP_UTF8, 0, mbs, -1, wbuf, wlen);
|
446
|
+
if (!wlen) {
|
447
|
+
GGML_FREE(wbuf);
|
448
|
+
errno = EINVAL;
|
449
|
+
return NULL;
|
450
|
+
}
|
451
|
+
|
452
|
+
return wbuf;
|
453
|
+
}
|
454
|
+
#endif
|
455
|
+
|
456
|
+
FILE * ggml_fopen(const char * fname, const char * mode) {
|
457
|
+
#ifdef _WIN32
|
458
|
+
FILE * file = NULL;
|
459
|
+
|
460
|
+
// convert fname (UTF-8)
|
461
|
+
wchar_t * wfname = ggml_mbstowcs(fname);
|
462
|
+
if (wfname) {
|
463
|
+
// convert mode (ANSI)
|
464
|
+
wchar_t * wmode = GGML_MALLOC((strlen(mode) + 1) * sizeof(wchar_t));
|
465
|
+
wchar_t * wmode_p = wmode;
|
466
|
+
do {
|
467
|
+
*wmode_p++ = (wchar_t)*mode;
|
468
|
+
} while (*mode++);
|
469
|
+
|
470
|
+
// open file
|
471
|
+
file = _wfopen(wfname, wmode);
|
472
|
+
|
473
|
+
GGML_FREE(wfname);
|
474
|
+
GGML_FREE(wmode);
|
475
|
+
}
|
476
|
+
|
477
|
+
return file;
|
478
|
+
#else
|
479
|
+
return fopen(fname, mode);
|
480
|
+
#endif
|
481
|
+
}
|
482
|
+
|
433
483
|
//
|
434
484
|
// cache line
|
435
485
|
//
|
@@ -740,6 +790,18 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
|
740
790
|
.vec_dot_type = GGML_TYPE_Q8_K,
|
741
791
|
.nrows = 1,
|
742
792
|
},
|
793
|
+
[GGML_TYPE_IQ1_M] = {
|
794
|
+
.type_name = "iq1_m",
|
795
|
+
.blck_size = QK_K,
|
796
|
+
.type_size = sizeof(block_iq1_m),
|
797
|
+
.is_quantized = true,
|
798
|
+
.to_float = (ggml_to_float_t) dequantize_row_iq1_m,
|
799
|
+
.from_float = NULL,
|
800
|
+
.from_float_reference = NULL,
|
801
|
+
.vec_dot = ggml_vec_dot_iq1_m_q8_K,
|
802
|
+
.vec_dot_type = GGML_TYPE_Q8_K,
|
803
|
+
.nrows = 1,
|
804
|
+
},
|
743
805
|
[GGML_TYPE_IQ4_NL] = {
|
744
806
|
.type_name = "iq4_nl",
|
745
807
|
.blck_size = QK4_NL,
|
@@ -2485,6 +2547,7 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) {
|
|
2485
2547
|
case GGML_FTYPE_MOSTLY_IQ2_XS: wtype = GGML_TYPE_IQ2_XS; break;
|
2486
2548
|
case GGML_FTYPE_MOSTLY_IQ3_XXS: wtype = GGML_TYPE_IQ3_XXS; break;
|
2487
2549
|
case GGML_FTYPE_MOSTLY_IQ1_S: wtype = GGML_TYPE_IQ1_S; break;
|
2550
|
+
case GGML_FTYPE_MOSTLY_IQ1_M: wtype = GGML_TYPE_IQ1_M; break;
|
2488
2551
|
case GGML_FTYPE_MOSTLY_IQ4_NL: wtype = GGML_TYPE_IQ4_NL; break;
|
2489
2552
|
case GGML_FTYPE_MOSTLY_IQ4_XS: wtype = GGML_TYPE_IQ4_XS; break;
|
2490
2553
|
case GGML_FTYPE_MOSTLY_IQ3_S: wtype = GGML_TYPE_IQ3_S; break;
|
@@ -2540,6 +2603,16 @@ static inline bool ggml_is_padded_1d(const struct ggml_tensor * tensor) {
|
|
2540
2603
|
tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
|
2541
2604
|
}
|
2542
2605
|
|
2606
|
+
GGML_CALL bool ggml_is_empty(const struct ggml_tensor * tensor) {
|
2607
|
+
for (int i = 0; i < GGML_MAX_DIMS; ++i) {
|
2608
|
+
if (tensor->ne[i] == 0) {
|
2609
|
+
// empty if any dimension has no elements
|
2610
|
+
return true;
|
2611
|
+
}
|
2612
|
+
}
|
2613
|
+
return false;
|
2614
|
+
}
|
2615
|
+
|
2543
2616
|
bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
|
2544
2617
|
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
|
2545
2618
|
|
@@ -2554,7 +2627,7 @@ bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor
|
|
2554
2627
|
static inline bool ggml_can_repeat(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
|
2555
2628
|
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
|
2556
2629
|
|
2557
|
-
return
|
2630
|
+
return ggml_is_empty(t0) ? ggml_is_empty(t1) :
|
2558
2631
|
(t1->ne[0]%t0->ne[0] == 0) &&
|
2559
2632
|
(t1->ne[1]%t0->ne[1] == 0) &&
|
2560
2633
|
(t1->ne[2]%t0->ne[2] == 0) &&
|
@@ -2640,10 +2713,6 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
|
|
2640
2713
|
|
2641
2714
|
#if defined(GGML_USE_CLBLAST)
|
2642
2715
|
ggml_cl_init();
|
2643
|
-
#elif defined(GGML_USE_VULKAN)
|
2644
|
-
ggml_vk_init_cpu_assist();
|
2645
|
-
#elif defined(GGML_USE_SYCL)
|
2646
|
-
ggml_init_sycl();
|
2647
2716
|
#endif
|
2648
2717
|
|
2649
2718
|
ggml_setup_op_has_task_pass();
|
@@ -2863,7 +2932,7 @@ static struct ggml_tensor * ggml_new_tensor_impl(
|
|
2863
2932
|
data_size *= ne[i];
|
2864
2933
|
}
|
2865
2934
|
|
2866
|
-
GGML_ASSERT(view_src == NULL || data_size + view_offs <= ggml_nbytes(view_src));
|
2935
|
+
GGML_ASSERT(view_src == NULL || data_size == 0 || data_size + view_offs <= ggml_nbytes(view_src));
|
2867
2936
|
|
2868
2937
|
void * data = view_src != NULL ? view_src->data : NULL;
|
2869
2938
|
if (data != NULL) {
|
@@ -4504,45 +4573,38 @@ void ggml_mul_mat_set_prec(
|
|
4504
4573
|
|
4505
4574
|
// ggml_mul_mat_id
|
4506
4575
|
|
4576
|
+
// NOTE: id will be removed in the future and instead all the experts listed in ids will be computed
|
4577
|
+
// this will allow computing all the used experts in a single matrix multiplication
|
4507
4578
|
struct ggml_tensor * ggml_mul_mat_id(
|
4508
4579
|
struct ggml_context * ctx,
|
4509
|
-
struct ggml_tensor *
|
4510
|
-
int n_as,
|
4580
|
+
struct ggml_tensor * as,
|
4511
4581
|
struct ggml_tensor * ids,
|
4512
4582
|
int id,
|
4513
4583
|
struct ggml_tensor * b) {
|
4514
4584
|
|
4515
4585
|
GGML_ASSERT(ids->type == GGML_TYPE_I32);
|
4516
|
-
GGML_ASSERT(ids->ne[2] == 1 && ids->ne[3] == 1);
|
4517
|
-
GGML_ASSERT(ids->ne[1] == b->ne[1]);
|
4586
|
+
GGML_ASSERT(ids->ne[2] == 1 && ids->ne[3] == 1); // ids is 2d
|
4587
|
+
GGML_ASSERT(ids->ne[1] == b->ne[1]); // must have an expert per b row
|
4518
4588
|
GGML_ASSERT(ids->ne[2] == b->ne[2] && ids->ne[3] == b->ne[3]);
|
4519
|
-
GGML_ASSERT(
|
4520
|
-
GGML_ASSERT(
|
4589
|
+
GGML_ASSERT(id >= 0 && id < ids->ne[0]); // valid id
|
4590
|
+
GGML_ASSERT(as->ne[0] == b->ne[0]); // can_mul_mat
|
4521
4591
|
|
4522
4592
|
bool is_node = false;
|
4523
4593
|
|
4524
|
-
if (as
|
4594
|
+
if (as->grad || b->grad) {
|
4525
4595
|
is_node = true;
|
4526
4596
|
}
|
4527
4597
|
|
4528
|
-
const int64_t ne[4] = { as
|
4598
|
+
const int64_t ne[4] = { as->ne[1], b->ne[1], b->ne[2], b->ne[3] };
|
4529
4599
|
struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
|
4530
4600
|
|
4531
4601
|
ggml_set_op_params_i32(result, 0, id);
|
4532
|
-
ggml_set_op_params_i32(result, 1, n_as);
|
4533
4602
|
|
4534
4603
|
result->op = GGML_OP_MUL_MAT_ID;
|
4535
4604
|
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
4536
|
-
result->src[0] =
|
4605
|
+
result->src[0] = as;
|
4537
4606
|
result->src[1] = b;
|
4538
|
-
|
4539
|
-
for (int i = 0; i < n_as; i++) {
|
4540
|
-
struct ggml_tensor * a = as[i];
|
4541
|
-
GGML_ASSERT(ggml_are_same_shape(as[0], a));
|
4542
|
-
GGML_ASSERT(ggml_can_mul_mat(a, b));
|
4543
|
-
GGML_ASSERT(!ggml_is_transposed(a));
|
4544
|
-
result->src[i + 2] = a;
|
4545
|
-
}
|
4607
|
+
result->src[2] = ids;
|
4546
4608
|
|
4547
4609
|
return result;
|
4548
4610
|
}
|
@@ -8083,6 +8145,7 @@ static void ggml_compute_forward_add(
|
|
8083
8145
|
case GGML_TYPE_IQ2_XS:
|
8084
8146
|
case GGML_TYPE_IQ3_XXS:
|
8085
8147
|
case GGML_TYPE_IQ1_S:
|
8148
|
+
case GGML_TYPE_IQ1_M:
|
8086
8149
|
case GGML_TYPE_IQ4_NL:
|
8087
8150
|
case GGML_TYPE_IQ4_XS:
|
8088
8151
|
case GGML_TYPE_IQ3_S:
|
@@ -8365,6 +8428,7 @@ static void ggml_compute_forward_add1(
|
|
8365
8428
|
case GGML_TYPE_IQ2_XS:
|
8366
8429
|
case GGML_TYPE_IQ3_XXS:
|
8367
8430
|
case GGML_TYPE_IQ1_S:
|
8431
|
+
case GGML_TYPE_IQ1_M:
|
8368
8432
|
case GGML_TYPE_IQ4_NL:
|
8369
8433
|
case GGML_TYPE_IQ4_XS:
|
8370
8434
|
case GGML_TYPE_IQ3_S:
|
@@ -8492,6 +8556,7 @@ static void ggml_compute_forward_acc(
|
|
8492
8556
|
case GGML_TYPE_IQ2_XS:
|
8493
8557
|
case GGML_TYPE_IQ3_XXS:
|
8494
8558
|
case GGML_TYPE_IQ1_S:
|
8559
|
+
case GGML_TYPE_IQ1_M:
|
8495
8560
|
case GGML_TYPE_IQ4_NL:
|
8496
8561
|
case GGML_TYPE_IQ4_XS:
|
8497
8562
|
case GGML_TYPE_IQ3_S:
|
@@ -10876,10 +10941,9 @@ static void ggml_compute_forward_mul_mat_id(
|
|
10876
10941
|
const struct ggml_compute_params * params,
|
10877
10942
|
struct ggml_tensor * dst) {
|
10878
10943
|
|
10879
|
-
const struct ggml_tensor *
|
10944
|
+
const struct ggml_tensor * src0 = dst->src[0];
|
10880
10945
|
const struct ggml_tensor * src1 = dst->src[1];
|
10881
|
-
|
10882
|
-
const struct ggml_tensor * src0 = dst->src[2]; // only for GGML_TENSOR_BINARY_OP_LOCALS
|
10946
|
+
const struct ggml_tensor * ids = dst->src[2];
|
10883
10947
|
|
10884
10948
|
GGML_TENSOR_BINARY_OP_LOCALS
|
10885
10949
|
|
@@ -10909,13 +10973,13 @@ static void ggml_compute_forward_mul_mat_id(
|
|
10909
10973
|
GGML_ASSERT(nb1 <= nb2);
|
10910
10974
|
GGML_ASSERT(nb2 <= nb3);
|
10911
10975
|
|
10912
|
-
// broadcast
|
10913
|
-
|
10914
|
-
|
10976
|
+
// broadcast is not supported with mmid
|
10977
|
+
assert(ne12 == 1);
|
10978
|
+
assert(ne13 == 1);
|
10915
10979
|
|
10916
10980
|
// row groups
|
10917
10981
|
const int id = ggml_get_op_params_i32(dst, 0);
|
10918
|
-
const int n_as =
|
10982
|
+
const int n_as = src0->ne[2];
|
10919
10983
|
|
10920
10984
|
char * wdata_src1_end = (src1->type == vec_dot_type) ?
|
10921
10985
|
(char *) params->wdata :
|
@@ -10975,7 +11039,7 @@ static void ggml_compute_forward_mul_mat_id(
|
|
10975
11039
|
continue;
|
10976
11040
|
}
|
10977
11041
|
|
10978
|
-
|
11042
|
+
size_t src0_offset = cur_a*src0->nb[2];
|
10979
11043
|
|
10980
11044
|
const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
|
10981
11045
|
const size_t row_size = ggml_row_size(vec_dot_type, ne10);
|
@@ -11010,9 +11074,6 @@ static void ggml_compute_forward_mul_mat_id(
|
|
11010
11074
|
continue;
|
11011
11075
|
}
|
11012
11076
|
|
11013
|
-
assert(ne12 % ne02 == 0);
|
11014
|
-
assert(ne13 % ne03 == 0);
|
11015
|
-
|
11016
11077
|
// block-tiling attempt
|
11017
11078
|
const int64_t blck_0 = 16;
|
11018
11079
|
const int64_t blck_1 = 16;
|
@@ -11029,14 +11090,14 @@ static void ggml_compute_forward_mul_mat_id(
|
|
11029
11090
|
const int64_t i11 = MMID_MATRIX_ROW(cur_a, _i11);
|
11030
11091
|
|
11031
11092
|
// broadcast src0 into src1
|
11032
|
-
const int64_t i03 = i13/r3;
|
11033
|
-
const int64_t i02 = i12/r2;
|
11093
|
+
//const int64_t i03 = i13/r3;
|
11094
|
+
//const int64_t i02 = i12/r2;
|
11034
11095
|
|
11035
11096
|
const int64_t i1 = i11;
|
11036
11097
|
const int64_t i2 = i12;
|
11037
11098
|
const int64_t i3 = i13;
|
11038
11099
|
|
11039
|
-
const char * src0_row = (const char *)
|
11100
|
+
const char * src0_row = (const char *) src0->data + src0_offset;
|
11040
11101
|
|
11041
11102
|
// desc: when src1 is not a contiguous memory block we have to calculate the offset using the strides
|
11042
11103
|
// if it is, then we have either copied the data to params->wdata and made it contiguous or we are using
|
@@ -11395,6 +11456,7 @@ static void ggml_compute_forward_out_prod(
|
|
11395
11456
|
case GGML_TYPE_IQ2_XS:
|
11396
11457
|
case GGML_TYPE_IQ3_XXS:
|
11397
11458
|
case GGML_TYPE_IQ1_S:
|
11459
|
+
case GGML_TYPE_IQ1_M:
|
11398
11460
|
case GGML_TYPE_IQ4_NL:
|
11399
11461
|
case GGML_TYPE_IQ4_XS:
|
11400
11462
|
case GGML_TYPE_IQ3_S:
|
@@ -11586,6 +11648,7 @@ static void ggml_compute_forward_set(
|
|
11586
11648
|
case GGML_TYPE_IQ2_XS:
|
11587
11649
|
case GGML_TYPE_IQ3_XXS:
|
11588
11650
|
case GGML_TYPE_IQ1_S:
|
11651
|
+
case GGML_TYPE_IQ1_M:
|
11589
11652
|
case GGML_TYPE_IQ4_NL:
|
11590
11653
|
case GGML_TYPE_IQ4_XS:
|
11591
11654
|
case GGML_TYPE_IQ3_S:
|
@@ -11809,6 +11872,7 @@ static void ggml_compute_forward_get_rows(
|
|
11809
11872
|
case GGML_TYPE_IQ2_XS:
|
11810
11873
|
case GGML_TYPE_IQ3_XXS:
|
11811
11874
|
case GGML_TYPE_IQ1_S:
|
11875
|
+
case GGML_TYPE_IQ1_M:
|
11812
11876
|
case GGML_TYPE_IQ4_NL:
|
11813
11877
|
case GGML_TYPE_IQ4_XS:
|
11814
11878
|
case GGML_TYPE_IQ3_S:
|
@@ -12512,6 +12576,7 @@ static void ggml_compute_forward_alibi(
|
|
12512
12576
|
case GGML_TYPE_IQ2_XS:
|
12513
12577
|
case GGML_TYPE_IQ3_XXS:
|
12514
12578
|
case GGML_TYPE_IQ1_S:
|
12579
|
+
case GGML_TYPE_IQ1_M:
|
12515
12580
|
case GGML_TYPE_IQ4_NL:
|
12516
12581
|
case GGML_TYPE_IQ4_XS:
|
12517
12582
|
case GGML_TYPE_IQ3_S:
|
@@ -12600,6 +12665,7 @@ static void ggml_compute_forward_clamp(
|
|
12600
12665
|
case GGML_TYPE_IQ2_XS:
|
12601
12666
|
case GGML_TYPE_IQ3_XXS:
|
12602
12667
|
case GGML_TYPE_IQ1_S:
|
12668
|
+
case GGML_TYPE_IQ1_M:
|
12603
12669
|
case GGML_TYPE_IQ4_NL:
|
12604
12670
|
case GGML_TYPE_IQ4_XS:
|
12605
12671
|
case GGML_TYPE_IQ3_S:
|
@@ -16041,30 +16107,10 @@ static void ggml_compute_forward_cross_entropy_loss_back(
|
|
16041
16107
|
static void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
|
16042
16108
|
GGML_ASSERT(params);
|
16043
16109
|
|
16044
|
-
if (tensor->op == GGML_OP_NONE) {
|
16045
|
-
return;
|
16046
|
-
}
|
16047
|
-
|
16048
|
-
#if defined(GGML_USE_VULKAN)
|
16049
|
-
const bool skip_cpu = ggml_vk_compute_forward_cpu_assist(params, tensor);
|
16050
|
-
#ifdef GGML_VULKAN_CHECK_RESULTS
|
16051
|
-
if (skip_cpu) {
|
16052
|
-
ggml_vk_check_results_1_cpu_assist(params, tensor);
|
16053
|
-
}
|
16054
|
-
#endif
|
16055
|
-
if (skip_cpu) {
|
16110
|
+
if (tensor->op == GGML_OP_NONE || ggml_is_empty(tensor)) {
|
16056
16111
|
return;
|
16057
16112
|
}
|
16058
|
-
GGML_ASSERT(tensor->src[0] == NULL || tensor->src[0]->backend == GGML_BACKEND_TYPE_CPU);
|
16059
|
-
GGML_ASSERT(tensor->src[1] == NULL || tensor->src[1]->backend == GGML_BACKEND_TYPE_CPU);
|
16060
|
-
#endif // GGML_USE_VULKAN
|
16061
16113
|
|
16062
|
-
#ifdef GGML_USE_SYCL
|
16063
|
-
bool skip_cpu = ggml_sycl_compute_forward(params, tensor);
|
16064
|
-
if (skip_cpu) {
|
16065
|
-
return;
|
16066
|
-
}
|
16067
|
-
#endif // GGML_USE_SYCL
|
16068
16114
|
switch (tensor->op) {
|
16069
16115
|
case GGML_OP_DUP:
|
16070
16116
|
{
|
@@ -17916,6 +17962,12 @@ static void ggml_graph_compute_perf_stats_node(struct ggml_tensor * node, const
|
|
17916
17962
|
static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads, int n_cur_threads) {
|
17917
17963
|
int n_tasks = 0;
|
17918
17964
|
|
17965
|
+
if (ggml_is_empty(node)) {
|
17966
|
+
// no need to multi-thread a no-op
|
17967
|
+
n_tasks = 1;
|
17968
|
+
return n_tasks;
|
17969
|
+
}
|
17970
|
+
|
17919
17971
|
switch (node->op) {
|
17920
17972
|
case GGML_OP_CPY:
|
17921
17973
|
case GGML_OP_DUP:
|
@@ -18401,13 +18453,13 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
|
|
18401
18453
|
case GGML_OP_MUL_MAT_ID:
|
18402
18454
|
{
|
18403
18455
|
cur = 0;
|
18404
|
-
const struct ggml_tensor * src0 = node->src[
|
18456
|
+
const struct ggml_tensor * src0 = node->src[0];
|
18405
18457
|
const struct ggml_tensor * src1 = node->src[1];
|
18406
18458
|
const enum ggml_type vec_dot_type = type_traits[src0->type].vec_dot_type;
|
18407
18459
|
if (src1->type != vec_dot_type) {
|
18408
18460
|
cur += ggml_row_size(vec_dot_type, ggml_nelements(src1));
|
18409
18461
|
}
|
18410
|
-
const int n_as =
|
18462
|
+
const int n_as = src0->ne[2];
|
18411
18463
|
cur += GGML_PAD(cur, sizeof(int64_t)); // align
|
18412
18464
|
cur += n_as * sizeof(int64_t); // matrix_row_counts
|
18413
18465
|
cur += n_as * src1->ne[1] * sizeof(int64_t); // matrix_rows
|
@@ -18534,17 +18586,6 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
|
|
18534
18586
|
}
|
18535
18587
|
}
|
18536
18588
|
|
18537
|
-
#ifdef GGML_USE_VULKAN
|
18538
|
-
for (int i = 0; i < cgraph->n_nodes; i++) {
|
18539
|
-
ggml_vk_preallocate_buffers_graph_cpu_assist(cgraph->nodes[i]);
|
18540
|
-
}
|
18541
|
-
ggml_vk_preallocate_buffers_cpu_assist();
|
18542
|
-
|
18543
|
-
for (int i = 0; i < cgraph->n_nodes; i++) {
|
18544
|
-
ggml_vk_build_graph_cpu_assist(cgraph->nodes[i], i == cgraph->n_nodes - 1);
|
18545
|
-
}
|
18546
|
-
#endif
|
18547
|
-
|
18548
18589
|
const int n_threads = cplan->n_threads;
|
18549
18590
|
|
18550
18591
|
struct ggml_compute_state_shared state_shared = {
|
@@ -18601,10 +18642,6 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
|
|
18601
18642
|
}
|
18602
18643
|
}
|
18603
18644
|
|
18604
|
-
#ifdef GGML_USE_VULKAN
|
18605
|
-
ggml_vk_graph_cleanup_cpu_assist();
|
18606
|
-
#endif
|
18607
|
-
|
18608
18645
|
// performance stats (graph)
|
18609
18646
|
{
|
18610
18647
|
int64_t perf_cycles_cur = ggml_perf_cycles() - perf_start_cycles;
|
@@ -18739,7 +18776,7 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) {
|
|
18739
18776
|
|
18740
18777
|
// write binary data
|
18741
18778
|
{
|
18742
|
-
FILE * fout =
|
18779
|
+
FILE * fout = ggml_fopen(fname, "wb");
|
18743
18780
|
|
18744
18781
|
if (!fout) {
|
18745
18782
|
fprintf(stderr, "%s: failed to open %s\n", __func__, fname);
|
@@ -18877,7 +18914,7 @@ struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context *
|
|
18877
18914
|
|
18878
18915
|
// read file into data
|
18879
18916
|
{
|
18880
|
-
FILE * fin =
|
18917
|
+
FILE * fin = ggml_fopen(fname, "rb");
|
18881
18918
|
if (!fin) {
|
18882
18919
|
fprintf(stderr, "%s: failed to open %s\n", __func__, fname);
|
18883
18920
|
return result;
|
@@ -19213,7 +19250,7 @@ static void ggml_graph_dump_dot_leaf_edge(FILE * fp, struct ggml_tensor * node,
|
|
19213
19250
|
void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph * gf, const char * filename) {
|
19214
19251
|
char color[16];
|
19215
19252
|
|
19216
|
-
FILE * fp =
|
19253
|
+
FILE * fp = ggml_fopen(filename, "w");
|
19217
19254
|
GGML_ASSERT(fp);
|
19218
19255
|
|
19219
19256
|
fprintf(fp, "digraph G {\n");
|
@@ -20260,7 +20297,8 @@ void ggml_quantize_init(enum ggml_type type) {
|
|
20260
20297
|
case GGML_TYPE_IQ2_XXS:
|
20261
20298
|
case GGML_TYPE_IQ2_XS:
|
20262
20299
|
case GGML_TYPE_IQ2_S:
|
20263
|
-
case GGML_TYPE_IQ1_S:
|
20300
|
+
case GGML_TYPE_IQ1_S:
|
20301
|
+
case GGML_TYPE_IQ1_M: iq2xs_init_impl(type); break;
|
20264
20302
|
case GGML_TYPE_IQ3_XXS: iq3xs_init_impl(256); break;
|
20265
20303
|
case GGML_TYPE_IQ3_S: iq3xs_init_impl(512); break;
|
20266
20304
|
default: // nothing
|
@@ -20285,18 +20323,19 @@ bool ggml_quantize_requires_imatrix(enum ggml_type type) {
|
|
20285
20323
|
return
|
20286
20324
|
type == GGML_TYPE_IQ2_XXS ||
|
20287
20325
|
type == GGML_TYPE_IQ2_XS ||
|
20288
|
-
type == GGML_TYPE_IQ1_S
|
20326
|
+
type == GGML_TYPE_IQ1_S;// ||
|
20327
|
+
//type == GGML_TYPE_IQ1_M;
|
20289
20328
|
}
|
20290
20329
|
|
20291
20330
|
size_t ggml_quantize_chunk(
|
20292
20331
|
enum ggml_type type,
|
20293
20332
|
const float * src,
|
20294
20333
|
void * dst,
|
20295
|
-
|
20296
|
-
|
20297
|
-
|
20334
|
+
int64_t start,
|
20335
|
+
int64_t nrows,
|
20336
|
+
int64_t n_per_row,
|
20298
20337
|
const float * imatrix) {
|
20299
|
-
const
|
20338
|
+
const int64_t n = (int64_t) nrows * n_per_row;
|
20300
20339
|
|
20301
20340
|
if (ggml_quantize_requires_imatrix(type)) {
|
20302
20341
|
GGML_ASSERT(imatrix != NULL);
|
@@ -20329,6 +20368,7 @@ size_t ggml_quantize_chunk(
|
|
20329
20368
|
case GGML_TYPE_IQ3_S: result = quantize_iq3_s (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
|
20330
20369
|
case GGML_TYPE_IQ2_S: result = quantize_iq2_s (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
|
20331
20370
|
case GGML_TYPE_IQ1_S: result = quantize_iq1_s (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
|
20371
|
+
case GGML_TYPE_IQ1_M: result = quantize_iq1_m (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
|
20332
20372
|
case GGML_TYPE_IQ4_NL: result = quantize_iq4_nl (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
|
20333
20373
|
#if QK_K == 64
|
20334
20374
|
case GGML_TYPE_IQ4_XS: result = quantize_iq4_nl (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
|
@@ -20531,7 +20571,7 @@ struct gguf_context * gguf_init_empty(void) {
|
|
20531
20571
|
}
|
20532
20572
|
|
20533
20573
|
struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params) {
|
20534
|
-
FILE * file =
|
20574
|
+
FILE * file = ggml_fopen(fname, "rb");
|
20535
20575
|
if (!file) {
|
20536
20576
|
return NULL;
|
20537
20577
|
}
|
@@ -21486,7 +21526,7 @@ static void gguf_write_to_buf(const struct gguf_context * ctx, struct gguf_buf *
|
|
21486
21526
|
}
|
21487
21527
|
|
21488
21528
|
void gguf_write_to_file(const struct gguf_context * ctx, const char * fname, bool only_meta) {
|
21489
|
-
FILE * file =
|
21529
|
+
FILE * file = ggml_fopen(fname, "wb");
|
21490
21530
|
if (!file) {
|
21491
21531
|
GGML_ASSERT(false && "failed to open file for writing");
|
21492
21532
|
}
|
@@ -21628,15 +21668,15 @@ int ggml_cpu_has_wasm_simd(void) {
|
|
21628
21668
|
}
|
21629
21669
|
|
21630
21670
|
int ggml_cpu_has_blas(void) {
|
21631
|
-
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(
|
21671
|
+
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUDA) || defined(GGML_USE_VULKAN) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_SYCL)
|
21632
21672
|
return 1;
|
21633
21673
|
#else
|
21634
21674
|
return 0;
|
21635
21675
|
#endif
|
21636
21676
|
}
|
21637
21677
|
|
21638
|
-
int
|
21639
|
-
#if defined(
|
21678
|
+
int ggml_cpu_has_cuda(void) {
|
21679
|
+
#if defined(GGML_USE_CUDA)
|
21640
21680
|
return 1;
|
21641
21681
|
#else
|
21642
21682
|
return 0;
|
@@ -21676,7 +21716,7 @@ int ggml_cpu_has_sycl(void) {
|
|
21676
21716
|
}
|
21677
21717
|
|
21678
21718
|
int ggml_cpu_has_gpublas(void) {
|
21679
|
-
return
|
21719
|
+
return ggml_cpu_has_cuda() || ggml_cpu_has_clblast() || ggml_cpu_has_vulkan() || ggml_cpu_has_kompute() ||
|
21680
21720
|
ggml_cpu_has_sycl();
|
21681
21721
|
}
|
21682
21722
|
|
data/vendor/tmp/llama.cpp/ggml.h
CHANGED
@@ -214,9 +214,10 @@
|
|
214
214
|
# define GGML_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
|
215
215
|
#endif
|
216
216
|
|
217
|
-
#include <stdint.h>
|
218
|
-
#include <stddef.h>
|
219
217
|
#include <stdbool.h>
|
218
|
+
#include <stddef.h>
|
219
|
+
#include <stdint.h>
|
220
|
+
#include <stdio.h>
|
220
221
|
|
221
222
|
#define GGML_FILE_MAGIC 0x67676d6c // "ggml"
|
222
223
|
#define GGML_FILE_VERSION 1
|
@@ -331,8 +332,8 @@ extern "C" {
|
|
331
332
|
GGML_API float ggml_fp16_to_fp32(ggml_fp16_t x);
|
332
333
|
GGML_API ggml_fp16_t ggml_fp32_to_fp16(float x);
|
333
334
|
|
334
|
-
GGML_API void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y,
|
335
|
-
GGML_API void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y,
|
335
|
+
GGML_API void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, int64_t n);
|
336
|
+
GGML_API void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int64_t n);
|
336
337
|
|
337
338
|
struct ggml_object;
|
338
339
|
struct ggml_context;
|
@@ -368,6 +369,7 @@ extern "C" {
|
|
368
369
|
GGML_TYPE_I32 = 26,
|
369
370
|
GGML_TYPE_I64 = 27,
|
370
371
|
GGML_TYPE_F64 = 28,
|
372
|
+
GGML_TYPE_IQ1_M = 29,
|
371
373
|
GGML_TYPE_COUNT,
|
372
374
|
};
|
373
375
|
|
@@ -407,6 +409,7 @@ extern "C" {
|
|
407
409
|
GGML_FTYPE_MOSTLY_IQ3_S = 20, // except 1d tensors
|
408
410
|
GGML_FTYPE_MOSTLY_IQ2_S = 21, // except 1d tensors
|
409
411
|
GGML_FTYPE_MOSTLY_IQ4_XS = 22, // except 1d tensors
|
412
|
+
GGML_FTYPE_MOSTLY_IQ1_M = 23, // except 1d tensors
|
410
413
|
};
|
411
414
|
|
412
415
|
// available tensor operations:
|
@@ -708,6 +711,9 @@ extern "C" {
|
|
708
711
|
|
709
712
|
GGML_API void ggml_print_backtrace(void);
|
710
713
|
|
714
|
+
// accepts a UTF-8 path, even on Windows
|
715
|
+
GGML_API FILE * ggml_fopen(const char * fname, const char * mode);
|
716
|
+
|
711
717
|
GGML_API void ggml_numa_init(enum ggml_numa_strategy numa); // call once for better performance on NUMA systems
|
712
718
|
GGML_API bool ggml_is_numa(void); // true if init detected that system has >1 NUMA node
|
713
719
|
|
@@ -744,6 +750,7 @@ extern "C" {
|
|
744
750
|
GGML_API GGML_CALL bool ggml_is_transposed(const struct ggml_tensor * tensor);
|
745
751
|
GGML_API GGML_CALL bool ggml_is_contiguous(const struct ggml_tensor * tensor);
|
746
752
|
GGML_API GGML_CALL bool ggml_is_permuted (const struct ggml_tensor * tensor);
|
753
|
+
GGML_API GGML_CALL bool ggml_is_empty (const struct ggml_tensor * tensor);
|
747
754
|
GGML_API bool ggml_is_scalar (const struct ggml_tensor * tensor);
|
748
755
|
GGML_API bool ggml_is_vector (const struct ggml_tensor * tensor);
|
749
756
|
GGML_API bool ggml_is_matrix (const struct ggml_tensor * tensor);
|
@@ -1157,8 +1164,7 @@ extern "C" {
|
|
1157
1164
|
// ggml_mul_mat_id(ctx, as, ids, id, b) ~= ggml_mul_mat(as[ids[id]], b)
|
1158
1165
|
GGML_API struct ggml_tensor * ggml_mul_mat_id(
|
1159
1166
|
struct ggml_context * ctx,
|
1160
|
-
struct ggml_tensor *
|
1161
|
-
int n_as,
|
1167
|
+
struct ggml_tensor * as,
|
1162
1168
|
struct ggml_tensor * ids,
|
1163
1169
|
int id,
|
1164
1170
|
struct ggml_tensor * b);
|
@@ -2204,9 +2210,9 @@ extern "C" {
|
|
2204
2210
|
enum ggml_type type,
|
2205
2211
|
const float * src,
|
2206
2212
|
void * dst,
|
2207
|
-
|
2208
|
-
|
2209
|
-
|
2213
|
+
int64_t start,
|
2214
|
+
int64_t nrows,
|
2215
|
+
int64_t n_per_row,
|
2210
2216
|
const float * imatrix);
|
2211
2217
|
|
2212
2218
|
//
|
@@ -2350,7 +2356,7 @@ extern "C" {
|
|
2350
2356
|
GGML_API int ggml_cpu_has_fp16_va (void);
|
2351
2357
|
GGML_API int ggml_cpu_has_wasm_simd (void);
|
2352
2358
|
GGML_API int ggml_cpu_has_blas (void);
|
2353
|
-
GGML_API int
|
2359
|
+
GGML_API int ggml_cpu_has_cuda (void);
|
2354
2360
|
GGML_API int ggml_cpu_has_clblast (void);
|
2355
2361
|
GGML_API int ggml_cpu_has_vulkan (void);
|
2356
2362
|
GGML_API int ggml_cpu_has_kompute (void);
|
@@ -2371,8 +2377,8 @@ extern "C" {
|
|
2371
2377
|
#else
|
2372
2378
|
#define GGML_RESTRICT restrict
|
2373
2379
|
#endif
|
2374
|
-
typedef void (*ggml_to_float_t) (const void * GGML_RESTRICT x, float * GGML_RESTRICT y,
|
2375
|
-
typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y,
|
2380
|
+
typedef void (*ggml_to_float_t) (const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
2381
|
+
typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
2376
2382
|
typedef void (*ggml_vec_dot_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x, size_t bx,
|
2377
2383
|
const void * GGML_RESTRICT y, size_t by, int nrc);
|
2378
2384
|
|