llama_cpp 0.14.5 → 0.14.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/vendor/tmp/llama.cpp/Makefile +18 -6
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +135 -46
- data/vendor/tmp/llama.cpp/ggml-impl.h +1 -1
- data/vendor/tmp/llama.cpp/ggml-metal.m +130 -83
- data/vendor/tmp/llama.cpp/ggml-metal.metal +505 -1467
- data/vendor/tmp/llama.cpp/ggml-quants.c +1 -1
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +65 -52
- data/vendor/tmp/llama.cpp/ggml.c +153 -87
- data/vendor/tmp/llama.cpp/ggml.h +5 -4
- data/vendor/tmp/llama.cpp/llama.cpp +885 -144
- data/vendor/tmp/llama.cpp/sgemm.cpp +1148 -0
- data/vendor/tmp/llama.cpp/sgemm.h +12 -0
- metadata +4 -2
@@ -132,7 +132,7 @@ static inline __m256 sum_i16_pairs_float(const __m256i x) {
 }

 static inline __m256 mul_sum_us8_pairs_float(const __m256i ax, const __m256i sy) {
-#if defined(__AVXVNNI__) || defined(__AVX512VNNI__)
+#if defined(__AVXVNNI__) || (defined(__AVX512VNNI__) && defined(__AVX512VL__))
     const __m256i zero = _mm256_setzero_si256();
     const __m256i summed_pairs = _mm256_dpbusd_epi32(zero, ax, sy);
     return _mm256_cvtepi32_ps(summed_pairs);
data/vendor/tmp/llama.cpp/ggml-sycl.cpp
CHANGED
@@ -3154,7 +3154,6 @@ typedef float (*vec_dot_q_mul_mat_sycl_t)(
 #define SYCL_SCALE_BLOCK_SIZE 256
 #define SYCL_CLAMP_BLOCK_SIZE 256
 #define SYCL_ROPE_BLOCK_SIZE 256
-#define SYCL_SOFT_MAX_BLOCK_SIZE 1024
 #define SYCL_ALIBI_BLOCK_SIZE 32
 #define SYCL_DIAG_MASK_INF_BLOCK_SIZE 32
 #define SYCL_QUANTIZE_BLOCK_SIZE 256
@@ -13080,11 +13079,13 @@ static void soft_max_f32_sycl(const float * x, const float * mask, const float *
                               const int nrows_y, const float scale, const float max_bias,
                               dpct::queue_ptr stream) {
     int nth = WARP_SIZE;
-    while (nth < ncols_x && nth < SYCL_SOFT_MAX_BLOCK_SIZE) nth *= 2;
+    int max_block_size = g_work_group_size;
+    while (nth < ncols_x && nth < max_block_size) nth *= 2;
+    if (nth>max_block_size) nth = max_block_size;
+
     const sycl::range<3> block_dims(1, 1, nth);
     const sycl::range<3> block_nums(1, 1, nrows_x);
     const size_t n_local_scratch = (GGML_PAD(ncols_x, WARP_SIZE) + WARP_SIZE);
-    static_assert(SYCL_SOFT_MAX_BLOCK_SIZE == 1024, "These values need to be adjusted.");

     const uint32_t n_head_kv = nrows_x/nrows_y;
     const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head_kv));
@@ -13094,6 +13095,12 @@ static void soft_max_f32_sycl(const float * x, const float * mask, const float *

     const size_t local_mem_size = stream->get_device().get_info<sycl::info::device::local_mem_size>();
     if (n_local_scratch*sizeof(float) < local_mem_size) {
+        if (ncols_x > max_block_size) {
+            soft_max_f32_submitter<true, 0, 0>(x, mask, pos, dst, ncols_x, nrows_y, scale,
+                                               max_bias, m0, m1, n_head_log2, block_nums,
+                                               block_dims, n_local_scratch, stream);
+            return;
+        }
         switch (ncols_x) {
             case 32:
                 soft_max_f32_submitter<true, 32, 32>(x, mask, pos, dst, ncols_x, nrows_y, scale,
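Taken together, the two soft_max_f32_sycl hunks above drop the fixed SYCL_SOFT_MAX_BLOCK_SIZE limit: the block size now grows in powers of two up to the device's reported work-group maximum, and a generic soft_max_f32_submitter<true, 0, 0> path handles rows wider than that maximum. A minimal sketch of the sizing rule, with an illustrative helper name that does not exist in llama.cpp:

// Sketch only: mirrors the sizing logic in the hunks above.
static int pick_soft_max_block_size(int ncols_x, int max_block_size, int warp_size) {
    int nth = warp_size;                                     // WARP_SIZE in the real code
    while (nth < ncols_x && nth < max_block_size) nth *= 2;  // grow in powers of two
    if (nth > max_block_size) nth = max_block_size;          // never exceed the device limit
    return nth;
}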
@@ -15989,73 +15996,76 @@ static void ggml_sycl_mul_mat_id_sycl(ggml_tensor * dst) {
 static void ggml_sycl_mul_mat_id(const ggml_tensor *src0,
                                  const ggml_tensor *src1,
                                  ggml_tensor *dst) try {
-
-
-
-
+    GGML_ASSERT(src0->backend != GGML_BACKEND_TYPE_GPU_SPLIT &&
+                "mul_mat_id does not support split buffers");
+    const ggml_tensor *ids = dst->src[2];
+    const dpct::queue_ptr stream = g_syclStreams[g_main_device][0];

-    const
-    const
+    const size_t nb11 = src1->nb[1];
+    const size_t nb1 = dst->nb[1];

-    const
-    const int32_t
-    const int32_t n_as = ((int32_t *) dst->op_params)[1];
+    const int32_t id = ((int32_t *)dst->op_params)[0];
+    const int32_t n_as = src0->ne[2];

     std::vector<char> ids_host(ggml_nbytes(ids));
+    const char *ids_dev = (const char *)ids->data;

-
-
-
-        const char * ids_dev = (const char *)((const ggml_tensor_extra_gpu *)ids->extra)->data_device[g_main_device];
-        SYCL_CHECK(CHECK_TRY_ERROR(
-            stream->memcpy(ids_host.data(), ids_dev, ggml_nbytes(ids)).wait()));
-        // SYCL_CHECK(CHECK_TRY_ERROR(stream->wait()));
-    } else {
-        memcpy(ids_host.data(), ids->data, ggml_nbytes(ids));
-    }
+    SYCL_CHECK(CHECK_TRY_ERROR(
+        stream->memcpy(ids_host.data(), ids_dev, ggml_nbytes(ids))));
+    SYCL_CHECK(CHECK_TRY_ERROR(stream->wait()));

-    const ggml_tensor_extra_gpu *
-
+    const ggml_tensor_extra_gpu *src0_extra =
+        (const ggml_tensor_extra_gpu *)src0->extra;
+    const ggml_tensor_extra_gpu *src1_extra =
+        (const ggml_tensor_extra_gpu *)src1->extra;
+    const ggml_tensor_extra_gpu *dst_extra =
+        (const ggml_tensor_extra_gpu *)dst->extra;

+    ggml_tensor_extra_gpu src0_row_extra;
     ggml_tensor_extra_gpu src1_row_extra;
     ggml_tensor_extra_gpu dst_row_extra;

+    ggml_tensor src0_row = *src0;
     ggml_tensor src1_row = *src1;
     ggml_tensor dst_row = *dst;

     src1_row.backend = GGML_BACKEND_TYPE_GPU;
     dst_row.backend = GGML_BACKEND_TYPE_GPU;

+    src0_row.extra = &src0_row_extra;
     src1_row.extra = &src1_row_extra;
     dst_row.extra = &dst_row_extra;

-    char *
-
-
-
+    char *src0_original = src1->backend == GGML_BACKEND_TYPE_CPU
+                              ? (char *)src0->data
+                              : (char *)src0_extra->data_device[g_main_device];
+    char *src1_original = src1->backend == GGML_BACKEND_TYPE_CPU
+                              ? (char *)src1->data
+                              : (char *)src1_extra->data_device[g_main_device];
+    char *dst_original = dst->backend == GGML_BACKEND_TYPE_CPU
+                             ? (char *)dst->data
+                             : (char *)dst_extra->data_device[g_main_device];

-
-
-
+    src0_row.ne[2] = 1;
+    src0_row.ne[3] = 1;
+    src0_row.nb[3] = src0->nb[2];

+    if (src1->ne[1] == 1) {
         for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
-
-
-
-
-            const int32_t row_id = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]);
+            const int32_t row_id =
+                *(const int32_t *)(ids_host.data() + i01 * ids->nb[1] +
+                                   id * ids->nb[0]);

             GGML_ASSERT(row_id >= 0 && row_id < n_as);

-
-
-            src1_row_extra.data_device[g_main_device] =
-
+            src0_row_extra.data_device[g_main_device] =
+                src0_original + row_id * src0->nb[2];
+            src1_row_extra.data_device[g_main_device] =
+                src1_original + i01 * src1->nb[1];
+            dst_row_extra.data_device[g_main_device] =
+                dst_original + i01 * dst->nb[1];

-
-            dst_row.data = (char *) dst->data + i01*dst->nb[1]; // TODO why is this set?
-
-            ggml_sycl_mul_mat(src0_row, &src1_row, &dst_row);
+            ggml_sycl_mul_mat(&src0_row, &src1_row, &dst_row);
         }
     } else {
         sycl_pool_alloc<char> src1_contiguous(sizeof(float)*ggml_nelements(src1));
@@ -16065,8 +16075,6 @@ static void ggml_sycl_mul_mat_id(const ggml_tensor *src0,
         dst_row_extra.data_device[g_main_device] = dst_contiguous.get();

         for (int32_t row_id = 0; row_id < n_as; ++row_id) {
-            const struct ggml_tensor * src0_row = dst->src[row_id + 2];
-
             int64_t num_src1_rows = 0;
             for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
                 const int32_t row_id_i = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]);
@@ -16079,7 +16087,7 @@ static void ggml_sycl_mul_mat_id(const ggml_tensor *src0,

                 SYCL_CHECK(CHECK_TRY_ERROR(
                     stream->memcpy(src1_contiguous.get() + num_src1_rows * nb11,
-                                   src1_original + i01 * nb11, nb11)
+                                   src1_original + i01 * nb11, nb11)));
                 num_src1_rows++;
             }

@@ -16087,6 +16095,9 @@ static void ggml_sycl_mul_mat_id(const ggml_tensor *src0,
                 continue;
             }

+            src0_row_extra.data_device[g_main_device] =
+                src0_original + row_id * src0->nb[2];
+
             src1_row.ne[1] = num_src1_rows;
             dst_row.ne[1] = num_src1_rows;

@@ -16098,7 +16109,7 @@ static void ggml_sycl_mul_mat_id(const ggml_tensor *src0,
             dst_row.nb[2] = num_src1_rows*nb1;
             dst_row.nb[3] = num_src1_rows*nb1;

-            ggml_sycl_mul_mat(src0_row, &src1_row, &dst_row);
+            ggml_sycl_mul_mat(&src0_row, &src1_row, &dst_row);

             num_src1_rows = 0;
             for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
@@ -16112,7 +16123,7 @@ static void ggml_sycl_mul_mat_id(const ggml_tensor *src0,

                 SYCL_CHECK(CHECK_TRY_ERROR(stream->memcpy(
                     dst_original + i01 * nb1,
-                    dst_contiguous.get() + num_src1_rows * nb1, nb1)
+                    dst_contiguous.get() + num_src1_rows * nb1, nb1)));
                 num_src1_rows++;
             }
         }
@@ -16814,11 +16825,13 @@ static void ggml_backend_sycl_buffer_set_tensor(ggml_backend_buffer_t buffer,
     const dpct::queue_ptr stream = g_syclStreams[ctx->device][0];
     SYCL_CHECK(
         CHECK_TRY_ERROR(dpct::dev_mgr::instance().get_device(ctx->device).queues_wait_and_throw()));
-
+    char* host_buf = (char*)malloc(size);
+    memcpy(host_buf, data, size);
     SYCL_CHECK(
         CHECK_TRY_ERROR((*stream)
-                            .memcpy((char *)tensor->data + offset, data, size)
+                            .memcpy((char *)tensor->data + offset, host_buf, size)
                             .wait()));
+    free(host_buf);
 }
 catch (sycl::exception const &exc) {
     std::cerr << exc.what() << "Exception caught at file:" << __FILE__
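The set_tensor change above stages the caller's buffer through a heap copy before the queued SYCL memcpy, apparently so the source memory does not need to stay valid for the duration of the transfer. A standalone sketch of the same pattern, with illustrative names (not llama.cpp API):

#include <sycl/sycl.hpp>
#include <cstdlib>
#include <cstring>

// Copy `size` bytes from `data` to `device_dst` via a temporary host buffer.
static void set_tensor_staged(sycl::queue & q, void * device_dst, const void * data, size_t size) {
    char * host_buf = static_cast<char *>(std::malloc(size));
    std::memcpy(host_buf, data, size);             // snapshot the caller's data
    q.memcpy(device_dst, host_buf, size).wait();   // copy to the device, block until done
    std::free(host_buf);
}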
@@ -17739,7 +17752,7 @@ GGML_CALL static bool ggml_backend_sycl_supports_op(ggml_backend_t backend, cons

 GGML_CALL static bool ggml_backend_sycl_offload_op(ggml_backend_t backend, const ggml_tensor * op) {
     const int min_batch_size = 32;
-    return op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS;
+    return op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS && op->op != GGML_OP_MUL_MAT_ID;
     GGML_UNUSED(backend);
 }

data/vendor/tmp/llama.cpp/ggml.c
CHANGED
@@ -4,6 +4,7 @@
 #include "ggml-impl.h"
 #include "ggml-quants.h"
 #include "ggml.h"
+#include "sgemm.h"

 #if defined(_MSC_VER) || defined(__MINGW32__)
 #include <malloc.h> // using malloc.h with MSC/MINGW
@@ -32,6 +33,10 @@
 #include <unistd.h>
 #endif

+#ifdef __ARM_FEATURE_MATMUL_INT8
+#undef GGML_USE_LLAMAFILE
+#endif
+
 #if defined(_MSC_VER)
 // disable "possible loss of data" to avoid hundreds of casts
 // we should just be careful :)
@@ -4573,21 +4578,32 @@ void ggml_mul_mat_set_prec(

 // ggml_mul_mat_id

-
-
+/*
+    c = ggml_mul_mat_id(ctx, as, b, ids);
+
+    as  -> [cols, rows, n_expert]
+    ids -> [n_experts_used, n_tokens] (i32)
+    b   -> [cols, n_expert_used, n_tokens]
+    c   -> [cols, n_expert_used, n_tokens]
+
+    in b, n_experts_used can be broadcasted to match the n_expert_used of ids
+
+    c ~= as[:,:,i] @ b[:,i%r,t], i = ids[e,t] for all e,t in ids
+*/
 struct ggml_tensor * ggml_mul_mat_id(
         struct ggml_context * ctx,
         struct ggml_tensor * as,
-        struct ggml_tensor * ids,
-        int id,
-        struct ggml_tensor * b) {
-
+        struct ggml_tensor * b,
+        struct ggml_tensor * ids) {
+    GGML_ASSERT(!ggml_is_transposed(as));
     GGML_ASSERT(ids->type == GGML_TYPE_I32);
+
+    GGML_ASSERT(as->ne[3] == 1); // as is 3d (one matrix per expert)
+    GGML_ASSERT(b->ne[3] == 1); // b is 3d
     GGML_ASSERT(ids->ne[2] == 1 && ids->ne[3] == 1); // ids is 2d
-    GGML_ASSERT(ids->ne[1] == b->ne[
-    GGML_ASSERT(ids->ne[2] == b->ne[2] && ids->ne[3] == b->ne[3]);
-    GGML_ASSERT(id >= 0 && id < ids->ne[0]); // valid id
+    GGML_ASSERT(ids->ne[1] == b->ne[2]); // must have an expert list per b row
     GGML_ASSERT(as->ne[0] == b->ne[0]); // can_mul_mat
+    GGML_ASSERT(ids->ne[0] % b->ne[1] == 0); // can broadcast

     bool is_node = false;

@@ -4595,11 +4611,9 @@ struct ggml_tensor * ggml_mul_mat_id(
         is_node = true;
     }

-    const int64_t ne[4] = { as->ne[1],
+    const int64_t ne[4] = { as->ne[1], ids->ne[0], b->ne[2], 1 };
     struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);

-    ggml_set_op_params_i32(result, 0, id);
-
     result->op = GGML_OP_MUL_MAT_ID;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
     result->src[0] = as;
@@ -10810,6 +10824,28 @@ static void ggml_compute_forward_mul_mat(
     }
 #endif

+#if GGML_USE_LLAMAFILE
+    if (nb10 == ggml_type_size(src1->type)) {
+        for (int64_t i13 = 0; i13 < ne13; i13++)
+            for (int64_t i12 = 0; i12 < ne12; i12++)
+                if (!llamafile_sgemm(ne01, ne11, ne00/ggml_blck_size(src0->type),
+                                     (const char *)src0->data + i12/r2*nb02 + i13/r3*nb03,
+                                     nb01/ggml_type_size(src0->type),
+                                     (const char *)src1->data + i12*nb12 + i13*nb13,
+                                     nb11/ggml_type_size(src1->type),
+                                     (char *)dst->data + i12*nb2 + i13*nb3,
+                                     nb1/ggml_type_size(dst->type),
+                                     ith, nth,
+                                     params->type,
+                                     src0->type,
+                                     src1->type,
+                                     dst->type))
+                    goto UseGgmlGemm1;
+        return;
+    }
+UseGgmlGemm1:;
+#endif
+
     if (params->type == GGML_TASK_TYPE_INIT) {
         if (ith != 0) {
             return;
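The block above is the first of two call sites into the new sgemm.cpp/sgemm.h files added in this release. The declaration itself is not shown in this hunk; judging from the arguments passed here it has roughly the following shape (a reconstruction, not a quote from sgemm.h), returning false when the type/stride combination is unsupported so the caller falls through to the regular GEMM path:

// Reconstructed from the call site above; see sgemm.h in this package for the actual declaration.
bool llamafile_sgemm(int64_t m, int64_t n, int64_t k,
                     const void * A, int64_t lda,
                     const void * B, int64_t ldb,
                     void * C, int64_t ldc,
                     int ith, int nth,                   // thread index / thread count
                     int task,                           // ggml task type
                     int Atype, int Btype, int Ctype);   // ggml tensor types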
@@ -10841,6 +10877,30 @@ static void ggml_compute_forward_mul_mat(
     const void * wdata    = (src1->type == vec_dot_type) ? src1->data : params->wdata;
     const size_t row_size = ggml_row_size(vec_dot_type, ne10);

+#if GGML_USE_LLAMAFILE
+    if (nb10 == ggml_type_size(src1->type) || src1->type != vec_dot_type) {
+        for (int64_t i13 = 0; i13 < ne13; i13++)
+            for (int64_t i12 = 0; i12 < ne12; i12++)
+                if (!llamafile_sgemm(ne01, ne11, ne00/ggml_blck_size(src0->type),
+                                     (const char *)src0->data + i12/r2*nb02 + i13/r3*nb03,
+                                     nb01/ggml_type_size(src0->type),
+                                     (const char *)wdata + ggml_row_size(vec_dot_type,
+                                         nb12/ggml_type_size(src1->type)*i12 +
+                                         nb13/ggml_type_size(src1->type)*i13),
+                                     row_size/ggml_type_size(vec_dot_type),
+                                     (char *)dst->data + i12*nb2 + i13*nb3,
+                                     nb1/ggml_type_size(dst->type),
+                                     ith, nth,
+                                     params->type,
+                                     src0->type,
+                                     vec_dot_type,
+                                     dst->type))
+                    goto UseGgmlGemm2;
+        return;
+    }
+UseGgmlGemm2:;
+#endif
+
     const int64_t nr0 = ne01;          // src0 rows
     const int64_t nr1 = ne1*ne12*ne13; // src1 rows

@@ -10958,11 +11018,6 @@ static void ggml_compute_forward_mul_mat_id(
     enum ggml_type const vec_dot_type = type_traits[type].vec_dot_type;
     ggml_from_float_t const from_float_to_vec_dot = type_traits[vec_dot_type].from_float;

-    GGML_ASSERT(ne0 == ne01);
-    GGML_ASSERT(ne1 == ne11);
-    GGML_ASSERT(ne2 == ne12);
-    GGML_ASSERT(ne3 == ne13);
-
     // we don't support permuted src0 or src1
     GGML_ASSERT(nb00 == ggml_type_size(type));
     GGML_ASSERT(nb10 == ggml_type_size(src1->type));
@@ -10973,22 +11028,21 @@ static void ggml_compute_forward_mul_mat_id(
     GGML_ASSERT(nb1 <= nb2);
     GGML_ASSERT(nb2 <= nb3);

-    // broadcast is not supported with mmid
-    assert(ne12 == 1);
-    assert(ne13 == 1);
-
     // row groups
-    const int
-    const int n_as
+    const int n_ids = ids->ne[0]; // n_expert_used
+    const int n_as  = ne02;       // n_expert

     char * wdata_src1_end = (src1->type == vec_dot_type) ?
            (char *) params->wdata :
            (char *) params->wdata + GGML_PAD(ggml_row_size(vec_dot_type, ggml_nelements(src1)), sizeof(int64_t));

-
-
+    struct mmid_row_mapping {
+        int32_t i1;
+        int32_t i2;
+    };

-
+    int64_t * matrix_row_counts = (int64_t *) (wdata_src1_end); // [n_as]
+    struct mmid_row_mapping * matrix_rows = (struct mmid_row_mapping *)(matrix_row_counts + n_as); // [n_as][ne11]

     if (params->type == GGML_TASK_TYPE_INIT) {
         if (ith != 0) {
@@ -11012,16 +11066,20 @@ static void ggml_compute_forward_mul_mat_id(
         }

         // initialize matrix_row_counts
-        GGML_ASSERT(wdata == wdata_src1_end);
         memset(matrix_row_counts, 0, n_as*sizeof(int64_t));

+#define MMID_MATRIX_ROW(row_id, i1) matrix_rows[(row_id)*ne12 + (i1)]
+
         // group rows by src0 matrix
-        for (int64_t
-
+        for (int64_t iid1 = 0; iid1 < ids->ne[1]; ++iid1) {
+            for (int id = 0; id < n_ids; ++id) {
+                const int32_t i02 = *(const int32_t *) ((const char *) ids->data + iid1*ids->nb[1] + id*ids->nb[0]);
+
+                assert(i02 >= 0 && i02 < n_as);

-
-
-
+                MMID_MATRIX_ROW(i02, matrix_row_counts[i02]) = (struct mmid_row_mapping) {id, iid1};
+                matrix_row_counts[i02] += 1;
+            }
         }

         return;
@@ -11039,15 +11097,13 @@ static void ggml_compute_forward_mul_mat_id(
             continue;
         }

-
+        const char * src0_cur = (const char *) src0->data + cur_a*nb02;

         const void * wdata    = (src1->type == vec_dot_type) ? src1->data : params->wdata;
         const size_t row_size = ggml_row_size(vec_dot_type, ne10);

-        const int64_t nr0 = ne01;
-        const int64_t nr1 = cne1
-
-        //printf("nr0 = %lld, nr1 = %lld\n", nr0, nr1);
+        const int64_t nr0 = ne01; // src0 rows
+        const int64_t nr1 = cne1; // src1 rows

         // distribute the thread work across the inner or outer loop based on which one is larger

@@ -11066,13 +11122,11 @@ static void ggml_compute_forward_mul_mat_id(
         const int64_t ir110 = dr1*ith1;
         const int64_t ir111 = MIN(ir110 + dr1, nr1);

-        //printf("ir010 = %6lld, ir011 = %6lld, ir110 = %6lld, ir111 = %6lld\n", ir010, ir011, ir110, ir111);
-
         // threads with no work simply yield (not sure if it helps)
-        if (ir010 >= ir011 || ir110 >= ir111) {
-            sched_yield();
-            continue;
-        }
+        //if (ir010 >= ir011 || ir110 >= ir111) {
+        //    sched_yield();
+        //    continue;
+        //}

         // block-tiling attempt
         const int64_t blck_0 = 16;
@@ -11084,20 +11138,16 @@ static void ggml_compute_forward_mul_mat_id(
         for (int64_t iir1 = ir110; iir1 < ir111; iir1 += blck_1) {
             for (int64_t iir0 = ir010; iir0 < ir011; iir0 += blck_0) {
                 for (int64_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir111; ++ir1) {
-                    const int64_t
-                    const int64_t i12 = (ir1 - i13*ne12*cne1)/cne1;
-                    const int64_t _i11 = (ir1 - i13*ne12*cne1 - i12*cne1);
-                    const int64_t i11 = MMID_MATRIX_ROW(cur_a, _i11);
+                    const int64_t _i12 = ir1; // logical row index for this expert

-
-
-                    //const int64_t i02 = i12/r2;
+                    struct mmid_row_mapping row_mapping = MMID_MATRIX_ROW(cur_a, _i12);
+                    const int id       = row_mapping.i1; // selected expert index

-                    const int64_t
-                    const int64_t i2
-                    const int64_t i3 = i13;
+                    const int64_t i11 = id % ne11;
+                    const int64_t i12 = row_mapping.i2; // row index in src1

-                    const
+                    const int64_t i1 = id;  // selected expert index
+                    const int64_t i2 = i12; // row

                     // desc: when src1 is not a contiguous memory block we have to calculate the offset using the strides
                     // if it is, then we have either copied the data to params->wdata and made it contiguous or we are using
@@ -11105,25 +11155,26 @@ static void ggml_compute_forward_mul_mat_id(
                     // TODO: this is a bit of a hack, we should probably have a better way to handle this
                     const char * src1_col = (const char *) wdata +
                         (src1_cont || src1->type != vec_dot_type
-                         ? (i11 + i12*ne11
-                         : (i11*nb11 + i12*nb12
+                         ? (i11 + i12*ne11)*row_size
+                         : (i11*nb11 + i12*nb12));

-                    float * dst_col = (float *) ((char *) dst->data + (i1*nb1 + i2*nb2
+                    float * dst_col = (float *) ((char *) dst->data + (i1*nb1 + i2*nb2));

                     //for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) {
                     //    vec_dot(ne00, &dst_col[ir0], src0_row + ir0*nb01, src1_col);
                     //}

                     for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) {
-                        vec_dot(ne00, &tmp[ir0 - iir0], 0,
+                        vec_dot(ne00, &tmp[ir0 - iir0], 0, src0_cur + ir0*nb01, 0, src1_col, 0, 1);
                     }
+
                     memcpy(&dst_col[iir0], tmp, (MIN(iir0 + blck_0, ir011) - iir0)*sizeof(float));
                 }
             }
         }
     }

-
+#undef MMID_MATRIX_ROW
 }

 // ggml_compute_forward_out_prod
@@ -18462,7 +18513,7 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
                     const int n_as = src0->ne[2];
                     cur += GGML_PAD(cur, sizeof(int64_t));       // align
                     cur += n_as * sizeof(int64_t);               // matrix_row_counts
-                    cur += n_as * src1->ne[
+                    cur += n_as * src1->ne[2] * sizeof(int64_t); // matrix_rows
                 } break;
             case GGML_OP_OUT_PROD:
                 {
@@ -20550,6 +20601,32 @@ static bool gguf_fread_str(FILE * file, struct gguf_str * p, size_t * offset) {
     return ok;
 }

+static void gguf_free_kv(struct gguf_kv * kv) {
+    if (kv->key.data) {
+        GGML_FREE(kv->key.data);
+    }
+
+    if (kv->type == GGUF_TYPE_STRING) {
+        if (kv->value.str.data) {
+            GGML_FREE(kv->value.str.data);
+        }
+    }
+
+    if (kv->type == GGUF_TYPE_ARRAY) {
+        if (kv->value.arr.data) {
+            if (kv->value.arr.type == GGUF_TYPE_STRING) {
+                for (uint64_t j = 0; j < kv->value.arr.n; ++j) {
+                    struct gguf_str * str = &((struct gguf_str *) kv->value.arr.data)[j];
+                    if (str->data) {
+                        GGML_FREE(str->data);
+                    }
+                }
+            }
+            GGML_FREE(kv->value.arr.data);
+        }
+    }
+}
+
 struct gguf_context * gguf_init_empty(void) {
     struct gguf_context * ctx = GGML_ALIGNED_MALLOC(sizeof(struct gguf_context));

@@ -20862,12 +20939,12 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p

         ok = ok && cur != NULL;

-        ggml_set_name(cur, ctx->infos[i].name.data);
-
         if (!ok) {
             break;
         }

+        ggml_set_name(cur, ctx->infos[i].name.data);
+
         // point the data member to the appropriate location in the binary blob using the tensor infos
         if (!params.no_alloc) {
             //cur->data = (char *) data->data + ctx->infos[i].offset - ctx->offset; // offset from start of file
@@ -20899,31 +20976,7 @@ void gguf_free(struct gguf_context * ctx) {
     if (ctx->kv) {
         // free string memory - not great..
        for (uint64_t i = 0; i < ctx->header.n_kv; ++i) {
-            struct gguf_kv * kv = &ctx->kv[i];
-
-            if (kv->key.data) {
-                GGML_FREE(kv->key.data);
-            }
-
-            if (kv->type == GGUF_TYPE_STRING) {
-                if (kv->value.str.data) {
-                    GGML_FREE(kv->value.str.data);
-                }
-            }
-
-            if (kv->type == GGUF_TYPE_ARRAY) {
-                if (kv->value.arr.data) {
-                    if (kv->value.arr.type == GGUF_TYPE_STRING) {
-                        for (uint64_t j = 0; j < kv->value.arr.n; ++j) {
-                            struct gguf_str * str = &((struct gguf_str *) kv->value.arr.data)[j];
-                            if (str->data) {
-                                GGML_FREE(str->data);
-                            }
-                        }
-                    }
-                    GGML_FREE(kv->value.arr.data);
-                }
-            }
+            gguf_free_kv(&ctx->kv[i]);
         }

         GGML_FREE(ctx->kv);
@@ -21148,6 +21201,19 @@ static int gguf_get_or_add_key(struct gguf_context * ctx, const char * key) {
     return n_kv;
 }

+void gguf_remove_key(struct gguf_context * ctx, const char * key) {
+    const int idx = gguf_find_key(ctx, key);
+    if (idx >= 0) {
+        const int n_kv = gguf_get_n_kv(ctx);
+        gguf_free_kv(&ctx->kv[idx]);
+        for (int i = idx; i < n_kv-1; ++i) {
+            ctx->kv[i] = ctx->kv[i+1];
+        }
+        ctx->kv = realloc(ctx->kv, (n_kv - 1) * sizeof(struct gguf_kv));
+        ctx->header.n_kv--;
+    }
+}
+
 void gguf_set_val_u8(struct gguf_context * ctx, const char * key, uint8_t val) {
     const int idx = gguf_get_or_add_key(ctx, key);

data/vendor/tmp/llama.cpp/ggml.h
CHANGED
@@ -1161,13 +1161,11 @@ extern "C" {
             enum ggml_prec       prec);

     // indirect matrix multiplication
-    // ggml_mul_mat_id(ctx, as, ids, id, b) ~= ggml_mul_mat(as[ids[id]], b)
     GGML_API struct ggml_tensor * ggml_mul_mat_id(
             struct ggml_context * ctx,
             struct ggml_tensor  * as,
-            struct ggml_tensor  * ids,
-            int                   id,
-            struct ggml_tensor  * b);
+            struct ggml_tensor  * b,
+            struct ggml_tensor  * ids);

     // A: m columns, n rows,
     // B: p columns, n rows,
@@ -2289,6 +2287,9 @@ extern "C" {
     GGML_API char * gguf_get_tensor_name (const struct gguf_context * ctx, int i);
     GGML_API enum ggml_type gguf_get_tensor_type (const struct gguf_context * ctx, int i);

+    // removes key if it exists
+    GGML_API void gguf_remove_key(struct gguf_context * ctx, const char * key);
+
     // overrides existing values or adds a new one
     GGML_API void gguf_set_val_u8 (struct gguf_context * ctx, const char * key, uint8_t val);
     GGML_API void gguf_set_val_i8 (struct gguf_context * ctx, const char * key, int8_t val);