llama_cpp 0.14.5 → 0.14.6
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/vendor/tmp/llama.cpp/Makefile +18 -6
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +135 -46
- data/vendor/tmp/llama.cpp/ggml-impl.h +1 -1
- data/vendor/tmp/llama.cpp/ggml-metal.m +130 -83
- data/vendor/tmp/llama.cpp/ggml-metal.metal +505 -1467
- data/vendor/tmp/llama.cpp/ggml-quants.c +1 -1
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +65 -52
- data/vendor/tmp/llama.cpp/ggml.c +153 -87
- data/vendor/tmp/llama.cpp/ggml.h +5 -4
- data/vendor/tmp/llama.cpp/llama.cpp +885 -144
- data/vendor/tmp/llama.cpp/sgemm.cpp +1148 -0
- data/vendor/tmp/llama.cpp/sgemm.h +12 -0
- metadata +4 -2
```diff
@@ -132,7 +132,7 @@ static inline __m256 sum_i16_pairs_float(const __m256i x) {
 }
 
 static inline __m256 mul_sum_us8_pairs_float(const __m256i ax, const __m256i sy) {
-#if defined(__AVXVNNI__) || defined(__AVX512VNNI__)
+#if defined(__AVXVNNI__) || (defined(__AVX512VNNI__) && defined(__AVX512VL__))
     const __m256i zero = _mm256_setzero_si256();
     const __m256i summed_pairs = _mm256_dpbusd_epi32(zero, ax, sy);
     return _mm256_cvtepi32_ps(summed_pairs);
```

```diff
@@ -3154,7 +3154,6 @@ typedef float (*vec_dot_q_mul_mat_sycl_t)(
 #define SYCL_SCALE_BLOCK_SIZE 256
 #define SYCL_CLAMP_BLOCK_SIZE 256
 #define SYCL_ROPE_BLOCK_SIZE 256
-#define SYCL_SOFT_MAX_BLOCK_SIZE 1024
 #define SYCL_ALIBI_BLOCK_SIZE 32
 #define SYCL_DIAG_MASK_INF_BLOCK_SIZE 32
 #define SYCL_QUANTIZE_BLOCK_SIZE 256
@@ -13080,11 +13079,13 @@ static void soft_max_f32_sycl(const float * x, const float * mask, const float *
                           const int nrows_y, const float scale, const float max_bias,
                           dpct::queue_ptr stream) {
     int nth = WARP_SIZE;
-
+    int max_block_size = g_work_group_size;
+    while (nth < ncols_x && nth < max_block_size) nth *= 2;
+    if (nth>max_block_size) nth = max_block_size;
+
     const sycl::range<3> block_dims(1, 1, nth);
     const sycl::range<3> block_nums(1, 1, nrows_x);
     const size_t n_local_scratch = (GGML_PAD(ncols_x, WARP_SIZE) + WARP_SIZE);
-    static_assert(SYCL_SOFT_MAX_BLOCK_SIZE == 1024, "These values need to be adjusted.");
 
     const uint32_t n_head_kv = nrows_x/nrows_y;
     const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head_kv));
@@ -13094,6 +13095,12 @@ static void soft_max_f32_sycl(const float * x, const float * mask, const float *
 
     const size_t local_mem_size = stream->get_device().get_info<sycl::info::device::local_mem_size>();
     if (n_local_scratch*sizeof(float) < local_mem_size) {
+        if (ncols_x > max_block_size) {
+            soft_max_f32_submitter<true, 0, 0>(x, mask, pos, dst, ncols_x, nrows_y, scale,
+                                               max_bias, m0, m1, n_head_log2, block_nums,
+                                               block_dims, n_local_scratch, stream);
+            return;
+        }
         switch (ncols_x) {
             case 32:
                 soft_max_f32_submitter<true, 32, 32>(x, mask, pos, dst, ncols_x, nrows_y, scale,
```
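The SYCL soft-max launch above now sizes its workgroup at runtime instead of relying on the removed `SYCL_SOFT_MAX_BLOCK_SIZE` constant. A minimal standalone sketch of that selection logic, assuming `warp_size` and `max_block_size` stand in for `WARP_SIZE` and `g_work_group_size` from the diff:

```c
// Sketch: grow the workgroup in powers of two until it covers the row width
// or reaches the device's maximum workgroup size, then clamp.
static int pick_soft_max_block_size(int ncols_x, int warp_size, int max_block_size) {
    int nth = warp_size;
    while (nth < ncols_x && nth < max_block_size) {
        nth *= 2;
    }
    if (nth > max_block_size) {
        nth = max_block_size;
    }
    return nth;
}
```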
```diff
@@ -15989,73 +15996,76 @@ static void ggml_sycl_mul_mat_id_sycl(ggml_tensor * dst) {
 static void ggml_sycl_mul_mat_id(const ggml_tensor *src0,
                                  const ggml_tensor *src1,
                                  ggml_tensor *dst) try {
-
-
-
-
+    GGML_ASSERT(src0->backend != GGML_BACKEND_TYPE_GPU_SPLIT &&
+        "mul_mat_id does not support split buffers");
+    const ggml_tensor *ids = dst->src[2];
+    const dpct::queue_ptr stream = g_syclStreams[g_main_device][0];
 
-    const
-    const
+    const size_t nb11 = src1->nb[1];
+    const size_t nb1 = dst->nb[1];
 
-    const
-    const int32_t
-    const int32_t n_as = ((int32_t *) dst->op_params)[1];
+    const int32_t id = ((int32_t *)dst->op_params)[0];
+    const int32_t n_as = src0->ne[2];
 
     std::vector<char> ids_host(ggml_nbytes(ids));
+    const char *ids_dev = (const char *)ids->data;
 
-
-
-
-    const char * ids_dev = (const char *)((const ggml_tensor_extra_gpu *)ids->extra)->data_device[g_main_device];
-    SYCL_CHECK(CHECK_TRY_ERROR(
-        stream->memcpy(ids_host.data(), ids_dev, ggml_nbytes(ids)).wait()));
-    // SYCL_CHECK(CHECK_TRY_ERROR(stream->wait()));
-    } else {
-        memcpy(ids_host.data(), ids->data, ggml_nbytes(ids));
-    }
+    SYCL_CHECK(CHECK_TRY_ERROR(
+        stream->memcpy(ids_host.data(), ids_dev, ggml_nbytes(ids))));
+    SYCL_CHECK(CHECK_TRY_ERROR(stream->wait()));
 
-    const ggml_tensor_extra_gpu *
-
+    const ggml_tensor_extra_gpu *src0_extra =
+        (const ggml_tensor_extra_gpu *)src0->extra;
+    const ggml_tensor_extra_gpu *src1_extra =
+        (const ggml_tensor_extra_gpu *)src1->extra;
+    const ggml_tensor_extra_gpu *dst_extra =
+        (const ggml_tensor_extra_gpu *)dst->extra;
 
+    ggml_tensor_extra_gpu src0_row_extra;
     ggml_tensor_extra_gpu src1_row_extra;
     ggml_tensor_extra_gpu dst_row_extra;
 
+    ggml_tensor src0_row = *src0;
     ggml_tensor src1_row = *src1;
     ggml_tensor dst_row = *dst;
 
     src1_row.backend = GGML_BACKEND_TYPE_GPU;
     dst_row.backend = GGML_BACKEND_TYPE_GPU;
 
+    src0_row.extra = &src0_row_extra;
     src1_row.extra = &src1_row_extra;
     dst_row.extra = &dst_row_extra;
 
-    char *
-
-
-
+    char *src0_original = src1->backend == GGML_BACKEND_TYPE_CPU
+                              ? (char *)src0->data
+                              : (char *)src0_extra->data_device[g_main_device];
+    char *src1_original = src1->backend == GGML_BACKEND_TYPE_CPU
+                              ? (char *)src1->data
+                              : (char *)src1_extra->data_device[g_main_device];
+    char *dst_original = dst->backend == GGML_BACKEND_TYPE_CPU
+                             ? (char *)dst->data
+                             : (char *)dst_extra->data_device[g_main_device];
 
-
-
-
+    src0_row.ne[2] = 1;
+    src0_row.ne[3] = 1;
+    src0_row.nb[3] = src0->nb[2];
 
+    if (src1->ne[1] == 1) {
         for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
-
-
-
-
-            const int32_t row_id = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]);
+            const int32_t row_id =
+                *(const int32_t *)(ids_host.data() + i01 * ids->nb[1] +
+                                   id * ids->nb[0]);
 
             GGML_ASSERT(row_id >= 0 && row_id < n_as);
 
-
-
-            src1_row_extra.data_device[g_main_device] =
-
+            src0_row_extra.data_device[g_main_device] =
+                src0_original + row_id * src0->nb[2];
+            src1_row_extra.data_device[g_main_device] =
+                src1_original + i01 * src1->nb[1];
+            dst_row_extra.data_device[g_main_device] =
+                dst_original + i01 * dst->nb[1];
 
-
-            dst_row.data = (char *) dst->data + i01*dst->nb[1]; // TODO why is this set?
-
-            ggml_sycl_mul_mat(src0_row, &src1_row, &dst_row);
+            ggml_sycl_mul_mat(&src0_row, &src1_row, &dst_row);
         }
     } else {
         sycl_pool_alloc<char> src1_contiguous(sizeof(float)*ggml_nelements(src1));
```
```diff
@@ -16065,8 +16075,6 @@ static void ggml_sycl_mul_mat_id(const ggml_tensor *src0,
         dst_row_extra.data_device[g_main_device] = dst_contiguous.get();
 
         for (int32_t row_id = 0; row_id < n_as; ++row_id) {
-            const struct ggml_tensor * src0_row = dst->src[row_id + 2];
-
             int64_t num_src1_rows = 0;
             for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
                 const int32_t row_id_i = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]);
@@ -16079,7 +16087,7 @@ static void ggml_sycl_mul_mat_id(const ggml_tensor *src0,
 
                 SYCL_CHECK(CHECK_TRY_ERROR(
                     stream->memcpy(src1_contiguous.get() + num_src1_rows * nb11,
-                                   src1_original + i01 * nb11, nb11)
+                                   src1_original + i01 * nb11, nb11)));
                 num_src1_rows++;
             }
 
@@ -16087,6 +16095,9 @@ static void ggml_sycl_mul_mat_id(const ggml_tensor *src0,
                 continue;
             }
 
+            src0_row_extra.data_device[g_main_device] =
+                src0_original + row_id * src0->nb[2];
+
             src1_row.ne[1] = num_src1_rows;
             dst_row.ne[1] = num_src1_rows;
 
@@ -16098,7 +16109,7 @@ static void ggml_sycl_mul_mat_id(const ggml_tensor *src0,
             dst_row.nb[2] = num_src1_rows*nb1;
             dst_row.nb[3] = num_src1_rows*nb1;
 
-            ggml_sycl_mul_mat(src0_row, &src1_row, &dst_row);
+            ggml_sycl_mul_mat(&src0_row, &src1_row, &dst_row);
 
             num_src1_rows = 0;
             for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
@@ -16112,7 +16123,7 @@ static void ggml_sycl_mul_mat_id(const ggml_tensor *src0,
 
                 SYCL_CHECK(CHECK_TRY_ERROR(stream->memcpy(
                     dst_original + i01 * nb1,
-                    dst_contiguous.get() + num_src1_rows * nb1, nb1)
+                    dst_contiguous.get() + num_src1_rows * nb1, nb1)));
                 num_src1_rows++;
             }
         }
@@ -16814,11 +16825,13 @@ static void ggml_backend_sycl_buffer_set_tensor(ggml_backend_buffer_t buffer,
     const dpct::queue_ptr stream = g_syclStreams[ctx->device][0];
     SYCL_CHECK(
         CHECK_TRY_ERROR(dpct::dev_mgr::instance().get_device(ctx->device).queues_wait_and_throw()));
-
+    char* host_buf = (char*)malloc(size);
+    memcpy(host_buf, data, size);
     SYCL_CHECK(
         CHECK_TRY_ERROR((*stream)
-                            .memcpy((char *)tensor->data + offset,
+                            .memcpy((char *)tensor->data + offset, host_buf, size)
                             .wait()));
+    free(host_buf);
 }
 catch (sycl::exception const &exc) {
     std::cerr << exc.what() << "Exception caught at file:" << __FILE__
@@ -17739,7 +17752,7 @@ GGML_CALL static bool ggml_backend_sycl_supports_op(ggml_backend_t backend, cons
 
 GGML_CALL static bool ggml_backend_sycl_offload_op(ggml_backend_t backend, const ggml_tensor * op) {
     const int min_batch_size = 32;
-    return op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS;
+    return op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS && op->op != GGML_OP_MUL_MAT_ID;
     GGML_UNUSED(backend);
 }
 
```
data/vendor/tmp/llama.cpp/ggml.c CHANGED
```diff
@@ -4,6 +4,7 @@
 #include "ggml-impl.h"
 #include "ggml-quants.h"
 #include "ggml.h"
+#include "sgemm.h"
 
 #if defined(_MSC_VER) || defined(__MINGW32__)
 #include <malloc.h> // using malloc.h with MSC/MINGW
@@ -32,6 +33,10 @@
 #include <unistd.h>
 #endif
 
+#ifdef __ARM_FEATURE_MATMUL_INT8
+#undef GGML_USE_LLAMAFILE
+#endif
+
 #if defined(_MSC_VER)
 // disable "possible loss of data" to avoid hundreds of casts
 // we should just be careful :)
@@ -4573,21 +4578,32 @@ void ggml_mul_mat_set_prec(
 
 // ggml_mul_mat_id
 
-
-
+/*
+    c = ggml_mul_mat_id(ctx, as, b, ids);
+
+    as  -> [cols, rows, n_expert]
+    ids -> [n_experts_used, n_tokens] (i32)
+    b   -> [cols, n_expert_used, n_tokens]
+    c   -> [cols, n_expert_used, n_tokens]
+
+    in b, n_experts_used can be broadcasted to match the n_expert_used of ids
+
+    c ~= as[:,:,i] @ b[:,i%r,t], i = ids[e,t] for all e,t in ids
+*/
 struct ggml_tensor * ggml_mul_mat_id(
         struct ggml_context * ctx,
         struct ggml_tensor * as,
-        struct ggml_tensor *
-
-
-
+        struct ggml_tensor * b,
+        struct ggml_tensor * ids) {
+    GGML_ASSERT(!ggml_is_transposed(as));
     GGML_ASSERT(ids->type == GGML_TYPE_I32);
+
+    GGML_ASSERT(as->ne[3] == 1); // as is 3d (one matrix per expert)
+    GGML_ASSERT(b->ne[3] == 1); // b is 3d
     GGML_ASSERT(ids->ne[2] == 1 && ids->ne[3] == 1); // ids is 2d
-    GGML_ASSERT(ids->ne[1] == b->ne[
-    GGML_ASSERT(ids->ne[2] == b->ne[2] && ids->ne[3] == b->ne[3]);
-    GGML_ASSERT(id >= 0 && id < ids->ne[0]); // valid id
+    GGML_ASSERT(ids->ne[1] == b->ne[2]); // must have an expert list per b row
     GGML_ASSERT(as->ne[0] == b->ne[0]); // can_mul_mat
+    GGML_ASSERT(ids->ne[0] % b->ne[1] == 0); // can broadcast
 
     bool is_node = false;
 
@@ -4595,11 +4611,9 @@ struct ggml_tensor * ggml_mul_mat_id(
         is_node = true;
     }
 
-    const int64_t ne[4] = { as->ne[1],
+    const int64_t ne[4] = { as->ne[1], ids->ne[0], b->ne[2], 1 };
     struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
 
-    ggml_set_op_params_i32(result, 0, id);
-
     result->op = GGML_OP_MUL_MAT_ID;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
     result->src[0] = as;
```
```diff
@@ -10810,6 +10824,28 @@ static void ggml_compute_forward_mul_mat(
     }
 #endif
 
+#if GGML_USE_LLAMAFILE
+    if (nb10 == ggml_type_size(src1->type)) {
+        for (int64_t i13 = 0; i13 < ne13; i13++)
+            for (int64_t i12 = 0; i12 < ne12; i12++)
+                if (!llamafile_sgemm(ne01, ne11, ne00/ggml_blck_size(src0->type),
+                                     (const char *)src0->data + i12/r2*nb02 + i13/r3*nb03,
+                                     nb01/ggml_type_size(src0->type),
+                                     (const char *)src1->data + i12*nb12 + i13*nb13,
+                                     nb11/ggml_type_size(src1->type),
+                                     (char *)dst->data + i12*nb2 + i13*nb3,
+                                     nb1/ggml_type_size(dst->type),
+                                     ith, nth,
+                                     params->type,
+                                     src0->type,
+                                     src1->type,
+                                     dst->type))
+                    goto UseGgmlGemm1;
+        return;
+    }
+UseGgmlGemm1:;
+#endif
+
     if (params->type == GGML_TASK_TYPE_INIT) {
         if (ith != 0) {
             return;
@@ -10841,6 +10877,30 @@ static void ggml_compute_forward_mul_mat(
     const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
     const size_t row_size = ggml_row_size(vec_dot_type, ne10);
 
+#if GGML_USE_LLAMAFILE
+    if (nb10 == ggml_type_size(src1->type) || src1->type != vec_dot_type) {
+        for (int64_t i13 = 0; i13 < ne13; i13++)
+            for (int64_t i12 = 0; i12 < ne12; i12++)
+                if (!llamafile_sgemm(ne01, ne11, ne00/ggml_blck_size(src0->type),
+                                     (const char *)src0->data + i12/r2*nb02 + i13/r3*nb03,
+                                     nb01/ggml_type_size(src0->type),
+                                     (const char *)wdata + ggml_row_size(vec_dot_type,
+                                         nb12/ggml_type_size(src1->type)*i12 +
+                                         nb13/ggml_type_size(src1->type)*i13),
+                                     row_size/ggml_type_size(vec_dot_type),
+                                     (char *)dst->data + i12*nb2 + i13*nb3,
+                                     nb1/ggml_type_size(dst->type),
+                                     ith, nth,
+                                     params->type,
+                                     src0->type,
+                                     vec_dot_type,
+                                     dst->type))
+                    goto UseGgmlGemm2;
+        return;
+    }
+UseGgmlGemm2:;
+#endif
+
     const int64_t nr0 = ne01; // src0 rows
     const int64_t nr1 = ne1*ne12*ne13; // src1 rows
 
```
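Both blocks above follow the same dispatch pattern: try the vendored llamafile sgemm kernel on every 2-D slice of the batched matmul, and if it declines a slice, jump past the fast path and let ggml's generic GEMM handle the whole operation. A simplified standalone sketch of that control flow (`fast_gemm` and `generic_gemm` are hypothetical stand-ins, not the real `llamafile_sgemm` signature):

```c
#include <stdbool.h>
#include <stdint.h>

// Hypothetical stand-ins for illustration only.
bool fast_gemm(int64_t slice);     // returns false if it cannot handle this case
void generic_gemm(int64_t slice);  // always works

// "Try the fast kernel, otherwise fall back" pattern used in the diff above.
void matmul_batched(int64_t n_slices) {
    for (int64_t s = 0; s < n_slices; s++) {
        if (!fast_gemm(s)) {
            goto UseGenericGemm;   // one slice was rejected: redo everything generically
        }
    }
    return;                        // the fast path handled every slice

UseGenericGemm:
    for (int64_t s = 0; s < n_slices; s++) {
        generic_gemm(s);
    }
}
```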
```diff
@@ -10958,11 +11018,6 @@ static void ggml_compute_forward_mul_mat_id(
     enum ggml_type const vec_dot_type = type_traits[type].vec_dot_type;
     ggml_from_float_t const from_float_to_vec_dot = type_traits[vec_dot_type].from_float;
 
-    GGML_ASSERT(ne0 == ne01);
-    GGML_ASSERT(ne1 == ne11);
-    GGML_ASSERT(ne2 == ne12);
-    GGML_ASSERT(ne3 == ne13);
-
     // we don't support permuted src0 or src1
     GGML_ASSERT(nb00 == ggml_type_size(type));
     GGML_ASSERT(nb10 == ggml_type_size(src1->type));
@@ -10973,22 +11028,21 @@ static void ggml_compute_forward_mul_mat_id(
     GGML_ASSERT(nb1 <= nb2);
     GGML_ASSERT(nb2 <= nb3);
 
-    // broadcast is not supported with mmid
-    assert(ne12 == 1);
-    assert(ne13 == 1);
-
     // row groups
-    const int
-    const int n_as
+    const int n_ids = ids->ne[0]; // n_expert_used
+    const int n_as = ne02; // n_expert
 
     char * wdata_src1_end = (src1->type == vec_dot_type) ?
             (char *) params->wdata :
             (char *) params->wdata + GGML_PAD(ggml_row_size(vec_dot_type, ggml_nelements(src1)), sizeof(int64_t));
 
-
-
+    struct mmid_row_mapping {
+        int32_t i1;
+        int32_t i2;
+    };
 
-
+    int64_t * matrix_row_counts = (int64_t *) (wdata_src1_end); // [n_as]
+    struct mmid_row_mapping * matrix_rows = (struct mmid_row_mapping *)(matrix_row_counts + n_as); // [n_as][ne11]
 
     if (params->type == GGML_TASK_TYPE_INIT) {
         if (ith != 0) {
@@ -11012,16 +11066,20 @@ static void ggml_compute_forward_mul_mat_id(
         }
 
         // initialize matrix_row_counts
-        GGML_ASSERT(wdata == wdata_src1_end);
         memset(matrix_row_counts, 0, n_as*sizeof(int64_t));
 
+#define MMID_MATRIX_ROW(row_id, i1) matrix_rows[(row_id)*ne12 + (i1)]
+
         // group rows by src0 matrix
-        for (int64_t
-
+        for (int64_t iid1 = 0; iid1 < ids->ne[1]; ++iid1) {
+            for (int id = 0; id < n_ids; ++id) {
+                const int32_t i02 = *(const int32_t *) ((const char *) ids->data + iid1*ids->nb[1] + id*ids->nb[0]);
+
+                assert(i02 >= 0 && i02 < n_as);
 
-
-
-
+                MMID_MATRIX_ROW(i02, matrix_row_counts[i02]) = (struct mmid_row_mapping) {id, iid1};
+                matrix_row_counts[i02] += 1;
+            }
         }
 
         return;
```
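The grouping step above buckets every (expert slot, token) pair under the expert that was selected for it, so each expert's rows can later be processed as one contiguous batch. A standalone sketch of the same idea, using its own fixed array layout instead of ggml's workspace (all names here are illustrative, not part of ggml):

```c
#include <assert.h>
#include <stdint.h>
#include <string.h>

// Simplified stand-in for the per-expert row grouping built above.
struct row_mapping { int32_t expert_slot; int32_t token; };

// ids[token*n_slots + slot] holds the expert chosen for that slot of that token.
// After this runs, rows[e*(n_tokens*n_slots) .. +counts[e]) lists every
// (slot, token) pair that expert e has to multiply.
void group_rows_by_expert(const int32_t *ids, int n_tokens, int n_slots, int n_expert,
                          int64_t *counts, struct row_mapping *rows) {
    memset(counts, 0, n_expert * sizeof(int64_t));
    for (int t = 0; t < n_tokens; t++) {
        for (int s = 0; s < n_slots; s++) {
            const int32_t e = ids[t*n_slots + s];
            assert(e >= 0 && e < n_expert);
            rows[e*(n_tokens*n_slots) + counts[e]] = (struct row_mapping){s, t};
            counts[e]++;
        }
    }
}
```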
```diff
@@ -11039,15 +11097,13 @@ static void ggml_compute_forward_mul_mat_id(
             continue;
         }
 
-
+        const char * src0_cur = (const char *) src0->data + cur_a*nb02;
 
         const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
         const size_t row_size = ggml_row_size(vec_dot_type, ne10);
 
-        const int64_t nr0 = ne01;
-        const int64_t nr1 = cne1
-
-        //printf("nr0 = %lld, nr1 = %lld\n", nr0, nr1);
+        const int64_t nr0 = ne01; // src0 rows
+        const int64_t nr1 = cne1; // src1 rows
 
         // distribute the thread work across the inner or outer loop based on which one is larger
 
@@ -11066,13 +11122,11 @@ static void ggml_compute_forward_mul_mat_id(
         const int64_t ir110 = dr1*ith1;
         const int64_t ir111 = MIN(ir110 + dr1, nr1);
 
-        //printf("ir010 = %6lld, ir011 = %6lld, ir110 = %6lld, ir111 = %6lld\n", ir010, ir011, ir110, ir111);
-
         // threads with no work simply yield (not sure if it helps)
-        if (ir010 >= ir011 || ir110 >= ir111) {
-
-
-        }
+        //if (ir010 >= ir011 || ir110 >= ir111) {
+        //    sched_yield();
+        //    continue;
+        //}
 
         // block-tiling attempt
         const int64_t blck_0 = 16;
@@ -11084,20 +11138,16 @@ static void ggml_compute_forward_mul_mat_id(
         for (int64_t iir1 = ir110; iir1 < ir111; iir1 += blck_1) {
             for (int64_t iir0 = ir010; iir0 < ir011; iir0 += blck_0) {
                 for (int64_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir111; ++ir1) {
-                    const int64_t
-                    const int64_t i12 = (ir1 - i13*ne12*cne1)/cne1;
-                    const int64_t _i11 = (ir1 - i13*ne12*cne1 - i12*cne1);
-                    const int64_t i11 = MMID_MATRIX_ROW(cur_a, _i11);
+                    const int64_t _i12 = ir1; // logical row index for this expert
 
-
-
-                    //const int64_t i02 = i12/r2;
+                    struct mmid_row_mapping row_mapping = MMID_MATRIX_ROW(cur_a, _i12);
+                    const int id = row_mapping.i1; // selected expert index
 
-                    const int64_t
-                    const int64_t i2
-                    const int64_t i3 = i13;
+                    const int64_t i11 = id % ne11;
+                    const int64_t i12 = row_mapping.i2; // row index in src1
 
-                    const
+                    const int64_t i1 = id; // selected expert index
+                    const int64_t i2 = i12; // row
 
                     // desc: when src1 is not a contiguous memory block we have to calculate the offset using the strides
                     // if it is, then we have either copied the data to params->wdata and made it contiguous or we are using
@@ -11105,25 +11155,26 @@ static void ggml_compute_forward_mul_mat_id(
                     // TODO: this is a bit of a hack, we should probably have a better way to handle this
                     const char * src1_col = (const char *) wdata +
                         (src1_cont || src1->type != vec_dot_type
-                        ? (i11 + i12*ne11
-                        : (i11*nb11 + i12*nb12
+                        ? (i11 + i12*ne11)*row_size
+                        : (i11*nb11 + i12*nb12));
 
-                    float * dst_col = (float *) ((char *) dst->data + (i1*nb1 + i2*nb2
+                    float * dst_col = (float *) ((char *) dst->data + (i1*nb1 + i2*nb2));
 
                     //for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) {
                     //    vec_dot(ne00, &dst_col[ir0], src0_row + ir0*nb01, src1_col);
                     //}
 
                     for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) {
-                        vec_dot(ne00, &tmp[ir0 - iir0], 0,
+                        vec_dot(ne00, &tmp[ir0 - iir0], 0, src0_cur + ir0*nb01, 0, src1_col, 0, 1);
                     }
+
                     memcpy(&dst_col[iir0], tmp, (MIN(iir0 + blck_0, ir011) - iir0)*sizeof(float));
                 }
             }
         }
     }
 
-
+#undef MMID_MATRIX_ROW
 }
 
 // ggml_compute_forward_out_prod
@@ -18462,7 +18513,7 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
                     const int n_as = src0->ne[2];
                     cur += GGML_PAD(cur, sizeof(int64_t)); // align
                     cur += n_as * sizeof(int64_t); // matrix_row_counts
-                    cur += n_as * src1->ne[
+                    cur += n_as * src1->ne[2] * sizeof(int64_t); // matrix_rows
                 } break;
             case GGML_OP_OUT_PROD:
                 {
```
```diff
@@ -20550,6 +20601,32 @@ static bool gguf_fread_str(FILE * file, struct gguf_str * p, size_t * offset) {
     return ok;
 }
 
+static void gguf_free_kv(struct gguf_kv * kv) {
+    if (kv->key.data) {
+        GGML_FREE(kv->key.data);
+    }
+
+    if (kv->type == GGUF_TYPE_STRING) {
+        if (kv->value.str.data) {
+            GGML_FREE(kv->value.str.data);
+        }
+    }
+
+    if (kv->type == GGUF_TYPE_ARRAY) {
+        if (kv->value.arr.data) {
+            if (kv->value.arr.type == GGUF_TYPE_STRING) {
+                for (uint64_t j = 0; j < kv->value.arr.n; ++j) {
+                    struct gguf_str * str = &((struct gguf_str *) kv->value.arr.data)[j];
+                    if (str->data) {
+                        GGML_FREE(str->data);
+                    }
+                }
+            }
+            GGML_FREE(kv->value.arr.data);
+        }
+    }
+}
+
 struct gguf_context * gguf_init_empty(void) {
     struct gguf_context * ctx = GGML_ALIGNED_MALLOC(sizeof(struct gguf_context));
 
@@ -20862,12 +20939,12 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
 
         ok = ok && cur != NULL;
 
-        ggml_set_name(cur, ctx->infos[i].name.data);
-
         if (!ok) {
             break;
         }
 
+        ggml_set_name(cur, ctx->infos[i].name.data);
+
         // point the data member to the appropriate location in the binary blob using the tensor infos
         if (!params.no_alloc) {
             //cur->data = (char *) data->data + ctx->infos[i].offset - ctx->offset; // offset from start of file
@@ -20899,31 +20976,7 @@ void gguf_free(struct gguf_context * ctx) {
     if (ctx->kv) {
         // free string memory - not great..
        for (uint64_t i = 0; i < ctx->header.n_kv; ++i) {
-
-
-            if (kv->key.data) {
-                GGML_FREE(kv->key.data);
-            }
-
-            if (kv->type == GGUF_TYPE_STRING) {
-                if (kv->value.str.data) {
-                    GGML_FREE(kv->value.str.data);
-                }
-            }
-
-            if (kv->type == GGUF_TYPE_ARRAY) {
-                if (kv->value.arr.data) {
-                    if (kv->value.arr.type == GGUF_TYPE_STRING) {
-                        for (uint64_t j = 0; j < kv->value.arr.n; ++j) {
-                            struct gguf_str * str = &((struct gguf_str *) kv->value.arr.data)[j];
-                            if (str->data) {
-                                GGML_FREE(str->data);
-                            }
-                        }
-                    }
-                    GGML_FREE(kv->value.arr.data);
-                }
-            }
+            gguf_free_kv(&ctx->kv[i]);
         }
 
         GGML_FREE(ctx->kv);
@@ -21148,6 +21201,19 @@ static int gguf_get_or_add_key(struct gguf_context * ctx, const char * key) {
     return n_kv;
 }
 
+void gguf_remove_key(struct gguf_context * ctx, const char * key) {
+    const int idx = gguf_find_key(ctx, key);
+    if (idx >= 0) {
+        const int n_kv = gguf_get_n_kv(ctx);
+        gguf_free_kv(&ctx->kv[idx]);
+        for (int i = idx; i < n_kv-1; ++i) {
+            ctx->kv[i] = ctx->kv[i+1];
+        }
+        ctx->kv = realloc(ctx->kv, (n_kv - 1) * sizeof(struct gguf_kv));
+        ctx->header.n_kv--;
+    }
+}
+
 void gguf_set_val_u8(struct gguf_context * ctx, const char * key, uint8_t val) {
     const int idx = gguf_get_or_add_key(ctx, key);
 
```
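For reference, a minimal usage sketch of the new `gguf_remove_key` entry point alongside the existing gguf API (the key name and value are made up for illustration):

```c
#include <assert.h>
#include "ggml.h"

int main(void) {
    struct gguf_context * ctx = gguf_init_empty();

    // add a key, then remove it again with the new API
    gguf_set_val_u32(ctx, "example.count", 42);   // hypothetical key name
    assert(gguf_find_key(ctx, "example.count") >= 0);

    gguf_remove_key(ctx, "example.count");
    assert(gguf_find_key(ctx, "example.count") < 0);

    // removing a key that does not exist is a no-op
    gguf_remove_key(ctx, "example.missing");

    gguf_free(ctx);
    return 0;
}
```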
data/vendor/tmp/llama.cpp/ggml.h CHANGED
```diff
@@ -1161,13 +1161,11 @@ extern "C" {
             enum ggml_prec prec);
 
     // indirect matrix multiplication
-    // ggml_mul_mat_id(ctx, as, ids, id, b) ~= ggml_mul_mat(as[ids[id]], b)
     GGML_API struct ggml_tensor * ggml_mul_mat_id(
             struct ggml_context * ctx,
             struct ggml_tensor * as,
-            struct ggml_tensor *
-
-            struct ggml_tensor * b);
+            struct ggml_tensor * b,
+            struct ggml_tensor * ids);
 
     // A: m columns, n rows,
     // B: p columns, n rows,
@@ -2289,6 +2287,9 @@ extern "C" {
     GGML_API char * gguf_get_tensor_name (const struct gguf_context * ctx, int i);
     GGML_API enum ggml_type gguf_get_tensor_type (const struct gguf_context * ctx, int i);
 
+    // removes key if it exists
+    GGML_API void gguf_remove_key(struct gguf_context * ctx, const char * key);
+
     // overrides existing values or adds a new one
     GGML_API void gguf_set_val_u8 (struct gguf_context * ctx, const char * key, uint8_t val);
     GGML_API void gguf_set_val_i8 (struct gguf_context * ctx, const char * key, int8_t val);
```