llama_cpp 0.13.0 → 0.14.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +20 -0
- data/ext/llama_cpp/llama_cpp.cpp +130 -26
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +15 -4
- data/vendor/tmp/llama.cpp/Makefile +30 -15
- data/vendor/tmp/llama.cpp/ggml-alloc.c +45 -64
- data/vendor/tmp/llama.cpp/ggml-alloc.h +13 -5
- data/vendor/tmp/llama.cpp/ggml-backend-impl.h +17 -5
- data/vendor/tmp/llama.cpp/ggml-backend.c +371 -151
- data/vendor/tmp/llama.cpp/ggml-backend.h +54 -29
- data/vendor/tmp/llama.cpp/ggml-common.h +1830 -0
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +765 -830
- data/vendor/tmp/llama.cpp/ggml-impl.h +6 -2
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +7 -2
- data/vendor/tmp/llama.cpp/ggml-metal.m +105 -27
- data/vendor/tmp/llama.cpp/ggml-metal.metal +99 -920
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +2 -2
- data/vendor/tmp/llama.cpp/ggml-quants.c +557 -1129
- data/vendor/tmp/llama.cpp/ggml-quants.h +27 -259
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +3332 -1195
- data/vendor/tmp/llama.cpp/ggml-sycl.h +5 -0
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +39336 -43461
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +1302 -781
- data/vendor/tmp/llama.cpp/ggml-vulkan.h +1 -0
- data/vendor/tmp/llama.cpp/ggml.c +734 -356
- data/vendor/tmp/llama.cpp/ggml.h +91 -51
- data/vendor/tmp/llama.cpp/llama.cpp +1938 -759
- data/vendor/tmp/llama.cpp/llama.h +53 -21
- data/vendor/tmp/llama.cpp/unicode.cpp +1672 -0
- data/vendor/tmp/llama.cpp/unicode.h +16 -774
- metadata +4 -2
data/vendor/tmp/llama.cpp/ggml.c
CHANGED
@@ -320,6 +320,17 @@ static ggml_fp16_t ggml_table_exp_f16[1 << 16];
 // precomputed f32 table for f16 (256 KB) (ggml-impl.h)
 float ggml_table_f32_f16[1 << 16];
 
+const char * ggml_status_to_string(enum ggml_status status) {
+    switch (status) {
+        case GGML_STATUS_ALLOC_FAILED: return "GGML status: error (failed to allocate memory)";
+        case GGML_STATUS_FAILED: return "GGML status: error (operation failed)";
+        case GGML_STATUS_SUCCESS: return "GGML status: success";
+        case GGML_STATUS_ABORTED: return "GGML status: warning (operation aborted)";
+    }
+
+    return "GGML status: unknown";
+}
+
 // note: do not use these inside ggml.c
 // these are meant to be used via the ggml.h API
 float ggml_fp16_to_fp32(ggml_fp16_t x) {
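Note: the new enum ggml_status values and ggml_status_to_string pair up with the changed return type of ggml_graph_compute further down in this diff. A hedged caller-side sketch (compute_or_report is an illustrative name, not part of the library; assumes <stdio.h>):

    // minimal sketch: surface the new status codes to the caller
    static bool compute_or_report(struct ggml_cgraph * graph, struct ggml_cplan * cplan) {
        enum ggml_status st = ggml_graph_compute(graph, cplan);
        if (st != GGML_STATUS_SUCCESS) {
            fprintf(stderr, "ggml compute failed: %s\n", ggml_status_to_string(st));
            return false;
        }
        return true;
    }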
@@ -459,6 +470,19 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .type_size = sizeof(int32_t),
         .is_quantized = false,
     },
+    [GGML_TYPE_I64] = {
+        .type_name = "i64",
+        .blck_size = 1,
+        .type_size = sizeof(int64_t),
+        .is_quantized = false,
+    },
+    [GGML_TYPE_F64] = {
+        .type_name = "f64",
+        .blck_size = 1,
+        .type_size = sizeof(double),
+        .is_quantized = false,
+        .nrows = 1,
+    },
     [GGML_TYPE_F32] = {
         .type_name = "f32",
         .blck_size = 1,
@@ -846,7 +870,7 @@ inline static float vaddvq_f32(float32x4_t v) {
 #define GGML_F16x8 float16x8_t
 #define GGML_F16x8_ZERO vdupq_n_f16(0.0f)
 #define GGML_F16x8_SET1(x) vdupq_n_f16(x)
-#define GGML_F16x8_LOAD(x) vld1q_f16((const
+#define GGML_F16x8_LOAD(x) vld1q_f16((const ggml_fp16_internal_t *)(x))
 #define GGML_F16x8_STORE vst1q_f16
 #define GGML_F16x8_FMA(a, b, c) vfmaq_f16(a, b, c)
 #define GGML_F16x8_ADD vaddq_f16
@@ -889,7 +913,7 @@ inline static float vaddvq_f32(float32x4_t v) {
 #define GGML_F32Cx4 float32x4_t
 #define GGML_F32Cx4_ZERO vdupq_n_f32(0.0f)
 #define GGML_F32Cx4_SET1(x) vdupq_n_f32(x)
-#define GGML_F32Cx4_LOAD(x) vcvt_f32_f16(vld1_f16((const
+#define GGML_F32Cx4_LOAD(x) vcvt_f32_f16(vld1_f16((const ggml_fp16_internal_t *)(x)))
 #define GGML_F32Cx4_STORE(x, y) vst1_f16(x, vcvt_f16_f32(y))
 #define GGML_F32Cx4_FMA(a, b, c) vfmaq_f32(a, b, c)
 #define GGML_F32Cx4_ADD vaddq_f32
@@ -1822,12 +1846,16 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "POOL_2D",
     "UPSCALE",
     "PAD",
+    "ARANGE",
+    "TIMESTEP_EMBEDDING",
     "ARGSORT",
     "LEAKY_RELU",
 
     "FLASH_ATTN",
     "FLASH_FF",
     "FLASH_ATTN_BACK",
+    "SSM_CONV",
+    "SSM_SCAN",
     "WIN_PART",
     "WIN_UNPART",
     "GET_REL_POS",
@@ -1850,7 +1878,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "CROSS_ENTROPY_LOSS_BACK",
 };
 
-static_assert(GGML_OP_COUNT ==
+static_assert(GGML_OP_COUNT == 76, "GGML_OP_COUNT != 76");
 
 static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "none",
@@ -1908,12 +1936,16 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "pool_2d(x)",
     "upscale(x)",
     "pad(x)",
+    "arange(start, stop, step)",
+    "timestep_embedding(timesteps, dim, max_period)",
     "argsort(x)",
     "leaky_relu(x)",
 
     "flash_attn(x)",
     "flash_ff(x)",
     "flash_attn_back(x)",
+    "ssm_conv(x)",
+    "ssm_scan(x)",
     "win_part(x)",
     "win_unpart(x)",
     "get_rel_pos(x)",
@@ -1936,7 +1968,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "cross_entropy_loss_back(x,y)",
 };
 
-static_assert(GGML_OP_COUNT ==
+static_assert(GGML_OP_COUNT == 76, "GGML_OP_COUNT != 76");
 
 static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
 
@@ -2139,7 +2171,10 @@ void ggml_numa_init(enum ggml_numa_strategy numa_flag) {
     getcpu_ret = getcpu(&current_cpu, &g_state.numa.current_node);
 #else
     // old glibc doesn't have a wrapper for this call. Fall back on direct syscall
-
+#   if !defined(SYS_getcpu) && defined(SYS_get_cpu)
+#       define SYS_getcpu SYS_get_cpu // some older glibc versions use this name
+#   endif
+    getcpu_ret = syscall(SYS_getcpu, &current_cpu, &g_state.numa.current_node);
 #endif
 
     if (g_state.numa.n_nodes < 1 || g_state.numa.total_cpus < 1 || getcpu_ret != 0) {
@@ -2895,11 +2930,21 @@ static int32_t ggml_get_op_params_i32(const struct ggml_tensor * tensor, uint32_
     return ((const int32_t *)(tensor->op_params))[i];
 }
 
+static float ggml_get_op_params_f32(const struct ggml_tensor * tensor, uint32_t i) {
+    assert(i < GGML_MAX_OP_PARAMS / sizeof(float));
+    return ((const float *)(tensor->op_params))[i];
+}
+
 static void ggml_set_op_params_i32(struct ggml_tensor * tensor, uint32_t i, int32_t value) {
     assert(i < GGML_MAX_OP_PARAMS / sizeof(int32_t));
     ((int32_t *)(tensor->op_params))[i] = value;
 }
 
+static void ggml_set_op_params_f32(struct ggml_tensor * tensor, uint32_t i, float value) {
+    assert(i < GGML_MAX_OP_PARAMS / sizeof(float));
+    ((float *)(tensor->op_params))[i] = value;
+}
+
 struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor) {
     memset(tensor->data, 0, ggml_nbytes(tensor));
     return tensor;
@@ -5898,6 +5943,55 @@ struct ggml_tensor * ggml_upscale(
     return ggml_upscale_impl(ctx, a, scale_factor);
 }
 
+struct ggml_tensor * ggml_arange(
+        struct ggml_context * ctx,
+        float start,
+        float stop,
+        float step) {
+
+    GGML_ASSERT(stop > start);
+
+    const int64_t steps = (int64_t) ceilf((stop - start) / step);
+
+    struct ggml_tensor * result = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, steps);
+
+    result->op = GGML_OP_ARANGE;
+    ggml_set_op_params_f32(result, 0, start);
+    ggml_set_op_params_f32(result, 1, stop);
+    ggml_set_op_params_f32(result, 2, step);
+
+    return result;
+}
+
+struct ggml_tensor * ggml_timestep_embedding(
+        struct ggml_context * ctx,
+        struct ggml_tensor * timesteps,
+        int dim,
+        int max_period) {
+    bool is_node = false;
+
+    if (timesteps->grad) {
+        GGML_ASSERT(false); // TODO: implement backward
+        is_node = true;
+    }
+
+    int actual_dim = dim;
+    if (dim % 2 != 0) {
+        actual_dim = dim + 1;
+    }
+
+    struct ggml_tensor * result = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, actual_dim, timesteps->ne[0]);
+
+    result->op = GGML_OP_TIMESTEP_EMBEDDING;
+    ggml_set_op_params_i32(result, 0, dim);
+    ggml_set_op_params_i32(result, 1, max_period);
+
+    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->src[0] = timesteps;
+
+    return result;
+}
+
 // ggml_argsort
 
 struct ggml_tensor * ggml_argsort(
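Note: ggml_arange and ggml_timestep_embedding are new graph builders; the values themselves are produced by the forward kernels added later in this diff. A hedged usage sketch (tensor names are illustrative, not from the source):

    // 0, 1, ..., 31 as an F32 tensor, then a {320, 32} sinusoidal embedding table
    struct ggml_tensor * steps = ggml_arange(ctx, 0.0f, 32.0f, 1.0f);
    struct ggml_tensor * emb   = ggml_timestep_embedding(ctx, steps, 320, 10000);
    // per the f32 kernel below, for each timestep t and j < dim/2:
    //   freq         = expf(-logf(max_period) * j / (dim/2));
    //   emb[j]       = cosf(t * freq);
    //   emb[j+dim/2] = sinf(t * freq);
    // an odd dim is rounded up and the extra column is zero-filled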
@@ -6077,6 +6171,108 @@ struct ggml_tensor * ggml_flash_attn_back(
     return result;
 }
 
+// ggml_ssm_conv
+
+struct ggml_tensor * ggml_ssm_conv(
+        struct ggml_context * ctx,
+        struct ggml_tensor * s,
+        struct ggml_tensor * x,
+        struct ggml_tensor * c,
+        struct ggml_tensor * sq) {
+    GGML_ASSERT(ggml_is_3d(s));
+    GGML_ASSERT(ggml_is_matrix(x));
+    GGML_ASSERT(ggml_is_matrix(c));
+    GGML_ASSERT(ggml_is_matrix(sq));
+    GGML_ASSERT(sq->type == GGML_TYPE_I32);
+
+    const int64_t d_conv   = c->ne[0];
+    const int64_t d_inner  = c->ne[1];
+    const int64_t n_tokens = x->ne[1];
+    const int64_t n_kv     = s->ne[2];
+
+    GGML_ASSERT( s->ne[0] == d_conv - 1);
+    GGML_ASSERT( s->ne[1] == d_inner);
+    GGML_ASSERT( x->ne[0] == d_inner);
+    GGML_ASSERT(sq->ne[0] == n_kv);
+    GGML_ASSERT(sq->ne[1] == n_tokens);
+
+    bool is_node = false;
+
+    if (s->grad || x->grad || c->grad || sq->grad) {
+        GGML_ASSERT(false); // TODO: implement
+        is_node = true;
+    }
+
+    // 2-in-1 concatenated x and conv_states, {d_inner, n_tokens} with {d_conv, d_inner, n_kv}
+    struct ggml_tensor * result = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, (d_inner*n_tokens) + (d_conv*d_inner*n_kv));
+
+    result->op = GGML_OP_SSM_CONV;
+    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->src[0] = s;
+    result->src[1] = x;
+    result->src[2] = c;
+    result->src[3] = sq;
+
+    return result;
+}
+
+// ggml_ssm_scan
+
+struct ggml_tensor * ggml_ssm_scan(
+        struct ggml_context * ctx,
+        struct ggml_tensor * s,
+        struct ggml_tensor * x,
+        struct ggml_tensor * dt,
+        struct ggml_tensor * A,
+        struct ggml_tensor * B,
+        struct ggml_tensor * C,
+        struct ggml_tensor * sq) {
+    GGML_ASSERT(ggml_is_contiguous(s));
+    GGML_ASSERT(ggml_is_contiguous(x));
+    GGML_ASSERT(ggml_is_contiguous(dt));
+    GGML_ASSERT(ggml_is_contiguous(A));
+    GGML_ASSERT(sq->type == GGML_TYPE_I32);
+    GGML_ASSERT(B->nb[0] == ggml_type_size(B->type));
+    GGML_ASSERT(C->nb[0] == ggml_type_size(C->type));
+    GGML_ASSERT(ggml_are_same_shape(x, dt));
+
+    {
+        const int64_t d_state  = s->ne[0];
+        const int64_t d_inner  = s->ne[1];
+        const int64_t n_tokens = x->ne[1];
+
+        GGML_ASSERT(x->ne[0] == d_inner);
+        GGML_ASSERT(A->ne[0] == d_state);
+        GGML_ASSERT(A->ne[1] == d_inner);
+        GGML_ASSERT(B->ne[0] == d_state);
+        GGML_ASSERT(B->ne[1] == n_tokens);
+        GGML_ASSERT(C->ne[0] == d_state);
+        GGML_ASSERT(C->ne[1] == n_tokens);
+    }
+
+    bool is_node = false;
+
+    if (s->grad || x->grad || dt->grad || A->grad || B->grad || C->grad || sq->grad) {
+        GGML_ASSERT(false); // TODO: implement
+        is_node = true;
+    }
+
+    // 2-in-1 concatenated y and ssm_states, {d_inner, n_tokens} with {d_state, d_inner, n_kv}
+    struct ggml_tensor * result = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, ggml_nelements(x) + ggml_nelements(s));
+
+    result->op = GGML_OP_SSM_SCAN;
+    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->src[0] = s;
+    result->src[1] = x;
+    result->src[2] = dt;
+    result->src[3] = A;
+    result->src[4] = B;
+    result->src[5] = C;
+    result->src[6] = sq;
+
+    return result;
+}
+
 // ggml_win_part
 
 struct ggml_tensor * ggml_win_part(
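Note: GGML_OP_SSM_SCAN encodes the Mamba-style selective-scan recurrence; the actual kernel is ggml_compute_forward_ssm_scan_f32 later in this diff. A hedged single-channel reference of the same update, for reading alongside the kernel (illustrative helper, assumes <math.h>):

    // state = prev_state * exp(dt*A) + B * (x*dt);  y = state . C
    static float ssm_scan_ref(float * state, const float * A, const float * B, const float * C,
                              float x, float dt, int d_state) {
        const float dt_sp = dt <= 20.0f ? log1pf(expf(dt)) : dt; // softplus, as in the kernel
        const float x_dt  = x * dt_sp;
        float y = 0.0f;
        for (int i = 0; i < d_state; ++i) {
            state[i] = state[i] * expf(dt_sp * A[i]) + B[i] * x_dt;
            y       += state[i] * C[i];
        }
        return y;
    }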
@@ -10231,7 +10427,7 @@ static void ggml_compute_forward_group_norm_f32(
     int n_channels = src0->ne[2];
     int n_groups = dst->op_params[0];
     int n_channels_per_group = (n_channels + n_groups - 1) / n_groups;
-    for (int i = ith; i < n_groups; i+=nth) {
+    for (int i = ith; i < n_groups; i += nth) {
         int start = i * n_channels_per_group;
         int end = start + n_channels_per_group;
         if (end > n_channels) {
@@ -10245,28 +10441,32 @@ static void ggml_compute_forward_group_norm_f32(
             for (int64_t i01 = 0; i01 < ne01; i01++) {
                 const float * x = (float *)((char *) src0->data + i01 * nb01 + i02 * nb02 + i03 * nb03);
 
+                ggml_float sumr = 0.0;
                 for (int64_t i00 = 0; i00 < ne00; i00++) {
-
+                    sumr += (ggml_float)x[i00];
                 }
+                sum += sumr;
             }
         }
-        float mean = sum / (ne00 * ne01 * step);
-        ggml_float sum2 = 0.0;
+        const float mean = sum / (ne00 * ne01 * step);
 
+        ggml_float sum2 = 0.0;
         for (int64_t i02 = start; i02 < end; i02++) {
             for (int64_t i01 = 0; i01 < ne01; i01++) {
                 const float * x = (float *)((char *) src0->data + i01 * nb01 + i02 * nb02 + i03 * nb03);
 
                 float * y = (float *)((char *) dst->data + i01 * nb1 + i02 * nb2 + i03 * nb3);
 
+                ggml_float sumr = 0.0;
                 for (int64_t i00 = 0; i00 < ne00; i00++) {
                     float v = x[i00] - mean;
                     y[i00] = v;
-
+                    sumr += (ggml_float)(v * v);
                 }
+                sum2 += sumr;
             }
         }
-        float variance = sum2 / (ne00 * ne01 * step);
+        const float variance = sum2 / (ne00 * ne01 * step);
         const float scale = 1.0f / sqrtf(variance + eps);
 
         for (int64_t i02 = start; i02 < end; i02++) {
@@ -11373,8 +11573,6 @@ static void ggml_compute_forward_get_rows_q(
     const struct ggml_tensor * src0 = dst->src[0];
     const struct ggml_tensor * src1 = dst->src[1];
 
-    assert(params->ith == 0);
-
     if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
         return;
     }
@@ -11382,7 +11580,7 @@ static void ggml_compute_forward_get_rows_q(
     GGML_TENSOR_BINARY_OP_LOCALS
 
     const int64_t nc = ne00;
-    const int64_t nr = ggml_nelements(src1);
+    const int64_t nr = ggml_nelements(src1);
 
     const enum ggml_type type = src0->type;
     ggml_to_float_t const dequantize_row_q = type_traits[type].to_float;
@@ -11392,17 +11590,25 @@ static void ggml_compute_forward_get_rows_q(
     assert(nb00 == ggml_type_size(type));
     assert(ggml_nrows(dst) == nr);
 
-
-
-    for (int64_t i11 = 0; i11 < ne11; ++i11) {
-        for (int64_t i10 = 0; i10 < ne10; ++i10) {
-            const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
+    const int ith = params->ith;
+    const int nth = params->nth;
 
-
-
-
-
-
+    // rows per thread
+    const int dr = (nr + nth - 1)/nth;
+
+    // row range for this thread
+    const int ir0 = dr*ith;
+    const int ir1 = MIN(ir0 + dr, nr);
+
+    for (int64_t i = ir0; i < ir1; ++i) {
+        const int64_t i12 = i/(ne11*ne10);
+        const int64_t i11 = (i - i12*ne11*ne10)/ne10;
+        const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10);
+        const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
+
+        dequantize_row_q(
+            (const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03),
+                 (float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc);
     }
 }
 
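Note: the get_rows kernel above (and its f16/f32 siblings in the next two hunks) switches from a single-threaded loop to the usual ggml rows-per-thread split. The partitioning scheme, shown in isolation as a hedged sketch of what the diff does:

    // each of nth workers handles a contiguous block of ceil(nr/nth) rows
    const int dr  = (nr + nth - 1)/nth;   // rows per thread, rounded up
    const int ir0 = dr*ith;               // first row for worker ith
    const int ir1 = MIN(ir0 + dr, nr);    // one past the last row, clamped to nr
    for (int64_t i = ir0; i < ir1; ++i) {
        // process row i
    }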
@@ -11413,8 +11619,6 @@ static void ggml_compute_forward_get_rows_f16(
|
|
11413
11619
|
const struct ggml_tensor * src0 = dst->src[0];
|
11414
11620
|
const struct ggml_tensor * src1 = dst->src[1];
|
11415
11621
|
|
11416
|
-
assert(params->ith == 0);
|
11417
|
-
|
11418
11622
|
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
|
11419
11623
|
return;
|
11420
11624
|
}
|
@@ -11422,24 +11626,32 @@ static void ggml_compute_forward_get_rows_f16(
|
|
11422
11626
|
GGML_TENSOR_BINARY_OP_LOCALS
|
11423
11627
|
|
11424
11628
|
const int64_t nc = ne00;
|
11425
|
-
const int64_t nr = ggml_nelements(src1);
|
11629
|
+
const int64_t nr = ggml_nelements(src1);
|
11426
11630
|
|
11427
11631
|
assert(ne0 == nc);
|
11428
11632
|
assert(ne02 == ne11);
|
11429
11633
|
assert(nb00 == sizeof(ggml_fp16_t));
|
11430
11634
|
assert(ggml_nrows(dst) == nr);
|
11431
11635
|
|
11432
|
-
|
11433
|
-
|
11434
|
-
for (int64_t i11 = 0; i11 < ne11; ++i11) {
|
11435
|
-
for (int64_t i10 = 0; i10 < ne10; ++i10) {
|
11436
|
-
const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
|
11636
|
+
const int ith = params->ith;
|
11637
|
+
const int nth = params->nth;
|
11437
11638
|
|
11438
|
-
|
11439
|
-
|
11440
|
-
|
11441
|
-
|
11442
|
-
|
11639
|
+
// rows per thread
|
11640
|
+
const int dr = (nr + nth - 1)/nth;
|
11641
|
+
|
11642
|
+
// row range for this thread
|
11643
|
+
const int ir0 = dr*ith;
|
11644
|
+
const int ir1 = MIN(ir0 + dr, nr);
|
11645
|
+
|
11646
|
+
for (int64_t i = ir0; i < ir1; ++i) {
|
11647
|
+
const int64_t i12 = i/(ne11*ne10);
|
11648
|
+
const int64_t i11 = (i - i12*ne11*ne10)/ne10;
|
11649
|
+
const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10);
|
11650
|
+
const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
|
11651
|
+
|
11652
|
+
ggml_fp16_to_fp32_row(
|
11653
|
+
(const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03),
|
11654
|
+
(float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc);
|
11443
11655
|
}
|
11444
11656
|
}
|
11445
11657
|
|
@@ -11450,8 +11662,6 @@ static void ggml_compute_forward_get_rows_f32(
|
|
11450
11662
|
const struct ggml_tensor * src0 = dst->src[0];
|
11451
11663
|
const struct ggml_tensor * src1 = dst->src[1];
|
11452
11664
|
|
11453
|
-
assert(params->ith == 0);
|
11454
|
-
|
11455
11665
|
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
|
11456
11666
|
return;
|
11457
11667
|
}
|
@@ -11459,24 +11669,32 @@ static void ggml_compute_forward_get_rows_f32(
|
|
11459
11669
|
GGML_TENSOR_BINARY_OP_LOCALS
|
11460
11670
|
|
11461
11671
|
const int64_t nc = ne00;
|
11462
|
-
const int64_t nr = ggml_nelements(src1);
|
11672
|
+
const int64_t nr = ggml_nelements(src1);
|
11463
11673
|
|
11464
11674
|
assert(ne0 == nc);
|
11465
11675
|
assert(ne02 == ne11);
|
11466
11676
|
assert(nb00 == sizeof(float));
|
11467
11677
|
assert(ggml_nrows(dst) == nr);
|
11468
11678
|
|
11469
|
-
|
11470
|
-
|
11471
|
-
for (int64_t i11 = 0; i11 < ne11; ++i11) {
|
11472
|
-
for (int64_t i10 = 0; i10 < ne10; ++i10) {
|
11473
|
-
const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
|
11679
|
+
const int ith = params->ith;
|
11680
|
+
const int nth = params->nth;
|
11474
11681
|
|
11475
|
-
|
11476
|
-
|
11477
|
-
|
11478
|
-
|
11479
|
-
|
11682
|
+
// rows per thread
|
11683
|
+
const int dr = (nr + nth - 1)/nth;
|
11684
|
+
|
11685
|
+
// row range for this thread
|
11686
|
+
const int ir0 = dr*ith;
|
11687
|
+
const int ir1 = MIN(ir0 + dr, nr);
|
11688
|
+
|
11689
|
+
for (int64_t i = ir0; i < ir1; ++i) {
|
11690
|
+
const int64_t i12 = i/(ne11*ne10);
|
11691
|
+
const int64_t i11 = (i - i12*ne11*ne10)/ne10;
|
11692
|
+
const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10);
|
11693
|
+
const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
|
11694
|
+
|
11695
|
+
ggml_vec_cpy_f32(nc,
|
11696
|
+
(float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3),
|
11697
|
+
(float *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03));
|
11480
11698
|
}
|
11481
11699
|
}
|
11482
11700
|
|
@@ -12213,6 +12431,8 @@ static void ggml_compute_forward_alibi(
         case GGML_TYPE_I8:
         case GGML_TYPE_I16:
         case GGML_TYPE_I32:
+        case GGML_TYPE_I64:
+        case GGML_TYPE_F64:
         case GGML_TYPE_COUNT:
             {
                 GGML_ASSERT(false);
@@ -12299,6 +12519,8 @@ static void ggml_compute_forward_clamp(
         case GGML_TYPE_I8:
         case GGML_TYPE_I16:
         case GGML_TYPE_I32:
+        case GGML_TYPE_I64:
+        case GGML_TYPE_F64:
         case GGML_TYPE_COUNT:
             {
                 GGML_ASSERT(false);
@@ -13547,6 +13769,106 @@ static void ggml_compute_forward_pad(
|
|
13547
13769
|
}
|
13548
13770
|
}
|
13549
13771
|
|
13772
|
+
|
13773
|
+
// ggml_compute_forward_arange
|
13774
|
+
|
13775
|
+
static void ggml_compute_forward_arange_f32(
|
13776
|
+
const struct ggml_compute_params * params,
|
13777
|
+
struct ggml_tensor * dst) {
|
13778
|
+
|
13779
|
+
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
|
13780
|
+
return;
|
13781
|
+
}
|
13782
|
+
|
13783
|
+
GGML_ASSERT(dst->nb[0] == sizeof(float));
|
13784
|
+
|
13785
|
+
const int ith = params->ith;
|
13786
|
+
const int nth = params->nth;
|
13787
|
+
|
13788
|
+
const float start = ggml_get_op_params_f32(dst, 0);
|
13789
|
+
const float stop = ggml_get_op_params_f32(dst, 1);
|
13790
|
+
const float step = ggml_get_op_params_f32(dst, 2);
|
13791
|
+
|
13792
|
+
const int64_t steps = (int64_t) ceilf((stop - start) / step);
|
13793
|
+
|
13794
|
+
GGML_ASSERT(ggml_nelements(dst) == steps);
|
13795
|
+
|
13796
|
+
for (int64_t i = ith; i < steps; i+= nth) {
|
13797
|
+
float value = start + step * i;
|
13798
|
+
((float *)dst->data)[i] = value;
|
13799
|
+
}
|
13800
|
+
}
|
13801
|
+
|
13802
|
+
static void ggml_compute_forward_arange(
|
13803
|
+
const struct ggml_compute_params * params,
|
13804
|
+
struct ggml_tensor * dst) {
|
13805
|
+
switch (dst->type) {
|
13806
|
+
case GGML_TYPE_F32:
|
13807
|
+
{
|
13808
|
+
ggml_compute_forward_arange_f32(params, dst);
|
13809
|
+
} break;
|
13810
|
+
default:
|
13811
|
+
{
|
13812
|
+
GGML_ASSERT(false);
|
13813
|
+
} break;
|
13814
|
+
}
|
13815
|
+
}
|
13816
|
+
|
13817
|
+
static void ggml_compute_forward_timestep_embedding_f32(
|
13818
|
+
const struct ggml_compute_params * params,
|
13819
|
+
struct ggml_tensor * dst) {
|
13820
|
+
|
13821
|
+
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
|
13822
|
+
return;
|
13823
|
+
}
|
13824
|
+
|
13825
|
+
const struct ggml_tensor * src0 = dst->src[0];
|
13826
|
+
|
13827
|
+
GGML_ASSERT(src0->nb[0] == sizeof(float));
|
13828
|
+
|
13829
|
+
const int ith = params->ith;
|
13830
|
+
const int nth = params->nth;
|
13831
|
+
|
13832
|
+
GGML_TENSOR_UNARY_OP_LOCALS
|
13833
|
+
|
13834
|
+
const int dim = ggml_get_op_params_i32(dst, 0);
|
13835
|
+
const int max_period = ggml_get_op_params_i32(dst, 1);
|
13836
|
+
|
13837
|
+
int half = dim / 2;
|
13838
|
+
|
13839
|
+
for (int64_t i = 0; i < ne00; i++) {
|
13840
|
+
float * embed_data = (float *)((char *) dst->data + i*nb1);
|
13841
|
+
for (int64_t j = ith; j < half; j += nth) {
|
13842
|
+
float timestep = ((float *)src0->data)[i];
|
13843
|
+
float freq = (float)expf(-logf(max_period) * j / half);
|
13844
|
+
float arg = timestep * freq;
|
13845
|
+
embed_data[j] = cosf(arg);
|
13846
|
+
embed_data[j + half] = sinf(arg);
|
13847
|
+
}
|
13848
|
+
if (dim % 2 != 0 && ith == 0) {
|
13849
|
+
embed_data[dim] = 0.f;
|
13850
|
+
}
|
13851
|
+
}
|
13852
|
+
}
|
13853
|
+
|
13854
|
+
static void ggml_compute_forward_timestep_embedding(
|
13855
|
+
const struct ggml_compute_params * params,
|
13856
|
+
struct ggml_tensor * dst) {
|
13857
|
+
|
13858
|
+
const struct ggml_tensor * src0 = dst->src[0];
|
13859
|
+
|
13860
|
+
switch (src0->type) {
|
13861
|
+
case GGML_TYPE_F32:
|
13862
|
+
{
|
13863
|
+
ggml_compute_forward_timestep_embedding_f32(params, dst);
|
13864
|
+
} break;
|
13865
|
+
default:
|
13866
|
+
{
|
13867
|
+
GGML_ASSERT(false);
|
13868
|
+
} break;
|
13869
|
+
}
|
13870
|
+
}
|
13871
|
+
|
13550
13872
|
// ggml_compute_forward_argsort
|
13551
13873
|
|
13552
13874
|
static void ggml_compute_forward_argsort_f32(
|
@@ -14590,6 +14912,257 @@ static void ggml_compute_forward_flash_attn_back(
|
|
14590
14912
|
}
|
14591
14913
|
}
|
14592
14914
|
|
14915
|
+
// ggml_compute_forward_ssm_conv
|
14916
|
+
|
14917
|
+
static void ggml_compute_forward_ssm_conv_f32(
|
14918
|
+
const struct ggml_compute_params * params,
|
14919
|
+
struct ggml_tensor * dst) {
|
14920
|
+
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
|
14921
|
+
return;
|
14922
|
+
}
|
14923
|
+
|
14924
|
+
const struct ggml_tensor * src0 = dst->src[0]; // conv_state
|
14925
|
+
const struct ggml_tensor * src1 = dst->src[1]; // x
|
14926
|
+
const struct ggml_tensor * src2 = dst->src[2]; // conv1d.weight
|
14927
|
+
const struct ggml_tensor * src3 = dst->src[3]; // state_seq
|
14928
|
+
|
14929
|
+
const int ith = params->ith;
|
14930
|
+
const int nth = params->nth;
|
14931
|
+
|
14932
|
+
const int nc = src2->ne[0]; // d_conv
|
14933
|
+
const int nr = src0->ne[1]; // d_inner
|
14934
|
+
const int n_t = src1->ne[1]; // n_tokens
|
14935
|
+
const int n_kv = src0->ne[2]; // max number of sequences in the batch
|
14936
|
+
|
14937
|
+
GGML_ASSERT((nr*n_t) + (nc*nr*n_kv) == ggml_nelements(dst));
|
14938
|
+
GGML_ASSERT(src0->nb[0] == sizeof(float));
|
14939
|
+
GGML_ASSERT(src1->nb[0] == sizeof(float));
|
14940
|
+
GGML_ASSERT(src2->nb[0] == sizeof(float));
|
14941
|
+
GGML_ASSERT(src3->nb[0] == sizeof(int32_t));
|
14942
|
+
GGML_ASSERT(src0->nb[1] == src0->ne[0]*sizeof(float));
|
14943
|
+
// for use with the destination state offset between sequences
|
14944
|
+
GGML_ASSERT(src2->nb[2] == src2->ne[1]*src2->ne[0]*sizeof(float));
|
14945
|
+
|
14946
|
+
// rows per thread
|
14947
|
+
const int dr = (nr + nth - 1)/nth;
|
14948
|
+
|
14949
|
+
// row range for this thread
|
14950
|
+
const int ir0 = dr*ith;
|
14951
|
+
const int ir1 = MIN(ir0 + dr, nr);
|
14952
|
+
const int ir = ir1 - ir0;
|
14953
|
+
|
14954
|
+
if (n_kv > 1) {
|
14955
|
+
// multiple sequences means it's hard to know when it's the first time a state is read,
|
14956
|
+
// so copy them all over to the destination, just to be sure.
|
14957
|
+
for (int i3 = 0; i3 < n_kv; ++i3) {
|
14958
|
+
float * s0 = (float *) ((char *) src0->data + ir0*(src0->nb[1]) + i3*(src0->nb[2]));
|
14959
|
+
float * s = (float *) ((char *) dst->data + ir0*(src2->nb[1]) + i3*(src2->nb[2]) + nr*n_t*sizeof(float));
|
14960
|
+
// can't use memcpy because of d_conv vs d_conv - 1
|
14961
|
+
for (int i1 = 0; i1 < ir; ++i1) {
|
14962
|
+
for (int i0 = 0; i0 < nc - 1; ++i0) {
|
14963
|
+
// copy s0 to last (d_conv - 1) columns of s
|
14964
|
+
s[1 + i0 + i1*nc] = s0[i0 + i1*(nc - 1)];
|
14965
|
+
}
|
14966
|
+
}
|
14967
|
+
}
|
14968
|
+
}
|
14969
|
+
|
14970
|
+
for (int i2 = 0; i2 < n_t; ++i2) {
|
14971
|
+
int32_t * sq = (int32_t *) ((char *) src3->data + i2*(src3->nb[1])); // {n_kv, n_tokens}
|
14972
|
+
float * x = (float *) ((char *) dst->data + ir0*sizeof(float) + i2*(nr*sizeof(float))); // {d_inner, n_tokens}
|
14973
|
+
float * s = (float *) ((char *) dst->data + ir0*(src2->nb[1]) + sq[0]*(src2->nb[2]) + nr*n_t*sizeof(float)); // {d_conv, d_inner, n_kv}
|
14974
|
+
float * s0; // {d_conv - 1, d_inner, n_kv}
|
14975
|
+
float * x0 = (float *) ((char *) src1->data + ir0*(src1->nb[0]) + i2*(src1->nb[1])); // {d_inner, n_tokens}
|
14976
|
+
float * c = (float *) ((char *) src2->data + ir0*(src2->nb[1])); // {d_conv, d_inner}
|
14977
|
+
int ne0s0;
|
14978
|
+
|
14979
|
+
GGML_ASSERT(0 <= sq[0] && sq[0] < n_kv);
|
14980
|
+
|
14981
|
+
// avoid needing to copy the state for the first token
|
14982
|
+
if (i2 == 0) {
|
14983
|
+
s0 = (float *) ((char *) src0->data + ir0*(src0->nb[1]) + sq[0]*(src0->nb[2])); // {d_conv - 1, d_inner, n_kv}
|
14984
|
+
ne0s0 = src0->ne[0];
|
14985
|
+
} else {
|
14986
|
+
// the source is the last (d_conv - 1) columns of the destination
|
14987
|
+
s0 = s + 1;
|
14988
|
+
ne0s0 = nc;
|
14989
|
+
}
|
14990
|
+
|
14991
|
+
// d_inner
|
14992
|
+
for (int i1 = 0; i1 < ir; ++i1) {
|
14993
|
+
// shift state left
|
14994
|
+
for (int i0 = 0; i0 < nc - 1; ++i0) {
|
14995
|
+
s[i0 + i1*nc] = s0[i0 + i1*ne0s0];
|
14996
|
+
}
|
14997
|
+
// insert x on the last column
|
14998
|
+
s[(nc - 1) + i1*nc] = x0[i1];
|
14999
|
+
}
|
15000
|
+
|
15001
|
+
// handle copies when there are multiple output states
|
15002
|
+
for (int i3 = 1; i3 < n_kv; ++i3) {
|
15003
|
+
int32_t seq = sq[i3];
|
15004
|
+
if (0 <= seq && seq < n_kv) {
|
15005
|
+
float * s1 = s + (seq - sq[0])*nc*nr;
|
15006
|
+
memcpy(s1, s, nc*ir*sizeof(float));
|
15007
|
+
} else {
|
15008
|
+
// stop at negative or too big seq_ids
|
15009
|
+
break;
|
15010
|
+
}
|
15011
|
+
}
|
15012
|
+
|
15013
|
+
// it seems a little faster when this is separate from the state shift
|
15014
|
+
for (int i1 = 0; i1 < ir; ++i1) {
|
15015
|
+
// rowwise dot product
|
15016
|
+
float sumf = 0.0f;
|
15017
|
+
for (int i0 = 0; i0 < nc; ++i0) {
|
15018
|
+
int i = i0 + i1*nc;
|
15019
|
+
sumf += s[i] * c[i];
|
15020
|
+
}
|
15021
|
+
x[i1] = sumf;
|
15022
|
+
}
|
15023
|
+
}
|
15024
|
+
}
|
15025
|
+
|
15026
|
+
static void ggml_compute_forward_ssm_conv(
|
15027
|
+
const struct ggml_compute_params * params,
|
15028
|
+
struct ggml_tensor * dst) {
|
15029
|
+
switch (dst->src[0]->type) {
|
15030
|
+
case GGML_TYPE_F32:
|
15031
|
+
{
|
15032
|
+
ggml_compute_forward_ssm_conv_f32(params, dst);
|
15033
|
+
} break;
|
15034
|
+
default:
|
15035
|
+
{
|
15036
|
+
GGML_ASSERT(false);
|
15037
|
+
} break;
|
15038
|
+
}
|
15039
|
+
}
|
15040
|
+
|
15041
|
+
// ggml_compute_forward_ssm_scan
|
15042
|
+
|
15043
|
+
static void ggml_compute_forward_ssm_scan_f32(
|
15044
|
+
const struct ggml_compute_params * params,
|
15045
|
+
struct ggml_tensor * dst) {
|
15046
|
+
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
|
15047
|
+
return;
|
15048
|
+
}
|
15049
|
+
|
15050
|
+
const struct ggml_tensor * src0 = dst->src[0]; // s
|
15051
|
+
const struct ggml_tensor * src1 = dst->src[1]; // x
|
15052
|
+
const struct ggml_tensor * src2 = dst->src[2]; // dt
|
15053
|
+
const struct ggml_tensor * src3 = dst->src[3]; // A
|
15054
|
+
const struct ggml_tensor * src4 = dst->src[4]; // B
|
15055
|
+
const struct ggml_tensor * src5 = dst->src[5]; // C
|
15056
|
+
const struct ggml_tensor * src6 = dst->src[6]; // sq
|
15057
|
+
|
15058
|
+
const int ith = params->ith;
|
15059
|
+
const int nth = params->nth;
|
15060
|
+
|
15061
|
+
const int64_t nc = src0->ne[0]; // d_state
|
15062
|
+
const int64_t nr = src0->ne[1]; // d_inner
|
15063
|
+
const int64_t n_t = src1->ne[1]; // number of tokens in the batch
|
15064
|
+
const int64_t n_kv = src0->ne[2]; // max number of sequences in the batch
|
15065
|
+
|
15066
|
+
GGML_ASSERT(ggml_nelements(src1) + ggml_nelements(src0) == ggml_nelements(dst));
|
15067
|
+
GGML_ASSERT(src0->nb[0] == sizeof(float));
|
15068
|
+
GGML_ASSERT(src1->nb[0] == sizeof(float));
|
15069
|
+
GGML_ASSERT(src2->nb[0] == sizeof(float));
|
15070
|
+
GGML_ASSERT(src3->nb[0] == sizeof(float));
|
15071
|
+
GGML_ASSERT(src4->nb[0] == sizeof(float));
|
15072
|
+
GGML_ASSERT(src5->nb[0] == sizeof(float));
|
15073
|
+
// required for the dot product between s and C, and when copying the states
|
15074
|
+
GGML_ASSERT(src0->nb[1] == src0->ne[0]*sizeof(float));
|
15075
|
+
// required for per-sequence offsets for states
|
15076
|
+
GGML_ASSERT(src0->nb[2] == src0->ne[0]*src0->ne[1]*sizeof(float));
|
15077
|
+
// required to get correct offset for state destination (i.e. src1->nb[2])
|
15078
|
+
GGML_ASSERT(src1->nb[2] == src1->ne[0]*src1->ne[1]*sizeof(float));
|
15079
|
+
|
15080
|
+
// rows per thread
|
15081
|
+
const int dr = (nr + nth - 1)/nth;
|
15082
|
+
|
15083
|
+
// row range for this thread
|
15084
|
+
const int ir0 = dr*ith;
|
15085
|
+
const int ir1 = MIN(ir0 + dr, nr);
|
15086
|
+
const int ir = ir1 - ir0;
|
15087
|
+
|
15088
|
+
if (n_kv > 1) {
|
15089
|
+
// it's hard to know if the source states have already been copied
|
15090
|
+
// when there are multiple, so copy them already.
|
15091
|
+
for (int i3 = 0; i3 < n_kv; ++i3) {
|
15092
|
+
float * s0 = (float *) ((char *) src0->data + ir0*(src0->nb[1]) + i3*(src0->nb[2]));
|
15093
|
+
float * s = (float *) ((char *) dst->data + ir0*(src0->nb[1]) + i3*(src0->nb[2]) + src1->nb[2]);
|
15094
|
+
memcpy(s, s0, nc*ir*sizeof(float));
|
15095
|
+
}
|
15096
|
+
}
|
15097
|
+
|
15098
|
+
for (int i2 = 0; i2 < n_t; ++i2) {
|
15099
|
+
int32_t * sq = (int32_t *) ((char *) src6->data + i2*(src6->nb[1])); // {n_kv, n_tokens}
|
15100
|
+
float * y = (float *) ((char *) dst->data + ir0*(src1->nb[0]) + i2*(src1->nb[1])); // {d_inner, n_tokens}
|
15101
|
+
float * s = (float *) ((char *) dst->data + ir0*(src0->nb[1]) + sq[0]*(src0->nb[2]) + src1->nb[2]); // {d_state, d_inner, n_kv}
|
15102
|
+
float * s0;
|
15103
|
+
float * x = (float *) ((char *) src1->data + ir0*(src1->nb[0]) + i2*(src1->nb[1])); // {d_inner, n_tokens}
|
15104
|
+
float * dt = (float *) ((char *) src2->data + ir0*(src2->nb[0]) + i2*(src2->nb[1])); // {d_inner, n_tokens}
|
15105
|
+
float * A = (float *) ((char *) src3->data + ir0*(src3->nb[1])); // {d_state, d_inner}
|
15106
|
+
float * B = (float *) ((char *) src4->data + i2*(src4->nb[1])); // {d_state, n_tokens}
|
15107
|
+
float * C = (float *) ((char *) src5->data + i2*(src5->nb[1])); // {d_state, n_tokens}
|
15108
|
+
|
15109
|
+
GGML_ASSERT(0 <= sq[0] && sq[0] < n_kv);
|
15110
|
+
|
15111
|
+
// avoid needing to copy the state for the first token
|
15112
|
+
if (i2 == 0) {
|
15113
|
+
s0 = (float *) ((char *) src0->data + ir0*(src0->nb[1]) + sq[0]*(src0->nb[2])); // {d_state, d_inner, n_kv}
|
15114
|
+
} else {
|
15115
|
+
// otherwise the source is the same as the destination
|
15116
|
+
s0 = s;
|
15117
|
+
}
|
15118
|
+
|
15119
|
+
// d_inner
|
15120
|
+
for (int i1 = 0; i1 < ir; ++i1) {
|
15121
|
+
// ref: https://github.com/state-spaces/mamba/blob/34076d664838588a3c97727b263478ab9f621a07/mamba_ssm/ops/triton/selective_state_update.py#L78
|
15122
|
+
float dt_soft_plus = dt[i1] <= 20.0f ? log1pf(expf(dt[i1])) : dt[i1];
|
15123
|
+
float x_dt = x[i1] * dt_soft_plus;
|
15124
|
+
float sumf = 0.0f;
|
15125
|
+
// d_state
|
15126
|
+
for (int i0 = 0; i0 < nc; ++i0) {
|
15127
|
+
int i = i0 + i1*nc;
|
15128
|
+
// state = prev_state * dA + dB * x
|
15129
|
+
float state = (s0[i] * expf(dt_soft_plus * A[i])) + (B[i0] * x_dt);
|
15130
|
+
// y = rowwise_dotprod(state, C)
|
15131
|
+
sumf += state * C[i0];
|
15132
|
+
s[i] = state;
|
15133
|
+
}
|
15134
|
+
y[i1] = sumf;
|
15135
|
+
}
|
15136
|
+
|
15137
|
+
// handle copies when there are multiple output states
|
15138
|
+
for (int i3 = 1; i3 < n_kv; ++i3) {
|
15139
|
+
int32_t seq = sq[i3];
|
15140
|
+
if (0 <= seq && seq < n_kv) {
|
15141
|
+
float * s1 = s + (seq - sq[0])*nc*nr;
|
15142
|
+
memcpy(s1, s, nc*ir*sizeof(float));
|
15143
|
+
} else {
|
15144
|
+
// stop at negative or too big seq_ids
|
15145
|
+
break;
|
15146
|
+
}
|
15147
|
+
}
|
15148
|
+
}
|
15149
|
+
}
|
15150
|
+
|
15151
|
+
static void ggml_compute_forward_ssm_scan(
|
15152
|
+
const struct ggml_compute_params * params,
|
15153
|
+
struct ggml_tensor * dst) {
|
15154
|
+
switch (dst->src[0]->type) {
|
15155
|
+
case GGML_TYPE_F32:
|
15156
|
+
{
|
15157
|
+
ggml_compute_forward_ssm_scan_f32(params, dst);
|
15158
|
+
} break;
|
15159
|
+
default:
|
15160
|
+
{
|
15161
|
+
GGML_ASSERT(false);
|
15162
|
+
} break;
|
15163
|
+
}
|
15164
|
+
}
|
15165
|
+
|
14593
15166
|
// ggml_compute_forward_win_part
|
14594
15167
|
|
14595
15168
|
static void ggml_compute_forward_win_part_f32(
|
@@ -15615,6 +16188,14 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
             {
                 ggml_compute_forward_pad(params, tensor);
             } break;
+        case GGML_OP_ARANGE:
+            {
+                ggml_compute_forward_arange(params, tensor);
+            } break;
+        case GGML_OP_TIMESTEP_EMBEDDING:
+            {
+                ggml_compute_forward_timestep_embedding(params, tensor);
+            } break;
         case GGML_OP_ARGSORT:
             {
                 ggml_compute_forward_argsort(params, tensor);
@@ -15641,6 +16222,14 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
                 bool masked = t != 0;
                 ggml_compute_forward_flash_attn_back(params, masked, tensor);
             } break;
+        case GGML_OP_SSM_CONV:
+            {
+                ggml_compute_forward_ssm_conv(params, tensor);
+            } break;
+        case GGML_OP_SSM_SCAN:
+            {
+                ggml_compute_forward_ssm_scan(params, tensor);
+            } break;
         case GGML_OP_WIN_PART:
             {
                 ggml_compute_forward_win_part(params, tensor);
@@ -16617,6 +17206,14 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
             {
                 GGML_ASSERT(false); // TODO: not implemented
             } break;
+        case GGML_OP_ARANGE:
+            {
+                GGML_ASSERT(false); // TODO: not implemented
+            } break;
+        case GGML_OP_TIMESTEP_EMBEDDING:
+            {
+                GGML_ASSERT(false); // TODO: not implemented
+            } break;
         case GGML_OP_ARGSORT:
             {
                 GGML_ASSERT(false); // TODO: not implemented
@@ -16687,6 +17284,11 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
             {
                 GGML_ASSERT(false); // not supported
             } break;
+        case GGML_OP_SSM_CONV:
+        case GGML_OP_SSM_SCAN:
+            {
+                GGML_ASSERT(false); // TODO: not implemented
+            } break;
         case GGML_OP_WIN_PART:
         case GGML_OP_WIN_UNPART:
         case GGML_OP_UNARY:
@@ -17217,6 +17819,7 @@ struct ggml_compute_state {
|
|
17217
17819
|
ggml_thread_t thrd;
|
17218
17820
|
int ith;
|
17219
17821
|
struct ggml_compute_state_shared * shared;
|
17822
|
+
enum ggml_status ec;
|
17220
17823
|
};
|
17221
17824
|
|
17222
17825
|
static void ggml_graph_compute_perf_stats_node(struct ggml_tensor * node, const struct ggml_compute_state_shared * st) {
|
@@ -17228,7 +17831,7 @@ static void ggml_graph_compute_perf_stats_node(struct ggml_tensor * node, const
|
|
17228
17831
|
node->perf_time_us += time_us_cur;
|
17229
17832
|
}
|
17230
17833
|
|
17231
|
-
static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
|
17834
|
+
static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads, int n_cur_threads) {
|
17232
17835
|
int n_tasks = 0;
|
17233
17836
|
|
17234
17837
|
switch (node->op) {
|
@@ -17309,6 +17912,12 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
|
|
17309
17912
|
{
|
17310
17913
|
n_tasks = n_threads;
|
17311
17914
|
} break;
|
17915
|
+
case GGML_OP_GET_ROWS:
|
17916
|
+
{
|
17917
|
+
// FIXME: the cost of launching additional threads decreases performance with GPU offloading
|
17918
|
+
//n_tasks = MIN(n_threads, ggml_nelements(node->src[1]));
|
17919
|
+
n_tasks = MIN(n_cur_threads, ggml_nelements(node->src[1]));
|
17920
|
+
} break;
|
17312
17921
|
case GGML_OP_SCALE:
|
17313
17922
|
case GGML_OP_SET:
|
17314
17923
|
case GGML_OP_CONT:
|
@@ -17316,7 +17925,6 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
|
|
17316
17925
|
case GGML_OP_VIEW:
|
17317
17926
|
case GGML_OP_PERMUTE:
|
17318
17927
|
case GGML_OP_TRANSPOSE:
|
17319
|
-
case GGML_OP_GET_ROWS:
|
17320
17928
|
case GGML_OP_GET_ROWS_BACK:
|
17321
17929
|
case GGML_OP_DIAG:
|
17322
17930
|
{
|
@@ -17368,6 +17976,14 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
|
|
17368
17976
|
{
|
17369
17977
|
n_tasks = n_threads;
|
17370
17978
|
} break;
|
17979
|
+
case GGML_OP_ARANGE:
|
17980
|
+
{
|
17981
|
+
n_tasks = n_threads;
|
17982
|
+
} break;
|
17983
|
+
case GGML_OP_TIMESTEP_EMBEDDING:
|
17984
|
+
{
|
17985
|
+
n_tasks = n_threads;
|
17986
|
+
} break;
|
17371
17987
|
case GGML_OP_ARGSORT:
|
17372
17988
|
{
|
17373
17989
|
n_tasks = n_threads;
|
@@ -17384,6 +18000,11 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
|
|
17384
18000
|
{
|
17385
18001
|
n_tasks = n_threads;
|
17386
18002
|
} break;
|
18003
|
+
case GGML_OP_SSM_CONV:
|
18004
|
+
case GGML_OP_SSM_SCAN:
|
18005
|
+
{
|
18006
|
+
n_tasks = n_threads;
|
18007
|
+
} break;
|
17387
18008
|
case GGML_OP_WIN_PART:
|
17388
18009
|
case GGML_OP_WIN_UNPART:
|
17389
18010
|
case GGML_OP_GET_REL_POS:
|
@@ -17502,7 +18123,8 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
|
|
17502
18123
|
while (true) {
|
17503
18124
|
if (cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) {
|
17504
18125
|
state->shared->node_n += 1;
|
17505
|
-
|
18126
|
+
state->ec = GGML_STATUS_ABORTED;
|
18127
|
+
return 0;
|
17506
18128
|
}
|
17507
18129
|
|
17508
18130
|
if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) {
|
@@ -17520,7 +18142,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
|
|
17520
18142
|
/* FINALIZE */
|
17521
18143
|
struct ggml_tensor * node = cgraph->nodes[node_n];
|
17522
18144
|
if (GGML_OP_HAS_FINALIZE[node->op]) {
|
17523
|
-
params.nth = ggml_get_n_tasks(node, n_threads);
|
18145
|
+
params.nth = ggml_get_n_tasks(node, n_threads, state->shared->n_threads);
|
17524
18146
|
ggml_compute_forward(¶ms, node);
|
17525
18147
|
}
|
17526
18148
|
ggml_graph_compute_perf_stats_node(node, state->shared);
|
@@ -17530,7 +18152,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
|
|
17530
18152
|
while (++node_n < cgraph->n_nodes) {
|
17531
18153
|
GGML_PRINT_DEBUG_5("%s: %d/%d\n", __func__, node_n, cgraph->n_nodes);
|
17532
18154
|
struct ggml_tensor * node = cgraph->nodes[node_n];
|
17533
|
-
const int n_tasks = ggml_get_n_tasks(node, n_threads);
|
18155
|
+
const int n_tasks = ggml_get_n_tasks(node, n_threads, state->shared->n_threads);
|
17534
18156
|
|
17535
18157
|
state->shared->perf_node_start_cycles = ggml_perf_cycles();
|
17536
18158
|
state->shared->perf_node_start_time_us = ggml_perf_time_us();
|
@@ -17578,7 +18200,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
|
|
17578
18200
|
|
17579
18201
|
/* INIT & COMPUTE */
|
17580
18202
|
struct ggml_tensor * node = cgraph->nodes[node_n];
|
17581
|
-
const int n_tasks = ggml_get_n_tasks(node, n_threads);
|
18203
|
+
const int n_tasks = ggml_get_n_tasks(node, n_threads, state->shared->n_threads);
|
17582
18204
|
|
17583
18205
|
struct ggml_compute_params params = {
|
17584
18206
|
/*.type =*/ GGML_TASK_TYPE_INIT,
|
@@ -17624,7 +18246,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
|
|
17624
18246
|
}
|
17625
18247
|
}
|
17626
18248
|
|
17627
|
-
return
|
18249
|
+
return 0;
|
17628
18250
|
}
|
17629
18251
|
|
17630
18252
|
struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threads) {
|
@@ -17643,7 +18265,7 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
|
|
17643
18265
|
for (int i = 0; i < cgraph->n_nodes; i++) {
|
17644
18266
|
struct ggml_tensor * node = cgraph->nodes[i];
|
17645
18267
|
|
17646
|
-
const int n_tasks = ggml_get_n_tasks(node, n_threads);
|
18268
|
+
const int n_tasks = ggml_get_n_tasks(node, n_threads, 1);
|
17647
18269
|
|
17648
18270
|
max_tasks = MAX(max_tasks, n_tasks);
|
17649
18271
|
|
@@ -17820,7 +18442,7 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
     return cplan;
 }
 
-
+enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
     {
         GGML_ASSERT(cplan);
         GGML_ASSERT(cplan->n_threads > 0);
@@ -17864,6 +18486,7 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
             .thrd = 0,
             .ith = j,
             .shared = &state_shared,
+            .ec = GGML_STATUS_SUCCESS,
         };
 
         const int rc = ggml_thread_create(&workers[j].thrd, NULL, ggml_graph_compute_thread, &workers[j]);
@@ -17874,12 +18497,14 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
 
     workers[0].ith = 0;
     workers[0].shared = &state_shared;
+    workers[0].ec = GGML_STATUS_SUCCESS;
 
     const int64_t perf_start_cycles = ggml_perf_cycles();
     const int64_t perf_start_time_us = ggml_perf_time_us();
 
     // this is a work thread too
-
+    ggml_graph_compute_thread(&workers[0]);
+    enum ggml_status compute_status = workers[0].ec;
 
     // don't leave affinity set on the main thread
     clear_numa_thread_affinity();
@@ -17889,6 +18514,8 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
         for (int j = 1; j < n_threads; j++) {
             const int rc = ggml_thread_join(workers[j].thrd, NULL);
             GGML_ASSERT(rc == 0);
+            if (workers[j].ec != GGML_STATUS_SUCCESS)
+                compute_status = workers[j].ec;
         }
     }
 
@@ -17916,14 +18543,14 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
     return compute_status;
 }
 
-
+enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads) {
     struct ggml_cplan cplan = ggml_graph_plan(cgraph, n_threads);
 
     struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_TYPE_WORK_BUFFER, cplan.work_size);
 
    cplan.work_data = (uint8_t *)ctx->mem_buffer + obj->offs;
 
-    ggml_graph_compute(cgraph, &cplan);
+    return ggml_graph_compute(cgraph, &cplan);
 }
 
 struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name) {
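Note: with each worker now recording its outcome in state->ec and the main thread aggregating the values, both entry points report an enum ggml_status instead of a bare int, so an abort raised through cplan->abort_callback becomes visible to the caller. Hedged usage sketch (variable names are illustrative):

    enum ggml_status st = ggml_graph_compute_with_ctx(ctx, graph, /*n_threads=*/4);
    if (st == GGML_STATUS_ABORTED) {
        // a worker observed the abort callback; treat the graph results as incomplete
    }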
@@ -19572,133 +20199,6 @@ void ggml_quantize_free(void) {
|
|
19572
20199
|
ggml_critical_section_end();
|
19573
20200
|
}
|
19574
20201
|
|
19575
|
-
size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist) {
|
19576
|
-
assert(k % QK4_0 == 0);
|
19577
|
-
const int nb = k / QK4_0;
|
19578
|
-
|
19579
|
-
for (int b = 0; b < n; b += k) {
|
19580
|
-
block_q4_0 * restrict y = (block_q4_0 *) dst + b/QK4_0;
|
19581
|
-
|
19582
|
-
quantize_row_q4_0_reference(src + b, y, k);
|
19583
|
-
|
19584
|
-
for (int i = 0; i < nb; i++) {
|
19585
|
-
for (int j = 0; j < QK4_0; j += 2) {
|
19586
|
-
const uint8_t vi0 = y[i].qs[j/2] & 0x0F;
|
19587
|
-
const uint8_t vi1 = y[i].qs[j/2] >> 4;
|
19588
|
-
|
19589
|
-
hist[vi0]++;
|
19590
|
-
hist[vi1]++;
|
19591
|
-
}
|
19592
|
-
}
|
19593
|
-
}
|
19594
|
-
|
19595
|
-
return (n/QK4_0*sizeof(block_q4_0));
|
19596
|
-
}
|
19597
|
-
|
19598
|
-
size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist) {
|
19599
|
-
assert(k % QK4_1 == 0);
|
19600
|
-
const int nb = k / QK4_1;
|
19601
|
-
|
19602
|
-
for (int b = 0; b < n; b += k) {
|
19603
|
-
block_q4_1 * restrict y = (block_q4_1 *) dst + b/QK4_1;
|
19604
|
-
|
19605
|
-
quantize_row_q4_1_reference(src + b, y, k);
|
19606
|
-
|
19607
|
-
for (int i = 0; i < nb; i++) {
|
19608
|
-
for (int j = 0; j < QK4_1; j += 2) {
|
19609
|
-
const uint8_t vi0 = y[i].qs[j/2] & 0x0F;
|
19610
|
-
const uint8_t vi1 = y[i].qs[j/2] >> 4;
|
19611
|
-
|
19612
|
-
hist[vi0]++;
|
19613
|
-
hist[vi1]++;
|
19614
|
-
}
|
19615
|
-
}
|
19616
|
-
}
|
19617
|
-
|
19618
|
-
return (n/QK4_1*sizeof(block_q4_1));
|
19619
|
-
}
|
19620
|
-
|
19621
|
-
size_t ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t * hist) {
|
19622
|
-
assert(k % QK5_0 == 0);
|
19623
|
-
const int nb = k / QK5_0;
|
19624
|
-
|
19625
|
-
for (int b = 0; b < n; b += k) {
|
19626
|
-
block_q5_0 * restrict y = (block_q5_0 *)dst + b/QK5_0;
|
19627
|
-
|
19628
|
-
quantize_row_q5_0_reference(src + b, y, k);
|
19629
|
-
|
19630
|
-
for (int i = 0; i < nb; i++) {
|
19631
|
-
uint32_t qh;
|
19632
|
-
memcpy(&qh, &y[i].qh, sizeof(qh));
|
19633
|
-
|
19634
|
-
for (int j = 0; j < QK5_0; j += 2) {
|
19635
|
-
const uint8_t vh0 = ((qh & (1u << (j/2 + 0 ))) >> (j/2 + 0 )) << 4;
|
19636
|
-
const uint8_t vh1 = ((qh & (1u << (j/2 + 16))) >> (j/2 + 12));
|
19637
|
-
|
19638
|
-
// cast to 16 bins
|
19639
|
-
const uint8_t vi0 = ((y[i].qs[j/2] & 0x0F) | vh0) / 2;
|
19640
|
-
const uint8_t vi1 = ((y[i].qs[j/2] >> 4) | vh1) / 2;
|
19641
|
-
|
19642
|
-
hist[vi0]++;
|
19643
|
-
hist[vi1]++;
|
19644
|
-
}
|
19645
|
-
}
|
19646
|
-
}
|
19647
|
-
|
19648
|
-
return (n/QK5_0*sizeof(block_q5_0));
|
19649
|
-
}
|
19650
|
-
|
19651
|
-
size_t ggml_quantize_q5_1(const float * src, void * dst, int n, int k, int64_t * hist) {
|
19652
|
-
assert(k % QK5_1 == 0);
|
19653
|
-
const int nb = k / QK5_1;
|
19654
|
-
|
19655
|
-
for (int b = 0; b < n; b += k) {
|
19656
|
-
block_q5_1 * restrict y = (block_q5_1 *)dst + b/QK5_1;
|
19657
|
-
|
19658
|
-
quantize_row_q5_1_reference(src + b, y, k);
|
19659
|
-
|
19660
|
-
for (int i = 0; i < nb; i++) {
|
19661
|
-
uint32_t qh;
|
19662
|
-
memcpy(&qh, &y[i].qh, sizeof(qh));
|
19663
|
-
|
19664
|
-
for (int j = 0; j < QK5_1; j += 2) {
|
19665
|
-
const uint8_t vh0 = ((qh & (1u << (j/2 + 0 ))) >> (j/2 + 0 )) << 4;
|
19666
|
-
const uint8_t vh1 = ((qh & (1u << (j/2 + 16))) >> (j/2 + 12));
|
19667
|
-
|
19668
|
-
// cast to 16 bins
|
19669
|
-
const uint8_t vi0 = ((y[i].qs[j/2] & 0x0F) | vh0) / 2;
|
19670
|
-
const uint8_t vi1 = ((y[i].qs[j/2] >> 4) | vh1) / 2;
|
19671
|
-
|
19672
|
-
hist[vi0]++;
|
19673
|
-
hist[vi1]++;
|
19674
|
-
}
|
19675
|
-
}
|
19676
|
-
}
|
19677
|
-
|
19678
|
-
return (n/QK5_1*sizeof(block_q5_1));
|
19679
|
-
}
|
19680
|
-
|
19681
|
-
size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t * hist) {
|
19682
|
-
assert(k % QK8_0 == 0);
|
19683
|
-
const int nb = k / QK8_0;
|
19684
|
-
|
19685
|
-
for (int b = 0; b < n; b += k) {
|
19686
|
-
block_q8_0 * restrict y = (block_q8_0 *)dst + b/QK8_0;
|
19687
|
-
|
19688
|
-
quantize_row_q8_0_reference(src + b, y, k);
|
19689
|
-
|
19690
|
-
for (int i = 0; i < nb; i++) {
|
19691
|
-
for (int j = 0; j < QK8_0; ++j) {
|
19692
|
-
const int8_t vi = y[i].qs[j];
|
19693
|
-
|
19694
|
-
hist[vi/16 + 8]++;
|
19695
|
-
}
|
19696
|
-
}
|
19697
|
-
}
|
19698
|
-
|
19699
|
-
return (n/QK8_0*sizeof(block_q8_0));
|
19700
|
-
}
|
19701
|
-
|
19702
20202
|
bool ggml_quantize_requires_imatrix(enum ggml_type type) {
|
19703
20203
|
return
|
19704
20204
|
type == GGML_TYPE_IQ2_XXS ||
|
@@ -19706,177 +20206,52 @@ bool ggml_quantize_requires_imatrix(enum ggml_type type) {
|
|
19706
20206
|
type == GGML_TYPE_IQ1_S;
|
19707
20207
|
}
|
19708
20208
|
|
19709
|
-
size_t ggml_quantize_chunk(
|
19710
|
-
|
20209
|
+
size_t ggml_quantize_chunk(
|
20210
|
+
enum ggml_type type,
|
20211
|
+
const float * src,
|
20212
|
+
void * dst,
|
20213
|
+
int start,
|
20214
|
+
int nrows,
|
20215
|
+
int n_per_row,
|
20216
|
+
const float * imatrix) {
|
20217
|
+
const int n = nrows * n_per_row;
|
20218
|
+
|
20219
|
+
if (ggml_quantize_requires_imatrix(type)) {
|
20220
|
+
GGML_ASSERT(imatrix != NULL);
|
20221
|
+
}
|
20222
|
+
|
20223
|
+
GGML_ASSERT(start % type_traits[type].blck_size == 0);
|
20224
|
+
GGML_ASSERT(start % n_per_row == 0);
|
20225
|
+
|
19711
20226
|
ggml_quantize_init(type); // this is noop if already initialized
|
20227
|
+
|
20228
|
+
const size_t start_row = start / n_per_row;
|
20229
|
+
const size_t row_size = ggml_row_size(type, n_per_row);
|
20230
|
+
|
19712
20231
|
size_t result = 0;
|
19713
|
-
|
20232
|
+
|
19714
20233
|
switch (type) {
|
19715
|
-
case GGML_TYPE_Q4_0:
|
19716
|
-
|
19717
|
-
|
19718
|
-
|
19719
|
-
|
19720
|
-
|
19721
|
-
|
19722
|
-
|
19723
|
-
|
19724
|
-
case
|
19725
|
-
|
19726
|
-
|
19727
|
-
|
19728
|
-
|
19729
|
-
|
19730
|
-
|
19731
|
-
|
19732
|
-
} break;
|
19733       | -         case GGML_TYPE_Q5_0:
19734       | -             {
19735       | -                 GGML_ASSERT(start % QK5_0 == 0);
19736       | -                 GGML_ASSERT(start % n_per_row == 0);
19737       | -                 size_t start_row = start / n_per_row;
19738       | -                 size_t row_size = ggml_row_size(type, n_per_row);
19739       | -                 result = quantize_q5_0(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
19740       | -                 GGML_ASSERT(result == row_size * nrows);
19741       | -             } break;
19742       | -         case GGML_TYPE_Q5_1:
19743       | -             {
19744       | -                 GGML_ASSERT(start % QK5_1 == 0);
19745       | -                 GGML_ASSERT(start % n_per_row == 0);
19746       | -                 size_t start_row = start / n_per_row;
19747       | -                 size_t row_size = ggml_row_size(type, n_per_row);
19748       | -                 result = quantize_q5_1(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
19749       | -                 GGML_ASSERT(result == row_size * nrows);
19750       | -             } break;
19751       | -         case GGML_TYPE_Q8_0:
19752       | -             {
19753       | -                 GGML_ASSERT(start % QK8_0 == 0);
19754       | -                 block_q8_0 * block = (block_q8_0*)dst + start / QK8_0;
19755       | -                 result = ggml_quantize_q8_0(src + start, block, n, n, hist);
19756       | -             } break;
19757       | -         case GGML_TYPE_Q2_K:
19758       | -             {
19759       | -                 GGML_ASSERT(start % QK_K == 0);
19760       | -                 GGML_ASSERT(start % n_per_row == 0);
19761       | -                 size_t start_row = start / n_per_row;
19762       | -                 size_t row_size = ggml_row_size(type, n_per_row);
19763       | -                 result = quantize_q2_K(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
19764       | -                 GGML_ASSERT(result == row_size * nrows);
19765       | -             } break;
19766       | -         case GGML_TYPE_Q3_K:
19767       | -             {
19768       | -                 GGML_ASSERT(start % QK_K == 0);
19769       | -                 GGML_ASSERT(start % n_per_row == 0);
19770       | -                 size_t start_row = start / n_per_row;
19771       | -                 size_t row_size = ggml_row_size(type, n_per_row);
19772       | -                 result = quantize_q3_K(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
19773       | -                 GGML_ASSERT(result == row_size * nrows);
19774       | -             } break;
19775       | -         case GGML_TYPE_Q4_K:
19776       | -             {
19777       | -                 GGML_ASSERT(start % QK_K == 0);
19778       | -                 GGML_ASSERT(start % n_per_row == 0);
19779       | -                 size_t start_row = start / n_per_row;
19780       | -                 size_t row_size = ggml_row_size(type, n_per_row);
19781       | -                 result = quantize_q4_K(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
19782       | -                 GGML_ASSERT(result == row_size * nrows);
19783       | -             } break;
19784       | -         case GGML_TYPE_Q5_K:
19785       | -             {
19786       | -                 GGML_ASSERT(start % QK_K == 0);
19787       | -                 GGML_ASSERT(start % n_per_row == 0);
19788       | -                 size_t start_row = start / n_per_row;
19789       | -                 size_t row_size = ggml_row_size(type, n_per_row);
19790       | -                 result = quantize_q5_K(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
19791       | -                 GGML_ASSERT(result == row_size * nrows);
19792       | -             } break;
19793       | -         case GGML_TYPE_Q6_K:
19794       | -             {
19795       | -                 GGML_ASSERT(start % QK_K == 0);
19796       | -                 GGML_ASSERT(start % n_per_row == 0);
19797       | -                 size_t start_row = start / n_per_row;
19798       | -                 size_t row_size = ggml_row_size(type, n_per_row);
19799       | -                 result = quantize_q6_K(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
19800       | -                 GGML_ASSERT(result == row_size * nrows);
19801       | -             } break;
19802       | -         case GGML_TYPE_IQ2_XXS:
19803       | -             {
19804       | -                 GGML_ASSERT(start % QK_K == 0);
19805       | -                 GGML_ASSERT(start % n_per_row == 0);
19806       | -                 GGML_ASSERT(imatrix);
19807       | -                 size_t start_row = start / n_per_row;
19808       | -                 size_t row_size = ggml_row_size(type, n_per_row);
19809       | -                 result = quantize_iq2_xxs(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
19810       | -                 GGML_ASSERT(result == row_size * nrows);
19811       | -             } break;
19812       | -         case GGML_TYPE_IQ2_XS:
19813       | -             {
19814       | -                 GGML_ASSERT(start % QK_K == 0);
19815       | -                 GGML_ASSERT(start % n_per_row == 0);
19816       | -                 GGML_ASSERT(imatrix);
19817       | -                 size_t start_row = start / n_per_row;
19818       | -                 size_t row_size = ggml_row_size(type, n_per_row);
19819       | -                 result = quantize_iq2_xs(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
19820       | -                 GGML_ASSERT(result == row_size * nrows);
19821       | -             } break;
19822       | -         case GGML_TYPE_IQ3_XXS:
19823       | -             {
19824       | -                 GGML_ASSERT(start % QK_K == 0);
19825       | -                 GGML_ASSERT(start % n_per_row == 0);
19826       | -                 size_t start_row = start / n_per_row;
19827       | -                 size_t row_size = ggml_row_size(type, n_per_row);
19828       | -                 result = quantize_iq3_xxs(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
19829       | -                 GGML_ASSERT(result == row_size * nrows);
19830       | -             } break;
19831       | -         case GGML_TYPE_IQ3_S:
19832       | -             {
19833       | -                 GGML_ASSERT(start % QK_K == 0);
19834       | -                 GGML_ASSERT(start % n_per_row == 0);
19835       | -                 size_t start_row = start / n_per_row;
19836       | -                 size_t row_size = ggml_row_size(type, n_per_row);
19837       | -                 result = quantize_iq3_s(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
19838       | -                 GGML_ASSERT(result == row_size * nrows);
19839       | -             } break;
19840       | -         case GGML_TYPE_IQ2_S:
19841       | -             {
19842       | -                 GGML_ASSERT(start % QK_K == 0);
19843       | -                 GGML_ASSERT(start % n_per_row == 0);
19844       | -                 size_t start_row = start / n_per_row;
19845       | -                 size_t row_size = ggml_row_size(type, n_per_row);
19846       | -                 result = quantize_iq2_s(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
19847       | -                 GGML_ASSERT(result == row_size * nrows);
19848       | -             } break;
19849       | -         case GGML_TYPE_IQ1_S:
19850       | -             {
19851       | -                 GGML_ASSERT(start % QK_K == 0);
19852       | -                 GGML_ASSERT(start % n_per_row == 0);
19853       | -                 size_t start_row = start / n_per_row;
19854       | -                 size_t row_size = ggml_row_size(type, n_per_row);
19855       | -                 result = quantize_iq1_s(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
19856       | -                 GGML_ASSERT(result == row_size * nrows);
19857       | -             } break;
19858       | -         case GGML_TYPE_IQ4_NL:
      20234 | +         case GGML_TYPE_Q4_0:    result = quantize_q4_0(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
      20235 | +         case GGML_TYPE_Q4_1:    result = quantize_q4_1(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
      20236 | +         case GGML_TYPE_Q5_0:    result = quantize_q5_0(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
      20237 | +         case GGML_TYPE_Q5_1:    result = quantize_q5_1(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
      20238 | +         case GGML_TYPE_Q8_0:    result = quantize_q8_0(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
      20239 | +         case GGML_TYPE_Q2_K:    result = quantize_q2_K(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
      20240 | +         case GGML_TYPE_Q3_K:    result = quantize_q3_K(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
      20241 | +         case GGML_TYPE_Q4_K:    result = quantize_q4_K(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
      20242 | +         case GGML_TYPE_Q5_K:    result = quantize_q5_K(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
      20243 | +         case GGML_TYPE_Q6_K:    result = quantize_q6_K(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
      20244 | +         case GGML_TYPE_IQ2_XXS: result = quantize_iq2_xxs(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
      20245 | +         case GGML_TYPE_IQ2_XS:  result = quantize_iq2_xs (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
      20246 | +         case GGML_TYPE_IQ3_XXS: result = quantize_iq3_xxs(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
      20247 | +         case GGML_TYPE_IQ3_S:   result = quantize_iq3_s (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
      20248 | +         case GGML_TYPE_IQ2_S:   result = quantize_iq2_s (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
      20249 | +         case GGML_TYPE_IQ1_S:   result = quantize_iq1_s (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
      20250 | +         case GGML_TYPE_IQ4_NL:  result = quantize_iq4_nl (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
19859 20251 |   #if QK_K == 64
19860       | -         case GGML_TYPE_IQ4_XS:
19861       | - #endif
19862       | -             {
19863       | -                 GGML_ASSERT(start % QK4_NL == 0);
19864       | -                 GGML_ASSERT(start % n_per_row == 0);
19865       | -                 size_t start_row = start / n_per_row;
19866       | -                 size_t row_size = ggml_row_size(type, n_per_row);
19867       | -                 result = quantize_iq4_nl(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
19868       | -                 GGML_ASSERT(result == row_size * nrows);
19869       | -             } break;
19870       | - #if QK_K != 64
19871       | -         case GGML_TYPE_IQ4_XS:
19872       | -             {
19873       | -                 GGML_ASSERT(start % QK_K == 0);
19874       | -                 GGML_ASSERT(start % n_per_row == 0);
19875       | -                 size_t start_row = start / n_per_row;
19876       | -                 size_t row_size = ggml_row_size(type, n_per_row);
19877       | -                 result = quantize_iq4_xs(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
19878       | -                 GGML_ASSERT(result == row_size * nrows);
19879       | -             } break;
      20252 | +         case GGML_TYPE_IQ4_XS:  result = quantize_iq4_nl (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
      20253 | + #else
      20254 | +         case GGML_TYPE_IQ4_XS:  result = quantize_iq4_xs (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
19880 20255 |   #endif
19881 20256 |           case GGML_TYPE_F16:
19882 20257 |               {
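In the rewritten dispatch above, every quantized type goes through the same per-row call: the output for a chunk starting at source element `start` lands at byte offset `(start / n_per_row) * ggml_row_size(type, n_per_row)` in `dst`, and the returned byte count is checked once at the end of the function. Below is a minimal caller sketch against the new signature shown in this hunk; the Q5_K type, the two-chunk split, and the helper name are illustrative only, `n_per_row` is assumed to be a multiple of the type's block size, and the i-matrix is passed as NULL (allowed here because `ggml_quantize_requires_imatrix` is false for Q5_K).

#include <stdlib.h>
#include "ggml.h"

// Illustrative helper (not part of ggml): quantize an f32 matrix of
// nrows x n_per_row elements to Q5_K using the new ggml_quantize_chunk().
// dst is sized for the whole tensor because the function writes at byte
// offset (start / n_per_row) * row_size and reads from src + start.
static void * quantize_matrix_q5_K(const float * src_f32, int nrows, int n_per_row) {
    const enum ggml_type type = GGML_TYPE_Q5_K;

    const size_t row_size = ggml_row_size(type, n_per_row);  // bytes per quantized row
    void * dst = malloc((size_t) nrows * row_size);
    if (dst == NULL) {
        return NULL;
    }

    // two chunks, to show how `start` (in source elements, a multiple of
    // n_per_row) selects which rows are written
    const int half = nrows / 2;
    ggml_quantize_chunk(type, src_f32, dst, 0,                half,         n_per_row, NULL);
    ggml_quantize_chunk(type, src_f32, dst, half * n_per_row, nrows - half, n_per_row, NULL);

    return dst;
}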
@@ -19893,6 +20268,9 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i
19893 20268 |           default:
19894 20269 |               assert(false);
19895 20270 |       }
      20271 | +
      20272 | +     GGML_ASSERT(result == nrows * row_size);
      20273 | +
19896 20274 |       return result;
19897 20275 |   }
19898 20276 |
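The single `GGML_ASSERT(result == nrows * row_size)` added at 20272 replaces the per-case `GGML_ASSERT(result == row_size * nrows)` checks deleted above. A worked example of that bookkeeping for Q8_0 follows, assuming the default block of `QK8_0 == 32` elements and the usual 34-byte `block_q8_0` layout (a 2-byte `ggml_fp16_t` scale plus 32 `int8_t` quants, matching the `n/QK8_0*sizeof(block_q8_0)` count returned by the removed helper).

#include <assert.h>
#include <stddef.h>

// Worked example of the size check behind the new assert, for Q8_0.
// Assumes QK8_0 == 32 and sizeof(block_q8_0) == 34 (default build).
int main(void) {
    const size_t qk8_0      = 32;   // elements per Q8_0 block
    const size_t block_size = 34;   // bytes per Q8_0 block

    const size_t n_per_row = 4096;
    const size_t nrows     = 8;

    const size_t row_size = n_per_row / qk8_0 * block_size;  // 4096/32*34 = 4352 bytes
    const size_t result   = nrows * row_size;                 // what quantize_q8_0 reports

    assert(row_size == 4352);
    assert(result   == 34816);  // value the new GGML_ASSERT expects
    return 0;
}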