llama_cpp 0.13.0 → 0.14.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +20 -0
- data/ext/llama_cpp/llama_cpp.cpp +130 -26
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +15 -4
- data/vendor/tmp/llama.cpp/Makefile +30 -15
- data/vendor/tmp/llama.cpp/ggml-alloc.c +45 -64
- data/vendor/tmp/llama.cpp/ggml-alloc.h +13 -5
- data/vendor/tmp/llama.cpp/ggml-backend-impl.h +17 -5
- data/vendor/tmp/llama.cpp/ggml-backend.c +371 -151
- data/vendor/tmp/llama.cpp/ggml-backend.h +54 -29
- data/vendor/tmp/llama.cpp/ggml-common.h +1830 -0
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +765 -830
- data/vendor/tmp/llama.cpp/ggml-impl.h +6 -2
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +7 -2
- data/vendor/tmp/llama.cpp/ggml-metal.m +105 -27
- data/vendor/tmp/llama.cpp/ggml-metal.metal +99 -920
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +2 -2
- data/vendor/tmp/llama.cpp/ggml-quants.c +557 -1129
- data/vendor/tmp/llama.cpp/ggml-quants.h +27 -259
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +3332 -1195
- data/vendor/tmp/llama.cpp/ggml-sycl.h +5 -0
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +39336 -43461
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +1302 -781
- data/vendor/tmp/llama.cpp/ggml-vulkan.h +1 -0
- data/vendor/tmp/llama.cpp/ggml.c +734 -356
- data/vendor/tmp/llama.cpp/ggml.h +91 -51
- data/vendor/tmp/llama.cpp/llama.cpp +1938 -759
- data/vendor/tmp/llama.cpp/llama.h +53 -21
- data/vendor/tmp/llama.cpp/unicode.cpp +1672 -0
- data/vendor/tmp/llama.cpp/unicode.h +16 -774
- metadata +4 -2
data/vendor/tmp/llama.cpp/ggml.c
CHANGED
@@ -320,6 +320,17 @@ static ggml_fp16_t ggml_table_exp_f16[1 << 16];
|
|
320
320
|
// precomputed f32 table for f16 (256 KB) (ggml-impl.h)
|
321
321
|
float ggml_table_f32_f16[1 << 16];
|
322
322
|
|
323
|
+
const char * ggml_status_to_string(enum ggml_status status) {
|
324
|
+
switch (status) {
|
325
|
+
case GGML_STATUS_ALLOC_FAILED: return "GGML status: error (failed to allocate memory)";
|
326
|
+
case GGML_STATUS_FAILED: return "GGML status: error (operation failed)";
|
327
|
+
case GGML_STATUS_SUCCESS: return "GGML status: success";
|
328
|
+
case GGML_STATUS_ABORTED: return "GGML status: warning (operation aborted)";
|
329
|
+
}
|
330
|
+
|
331
|
+
return "GGML status: unknown";
|
332
|
+
}
|
333
|
+
|
323
334
|
// note: do not use these inside ggml.c
|
324
335
|
// these are meant to be used via the ggml.h API
|
325
336
|
float ggml_fp16_to_fp32(ggml_fp16_t x) {
|
@@ -459,6 +470,19 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
|
459
470
|
.type_size = sizeof(int32_t),
|
460
471
|
.is_quantized = false,
|
461
472
|
},
|
473
|
+
[GGML_TYPE_I64] = {
|
474
|
+
.type_name = "i64",
|
475
|
+
.blck_size = 1,
|
476
|
+
.type_size = sizeof(int64_t),
|
477
|
+
.is_quantized = false,
|
478
|
+
},
|
479
|
+
[GGML_TYPE_F64] = {
|
480
|
+
.type_name = "f64",
|
481
|
+
.blck_size = 1,
|
482
|
+
.type_size = sizeof(double),
|
483
|
+
.is_quantized = false,
|
484
|
+
.nrows = 1,
|
485
|
+
},
|
462
486
|
[GGML_TYPE_F32] = {
|
463
487
|
.type_name = "f32",
|
464
488
|
.blck_size = 1,
|
@@ -846,7 +870,7 @@ inline static float vaddvq_f32(float32x4_t v) {
|
|
846
870
|
#define GGML_F16x8 float16x8_t
|
847
871
|
#define GGML_F16x8_ZERO vdupq_n_f16(0.0f)
|
848
872
|
#define GGML_F16x8_SET1(x) vdupq_n_f16(x)
|
849
|
-
#define GGML_F16x8_LOAD(x) vld1q_f16((const
|
873
|
+
#define GGML_F16x8_LOAD(x) vld1q_f16((const ggml_fp16_internal_t *)(x))
|
850
874
|
#define GGML_F16x8_STORE vst1q_f16
|
851
875
|
#define GGML_F16x8_FMA(a, b, c) vfmaq_f16(a, b, c)
|
852
876
|
#define GGML_F16x8_ADD vaddq_f16
|
@@ -889,7 +913,7 @@ inline static float vaddvq_f32(float32x4_t v) {
|
|
889
913
|
#define GGML_F32Cx4 float32x4_t
|
890
914
|
#define GGML_F32Cx4_ZERO vdupq_n_f32(0.0f)
|
891
915
|
#define GGML_F32Cx4_SET1(x) vdupq_n_f32(x)
|
892
|
-
#define GGML_F32Cx4_LOAD(x) vcvt_f32_f16(vld1_f16((const
|
916
|
+
#define GGML_F32Cx4_LOAD(x) vcvt_f32_f16(vld1_f16((const ggml_fp16_internal_t *)(x)))
|
893
917
|
#define GGML_F32Cx4_STORE(x, y) vst1_f16(x, vcvt_f16_f32(y))
|
894
918
|
#define GGML_F32Cx4_FMA(a, b, c) vfmaq_f32(a, b, c)
|
895
919
|
#define GGML_F32Cx4_ADD vaddq_f32
|
@@ -1822,12 +1846,16 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
|
|
1822
1846
|
"POOL_2D",
|
1823
1847
|
"UPSCALE",
|
1824
1848
|
"PAD",
|
1849
|
+
"ARANGE",
|
1850
|
+
"TIMESTEP_EMBEDDING",
|
1825
1851
|
"ARGSORT",
|
1826
1852
|
"LEAKY_RELU",
|
1827
1853
|
|
1828
1854
|
"FLASH_ATTN",
|
1829
1855
|
"FLASH_FF",
|
1830
1856
|
"FLASH_ATTN_BACK",
|
1857
|
+
"SSM_CONV",
|
1858
|
+
"SSM_SCAN",
|
1831
1859
|
"WIN_PART",
|
1832
1860
|
"WIN_UNPART",
|
1833
1861
|
"GET_REL_POS",
|
@@ -1850,7 +1878,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
|
|
1850
1878
|
"CROSS_ENTROPY_LOSS_BACK",
|
1851
1879
|
};
|
1852
1880
|
|
1853
|
-
static_assert(GGML_OP_COUNT == 72, "GGML_OP_COUNT != 72");
|
1881
|
+
static_assert(GGML_OP_COUNT == 76, "GGML_OP_COUNT != 76");
|
1854
1882
|
|
1855
1883
|
static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
|
1856
1884
|
"none",
|
@@ -1908,12 +1936,16 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
|
|
1908
1936
|
"pool_2d(x)",
|
1909
1937
|
"upscale(x)",
|
1910
1938
|
"pad(x)",
|
1939
|
+
"arange(start, stop, step)",
|
1940
|
+
"timestep_embedding(timesteps, dim, max_period)",
|
1911
1941
|
"argsort(x)",
|
1912
1942
|
"leaky_relu(x)",
|
1913
1943
|
|
1914
1944
|
"flash_attn(x)",
|
1915
1945
|
"flash_ff(x)",
|
1916
1946
|
"flash_attn_back(x)",
|
1947
|
+
"ssm_conv(x)",
|
1948
|
+
"ssm_scan(x)",
|
1917
1949
|
"win_part(x)",
|
1918
1950
|
"win_unpart(x)",
|
1919
1951
|
"get_rel_pos(x)",
|
@@ -1936,7 +1968,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
|
|
1936
1968
|
"cross_entropy_loss_back(x,y)",
|
1937
1969
|
};
|
1938
1970
|
|
1939
|
-
static_assert(GGML_OP_COUNT == 72, "GGML_OP_COUNT != 72");
|
1971
|
+
static_assert(GGML_OP_COUNT == 76, "GGML_OP_COUNT != 76");
|
1940
1972
|
|
1941
1973
|
static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
|
1942
1974
|
|
@@ -2139,7 +2171,10 @@ void ggml_numa_init(enum ggml_numa_strategy numa_flag) {
|
|
2139
2171
|
getcpu_ret = getcpu(&current_cpu, &g_state.numa.current_node);
|
2140
2172
|
#else
|
2141
2173
|
// old glibc doesn't have a wrapper for this call. Fall back on direct syscall
|
2142
|
-
|
2174
|
+
# if !defined(SYS_getcpu) && defined(SYS_get_cpu)
|
2175
|
+
# define SYS_getcpu SYS_get_cpu // some older glibc versions use this name
|
2176
|
+
# endif
|
2177
|
+
getcpu_ret = syscall(SYS_getcpu, &current_cpu, &g_state.numa.current_node);
|
2143
2178
|
#endif
|
2144
2179
|
|
2145
2180
|
if (g_state.numa.n_nodes < 1 || g_state.numa.total_cpus < 1 || getcpu_ret != 0) {
|
@@ -2895,11 +2930,21 @@ static int32_t ggml_get_op_params_i32(const struct ggml_tensor * tensor, uint32_
|
|
2895
2930
|
return ((const int32_t *)(tensor->op_params))[i];
|
2896
2931
|
}
|
2897
2932
|
|
2933
|
+
static float ggml_get_op_params_f32(const struct ggml_tensor * tensor, uint32_t i) {
|
2934
|
+
assert(i < GGML_MAX_OP_PARAMS / sizeof(float));
|
2935
|
+
return ((const float *)(tensor->op_params))[i];
|
2936
|
+
}
|
2937
|
+
|
2898
2938
|
static void ggml_set_op_params_i32(struct ggml_tensor * tensor, uint32_t i, int32_t value) {
|
2899
2939
|
assert(i < GGML_MAX_OP_PARAMS / sizeof(int32_t));
|
2900
2940
|
((int32_t *)(tensor->op_params))[i] = value;
|
2901
2941
|
}
|
2902
2942
|
|
2943
|
+
static void ggml_set_op_params_f32(struct ggml_tensor * tensor, uint32_t i, float value) {
|
2944
|
+
assert(i < GGML_MAX_OP_PARAMS / sizeof(float));
|
2945
|
+
((float *)(tensor->op_params))[i] = value;
|
2946
|
+
}
|
2947
|
+
|
2903
2948
|
struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor) {
|
2904
2949
|
memset(tensor->data, 0, ggml_nbytes(tensor));
|
2905
2950
|
return tensor;
|
@@ -5898,6 +5943,55 @@ struct ggml_tensor * ggml_upscale(
|
|
5898
5943
|
return ggml_upscale_impl(ctx, a, scale_factor);
|
5899
5944
|
}
|
5900
5945
|
|
5946
|
+
struct ggml_tensor * ggml_arange(
|
5947
|
+
struct ggml_context * ctx,
|
5948
|
+
float start,
|
5949
|
+
float stop,
|
5950
|
+
float step) {
|
5951
|
+
|
5952
|
+
GGML_ASSERT(stop > start);
|
5953
|
+
|
5954
|
+
const int64_t steps = (int64_t) ceilf((stop - start) / step);
|
5955
|
+
|
5956
|
+
struct ggml_tensor * result = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, steps);
|
5957
|
+
|
5958
|
+
result->op = GGML_OP_ARANGE;
|
5959
|
+
ggml_set_op_params_f32(result, 0, start);
|
5960
|
+
ggml_set_op_params_f32(result, 1, stop);
|
5961
|
+
ggml_set_op_params_f32(result, 2, step);
|
5962
|
+
|
5963
|
+
return result;
|
5964
|
+
}
|
5965
|
+
|
5966
|
+
struct ggml_tensor * ggml_timestep_embedding(
|
5967
|
+
struct ggml_context * ctx,
|
5968
|
+
struct ggml_tensor * timesteps,
|
5969
|
+
int dim,
|
5970
|
+
int max_period) {
|
5971
|
+
bool is_node = false;
|
5972
|
+
|
5973
|
+
if (timesteps->grad) {
|
5974
|
+
GGML_ASSERT(false); // TODO: implement backward
|
5975
|
+
is_node = true;
|
5976
|
+
}
|
5977
|
+
|
5978
|
+
int actual_dim = dim;
|
5979
|
+
if (dim % 2 != 0) {
|
5980
|
+
actual_dim = dim + 1;
|
5981
|
+
}
|
5982
|
+
|
5983
|
+
struct ggml_tensor * result = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, actual_dim, timesteps->ne[0]);
|
5984
|
+
|
5985
|
+
result->op = GGML_OP_TIMESTEP_EMBEDDING;
|
5986
|
+
ggml_set_op_params_i32(result, 0, dim);
|
5987
|
+
ggml_set_op_params_i32(result, 1, max_period);
|
5988
|
+
|
5989
|
+
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
5990
|
+
result->src[0] = timesteps;
|
5991
|
+
|
5992
|
+
return result;
|
5993
|
+
}
|
5994
|
+
|
5901
5995
|
// ggml_argsort
|
5902
5996
|
|
5903
5997
|
struct ggml_tensor * ggml_argsort(
|
@@ -6077,6 +6171,108 @@ struct ggml_tensor * ggml_flash_attn_back(
|
|
6077
6171
|
return result;
|
6078
6172
|
}
|
6079
6173
|
|
6174
|
+
// ggml_ssm_conv
|
6175
|
+
|
6176
|
+
struct ggml_tensor * ggml_ssm_conv(
|
6177
|
+
struct ggml_context * ctx,
|
6178
|
+
struct ggml_tensor * s,
|
6179
|
+
struct ggml_tensor * x,
|
6180
|
+
struct ggml_tensor * c,
|
6181
|
+
struct ggml_tensor * sq) {
|
6182
|
+
GGML_ASSERT(ggml_is_3d(s));
|
6183
|
+
GGML_ASSERT(ggml_is_matrix(x));
|
6184
|
+
GGML_ASSERT(ggml_is_matrix(c));
|
6185
|
+
GGML_ASSERT(ggml_is_matrix(sq));
|
6186
|
+
GGML_ASSERT(sq->type == GGML_TYPE_I32);
|
6187
|
+
|
6188
|
+
const int64_t d_conv = c->ne[0];
|
6189
|
+
const int64_t d_inner = c->ne[1];
|
6190
|
+
const int64_t n_tokens = x->ne[1];
|
6191
|
+
const int64_t n_kv = s->ne[2];
|
6192
|
+
|
6193
|
+
GGML_ASSERT( s->ne[0] == d_conv - 1);
|
6194
|
+
GGML_ASSERT( s->ne[1] == d_inner);
|
6195
|
+
GGML_ASSERT( x->ne[0] == d_inner);
|
6196
|
+
GGML_ASSERT(sq->ne[0] == n_kv);
|
6197
|
+
GGML_ASSERT(sq->ne[1] == n_tokens);
|
6198
|
+
|
6199
|
+
bool is_node = false;
|
6200
|
+
|
6201
|
+
if (s->grad || x->grad || c->grad || sq->grad) {
|
6202
|
+
GGML_ASSERT(false); // TODO: implement
|
6203
|
+
is_node = true;
|
6204
|
+
}
|
6205
|
+
|
6206
|
+
// 2-in-1 concatenated x and conv_states, {d_inner, n_tokens} with {d_conv, d_inner, n_kv}
|
6207
|
+
struct ggml_tensor * result = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, (d_inner*n_tokens) + (d_conv*d_inner*n_kv));
|
6208
|
+
|
6209
|
+
result->op = GGML_OP_SSM_CONV;
|
6210
|
+
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
6211
|
+
result->src[0] = s;
|
6212
|
+
result->src[1] = x;
|
6213
|
+
result->src[2] = c;
|
6214
|
+
result->src[3] = sq;
|
6215
|
+
|
6216
|
+
return result;
|
6217
|
+
}
|
6218
|
+
|
6219
|
+
// ggml_ssm_scan
|
6220
|
+
|
6221
|
+
struct ggml_tensor * ggml_ssm_scan(
|
6222
|
+
struct ggml_context * ctx,
|
6223
|
+
struct ggml_tensor * s,
|
6224
|
+
struct ggml_tensor * x,
|
6225
|
+
struct ggml_tensor * dt,
|
6226
|
+
struct ggml_tensor * A,
|
6227
|
+
struct ggml_tensor * B,
|
6228
|
+
struct ggml_tensor * C,
|
6229
|
+
struct ggml_tensor * sq) {
|
6230
|
+
GGML_ASSERT(ggml_is_contiguous(s));
|
6231
|
+
GGML_ASSERT(ggml_is_contiguous(x));
|
6232
|
+
GGML_ASSERT(ggml_is_contiguous(dt));
|
6233
|
+
GGML_ASSERT(ggml_is_contiguous(A));
|
6234
|
+
GGML_ASSERT(sq->type == GGML_TYPE_I32);
|
6235
|
+
GGML_ASSERT(B->nb[0] == ggml_type_size(B->type));
|
6236
|
+
GGML_ASSERT(C->nb[0] == ggml_type_size(C->type));
|
6237
|
+
GGML_ASSERT(ggml_are_same_shape(x, dt));
|
6238
|
+
|
6239
|
+
{
|
6240
|
+
const int64_t d_state = s->ne[0];
|
6241
|
+
const int64_t d_inner = s->ne[1];
|
6242
|
+
const int64_t n_tokens = x->ne[1];
|
6243
|
+
|
6244
|
+
GGML_ASSERT(x->ne[0] == d_inner);
|
6245
|
+
GGML_ASSERT(A->ne[0] == d_state);
|
6246
|
+
GGML_ASSERT(A->ne[1] == d_inner);
|
6247
|
+
GGML_ASSERT(B->ne[0] == d_state);
|
6248
|
+
GGML_ASSERT(B->ne[1] == n_tokens);
|
6249
|
+
GGML_ASSERT(C->ne[0] == d_state);
|
6250
|
+
GGML_ASSERT(C->ne[1] == n_tokens);
|
6251
|
+
}
|
6252
|
+
|
6253
|
+
bool is_node = false;
|
6254
|
+
|
6255
|
+
if (s->grad || x->grad || dt->grad || A->grad || B->grad || C->grad || sq->grad) {
|
6256
|
+
GGML_ASSERT(false); // TODO: implement
|
6257
|
+
is_node = true;
|
6258
|
+
}
|
6259
|
+
|
6260
|
+
// 2-in-1 concatenated y and ssm_states, {d_inner, n_tokens} with {d_state, d_inner, n_kv}
|
6261
|
+
struct ggml_tensor * result = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, ggml_nelements(x) + ggml_nelements(s));
|
6262
|
+
|
6263
|
+
result->op = GGML_OP_SSM_SCAN;
|
6264
|
+
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
6265
|
+
result->src[0] = s;
|
6266
|
+
result->src[1] = x;
|
6267
|
+
result->src[2] = dt;
|
6268
|
+
result->src[3] = A;
|
6269
|
+
result->src[4] = B;
|
6270
|
+
result->src[5] = C;
|
6271
|
+
result->src[6] = sq;
|
6272
|
+
|
6273
|
+
return result;
|
6274
|
+
}
|
6275
|
+
|
6080
6276
|
// ggml_win_part
|
6081
6277
|
|
6082
6278
|
struct ggml_tensor * ggml_win_part(
|
@@ -10231,7 +10427,7 @@ static void ggml_compute_forward_group_norm_f32(
|
|
10231
10427
|
int n_channels = src0->ne[2];
|
10232
10428
|
int n_groups = dst->op_params[0];
|
10233
10429
|
int n_channels_per_group = (n_channels + n_groups - 1) / n_groups;
|
10234
|
-
for (int i = ith; i < n_groups; i+=nth) {
|
10430
|
+
for (int i = ith; i < n_groups; i += nth) {
|
10235
10431
|
int start = i * n_channels_per_group;
|
10236
10432
|
int end = start + n_channels_per_group;
|
10237
10433
|
if (end > n_channels) {
|
@@ -10245,28 +10441,32 @@ static void ggml_compute_forward_group_norm_f32(
|
|
10245
10441
|
for (int64_t i01 = 0; i01 < ne01; i01++) {
|
10246
10442
|
const float * x = (float *)((char *) src0->data + i01 * nb01 + i02 * nb02 + i03 * nb03);
|
10247
10443
|
|
10444
|
+
ggml_float sumr = 0.0;
|
10248
10445
|
for (int64_t i00 = 0; i00 < ne00; i00++) {
|
10249
|
-
|
10446
|
+
sumr += (ggml_float)x[i00];
|
10250
10447
|
}
|
10448
|
+
sum += sumr;
|
10251
10449
|
}
|
10252
10450
|
}
|
10253
|
-
float mean = sum / (ne00 * ne01 * step);
|
10254
|
-
ggml_float sum2 = 0.0;
|
10451
|
+
const float mean = sum / (ne00 * ne01 * step);
|
10255
10452
|
|
10453
|
+
ggml_float sum2 = 0.0;
|
10256
10454
|
for (int64_t i02 = start; i02 < end; i02++) {
|
10257
10455
|
for (int64_t i01 = 0; i01 < ne01; i01++) {
|
10258
10456
|
const float * x = (float *)((char *) src0->data + i01 * nb01 + i02 * nb02 + i03 * nb03);
|
10259
10457
|
|
10260
10458
|
float * y = (float *)((char *) dst->data + i01 * nb1 + i02 * nb2 + i03 * nb3);
|
10261
10459
|
|
10460
|
+
ggml_float sumr = 0.0;
|
10262
10461
|
for (int64_t i00 = 0; i00 < ne00; i00++) {
|
10263
10462
|
float v = x[i00] - mean;
|
10264
10463
|
y[i00] = v;
|
10265
|
-
|
10464
|
+
sumr += (ggml_float)(v * v);
|
10266
10465
|
}
|
10466
|
+
sum2 += sumr;
|
10267
10467
|
}
|
10268
10468
|
}
|
10269
|
-
float variance = sum2 / (ne00 * ne01 * step);
|
10469
|
+
const float variance = sum2 / (ne00 * ne01 * step);
|
10270
10470
|
const float scale = 1.0f / sqrtf(variance + eps);
|
10271
10471
|
|
10272
10472
|
for (int64_t i02 = start; i02 < end; i02++) {
|
@@ -11373,8 +11573,6 @@ static void ggml_compute_forward_get_rows_q(
|
|
11373
11573
|
const struct ggml_tensor * src0 = dst->src[0];
|
11374
11574
|
const struct ggml_tensor * src1 = dst->src[1];
|
11375
11575
|
|
11376
|
-
assert(params->ith == 0);
|
11377
|
-
|
11378
11576
|
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
|
11379
11577
|
return;
|
11380
11578
|
}
|
@@ -11382,7 +11580,7 @@ static void ggml_compute_forward_get_rows_q(
|
|
11382
11580
|
GGML_TENSOR_BINARY_OP_LOCALS
|
11383
11581
|
|
11384
11582
|
const int64_t nc = ne00;
|
11385
|
-
const int64_t nr = ggml_nelements(src1);
|
11583
|
+
const int64_t nr = ggml_nelements(src1);
|
11386
11584
|
|
11387
11585
|
const enum ggml_type type = src0->type;
|
11388
11586
|
ggml_to_float_t const dequantize_row_q = type_traits[type].to_float;
|
@@ -11392,17 +11590,25 @@ static void ggml_compute_forward_get_rows_q(
|
|
11392
11590
|
assert(nb00 == ggml_type_size(type));
|
11393
11591
|
assert(ggml_nrows(dst) == nr);
|
11394
11592
|
|
11395
|
-
|
11396
|
-
|
11397
|
-
for (int64_t i11 = 0; i11 < ne11; ++i11) {
|
11398
|
-
for (int64_t i10 = 0; i10 < ne10; ++i10) {
|
11399
|
-
const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
|
11593
|
+
const int ith = params->ith;
|
11594
|
+
const int nth = params->nth;
|
11400
11595
|
|
11401
|
-
|
11402
|
-
|
11403
|
-
|
11404
|
-
|
11405
|
-
|
11596
|
+
// rows per thread
|
11597
|
+
const int dr = (nr + nth - 1)/nth;
|
11598
|
+
|
11599
|
+
// row range for this thread
|
11600
|
+
const int ir0 = dr*ith;
|
11601
|
+
const int ir1 = MIN(ir0 + dr, nr);
|
11602
|
+
|
11603
|
+
for (int64_t i = ir0; i < ir1; ++i) {
|
11604
|
+
const int64_t i12 = i/(ne11*ne10);
|
11605
|
+
const int64_t i11 = (i - i12*ne11*ne10)/ne10;
|
11606
|
+
const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10);
|
11607
|
+
const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
|
11608
|
+
|
11609
|
+
dequantize_row_q(
|
11610
|
+
(const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03),
|
11611
|
+
(float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc);
|
11406
11612
|
}
|
11407
11613
|
}
|
11408
11614
|
|
@@ -11413,8 +11619,6 @@ static void ggml_compute_forward_get_rows_f16(
|
|
11413
11619
|
const struct ggml_tensor * src0 = dst->src[0];
|
11414
11620
|
const struct ggml_tensor * src1 = dst->src[1];
|
11415
11621
|
|
11416
|
-
assert(params->ith == 0);
|
11417
|
-
|
11418
11622
|
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
|
11419
11623
|
return;
|
11420
11624
|
}
|
@@ -11422,24 +11626,32 @@ static void ggml_compute_forward_get_rows_f16(
|
|
11422
11626
|
GGML_TENSOR_BINARY_OP_LOCALS
|
11423
11627
|
|
11424
11628
|
const int64_t nc = ne00;
|
11425
|
-
const int64_t nr = ggml_nelements(src1);
|
11629
|
+
const int64_t nr = ggml_nelements(src1);
|
11426
11630
|
|
11427
11631
|
assert(ne0 == nc);
|
11428
11632
|
assert(ne02 == ne11);
|
11429
11633
|
assert(nb00 == sizeof(ggml_fp16_t));
|
11430
11634
|
assert(ggml_nrows(dst) == nr);
|
11431
11635
|
|
11432
|
-
|
11433
|
-
|
11434
|
-
for (int64_t i11 = 0; i11 < ne11; ++i11) {
|
11435
|
-
for (int64_t i10 = 0; i10 < ne10; ++i10) {
|
11436
|
-
const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
|
11636
|
+
const int ith = params->ith;
|
11637
|
+
const int nth = params->nth;
|
11437
11638
|
|
11438
|
-
|
11439
|
-
|
11440
|
-
|
11441
|
-
|
11442
|
-
|
11639
|
+
// rows per thread
|
11640
|
+
const int dr = (nr + nth - 1)/nth;
|
11641
|
+
|
11642
|
+
// row range for this thread
|
11643
|
+
const int ir0 = dr*ith;
|
11644
|
+
const int ir1 = MIN(ir0 + dr, nr);
|
11645
|
+
|
11646
|
+
for (int64_t i = ir0; i < ir1; ++i) {
|
11647
|
+
const int64_t i12 = i/(ne11*ne10);
|
11648
|
+
const int64_t i11 = (i - i12*ne11*ne10)/ne10;
|
11649
|
+
const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10);
|
11650
|
+
const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
|
11651
|
+
|
11652
|
+
ggml_fp16_to_fp32_row(
|
11653
|
+
(const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03),
|
11654
|
+
(float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc);
|
11443
11655
|
}
|
11444
11656
|
}
|
11445
11657
|
|
@@ -11450,8 +11662,6 @@ static void ggml_compute_forward_get_rows_f32(
|
|
11450
11662
|
const struct ggml_tensor * src0 = dst->src[0];
|
11451
11663
|
const struct ggml_tensor * src1 = dst->src[1];
|
11452
11664
|
|
11453
|
-
assert(params->ith == 0);
|
11454
|
-
|
11455
11665
|
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
|
11456
11666
|
return;
|
11457
11667
|
}
|
@@ -11459,24 +11669,32 @@ static void ggml_compute_forward_get_rows_f32(
|
|
11459
11669
|
GGML_TENSOR_BINARY_OP_LOCALS
|
11460
11670
|
|
11461
11671
|
const int64_t nc = ne00;
|
11462
|
-
const int64_t nr = ggml_nelements(src1);
|
11672
|
+
const int64_t nr = ggml_nelements(src1);
|
11463
11673
|
|
11464
11674
|
assert(ne0 == nc);
|
11465
11675
|
assert(ne02 == ne11);
|
11466
11676
|
assert(nb00 == sizeof(float));
|
11467
11677
|
assert(ggml_nrows(dst) == nr);
|
11468
11678
|
|
11469
|
-
|
11470
|
-
|
11471
|
-
for (int64_t i11 = 0; i11 < ne11; ++i11) {
|
11472
|
-
for (int64_t i10 = 0; i10 < ne10; ++i10) {
|
11473
|
-
const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
|
11679
|
+
const int ith = params->ith;
|
11680
|
+
const int nth = params->nth;
|
11474
11681
|
|
11475
|
-
|
11476
|
-
|
11477
|
-
|
11478
|
-
|
11479
|
-
|
11682
|
+
// rows per thread
|
11683
|
+
const int dr = (nr + nth - 1)/nth;
|
11684
|
+
|
11685
|
+
// row range for this thread
|
11686
|
+
const int ir0 = dr*ith;
|
11687
|
+
const int ir1 = MIN(ir0 + dr, nr);
|
11688
|
+
|
11689
|
+
for (int64_t i = ir0; i < ir1; ++i) {
|
11690
|
+
const int64_t i12 = i/(ne11*ne10);
|
11691
|
+
const int64_t i11 = (i - i12*ne11*ne10)/ne10;
|
11692
|
+
const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10);
|
11693
|
+
const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
|
11694
|
+
|
11695
|
+
ggml_vec_cpy_f32(nc,
|
11696
|
+
(float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3),
|
11697
|
+
(float *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03));
|
11480
11698
|
}
|
11481
11699
|
}
|
11482
11700
|
|
@@ -12213,6 +12431,8 @@ static void ggml_compute_forward_alibi(
|
|
12213
12431
|
case GGML_TYPE_I8:
|
12214
12432
|
case GGML_TYPE_I16:
|
12215
12433
|
case GGML_TYPE_I32:
|
12434
|
+
case GGML_TYPE_I64:
|
12435
|
+
case GGML_TYPE_F64:
|
12216
12436
|
case GGML_TYPE_COUNT:
|
12217
12437
|
{
|
12218
12438
|
GGML_ASSERT(false);
|
@@ -12299,6 +12519,8 @@ static void ggml_compute_forward_clamp(
|
|
12299
12519
|
case GGML_TYPE_I8:
|
12300
12520
|
case GGML_TYPE_I16:
|
12301
12521
|
case GGML_TYPE_I32:
|
12522
|
+
case GGML_TYPE_I64:
|
12523
|
+
case GGML_TYPE_F64:
|
12302
12524
|
case GGML_TYPE_COUNT:
|
12303
12525
|
{
|
12304
12526
|
GGML_ASSERT(false);
|
@@ -13547,6 +13769,106 @@ static void ggml_compute_forward_pad(
|
|
13547
13769
|
}
|
13548
13770
|
}
|
13549
13771
|
|
13772
|
+
|
13773
|
+
// ggml_compute_forward_arange
|
13774
|
+
|
13775
|
+
static void ggml_compute_forward_arange_f32(
|
13776
|
+
const struct ggml_compute_params * params,
|
13777
|
+
struct ggml_tensor * dst) {
|
13778
|
+
|
13779
|
+
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
|
13780
|
+
return;
|
13781
|
+
}
|
13782
|
+
|
13783
|
+
GGML_ASSERT(dst->nb[0] == sizeof(float));
|
13784
|
+
|
13785
|
+
const int ith = params->ith;
|
13786
|
+
const int nth = params->nth;
|
13787
|
+
|
13788
|
+
const float start = ggml_get_op_params_f32(dst, 0);
|
13789
|
+
const float stop = ggml_get_op_params_f32(dst, 1);
|
13790
|
+
const float step = ggml_get_op_params_f32(dst, 2);
|
13791
|
+
|
13792
|
+
const int64_t steps = (int64_t) ceilf((stop - start) / step);
|
13793
|
+
|
13794
|
+
GGML_ASSERT(ggml_nelements(dst) == steps);
|
13795
|
+
|
13796
|
+
for (int64_t i = ith; i < steps; i+= nth) {
|
13797
|
+
float value = start + step * i;
|
13798
|
+
((float *)dst->data)[i] = value;
|
13799
|
+
}
|
13800
|
+
}
|
13801
|
+
|
13802
|
+
static void ggml_compute_forward_arange(
|
13803
|
+
const struct ggml_compute_params * params,
|
13804
|
+
struct ggml_tensor * dst) {
|
13805
|
+
switch (dst->type) {
|
13806
|
+
case GGML_TYPE_F32:
|
13807
|
+
{
|
13808
|
+
ggml_compute_forward_arange_f32(params, dst);
|
13809
|
+
} break;
|
13810
|
+
default:
|
13811
|
+
{
|
13812
|
+
GGML_ASSERT(false);
|
13813
|
+
} break;
|
13814
|
+
}
|
13815
|
+
}
|
13816
|
+
|
13817
|
+
static void ggml_compute_forward_timestep_embedding_f32(
|
13818
|
+
const struct ggml_compute_params * params,
|
13819
|
+
struct ggml_tensor * dst) {
|
13820
|
+
|
13821
|
+
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
|
13822
|
+
return;
|
13823
|
+
}
|
13824
|
+
|
13825
|
+
const struct ggml_tensor * src0 = dst->src[0];
|
13826
|
+
|
13827
|
+
GGML_ASSERT(src0->nb[0] == sizeof(float));
|
13828
|
+
|
13829
|
+
const int ith = params->ith;
|
13830
|
+
const int nth = params->nth;
|
13831
|
+
|
13832
|
+
GGML_TENSOR_UNARY_OP_LOCALS
|
13833
|
+
|
13834
|
+
const int dim = ggml_get_op_params_i32(dst, 0);
|
13835
|
+
const int max_period = ggml_get_op_params_i32(dst, 1);
|
13836
|
+
|
13837
|
+
int half = dim / 2;
|
13838
|
+
|
13839
|
+
for (int64_t i = 0; i < ne00; i++) {
|
13840
|
+
float * embed_data = (float *)((char *) dst->data + i*nb1);
|
13841
|
+
for (int64_t j = ith; j < half; j += nth) {
|
13842
|
+
float timestep = ((float *)src0->data)[i];
|
13843
|
+
float freq = (float)expf(-logf(max_period) * j / half);
|
13844
|
+
float arg = timestep * freq;
|
13845
|
+
embed_data[j] = cosf(arg);
|
13846
|
+
embed_data[j + half] = sinf(arg);
|
13847
|
+
}
|
13848
|
+
if (dim % 2 != 0 && ith == 0) {
|
13849
|
+
embed_data[dim] = 0.f;
|
13850
|
+
}
|
13851
|
+
}
|
13852
|
+
}
|
13853
|
+
|
13854
|
+
static void ggml_compute_forward_timestep_embedding(
|
13855
|
+
const struct ggml_compute_params * params,
|
13856
|
+
struct ggml_tensor * dst) {
|
13857
|
+
|
13858
|
+
const struct ggml_tensor * src0 = dst->src[0];
|
13859
|
+
|
13860
|
+
switch (src0->type) {
|
13861
|
+
case GGML_TYPE_F32:
|
13862
|
+
{
|
13863
|
+
ggml_compute_forward_timestep_embedding_f32(params, dst);
|
13864
|
+
} break;
|
13865
|
+
default:
|
13866
|
+
{
|
13867
|
+
GGML_ASSERT(false);
|
13868
|
+
} break;
|
13869
|
+
}
|
13870
|
+
}
|
13871
|
+
|
13550
13872
|
// ggml_compute_forward_argsort
|
13551
13873
|
|
13552
13874
|
static void ggml_compute_forward_argsort_f32(
|
@@ -14590,6 +14912,257 @@ static void ggml_compute_forward_flash_attn_back(
|
|
14590
14912
|
}
|
14591
14913
|
}
|
14592
14914
|
|
14915
|
+
// ggml_compute_forward_ssm_conv
|
14916
|
+
|
14917
|
+
static void ggml_compute_forward_ssm_conv_f32(
|
14918
|
+
const struct ggml_compute_params * params,
|
14919
|
+
struct ggml_tensor * dst) {
|
14920
|
+
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
|
14921
|
+
return;
|
14922
|
+
}
|
14923
|
+
|
14924
|
+
const struct ggml_tensor * src0 = dst->src[0]; // conv_state
|
14925
|
+
const struct ggml_tensor * src1 = dst->src[1]; // x
|
14926
|
+
const struct ggml_tensor * src2 = dst->src[2]; // conv1d.weight
|
14927
|
+
const struct ggml_tensor * src3 = dst->src[3]; // state_seq
|
14928
|
+
|
14929
|
+
const int ith = params->ith;
|
14930
|
+
const int nth = params->nth;
|
14931
|
+
|
14932
|
+
const int nc = src2->ne[0]; // d_conv
|
14933
|
+
const int nr = src0->ne[1]; // d_inner
|
14934
|
+
const int n_t = src1->ne[1]; // n_tokens
|
14935
|
+
const int n_kv = src0->ne[2]; // max number of sequences in the batch
|
14936
|
+
|
14937
|
+
GGML_ASSERT((nr*n_t) + (nc*nr*n_kv) == ggml_nelements(dst));
|
14938
|
+
GGML_ASSERT(src0->nb[0] == sizeof(float));
|
14939
|
+
GGML_ASSERT(src1->nb[0] == sizeof(float));
|
14940
|
+
GGML_ASSERT(src2->nb[0] == sizeof(float));
|
14941
|
+
GGML_ASSERT(src3->nb[0] == sizeof(int32_t));
|
14942
|
+
GGML_ASSERT(src0->nb[1] == src0->ne[0]*sizeof(float));
|
14943
|
+
// for use with the destination state offset between sequences
|
14944
|
+
GGML_ASSERT(src2->nb[2] == src2->ne[1]*src2->ne[0]*sizeof(float));
|
14945
|
+
|
14946
|
+
// rows per thread
|
14947
|
+
const int dr = (nr + nth - 1)/nth;
|
14948
|
+
|
14949
|
+
// row range for this thread
|
14950
|
+
const int ir0 = dr*ith;
|
14951
|
+
const int ir1 = MIN(ir0 + dr, nr);
|
14952
|
+
const int ir = ir1 - ir0;
|
14953
|
+
|
14954
|
+
if (n_kv > 1) {
|
14955
|
+
// multiple sequences means it's hard to know when it's the first time a state is read,
|
14956
|
+
// so copy them all over to the destination, just to be sure.
|
14957
|
+
for (int i3 = 0; i3 < n_kv; ++i3) {
|
14958
|
+
float * s0 = (float *) ((char *) src0->data + ir0*(src0->nb[1]) + i3*(src0->nb[2]));
|
14959
|
+
float * s = (float *) ((char *) dst->data + ir0*(src2->nb[1]) + i3*(src2->nb[2]) + nr*n_t*sizeof(float));
|
14960
|
+
// can't use memcpy because of d_conv vs d_conv - 1
|
14961
|
+
for (int i1 = 0; i1 < ir; ++i1) {
|
14962
|
+
for (int i0 = 0; i0 < nc - 1; ++i0) {
|
14963
|
+
// copy s0 to last (d_conv - 1) columns of s
|
14964
|
+
s[1 + i0 + i1*nc] = s0[i0 + i1*(nc - 1)];
|
14965
|
+
}
|
14966
|
+
}
|
14967
|
+
}
|
14968
|
+
}
|
14969
|
+
|
14970
|
+
for (int i2 = 0; i2 < n_t; ++i2) {
|
14971
|
+
int32_t * sq = (int32_t *) ((char *) src3->data + i2*(src3->nb[1])); // {n_kv, n_tokens}
|
14972
|
+
float * x = (float *) ((char *) dst->data + ir0*sizeof(float) + i2*(nr*sizeof(float))); // {d_inner, n_tokens}
|
14973
|
+
float * s = (float *) ((char *) dst->data + ir0*(src2->nb[1]) + sq[0]*(src2->nb[2]) + nr*n_t*sizeof(float)); // {d_conv, d_inner, n_kv}
|
14974
|
+
float * s0; // {d_conv - 1, d_inner, n_kv}
|
14975
|
+
float * x0 = (float *) ((char *) src1->data + ir0*(src1->nb[0]) + i2*(src1->nb[1])); // {d_inner, n_tokens}
|
14976
|
+
float * c = (float *) ((char *) src2->data + ir0*(src2->nb[1])); // {d_conv, d_inner}
|
14977
|
+
int ne0s0;
|
14978
|
+
|
14979
|
+
GGML_ASSERT(0 <= sq[0] && sq[0] < n_kv);
|
14980
|
+
|
14981
|
+
// avoid needing to copy the state for the first token
|
14982
|
+
if (i2 == 0) {
|
14983
|
+
s0 = (float *) ((char *) src0->data + ir0*(src0->nb[1]) + sq[0]*(src0->nb[2])); // {d_conv - 1, d_inner, n_kv}
|
14984
|
+
ne0s0 = src0->ne[0];
|
14985
|
+
} else {
|
14986
|
+
// the source is the last (d_conv - 1) columns of the destination
|
14987
|
+
s0 = s + 1;
|
14988
|
+
ne0s0 = nc;
|
14989
|
+
}
|
14990
|
+
|
14991
|
+
// d_inner
|
14992
|
+
for (int i1 = 0; i1 < ir; ++i1) {
|
14993
|
+
// shift state left
|
14994
|
+
for (int i0 = 0; i0 < nc - 1; ++i0) {
|
14995
|
+
s[i0 + i1*nc] = s0[i0 + i1*ne0s0];
|
14996
|
+
}
|
14997
|
+
// insert x on the last column
|
14998
|
+
s[(nc - 1) + i1*nc] = x0[i1];
|
14999
|
+
}
|
15000
|
+
|
15001
|
+
// handle copies when there are multiple output states
|
15002
|
+
for (int i3 = 1; i3 < n_kv; ++i3) {
|
15003
|
+
int32_t seq = sq[i3];
|
15004
|
+
if (0 <= seq && seq < n_kv) {
|
15005
|
+
float * s1 = s + (seq - sq[0])*nc*nr;
|
15006
|
+
memcpy(s1, s, nc*ir*sizeof(float));
|
15007
|
+
} else {
|
15008
|
+
// stop at negative or too big seq_ids
|
15009
|
+
break;
|
15010
|
+
}
|
15011
|
+
}
|
15012
|
+
|
15013
|
+
// it seems a little faster when this is separate from the state shift
|
15014
|
+
for (int i1 = 0; i1 < ir; ++i1) {
|
15015
|
+
// rowwise dot product
|
15016
|
+
float sumf = 0.0f;
|
15017
|
+
for (int i0 = 0; i0 < nc; ++i0) {
|
15018
|
+
int i = i0 + i1*nc;
|
15019
|
+
sumf += s[i] * c[i];
|
15020
|
+
}
|
15021
|
+
x[i1] = sumf;
|
15022
|
+
}
|
15023
|
+
}
|
15024
|
+
}
|
15025
|
+
|
15026
|
+
static void ggml_compute_forward_ssm_conv(
|
15027
|
+
const struct ggml_compute_params * params,
|
15028
|
+
struct ggml_tensor * dst) {
|
15029
|
+
switch (dst->src[0]->type) {
|
15030
|
+
case GGML_TYPE_F32:
|
15031
|
+
{
|
15032
|
+
ggml_compute_forward_ssm_conv_f32(params, dst);
|
15033
|
+
} break;
|
15034
|
+
default:
|
15035
|
+
{
|
15036
|
+
GGML_ASSERT(false);
|
15037
|
+
} break;
|
15038
|
+
}
|
15039
|
+
}
|
15040
|
+
|
15041
|
+
// ggml_compute_forward_ssm_scan
|
15042
|
+
|
15043
|
+
static void ggml_compute_forward_ssm_scan_f32(
|
15044
|
+
const struct ggml_compute_params * params,
|
15045
|
+
struct ggml_tensor * dst) {
|
15046
|
+
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
|
15047
|
+
return;
|
15048
|
+
}
|
15049
|
+
|
15050
|
+
const struct ggml_tensor * src0 = dst->src[0]; // s
|
15051
|
+
const struct ggml_tensor * src1 = dst->src[1]; // x
|
15052
|
+
const struct ggml_tensor * src2 = dst->src[2]; // dt
|
15053
|
+
const struct ggml_tensor * src3 = dst->src[3]; // A
|
15054
|
+
const struct ggml_tensor * src4 = dst->src[4]; // B
|
15055
|
+
const struct ggml_tensor * src5 = dst->src[5]; // C
|
15056
|
+
const struct ggml_tensor * src6 = dst->src[6]; // sq
|
15057
|
+
|
15058
|
+
const int ith = params->ith;
|
15059
|
+
const int nth = params->nth;
|
15060
|
+
|
15061
|
+
const int64_t nc = src0->ne[0]; // d_state
|
15062
|
+
const int64_t nr = src0->ne[1]; // d_inner
|
15063
|
+
const int64_t n_t = src1->ne[1]; // number of tokens in the batch
|
15064
|
+
const int64_t n_kv = src0->ne[2]; // max number of sequences in the batch
|
15065
|
+
|
15066
|
+
GGML_ASSERT(ggml_nelements(src1) + ggml_nelements(src0) == ggml_nelements(dst));
|
15067
|
+
GGML_ASSERT(src0->nb[0] == sizeof(float));
|
15068
|
+
GGML_ASSERT(src1->nb[0] == sizeof(float));
|
15069
|
+
GGML_ASSERT(src2->nb[0] == sizeof(float));
|
15070
|
+
GGML_ASSERT(src3->nb[0] == sizeof(float));
|
15071
|
+
GGML_ASSERT(src4->nb[0] == sizeof(float));
|
15072
|
+
GGML_ASSERT(src5->nb[0] == sizeof(float));
|
15073
|
+
// required for the dot product between s and C, and when copying the states
|
15074
|
+
GGML_ASSERT(src0->nb[1] == src0->ne[0]*sizeof(float));
|
15075
|
+
// required for per-sequence offsets for states
|
15076
|
+
GGML_ASSERT(src0->nb[2] == src0->ne[0]*src0->ne[1]*sizeof(float));
|
15077
|
+
// required to get correct offset for state destination (i.e. src1->nb[2])
|
15078
|
+
GGML_ASSERT(src1->nb[2] == src1->ne[0]*src1->ne[1]*sizeof(float));
|
15079
|
+
|
15080
|
+
// rows per thread
|
15081
|
+
const int dr = (nr + nth - 1)/nth;
|
15082
|
+
|
15083
|
+
// row range for this thread
|
15084
|
+
const int ir0 = dr*ith;
|
15085
|
+
const int ir1 = MIN(ir0 + dr, nr);
|
15086
|
+
const int ir = ir1 - ir0;
|
15087
|
+
|
15088
|
+
if (n_kv > 1) {
|
15089
|
+
// it's hard to know if the source states have already been copied
|
15090
|
+
// when there are multiple, so copy them already.
|
15091
|
+
for (int i3 = 0; i3 < n_kv; ++i3) {
|
15092
|
+
float * s0 = (float *) ((char *) src0->data + ir0*(src0->nb[1]) + i3*(src0->nb[2]));
|
15093
|
+
float * s = (float *) ((char *) dst->data + ir0*(src0->nb[1]) + i3*(src0->nb[2]) + src1->nb[2]);
|
15094
|
+
memcpy(s, s0, nc*ir*sizeof(float));
|
15095
|
+
}
|
15096
|
+
}
|
15097
|
+
|
15098
|
+
for (int i2 = 0; i2 < n_t; ++i2) {
|
15099
|
+
int32_t * sq = (int32_t *) ((char *) src6->data + i2*(src6->nb[1])); // {n_kv, n_tokens}
|
15100
|
+
float * y = (float *) ((char *) dst->data + ir0*(src1->nb[0]) + i2*(src1->nb[1])); // {d_inner, n_tokens}
|
15101
|
+
float * s = (float *) ((char *) dst->data + ir0*(src0->nb[1]) + sq[0]*(src0->nb[2]) + src1->nb[2]); // {d_state, d_inner, n_kv}
|
15102
|
+
float * s0;
|
15103
|
+
float * x = (float *) ((char *) src1->data + ir0*(src1->nb[0]) + i2*(src1->nb[1])); // {d_inner, n_tokens}
|
15104
|
+
float * dt = (float *) ((char *) src2->data + ir0*(src2->nb[0]) + i2*(src2->nb[1])); // {d_inner, n_tokens}
|
15105
|
+
float * A = (float *) ((char *) src3->data + ir0*(src3->nb[1])); // {d_state, d_inner}
|
15106
|
+
float * B = (float *) ((char *) src4->data + i2*(src4->nb[1])); // {d_state, n_tokens}
|
15107
|
+
float * C = (float *) ((char *) src5->data + i2*(src5->nb[1])); // {d_state, n_tokens}
|
15108
|
+
|
15109
|
+
GGML_ASSERT(0 <= sq[0] && sq[0] < n_kv);
|
15110
|
+
|
15111
|
+
// avoid needing to copy the state for the first token
|
15112
|
+
if (i2 == 0) {
|
15113
|
+
s0 = (float *) ((char *) src0->data + ir0*(src0->nb[1]) + sq[0]*(src0->nb[2])); // {d_state, d_inner, n_kv}
|
15114
|
+
} else {
|
15115
|
+
// otherwise the source is the same as the destination
|
15116
|
+
s0 = s;
|
15117
|
+
}
|
15118
|
+
|
15119
|
+
// d_inner
|
15120
|
+
for (int i1 = 0; i1 < ir; ++i1) {
|
15121
|
+
// ref: https://github.com/state-spaces/mamba/blob/34076d664838588a3c97727b263478ab9f621a07/mamba_ssm/ops/triton/selective_state_update.py#L78
|
15122
|
+
float dt_soft_plus = dt[i1] <= 20.0f ? log1pf(expf(dt[i1])) : dt[i1];
|
15123
|
+
float x_dt = x[i1] * dt_soft_plus;
|
15124
|
+
float sumf = 0.0f;
|
15125
|
+
// d_state
|
15126
|
+
for (int i0 = 0; i0 < nc; ++i0) {
|
15127
|
+
int i = i0 + i1*nc;
|
15128
|
+
// state = prev_state * dA + dB * x
|
15129
|
+
float state = (s0[i] * expf(dt_soft_plus * A[i])) + (B[i0] * x_dt);
|
15130
|
+
// y = rowwise_dotprod(state, C)
|
15131
|
+
sumf += state * C[i0];
|
15132
|
+
s[i] = state;
|
15133
|
+
}
|
15134
|
+
y[i1] = sumf;
|
15135
|
+
}
|
15136
|
+
|
15137
|
+
// handle copies when there are multiple output states
|
15138
|
+
for (int i3 = 1; i3 < n_kv; ++i3) {
|
15139
|
+
int32_t seq = sq[i3];
|
15140
|
+
if (0 <= seq && seq < n_kv) {
|
15141
|
+
float * s1 = s + (seq - sq[0])*nc*nr;
|
15142
|
+
memcpy(s1, s, nc*ir*sizeof(float));
|
15143
|
+
} else {
|
15144
|
+
// stop at negative or too big seq_ids
|
15145
|
+
break;
|
15146
|
+
}
|
15147
|
+
}
|
15148
|
+
}
|
15149
|
+
}
|
15150
|
+
|
15151
|
+
static void ggml_compute_forward_ssm_scan(
|
15152
|
+
const struct ggml_compute_params * params,
|
15153
|
+
struct ggml_tensor * dst) {
|
15154
|
+
switch (dst->src[0]->type) {
|
15155
|
+
case GGML_TYPE_F32:
|
15156
|
+
{
|
15157
|
+
ggml_compute_forward_ssm_scan_f32(params, dst);
|
15158
|
+
} break;
|
15159
|
+
default:
|
15160
|
+
{
|
15161
|
+
GGML_ASSERT(false);
|
15162
|
+
} break;
|
15163
|
+
}
|
15164
|
+
}
|
15165
|
+
|
14593
15166
|
// ggml_compute_forward_win_part
|
14594
15167
|
|
14595
15168
|
static void ggml_compute_forward_win_part_f32(
|
@@ -15615,6 +16188,14 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
|
|
15615
16188
|
{
|
15616
16189
|
ggml_compute_forward_pad(params, tensor);
|
15617
16190
|
} break;
|
16191
|
+
case GGML_OP_ARANGE:
|
16192
|
+
{
|
16193
|
+
ggml_compute_forward_arange(params, tensor);
|
16194
|
+
} break;
|
16195
|
+
case GGML_OP_TIMESTEP_EMBEDDING:
|
16196
|
+
{
|
16197
|
+
ggml_compute_forward_timestep_embedding(params, tensor);
|
16198
|
+
} break;
|
15618
16199
|
case GGML_OP_ARGSORT:
|
15619
16200
|
{
|
15620
16201
|
ggml_compute_forward_argsort(params, tensor);
|
@@ -15641,6 +16222,14 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
|
|
15641
16222
|
bool masked = t != 0;
|
15642
16223
|
ggml_compute_forward_flash_attn_back(params, masked, tensor);
|
15643
16224
|
} break;
|
16225
|
+
case GGML_OP_SSM_CONV:
|
16226
|
+
{
|
16227
|
+
ggml_compute_forward_ssm_conv(params, tensor);
|
16228
|
+
} break;
|
16229
|
+
case GGML_OP_SSM_SCAN:
|
16230
|
+
{
|
16231
|
+
ggml_compute_forward_ssm_scan(params, tensor);
|
16232
|
+
} break;
|
15644
16233
|
case GGML_OP_WIN_PART:
|
15645
16234
|
{
|
15646
16235
|
ggml_compute_forward_win_part(params, tensor);
|
@@ -16617,6 +17206,14 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
|
|
16617
17206
|
{
|
16618
17207
|
GGML_ASSERT(false); // TODO: not implemented
|
16619
17208
|
} break;
|
17209
|
+
case GGML_OP_ARANGE:
|
17210
|
+
{
|
17211
|
+
GGML_ASSERT(false); // TODO: not implemented
|
17212
|
+
} break;
|
17213
|
+
case GGML_OP_TIMESTEP_EMBEDDING:
|
17214
|
+
{
|
17215
|
+
GGML_ASSERT(false); // TODO: not implemented
|
17216
|
+
} break;
|
16620
17217
|
case GGML_OP_ARGSORT:
|
16621
17218
|
{
|
16622
17219
|
GGML_ASSERT(false); // TODO: not implemented
|
@@ -16687,6 +17284,11 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
|
|
16687
17284
|
{
|
16688
17285
|
GGML_ASSERT(false); // not supported
|
16689
17286
|
} break;
|
17287
|
+
case GGML_OP_SSM_CONV:
|
17288
|
+
case GGML_OP_SSM_SCAN:
|
17289
|
+
{
|
17290
|
+
GGML_ASSERT(false); // TODO: not implemented
|
17291
|
+
} break;
|
16690
17292
|
case GGML_OP_WIN_PART:
|
16691
17293
|
case GGML_OP_WIN_UNPART:
|
16692
17294
|
case GGML_OP_UNARY:
|
@@ -17217,6 +17819,7 @@ struct ggml_compute_state {
|
|
17217
17819
|
ggml_thread_t thrd;
|
17218
17820
|
int ith;
|
17219
17821
|
struct ggml_compute_state_shared * shared;
|
17822
|
+
enum ggml_status ec;
|
17220
17823
|
};
|
17221
17824
|
|
17222
17825
|
static void ggml_graph_compute_perf_stats_node(struct ggml_tensor * node, const struct ggml_compute_state_shared * st) {
|
@@ -17228,7 +17831,7 @@ static void ggml_graph_compute_perf_stats_node(struct ggml_tensor * node, const
|
|
17228
17831
|
node->perf_time_us += time_us_cur;
|
17229
17832
|
}
|
17230
17833
|
|
17231
|
-
static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
|
17834
|
+
static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads, int n_cur_threads) {
|
17232
17835
|
int n_tasks = 0;
|
17233
17836
|
|
17234
17837
|
switch (node->op) {
|
@@ -17309,6 +17912,12 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
|
|
17309
17912
|
{
|
17310
17913
|
n_tasks = n_threads;
|
17311
17914
|
} break;
|
17915
|
+
case GGML_OP_GET_ROWS:
|
17916
|
+
{
|
17917
|
+
// FIXME: the cost of launching additional threads decreases performance with GPU offloading
|
17918
|
+
//n_tasks = MIN(n_threads, ggml_nelements(node->src[1]));
|
17919
|
+
n_tasks = MIN(n_cur_threads, ggml_nelements(node->src[1]));
|
17920
|
+
} break;
|
17312
17921
|
case GGML_OP_SCALE:
|
17313
17922
|
case GGML_OP_SET:
|
17314
17923
|
case GGML_OP_CONT:
|
@@ -17316,7 +17925,6 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
|
|
17316
17925
|
case GGML_OP_VIEW:
|
17317
17926
|
case GGML_OP_PERMUTE:
|
17318
17927
|
case GGML_OP_TRANSPOSE:
|
17319
|
-
case GGML_OP_GET_ROWS:
|
17320
17928
|
case GGML_OP_GET_ROWS_BACK:
|
17321
17929
|
case GGML_OP_DIAG:
|
17322
17930
|
{
|
@@ -17368,6 +17976,14 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
|
|
17368
17976
|
{
|
17369
17977
|
n_tasks = n_threads;
|
17370
17978
|
} break;
|
17979
|
+
case GGML_OP_ARANGE:
|
17980
|
+
{
|
17981
|
+
n_tasks = n_threads;
|
17982
|
+
} break;
|
17983
|
+
case GGML_OP_TIMESTEP_EMBEDDING:
|
17984
|
+
{
|
17985
|
+
n_tasks = n_threads;
|
17986
|
+
} break;
|
17371
17987
|
case GGML_OP_ARGSORT:
|
17372
17988
|
{
|
17373
17989
|
n_tasks = n_threads;
|
@@ -17384,6 +18000,11 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
|
|
17384
18000
|
{
|
17385
18001
|
n_tasks = n_threads;
|
17386
18002
|
} break;
|
18003
|
+
case GGML_OP_SSM_CONV:
|
18004
|
+
case GGML_OP_SSM_SCAN:
|
18005
|
+
{
|
18006
|
+
n_tasks = n_threads;
|
18007
|
+
} break;
|
17387
18008
|
case GGML_OP_WIN_PART:
|
17388
18009
|
case GGML_OP_WIN_UNPART:
|
17389
18010
|
case GGML_OP_GET_REL_POS:
|
@@ -17502,7 +18123,8 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
|
|
17502
18123
|
while (true) {
|
17503
18124
|
if (cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) {
|
17504
18125
|
state->shared->node_n += 1;
|
17505
|
-
|
18126
|
+
state->ec = GGML_STATUS_ABORTED;
|
18127
|
+
return 0;
|
17506
18128
|
}
|
17507
18129
|
|
17508
18130
|
if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) {
|
@@ -17520,7 +18142,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
|
|
17520
18142
|
/* FINALIZE */
|
17521
18143
|
struct ggml_tensor * node = cgraph->nodes[node_n];
|
17522
18144
|
if (GGML_OP_HAS_FINALIZE[node->op]) {
|
17523
|
-
params.nth = ggml_get_n_tasks(node, n_threads);
|
18145
|
+
params.nth = ggml_get_n_tasks(node, n_threads, state->shared->n_threads);
|
17524
18146
|
ggml_compute_forward(¶ms, node);
|
17525
18147
|
}
|
17526
18148
|
ggml_graph_compute_perf_stats_node(node, state->shared);
|
@@ -17530,7 +18152,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
|
|
17530
18152
|
while (++node_n < cgraph->n_nodes) {
|
17531
18153
|
GGML_PRINT_DEBUG_5("%s: %d/%d\n", __func__, node_n, cgraph->n_nodes);
|
17532
18154
|
struct ggml_tensor * node = cgraph->nodes[node_n];
|
17533
|
-
const int n_tasks = ggml_get_n_tasks(node, n_threads);
|
18155
|
+
const int n_tasks = ggml_get_n_tasks(node, n_threads, state->shared->n_threads);
|
17534
18156
|
|
17535
18157
|
state->shared->perf_node_start_cycles = ggml_perf_cycles();
|
17536
18158
|
state->shared->perf_node_start_time_us = ggml_perf_time_us();
|
@@ -17578,7 +18200,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
|
|
17578
18200
|
|
17579
18201
|
/* INIT & COMPUTE */
|
17580
18202
|
struct ggml_tensor * node = cgraph->nodes[node_n];
|
17581
|
-
const int n_tasks = ggml_get_n_tasks(node, n_threads);
|
18203
|
+
const int n_tasks = ggml_get_n_tasks(node, n_threads, state->shared->n_threads);
|
17582
18204
|
|
17583
18205
|
struct ggml_compute_params params = {
|
17584
18206
|
/*.type =*/ GGML_TASK_TYPE_INIT,
|
@@ -17624,7 +18246,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
|
|
17624
18246
|
}
|
17625
18247
|
}
|
17626
18248
|
|
17627
|
-
return
|
18249
|
+
return 0;
|
17628
18250
|
}
|
17629
18251
|
|
17630
18252
|
struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threads) {
|
@@ -17643,7 +18265,7 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
|
|
17643
18265
|
for (int i = 0; i < cgraph->n_nodes; i++) {
|
17644
18266
|
struct ggml_tensor * node = cgraph->nodes[i];
|
17645
18267
|
|
17646
|
-
const int n_tasks = ggml_get_n_tasks(node, n_threads);
|
18268
|
+
const int n_tasks = ggml_get_n_tasks(node, n_threads, 1);
|
17647
18269
|
|
17648
18270
|
max_tasks = MAX(max_tasks, n_tasks);
|
17649
18271
|
|
@@ -17820,7 +18442,7 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
|
|
17820
18442
|
return cplan;
|
17821
18443
|
}
|
17822
18444
|
|
17823
|
-
|
18445
|
+
enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
|
17824
18446
|
{
|
17825
18447
|
GGML_ASSERT(cplan);
|
17826
18448
|
GGML_ASSERT(cplan->n_threads > 0);
|
@@ -17864,6 +18486,7 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
|
|
17864
18486
|
.thrd = 0,
|
17865
18487
|
.ith = j,
|
17866
18488
|
.shared = &state_shared,
|
18489
|
+
.ec = GGML_STATUS_SUCCESS,
|
17867
18490
|
};
|
17868
18491
|
|
17869
18492
|
const int rc = ggml_thread_create(&workers[j].thrd, NULL, ggml_graph_compute_thread, &workers[j]);
|
@@ -17874,12 +18497,14 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
|
|
17874
18497
|
|
17875
18498
|
workers[0].ith = 0;
|
17876
18499
|
workers[0].shared = &state_shared;
|
18500
|
+
workers[0].ec = GGML_STATUS_SUCCESS;
|
17877
18501
|
|
17878
18502
|
const int64_t perf_start_cycles = ggml_perf_cycles();
|
17879
18503
|
const int64_t perf_start_time_us = ggml_perf_time_us();
|
17880
18504
|
|
17881
18505
|
// this is a work thread too
|
17882
|
-
|
18506
|
+
ggml_graph_compute_thread(&workers[0]);
|
18507
|
+
enum ggml_status compute_status = workers[0].ec;
|
17883
18508
|
|
17884
18509
|
// don't leave affinity set on the main thread
|
17885
18510
|
clear_numa_thread_affinity();
|
@@ -17889,6 +18514,8 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
|
|
17889
18514
|
for (int j = 1; j < n_threads; j++) {
|
17890
18515
|
const int rc = ggml_thread_join(workers[j].thrd, NULL);
|
17891
18516
|
GGML_ASSERT(rc == 0);
|
18517
|
+
if (workers[j].ec != GGML_STATUS_SUCCESS)
|
18518
|
+
compute_status = workers[j].ec;
|
17892
18519
|
}
|
17893
18520
|
}
|
17894
18521
|
|
@@ -17916,14 +18543,14 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
|
|
17916
18543
|
return compute_status;
|
17917
18544
|
}
|
17918
18545
|
|
17919
|
-
|
18546
|
+
enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads) {
|
17920
18547
|
struct ggml_cplan cplan = ggml_graph_plan(cgraph, n_threads);
|
17921
18548
|
|
17922
18549
|
struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_TYPE_WORK_BUFFER, cplan.work_size);
|
17923
18550
|
|
17924
18551
|
cplan.work_data = (uint8_t *)ctx->mem_buffer + obj->offs;
|
17925
18552
|
|
17926
|
-
ggml_graph_compute(cgraph, &cplan);
|
18553
|
+
return ggml_graph_compute(cgraph, &cplan);
|
17927
18554
|
}
|
17928
18555
|
|
17929
18556
|
struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name) {
|
@@ -19572,133 +20199,6 @@ void ggml_quantize_free(void) {
|
|
19572
20199
|
ggml_critical_section_end();
|
19573
20200
|
}
|
19574
20201
|
|
19575
|
-
size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist) {
|
19576
|
-
assert(k % QK4_0 == 0);
|
19577
|
-
const int nb = k / QK4_0;
|
19578
|
-
|
19579
|
-
for (int b = 0; b < n; b += k) {
|
19580
|
-
block_q4_0 * restrict y = (block_q4_0 *) dst + b/QK4_0;
|
19581
|
-
|
19582
|
-
quantize_row_q4_0_reference(src + b, y, k);
|
19583
|
-
|
19584
|
-
for (int i = 0; i < nb; i++) {
|
19585
|
-
for (int j = 0; j < QK4_0; j += 2) {
|
19586
|
-
const uint8_t vi0 = y[i].qs[j/2] & 0x0F;
|
19587
|
-
const uint8_t vi1 = y[i].qs[j/2] >> 4;
|
19588
|
-
|
19589
|
-
hist[vi0]++;
|
19590
|
-
hist[vi1]++;
|
19591
|
-
}
|
19592
|
-
}
|
19593
|
-
}
|
19594
|
-
|
19595
|
-
return (n/QK4_0*sizeof(block_q4_0));
|
19596
|
-
}
|
19597
|
-
|
19598
|
-
size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist) {
|
19599
|
-
assert(k % QK4_1 == 0);
|
19600
|
-
const int nb = k / QK4_1;
|
19601
|
-
|
19602
|
-
for (int b = 0; b < n; b += k) {
|
19603
|
-
block_q4_1 * restrict y = (block_q4_1 *) dst + b/QK4_1;
|
19604
|
-
|
19605
|
-
quantize_row_q4_1_reference(src + b, y, k);
|
19606
|
-
|
19607
|
-
for (int i = 0; i < nb; i++) {
|
19608
|
-
for (int j = 0; j < QK4_1; j += 2) {
|
19609
|
-
const uint8_t vi0 = y[i].qs[j/2] & 0x0F;
|
19610
|
-
const uint8_t vi1 = y[i].qs[j/2] >> 4;
|
19611
|
-
|
19612
|
-
hist[vi0]++;
|
19613
|
-
hist[vi1]++;
|
19614
|
-
}
|
19615
|
-
}
|
19616
|
-
}
|
19617
|
-
|
19618
|
-
return (n/QK4_1*sizeof(block_q4_1));
|
19619
|
-
}
|
19620
|
-
|
19621
|
-
size_t ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t * hist) {
|
19622
|
-
assert(k % QK5_0 == 0);
|
19623
|
-
const int nb = k / QK5_0;
|
19624
|
-
|
19625
|
-
for (int b = 0; b < n; b += k) {
|
19626
|
-
block_q5_0 * restrict y = (block_q5_0 *)dst + b/QK5_0;
|
19627
|
-
|
19628
|
-
quantize_row_q5_0_reference(src + b, y, k);
|
19629
|
-
|
19630
|
-
for (int i = 0; i < nb; i++) {
|
19631
|
-
uint32_t qh;
|
19632
|
-
memcpy(&qh, &y[i].qh, sizeof(qh));
|
19633
|
-
|
19634
|
-
for (int j = 0; j < QK5_0; j += 2) {
|
19635
|
-
const uint8_t vh0 = ((qh & (1u << (j/2 + 0 ))) >> (j/2 + 0 )) << 4;
|
19636
|
-
const uint8_t vh1 = ((qh & (1u << (j/2 + 16))) >> (j/2 + 12));
|
19637
|
-
|
19638
|
-
// cast to 16 bins
|
19639
|
-
const uint8_t vi0 = ((y[i].qs[j/2] & 0x0F) | vh0) / 2;
|
19640
|
-
const uint8_t vi1 = ((y[i].qs[j/2] >> 4) | vh1) / 2;
|
19641
|
-
|
19642
|
-
hist[vi0]++;
|
19643
|
-
hist[vi1]++;
|
19644
|
-
}
|
19645
|
-
}
|
19646
|
-
}
|
19647
|
-
|
19648
|
-
return (n/QK5_0*sizeof(block_q5_0));
|
19649
|
-
}
|
19650
|
-
|
19651
|
-
size_t ggml_quantize_q5_1(const float * src, void * dst, int n, int k, int64_t * hist) {
|
19652
|
-
assert(k % QK5_1 == 0);
|
19653
|
-
const int nb = k / QK5_1;
|
19654
|
-
|
19655
|
-
for (int b = 0; b < n; b += k) {
|
19656
|
-
block_q5_1 * restrict y = (block_q5_1 *)dst + b/QK5_1;
|
19657
|
-
|
19658
|
-
quantize_row_q5_1_reference(src + b, y, k);
|
19659
|
-
|
19660
|
-
for (int i = 0; i < nb; i++) {
|
19661
|
-
uint32_t qh;
|
19662
|
-
memcpy(&qh, &y[i].qh, sizeof(qh));
|
19663
|
-
|
19664
|
-
for (int j = 0; j < QK5_1; j += 2) {
|
19665
|
-
const uint8_t vh0 = ((qh & (1u << (j/2 + 0 ))) >> (j/2 + 0 )) << 4;
|
19666
|
-
const uint8_t vh1 = ((qh & (1u << (j/2 + 16))) >> (j/2 + 12));
|
19667
|
-
|
19668
|
-
// cast to 16 bins
|
19669
|
-
const uint8_t vi0 = ((y[i].qs[j/2] & 0x0F) | vh0) / 2;
|
19670
|
-
const uint8_t vi1 = ((y[i].qs[j/2] >> 4) | vh1) / 2;
|
19671
|
-
|
19672
|
-
hist[vi0]++;
|
19673
|
-
hist[vi1]++;
|
19674
|
-
}
|
19675
|
-
}
|
19676
|
-
}
|
19677
|
-
|
19678
|
-
return (n/QK5_1*sizeof(block_q5_1));
|
19679
|
-
}
|
19680
|
-
|
19681
|
-
size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t * hist) {
|
19682
|
-
assert(k % QK8_0 == 0);
|
19683
|
-
const int nb = k / QK8_0;
|
19684
|
-
|
19685
|
-
for (int b = 0; b < n; b += k) {
|
19686
|
-
block_q8_0 * restrict y = (block_q8_0 *)dst + b/QK8_0;
|
19687
|
-
|
19688
|
-
quantize_row_q8_0_reference(src + b, y, k);
|
19689
|
-
|
19690
|
-
for (int i = 0; i < nb; i++) {
|
19691
|
-
for (int j = 0; j < QK8_0; ++j) {
|
19692
|
-
const int8_t vi = y[i].qs[j];
|
19693
|
-
|
19694
|
-
hist[vi/16 + 8]++;
|
19695
|
-
}
|
19696
|
-
}
|
19697
|
-
}
|
19698
|
-
|
19699
|
-
return (n/QK8_0*sizeof(block_q8_0));
|
19700
|
-
}
|
19701
|
-
|
19702
20202
|
bool ggml_quantize_requires_imatrix(enum ggml_type type) {
|
19703
20203
|
return
|
19704
20204
|
type == GGML_TYPE_IQ2_XXS ||
|
@@ -19706,177 +20206,52 @@ bool ggml_quantize_requires_imatrix(enum ggml_type type) {
|
|
19706
20206
|
type == GGML_TYPE_IQ1_S;
|
19707
20207
|
}
|
19708
20208
|
|
19709
|
-
size_t ggml_quantize_chunk(
|
19710
|
-
|
20209
|
+
size_t ggml_quantize_chunk(
|
20210
|
+
enum ggml_type type,
|
20211
|
+
const float * src,
|
20212
|
+
void * dst,
|
20213
|
+
int start,
|
20214
|
+
int nrows,
|
20215
|
+
int n_per_row,
|
20216
|
+
const float * imatrix) {
|
20217
|
+
const int n = nrows * n_per_row;
|
20218
|
+
|
20219
|
+
if (ggml_quantize_requires_imatrix(type)) {
|
20220
|
+
GGML_ASSERT(imatrix != NULL);
|
20221
|
+
}
|
20222
|
+
|
20223
|
+
GGML_ASSERT(start % type_traits[type].blck_size == 0);
|
20224
|
+
GGML_ASSERT(start % n_per_row == 0);
|
20225
|
+
|
19711
20226
|
ggml_quantize_init(type); // this is noop if already initialized
|
20227
|
+
|
20228
|
+
const size_t start_row = start / n_per_row;
|
20229
|
+
const size_t row_size = ggml_row_size(type, n_per_row);
|
20230
|
+
|
19712
20231
|
size_t result = 0;
|
19713
|
-
|
20232
|
+
|
19714
20233
|
switch (type) {
|
19715
|
-
case GGML_TYPE_Q4_0:
|
19716
|
-
|
19717
|
-
|
19718
|
-
|
19719
|
-
|
19720
|
-
|
19721
|
-
|
19722
|
-
|
19723
|
-
|
19724
|
-
case
|
19725
|
-
|
19726
|
-
|
19727
|
-
|
19728
|
-
|
19729
|
-
|
19730
|
-
|
19731
|
-
|
19732
|
-
} break;
|
19733
|
-
case GGML_TYPE_Q5_0:
|
19734
|
-
{
|
19735
|
-
GGML_ASSERT(start % QK5_0 == 0);
|
19736
|
-
GGML_ASSERT(start % n_per_row == 0);
|
19737
|
-
size_t start_row = start / n_per_row;
|
19738
|
-
size_t row_size = ggml_row_size(type, n_per_row);
|
19739
|
-
result = quantize_q5_0(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
|
19740
|
-
GGML_ASSERT(result == row_size * nrows);
|
19741
|
-
} break;
|
19742
|
-
case GGML_TYPE_Q5_1:
|
19743
|
-
{
|
19744
|
-
GGML_ASSERT(start % QK5_1 == 0);
|
19745
|
-
GGML_ASSERT(start % n_per_row == 0);
|
19746
|
-
size_t start_row = start / n_per_row;
|
19747
|
-
size_t row_size = ggml_row_size(type, n_per_row);
|
19748
|
-
result = quantize_q5_1(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
|
19749
|
-
GGML_ASSERT(result == row_size * nrows);
|
19750
|
-
} break;
|
19751
|
-
case GGML_TYPE_Q8_0:
|
19752
|
-
{
|
19753
|
-
GGML_ASSERT(start % QK8_0 == 0);
|
19754
|
-
block_q8_0 * block = (block_q8_0*)dst + start / QK8_0;
|
19755
|
-
result = ggml_quantize_q8_0(src + start, block, n, n, hist);
|
19756
|
-
} break;
|
19757
|
-
case GGML_TYPE_Q2_K:
|
19758
|
-
{
|
19759
|
-
GGML_ASSERT(start % QK_K == 0);
|
19760
|
-
GGML_ASSERT(start % n_per_row == 0);
|
19761
|
-
size_t start_row = start / n_per_row;
|
19762
|
-
size_t row_size = ggml_row_size(type, n_per_row);
|
19763
|
-
result = quantize_q2_K(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
|
19764
|
-
GGML_ASSERT(result == row_size * nrows);
|
19765
|
-
} break;
|
19766
|
-
case GGML_TYPE_Q3_K:
|
19767
|
-
{
|
19768
|
-
GGML_ASSERT(start % QK_K == 0);
|
19769
|
-
GGML_ASSERT(start % n_per_row == 0);
|
19770
|
-
size_t start_row = start / n_per_row;
|
19771
|
-
size_t row_size = ggml_row_size(type, n_per_row);
|
19772
|
-
result = quantize_q3_K(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
|
19773
|
-
GGML_ASSERT(result == row_size * nrows);
|
19774
|
-
} break;
|
19775
|
-
case GGML_TYPE_Q4_K:
|
19776
|
-
{
|
19777
|
-
GGML_ASSERT(start % QK_K == 0);
|
19778
|
-
GGML_ASSERT(start % n_per_row == 0);
|
19779
|
-
size_t start_row = start / n_per_row;
|
19780
|
-
size_t row_size = ggml_row_size(type, n_per_row);
|
19781
|
-
result = quantize_q4_K(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
|
19782
|
-
GGML_ASSERT(result == row_size * nrows);
|
19783
|
-
} break;
|
19784
|
-
case GGML_TYPE_Q5_K:
|
19785
|
-
{
|
19786
|
-
GGML_ASSERT(start % QK_K == 0);
|
19787
|
-
GGML_ASSERT(start % n_per_row == 0);
|
19788
|
-
size_t start_row = start / n_per_row;
|
19789
|
-
size_t row_size = ggml_row_size(type, n_per_row);
|
19790
|
-
result = quantize_q5_K(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
|
19791
|
-
GGML_ASSERT(result == row_size * nrows);
|
19792
|
-
} break;
|
19793
|
-
case GGML_TYPE_Q6_K:
|
19794
|
-
{
|
19795
|
-
GGML_ASSERT(start % QK_K == 0);
|
19796
|
-
GGML_ASSERT(start % n_per_row == 0);
|
19797
|
-
size_t start_row = start / n_per_row;
|
19798
|
-
size_t row_size = ggml_row_size(type, n_per_row);
|
19799
|
-
result = quantize_q6_K(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
|
19800
|
-
GGML_ASSERT(result == row_size * nrows);
|
19801
|
-
} break;
|
19802
|
-
case GGML_TYPE_IQ2_XXS:
|
19803
|
-
{
|
19804
|
-
GGML_ASSERT(start % QK_K == 0);
|
19805
|
-
GGML_ASSERT(start % n_per_row == 0);
|
19806
|
-
GGML_ASSERT(imatrix);
|
19807
|
-
size_t start_row = start / n_per_row;
|
19808
|
-
size_t row_size = ggml_row_size(type, n_per_row);
|
19809
|
-
result = quantize_iq2_xxs(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
|
19810
|
-
GGML_ASSERT(result == row_size * nrows);
|
19811
|
-
} break;
|
19812
|
-
case GGML_TYPE_IQ2_XS:
|
19813
|
-
{
|
19814
|
-
GGML_ASSERT(start % QK_K == 0);
|
19815
|
-
GGML_ASSERT(start % n_per_row == 0);
|
19816
|
-
GGML_ASSERT(imatrix);
|
19817
|
-
size_t start_row = start / n_per_row;
|
19818
|
-
size_t row_size = ggml_row_size(type, n_per_row);
|
19819
|
-
result = quantize_iq2_xs(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
|
19820
|
-
GGML_ASSERT(result == row_size * nrows);
|
19821
|
-
} break;
|
19822
|
-
case GGML_TYPE_IQ3_XXS:
|
19823
|
-
{
|
19824
|
-
GGML_ASSERT(start % QK_K == 0);
|
19825
|
-
GGML_ASSERT(start % n_per_row == 0);
|
19826
|
-
size_t start_row = start / n_per_row;
|
19827
|
-
size_t row_size = ggml_row_size(type, n_per_row);
|
19828
|
-
result = quantize_iq3_xxs(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
|
19829
|
-
GGML_ASSERT(result == row_size * nrows);
|
19830
|
-
} break;
|
19831
|
-
case GGML_TYPE_IQ3_S:
|
19832
|
-
{
|
19833
|
-
GGML_ASSERT(start % QK_K == 0);
|
19834
|
-
GGML_ASSERT(start % n_per_row == 0);
|
19835
|
-
size_t start_row = start / n_per_row;
|
19836
|
-
size_t row_size = ggml_row_size(type, n_per_row);
|
19837
|
-
result = quantize_iq3_s(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
|
19838
|
-
GGML_ASSERT(result == row_size * nrows);
|
19839
|
-
} break;
|
19840
|
-
case GGML_TYPE_IQ2_S:
|
19841
|
-
{
|
19842
|
-
GGML_ASSERT(start % QK_K == 0);
|
19843
|
-
GGML_ASSERT(start % n_per_row == 0);
|
19844
|
-
size_t start_row = start / n_per_row;
|
19845
|
-
size_t row_size = ggml_row_size(type, n_per_row);
|
19846
|
-
result = quantize_iq2_s(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
|
19847
|
-
GGML_ASSERT(result == row_size * nrows);
|
19848
|
-
} break;
|
19849
|
-
case GGML_TYPE_IQ1_S:
|
19850
|
-
{
|
19851
|
-
GGML_ASSERT(start % QK_K == 0);
|
19852
|
-
GGML_ASSERT(start % n_per_row == 0);
|
19853
|
-
size_t start_row = start / n_per_row;
|
19854
|
-
size_t row_size = ggml_row_size(type, n_per_row);
|
19855
|
-
result = quantize_iq1_s(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
|
19856
|
-
GGML_ASSERT(result == row_size * nrows);
|
19857
|
-
} break;
|
19858
|
-
case GGML_TYPE_IQ4_NL:
|
20234
|
+
case GGML_TYPE_Q4_0: result = quantize_q4_0(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
|
20235
|
+
case GGML_TYPE_Q4_1: result = quantize_q4_1(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
|
20236
|
+
case GGML_TYPE_Q5_0: result = quantize_q5_0(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
|
20237
|
+
case GGML_TYPE_Q5_1: result = quantize_q5_1(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
|
20238
|
+
case GGML_TYPE_Q8_0: result = quantize_q8_0(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
|
20239
|
+
case GGML_TYPE_Q2_K: result = quantize_q2_K(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
|
20240
|
+
case GGML_TYPE_Q3_K: result = quantize_q3_K(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
|
20241
|
+
case GGML_TYPE_Q4_K: result = quantize_q4_K(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
|
20242
|
+
case GGML_TYPE_Q5_K: result = quantize_q5_K(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
|
20243
|
+
case GGML_TYPE_Q6_K: result = quantize_q6_K(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
|
20244
|
+
case GGML_TYPE_IQ2_XXS: result = quantize_iq2_xxs(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
|
20245
|
+
case GGML_TYPE_IQ2_XS: result = quantize_iq2_xs (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
|
20246
|
+
case GGML_TYPE_IQ3_XXS: result = quantize_iq3_xxs(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
|
20247
|
+
case GGML_TYPE_IQ3_S: result = quantize_iq3_s (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
|
20248
|
+
case GGML_TYPE_IQ2_S: result = quantize_iq2_s (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
|
20249
|
+
case GGML_TYPE_IQ1_S: result = quantize_iq1_s (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
|
20250
|
+
case GGML_TYPE_IQ4_NL: result = quantize_iq4_nl (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
|
19859
20251
|
#if QK_K == 64
|
19860
|
-
case GGML_TYPE_IQ4_XS:
|
19861
|
-
#
|
19862
|
-
|
19863
|
-
GGML_ASSERT(start % QK4_NL == 0);
|
19864
|
-
GGML_ASSERT(start % n_per_row == 0);
|
19865
|
-
size_t start_row = start / n_per_row;
|
19866
|
-
size_t row_size = ggml_row_size(type, n_per_row);
|
19867
|
-
result = quantize_iq4_nl(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
|
19868
|
-
GGML_ASSERT(result == row_size * nrows);
|
19869
|
-
} break;
|
19870
|
-
#if QK_K != 64
|
19871
|
-
case GGML_TYPE_IQ4_XS:
|
19872
|
-
{
|
19873
|
-
GGML_ASSERT(start % QK_K == 0);
|
19874
|
-
GGML_ASSERT(start % n_per_row == 0);
|
19875
|
-
size_t start_row = start / n_per_row;
|
19876
|
-
size_t row_size = ggml_row_size(type, n_per_row);
|
19877
|
-
result = quantize_iq4_xs(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
|
19878
|
-
GGML_ASSERT(result == row_size * nrows);
|
19879
|
-
} break;
|
20252
|
+
case GGML_TYPE_IQ4_XS: result = quantize_iq4_nl (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
|
20253
|
+
#else
|
20254
|
+
case GGML_TYPE_IQ4_XS: result = quantize_iq4_xs (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
|
19880
20255
|
#endif
|
19881
20256
|
case GGML_TYPE_F16:
|
19882
20257
|
{
|
@@ -19893,6 +20268,9 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i
|
|
19893
20268
|
default:
|
19894
20269
|
assert(false);
|
19895
20270
|
}
|
20271
|
+
|
20272
|
+
GGML_ASSERT(result == nrows * row_size);
|
20273
|
+
|
19896
20274
|
return result;
|
19897
20275
|
}
|
19898
20276
|
|