llama_cpp 0.12.2 → 0.12.4
- checksums.yaml +4 -4
- data/CHANGELOG.md +15 -0
- data/README.md +2 -2
- data/ext/llama_cpp/extconf.rb +1 -0
- data/ext/llama_cpp/llama_cpp.cpp +68 -6
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +6 -2
- data/vendor/tmp/llama.cpp/Makefile +25 -3
- data/vendor/tmp/llama.cpp/ggml-alloc.c +87 -27
- data/vendor/tmp/llama.cpp/ggml-backend-impl.h +6 -0
- data/vendor/tmp/llama.cpp/ggml-backend.c +176 -18
- data/vendor/tmp/llama.cpp/ggml-backend.h +14 -0
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +1990 -0
- data/vendor/tmp/llama.cpp/ggml-kompute.h +46 -0
- data/vendor/tmp/llama.cpp/ggml-metal.h +3 -0
- data/vendor/tmp/llama.cpp/ggml-metal.m +144 -113
- data/vendor/tmp/llama.cpp/ggml-metal.metal +303 -4
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +95 -3
- data/vendor/tmp/llama.cpp/ggml-opencl.h +1 -0
- data/vendor/tmp/llama.cpp/ggml-quants.c +736 -59
- data/vendor/tmp/llama.cpp/ggml-quants.h +20 -1
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +15255 -0
- data/vendor/tmp/llama.cpp/ggml-sycl.h +29 -0
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +60854 -0
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +5270 -0
- data/vendor/tmp/llama.cpp/ggml-vulkan.h +34 -0
- data/vendor/tmp/llama.cpp/ggml.c +664 -117
- data/vendor/tmp/llama.cpp/ggml.h +46 -11
- data/vendor/tmp/llama.cpp/llama.cpp +1426 -341
- data/vendor/tmp/llama.cpp/llama.h +24 -15
- data/vendor/tmp/llama.cpp/unicode.h +2 -1
- metadata +10 -3
data/vendor/tmp/llama.cpp/ggml.c
CHANGED
@@ -218,6 +218,7 @@ inline static void * ggml_aligned_malloc(size_t size) {
|
|
218
218
|
break;
|
219
219
|
}
|
220
220
|
GGML_PRINT("%s: %s (attempted to allocate %6.2f MB)\n", __func__, error_desc, size/(1024.0*1024.0));
|
221
|
+
GGML_ASSERT(false);
|
221
222
|
return NULL;
|
222
223
|
}
|
223
224
|
return aligned_memory;
|
@@ -230,6 +231,38 @@ inline static void * ggml_aligned_malloc(size_t size) {
|
|
230
231
|
#endif
|
231
232
|
#endif
|
232
233
|
|
234
|
+
inline static void * ggml_malloc(size_t size) {
|
235
|
+
if (size == 0) {
|
236
|
+
GGML_PRINT("WARNING: Behavior may be unexpected when allocating 0 bytes for ggml_malloc!\n");
|
237
|
+
return NULL;
|
238
|
+
}
|
239
|
+
void * result = malloc(size);
|
240
|
+
if (result == NULL) {
|
241
|
+
GGML_PRINT("%s: failed to allocate %6.2f MB\n", __func__, size/(1024.0*1024.0));
|
242
|
+
GGML_ASSERT(false);
|
243
|
+
}
|
244
|
+
return result;
|
245
|
+
}
|
246
|
+
|
247
|
+
// calloc
|
248
|
+
inline static void * ggml_calloc(size_t num, size_t size) {
|
249
|
+
if (num == 0 || size == 0) {
|
250
|
+
GGML_PRINT("WARNING: Behavior may be unexpected when allocating 0 bytes for ggml_calloc!\n");
|
251
|
+
return NULL;
|
252
|
+
}
|
253
|
+
void * result = calloc(num, size);
|
254
|
+
if (result == NULL) {
|
255
|
+
GGML_PRINT("%s: failed to allocate %6.2f MB\n", __func__, size/(1024.0*1024.0));
|
256
|
+
GGML_ASSERT(false);
|
257
|
+
}
|
258
|
+
return result;
|
259
|
+
}
|
260
|
+
|
261
|
+
#define GGML_MALLOC(size) ggml_malloc(size)
|
262
|
+
#define GGML_CALLOC(num, size) ggml_calloc(num, size)
|
263
|
+
|
264
|
+
#define GGML_FREE(ptr) free(ptr)
|
265
|
+
|
233
266
|
#define UNUSED GGML_UNUSED
|
234
267
|
#define SWAP(x, y, T) do { T SWAP = x; x = y; y = SWAP; } while (0)
|
235
268
|
|
@@ -248,6 +281,10 @@ inline static void * ggml_aligned_malloc(size_t size) {
|
|
248
281
|
#include "ggml-cuda.h"
|
249
282
|
#elif defined(GGML_USE_CLBLAST)
|
250
283
|
#include "ggml-opencl.h"
|
284
|
+
#elif defined(GGML_USE_VULKAN)
|
285
|
+
#include "ggml-vulkan.h"
|
286
|
+
#elif defined(GGML_USE_SYCL)
|
287
|
+
#include "ggml-sycl.h"
|
251
288
|
#endif
|
252
289
|
|
253
290
|
// floating point type used to accumulate sums
|
@@ -394,12 +431,6 @@ static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float);
|
|
394
431
|
static void ggml_vec_dot_f32(const int n, float * restrict s, const float * restrict x, const float * restrict y);
|
395
432
|
static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t * restrict x, ggml_fp16_t * restrict y);
|
396
433
|
|
397
|
-
ggml_collect_imatrix_t g_imatrix_collect = NULL;
|
398
|
-
|
399
|
-
void ggml_set_imatrix_collection(ggml_collect_imatrix_t imatrix_collect) {
|
400
|
-
g_imatrix_collect = imatrix_collect;
|
401
|
-
}
|
402
|
-
|
403
434
|
static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
404
435
|
[GGML_TYPE_I8] = {
|
405
436
|
.type_name = "i8",
|
@@ -601,6 +632,17 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
|
601
632
|
.vec_dot = ggml_vec_dot_iq2_xs_q8_K,
|
602
633
|
.vec_dot_type = GGML_TYPE_Q8_K,
|
603
634
|
},
|
635
|
+
[GGML_TYPE_IQ3_XXS] = {
|
636
|
+
.type_name = "iq3_xxs",
|
637
|
+
.blck_size = QK_K,
|
638
|
+
.type_size = sizeof(block_iq3_xxs),
|
639
|
+
.is_quantized = true,
|
640
|
+
.to_float = (ggml_to_float_t) dequantize_row_iq3_xxs,
|
641
|
+
.from_float = quantize_row_iq3_xxs,
|
642
|
+
.from_float_reference = (ggml_from_float_t)quantize_row_iq3_xxs_reference,
|
643
|
+
.vec_dot = ggml_vec_dot_iq3_xxs_q8_K,
|
644
|
+
.vec_dot_type = GGML_TYPE_Q8_K,
|
645
|
+
},
|
604
646
|
[GGML_TYPE_Q8_K] = {
|
605
647
|
.type_name = "q8_K",
|
606
648
|
.blck_size = QK_K,
|
@@ -1424,6 +1466,9 @@ inline static void ggml_vec_tanh_f32 (const int n, float * y, const float * x) {
|
|
1424
1466
|
inline static void ggml_vec_elu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : expf(x[i])-1; }
|
1425
1467
|
inline static void ggml_vec_relu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : 0.f; }
|
1426
1468
|
inline static void ggml_vec_leaky_relu_f32 (const int n, float * y, const float * x, const float ns) { for (int i = 0; i < n; ++i) y[i] = ((x[i] > 0.f) ? x[i] : 0.f) + ns * ((x[i] < 0.0f) ? x[i] : 0.f); }
|
1469
|
+
// TODO: optimize performance
|
1470
|
+
inline static void ggml_vec_hardswish_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i] * fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f)); }
|
1471
|
+
inline static void ggml_vec_hardsigmoid_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f)); }
|
1427
1472
|
|
1428
1473
|
static const float GELU_COEF_A = 0.044715f;
|
1429
1474
|
static const float GELU_QUICK_COEF = -1.702f;
|
@@ -1782,9 +1827,11 @@ static const char * GGML_UNARY_OP_NAME[GGML_UNARY_OP_COUNT] = {
|
|
1782
1827
|
"GELU",
|
1783
1828
|
"GELU_QUICK",
|
1784
1829
|
"SILU",
|
1830
|
+
"HARDSWISH",
|
1831
|
+
"HARDSIGMOID",
|
1785
1832
|
};
|
1786
1833
|
|
1787
|
-
static_assert(GGML_UNARY_OP_COUNT ==
|
1834
|
+
static_assert(GGML_UNARY_OP_COUNT == 12, "GGML_UNARY_OP_COUNT != 12");
|
1788
1835
|
|
1789
1836
|
|
1790
1837
|
static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN");
|
@@ -2141,6 +2188,7 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) {
|
|
2141
2188
|
case GGML_FTYPE_MOSTLY_Q6_K: wtype = GGML_TYPE_Q6_K; break;
|
2142
2189
|
case GGML_FTYPE_MOSTLY_IQ2_XXS: wtype = GGML_TYPE_IQ2_XXS; break;
|
2143
2190
|
case GGML_FTYPE_MOSTLY_IQ2_XS: wtype = GGML_TYPE_IQ2_XS; break;
|
2191
|
+
case GGML_FTYPE_MOSTLY_IQ3_XXS: wtype = GGML_TYPE_IQ3_XXS; break;
|
2144
2192
|
case GGML_FTYPE_UNKNOWN: wtype = GGML_TYPE_COUNT; break;
|
2145
2193
|
case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16: wtype = GGML_TYPE_COUNT; break;
|
2146
2194
|
}
|
@@ -2294,6 +2342,10 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
|
|
2294
2342
|
ggml_init_cublas();
|
2295
2343
|
#elif defined(GGML_USE_CLBLAST)
|
2296
2344
|
ggml_cl_init();
|
2345
|
+
#elif defined(GGML_USE_VULKAN)
|
2346
|
+
ggml_vk_init();
|
2347
|
+
#elif defined(GGML_USE_SYCL)
|
2348
|
+
ggml_init_sycl();
|
2297
2349
|
#endif
|
2298
2350
|
|
2299
2351
|
ggml_setup_op_has_task_pass();
|
@@ -3951,6 +4003,20 @@ struct ggml_tensor * ggml_silu_back(
|
|
3951
4003
|
return result;
|
3952
4004
|
}
|
3953
4005
|
|
4006
|
+
// ggml hardswish
|
4007
|
+
struct ggml_tensor * ggml_hardswish(
|
4008
|
+
struct ggml_context * ctx,
|
4009
|
+
struct ggml_tensor * a) {
|
4010
|
+
return ggml_unary(ctx, a, GGML_UNARY_OP_HARDSWISH);
|
4011
|
+
}
|
4012
|
+
|
4013
|
+
// ggml hardsigmoid
|
4014
|
+
struct ggml_tensor * ggml_hardsigmoid(
|
4015
|
+
struct ggml_context * ctx,
|
4016
|
+
struct ggml_tensor * a) {
|
4017
|
+
return ggml_unary(ctx, a, GGML_UNARY_OP_HARDSIGMOID);
|
4018
|
+
}
|
4019
|
+
|
3954
4020
|
// ggml_norm
|
3955
4021
|
|
3956
4022
|
static struct ggml_tensor * ggml_norm_impl(
|
@@ -5283,7 +5349,7 @@ GGML_API struct ggml_tensor * ggml_conv_1d(
|
|
5283
5349
|
int s0,
|
5284
5350
|
int p0,
|
5285
5351
|
int d0) {
|
5286
|
-
struct ggml_tensor * im2col = ggml_im2col(ctx, a, b, s0, 0, p0, 0, d0, 0, false); // [N, OL, IC * K]
|
5352
|
+
struct ggml_tensor * im2col = ggml_im2col(ctx, a, b, s0, 0, p0, 0, d0, 0, false, GGML_TYPE_F16); // [N, OL, IC * K]
|
5287
5353
|
|
5288
5354
|
struct ggml_tensor * result =
|
5289
5355
|
ggml_mul_mat(ctx,
|
@@ -5350,6 +5416,30 @@ GGML_API struct ggml_tensor * ggml_conv_transpose_1d(
|
|
5350
5416
|
return result;
|
5351
5417
|
}
|
5352
5418
|
|
5419
|
+
// ggml_conv_depthwise
|
5420
|
+
struct ggml_tensor * ggml_conv_depthwise_2d(
|
5421
|
+
struct ggml_context * ctx,
|
5422
|
+
struct ggml_tensor * a,
|
5423
|
+
struct ggml_tensor * b,
|
5424
|
+
int s0,
|
5425
|
+
int s1,
|
5426
|
+
int p0,
|
5427
|
+
int p1,
|
5428
|
+
int d0,
|
5429
|
+
int d1) {
|
5430
|
+
|
5431
|
+
struct ggml_tensor * new_a = ggml_reshape_4d(ctx, a, a->ne[0], a->ne[1], 1, a->ne[2] * a->ne[3]);
|
5432
|
+
struct ggml_tensor * im2col = ggml_im2col(ctx, new_a,
|
5433
|
+
ggml_reshape_4d(ctx, b, b->ne[0], b->ne[1], 1, b->ne[2] * b->ne[3]),
|
5434
|
+
s0, s1, p0, p1, d0, d1, true, GGML_TYPE_F16); // [N * IC, OH, OW, KH * KW]
|
5435
|
+
struct ggml_tensor * new_b = ggml_reshape_4d(ctx, im2col, im2col->ne[0], im2col->ne[2] * im2col->ne[1], b->ne[2], b->ne[3]); // [N * IC, OH, OW, KH * KW] => [N, IC, OH * OW, KH * KW]
|
5436
|
+
|
5437
|
+
new_a = ggml_reshape_4d(ctx, new_a, (new_a->ne[0] * new_a->ne[1]), new_a->ne[2], new_a->ne[3], 1); // [OC,1, KH, KW] => [1, OC, 1, KH * KW]
|
5438
|
+
struct ggml_tensor * result = ggml_mul_mat(ctx, new_a, new_b);
|
5439
|
+
result = ggml_reshape_4d(ctx, result, im2col->ne[1], im2col->ne[2], b->ne[2], b->ne[3]); // [N, OC, OH, OW]
|
5440
|
+
|
5441
|
+
return result;
|
5442
|
+
}
|
5353
5443
|
// ggml_conv_2d
|
5354
5444
|
|
5355
5445
|
// im2col: [N, IC, IH, IW] => [N, OH, OW, IC*KH*KW]
|
@@ -5366,7 +5456,8 @@ struct ggml_tensor * ggml_im2col(
|
|
5366
5456
|
int p1,
|
5367
5457
|
int d0,
|
5368
5458
|
int d1,
|
5369
|
-
bool is_2D
|
5459
|
+
bool is_2D,
|
5460
|
+
enum ggml_type dst_type) {
|
5370
5461
|
|
5371
5462
|
if(is_2D) {
|
5372
5463
|
GGML_ASSERT(a->ne[2] == b->ne[2]);
|
@@ -5390,7 +5481,7 @@ struct ggml_tensor * ggml_im2col(
|
|
5390
5481
|
is_2D ? b->ne[3] : 1,
|
5391
5482
|
};
|
5392
5483
|
|
5393
|
-
struct ggml_tensor * result = ggml_new_tensor(ctx,
|
5484
|
+
struct ggml_tensor * result = ggml_new_tensor(ctx, dst_type, 4, ne);
|
5394
5485
|
int32_t params[] = { s0, s1, p0, p1, d0, d1, (is_2D ? 1 : 0) };
|
5395
5486
|
ggml_set_op_params(result, params, sizeof(params));
|
5396
5487
|
|
@@ -5415,7 +5506,7 @@ struct ggml_tensor * ggml_conv_2d(
|
|
5415
5506
|
int p1,
|
5416
5507
|
int d0,
|
5417
5508
|
int d1) {
|
5418
|
-
struct ggml_tensor * im2col = ggml_im2col(ctx, a, b, s0, s1, p0, p1, d0, d1, true); // [N, OH, OW, IC * KH * KW]
|
5509
|
+
struct ggml_tensor * im2col = ggml_im2col(ctx, a, b, s0, s1, p0, p1, d0, d1, true, GGML_TYPE_F16); // [N, OH, OW, IC * KH * KW]
|
5419
5510
|
|
5420
5511
|
struct ggml_tensor * result =
|
5421
5512
|
ggml_mul_mat(ctx,
|
@@ -5541,12 +5632,13 @@ struct ggml_tensor * ggml_pool_2d(
|
|
5541
5632
|
is_node = true;
|
5542
5633
|
}
|
5543
5634
|
|
5635
|
+
struct ggml_tensor * result;
|
5544
5636
|
const int64_t ne[3] = {
|
5545
5637
|
ggml_calc_pool_output_size(a->ne[0], k0, s0, p0),
|
5546
5638
|
ggml_calc_pool_output_size(a->ne[1], k1, s1, p1),
|
5547
5639
|
a->ne[2],
|
5548
5640
|
};
|
5549
|
-
|
5641
|
+
result = ggml_new_tensor(ctx, GGML_TYPE_F32, 3, ne);
|
5550
5642
|
|
5551
5643
|
int32_t params[] = { op, k0, k1, s0, s1, p0, p1 };
|
5552
5644
|
ggml_set_op_params(result, params, sizeof(params));
|
@@ -5554,7 +5646,6 @@ struct ggml_tensor * ggml_pool_2d(
|
|
5554
5646
|
result->op = GGML_OP_POOL_2D;
|
5555
5647
|
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
5556
5648
|
result->src[0] = a;
|
5557
|
-
|
5558
5649
|
return result;
|
5559
5650
|
}
|
5560
5651
|
|
@@ -7169,6 +7260,17 @@ static void ggml_compute_forward_add_f32(
|
|
7169
7260
|
const int ith = params->ith;
|
7170
7261
|
const int nth = params->nth;
|
7171
7262
|
|
7263
|
+
#ifdef GGML_USE_CLBLAST
|
7264
|
+
if (src1->backend == GGML_BACKEND_GPU) {
|
7265
|
+
// TODO: OpenCL kernel support full broadcast
|
7266
|
+
GGML_ASSERT(ggml_can_repeat_rows(src1, src0));
|
7267
|
+
if (ith == 0) {
|
7268
|
+
ggml_cl_add(src0, src1, dst);
|
7269
|
+
}
|
7270
|
+
return;
|
7271
|
+
}
|
7272
|
+
#endif
|
7273
|
+
|
7172
7274
|
const int nr = ggml_nrows(src0);
|
7173
7275
|
|
7174
7276
|
GGML_TENSOR_BINARY_OP_LOCALS
|
@@ -7449,7 +7551,12 @@ static void ggml_compute_forward_add(
|
|
7449
7551
|
switch (src0->type) {
|
7450
7552
|
case GGML_TYPE_F32:
|
7451
7553
|
{
|
7452
|
-
|
7554
|
+
if (src1->type == GGML_TYPE_F32) {
|
7555
|
+
ggml_compute_forward_add_f32(params, src0, src1, dst);
|
7556
|
+
}
|
7557
|
+
else {
|
7558
|
+
GGML_ASSERT(false);
|
7559
|
+
}
|
7453
7560
|
} break;
|
7454
7561
|
case GGML_TYPE_F16:
|
7455
7562
|
{
|
@@ -7475,6 +7582,7 @@ static void ggml_compute_forward_add(
|
|
7475
7582
|
case GGML_TYPE_Q6_K:
|
7476
7583
|
case GGML_TYPE_IQ2_XXS:
|
7477
7584
|
case GGML_TYPE_IQ2_XS:
|
7585
|
+
case GGML_TYPE_IQ3_XXS:
|
7478
7586
|
{
|
7479
7587
|
ggml_compute_forward_add_q_f32(params, src0, src1, dst);
|
7480
7588
|
} break;
|
@@ -7741,6 +7849,7 @@ static void ggml_compute_forward_add1(
|
|
7741
7849
|
case GGML_TYPE_Q6_K:
|
7742
7850
|
case GGML_TYPE_IQ2_XXS:
|
7743
7851
|
case GGML_TYPE_IQ2_XS:
|
7852
|
+
case GGML_TYPE_IQ3_XXS:
|
7744
7853
|
{
|
7745
7854
|
ggml_compute_forward_add1_q_f32(params, src0, src1, dst);
|
7746
7855
|
} break;
|
@@ -7770,6 +7879,9 @@ static void ggml_compute_forward_acc_f32(
|
|
7770
7879
|
bool inplace = (bool) ((int32_t *) dst->op_params)[4];
|
7771
7880
|
|
7772
7881
|
if (!inplace && (params->type == GGML_TASK_INIT)) {
|
7882
|
+
if (params->ith != 0) {
|
7883
|
+
return;
|
7884
|
+
}
|
7773
7885
|
// memcpy needs to be synchronized across threads to avoid race conditions.
|
7774
7886
|
// => do it in INIT phase
|
7775
7887
|
memcpy(
|
@@ -7857,6 +7969,7 @@ static void ggml_compute_forward_acc(
|
|
7857
7969
|
case GGML_TYPE_Q6_K:
|
7858
7970
|
case GGML_TYPE_IQ2_XXS:
|
7859
7971
|
case GGML_TYPE_IQ2_XS:
|
7972
|
+
case GGML_TYPE_IQ3_XXS:
|
7860
7973
|
default:
|
7861
7974
|
{
|
7862
7975
|
GGML_ASSERT(false);
|
@@ -7958,7 +8071,7 @@ static void ggml_compute_forward_mul_f32(
|
|
7958
8071
|
const int ith = params->ith;
|
7959
8072
|
const int nth = params->nth;
|
7960
8073
|
|
7961
|
-
#
|
8074
|
+
#if defined(GGML_USE_CLBLAST)
|
7962
8075
|
if (src1->backend == GGML_BACKEND_GPU) {
|
7963
8076
|
// TODO: OpenCL kernel support full broadcast
|
7964
8077
|
GGML_ASSERT(ggml_can_repeat_rows(src1, src0));
|
@@ -9339,6 +9452,87 @@ static void ggml_compute_forward_silu_back(
|
|
9339
9452
|
}
|
9340
9453
|
}
|
9341
9454
|
|
9455
|
+
|
9456
|
+
static void ggml_compute_forward_hardswish_f32(
|
9457
|
+
const struct ggml_compute_params * params,
|
9458
|
+
const struct ggml_tensor * src0,
|
9459
|
+
struct ggml_tensor * dst) {
|
9460
|
+
assert(params->ith == 0);
|
9461
|
+
assert(ggml_are_same_shape(src0, dst));
|
9462
|
+
|
9463
|
+
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
|
9464
|
+
return;
|
9465
|
+
}
|
9466
|
+
|
9467
|
+
const int n = ggml_nrows(src0);
|
9468
|
+
const int nc = src0->ne[0];
|
9469
|
+
|
9470
|
+
assert(dst->nb[0] == sizeof(float));
|
9471
|
+
assert(src0->nb[0] == sizeof(float));
|
9472
|
+
|
9473
|
+
for (int i = 0; i < n; i++) {
|
9474
|
+
ggml_vec_hardswish_f32(nc,
|
9475
|
+
(float *) ((char *) dst->data + i*( dst->nb[1])),
|
9476
|
+
(float *) ((char *) src0->data + i*(src0->nb[1])));
|
9477
|
+
}
|
9478
|
+
}
|
9479
|
+
static void ggml_compute_forward_hardswish(
|
9480
|
+
const struct ggml_compute_params * params,
|
9481
|
+
const struct ggml_tensor * src0,
|
9482
|
+
struct ggml_tensor * dst) {
|
9483
|
+
switch (src0->type) {
|
9484
|
+
case GGML_TYPE_F32:
|
9485
|
+
{
|
9486
|
+
ggml_compute_forward_hardswish_f32(params, src0, dst);
|
9487
|
+
} break;
|
9488
|
+
default:
|
9489
|
+
{
|
9490
|
+
GGML_ASSERT(false);
|
9491
|
+
} break;
|
9492
|
+
}
|
9493
|
+
}
|
9494
|
+
|
9495
|
+
static void ggml_compute_forward_hardsigmoid_f32(
|
9496
|
+
const struct ggml_compute_params * params,
|
9497
|
+
const struct ggml_tensor * src0,
|
9498
|
+
struct ggml_tensor * dst) {
|
9499
|
+
assert(params->ith == 0);
|
9500
|
+
assert(ggml_are_same_shape(src0, dst));
|
9501
|
+
|
9502
|
+
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
|
9503
|
+
return;
|
9504
|
+
}
|
9505
|
+
|
9506
|
+
const int n = ggml_nrows(src0);
|
9507
|
+
const int nc = src0->ne[0];
|
9508
|
+
|
9509
|
+
assert(dst->nb[0] == sizeof(float));
|
9510
|
+
assert(src0->nb[0] == sizeof(float));
|
9511
|
+
|
9512
|
+
for (int i = 0; i < n; i++) {
|
9513
|
+
ggml_vec_hardsigmoid_f32(nc,
|
9514
|
+
(float *) ((char *) dst->data + i*( dst->nb[1])),
|
9515
|
+
(float *) ((char *) src0->data + i*(src0->nb[1])));
|
9516
|
+
}
|
9517
|
+
}
|
9518
|
+
|
9519
|
+
static void ggml_compute_forward_hardsigmoid(
|
9520
|
+
const struct ggml_compute_params * params,
|
9521
|
+
const struct ggml_tensor * src0,
|
9522
|
+
struct ggml_tensor * dst) {
|
9523
|
+
switch (src0->type) {
|
9524
|
+
case GGML_TYPE_F32:
|
9525
|
+
{
|
9526
|
+
ggml_compute_forward_hardsigmoid_f32(params, src0, dst);
|
9527
|
+
} break;
|
9528
|
+
default:
|
9529
|
+
{
|
9530
|
+
GGML_ASSERT(false);
|
9531
|
+
} break;
|
9532
|
+
}
|
9533
|
+
}
|
9534
|
+
|
9535
|
+
|
9342
9536
|
// ggml_compute_forward_norm
|
9343
9537
|
|
9344
9538
|
static void ggml_compute_forward_norm_f32(
|
@@ -9790,10 +9984,6 @@ static void ggml_compute_forward_mul_mat(
|
|
9790
9984
|
const int ith = params->ith;
|
9791
9985
|
const int nth = params->nth;
|
9792
9986
|
|
9793
|
-
if (ith == 1 && g_imatrix_collect) {
|
9794
|
-
g_imatrix_collect(src0, src1);
|
9795
|
-
}
|
9796
|
-
|
9797
9987
|
const enum ggml_type type = src0->type;
|
9798
9988
|
|
9799
9989
|
const bool src1_cont = ggml_is_contiguous(src1);
|
@@ -9835,11 +10025,30 @@ static void ggml_compute_forward_mul_mat(
|
|
9835
10025
|
|
9836
10026
|
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
|
9837
10027
|
if (ggml_compute_forward_mul_mat_use_blas(dst)) {
|
9838
|
-
|
9839
|
-
|
9840
|
-
|
10028
|
+
const int64_t ne_plane = ne01*ne00;
|
10029
|
+
const size_t desired_wsize = ne13*ne12*ne_plane*sizeof(float);
|
10030
|
+
UNUSED(desired_wsize);
|
9841
10031
|
|
9842
10032
|
if (params->type == GGML_TASK_INIT) {
|
10033
|
+
if (type != GGML_TYPE_F32) {
|
10034
|
+
assert(params->wsize >= desired_wsize);
|
10035
|
+
// parallelize by src0 rows
|
10036
|
+
for (int64_t i13 = 0; i13 < ne13; i13++) {
|
10037
|
+
for (int64_t i12 = 0; i12 < ne12; i12++) {
|
10038
|
+
// broadcast src0 into src1 across 2nd,3rd dimension
|
10039
|
+
const int64_t i03 = i13/r3;
|
10040
|
+
const int64_t i02 = i12/r2;
|
10041
|
+
|
10042
|
+
const void * x = (char *) src0->data + i02*nb02 + i03*nb03;
|
10043
|
+
float * const wdata = (float *) params->wdata + i13*ne12*ne_plane + i12*ne_plane;
|
10044
|
+
ggml_to_float_t const to_float = type_traits[type].to_float;
|
10045
|
+
|
10046
|
+
for (int64_t i01 = ith; i01 < ne01; i01 += nth) {
|
10047
|
+
to_float((const char *) x + i01*nb01, wdata + i01*ne00, ne00);
|
10048
|
+
}
|
10049
|
+
}
|
10050
|
+
}
|
10051
|
+
}
|
9843
10052
|
return;
|
9844
10053
|
}
|
9845
10054
|
|
@@ -9847,9 +10056,14 @@ static void ggml_compute_forward_mul_mat(
|
|
9847
10056
|
return;
|
9848
10057
|
}
|
9849
10058
|
|
10059
|
+
// perform sgemm, parallelization controlled by blas lib
|
10060
|
+
if (ith != 0) {
|
10061
|
+
return;
|
10062
|
+
}
|
10063
|
+
|
10064
|
+
//const int64_t tgemm0 = ggml_perf_time_us();
|
9850
10065
|
for (int64_t i13 = 0; i13 < ne13; i13++) {
|
9851
10066
|
for (int64_t i12 = 0; i12 < ne12; i12++) {
|
9852
|
-
// broadcast src0 into src1 across 2nd,3rd dimension
|
9853
10067
|
const int64_t i03 = i13/r3;
|
9854
10068
|
const int64_t i02 = i12/r2;
|
9855
10069
|
|
@@ -9858,17 +10072,7 @@ static void ggml_compute_forward_mul_mat(
|
|
9858
10072
|
float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
|
9859
10073
|
|
9860
10074
|
if (type != GGML_TYPE_F32) {
|
9861
|
-
|
9862
|
-
ggml_to_float_t const to_float = type_traits[type].to_float;
|
9863
|
-
|
9864
|
-
size_t id = 0;
|
9865
|
-
for (int64_t i01 = 0; i01 < ne01; ++i01) {
|
9866
|
-
to_float((const char *) x + i01*nb01, wdata + id, ne00);
|
9867
|
-
id += ne00;
|
9868
|
-
}
|
9869
|
-
|
9870
|
-
assert(id*sizeof(float) <= params->wsize);
|
9871
|
-
x = wdata;
|
10075
|
+
x = (float *) params->wdata + i13*ne12*ne_plane + i12*ne_plane;
|
9872
10076
|
}
|
9873
10077
|
|
9874
10078
|
cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
|
@@ -9878,6 +10082,7 @@ static void ggml_compute_forward_mul_mat(
|
|
9878
10082
|
0.0f, d, ne01);
|
9879
10083
|
}
|
9880
10084
|
}
|
10085
|
+
//printf("cblas_sgemm = %.3f ms, %lld flops\n", (ggml_perf_time_us() - tgemm0)/1000.0, ne13*ne12*ne1*ne01*ne10*2);
|
9881
10086
|
|
9882
10087
|
//printf("CBLAS = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3);
|
9883
10088
|
|
@@ -9886,6 +10091,9 @@ static void ggml_compute_forward_mul_mat(
|
|
9886
10091
|
#endif
|
9887
10092
|
|
9888
10093
|
if (params->type == GGML_TASK_INIT) {
|
10094
|
+
if (ith != 0) {
|
10095
|
+
return;
|
10096
|
+
}
|
9889
10097
|
if (src1->type != vec_dot_type) {
|
9890
10098
|
char * wdata = params->wdata;
|
9891
10099
|
const size_t row_size = ggml_row_size(vec_dot_type, ne10);
|
@@ -10050,6 +10258,9 @@ static void ggml_compute_forward_mul_mat_id(
|
|
10050
10258
|
#define MMID_MATRIX_ROW(row_id, i1) matrix_rows[(row_id)*ne11 + (i1)]
|
10051
10259
|
|
10052
10260
|
if (params->type == GGML_TASK_INIT) {
|
10261
|
+
if (ith != 0) {
|
10262
|
+
return;
|
10263
|
+
}
|
10053
10264
|
char * wdata = params->wdata;
|
10054
10265
|
if (src1->type != vec_dot_type) {
|
10055
10266
|
const size_t row_size = ggml_row_size(vec_dot_type, ne10);
|
@@ -10097,10 +10308,6 @@ static void ggml_compute_forward_mul_mat_id(
|
|
10097
10308
|
|
10098
10309
|
const struct ggml_tensor * src0_cur = dst->src[cur_a + 2];
|
10099
10310
|
|
10100
|
-
if (ith == 1 && g_imatrix_collect) {
|
10101
|
-
g_imatrix_collect(src0_cur, src1);
|
10102
|
-
}
|
10103
|
-
|
10104
10311
|
const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
|
10105
10312
|
const size_t row_size = ggml_row_size(vec_dot_type, ne10);
|
10106
10313
|
|
@@ -10239,6 +10446,9 @@ static void ggml_compute_forward_out_prod_f32(
|
|
10239
10446
|
return;
|
10240
10447
|
}
|
10241
10448
|
#endif
|
10449
|
+
if (ith != 0) {
|
10450
|
+
return;
|
10451
|
+
}
|
10242
10452
|
ggml_vec_set_f32(ne0*ne1*ne2*ne3, dst->data, 0);
|
10243
10453
|
return;
|
10244
10454
|
}
|
@@ -10422,6 +10632,9 @@ static void ggml_compute_forward_out_prod_q_f32(
|
|
10422
10632
|
// TODO: #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
|
10423
10633
|
|
10424
10634
|
if (params->type == GGML_TASK_INIT) {
|
10635
|
+
if (ith != 0) {
|
10636
|
+
return;
|
10637
|
+
}
|
10425
10638
|
ggml_vec_set_f32(ne0*ne1*ne2*ne3, dst->data, 0);
|
10426
10639
|
return;
|
10427
10640
|
}
|
@@ -10508,6 +10721,7 @@ static void ggml_compute_forward_out_prod(
|
|
10508
10721
|
case GGML_TYPE_Q6_K:
|
10509
10722
|
case GGML_TYPE_IQ2_XXS:
|
10510
10723
|
case GGML_TYPE_IQ2_XS:
|
10724
|
+
case GGML_TYPE_IQ3_XXS:
|
10511
10725
|
{
|
10512
10726
|
ggml_compute_forward_out_prod_q_f32(params, src0, src1, dst);
|
10513
10727
|
} break;
|
@@ -10606,6 +10820,9 @@ static void ggml_compute_forward_set_f32(
|
|
10606
10820
|
bool inplace = (bool) ((int32_t *) dst->op_params)[4];
|
10607
10821
|
|
10608
10822
|
if (!inplace && (params->type == GGML_TASK_INIT)) {
|
10823
|
+
if (params->ith != 0) {
|
10824
|
+
return;
|
10825
|
+
}
|
10609
10826
|
// memcpy needs to be synchronized across threads to avoid race conditions.
|
10610
10827
|
// => do it in INIT phase
|
10611
10828
|
memcpy(
|
@@ -10684,6 +10901,7 @@ static void ggml_compute_forward_set(
|
|
10684
10901
|
case GGML_TYPE_Q6_K:
|
10685
10902
|
case GGML_TYPE_IQ2_XXS:
|
10686
10903
|
case GGML_TYPE_IQ2_XS:
|
10904
|
+
case GGML_TYPE_IQ3_XXS:
|
10687
10905
|
default:
|
10688
10906
|
{
|
10689
10907
|
GGML_ASSERT(false);
|
@@ -10880,6 +11098,7 @@ static void ggml_compute_forward_get_rows(
|
|
10880
11098
|
case GGML_TYPE_Q6_K:
|
10881
11099
|
case GGML_TYPE_IQ2_XXS:
|
10882
11100
|
case GGML_TYPE_IQ2_XS:
|
11101
|
+
case GGML_TYPE_IQ3_XXS:
|
10883
11102
|
{
|
10884
11103
|
ggml_compute_forward_get_rows_q(params, src0, src1, dst);
|
10885
11104
|
} break;
|
@@ -10930,6 +11149,9 @@ static void ggml_compute_forward_get_rows_back_f32_f16(
|
|
10930
11149
|
// ggml_compute_forward_dup_same_cont(params, opt0, dst);
|
10931
11150
|
|
10932
11151
|
if (params->type == GGML_TASK_INIT) {
|
11152
|
+
if (params->ith != 0) {
|
11153
|
+
return;
|
11154
|
+
}
|
10933
11155
|
memset(dst->data, 0, ggml_nbytes(dst));
|
10934
11156
|
}
|
10935
11157
|
|
@@ -10964,6 +11186,9 @@ static void ggml_compute_forward_get_rows_back_f32(
|
|
10964
11186
|
// ggml_compute_forward_dup_same_cont(params, opt0, dst);
|
10965
11187
|
|
10966
11188
|
if (params->type == GGML_TASK_INIT) {
|
11189
|
+
if (params->ith != 0) {
|
11190
|
+
return;
|
11191
|
+
}
|
10967
11192
|
memset(dst->data, 0, ggml_nbytes(dst));
|
10968
11193
|
}
|
10969
11194
|
|
@@ -11101,6 +11326,9 @@ static void ggml_compute_forward_diag_mask_f32(
|
|
11101
11326
|
GGML_ASSERT(n_past >= 0);
|
11102
11327
|
|
11103
11328
|
if (!inplace && (params->type == GGML_TASK_INIT)) {
|
11329
|
+
if (ith != 0) {
|
11330
|
+
return;
|
11331
|
+
}
|
11104
11332
|
// memcpy needs to be synchronized across threads to avoid race conditions.
|
11105
11333
|
// => do it in INIT phase
|
11106
11334
|
GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0));
|
@@ -11518,6 +11746,7 @@ static void ggml_compute_forward_alibi(
|
|
11518
11746
|
case GGML_TYPE_Q6_K:
|
11519
11747
|
case GGML_TYPE_IQ2_XXS:
|
11520
11748
|
case GGML_TYPE_IQ2_XS:
|
11749
|
+
case GGML_TYPE_IQ3_XXS:
|
11521
11750
|
case GGML_TYPE_Q8_K:
|
11522
11751
|
case GGML_TYPE_I8:
|
11523
11752
|
case GGML_TYPE_I16:
|
@@ -11594,6 +11823,7 @@ static void ggml_compute_forward_clamp(
|
|
11594
11823
|
case GGML_TYPE_Q6_K:
|
11595
11824
|
case GGML_TYPE_IQ2_XXS:
|
11596
11825
|
case GGML_TYPE_IQ2_XS:
|
11826
|
+
case GGML_TYPE_IQ3_XXS:
|
11597
11827
|
case GGML_TYPE_Q8_K:
|
11598
11828
|
case GGML_TYPE_I8:
|
11599
11829
|
case GGML_TYPE_I16:
|
@@ -12071,6 +12301,9 @@ static void ggml_compute_forward_conv_transpose_1d_f16_f32(
|
|
12071
12301
|
GGML_ASSERT(nb10 == sizeof(float));
|
12072
12302
|
|
12073
12303
|
if (params->type == GGML_TASK_INIT) {
|
12304
|
+
if (ith != 0) {
|
12305
|
+
return;
|
12306
|
+
}
|
12074
12307
|
memset(params->wdata, 0, params->wsize);
|
12075
12308
|
|
12076
12309
|
// permute kernel data (src0) from (K x Cout x Cin) to (Cin x K x Cout)
|
@@ -12165,6 +12398,9 @@ static void ggml_compute_forward_conv_transpose_1d_f32(
|
|
12165
12398
|
GGML_ASSERT(nb10 == sizeof(float));
|
12166
12399
|
|
12167
12400
|
if (params->type == GGML_TASK_INIT) {
|
12401
|
+
if (ith != 0) {
|
12402
|
+
return;
|
12403
|
+
}
|
12168
12404
|
memset(params->wdata, 0, params->wsize);
|
12169
12405
|
|
12170
12406
|
// prepare kernel data (src0) from (K x Cout x Cin) to (Cin x K x Cout)
|
@@ -12257,6 +12493,92 @@ static void ggml_compute_forward_conv_transpose_1d(
|
|
12257
12493
|
}
|
12258
12494
|
}
|
12259
12495
|
|
12496
|
+
// src0: kernel [OC, IC, KH, KW]
|
12497
|
+
// src1: image [N, IC, IH, IW]
|
12498
|
+
// dst: result [N, OH, OW, IC*KH*KW]
|
12499
|
+
static void ggml_compute_forward_im2col_f32(
|
12500
|
+
const struct ggml_compute_params * params,
|
12501
|
+
const struct ggml_tensor * src0,
|
12502
|
+
const struct ggml_tensor * src1,
|
12503
|
+
struct ggml_tensor * dst) {
|
12504
|
+
GGML_ASSERT(src0->type == GGML_TYPE_F16);
|
12505
|
+
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
12506
|
+
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
12507
|
+
|
12508
|
+
int64_t t0 = ggml_perf_time_us();
|
12509
|
+
UNUSED(t0);
|
12510
|
+
|
12511
|
+
GGML_TENSOR_BINARY_OP_LOCALS;
|
12512
|
+
|
12513
|
+
const int32_t s0 = ((const int32_t *)(dst->op_params))[0];
|
12514
|
+
const int32_t s1 = ((const int32_t *)(dst->op_params))[1];
|
12515
|
+
const int32_t p0 = ((const int32_t *)(dst->op_params))[2];
|
12516
|
+
const int32_t p1 = ((const int32_t *)(dst->op_params))[3];
|
12517
|
+
const int32_t d0 = ((const int32_t *)(dst->op_params))[4];
|
12518
|
+
const int32_t d1 = ((const int32_t *)(dst->op_params))[5];
|
12519
|
+
const bool is_2D = ((const int32_t *)(dst->op_params))[6] == 1;
|
12520
|
+
|
12521
|
+
const int ith = params->ith;
|
12522
|
+
const int nth = params->nth;
|
12523
|
+
|
12524
|
+
const int64_t N = is_2D ? ne13 : ne12;
|
12525
|
+
const int64_t IC = is_2D ? ne12 : ne11;
|
12526
|
+
const int64_t IH = is_2D ? ne11 : 1;
|
12527
|
+
const int64_t IW = ne10;
|
12528
|
+
|
12529
|
+
const int64_t KH = is_2D ? ne01 : 1;
|
12530
|
+
const int64_t KW = ne00;
|
12531
|
+
|
12532
|
+
const int64_t OH = is_2D ? ne2 : 1;
|
12533
|
+
const int64_t OW = ne1;
|
12534
|
+
|
12535
|
+
int ofs0 = is_2D ? nb13 : nb12;
|
12536
|
+
int ofs1 = is_2D ? nb12 : nb11;
|
12537
|
+
|
12538
|
+
GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
|
12539
|
+
GGML_ASSERT(nb10 == sizeof(float));
|
12540
|
+
|
12541
|
+
if (params->type == GGML_TASK_INIT) {
|
12542
|
+
return;
|
12543
|
+
}
|
12544
|
+
|
12545
|
+
if (params->type == GGML_TASK_FINALIZE) {
|
12546
|
+
return;
|
12547
|
+
}
|
12548
|
+
|
12549
|
+
// im2col: [N, IC, IH, IW] => [N, OH, OW, IC*KH*KW]
|
12550
|
+
{
|
12551
|
+
float * const wdata = (float *) dst->data;
|
12552
|
+
|
12553
|
+
for (int64_t in = 0; in < N; in++) {
|
12554
|
+
for (int64_t ioh = 0; ioh < OH; ioh++) { // 1
|
12555
|
+
for (int64_t iow = 0; iow < OW; iow++) {
|
12556
|
+
for (int64_t iic = ith; iic < IC; iic += nth) {
|
12557
|
+
|
12558
|
+
// micro kernel
|
12559
|
+
float * dst_data = wdata + (in*OH*OW + ioh*OW + iow)*(IC*KH*KW); // [IC, KH, KW]
|
12560
|
+
const float * const src_data = (float *)((char *) src1->data + in*ofs0 + iic*ofs1); // [IH, IW]
|
12561
|
+
|
12562
|
+
for (int64_t ikh = 0; ikh < KH; ikh++) { // 1
|
12563
|
+
for (int64_t ikw = 0; ikw < KW; ikw++) {
|
12564
|
+
const int64_t iiw = iow*s0 + ikw*d0 - p0;
|
12565
|
+
const int64_t iih = ioh*s1 + ikh*d1 - p1;
|
12566
|
+
|
12567
|
+
if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) {
|
12568
|
+
dst_data[iic*(KH*KW) + ikh*KW + ikw] = 0;
|
12569
|
+
} else {
|
12570
|
+
dst_data[iic*(KH*KW) + ikh*KW + ikw] = (src_data[iih*IW + iiw]);
|
12571
|
+
}
|
12572
|
+
}
|
12573
|
+
}
|
12574
|
+
}
|
12575
|
+
}
|
12576
|
+
}
|
12577
|
+
}
|
12578
|
+
}
|
12579
|
+
}
|
12580
|
+
|
12581
|
+
|
12260
12582
|
// src0: kernel [OC, IC, KH, KW]
|
12261
12583
|
// src1: image [N, IC, IH, IW]
|
12262
12584
|
// dst: result [N, OH, OW, IC*KH*KW]
|
@@ -12347,14 +12669,14 @@ static void ggml_compute_forward_im2col(
|
|
12347
12669
|
const struct ggml_tensor * src0,
|
12348
12670
|
const struct ggml_tensor * src1,
|
12349
12671
|
struct ggml_tensor * dst) {
|
12350
|
-
switch (
|
12672
|
+
switch (dst->type) {
|
12351
12673
|
case GGML_TYPE_F16:
|
12352
12674
|
{
|
12353
12675
|
ggml_compute_forward_im2col_f16(params, src0, src1, dst);
|
12354
12676
|
} break;
|
12355
12677
|
case GGML_TYPE_F32:
|
12356
12678
|
{
|
12357
|
-
|
12679
|
+
ggml_compute_forward_im2col_f32(params, src0, src1, dst);
|
12358
12680
|
} break;
|
12359
12681
|
default:
|
12360
12682
|
{
|
@@ -12363,6 +12685,7 @@ static void ggml_compute_forward_im2col(
|
|
12363
12685
|
}
|
12364
12686
|
}
|
12365
12687
|
|
12688
|
+
|
12366
12689
|
// ggml_compute_forward_conv_transpose_2d
|
12367
12690
|
|
12368
12691
|
static void ggml_compute_forward_conv_transpose_2d(
|
@@ -12388,6 +12711,9 @@ static void ggml_compute_forward_conv_transpose_2d(
|
|
12388
12711
|
GGML_ASSERT(nb10 == sizeof(float));
|
12389
12712
|
|
12390
12713
|
if (params->type == GGML_TASK_INIT) {
|
12714
|
+
if (ith != 0) {
|
12715
|
+
return;
|
12716
|
+
}
|
12391
12717
|
memset(params->wdata, 0, params->wsize);
|
12392
12718
|
|
12393
12719
|
// permute kernel data (src0) from (Kw x Kh x Cout x Cin) to (Cin x Kw x Kh x Cout)
|
@@ -12541,8 +12867,8 @@ static void ggml_compute_forward_pool_2d(
|
|
12541
12867
|
const struct ggml_compute_params * params,
|
12542
12868
|
const struct ggml_tensor * src,
|
12543
12869
|
struct ggml_tensor * dst) {
|
12544
|
-
|
12545
|
-
|
12870
|
+
GGML_ASSERT(src->type == GGML_TYPE_F32);
|
12871
|
+
GGML_ASSERT(params->ith == 0);
|
12546
12872
|
|
12547
12873
|
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
|
12548
12874
|
return;
|
@@ -13931,6 +14257,14 @@ static void ggml_compute_forward_unary(
|
|
13931
14257
|
{
|
13932
14258
|
ggml_compute_forward_silu(params, src0, dst);
|
13933
14259
|
} break;
|
14260
|
+
case GGML_UNARY_OP_HARDSWISH:
|
14261
|
+
{
|
14262
|
+
ggml_compute_forward_hardswish(params, src0, dst);
|
14263
|
+
} break;
|
14264
|
+
case GGML_UNARY_OP_HARDSIGMOID:
|
14265
|
+
{
|
14266
|
+
ggml_compute_forward_hardsigmoid(params, src0, dst);
|
14267
|
+
} break;
|
13934
14268
|
default:
|
13935
14269
|
{
|
13936
14270
|
GGML_ASSERT(false);
|
@@ -13994,6 +14328,9 @@ static void ggml_compute_forward_add_rel_pos_f32(
|
|
13994
14328
|
|
13995
14329
|
const bool inplace = (bool) ((int32_t *) dst->op_params)[0];
|
13996
14330
|
if (!inplace && params->type == GGML_TASK_INIT) {
|
14331
|
+
if (params->ith != 0) {
|
14332
|
+
return;
|
14333
|
+
}
|
13997
14334
|
memcpy((char *) dst->data, (char *) src0->data, ggml_nbytes(dst));
|
13998
14335
|
return;
|
13999
14336
|
}
|
@@ -14509,8 +14846,26 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
|
|
14509
14846
|
}
|
14510
14847
|
GGML_ASSERT(tensor->src[0] == NULL || tensor->src[0]->backend == GGML_BACKEND_CPU);
|
14511
14848
|
GGML_ASSERT(tensor->src[1] == NULL || tensor->src[1]->backend == GGML_BACKEND_CPU);
|
14849
|
+
#elif defined(GGML_USE_VULKAN)
|
14850
|
+
const bool skip_cpu = ggml_vk_compute_forward(params, tensor);
|
14851
|
+
#ifdef GGML_VULKAN_CHECK_RESULTS
|
14852
|
+
if (skip_cpu) {
|
14853
|
+
ggml_vk_check_results_1(params, tensor);
|
14854
|
+
}
|
14855
|
+
#endif
|
14856
|
+
if (skip_cpu) {
|
14857
|
+
return;
|
14858
|
+
}
|
14859
|
+
GGML_ASSERT(tensor->src[0] == NULL || tensor->src[0]->backend == GGML_BACKEND_CPU);
|
14860
|
+
GGML_ASSERT(tensor->src[1] == NULL || tensor->src[1]->backend == GGML_BACKEND_CPU);
|
14512
14861
|
#endif // GGML_USE_CUBLAS
|
14513
14862
|
|
14863
|
+
#ifdef GGML_USE_SYCL
|
14864
|
+
bool skip_cpu = ggml_sycl_compute_forward(params, tensor);
|
14865
|
+
if (skip_cpu) {
|
14866
|
+
return;
|
14867
|
+
}
|
14868
|
+
#endif // GGML_USE_SYCL
|
14514
14869
|
switch (tensor->op) {
|
14515
14870
|
case GGML_OP_DUP:
|
14516
14871
|
{
|
@@ -14913,13 +15268,13 @@ struct ggml_hash_set ggml_hash_set_new(size_t size) {
|
|
14913
15268
|
size = ggml_hash_size(size);
|
14914
15269
|
struct ggml_hash_set result;
|
14915
15270
|
result.size = size;
|
14916
|
-
result.keys =
|
15271
|
+
result.keys = GGML_MALLOC(sizeof(struct ggml_tensor *) * size);
|
14917
15272
|
memset(result.keys, 0, sizeof(struct ggml_tensor *) * size);
|
14918
15273
|
return result;
|
14919
15274
|
}
|
14920
15275
|
|
14921
15276
|
static void ggml_hash_set_free(struct ggml_hash_set hash_set) {
|
14922
|
-
|
15277
|
+
GGML_FREE(hash_set.keys);
|
14923
15278
|
}
|
14924
15279
|
|
14925
15280
|
struct hash_map {
|
@@ -14928,17 +15283,17 @@ struct hash_map {
|
|
14928
15283
|
};
|
14929
15284
|
|
14930
15285
|
static struct hash_map * ggml_new_hash_map(size_t size) {
|
14931
|
-
struct hash_map * result =
|
15286
|
+
struct hash_map * result = GGML_MALLOC(sizeof(struct hash_map));
|
14932
15287
|
result->set = ggml_hash_set_new(size);
|
14933
|
-
result->vals =
|
15288
|
+
result->vals = GGML_MALLOC(sizeof(struct ggml_tensor *) * result->set.size);
|
14934
15289
|
memset(result->vals, 0, sizeof(struct ggml_tensor *) * result->set.size);
|
14935
15290
|
return result;
|
14936
15291
|
}
|
14937
15292
|
|
14938
15293
|
static void ggml_hash_map_free(struct hash_map * map) {
|
14939
15294
|
ggml_hash_set_free(map->set);
|
14940
|
-
|
14941
|
-
|
15295
|
+
GGML_FREE(map->vals);
|
15296
|
+
GGML_FREE(map);
|
14942
15297
|
}
|
14943
15298
|
|
14944
15299
|
// gradient checkpointing
|
@@ -16287,8 +16642,9 @@ struct ggml_compute_state_shared {
|
|
16287
16642
|
const int n_threads;
|
16288
16643
|
|
16289
16644
|
// synchronization primitives
|
16290
|
-
atomic_int n_active;
|
16291
|
-
atomic_int node_n;
|
16645
|
+
atomic_int n_active; // num active threads
|
16646
|
+
atomic_int node_n; // active graph node
|
16647
|
+
atomic_int node_task; // active graph node task phase
|
16292
16648
|
|
16293
16649
|
bool (*abort_callback)(void * data); // abort ggml_graph_compute when true
|
16294
16650
|
void * abort_callback_data;
|
@@ -16344,6 +16700,8 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
|
|
16344
16700
|
case GGML_UNARY_OP_TANH:
|
16345
16701
|
case GGML_UNARY_OP_ELU:
|
16346
16702
|
case GGML_UNARY_OP_RELU:
|
16703
|
+
case GGML_UNARY_OP_HARDSWISH: // to opt for multiple threads
|
16704
|
+
case GGML_UNARY_OP_HARDSIGMOID: // to opt for multiple threads
|
16347
16705
|
{
|
16348
16706
|
n_tasks = 1;
|
16349
16707
|
} break;
|
@@ -16420,7 +16778,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
|
|
16420
16778
|
} break;
|
16421
16779
|
case GGML_OP_SOFT_MAX:
|
16422
16780
|
{
|
16423
|
-
n_tasks = MIN(
|
16781
|
+
n_tasks = MIN(n_threads, ggml_nrows(node->src[0]));
|
16424
16782
|
} break;
|
16425
16783
|
case GGML_OP_CONV_TRANSPOSE_1D:
|
16426
16784
|
{
|
@@ -16534,6 +16892,34 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
|
|
16534
16892
|
return n_tasks;
|
16535
16893
|
}
|
16536
16894
|
|
16895
|
+
static void ggml_graph_compute_thread_sync_node(int * node_n, struct ggml_compute_state * state, const bool do_yield) {
|
16896
|
+
// wait for other threads to finish
|
16897
|
+
const int last_node_n = * node_n;
|
16898
|
+
|
16899
|
+
while (true) {
|
16900
|
+
if (do_yield) {
|
16901
|
+
sched_yield();
|
16902
|
+
}
|
16903
|
+
|
16904
|
+
* node_n = atomic_load(&state->shared->node_n);
|
16905
|
+
if (* node_n != last_node_n) break;
|
16906
|
+
}
|
16907
|
+
}
|
16908
|
+
|
16909
|
+
static void ggml_graph_compute_thread_sync_task(int * task_phase, struct ggml_compute_state * state, const bool do_yield) {
|
16910
|
+
// wait for other threads to finish
|
16911
|
+
const int last_task_phase = * task_phase;
|
16912
|
+
|
16913
|
+
while (true) {
|
16914
|
+
if (do_yield) {
|
16915
|
+
sched_yield();
|
16916
|
+
}
|
16917
|
+
|
16918
|
+
* task_phase = atomic_load(&state->shared->node_task);
|
16919
|
+
if (* task_phase != last_task_phase) break;
|
16920
|
+
}
|
16921
|
+
}
|
16922
|
+
|
16537
16923
|
static thread_ret_t ggml_graph_compute_thread(void * data) {
|
16538
16924
|
struct ggml_compute_state * state = (struct ggml_compute_state *) data;
|
16539
16925
|
|
@@ -16544,7 +16930,8 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
|
|
16544
16930
|
|
16545
16931
|
set_numa_thread_affinity(state->ith, n_threads);
|
16546
16932
|
|
16547
|
-
int node_n
|
16933
|
+
int node_n = -1;
|
16934
|
+
int task_phase = GGML_TASK_FINALIZE;
|
16548
16935
|
|
16549
16936
|
while (true) {
|
16550
16937
|
if (cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) {
|
@@ -16576,7 +16963,6 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
|
|
16576
16963
|
// distribute new work or execute it direct if 1T
|
16577
16964
|
while (++node_n < cgraph->n_nodes) {
|
16578
16965
|
GGML_PRINT_DEBUG_5("%s: %d/%d\n", __func__, node_n, cgraph->n_nodes);
|
16579
|
-
|
16580
16966
|
struct ggml_tensor * node = cgraph->nodes[node_n];
|
16581
16967
|
const int n_tasks = ggml_get_n_tasks(node, n_threads);
|
16582
16968
|
|
@@ -16585,13 +16971,13 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
|
|
16585
16971
|
|
16586
16972
|
params.nth = n_tasks;
|
16587
16973
|
|
16588
|
-
/* INIT */
|
16589
|
-
if (GGML_OP_HAS_INIT[node->op]) {
|
16590
|
-
params.type = GGML_TASK_INIT;
|
16591
|
-
ggml_compute_forward(¶ms, node);
|
16592
|
-
}
|
16593
|
-
|
16594
16974
|
if (n_tasks == 1) {
|
16975
|
+
/* INIT */
|
16976
|
+
if (GGML_OP_HAS_INIT[node->op]) {
|
16977
|
+
params.type = GGML_TASK_INIT;
|
16978
|
+
ggml_compute_forward(¶ms, node);
|
16979
|
+
}
|
16980
|
+
|
16595
16981
|
// TODO: maybe push node_n to the atomic but if other threads see n_tasks is 1,
|
16596
16982
|
// they do something more efficient than spinning (?)
|
16597
16983
|
params.type = GGML_TASK_COMPUTE;
|
@@ -16612,38 +16998,24 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
|
|
16612
16998
|
}
|
16613
16999
|
}
|
16614
17000
|
|
16615
|
-
|
16616
|
-
atomic_store(&state->shared->
|
17001
|
+
task_phase = GGML_TASK_INIT;
|
17002
|
+
atomic_store(&state->shared->n_active, n_threads);
|
17003
|
+
atomic_store(&state->shared->node_n, node_n);
|
17004
|
+
atomic_store(&state->shared->node_task, task_phase);
|
16617
17005
|
} else {
|
16618
|
-
|
16619
|
-
|
16620
|
-
|
16621
|
-
const bool do_yield = last < 0 || cgraph->nodes[last]->op == GGML_OP_MUL_MAT;
|
16622
|
-
|
16623
|
-
while (true) {
|
16624
|
-
// TODO: this sched_yield can have significant impact on the performance - either positive or negative
|
16625
|
-
// depending on the workload and the operating system.
|
16626
|
-
// since it is not clear what is the best approach, it should potentially become user-configurable
|
16627
|
-
// ref: https://github.com/ggerganov/ggml/issues/291
|
16628
|
-
// UPD: adding the do_yield flag seems to resolve the issue universally
|
16629
|
-
if (do_yield) {
|
16630
|
-
sched_yield();
|
16631
|
-
}
|
16632
|
-
|
16633
|
-
node_n = atomic_load(&state->shared->node_n);
|
16634
|
-
if (node_n != last) break;
|
16635
|
-
};
|
17006
|
+
ggml_graph_compute_thread_sync_node(&node_n, state, false);
|
17007
|
+
ggml_graph_compute_thread_sync_task(&task_phase, state, false);
|
16636
17008
|
}
|
16637
17009
|
|
16638
17010
|
// check if we should stop
|
16639
17011
|
if (node_n >= cgraph->n_nodes) break;
|
16640
17012
|
|
16641
|
-
/* COMPUTE */
|
17013
|
+
/* INIT & COMPUTE */
|
16642
17014
|
struct ggml_tensor * node = cgraph->nodes[node_n];
|
16643
17015
|
const int n_tasks = ggml_get_n_tasks(node, n_threads);
|
16644
17016
|
|
16645
17017
|
struct ggml_compute_params params = {
|
16646
|
-
/*.type =*/
|
17018
|
+
/*.type =*/ GGML_TASK_INIT,
|
16647
17019
|
/*.ith =*/ state->ith,
|
16648
17020
|
/*.nth =*/ n_tasks,
|
16649
17021
|
/*.wsize =*/ cplan->work_size,
|
@@ -16651,8 +17023,39 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
|
|
16651
17023
|
};
|
16652
17024
|
|
16653
17025
|
if (state->ith < n_tasks) {
|
17026
|
+
if (GGML_OP_HAS_INIT[node->op]) {
|
17027
|
+
ggml_compute_forward(¶ms, node);
|
17028
|
+
}
|
17029
|
+
}
|
17030
|
+
|
17031
|
+
if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) {
|
17032
|
+
task_phase = GGML_TASK_COMPUTE;
|
17033
|
+
atomic_store(&state->shared->n_active, n_threads);
|
17034
|
+
atomic_store(&state->shared->node_task, task_phase);
|
17035
|
+
}
|
17036
|
+
else {
|
17037
|
+
// TODO: this sched_yield can have significant impact on the performance - either positive or negative
|
17038
|
+
// depending on the workload and the operating system.
|
17039
|
+
// since it is not clear what is the best approach, it should potentially become user-configurable
|
17040
|
+
// ref: https://github.com/ggerganov/ggml/issues/291
|
17041
|
+
// UPD: adding the do_yield flag seems to resolve the issue universally
|
17042
|
+
const bool do_yield = node_n < 0 || cgraph->nodes[node_n]->op == GGML_OP_MUL_MAT;
|
17043
|
+
ggml_graph_compute_thread_sync_task(&task_phase, state, do_yield);
|
17044
|
+
}
|
17045
|
+
|
17046
|
+
if (state->ith < n_tasks) {
|
17047
|
+
params.type = GGML_TASK_COMPUTE;
|
16654
17048
|
ggml_compute_forward(¶ms, node);
|
16655
17049
|
}
|
17050
|
+
|
17051
|
+
if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) {
|
17052
|
+
task_phase = GGML_TASK_FINALIZE;
|
17053
|
+
atomic_store(&state->shared->n_active, n_threads);
|
17054
|
+
atomic_store(&state->shared->node_task, task_phase);
|
17055
|
+
}
|
17056
|
+
else {
|
17057
|
+
ggml_graph_compute_thread_sync_task(&task_phase, state, false);
|
17058
|
+
}
|
16656
17059
|
}
|
16657
17060
|
|
16658
17061
|
return GGML_EXIT_SUCCESS;
|
@@ -16668,12 +17071,16 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
|
|
16668
17071
|
struct ggml_cplan cplan;
|
16669
17072
|
memset(&cplan, 0, sizeof(struct ggml_cplan));
|
16670
17073
|
|
17074
|
+
int max_tasks = 1;
|
17075
|
+
|
16671
17076
|
// thread scheduling for the different operations + work buffer size estimation
|
16672
17077
|
for (int i = 0; i < cgraph->n_nodes; i++) {
|
16673
17078
|
struct ggml_tensor * node = cgraph->nodes[i];
|
16674
17079
|
|
16675
17080
|
const int n_tasks = ggml_get_n_tasks(node, n_threads);
|
16676
17081
|
|
17082
|
+
max_tasks = MAX(max_tasks, n_tasks);
|
17083
|
+
|
16677
17084
|
size_t cur = 0;
|
16678
17085
|
|
16679
17086
|
switch (node->op) {
|
@@ -16709,8 +17116,11 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
|
|
16709
17116
|
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
|
16710
17117
|
if (ggml_compute_forward_mul_mat_use_blas(node)) {
|
16711
17118
|
if (node->src[0]->type != GGML_TYPE_F32) {
|
16712
|
-
// here we need memory
|
16713
|
-
|
17119
|
+
// here we need memory for fully dequantized matrix from src0
|
17120
|
+
// take into account that src0 can be broadcasted into src1[2,3]
|
17121
|
+
cur = ggml_type_size(GGML_TYPE_F32)
|
17122
|
+
* node->src[0]->ne[0]*node->src[0]->ne[1]
|
17123
|
+
* node->src[1]->ne[2]*node->src[1]->ne[3];
|
16714
17124
|
}
|
16715
17125
|
} else
|
16716
17126
|
#endif
|
@@ -16837,7 +17247,7 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
|
|
16837
17247
|
work_size += CACHE_LINE_SIZE*(n_threads - 1);
|
16838
17248
|
}
|
16839
17249
|
|
16840
|
-
cplan.n_threads = n_threads;
|
17250
|
+
cplan.n_threads = MIN(max_tasks, n_threads);
|
16841
17251
|
cplan.work_size = work_size;
|
16842
17252
|
cplan.work_data = NULL;
|
16843
17253
|
|
@@ -16854,6 +17264,17 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
|
|
16854
17264
|
}
|
16855
17265
|
}
|
16856
17266
|
|
17267
|
+
#ifdef GGML_USE_VULKAN
|
17268
|
+
for (int i = 0; i < cgraph->n_nodes; i++) {
|
17269
|
+
ggml_vk_preallocate_buffers_graph(cgraph->nodes[i]);
|
17270
|
+
}
|
17271
|
+
ggml_vk_preallocate_buffers();
|
17272
|
+
|
17273
|
+
for (int i = 0; i < cgraph->n_nodes; i++) {
|
17274
|
+
ggml_vk_build_graph(cgraph->nodes[i], i == cgraph->n_nodes - 1);
|
17275
|
+
}
|
17276
|
+
#endif
|
17277
|
+
|
16857
17278
|
const int n_threads = cplan->n_threads;
|
16858
17279
|
|
16859
17280
|
struct ggml_compute_state_shared state_shared = {
|
@@ -16864,6 +17285,7 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
|
|
16864
17285
|
/*.n_threads =*/ n_threads,
|
16865
17286
|
/*.n_active =*/ n_threads,
|
16866
17287
|
/*.node_n =*/ -1,
|
17288
|
+
/*.node_task =*/ GGML_TASK_FINALIZE,
|
16867
17289
|
/*.abort_callback =*/ NULL,
|
16868
17290
|
/*.abort_callback_data =*/ NULL,
|
16869
17291
|
};
|
@@ -16904,6 +17326,10 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
|
|
16904
17326
|
}
|
16905
17327
|
}
|
16906
17328
|
|
17329
|
+
#ifdef GGML_USE_VULKAN
|
17330
|
+
ggml_vk_graph_cleanup();
|
17331
|
+
#endif
|
17332
|
+
|
16907
17333
|
// performance stats (graph)
|
16908
17334
|
{
|
16909
17335
|
int64_t perf_cycles_cur = ggml_perf_cycles() - perf_start_cycles;
|
@@ -18538,6 +18964,29 @@ enum ggml_opt_result ggml_opt_resume_g(
|
|
18538
18964
|
|
18539
18965
|
////////////////////////////////////////////////////////////////////////////////
|
18540
18966
|
|
18967
|
+
void ggml_quantize_init(enum ggml_type type) {
|
18968
|
+
ggml_critical_section_start();
|
18969
|
+
|
18970
|
+
switch (type) {
|
18971
|
+
case GGML_TYPE_IQ2_XXS: iq2xs_init_impl(256); break;
|
18972
|
+
case GGML_TYPE_IQ2_XS: iq2xs_init_impl(512); break;
|
18973
|
+
case GGML_TYPE_IQ3_XXS: iq3xs_init_impl(256); break;
|
18974
|
+
default: // nothing
|
18975
|
+
break;
|
18976
|
+
}
|
18977
|
+
|
18978
|
+
ggml_critical_section_end();
|
18979
|
+
}
|
18980
|
+
|
18981
|
+
void ggml_quantize_free(void) {
|
18982
|
+
ggml_critical_section_start();
|
18983
|
+
|
18984
|
+
iq2xs_free_impl(256);
|
18985
|
+
iq2xs_free_impl(512);
|
18986
|
+
|
18987
|
+
ggml_critical_section_end();
|
18988
|
+
}
|
18989
|
+
|
18541
18990
|
size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist) {
|
18542
18991
|
assert(k % QK4_0 == 0);
|
18543
18992
|
const int nb = k / QK4_0;
|
@@ -18665,9 +19114,15 @@ size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t *
|
|
18665
19114
|
return (n/QK8_0*sizeof(block_q8_0));
|
18666
19115
|
}
|
18667
19116
|
|
19117
|
+
bool ggml_quantize_requires_imatrix(enum ggml_type type) {
|
19118
|
+
return
|
19119
|
+
type == GGML_TYPE_IQ2_XXS ||
|
19120
|
+
type == GGML_TYPE_IQ2_XS;
|
19121
|
+
}
|
19122
|
+
|
18668
19123
|
size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start,
|
18669
19124
|
int nrows, int n_per_row, int64_t * hist, const float * imatrix) {
|
18670
|
-
(
|
19125
|
+
ggml_quantize_init(type); // this is noop if already initialized
|
18671
19126
|
size_t result = 0;
|
18672
19127
|
int n = nrows * n_per_row;
|
18673
19128
|
switch (type) {
|
@@ -18778,15 +19233,24 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i
|
|
18778
19233
|
result = quantize_iq2_xs(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
|
18779
19234
|
GGML_ASSERT(result == row_size * nrows);
|
18780
19235
|
} break;
|
19236
|
+
case GGML_TYPE_IQ3_XXS:
|
19237
|
+
{
|
19238
|
+
GGML_ASSERT(start % QK_K == 0);
|
19239
|
+
GGML_ASSERT(start % n_per_row == 0);
|
19240
|
+
size_t start_row = start / n_per_row;
|
19241
|
+
size_t row_size = ggml_row_size(type, n_per_row);
|
19242
|
+
result = quantize_iq3_xxs(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
|
19243
|
+
GGML_ASSERT(result == row_size * nrows);
|
19244
|
+
} break;
|
18781
19245
|
case GGML_TYPE_F16:
|
18782
19246
|
{
|
18783
|
-
|
19247
|
+
size_t elemsize = sizeof(ggml_fp16_t);
|
18784
19248
|
ggml_fp32_to_fp16_row(src + start, (ggml_fp16_t *)dst + start, n);
|
18785
19249
|
result = n * elemsize;
|
18786
19250
|
} break;
|
18787
19251
|
case GGML_TYPE_F32:
|
18788
19252
|
{
|
18789
|
-
|
19253
|
+
size_t elemsize = sizeof(float);
|
18790
19254
|
result = n * elemsize;
|
18791
19255
|
memcpy((uint8_t *)dst + start * elemsize, src + start, result);
|
18792
19256
|
} break;
|
@@ -18904,6 +19368,25 @@ struct gguf_context {
|
|
18904
19368
|
void * data;
|
18905
19369
|
};
|
18906
19370
|
|
19371
|
+
static size_t gguf_type_size(enum gguf_type type) {
|
19372
|
+
GGML_ASSERT(0 <= type && type < GGUF_TYPE_COUNT);
|
19373
|
+
return GGUF_TYPE_SIZE[type];
|
19374
|
+
}
|
19375
|
+
|
19376
|
+
static void gguf_tensor_info_sanitize(struct gguf_tensor_info * info) {
|
19377
|
+
GGML_ASSERT(info->n_dims <= GGML_MAX_DIMS);
|
19378
|
+
GGML_ASSERT(0 <= info->type && info->type < GGML_TYPE_COUNT);
|
19379
|
+
|
19380
|
+
for (uint32_t i = 0; i < info->n_dims; ++i) {
|
19381
|
+
GGML_ASSERT(info->ne[i] > 0);
|
19382
|
+
}
|
19383
|
+
|
19384
|
+
// prevent overflow for total number of elements
|
19385
|
+
GGML_ASSERT(INT64_MAX/info->ne[1] > info->ne[0]);
|
19386
|
+
GGML_ASSERT(INT64_MAX/info->ne[2] > info->ne[0]*info->ne[1]);
|
19387
|
+
GGML_ASSERT(INT64_MAX/info->ne[3] > info->ne[0]*info->ne[1]*info->ne[2]);
|
19388
|
+
}
|
19389
|
+
|
18907
19390
|
static bool gguf_fread_el(FILE * file, void * dst, size_t size, size_t * offset) {
|
18908
19391
|
const size_t n = fread(dst, 1, size, file);
|
18909
19392
|
*offset += n;
|
@@ -18916,8 +19399,17 @@ static bool gguf_fread_str(FILE * file, struct gguf_str * p, size_t * offset) {
|
|
18916
19399
|
|
18917
19400
|
bool ok = true;
|
18918
19401
|
|
18919
|
-
ok = ok && gguf_fread_el(file, &p->n,
|
18920
|
-
|
19402
|
+
ok = ok && gguf_fread_el(file, &p->n, sizeof(p->n), offset);
|
19403
|
+
|
19404
|
+
// early exit if string length is invalid, prevents from integer overflow
|
19405
|
+
if (p->n == SIZE_MAX) {
|
19406
|
+
fprintf(stderr, "%s: invalid string length (%" PRIu64 ")\n", __func__, p->n);
|
19407
|
+
return false;
|
19408
|
+
}
|
19409
|
+
|
19410
|
+
p->data = GGML_CALLOC(p->n + 1, 1);
|
19411
|
+
|
19412
|
+
ok = ok && gguf_fread_el(file, p->data, p->n, offset);
|
18921
19413
|
|
18922
19414
|
return ok;
|
18923
19415
|
}
|
@@ -18989,6 +19481,12 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
|
|
18989
19481
|
return NULL;
|
18990
19482
|
}
|
18991
19483
|
|
19484
|
+
// sanity-checks to prevent from integer/buffer overflows
|
19485
|
+
|
19486
|
+
ok = ok && (ctx->header.n_tensors < (SIZE_MAX/2)/sizeof(struct gguf_tensor_info));
|
19487
|
+
ok = ok && (ctx->header.n_tensors < (SIZE_MAX/2)/ggml_tensor_overhead());
|
19488
|
+
ok = ok && (ctx->header.n_kv < (SIZE_MAX/2)/sizeof(struct gguf_kv));
|
19489
|
+
|
18992
19490
|
if (!ok) {
|
18993
19491
|
fprintf(stderr, "%s: failed to read header\n", __func__);
|
18994
19492
|
fclose(file);
|
@@ -18999,7 +19497,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
|
|
18999
19497
|
|
19000
19498
|
// read the kv pairs
|
19001
19499
|
{
|
19002
|
-
ctx->kv =
|
19500
|
+
ctx->kv = GGML_MALLOC(ctx->header.n_kv * sizeof(struct gguf_kv));
|
19003
19501
|
|
19004
19502
|
for (uint64_t i = 0; i < ctx->header.n_kv; ++i) {
|
19005
19503
|
struct gguf_kv * kv = &ctx->kv[i];
|
@@ -19027,7 +19525,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
|
|
19027
19525
|
case GGUF_TYPE_ARRAY:
|
19028
19526
|
{
|
19029
19527
|
ok = ok && gguf_fread_el(file, &kv->value.arr.type, sizeof(kv->value.arr.type), &offset);
|
19030
|
-
ok = ok && gguf_fread_el(file, &kv->value.arr.n, sizeof(kv->value.arr.n),
|
19528
|
+
ok = ok && gguf_fread_el(file, &kv->value.arr.n, sizeof(kv->value.arr.n), &offset);
|
19031
19529
|
|
19032
19530
|
switch (kv->value.arr.type) {
|
19033
19531
|
case GGUF_TYPE_UINT8:
|
@@ -19042,21 +19540,39 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
                             case GGUF_TYPE_FLOAT64:
                             case GGUF_TYPE_BOOL:
                                 {
-                                    kv->value.arr.data = malloc(kv->value.arr.n * GGUF_TYPE_SIZE[kv->value.arr.type]);
-                                    ok = ok && gguf_fread_el(file, kv->value.arr.data, kv->value.arr.n * GGUF_TYPE_SIZE[kv->value.arr.type], &offset);
+                                    // prevent from integer overflow in the malloc below
+                                    if (kv->value.arr.n >= SIZE_MAX/gguf_type_size(kv->value.arr.type)) {
+                                        fprintf(stderr, "%s: array size is too large (%" PRIu64 ")\n", __func__, kv->value.arr.n);
+                                        fclose(file);
+                                        gguf_free(ctx);
+                                        return NULL;
+                                    }
+
+                                    kv->value.arr.data = GGML_MALLOC(kv->value.arr.n * gguf_type_size(kv->value.arr.type));
+
+                                    ok = ok && gguf_fread_el(file, kv->value.arr.data, kv->value.arr.n * gguf_type_size(kv->value.arr.type), &offset);
                                 } break;
                             case GGUF_TYPE_STRING:
                                 {
-                                    kv->value.arr.data = malloc(kv->value.arr.n * sizeof(struct gguf_str));
+                                    // prevent from integer overflow in the malloc below
+                                    if (kv->value.arr.n >= SIZE_MAX/sizeof(struct gguf_str)) {
+                                        fprintf(stderr, "%s: array size is too large (%" PRIu64 ")\n", __func__, kv->value.arr.n);
+                                        fclose(file);
+                                        gguf_free(ctx);
+                                        return NULL;
+                                    }
+
+                                    kv->value.arr.data = GGML_MALLOC(kv->value.arr.n * sizeof(struct gguf_str));
+
                                     for (uint64_t j = 0; j < kv->value.arr.n; ++j) {
                                         ok = ok && gguf_fread_str(file, &((struct gguf_str *) kv->value.arr.data)[j], &offset);
                                     }
                                 } break;
                             case GGUF_TYPE_ARRAY:
-                            case GGUF_TYPE_COUNT: GGML_ASSERT(false && "invalid type"); break;
+                            default: GGML_ASSERT(false && "invalid type"); break;
                         }
                     } break;
-                case GGUF_TYPE_COUNT: GGML_ASSERT(false && "invalid type");
+                default: GGML_ASSERT(false && "invalid type");
             }
 
             if (!ok) {
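Note on the hunk above: every array allocation is now preceded by an overflow check of the form `n >= SIZE_MAX / element_size`, which guarantees that the subsequent `n * element_size` cannot wrap around before it reaches `GGML_MALLOC`. A small, self-contained sketch of the same pattern; the helper name `checked_array_alloc` is illustrative, not a ggml API:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

// Illustrative helper: allocate n elements of elem_size bytes each,
// refusing sizes whose product would wrap around SIZE_MAX.
static void * checked_array_alloc(size_t n, size_t elem_size) {
    if (elem_size == 0 || n >= SIZE_MAX / elem_size) {
        fprintf(stderr, "array size is too large (%zu)\n", n);
        return NULL;
    }
    return malloc(n * elem_size);   // product is now known not to overflow
}

int main(void) {
    void * ok  = checked_array_alloc(1024, sizeof(double));         // accepted
    void * bad = checked_array_alloc(SIZE_MAX / 2, sizeof(double)); // rejected
    printf("ok=%p bad=%p\n", ok, bad);
    free(ok);
    return 0;
}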
@@ -19074,7 +19590,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
 
     // read the tensor infos
     {
-        ctx->infos = malloc(ctx->header.n_tensors * sizeof(struct gguf_tensor_info));
+        ctx->infos = GGML_MALLOC(ctx->header.n_tensors * sizeof(struct gguf_tensor_info));
 
         for (uint64_t i = 0; i < ctx->header.n_tensors; ++i) {
             struct gguf_tensor_info * info = &ctx->infos[i];
@@ -19085,12 +19601,18 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
 
             ok = ok && gguf_fread_str(file, &info->name, &offset);
             ok = ok && gguf_fread_el (file, &info->n_dims, sizeof(info->n_dims), &offset);
+
+            ok = ok && (info->n_dims <= GGML_MAX_DIMS);
+
             for (uint32_t j = 0; j < info->n_dims; ++j) {
                 ok = ok && gguf_fread_el(file, &info->ne[j], sizeof(info->ne[j]), &offset);
             }
+
             ok = ok && gguf_fread_el (file, &info->type, sizeof(info->type), &offset);
             ok = ok && gguf_fread_el (file, &info->offset, sizeof(info->offset), &offset);
 
+            gguf_tensor_info_sanitize(info);
+
             if (!ok) {
                 fprintf(stderr, "%s: failed to read tensor info\n", __func__);
                 fclose(file);
@@ -19244,12 +19766,12 @@ void gguf_free(struct gguf_context * ctx) {
             struct gguf_kv * kv = &ctx->kv[i];
 
             if (kv->key.data) {
-                free(kv->key.data);
+                GGML_FREE(kv->key.data);
             }
 
             if (kv->type == GGUF_TYPE_STRING) {
                 if (kv->value.str.data) {
-                    free(kv->value.str.data);
+                    GGML_FREE(kv->value.str.data);
                 }
             }
 
@@ -19259,16 +19781,16 @@ void gguf_free(struct gguf_context * ctx) {
                         for (uint64_t j = 0; j < kv->value.arr.n; ++j) {
                             struct gguf_str * str = &((struct gguf_str *) kv->value.arr.data)[j];
                             if (str->data) {
-                                free(str->data);
+                                GGML_FREE(str->data);
                             }
                         }
                     }
-                    free(kv->value.arr.data);
+                    GGML_FREE(kv->value.arr.data);
                 }
             }
         }
 
-        free(ctx->kv);
+        GGML_FREE(ctx->kv);
     }
 
     if (ctx->infos) {
@@ -19276,11 +19798,11 @@ void gguf_free(struct gguf_context * ctx) {
             struct gguf_tensor_info * info = &ctx->infos[i];
 
             if (info->name.data) {
-                free(info->name.data);
+                GGML_FREE(info->name.data);
             }
         }
 
-        free(ctx->infos);
+        GGML_FREE(ctx->infos);
     }
 
     GGML_ALIGNED_FREE(ctx);
@@ -19581,8 +20103,8 @@ void gguf_set_arr_data(struct gguf_context * ctx, const char * key, enum gguf_ty
     ctx->kv[idx].type = GGUF_TYPE_ARRAY;
     ctx->kv[idx].value.arr.type = type;
     ctx->kv[idx].value.arr.n = n;
-    ctx->kv[idx].value.arr.data = malloc(n*GGUF_TYPE_SIZE[type]);
-    memcpy(ctx->kv[idx].value.arr.data, data, n*GGUF_TYPE_SIZE[type]);
+    ctx->kv[idx].value.arr.data = GGML_MALLOC(n*gguf_type_size(type));
+    memcpy(ctx->kv[idx].value.arr.data, data, n*gguf_type_size(type));
 }
 
 void gguf_set_arr_str(struct gguf_context * ctx, const char * key, const char ** data, int n) {
@@ -19591,7 +20113,7 @@ void gguf_set_arr_str(struct gguf_context * ctx, const char * key, const char **
     ctx->kv[idx].type = GGUF_TYPE_ARRAY;
     ctx->kv[idx].value.arr.type = GGUF_TYPE_STRING;
     ctx->kv[idx].value.arr.n = n;
-    ctx->kv[idx].value.arr.data = malloc(n*sizeof(struct gguf_str));
+    ctx->kv[idx].value.arr.data = GGML_MALLOC(n*sizeof(struct gguf_str));
     for (int i = 0; i < n; i++) {
         struct gguf_str * str = &((struct gguf_str *)ctx->kv[idx].value.arr.data)[i];
         str->n = strlen(data[i]);
@@ -19618,19 +20140,19 @@ void gguf_set_kv(struct gguf_context * ctx, struct gguf_context * src) {
             case GGUF_TYPE_ARRAY:
                 {
                     if (src->kv[i].value.arr.type == GGUF_TYPE_STRING) {
-                        const char ** data = malloc(src->kv[i].value.arr.n*sizeof(char *));
+                        const char ** data = GGML_MALLOC(src->kv[i].value.arr.n*sizeof(char *));
                         for (uint32_t j = 0; j < src->kv[i].value.arr.n; j++) {
                             data[j] = ((struct gguf_str *)src->kv[i].value.arr.data)[j].data;
                         }
                         gguf_set_arr_str(ctx, src->kv[i].key.data, data, src->kv[i].value.arr.n);
-                        free((void *)data);
+                        GGML_FREE((void *)data);
                     } else if (src->kv[i].value.arr.type == GGUF_TYPE_ARRAY) {
                         GGML_ASSERT(false && "nested arrays not supported");
                     } else {
                         gguf_set_arr_data(ctx, src->kv[i].key.data, src->kv[i].value.arr.type, src->kv[i].value.arr.data, src->kv[i].value.arr.n);
                     }
                 } break;
-            case GGUF_TYPE_COUNT: GGML_ASSERT(false && "invalid type"); break;
+            default: GGML_ASSERT(false && "invalid type"); break;
         }
     }
 }
@@ -19706,7 +20228,7 @@ struct gguf_buf {
 
 static struct gguf_buf gguf_buf_init(size_t size) {
     struct gguf_buf buf = {
-        /*buf.data =*/ size == 0 ? NULL : malloc(size),
+        /*buf.data =*/ size == 0 ? NULL : GGML_MALLOC(size),
         /*buf.size =*/ size,
         /*buf.offset =*/ 0,
     };
@@ -19716,7 +20238,7 @@ static struct gguf_buf gguf_buf_init(size_t size) {
 
 static void gguf_buf_free(struct gguf_buf buf) {
     if (buf.data) {
-        free(buf.data);
+        GGML_FREE(buf.data);
     }
 }
 
@@ -19797,7 +20319,7 @@ static void gguf_write_to_buf(const struct gguf_context * ctx, struct gguf_buf *
                         case GGUF_TYPE_FLOAT64:
                         case GGUF_TYPE_BOOL:
                             {
-                                gguf_bwrite_el(buf, kv->value.arr.data, kv->value.arr.n * GGUF_TYPE_SIZE[kv->value.arr.type]);
+                                gguf_bwrite_el(buf, kv->value.arr.data, kv->value.arr.n * gguf_type_size(kv->value.arr.type));
                             } break;
                         case GGUF_TYPE_STRING:
                             {
@@ -19806,10 +20328,10 @@ static void gguf_write_to_buf(const struct gguf_context * ctx, struct gguf_buf *
                                }
                            } break;
                        case GGUF_TYPE_ARRAY:
-                        case GGUF_TYPE_COUNT: GGML_ASSERT(false && "invalid type"); break;
+                        default: GGML_ASSERT(false && "invalid type"); break;
                    }
                } break;
-            case GGUF_TYPE_COUNT: GGML_ASSERT(false && "invalid type");
+            default: GGML_ASSERT(false && "invalid type");
        }
    }
 
@@ -20010,7 +20532,7 @@ int ggml_cpu_has_wasm_simd(void) {
 }
 
 int ggml_cpu_has_blas(void) {
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_VULKAN) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_SYCL)
     return 1;
 #else
     return 0;
@@ -20033,8 +20555,33 @@ int ggml_cpu_has_clblast(void) {
 #endif
 }
 
+int ggml_cpu_has_vulkan(void) {
+#if defined(GGML_USE_VULKAN)
+    return 1;
+#else
+    return 0;
+#endif
+}
+
+int ggml_cpu_has_kompute(void) {
+#if defined(GGML_USE_KOMPUTE)
+    return 1;
+#else
+    return 0;
+#endif
+}
+
+int ggml_cpu_has_sycl(void) {
+#if defined(GGML_USE_SYCL)
+    return 1;
+#else
+    return 0;
+#endif
+}
+
 int ggml_cpu_has_gpublas(void) {
-    return ggml_cpu_has_cublas() || ggml_cpu_has_clblast();
+    return ggml_cpu_has_cublas() || ggml_cpu_has_clblast() || ggml_cpu_has_vulkan() || ggml_cpu_has_kompute() ||
+           ggml_cpu_has_sycl();
 }
 
 int ggml_cpu_has_sse3(void) {
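Note on the hunk above: the new ggml_cpu_has_vulkan, ggml_cpu_has_kompute, and ggml_cpu_has_sycl reporters follow the same compile-time pattern as the existing ggml_cpu_has_* functions, and ggml_cpu_has_gpublas now ORs them in. A minimal usage sketch for a program linked against this ggml build; it assumes the matching ggml.h (which this release also updates) declares these functions and is on the include path:

#include <stdio.h>
#include "ggml.h"

// Print which GPU-capable backends this ggml build was compiled with.
int main(void) {
    printf("cublas:  %d\n", ggml_cpu_has_cublas());
    printf("clblast: %d\n", ggml_cpu_has_clblast());
    printf("vulkan:  %d\n", ggml_cpu_has_vulkan());
    printf("kompute: %d\n", ggml_cpu_has_kompute());
    printf("sycl:    %d\n", ggml_cpu_has_sycl());
    printf("any GPU BLAS-like backend: %d\n", ggml_cpu_has_gpublas());
    return 0;
}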