llama_cpp 0.12.2 → 0.12.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +15 -0
- data/README.md +2 -2
- data/ext/llama_cpp/extconf.rb +1 -0
- data/ext/llama_cpp/llama_cpp.cpp +68 -6
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +6 -2
- data/vendor/tmp/llama.cpp/Makefile +25 -3
- data/vendor/tmp/llama.cpp/ggml-alloc.c +87 -27
- data/vendor/tmp/llama.cpp/ggml-backend-impl.h +6 -0
- data/vendor/tmp/llama.cpp/ggml-backend.c +176 -18
- data/vendor/tmp/llama.cpp/ggml-backend.h +14 -0
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +1990 -0
- data/vendor/tmp/llama.cpp/ggml-kompute.h +46 -0
- data/vendor/tmp/llama.cpp/ggml-metal.h +3 -0
- data/vendor/tmp/llama.cpp/ggml-metal.m +144 -113
- data/vendor/tmp/llama.cpp/ggml-metal.metal +303 -4
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +95 -3
- data/vendor/tmp/llama.cpp/ggml-opencl.h +1 -0
- data/vendor/tmp/llama.cpp/ggml-quants.c +736 -59
- data/vendor/tmp/llama.cpp/ggml-quants.h +20 -1
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +15255 -0
- data/vendor/tmp/llama.cpp/ggml-sycl.h +29 -0
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +60854 -0
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +5270 -0
- data/vendor/tmp/llama.cpp/ggml-vulkan.h +34 -0
- data/vendor/tmp/llama.cpp/ggml.c +664 -117
- data/vendor/tmp/llama.cpp/ggml.h +46 -11
- data/vendor/tmp/llama.cpp/llama.cpp +1426 -341
- data/vendor/tmp/llama.cpp/llama.h +24 -15
- data/vendor/tmp/llama.cpp/unicode.h +2 -1
- metadata +10 -3
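Among the changes in the vendored llama.cpp sources below, ggml gains an IQ3_XXS quantization type and two new unary ops, HARDSWISH and HARDSIGMOID. As a quick orientation for the ggml.c diff that follows, here is a minimal standalone sketch of the element-wise math those ops implement, mirroring the formulas in the new ggml_vec_hardswish_f32/ggml_vec_hardsigmoid_f32 kernels; the helper names are illustrative only and not part of the gem's API:

```c
#include <math.h>

/* hardsigmoid(x) = clamp((x + 3) / 6, 0, 1) */
static float hardsigmoid(float x) {
    return fminf(1.0f, fmaxf(0.0f, (x + 3.0f) / 6.0f));
}

/* hardswish(x) = x * hardsigmoid(x) */
static float hardswish(float x) {
    return x * hardsigmoid(x);
}
```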
data/vendor/tmp/llama.cpp/ggml.c
CHANGED
@@ -218,6 +218,7 @@ inline static void * ggml_aligned_malloc(size_t size) {
 break;
 }
 GGML_PRINT("%s: %s (attempted to allocate %6.2f MB)\n", __func__, error_desc, size/(1024.0*1024.0));
+GGML_ASSERT(false);
 return NULL;
 }
 return aligned_memory;
@@ -230,6 +231,38 @@ inline static void * ggml_aligned_malloc(size_t size) {
 #endif
 #endif

+inline static void * ggml_malloc(size_t size) {
+if (size == 0) {
+GGML_PRINT("WARNING: Behavior may be unexpected when allocating 0 bytes for ggml_malloc!\n");
+return NULL;
+}
+void * result = malloc(size);
+if (result == NULL) {
+GGML_PRINT("%s: failed to allocate %6.2f MB\n", __func__, size/(1024.0*1024.0));
+GGML_ASSERT(false);
+}
+return result;
+}
+
+// calloc
+inline static void * ggml_calloc(size_t num, size_t size) {
+if (num == 0 || size == 0) {
+GGML_PRINT("WARNING: Behavior may be unexpected when allocating 0 bytes for ggml_calloc!\n");
+return NULL;
+}
+void * result = calloc(num, size);
+if (result == NULL) {
+GGML_PRINT("%s: failed to allocate %6.2f MB\n", __func__, size/(1024.0*1024.0));
+GGML_ASSERT(false);
+}
+return result;
+}
+
+#define GGML_MALLOC(size) ggml_malloc(size)
+#define GGML_CALLOC(num, size) ggml_calloc(num, size)
+
+#define GGML_FREE(ptr) free(ptr)
+
 #define UNUSED GGML_UNUSED
 #define SWAP(x, y, T) do { T SWAP = x; x = y; y = SWAP; } while (0)

@@ -248,6 +281,10 @@ inline static void * ggml_aligned_malloc(size_t size) {
 #include "ggml-cuda.h"
 #elif defined(GGML_USE_CLBLAST)
 #include "ggml-opencl.h"
+#elif defined(GGML_USE_VULKAN)
+#include "ggml-vulkan.h"
+#elif defined(GGML_USE_SYCL)
+#include "ggml-sycl.h"
 #endif

 // floating point type used to accumulate sums
@@ -394,12 +431,6 @@ static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float);
 static void ggml_vec_dot_f32(const int n, float * restrict s, const float * restrict x, const float * restrict y);
 static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t * restrict x, ggml_fp16_t * restrict y);

-ggml_collect_imatrix_t g_imatrix_collect = NULL;
-
-void ggml_set_imatrix_collection(ggml_collect_imatrix_t imatrix_collect) {
-g_imatrix_collect = imatrix_collect;
-}
-
 static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
 [GGML_TYPE_I8] = {
 .type_name = "i8",
@@ -601,6 +632,17 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
 .vec_dot = ggml_vec_dot_iq2_xs_q8_K,
 .vec_dot_type = GGML_TYPE_Q8_K,
 },
+[GGML_TYPE_IQ3_XXS] = {
+.type_name = "iq3_xxs",
+.blck_size = QK_K,
+.type_size = sizeof(block_iq3_xxs),
+.is_quantized = true,
+.to_float = (ggml_to_float_t) dequantize_row_iq3_xxs,
+.from_float = quantize_row_iq3_xxs,
+.from_float_reference = (ggml_from_float_t)quantize_row_iq3_xxs_reference,
+.vec_dot = ggml_vec_dot_iq3_xxs_q8_K,
+.vec_dot_type = GGML_TYPE_Q8_K,
+},
 [GGML_TYPE_Q8_K] = {
 .type_name = "q8_K",
 .blck_size = QK_K,
@@ -1424,6 +1466,9 @@ inline static void ggml_vec_tanh_f32 (const int n, float * y, const float * x) {
 inline static void ggml_vec_elu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : expf(x[i])-1; }
 inline static void ggml_vec_relu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : 0.f; }
 inline static void ggml_vec_leaky_relu_f32 (const int n, float * y, const float * x, const float ns) { for (int i = 0; i < n; ++i) y[i] = ((x[i] > 0.f) ? x[i] : 0.f) + ns * ((x[i] < 0.0f) ? x[i] : 0.f); }
+// TODO: optimize performance
+inline static void ggml_vec_hardswish_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i] * fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f)); }
+inline static void ggml_vec_hardsigmoid_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f)); }

 static const float GELU_COEF_A = 0.044715f;
 static const float GELU_QUICK_COEF = -1.702f;
@@ -1782,9 +1827,11 @@ static const char * GGML_UNARY_OP_NAME[GGML_UNARY_OP_COUNT] = {
 "GELU",
 "GELU_QUICK",
 "SILU",
+"HARDSWISH",
+"HARDSIGMOID",
 };

-static_assert(GGML_UNARY_OP_COUNT ==
+static_assert(GGML_UNARY_OP_COUNT == 12, "GGML_UNARY_OP_COUNT != 12");


 static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN");
@@ -2141,6 +2188,7 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) {
 case GGML_FTYPE_MOSTLY_Q6_K: wtype = GGML_TYPE_Q6_K; break;
 case GGML_FTYPE_MOSTLY_IQ2_XXS: wtype = GGML_TYPE_IQ2_XXS; break;
 case GGML_FTYPE_MOSTLY_IQ2_XS: wtype = GGML_TYPE_IQ2_XS; break;
+case GGML_FTYPE_MOSTLY_IQ3_XXS: wtype = GGML_TYPE_IQ3_XXS; break;
 case GGML_FTYPE_UNKNOWN: wtype = GGML_TYPE_COUNT; break;
 case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16: wtype = GGML_TYPE_COUNT; break;
 }
@@ -2294,6 +2342,10 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
 ggml_init_cublas();
 #elif defined(GGML_USE_CLBLAST)
 ggml_cl_init();
+#elif defined(GGML_USE_VULKAN)
+ggml_vk_init();
+#elif defined(GGML_USE_SYCL)
+ggml_init_sycl();
 #endif

 ggml_setup_op_has_task_pass();
@@ -3951,6 +4003,20 @@ struct ggml_tensor * ggml_silu_back(
 return result;
 }

+// ggml hardswish
+struct ggml_tensor * ggml_hardswish(
+struct ggml_context * ctx,
+struct ggml_tensor * a) {
+return ggml_unary(ctx, a, GGML_UNARY_OP_HARDSWISH);
+}
+
+// ggml hardsigmoid
+struct ggml_tensor * ggml_hardsigmoid(
+struct ggml_context * ctx,
+struct ggml_tensor * a) {
+return ggml_unary(ctx, a, GGML_UNARY_OP_HARDSIGMOID);
+}
+
 // ggml_norm

 static struct ggml_tensor * ggml_norm_impl(
@@ -5283,7 +5349,7 @@ GGML_API struct ggml_tensor * ggml_conv_1d(
 int s0,
 int p0,
 int d0) {
-struct ggml_tensor * im2col = ggml_im2col(ctx, a, b, s0, 0, p0, 0, d0, 0, false); // [N, OL, IC * K]
+struct ggml_tensor * im2col = ggml_im2col(ctx, a, b, s0, 0, p0, 0, d0, 0, false, GGML_TYPE_F16); // [N, OL, IC * K]

 struct ggml_tensor * result =
 ggml_mul_mat(ctx,
@@ -5350,6 +5416,30 @@ GGML_API struct ggml_tensor * ggml_conv_transpose_1d(
 return result;
 }

+// ggml_conv_depthwise
+struct ggml_tensor * ggml_conv_depthwise_2d(
+struct ggml_context * ctx,
+struct ggml_tensor * a,
+struct ggml_tensor * b,
+int s0,
+int s1,
+int p0,
+int p1,
+int d0,
+int d1) {
+
+struct ggml_tensor * new_a = ggml_reshape_4d(ctx, a, a->ne[0], a->ne[1], 1, a->ne[2] * a->ne[3]);
+struct ggml_tensor * im2col = ggml_im2col(ctx, new_a,
+ggml_reshape_4d(ctx, b, b->ne[0], b->ne[1], 1, b->ne[2] * b->ne[3]),
+s0, s1, p0, p1, d0, d1, true, GGML_TYPE_F16); // [N * IC, OH, OW, KH * KW]
+struct ggml_tensor * new_b = ggml_reshape_4d(ctx, im2col, im2col->ne[0], im2col->ne[2] * im2col->ne[1], b->ne[2], b->ne[3]); // [N * IC, OH, OW, KH * KW] => [N, IC, OH * OW, KH * KW]
+
+new_a = ggml_reshape_4d(ctx, new_a, (new_a->ne[0] * new_a->ne[1]), new_a->ne[2], new_a->ne[3], 1); // [OC,1, KH, KW] => [1, OC, 1, KH * KW]
+struct ggml_tensor * result = ggml_mul_mat(ctx, new_a, new_b);
+result = ggml_reshape_4d(ctx, result, im2col->ne[1], im2col->ne[2], b->ne[2], b->ne[3]); // [N, OC, OH, OW]
+
+return result;
+}
 // ggml_conv_2d

 // im2col: [N, IC, IH, IW] => [N, OH, OW, IC*KH*KW]
@@ -5366,7 +5456,8 @@ struct ggml_tensor * ggml_im2col(
 int p1,
 int d0,
 int d1,
-bool is_2D
+bool is_2D,
+enum ggml_type dst_type) {

 if(is_2D) {
 GGML_ASSERT(a->ne[2] == b->ne[2]);
@@ -5390,7 +5481,7 @@ struct ggml_tensor * ggml_im2col(
 is_2D ? b->ne[3] : 1,
 };

-struct ggml_tensor * result = ggml_new_tensor(ctx,
+struct ggml_tensor * result = ggml_new_tensor(ctx, dst_type, 4, ne);
 int32_t params[] = { s0, s1, p0, p1, d0, d1, (is_2D ? 1 : 0) };
 ggml_set_op_params(result, params, sizeof(params));

@@ -5415,7 +5506,7 @@ struct ggml_tensor * ggml_conv_2d(
 int p1,
 int d0,
 int d1) {
-struct ggml_tensor * im2col = ggml_im2col(ctx, a, b, s0, s1, p0, p1, d0, d1, true); // [N, OH, OW, IC * KH * KW]
+struct ggml_tensor * im2col = ggml_im2col(ctx, a, b, s0, s1, p0, p1, d0, d1, true, GGML_TYPE_F16); // [N, OH, OW, IC * KH * KW]

 struct ggml_tensor * result =
 ggml_mul_mat(ctx,
@@ -5541,12 +5632,13 @@ struct ggml_tensor * ggml_pool_2d(
 is_node = true;
 }

+struct ggml_tensor * result;
 const int64_t ne[3] = {
 ggml_calc_pool_output_size(a->ne[0], k0, s0, p0),
 ggml_calc_pool_output_size(a->ne[1], k1, s1, p1),
 a->ne[2],
 };
-
+result = ggml_new_tensor(ctx, GGML_TYPE_F32, 3, ne);

 int32_t params[] = { op, k0, k1, s0, s1, p0, p1 };
 ggml_set_op_params(result, params, sizeof(params));
@@ -5554,7 +5646,6 @@ struct ggml_tensor * ggml_pool_2d(
 result->op = GGML_OP_POOL_2D;
 result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
 result->src[0] = a;
-
 return result;
 }

@@ -7169,6 +7260,17 @@ static void ggml_compute_forward_add_f32(
 const int ith = params->ith;
 const int nth = params->nth;

+#ifdef GGML_USE_CLBLAST
+if (src1->backend == GGML_BACKEND_GPU) {
+// TODO: OpenCL kernel support full broadcast
+GGML_ASSERT(ggml_can_repeat_rows(src1, src0));
+if (ith == 0) {
+ggml_cl_add(src0, src1, dst);
+}
+return;
+}
+#endif
+
 const int nr = ggml_nrows(src0);

 GGML_TENSOR_BINARY_OP_LOCALS
@@ -7449,7 +7551,12 @@ static void ggml_compute_forward_add(
 switch (src0->type) {
 case GGML_TYPE_F32:
 {
-
+if (src1->type == GGML_TYPE_F32) {
+ggml_compute_forward_add_f32(params, src0, src1, dst);
+}
+else {
+GGML_ASSERT(false);
+}
 } break;
 case GGML_TYPE_F16:
 {
@@ -7475,6 +7582,7 @@ static void ggml_compute_forward_add(
 case GGML_TYPE_Q6_K:
 case GGML_TYPE_IQ2_XXS:
 case GGML_TYPE_IQ2_XS:
+case GGML_TYPE_IQ3_XXS:
 {
 ggml_compute_forward_add_q_f32(params, src0, src1, dst);
 } break;
@@ -7741,6 +7849,7 @@ static void ggml_compute_forward_add1(
 case GGML_TYPE_Q6_K:
 case GGML_TYPE_IQ2_XXS:
 case GGML_TYPE_IQ2_XS:
+case GGML_TYPE_IQ3_XXS:
 {
 ggml_compute_forward_add1_q_f32(params, src0, src1, dst);
 } break;
@@ -7770,6 +7879,9 @@ static void ggml_compute_forward_acc_f32(
 bool inplace = (bool) ((int32_t *) dst->op_params)[4];

 if (!inplace && (params->type == GGML_TASK_INIT)) {
+if (params->ith != 0) {
+return;
+}
 // memcpy needs to be synchronized across threads to avoid race conditions.
 // => do it in INIT phase
 memcpy(
@@ -7857,6 +7969,7 @@ static void ggml_compute_forward_acc(
 case GGML_TYPE_Q6_K:
 case GGML_TYPE_IQ2_XXS:
 case GGML_TYPE_IQ2_XS:
+case GGML_TYPE_IQ3_XXS:
 default:
 {
 GGML_ASSERT(false);
@@ -7958,7 +8071,7 @@ static void ggml_compute_forward_mul_f32(
 const int ith = params->ith;
 const int nth = params->nth;

-#
+#if defined(GGML_USE_CLBLAST)
 if (src1->backend == GGML_BACKEND_GPU) {
 // TODO: OpenCL kernel support full broadcast
 GGML_ASSERT(ggml_can_repeat_rows(src1, src0));
@@ -9339,6 +9452,87 @@ static void ggml_compute_forward_silu_back(
 }
 }

+
+static void ggml_compute_forward_hardswish_f32(
+const struct ggml_compute_params * params,
+const struct ggml_tensor * src0,
+struct ggml_tensor * dst) {
+assert(params->ith == 0);
+assert(ggml_are_same_shape(src0, dst));
+
+if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+return;
+}
+
+const int n = ggml_nrows(src0);
+const int nc = src0->ne[0];
+
+assert(dst->nb[0] == sizeof(float));
+assert(src0->nb[0] == sizeof(float));
+
+for (int i = 0; i < n; i++) {
+ggml_vec_hardswish_f32(nc,
+(float *) ((char *) dst->data + i*( dst->nb[1])),
+(float *) ((char *) src0->data + i*(src0->nb[1])));
+}
+}
+static void ggml_compute_forward_hardswish(
+const struct ggml_compute_params * params,
+const struct ggml_tensor * src0,
+struct ggml_tensor * dst) {
+switch (src0->type) {
+case GGML_TYPE_F32:
+{
+ggml_compute_forward_hardswish_f32(params, src0, dst);
+} break;
+default:
+{
+GGML_ASSERT(false);
+} break;
+}
+}
+
+static void ggml_compute_forward_hardsigmoid_f32(
+const struct ggml_compute_params * params,
+const struct ggml_tensor * src0,
+struct ggml_tensor * dst) {
+assert(params->ith == 0);
+assert(ggml_are_same_shape(src0, dst));
+
+if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+return;
+}
+
+const int n = ggml_nrows(src0);
+const int nc = src0->ne[0];
+
+assert(dst->nb[0] == sizeof(float));
+assert(src0->nb[0] == sizeof(float));
+
+for (int i = 0; i < n; i++) {
+ggml_vec_hardsigmoid_f32(nc,
+(float *) ((char *) dst->data + i*( dst->nb[1])),
+(float *) ((char *) src0->data + i*(src0->nb[1])));
+}
+}
+
+static void ggml_compute_forward_hardsigmoid(
+const struct ggml_compute_params * params,
+const struct ggml_tensor * src0,
+struct ggml_tensor * dst) {
+switch (src0->type) {
+case GGML_TYPE_F32:
+{
+ggml_compute_forward_hardsigmoid_f32(params, src0, dst);
+} break;
+default:
+{
+GGML_ASSERT(false);
+} break;
+}
+}
+
+
 // ggml_compute_forward_norm

 static void ggml_compute_forward_norm_f32(
@@ -9790,10 +9984,6 @@ static void ggml_compute_forward_mul_mat(
 const int ith = params->ith;
 const int nth = params->nth;

-if (ith == 1 && g_imatrix_collect) {
-g_imatrix_collect(src0, src1);
-}
-
 const enum ggml_type type = src0->type;

 const bool src1_cont = ggml_is_contiguous(src1);
@@ -9835,11 +10025,30 @@ static void ggml_compute_forward_mul_mat(

 #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
 if (ggml_compute_forward_mul_mat_use_blas(dst)) {
-
-
-
+const int64_t ne_plane = ne01*ne00;
+const size_t desired_wsize = ne13*ne12*ne_plane*sizeof(float);
+UNUSED(desired_wsize);

 if (params->type == GGML_TASK_INIT) {
+if (type != GGML_TYPE_F32) {
+assert(params->wsize >= desired_wsize);
+// parallelize by src0 rows
+for (int64_t i13 = 0; i13 < ne13; i13++) {
+for (int64_t i12 = 0; i12 < ne12; i12++) {
+// broadcast src0 into src1 across 2nd,3rd dimension
+const int64_t i03 = i13/r3;
+const int64_t i02 = i12/r2;
+
+const void * x = (char *) src0->data + i02*nb02 + i03*nb03;
+float * const wdata = (float *) params->wdata + i13*ne12*ne_plane + i12*ne_plane;
+ggml_to_float_t const to_float = type_traits[type].to_float;
+
+for (int64_t i01 = ith; i01 < ne01; i01 += nth) {
+to_float((const char *) x + i01*nb01, wdata + i01*ne00, ne00);
+}
+}
+}
+}
 return;
 }

@@ -9847,9 +10056,14 @@ static void ggml_compute_forward_mul_mat(
 return;
 }

+// perform sgemm, parallelization controlled by blas lib
+if (ith != 0) {
+return;
+}
+
+//const int64_t tgemm0 = ggml_perf_time_us();
 for (int64_t i13 = 0; i13 < ne13; i13++) {
 for (int64_t i12 = 0; i12 < ne12; i12++) {
-// broadcast src0 into src1 across 2nd,3rd dimension
 const int64_t i03 = i13/r3;
 const int64_t i02 = i12/r2;

@@ -9858,17 +10072,7 @@ static void ggml_compute_forward_mul_mat(
 float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);

 if (type != GGML_TYPE_F32) {
-
-ggml_to_float_t const to_float = type_traits[type].to_float;
-
-size_t id = 0;
-for (int64_t i01 = 0; i01 < ne01; ++i01) {
-to_float((const char *) x + i01*nb01, wdata + id, ne00);
-id += ne00;
-}
-
-assert(id*sizeof(float) <= params->wsize);
-x = wdata;
+x = (float *) params->wdata + i13*ne12*ne_plane + i12*ne_plane;
 }

 cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
@@ -9878,6 +10082,7 @@ static void ggml_compute_forward_mul_mat(
 0.0f, d, ne01);
 }
 }
+//printf("cblas_sgemm = %.3f ms, %lld flops\n", (ggml_perf_time_us() - tgemm0)/1000.0, ne13*ne12*ne1*ne01*ne10*2);

 //printf("CBLAS = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3);

@@ -9886,6 +10091,9 @@ static void ggml_compute_forward_mul_mat(
 #endif

 if (params->type == GGML_TASK_INIT) {
+if (ith != 0) {
+return;
+}
 if (src1->type != vec_dot_type) {
 char * wdata = params->wdata;
 const size_t row_size = ggml_row_size(vec_dot_type, ne10);
@@ -10050,6 +10258,9 @@ static void ggml_compute_forward_mul_mat_id(
 #define MMID_MATRIX_ROW(row_id, i1) matrix_rows[(row_id)*ne11 + (i1)]

 if (params->type == GGML_TASK_INIT) {
+if (ith != 0) {
+return;
+}
 char * wdata = params->wdata;
 if (src1->type != vec_dot_type) {
 const size_t row_size = ggml_row_size(vec_dot_type, ne10);
@@ -10097,10 +10308,6 @@ static void ggml_compute_forward_mul_mat_id(

 const struct ggml_tensor * src0_cur = dst->src[cur_a + 2];

-if (ith == 1 && g_imatrix_collect) {
-g_imatrix_collect(src0_cur, src1);
-}
-
 const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
 const size_t row_size = ggml_row_size(vec_dot_type, ne10);

@@ -10239,6 +10446,9 @@ static void ggml_compute_forward_out_prod_f32(
 return;
 }
 #endif
+if (ith != 0) {
+return;
+}
 ggml_vec_set_f32(ne0*ne1*ne2*ne3, dst->data, 0);
 return;
 }
@@ -10422,6 +10632,9 @@ static void ggml_compute_forward_out_prod_q_f32(
 // TODO: #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)

 if (params->type == GGML_TASK_INIT) {
+if (ith != 0) {
+return;
+}
 ggml_vec_set_f32(ne0*ne1*ne2*ne3, dst->data, 0);
 return;
 }
@@ -10508,6 +10721,7 @@ static void ggml_compute_forward_out_prod(
 case GGML_TYPE_Q6_K:
 case GGML_TYPE_IQ2_XXS:
 case GGML_TYPE_IQ2_XS:
+case GGML_TYPE_IQ3_XXS:
 {
 ggml_compute_forward_out_prod_q_f32(params, src0, src1, dst);
 } break;
@@ -10606,6 +10820,9 @@ static void ggml_compute_forward_set_f32(
 bool inplace = (bool) ((int32_t *) dst->op_params)[4];

 if (!inplace && (params->type == GGML_TASK_INIT)) {
+if (params->ith != 0) {
+return;
+}
 // memcpy needs to be synchronized across threads to avoid race conditions.
 // => do it in INIT phase
 memcpy(
@@ -10684,6 +10901,7 @@ static void ggml_compute_forward_set(
 case GGML_TYPE_Q6_K:
 case GGML_TYPE_IQ2_XXS:
 case GGML_TYPE_IQ2_XS:
+case GGML_TYPE_IQ3_XXS:
 default:
 {
 GGML_ASSERT(false);
@@ -10880,6 +11098,7 @@ static void ggml_compute_forward_get_rows(
 case GGML_TYPE_Q6_K:
 case GGML_TYPE_IQ2_XXS:
 case GGML_TYPE_IQ2_XS:
+case GGML_TYPE_IQ3_XXS:
 {
 ggml_compute_forward_get_rows_q(params, src0, src1, dst);
 } break;
@@ -10930,6 +11149,9 @@ static void ggml_compute_forward_get_rows_back_f32_f16(
 // ggml_compute_forward_dup_same_cont(params, opt0, dst);

 if (params->type == GGML_TASK_INIT) {
+if (params->ith != 0) {
+return;
+}
 memset(dst->data, 0, ggml_nbytes(dst));
 }

@@ -10964,6 +11186,9 @@ static void ggml_compute_forward_get_rows_back_f32(
 // ggml_compute_forward_dup_same_cont(params, opt0, dst);

 if (params->type == GGML_TASK_INIT) {
+if (params->ith != 0) {
+return;
+}
 memset(dst->data, 0, ggml_nbytes(dst));
 }

@@ -11101,6 +11326,9 @@ static void ggml_compute_forward_diag_mask_f32(
 GGML_ASSERT(n_past >= 0);

 if (!inplace && (params->type == GGML_TASK_INIT)) {
+if (ith != 0) {
+return;
+}
 // memcpy needs to be synchronized across threads to avoid race conditions.
 // => do it in INIT phase
 GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0));
@@ -11518,6 +11746,7 @@ static void ggml_compute_forward_alibi(
 case GGML_TYPE_Q6_K:
 case GGML_TYPE_IQ2_XXS:
 case GGML_TYPE_IQ2_XS:
+case GGML_TYPE_IQ3_XXS:
 case GGML_TYPE_Q8_K:
 case GGML_TYPE_I8:
 case GGML_TYPE_I16:
@@ -11594,6 +11823,7 @@ static void ggml_compute_forward_clamp(
 case GGML_TYPE_Q6_K:
 case GGML_TYPE_IQ2_XXS:
 case GGML_TYPE_IQ2_XS:
+case GGML_TYPE_IQ3_XXS:
 case GGML_TYPE_Q8_K:
 case GGML_TYPE_I8:
 case GGML_TYPE_I16:
@@ -12071,6 +12301,9 @@ static void ggml_compute_forward_conv_transpose_1d_f16_f32(
 GGML_ASSERT(nb10 == sizeof(float));

 if (params->type == GGML_TASK_INIT) {
+if (ith != 0) {
+return;
+}
 memset(params->wdata, 0, params->wsize);

 // permute kernel data (src0) from (K x Cout x Cin) to (Cin x K x Cout)
@@ -12165,6 +12398,9 @@ static void ggml_compute_forward_conv_transpose_1d_f32(
 GGML_ASSERT(nb10 == sizeof(float));

 if (params->type == GGML_TASK_INIT) {
+if (ith != 0) {
+return;
+}
 memset(params->wdata, 0, params->wsize);

 // prepare kernel data (src0) from (K x Cout x Cin) to (Cin x K x Cout)
@@ -12257,6 +12493,92 @@ static void ggml_compute_forward_conv_transpose_1d(
 }
 }

+// src0: kernel [OC, IC, KH, KW]
+// src1: image [N, IC, IH, IW]
+// dst: result [N, OH, OW, IC*KH*KW]
+static void ggml_compute_forward_im2col_f32(
+const struct ggml_compute_params * params,
+const struct ggml_tensor * src0,
+const struct ggml_tensor * src1,
+struct ggml_tensor * dst) {
+GGML_ASSERT(src0->type == GGML_TYPE_F16);
+GGML_ASSERT(src1->type == GGML_TYPE_F32);
+GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+int64_t t0 = ggml_perf_time_us();
+UNUSED(t0);
+
+GGML_TENSOR_BINARY_OP_LOCALS;
+
+const int32_t s0 = ((const int32_t *)(dst->op_params))[0];
+const int32_t s1 = ((const int32_t *)(dst->op_params))[1];
+const int32_t p0 = ((const int32_t *)(dst->op_params))[2];
+const int32_t p1 = ((const int32_t *)(dst->op_params))[3];
+const int32_t d0 = ((const int32_t *)(dst->op_params))[4];
+const int32_t d1 = ((const int32_t *)(dst->op_params))[5];
+const bool is_2D = ((const int32_t *)(dst->op_params))[6] == 1;
+
+const int ith = params->ith;
+const int nth = params->nth;
+
+const int64_t N = is_2D ? ne13 : ne12;
+const int64_t IC = is_2D ? ne12 : ne11;
+const int64_t IH = is_2D ? ne11 : 1;
+const int64_t IW = ne10;
+
+const int64_t KH = is_2D ? ne01 : 1;
+const int64_t KW = ne00;
+
+const int64_t OH = is_2D ? ne2 : 1;
+const int64_t OW = ne1;
+
+int ofs0 = is_2D ? nb13 : nb12;
+int ofs1 = is_2D ? nb12 : nb11;
+
+GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
+GGML_ASSERT(nb10 == sizeof(float));
+
+if (params->type == GGML_TASK_INIT) {
+return;
+}
+
+if (params->type == GGML_TASK_FINALIZE) {
+return;
+}
+
+// im2col: [N, IC, IH, IW] => [N, OH, OW, IC*KH*KW]
+{
+float * const wdata = (float *) dst->data;
+
+for (int64_t in = 0; in < N; in++) {
+for (int64_t ioh = 0; ioh < OH; ioh++) { // 1
+for (int64_t iow = 0; iow < OW; iow++) {
+for (int64_t iic = ith; iic < IC; iic += nth) {
+
+// micro kernel
+float * dst_data = wdata + (in*OH*OW + ioh*OW + iow)*(IC*KH*KW); // [IC, KH, KW]
+const float * const src_data = (float *)((char *) src1->data + in*ofs0 + iic*ofs1); // [IH, IW]
+
+for (int64_t ikh = 0; ikh < KH; ikh++) { // 1
+for (int64_t ikw = 0; ikw < KW; ikw++) {
+const int64_t iiw = iow*s0 + ikw*d0 - p0;
+const int64_t iih = ioh*s1 + ikh*d1 - p1;
+
+if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) {
+dst_data[iic*(KH*KW) + ikh*KW + ikw] = 0;
+} else {
+dst_data[iic*(KH*KW) + ikh*KW + ikw] = (src_data[iih*IW + iiw]);
+}
+}
+}
+}
+}
+}
+}
+}
+}
+
+
 // src0: kernel [OC, IC, KH, KW]
 // src1: image [N, IC, IH, IW]
 // dst: result [N, OH, OW, IC*KH*KW]
@@ -12347,14 +12669,14 @@ static void ggml_compute_forward_im2col(
 const struct ggml_tensor * src0,
 const struct ggml_tensor * src1,
 struct ggml_tensor * dst) {
-switch (
+switch (dst->type) {
 case GGML_TYPE_F16:
 {
 ggml_compute_forward_im2col_f16(params, src0, src1, dst);
 } break;
 case GGML_TYPE_F32:
 {
-
+ggml_compute_forward_im2col_f32(params, src0, src1, dst);
 } break;
 default:
 {
@@ -12363,6 +12685,7 @@ static void ggml_compute_forward_im2col(
 }
 }

+
 // ggml_compute_forward_conv_transpose_2d

 static void ggml_compute_forward_conv_transpose_2d(
@@ -12388,6 +12711,9 @@ static void ggml_compute_forward_conv_transpose_2d(
 GGML_ASSERT(nb10 == sizeof(float));

 if (params->type == GGML_TASK_INIT) {
+if (ith != 0) {
+return;
+}
 memset(params->wdata, 0, params->wsize);

 // permute kernel data (src0) from (Kw x Kh x Cout x Cin) to (Cin x Kw x Kh x Cout)
@@ -12541,8 +12867,8 @@ static void ggml_compute_forward_pool_2d(
 const struct ggml_compute_params * params,
 const struct ggml_tensor * src,
 struct ggml_tensor * dst) {
-
-
+GGML_ASSERT(src->type == GGML_TYPE_F32);
+GGML_ASSERT(params->ith == 0);

 if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
 return;
@@ -13931,6 +14257,14 @@ static void ggml_compute_forward_unary(
 {
 ggml_compute_forward_silu(params, src0, dst);
 } break;
+case GGML_UNARY_OP_HARDSWISH:
+{
+ggml_compute_forward_hardswish(params, src0, dst);
+} break;
+case GGML_UNARY_OP_HARDSIGMOID:
+{
+ggml_compute_forward_hardsigmoid(params, src0, dst);
+} break;
 default:
 {
 GGML_ASSERT(false);
@@ -13994,6 +14328,9 @@ static void ggml_compute_forward_add_rel_pos_f32(

 const bool inplace = (bool) ((int32_t *) dst->op_params)[0];
 if (!inplace && params->type == GGML_TASK_INIT) {
+if (params->ith != 0) {
+return;
+}
 memcpy((char *) dst->data, (char *) src0->data, ggml_nbytes(dst));
 return;
 }
@@ -14509,8 +14846,26 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
 }
 GGML_ASSERT(tensor->src[0] == NULL || tensor->src[0]->backend == GGML_BACKEND_CPU);
 GGML_ASSERT(tensor->src[1] == NULL || tensor->src[1]->backend == GGML_BACKEND_CPU);
+#elif defined(GGML_USE_VULKAN)
+const bool skip_cpu = ggml_vk_compute_forward(params, tensor);
+#ifdef GGML_VULKAN_CHECK_RESULTS
+if (skip_cpu) {
+ggml_vk_check_results_1(params, tensor);
+}
+#endif
+if (skip_cpu) {
+return;
+}
+GGML_ASSERT(tensor->src[0] == NULL || tensor->src[0]->backend == GGML_BACKEND_CPU);
+GGML_ASSERT(tensor->src[1] == NULL || tensor->src[1]->backend == GGML_BACKEND_CPU);
 #endif // GGML_USE_CUBLAS

+#ifdef GGML_USE_SYCL
+bool skip_cpu = ggml_sycl_compute_forward(params, tensor);
+if (skip_cpu) {
+return;
+}
+#endif // GGML_USE_SYCL
 switch (tensor->op) {
 case GGML_OP_DUP:
 {
@@ -14913,13 +15268,13 @@ struct ggml_hash_set ggml_hash_set_new(size_t size) {
 size = ggml_hash_size(size);
 struct ggml_hash_set result;
 result.size = size;
-result.keys =
+result.keys = GGML_MALLOC(sizeof(struct ggml_tensor *) * size);
 memset(result.keys, 0, sizeof(struct ggml_tensor *) * size);
 return result;
 }

 static void ggml_hash_set_free(struct ggml_hash_set hash_set) {
-
+GGML_FREE(hash_set.keys);
 }

 struct hash_map {
@@ -14928,17 +15283,17 @@ struct hash_map {
 };

 static struct hash_map * ggml_new_hash_map(size_t size) {
-struct hash_map * result =
+struct hash_map * result = GGML_MALLOC(sizeof(struct hash_map));
 result->set = ggml_hash_set_new(size);
-result->vals =
+result->vals = GGML_MALLOC(sizeof(struct ggml_tensor *) * result->set.size);
 memset(result->vals, 0, sizeof(struct ggml_tensor *) * result->set.size);
 return result;
 }

 static void ggml_hash_map_free(struct hash_map * map) {
 ggml_hash_set_free(map->set);
-
-
+GGML_FREE(map->vals);
+GGML_FREE(map);
 }

 // gradient checkpointing
@@ -16287,8 +16642,9 @@ struct ggml_compute_state_shared {
 const int n_threads;

 // synchronization primitives
-atomic_int n_active;
-atomic_int node_n;
+atomic_int n_active; // num active threads
+atomic_int node_n; // active graph node
+atomic_int node_task; // active graph node task phase

 bool (*abort_callback)(void * data); // abort ggml_graph_compute when true
 void * abort_callback_data;
@@ -16344,6 +16700,8 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
 case GGML_UNARY_OP_TANH:
 case GGML_UNARY_OP_ELU:
 case GGML_UNARY_OP_RELU:
+case GGML_UNARY_OP_HARDSWISH: // to opt for multiple threads
+case GGML_UNARY_OP_HARDSIGMOID: // to opt for multiple threads
 {
 n_tasks = 1;
 } break;
@@ -16420,7 +16778,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
 } break;
 case GGML_OP_SOFT_MAX:
 {
-n_tasks = MIN(
+n_tasks = MIN(n_threads, ggml_nrows(node->src[0]));
 } break;
 case GGML_OP_CONV_TRANSPOSE_1D:
 {
@@ -16534,6 +16892,34 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
 return n_tasks;
 }

+static void ggml_graph_compute_thread_sync_node(int * node_n, struct ggml_compute_state * state, const bool do_yield) {
+// wait for other threads to finish
+const int last_node_n = * node_n;
+
+while (true) {
+if (do_yield) {
+sched_yield();
+}
+
+* node_n = atomic_load(&state->shared->node_n);
+if (* node_n != last_node_n) break;
+}
+}
+
+static void ggml_graph_compute_thread_sync_task(int * task_phase, struct ggml_compute_state * state, const bool do_yield) {
+// wait for other threads to finish
+const int last_task_phase = * task_phase;
+
+while (true) {
+if (do_yield) {
+sched_yield();
+}
+
+* task_phase = atomic_load(&state->shared->node_task);
+if (* task_phase != last_task_phase) break;
+}
+}
+
 static thread_ret_t ggml_graph_compute_thread(void * data) {
 struct ggml_compute_state * state = (struct ggml_compute_state *) data;

@@ -16544,7 +16930,8 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {

 set_numa_thread_affinity(state->ith, n_threads);

-int node_n
+int node_n = -1;
+int task_phase = GGML_TASK_FINALIZE;

 while (true) {
 if (cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) {
@@ -16576,7 +16963,6 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
 // distribute new work or execute it direct if 1T
 while (++node_n < cgraph->n_nodes) {
 GGML_PRINT_DEBUG_5("%s: %d/%d\n", __func__, node_n, cgraph->n_nodes);
-
 struct ggml_tensor * node = cgraph->nodes[node_n];
 const int n_tasks = ggml_get_n_tasks(node, n_threads);

@@ -16585,13 +16971,13 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {

 params.nth = n_tasks;

-/* INIT */
-if (GGML_OP_HAS_INIT[node->op]) {
-params.type = GGML_TASK_INIT;
-ggml_compute_forward(&params, node);
-}
-
 if (n_tasks == 1) {
+/* INIT */
+if (GGML_OP_HAS_INIT[node->op]) {
+params.type = GGML_TASK_INIT;
+ggml_compute_forward(&params, node);
+}
+
 // TODO: maybe push node_n to the atomic but if other threads see n_tasks is 1,
 // they do something more efficient than spinning (?)
 params.type = GGML_TASK_COMPUTE;
@@ -16612,38 +16998,24 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
 }
 }

-
-atomic_store(&state->shared->
+task_phase = GGML_TASK_INIT;
+atomic_store(&state->shared->n_active, n_threads);
+atomic_store(&state->shared->node_n, node_n);
+atomic_store(&state->shared->node_task, task_phase);
 } else {
-
-
-
-const bool do_yield = last < 0 || cgraph->nodes[last]->op == GGML_OP_MUL_MAT;
-
-while (true) {
-// TODO: this sched_yield can have significant impact on the performance - either positive or negative
-// depending on the workload and the operating system.
-// since it is not clear what is the best approach, it should potentially become user-configurable
-// ref: https://github.com/ggerganov/ggml/issues/291
-// UPD: adding the do_yield flag seems to resolve the issue universally
-if (do_yield) {
-sched_yield();
-}
-
-node_n = atomic_load(&state->shared->node_n);
-if (node_n != last) break;
-};
+ggml_graph_compute_thread_sync_node(&node_n, state, false);
+ggml_graph_compute_thread_sync_task(&task_phase, state, false);
 }

 // check if we should stop
 if (node_n >= cgraph->n_nodes) break;

-/* COMPUTE */
+/* INIT & COMPUTE */
 struct ggml_tensor * node = cgraph->nodes[node_n];
 const int n_tasks = ggml_get_n_tasks(node, n_threads);

 struct ggml_compute_params params = {
-/*.type =*/
+/*.type =*/ GGML_TASK_INIT,
 /*.ith =*/ state->ith,
 /*.nth =*/ n_tasks,
 /*.wsize =*/ cplan->work_size,
@@ -16651,8 +17023,39 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
 };

 if (state->ith < n_tasks) {
+if (GGML_OP_HAS_INIT[node->op]) {
+ggml_compute_forward(&params, node);
+}
+}
+
+if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) {
+task_phase = GGML_TASK_COMPUTE;
+atomic_store(&state->shared->n_active, n_threads);
+atomic_store(&state->shared->node_task, task_phase);
+}
+else {
+// TODO: this sched_yield can have significant impact on the performance - either positive or negative
+// depending on the workload and the operating system.
+// since it is not clear what is the best approach, it should potentially become user-configurable
+// ref: https://github.com/ggerganov/ggml/issues/291
+// UPD: adding the do_yield flag seems to resolve the issue universally
+const bool do_yield = node_n < 0 || cgraph->nodes[node_n]->op == GGML_OP_MUL_MAT;
+ggml_graph_compute_thread_sync_task(&task_phase, state, do_yield);
+}
+
+if (state->ith < n_tasks) {
+params.type = GGML_TASK_COMPUTE;
 ggml_compute_forward(&params, node);
 }
+
+if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) {
+task_phase = GGML_TASK_FINALIZE;
+atomic_store(&state->shared->n_active, n_threads);
+atomic_store(&state->shared->node_task, task_phase);
+}
+else {
+ggml_graph_compute_thread_sync_task(&task_phase, state, false);
+}
 }

 return GGML_EXIT_SUCCESS;
@@ -16668,12 +17071,16 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
 struct ggml_cplan cplan;
 memset(&cplan, 0, sizeof(struct ggml_cplan));

+int max_tasks = 1;
+
 // thread scheduling for the different operations + work buffer size estimation
 for (int i = 0; i < cgraph->n_nodes; i++) {
 struct ggml_tensor * node = cgraph->nodes[i];

 const int n_tasks = ggml_get_n_tasks(node, n_threads);

+max_tasks = MAX(max_tasks, n_tasks);
+
 size_t cur = 0;

 switch (node->op) {
@@ -16709,8 +17116,11 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
 #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
 if (ggml_compute_forward_mul_mat_use_blas(node)) {
 if (node->src[0]->type != GGML_TYPE_F32) {
-// here we need memory
-
+// here we need memory for fully dequantized matrix from src0
+// take into account that src0 can be broadcasted into src1[2,3]
+cur = ggml_type_size(GGML_TYPE_F32)
+* node->src[0]->ne[0]*node->src[0]->ne[1]
+* node->src[1]->ne[2]*node->src[1]->ne[3];
 }
 } else
 #endif
@@ -16837,7 +17247,7 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
 work_size += CACHE_LINE_SIZE*(n_threads - 1);
 }

-cplan.n_threads = n_threads;
+cplan.n_threads = MIN(max_tasks, n_threads);
 cplan.work_size = work_size;
 cplan.work_data = NULL;

@@ -16854,6 +17264,17 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
 }
 }

+#ifdef GGML_USE_VULKAN
+for (int i = 0; i < cgraph->n_nodes; i++) {
+ggml_vk_preallocate_buffers_graph(cgraph->nodes[i]);
+}
+ggml_vk_preallocate_buffers();
+
+for (int i = 0; i < cgraph->n_nodes; i++) {
+ggml_vk_build_graph(cgraph->nodes[i], i == cgraph->n_nodes - 1);
+}
+#endif
+
 const int n_threads = cplan->n_threads;

 struct ggml_compute_state_shared state_shared = {
@@ -16864,6 +17285,7 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
 /*.n_threads =*/ n_threads,
 /*.n_active =*/ n_threads,
 /*.node_n =*/ -1,
+/*.node_task =*/ GGML_TASK_FINALIZE,
 /*.abort_callback =*/ NULL,
 /*.abort_callback_data =*/ NULL,
 };
@@ -16904,6 +17326,10 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
 }
 }

+#ifdef GGML_USE_VULKAN
+ggml_vk_graph_cleanup();
+#endif
+
 // performance stats (graph)
 {
 int64_t perf_cycles_cur = ggml_perf_cycles() - perf_start_cycles;
@@ -18538,6 +18964,29 @@ enum ggml_opt_result ggml_opt_resume_g(

 ////////////////////////////////////////////////////////////////////////////////

+void ggml_quantize_init(enum ggml_type type) {
+ggml_critical_section_start();
+
+switch (type) {
+case GGML_TYPE_IQ2_XXS: iq2xs_init_impl(256); break;
+case GGML_TYPE_IQ2_XS: iq2xs_init_impl(512); break;
+case GGML_TYPE_IQ3_XXS: iq3xs_init_impl(256); break;
+default: // nothing
+break;
+}
+
+ggml_critical_section_end();
+}
+
+void ggml_quantize_free(void) {
+ggml_critical_section_start();
+
+iq2xs_free_impl(256);
+iq2xs_free_impl(512);
+
+ggml_critical_section_end();
+}
+
 size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist) {
 assert(k % QK4_0 == 0);
 const int nb = k / QK4_0;
@@ -18665,9 +19114,15 @@ size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t *
 return (n/QK8_0*sizeof(block_q8_0));
 }

+bool ggml_quantize_requires_imatrix(enum ggml_type type) {
+return
+type == GGML_TYPE_IQ2_XXS ||
+type == GGML_TYPE_IQ2_XS;
+}
+
 size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start,
 int nrows, int n_per_row, int64_t * hist, const float * imatrix) {
-(
+ggml_quantize_init(type); // this is noop if already initialized
 size_t result = 0;
 int n = nrows * n_per_row;
 switch (type) {
@@ -18778,15 +19233,24 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i
 result = quantize_iq2_xs(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
 GGML_ASSERT(result == row_size * nrows);
 } break;
+case GGML_TYPE_IQ3_XXS:
+{
+GGML_ASSERT(start % QK_K == 0);
+GGML_ASSERT(start % n_per_row == 0);
+size_t start_row = start / n_per_row;
+size_t row_size = ggml_row_size(type, n_per_row);
+result = quantize_iq3_xxs(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
+GGML_ASSERT(result == row_size * nrows);
+} break;
 case GGML_TYPE_F16:
 {
-
+size_t elemsize = sizeof(ggml_fp16_t);
 ggml_fp32_to_fp16_row(src + start, (ggml_fp16_t *)dst + start, n);
 result = n * elemsize;
 } break;
 case GGML_TYPE_F32:
 {
-
+size_t elemsize = sizeof(float);
 result = n * elemsize;
 memcpy((uint8_t *)dst + start * elemsize, src + start, result);
 } break;
@@ -18904,6 +19368,25 @@ struct gguf_context {
|
|
18904
19368
|
void * data;
|
18905
19369
|
};
|
18906
19370
|
|
19371
|
+
static size_t gguf_type_size(enum gguf_type type) {
|
19372
|
+
GGML_ASSERT(0 <= type && type < GGUF_TYPE_COUNT);
|
19373
|
+
return GGUF_TYPE_SIZE[type];
|
19374
|
+
}
|
19375
|
+
|
19376
|
+
static void gguf_tensor_info_sanitize(struct gguf_tensor_info * info) {
|
19377
|
+
GGML_ASSERT(info->n_dims <= GGML_MAX_DIMS);
|
19378
|
+
GGML_ASSERT(0 <= info->type && info->type < GGML_TYPE_COUNT);
|
19379
|
+
|
19380
|
+
for (uint32_t i = 0; i < info->n_dims; ++i) {
|
19381
|
+
GGML_ASSERT(info->ne[i] > 0);
|
19382
|
+
}
|
19383
|
+
|
19384
|
+
// prevent overflow for total number of elements
|
19385
|
+
GGML_ASSERT(INT64_MAX/info->ne[1] > info->ne[0]);
|
19386
|
+
GGML_ASSERT(INT64_MAX/info->ne[2] > info->ne[0]*info->ne[1]);
|
19387
|
+
GGML_ASSERT(INT64_MAX/info->ne[3] > info->ne[0]*info->ne[1]*info->ne[2]);
|
19388
|
+
}
|
19389
|
+
|
18907
19390
|
static bool gguf_fread_el(FILE * file, void * dst, size_t size, size_t * offset) {
|
18908
19391
|
const size_t n = fread(dst, 1, size, file);
|
18909
19392
|
*offset += n;
|
@@ -18916,8 +19399,17 @@ static bool gguf_fread_str(FILE * file, struct gguf_str * p, size_t * offset) {
|
|
18916
19399
|
|
18917
19400
|
bool ok = true;
|
18918
19401
|
|
18919
|
-
ok = ok && gguf_fread_el(file, &p->n,
|
18920
|
-
|
19402
|
+
ok = ok && gguf_fread_el(file, &p->n, sizeof(p->n), offset);
|
19403
|
+
|
19404
|
+
// early exit if string length is invalid, prevents from integer overflow
|
19405
|
+
if (p->n == SIZE_MAX) {
|
19406
|
+
fprintf(stderr, "%s: invalid string length (%" PRIu64 ")\n", __func__, p->n);
|
19407
|
+
return false;
|
19408
|
+
}
|
19409
|
+
|
19410
|
+
p->data = GGML_CALLOC(p->n + 1, 1);
|
19411
|
+
|
19412
|
+
ok = ok && gguf_fread_el(file, p->data, p->n, offset);
|
18921
19413
|
|
18922
19414
|
return ok;
|
18923
19415
|
}
|
@@ -18989,6 +19481,12 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
|
|
18989
19481
|
return NULL;
|
18990
19482
|
}
|
18991
19483
|
|
19484
|
+
// sanity-checks to prevent from integer/buffer overflows
|
19485
|
+
|
19486
|
+
ok = ok && (ctx->header.n_tensors < (SIZE_MAX/2)/sizeof(struct gguf_tensor_info));
|
19487
|
+
ok = ok && (ctx->header.n_tensors < (SIZE_MAX/2)/ggml_tensor_overhead());
|
19488
|
+
ok = ok && (ctx->header.n_kv < (SIZE_MAX/2)/sizeof(struct gguf_kv));
|
19489
|
+
|
18992
19490
|
if (!ok) {
|
18993
19491
|
fprintf(stderr, "%s: failed to read header\n", __func__);
|
18994
19492
|
fclose(file);
|
@@ -18999,7 +19497,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
|
|
18999
19497
|
|
19000
19498
|
// read the kv pairs
|
19001
19499
|
{
|
19002
|
-
ctx->kv =
|
19500
|
+
ctx->kv = GGML_MALLOC(ctx->header.n_kv * sizeof(struct gguf_kv));
|
19003
19501
|
|
19004
19502
|
for (uint64_t i = 0; i < ctx->header.n_kv; ++i) {
|
19005
19503
|
struct gguf_kv * kv = &ctx->kv[i];
|
@@ -19027,7 +19525,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
|
|
19027
19525
|
case GGUF_TYPE_ARRAY:
|
19028
19526
|
{
|
19029
19527
|
ok = ok && gguf_fread_el(file, &kv->value.arr.type, sizeof(kv->value.arr.type), &offset);
|
19030
|
-
ok = ok && gguf_fread_el(file, &kv->value.arr.n, sizeof(kv->value.arr.n),
|
19528
|
+
ok = ok && gguf_fread_el(file, &kv->value.arr.n, sizeof(kv->value.arr.n), &offset);
|
19031
19529
|
|
19032
19530
|
switch (kv->value.arr.type) {
|
19033
19531
|
case GGUF_TYPE_UINT8:
|
@@ -19042,21 +19540,39 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
                             case GGUF_TYPE_FLOAT64:
                             case GGUF_TYPE_BOOL:
                                 {
-
-
+                                    // prevent from integer overflow in the malloc below
+                                    if (kv->value.arr.n >= SIZE_MAX/gguf_type_size(kv->value.arr.type)) {
+                                        fprintf(stderr, "%s: array size is too large (%" PRIu64 ")\n", __func__, kv->value.arr.n);
+                                        fclose(file);
+                                        gguf_free(ctx);
+                                        return NULL;
+                                    }
+
+                                    kv->value.arr.data = GGML_MALLOC(kv->value.arr.n * gguf_type_size(kv->value.arr.type));
+
+                                    ok = ok && gguf_fread_el(file, kv->value.arr.data, kv->value.arr.n * gguf_type_size(kv->value.arr.type), &offset);
                                 } break;
                             case GGUF_TYPE_STRING:
                                 {
-
+                                    // prevent from integer overflow in the malloc below
+                                    if (kv->value.arr.n >= SIZE_MAX/sizeof(struct gguf_str)) {
+                                        fprintf(stderr, "%s: array size is too large (%" PRIu64 ")\n", __func__, kv->value.arr.n);
+                                        fclose(file);
+                                        gguf_free(ctx);
+                                        return NULL;
+                                    }
+
+                                    kv->value.arr.data = GGML_MALLOC(kv->value.arr.n * sizeof(struct gguf_str));
+
                                     for (uint64_t j = 0; j < kv->value.arr.n; ++j) {
                                         ok = ok && gguf_fread_str(file, &((struct gguf_str *) kv->value.arr.data)[j], &offset);
                                     }
                                 } break;
                             case GGUF_TYPE_ARRAY:
-
+                            default: GGML_ASSERT(false && "invalid type"); break;
                         }
                     } break;
-
+                default: GGML_ASSERT(false && "invalid type");
             }
 
             if (!ok) {
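Both array branches above apply the same rule: refuse any count for which `n * element size` would overflow size_t before calling GGML_MALLOC. A hedged sketch of that guard as a standalone helper (not ggml API):

```c
#include <stdint.h>
#include <stdlib.h>

// Illustrative sketch (not ggml API): allocate n elements of elem_size bytes,
// refusing any count for which n * elem_size would overflow size_t. This is
// the same "n >= SIZE_MAX / elem_size" rejection used in both branches above.
static void * checked_array_alloc(uint64_t n, size_t elem_size) {
    if (elem_size == 0 || n >= SIZE_MAX / elem_size) {
        return NULL; // caller should treat the file as malformed
    }
    return malloc((size_t) n * elem_size);
}
```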
@@ -19074,7 +19590,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
 
     // read the tensor infos
     {
-        ctx->infos =
+        ctx->infos = GGML_MALLOC(ctx->header.n_tensors * sizeof(struct gguf_tensor_info));
 
         for (uint64_t i = 0; i < ctx->header.n_tensors; ++i) {
             struct gguf_tensor_info * info = &ctx->infos[i];
@@ -19085,12 +19601,18 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
 
             ok = ok && gguf_fread_str(file, &info->name, &offset);
             ok = ok && gguf_fread_el (file, &info->n_dims, sizeof(info->n_dims), &offset);
+
+            ok = ok && (info->n_dims <= GGML_MAX_DIMS);
+
             for (uint32_t j = 0; j < info->n_dims; ++j) {
                 ok = ok && gguf_fread_el(file, &info->ne[j], sizeof(info->ne[j]), &offset);
             }
+
             ok = ok && gguf_fread_el (file, &info->type, sizeof(info->type), &offset);
             ok = ok && gguf_fread_el (file, &info->offset, sizeof(info->offset), &offset);
 
+            gguf_tensor_info_sanitize(info);
+
             if (!ok) {
                 fprintf(stderr, "%s: failed to read tensor info\n", __func__);
                 fclose(file);
@@ -19244,12 +19766,12 @@ void gguf_free(struct gguf_context * ctx) {
             struct gguf_kv * kv = &ctx->kv[i];
 
             if (kv->key.data) {
-
+                GGML_FREE(kv->key.data);
             }
 
             if (kv->type == GGUF_TYPE_STRING) {
                 if (kv->value.str.data) {
-
+                    GGML_FREE(kv->value.str.data);
                 }
             }
 
@@ -19259,16 +19781,16 @@ void gguf_free(struct gguf_context * ctx) {
                        for (uint64_t j = 0; j < kv->value.arr.n; ++j) {
                            struct gguf_str * str = &((struct gguf_str *) kv->value.arr.data)[j];
                            if (str->data) {
-
+                                GGML_FREE(str->data);
                            }
                        }
                    }
-
+                    GGML_FREE(kv->value.arr.data);
                }
            }
        }
 
-
+        GGML_FREE(ctx->kv);
    }
 
    if (ctx->infos) {
@@ -19276,11 +19798,11 @@ void gguf_free(struct gguf_context * ctx) {
            struct gguf_tensor_info * info = &ctx->infos[i];
 
            if (info->name.data) {
-
+                GGML_FREE(info->name.data);
            }
        }
 
-
+        GGML_FREE(ctx->infos);
    }
 
    GGML_ALIGNED_FREE(ctx);
@@ -19581,8 +20103,8 @@ void gguf_set_arr_data(struct gguf_context * ctx, const char * key, enum gguf_ty
    ctx->kv[idx].type = GGUF_TYPE_ARRAY;
    ctx->kv[idx].value.arr.type = type;
    ctx->kv[idx].value.arr.n = n;
-    ctx->kv[idx].value.arr.data =
-    memcpy(ctx->kv[idx].value.arr.data, data, n*
+    ctx->kv[idx].value.arr.data = GGML_MALLOC(n*gguf_type_size(type));
+    memcpy(ctx->kv[idx].value.arr.data, data, n*gguf_type_size(type));
 }
 
 void gguf_set_arr_str(struct gguf_context * ctx, const char * key, const char ** data, int n) {
@@ -19591,7 +20113,7 @@ void gguf_set_arr_str(struct gguf_context * ctx, const char * key, const char **
    ctx->kv[idx].type = GGUF_TYPE_ARRAY;
    ctx->kv[idx].value.arr.type = GGUF_TYPE_STRING;
    ctx->kv[idx].value.arr.n = n;
-    ctx->kv[idx].value.arr.data =
+    ctx->kv[idx].value.arr.data = GGML_MALLOC(n*sizeof(struct gguf_str));
    for (int i = 0; i < n; i++) {
        struct gguf_str * str = &((struct gguf_str *)ctx->kv[idx].value.arr.data)[i];
        str->n = strlen(data[i]);
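gguf_set_arr_data and gguf_set_arr_str, which now allocate through GGML_MALLOC, are part of the public gguf API declared in ggml.h. A brief usage sketch; the key names are invented for illustration and assume the vendored ggml.h is on the include path:

```c
#include <stdint.h>

#include "ggml.h"

// Usage sketch for the setters patched above. The key names are made up;
// the API calls are the public gguf functions from ggml.h.
int main(void) {
    struct gguf_context * ctx = gguf_init_empty();

    const int32_t layer_sizes[3] = { 32, 64, 128 };
    gguf_set_arr_data(ctx, "example.layer_sizes", GGUF_TYPE_INT32, layer_sizes, 3);

    const char * languages[2] = { "en", "de" };
    gguf_set_arr_str(ctx, "example.languages", languages, 2);

    gguf_free(ctx);
    return 0;
}
```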
@@ -19618,19 +20140,19 @@ void gguf_set_kv(struct gguf_context * ctx, struct gguf_context * src) {
            case GGUF_TYPE_ARRAY:
                {
                    if (src->kv[i].value.arr.type == GGUF_TYPE_STRING) {
-                        const char ** data =
+                        const char ** data = GGML_MALLOC(src->kv[i].value.arr.n*sizeof(char *));
                        for (uint32_t j = 0; j < src->kv[i].value.arr.n; j++) {
                            data[j] = ((struct gguf_str *)src->kv[i].value.arr.data)[j].data;
                        }
                        gguf_set_arr_str(ctx, src->kv[i].key.data, data, src->kv[i].value.arr.n);
-
+                        GGML_FREE((void *)data);
                    } else if (src->kv[i].value.arr.type == GGUF_TYPE_ARRAY) {
                        GGML_ASSERT(false && "nested arrays not supported");
                    } else {
                        gguf_set_arr_data(ctx, src->kv[i].key.data, src->kv[i].value.arr.type, src->kv[i].value.arr.data, src->kv[i].value.arr.n);
                    }
                } break;
-
+            default: GGML_ASSERT(false && "invalid type"); break;
        }
    }
 }
@@ -19706,7 +20228,7 @@ struct gguf_buf {
 
 static struct gguf_buf gguf_buf_init(size_t size) {
    struct gguf_buf buf = {
-        /*buf.data =*/ size == 0 ? NULL :
+        /*buf.data =*/ size == 0 ? NULL : GGML_MALLOC(size),
        /*buf.size =*/ size,
        /*buf.offset =*/ 0,
    };
@@ -19716,7 +20238,7 @@ static struct gguf_buf gguf_buf_init(size_t size) {
 
 static void gguf_buf_free(struct gguf_buf buf) {
    if (buf.data) {
-
+        GGML_FREE(buf.data);
    }
 }
 
@@ -19797,7 +20319,7 @@ static void gguf_write_to_buf(const struct gguf_context * ctx, struct gguf_buf *
                        case GGUF_TYPE_FLOAT64:
                        case GGUF_TYPE_BOOL:
                            {
-                                gguf_bwrite_el(buf, kv->value.arr.data, kv->value.arr.n *
+                                gguf_bwrite_el(buf, kv->value.arr.data, kv->value.arr.n * gguf_type_size(kv->value.arr.type));
                            } break;
                        case GGUF_TYPE_STRING:
                            {
@@ -19806,10 +20328,10 @@ static void gguf_write_to_buf(const struct gguf_context * ctx, struct gguf_buf *
                                }
                            } break;
                        case GGUF_TYPE_ARRAY:
-
+                        default: GGML_ASSERT(false && "invalid type"); break;
                    }
                } break;
-
+            default: GGML_ASSERT(false && "invalid type");
        }
    }
 
@@ -20010,7 +20532,7 @@ int ggml_cpu_has_wasm_simd(void) {
 }
 
 int ggml_cpu_has_blas(void) {
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_VULKAN) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_SYCL)
    return 1;
 #else
    return 0;
@@ -20033,8 +20555,33 @@ int ggml_cpu_has_clblast(void) {
 #endif
 }
 
+int ggml_cpu_has_vulkan(void) {
+#if defined(GGML_USE_VULKAN)
+    return 1;
+#else
+    return 0;
+#endif
+}
+
+int ggml_cpu_has_kompute(void) {
+#if defined(GGML_USE_KOMPUTE)
+    return 1;
+#else
+    return 0;
+#endif
+}
+
+int ggml_cpu_has_sycl(void) {
+#if defined(GGML_USE_SYCL)
+    return 1;
+#else
+    return 0;
+#endif
+}
+
 int ggml_cpu_has_gpublas(void) {
-    return ggml_cpu_has_cublas() || ggml_cpu_has_clblast()
+    return ggml_cpu_has_cublas() || ggml_cpu_has_clblast() || ggml_cpu_has_vulkan() || ggml_cpu_has_kompute() ||
+           ggml_cpu_has_sycl();
 }
 
 int ggml_cpu_has_sse3(void) {
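The new ggml_cpu_has_vulkan, ggml_cpu_has_kompute, and ggml_cpu_has_sycl probes mirror the existing ggml_cpu_has_* functions and are now folded into ggml_cpu_has_gpublas. A short usage sketch, assuming the vendored ggml.h is on the include path:

```c
#include <stdio.h>

#include "ggml.h"

// Usage sketch for the feature probes added above. They report what the
// library was compiled with, not which devices are actually present.
int main(void) {
    printf("BLAS     : %d\n", ggml_cpu_has_blas());
    printf("CLBlast  : %d\n", ggml_cpu_has_clblast());
    printf("Vulkan   : %d\n", ggml_cpu_has_vulkan());
    printf("Kompute  : %d\n", ggml_cpu_has_kompute());
    printf("SYCL     : %d\n", ggml_cpu_has_sycl());
    printf("GPU BLAS : %d\n", ggml_cpu_has_gpublas());
    return 0;
}
```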