llama_cpp 0.12.3 → 0.12.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +13 -0
- data/ext/llama_cpp/extconf.rb +1 -0
- data/ext/llama_cpp/llama_cpp.cpp +22 -6
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +4 -2
- data/vendor/tmp/llama.cpp/Makefile +160 -56
- data/vendor/tmp/llama.cpp/ggml-alloc.c +85 -25
- data/vendor/tmp/llama.cpp/ggml-backend-impl.h +6 -0
- data/vendor/tmp/llama.cpp/ggml-backend.c +115 -3
- data/vendor/tmp/llama.cpp/ggml-backend.h +3 -0
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +688 -270
- data/vendor/tmp/llama.cpp/ggml-impl.h +2 -0
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +1990 -0
- data/vendor/tmp/llama.cpp/ggml-kompute.h +46 -0
- data/vendor/tmp/llama.cpp/ggml-metal.h +3 -0
- data/vendor/tmp/llama.cpp/ggml-metal.m +121 -86
- data/vendor/tmp/llama.cpp/ggml-metal.metal +303 -4
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +95 -3
- data/vendor/tmp/llama.cpp/ggml-opencl.h +1 -0
- data/vendor/tmp/llama.cpp/ggml-quants.c +745 -109
- data/vendor/tmp/llama.cpp/ggml-quants.h +81 -56
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +15296 -0
- data/vendor/tmp/llama.cpp/ggml-sycl.h +29 -0
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +51714 -0
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +5726 -0
- data/vendor/tmp/llama.cpp/ggml-vulkan.h +39 -0
- data/vendor/tmp/llama.cpp/ggml.c +356 -60
- data/vendor/tmp/llama.cpp/ggml.h +7 -1
- data/vendor/tmp/llama.cpp/llama.cpp +876 -118
- data/vendor/tmp/llama.cpp/llama.h +12 -16
- metadata +9 -2
data/vendor/tmp/llama.cpp/ggml.c
CHANGED
@@ -218,6 +218,7 @@ inline static void * ggml_aligned_malloc(size_t size) {
|
|
218
218
|
break;
|
219
219
|
}
|
220
220
|
GGML_PRINT("%s: %s (attempted to allocate %6.2f MB)\n", __func__, error_desc, size/(1024.0*1024.0));
|
221
|
+
GGML_ASSERT(false);
|
221
222
|
return NULL;
|
222
223
|
}
|
223
224
|
return aligned_memory;
|
@@ -230,6 +231,38 @@ inline static void * ggml_aligned_malloc(size_t size) {
|
|
230
231
|
#endif
|
231
232
|
#endif
|
232
233
|
|
234
|
+
// malloc wrapper used through the GGML_MALLOC macro.
//
// - A zero-byte request is never forwarded to malloc(): a warning is printed
//   and NULL is returned.
// - If malloc() itself fails, the requested size is reported in MB and
//   GGML_ASSERT(false) is triggered.
//
// Returns the pointer obtained from malloc(), or NULL for a zero-byte request.
inline static void * ggml_malloc(size_t size) {
    if (size == 0) {
        GGML_PRINT("WARNING: Behavior may be unexpected when allocating 0 bytes for ggml_malloc!\n");
        return NULL;
    }

    void * ptr = malloc(size);
    if (ptr != NULL) {
        return ptr;
    }

    // allocation failed: report it and trip the assertion
    GGML_PRINT("%s: failed to allocate %6.2f MB\n", __func__, size/(1024.0*1024.0));
    GGML_ASSERT(false);
    return NULL;
}
|
246
|
+
|
247
|
+
// calloc
|
248
|
+
// calloc wrapper used through the GGML_CALLOC macro.
//
// num  - number of elements
// size - size of one element in bytes
//
// - If either argument is 0, calloc() is not called: a warning is printed and
//   NULL is returned.
// - If calloc() fails, the total size of the request is reported in MB and
//   GGML_ASSERT(false) is triggered.
//
// Returns the zero-initialized pointer obtained from calloc(), or NULL for an
// empty request.
inline static void * ggml_calloc(size_t num, size_t size) {
    if (num == 0 || size == 0) {
        GGML_PRINT("WARNING: Behavior may be unexpected when allocating 0 bytes for ggml_calloc!\n");
        return NULL;
    }
    void * result = calloc(num, size);
    if (result == NULL) {
        // report the whole request (num elements of size bytes each), not just the
        // element size; the product is formed in double so it cannot wrap around
        GGML_PRINT("%s: failed to allocate %6.2f MB\n", __func__, ((double) num*(double) size)/(1024.0*1024.0));
        GGML_ASSERT(false);
    }
    return result;
}
|
260
|
+
|
261
|
+
#define GGML_MALLOC(size) ggml_malloc(size)
|
262
|
+
#define GGML_CALLOC(num, size) ggml_calloc(num, size)
|
263
|
+
|
264
|
+
#define GGML_FREE(ptr) free(ptr)
|
265
|
+
|
233
266
|
#define UNUSED GGML_UNUSED
|
234
267
|
#define SWAP(x, y, T) do { T SWAP = x; x = y; y = SWAP; } while (0)
|
235
268
|
|
@@ -248,6 +281,10 @@ inline static void * ggml_aligned_malloc(size_t size) {
|
|
248
281
|
#include "ggml-cuda.h"
|
249
282
|
#elif defined(GGML_USE_CLBLAST)
|
250
283
|
#include "ggml-opencl.h"
|
284
|
+
#elif defined(GGML_USE_VULKAN)
|
285
|
+
#include "ggml-vulkan.h"
|
286
|
+
#elif defined(GGML_USE_SYCL)
|
287
|
+
#include "ggml-sycl.h"
|
251
288
|
#endif
|
252
289
|
|
253
290
|
// floating point type used to accumulate sums
|
@@ -595,6 +632,17 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
|
595
632
|
.vec_dot = ggml_vec_dot_iq2_xs_q8_K,
|
596
633
|
.vec_dot_type = GGML_TYPE_Q8_K,
|
597
634
|
},
|
635
|
+
[GGML_TYPE_IQ3_XXS] = {
|
636
|
+
.type_name = "iq3_xxs",
|
637
|
+
.blck_size = QK_K,
|
638
|
+
.type_size = sizeof(block_iq3_xxs),
|
639
|
+
.is_quantized = true,
|
640
|
+
.to_float = (ggml_to_float_t) dequantize_row_iq3_xxs,
|
641
|
+
.from_float = quantize_row_iq3_xxs,
|
642
|
+
.from_float_reference = (ggml_from_float_t)quantize_row_iq3_xxs_reference,
|
643
|
+
.vec_dot = ggml_vec_dot_iq3_xxs_q8_K,
|
644
|
+
.vec_dot_type = GGML_TYPE_Q8_K,
|
645
|
+
},
|
598
646
|
[GGML_TYPE_Q8_K] = {
|
599
647
|
.type_name = "q8_K",
|
600
648
|
.blck_size = QK_K,
|
@@ -2140,6 +2188,7 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) {
|
|
2140
2188
|
case GGML_FTYPE_MOSTLY_Q6_K: wtype = GGML_TYPE_Q6_K; break;
|
2141
2189
|
case GGML_FTYPE_MOSTLY_IQ2_XXS: wtype = GGML_TYPE_IQ2_XXS; break;
|
2142
2190
|
case GGML_FTYPE_MOSTLY_IQ2_XS: wtype = GGML_TYPE_IQ2_XS; break;
|
2191
|
+
case GGML_FTYPE_MOSTLY_IQ3_XXS: wtype = GGML_TYPE_IQ3_XXS; break;
|
2143
2192
|
case GGML_FTYPE_UNKNOWN: wtype = GGML_TYPE_COUNT; break;
|
2144
2193
|
case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16: wtype = GGML_TYPE_COUNT; break;
|
2145
2194
|
}
|
@@ -2293,6 +2342,10 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
|
|
2293
2342
|
ggml_init_cublas();
|
2294
2343
|
#elif defined(GGML_USE_CLBLAST)
|
2295
2344
|
ggml_cl_init();
|
2345
|
+
#elif defined(GGML_USE_VULKAN)
|
2346
|
+
ggml_vk_init_cpu_assist();
|
2347
|
+
#elif defined(GGML_USE_SYCL)
|
2348
|
+
ggml_init_sycl();
|
2296
2349
|
#endif
|
2297
2350
|
|
2298
2351
|
ggml_setup_op_has_task_pass();
|
@@ -2417,7 +2470,8 @@ size_t ggml_get_max_tensor_size(const struct ggml_context * ctx) {
|
|
2417
2470
|
size_t max_size = 0;
|
2418
2471
|
|
2419
2472
|
for (struct ggml_tensor * tensor = ggml_get_first_tensor(ctx); tensor != NULL; tensor = ggml_get_next_tensor(ctx, tensor)) {
|
2420
|
-
|
2473
|
+
size_t bytes = ggml_nbytes(tensor);
|
2474
|
+
max_size = MAX(max_size, bytes);
|
2421
2475
|
}
|
2422
2476
|
|
2423
2477
|
return max_size;
|
@@ -5296,7 +5350,7 @@ GGML_API struct ggml_tensor * ggml_conv_1d(
|
|
5296
5350
|
int s0,
|
5297
5351
|
int p0,
|
5298
5352
|
int d0) {
|
5299
|
-
struct ggml_tensor * im2col = ggml_im2col(ctx, a, b, s0, 0, p0, 0, d0, 0, false); // [N, OL, IC * K]
|
5353
|
+
struct ggml_tensor * im2col = ggml_im2col(ctx, a, b, s0, 0, p0, 0, d0, 0, false, GGML_TYPE_F16); // [N, OL, IC * K]
|
5300
5354
|
|
5301
5355
|
struct ggml_tensor * result =
|
5302
5356
|
ggml_mul_mat(ctx,
|
@@ -5374,16 +5428,15 @@ struct ggml_tensor * ggml_conv_depthwise_2d(
|
|
5374
5428
|
int p1,
|
5375
5429
|
int d0,
|
5376
5430
|
int d1) {
|
5431
|
+
|
5377
5432
|
struct ggml_tensor * new_a = ggml_reshape_4d(ctx, a, a->ne[0], a->ne[1], 1, a->ne[2] * a->ne[3]);
|
5378
5433
|
struct ggml_tensor * im2col = ggml_im2col(ctx, new_a,
|
5379
5434
|
ggml_reshape_4d(ctx, b, b->ne[0], b->ne[1], 1, b->ne[2] * b->ne[3]),
|
5380
|
-
s0, s1, p0, p1, d0, d1, true); // [N * IC, OH, OW, KH * KW]
|
5381
|
-
|
5382
|
-
struct ggml_tensor * result =
|
5383
|
-
ggml_mul_mat(ctx,
|
5384
|
-
ggml_reshape_4d(ctx, new_a, (new_a->ne[0] * new_a->ne[1]), new_a->ne[2], new_a->ne[3], 1), // [OC,1, KH, KW] => [1, OC, 1, KH * KW]
|
5385
|
-
ggml_reshape_4d(ctx, im2col, im2col->ne[0], im2col->ne[2] * im2col->ne[1], b->ne[2], b->ne[3])); // [N * IC, OH, OW, KH * KW] => [N, IC, OH * OW, KH * KW]
|
5435
|
+
s0, s1, p0, p1, d0, d1, true, GGML_TYPE_F16); // [N * IC, OH, OW, KH * KW]
|
5436
|
+
struct ggml_tensor * new_b = ggml_reshape_4d(ctx, im2col, im2col->ne[0], im2col->ne[2] * im2col->ne[1], b->ne[2], b->ne[3]); // [N * IC, OH, OW, KH * KW] => [N, IC, OH * OW, KH * KW]
|
5386
5437
|
|
5438
|
+
new_a = ggml_reshape_4d(ctx, new_a, (new_a->ne[0] * new_a->ne[1]), new_a->ne[2], new_a->ne[3], 1); // [OC,1, KH, KW] => [1, OC, 1, KH * KW]
|
5439
|
+
struct ggml_tensor * result = ggml_mul_mat(ctx, new_a, new_b);
|
5387
5440
|
result = ggml_reshape_4d(ctx, result, im2col->ne[1], im2col->ne[2], b->ne[2], b->ne[3]); // [N, OC, OH, OW]
|
5388
5441
|
|
5389
5442
|
return result;
|
@@ -5404,7 +5457,8 @@ struct ggml_tensor * ggml_im2col(
|
|
5404
5457
|
int p1,
|
5405
5458
|
int d0,
|
5406
5459
|
int d1,
|
5407
|
-
bool is_2D
|
5460
|
+
bool is_2D,
|
5461
|
+
enum ggml_type dst_type) {
|
5408
5462
|
|
5409
5463
|
if(is_2D) {
|
5410
5464
|
GGML_ASSERT(a->ne[2] == b->ne[2]);
|
@@ -5428,7 +5482,7 @@ struct ggml_tensor * ggml_im2col(
|
|
5428
5482
|
is_2D ? b->ne[3] : 1,
|
5429
5483
|
};
|
5430
5484
|
|
5431
|
-
struct ggml_tensor * result = ggml_new_tensor(ctx,
|
5485
|
+
struct ggml_tensor * result = ggml_new_tensor(ctx, dst_type, 4, ne);
|
5432
5486
|
int32_t params[] = { s0, s1, p0, p1, d0, d1, (is_2D ? 1 : 0) };
|
5433
5487
|
ggml_set_op_params(result, params, sizeof(params));
|
5434
5488
|
|
@@ -5453,7 +5507,7 @@ struct ggml_tensor * ggml_conv_2d(
|
|
5453
5507
|
int p1,
|
5454
5508
|
int d0,
|
5455
5509
|
int d1) {
|
5456
|
-
struct ggml_tensor * im2col = ggml_im2col(ctx, a, b, s0, s1, p0, p1, d0, d1, true); // [N, OH, OW, IC * KH * KW]
|
5510
|
+
struct ggml_tensor * im2col = ggml_im2col(ctx, a, b, s0, s1, p0, p1, d0, d1, true, GGML_TYPE_F16); // [N, OH, OW, IC * KH * KW]
|
5457
5511
|
|
5458
5512
|
struct ggml_tensor * result =
|
5459
5513
|
ggml_mul_mat(ctx,
|
@@ -5579,12 +5633,13 @@ struct ggml_tensor * ggml_pool_2d(
|
|
5579
5633
|
is_node = true;
|
5580
5634
|
}
|
5581
5635
|
|
5636
|
+
struct ggml_tensor * result;
|
5582
5637
|
const int64_t ne[3] = {
|
5583
5638
|
ggml_calc_pool_output_size(a->ne[0], k0, s0, p0),
|
5584
5639
|
ggml_calc_pool_output_size(a->ne[1], k1, s1, p1),
|
5585
5640
|
a->ne[2],
|
5586
5641
|
};
|
5587
|
-
|
5642
|
+
result = ggml_new_tensor(ctx, GGML_TYPE_F32, 3, ne);
|
5588
5643
|
|
5589
5644
|
int32_t params[] = { op, k0, k1, s0, s1, p0, p1 };
|
5590
5645
|
ggml_set_op_params(result, params, sizeof(params));
|
@@ -5592,7 +5647,6 @@ struct ggml_tensor * ggml_pool_2d(
|
|
5592
5647
|
result->op = GGML_OP_POOL_2D;
|
5593
5648
|
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
5594
5649
|
result->src[0] = a;
|
5595
|
-
|
5596
5650
|
return result;
|
5597
5651
|
}
|
5598
5652
|
|
@@ -7207,6 +7261,17 @@ static void ggml_compute_forward_add_f32(
|
|
7207
7261
|
const int ith = params->ith;
|
7208
7262
|
const int nth = params->nth;
|
7209
7263
|
|
7264
|
+
#ifdef GGML_USE_CLBLAST
|
7265
|
+
if (src1->backend == GGML_BACKEND_GPU) {
|
7266
|
+
// TODO: OpenCL kernel support full broadcast
|
7267
|
+
GGML_ASSERT(ggml_can_repeat_rows(src1, src0));
|
7268
|
+
if (ith == 0) {
|
7269
|
+
ggml_cl_add(src0, src1, dst);
|
7270
|
+
}
|
7271
|
+
return;
|
7272
|
+
}
|
7273
|
+
#endif
|
7274
|
+
|
7210
7275
|
const int nr = ggml_nrows(src0);
|
7211
7276
|
|
7212
7277
|
GGML_TENSOR_BINARY_OP_LOCALS
|
@@ -7487,7 +7552,12 @@ static void ggml_compute_forward_add(
|
|
7487
7552
|
switch (src0->type) {
|
7488
7553
|
case GGML_TYPE_F32:
|
7489
7554
|
{
|
7490
|
-
|
7555
|
+
if (src1->type == GGML_TYPE_F32) {
|
7556
|
+
ggml_compute_forward_add_f32(params, src0, src1, dst);
|
7557
|
+
}
|
7558
|
+
else {
|
7559
|
+
GGML_ASSERT(false);
|
7560
|
+
}
|
7491
7561
|
} break;
|
7492
7562
|
case GGML_TYPE_F16:
|
7493
7563
|
{
|
@@ -7513,6 +7583,7 @@ static void ggml_compute_forward_add(
|
|
7513
7583
|
case GGML_TYPE_Q6_K:
|
7514
7584
|
case GGML_TYPE_IQ2_XXS:
|
7515
7585
|
case GGML_TYPE_IQ2_XS:
|
7586
|
+
case GGML_TYPE_IQ3_XXS:
|
7516
7587
|
{
|
7517
7588
|
ggml_compute_forward_add_q_f32(params, src0, src1, dst);
|
7518
7589
|
} break;
|
@@ -7779,6 +7850,7 @@ static void ggml_compute_forward_add1(
|
|
7779
7850
|
case GGML_TYPE_Q6_K:
|
7780
7851
|
case GGML_TYPE_IQ2_XXS:
|
7781
7852
|
case GGML_TYPE_IQ2_XS:
|
7853
|
+
case GGML_TYPE_IQ3_XXS:
|
7782
7854
|
{
|
7783
7855
|
ggml_compute_forward_add1_q_f32(params, src0, src1, dst);
|
7784
7856
|
} break;
|
@@ -7898,6 +7970,7 @@ static void ggml_compute_forward_acc(
|
|
7898
7970
|
case GGML_TYPE_Q6_K:
|
7899
7971
|
case GGML_TYPE_IQ2_XXS:
|
7900
7972
|
case GGML_TYPE_IQ2_XS:
|
7973
|
+
case GGML_TYPE_IQ3_XXS:
|
7901
7974
|
default:
|
7902
7975
|
{
|
7903
7976
|
GGML_ASSERT(false);
|
@@ -7999,7 +8072,7 @@ static void ggml_compute_forward_mul_f32(
|
|
7999
8072
|
const int ith = params->ith;
|
8000
8073
|
const int nth = params->nth;
|
8001
8074
|
|
8002
|
-
#
|
8075
|
+
#if defined(GGML_USE_CLBLAST)
|
8003
8076
|
if (src1->backend == GGML_BACKEND_GPU) {
|
8004
8077
|
// TODO: OpenCL kernel support full broadcast
|
8005
8078
|
GGML_ASSERT(ggml_can_repeat_rows(src1, src0));
|
@@ -9954,7 +10027,7 @@ static void ggml_compute_forward_mul_mat(
|
|
9954
10027
|
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
|
9955
10028
|
if (ggml_compute_forward_mul_mat_use_blas(dst)) {
|
9956
10029
|
const int64_t ne_plane = ne01*ne00;
|
9957
|
-
const
|
10030
|
+
const size_t desired_wsize = ne13*ne12*ne_plane*sizeof(float);
|
9958
10031
|
UNUSED(desired_wsize);
|
9959
10032
|
|
9960
10033
|
if (params->type == GGML_TASK_INIT) {
|
@@ -10649,6 +10722,7 @@ static void ggml_compute_forward_out_prod(
|
|
10649
10722
|
case GGML_TYPE_Q6_K:
|
10650
10723
|
case GGML_TYPE_IQ2_XXS:
|
10651
10724
|
case GGML_TYPE_IQ2_XS:
|
10725
|
+
case GGML_TYPE_IQ3_XXS:
|
10652
10726
|
{
|
10653
10727
|
ggml_compute_forward_out_prod_q_f32(params, src0, src1, dst);
|
10654
10728
|
} break;
|
@@ -10828,6 +10902,7 @@ static void ggml_compute_forward_set(
|
|
10828
10902
|
case GGML_TYPE_Q6_K:
|
10829
10903
|
case GGML_TYPE_IQ2_XXS:
|
10830
10904
|
case GGML_TYPE_IQ2_XS:
|
10905
|
+
case GGML_TYPE_IQ3_XXS:
|
10831
10906
|
default:
|
10832
10907
|
{
|
10833
10908
|
GGML_ASSERT(false);
|
@@ -11024,6 +11099,7 @@ static void ggml_compute_forward_get_rows(
|
|
11024
11099
|
case GGML_TYPE_Q6_K:
|
11025
11100
|
case GGML_TYPE_IQ2_XXS:
|
11026
11101
|
case GGML_TYPE_IQ2_XS:
|
11102
|
+
case GGML_TYPE_IQ3_XXS:
|
11027
11103
|
{
|
11028
11104
|
ggml_compute_forward_get_rows_q(params, src0, src1, dst);
|
11029
11105
|
} break;
|
@@ -11671,6 +11747,7 @@ static void ggml_compute_forward_alibi(
|
|
11671
11747
|
case GGML_TYPE_Q6_K:
|
11672
11748
|
case GGML_TYPE_IQ2_XXS:
|
11673
11749
|
case GGML_TYPE_IQ2_XS:
|
11750
|
+
case GGML_TYPE_IQ3_XXS:
|
11674
11751
|
case GGML_TYPE_Q8_K:
|
11675
11752
|
case GGML_TYPE_I8:
|
11676
11753
|
case GGML_TYPE_I16:
|
@@ -11747,6 +11824,7 @@ static void ggml_compute_forward_clamp(
|
|
11747
11824
|
case GGML_TYPE_Q6_K:
|
11748
11825
|
case GGML_TYPE_IQ2_XXS:
|
11749
11826
|
case GGML_TYPE_IQ2_XS:
|
11827
|
+
case GGML_TYPE_IQ3_XXS:
|
11750
11828
|
case GGML_TYPE_Q8_K:
|
11751
11829
|
case GGML_TYPE_I8:
|
11752
11830
|
case GGML_TYPE_I16:
|
@@ -11810,8 +11888,10 @@ GGML_CALL void ggml_rope_yarn_corr_dims(
|
|
11810
11888
|
int n_dims, int n_orig_ctx, float freq_base, float beta_fast, float beta_slow, float dims[2]
|
11811
11889
|
) {
|
11812
11890
|
// start and end correction dims
|
11813
|
-
|
11814
|
-
|
11891
|
+
float start = floorf(ggml_rope_yarn_corr_dim(n_dims, n_orig_ctx, beta_fast, freq_base));
|
11892
|
+
float end = ceilf(ggml_rope_yarn_corr_dim(n_dims, n_orig_ctx, beta_slow, freq_base));
|
11893
|
+
dims[0] = MAX(0, start);
|
11894
|
+
dims[1] = MIN(n_dims - 1, end);
|
11815
11895
|
}
|
11816
11896
|
|
11817
11897
|
static void ggml_compute_forward_rope_f32(
|
@@ -12416,6 +12496,92 @@ static void ggml_compute_forward_conv_transpose_1d(
|
|
12416
12496
|
}
|
12417
12497
|
}
|
12418
12498
|
|
12499
|
+
// src0: kernel [OC, IC, KH, KW]
|
12500
|
+
// src1: image [N, IC, IH, IW]
|
12501
|
+
// dst: result [N, OH, OW, IC*KH*KW]
|
12502
|
+
static void ggml_compute_forward_im2col_f32(
|
12503
|
+
const struct ggml_compute_params * params,
|
12504
|
+
const struct ggml_tensor * src0,
|
12505
|
+
const struct ggml_tensor * src1,
|
12506
|
+
struct ggml_tensor * dst) {
|
12507
|
+
GGML_ASSERT(src0->type == GGML_TYPE_F16);
|
12508
|
+
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
12509
|
+
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
12510
|
+
|
12511
|
+
int64_t t0 = ggml_perf_time_us();
|
12512
|
+
UNUSED(t0);
|
12513
|
+
|
12514
|
+
GGML_TENSOR_BINARY_OP_LOCALS;
|
12515
|
+
|
12516
|
+
const int32_t s0 = ((const int32_t *)(dst->op_params))[0];
|
12517
|
+
const int32_t s1 = ((const int32_t *)(dst->op_params))[1];
|
12518
|
+
const int32_t p0 = ((const int32_t *)(dst->op_params))[2];
|
12519
|
+
const int32_t p1 = ((const int32_t *)(dst->op_params))[3];
|
12520
|
+
const int32_t d0 = ((const int32_t *)(dst->op_params))[4];
|
12521
|
+
const int32_t d1 = ((const int32_t *)(dst->op_params))[5];
|
12522
|
+
const bool is_2D = ((const int32_t *)(dst->op_params))[6] == 1;
|
12523
|
+
|
12524
|
+
const int ith = params->ith;
|
12525
|
+
const int nth = params->nth;
|
12526
|
+
|
12527
|
+
const int64_t N = is_2D ? ne13 : ne12;
|
12528
|
+
const int64_t IC = is_2D ? ne12 : ne11;
|
12529
|
+
const int64_t IH = is_2D ? ne11 : 1;
|
12530
|
+
const int64_t IW = ne10;
|
12531
|
+
|
12532
|
+
const int64_t KH = is_2D ? ne01 : 1;
|
12533
|
+
const int64_t KW = ne00;
|
12534
|
+
|
12535
|
+
const int64_t OH = is_2D ? ne2 : 1;
|
12536
|
+
const int64_t OW = ne1;
|
12537
|
+
|
12538
|
+
int ofs0 = is_2D ? nb13 : nb12;
|
12539
|
+
int ofs1 = is_2D ? nb12 : nb11;
|
12540
|
+
|
12541
|
+
GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
|
12542
|
+
GGML_ASSERT(nb10 == sizeof(float));
|
12543
|
+
|
12544
|
+
if (params->type == GGML_TASK_INIT) {
|
12545
|
+
return;
|
12546
|
+
}
|
12547
|
+
|
12548
|
+
if (params->type == GGML_TASK_FINALIZE) {
|
12549
|
+
return;
|
12550
|
+
}
|
12551
|
+
|
12552
|
+
// im2col: [N, IC, IH, IW] => [N, OH, OW, IC*KH*KW]
|
12553
|
+
{
|
12554
|
+
float * const wdata = (float *) dst->data;
|
12555
|
+
|
12556
|
+
for (int64_t in = 0; in < N; in++) {
|
12557
|
+
for (int64_t ioh = 0; ioh < OH; ioh++) { // 1
|
12558
|
+
for (int64_t iow = 0; iow < OW; iow++) {
|
12559
|
+
for (int64_t iic = ith; iic < IC; iic += nth) {
|
12560
|
+
|
12561
|
+
// micro kernel
|
12562
|
+
float * dst_data = wdata + (in*OH*OW + ioh*OW + iow)*(IC*KH*KW); // [IC, KH, KW]
|
12563
|
+
const float * const src_data = (float *)((char *) src1->data + in*ofs0 + iic*ofs1); // [IH, IW]
|
12564
|
+
|
12565
|
+
for (int64_t ikh = 0; ikh < KH; ikh++) { // 1
|
12566
|
+
for (int64_t ikw = 0; ikw < KW; ikw++) {
|
12567
|
+
const int64_t iiw = iow*s0 + ikw*d0 - p0;
|
12568
|
+
const int64_t iih = ioh*s1 + ikh*d1 - p1;
|
12569
|
+
|
12570
|
+
if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) {
|
12571
|
+
dst_data[iic*(KH*KW) + ikh*KW + ikw] = 0;
|
12572
|
+
} else {
|
12573
|
+
dst_data[iic*(KH*KW) + ikh*KW + ikw] = (src_data[iih*IW + iiw]);
|
12574
|
+
}
|
12575
|
+
}
|
12576
|
+
}
|
12577
|
+
}
|
12578
|
+
}
|
12579
|
+
}
|
12580
|
+
}
|
12581
|
+
}
|
12582
|
+
}
|
12583
|
+
|
12584
|
+
|
12419
12585
|
// src0: kernel [OC, IC, KH, KW]
|
12420
12586
|
// src1: image [N, IC, IH, IW]
|
12421
12587
|
// dst: result [N, OH, OW, IC*KH*KW]
|
@@ -12506,14 +12672,14 @@ static void ggml_compute_forward_im2col(
|
|
12506
12672
|
const struct ggml_tensor * src0,
|
12507
12673
|
const struct ggml_tensor * src1,
|
12508
12674
|
struct ggml_tensor * dst) {
|
12509
|
-
switch (
|
12675
|
+
switch (dst->type) {
|
12510
12676
|
case GGML_TYPE_F16:
|
12511
12677
|
{
|
12512
12678
|
ggml_compute_forward_im2col_f16(params, src0, src1, dst);
|
12513
12679
|
} break;
|
12514
12680
|
case GGML_TYPE_F32:
|
12515
12681
|
{
|
12516
|
-
|
12682
|
+
ggml_compute_forward_im2col_f32(params, src0, src1, dst);
|
12517
12683
|
} break;
|
12518
12684
|
default:
|
12519
12685
|
{
|
@@ -12704,8 +12870,8 @@ static void ggml_compute_forward_pool_2d(
|
|
12704
12870
|
const struct ggml_compute_params * params,
|
12705
12871
|
const struct ggml_tensor * src,
|
12706
12872
|
struct ggml_tensor * dst) {
|
12707
|
-
|
12708
|
-
|
12873
|
+
GGML_ASSERT(src->type == GGML_TYPE_F32);
|
12874
|
+
GGML_ASSERT(params->ith == 0);
|
12709
12875
|
|
12710
12876
|
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
|
12711
12877
|
return;
|
@@ -14683,8 +14849,26 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
|
|
14683
14849
|
}
|
14684
14850
|
GGML_ASSERT(tensor->src[0] == NULL || tensor->src[0]->backend == GGML_BACKEND_CPU);
|
14685
14851
|
GGML_ASSERT(tensor->src[1] == NULL || tensor->src[1]->backend == GGML_BACKEND_CPU);
|
14852
|
+
#elif defined(GGML_USE_VULKAN)
|
14853
|
+
const bool skip_cpu = ggml_vk_compute_forward_cpu_assist(params, tensor);
|
14854
|
+
#ifdef GGML_VULKAN_CHECK_RESULTS
|
14855
|
+
if (skip_cpu) {
|
14856
|
+
ggml_vk_check_results_1_cpu_assist(params, tensor);
|
14857
|
+
}
|
14858
|
+
#endif
|
14859
|
+
if (skip_cpu) {
|
14860
|
+
return;
|
14861
|
+
}
|
14862
|
+
GGML_ASSERT(tensor->src[0] == NULL || tensor->src[0]->backend == GGML_BACKEND_CPU);
|
14863
|
+
GGML_ASSERT(tensor->src[1] == NULL || tensor->src[1]->backend == GGML_BACKEND_CPU);
|
14686
14864
|
#endif // GGML_USE_CUBLAS
|
14687
14865
|
|
14866
|
+
#ifdef GGML_USE_SYCL
|
14867
|
+
bool skip_cpu = ggml_sycl_compute_forward(params, tensor);
|
14868
|
+
if (skip_cpu) {
|
14869
|
+
return;
|
14870
|
+
}
|
14871
|
+
#endif // GGML_USE_SYCL
|
14688
14872
|
switch (tensor->op) {
|
14689
14873
|
case GGML_OP_DUP:
|
14690
14874
|
{
|
@@ -15087,13 +15271,13 @@ struct ggml_hash_set ggml_hash_set_new(size_t size) {
|
|
15087
15271
|
size = ggml_hash_size(size);
|
15088
15272
|
struct ggml_hash_set result;
|
15089
15273
|
result.size = size;
|
15090
|
-
result.keys =
|
15274
|
+
result.keys = GGML_MALLOC(sizeof(struct ggml_tensor *) * size);
|
15091
15275
|
memset(result.keys, 0, sizeof(struct ggml_tensor *) * size);
|
15092
15276
|
return result;
|
15093
15277
|
}
|
15094
15278
|
|
15095
15279
|
// Release the key array of a hash set.
// The set is passed by value, so only the `keys` buffer is freed here.
static void ggml_hash_set_free(struct ggml_hash_set hash_set) {
    GGML_FREE(hash_set.keys);
}
|
15098
15282
|
|
15099
15283
|
struct hash_map {
|
@@ -15102,17 +15286,17 @@ struct hash_map {
|
|
15102
15286
|
};
|
15103
15287
|
|
15104
15288
|
static struct hash_map * ggml_new_hash_map(size_t size) {
|
15105
|
-
struct hash_map * result =
|
15289
|
+
struct hash_map * result = GGML_MALLOC(sizeof(struct hash_map));
|
15106
15290
|
result->set = ggml_hash_set_new(size);
|
15107
|
-
result->vals =
|
15291
|
+
result->vals = GGML_MALLOC(sizeof(struct ggml_tensor *) * result->set.size);
|
15108
15292
|
memset(result->vals, 0, sizeof(struct ggml_tensor *) * result->set.size);
|
15109
15293
|
return result;
|
15110
15294
|
}
|
15111
15295
|
|
15112
15296
|
static void ggml_hash_map_free(struct hash_map * map) {
|
15113
15297
|
ggml_hash_set_free(map->set);
|
15114
|
-
|
15115
|
-
|
15298
|
+
GGML_FREE(map->vals);
|
15299
|
+
GGML_FREE(map);
|
15116
15300
|
}
|
15117
15301
|
|
15118
15302
|
// gradient checkpointing
|
@@ -16597,7 +16781,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
|
|
16597
16781
|
} break;
|
16598
16782
|
case GGML_OP_SOFT_MAX:
|
16599
16783
|
{
|
16600
|
-
n_tasks = MIN(
|
16784
|
+
n_tasks = MIN(n_threads, ggml_nrows(node->src[0]));
|
16601
16785
|
} break;
|
16602
16786
|
case GGML_OP_CONV_TRANSPOSE_1D:
|
16603
16787
|
{
|
@@ -16890,12 +17074,16 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
|
|
16890
17074
|
struct ggml_cplan cplan;
|
16891
17075
|
memset(&cplan, 0, sizeof(struct ggml_cplan));
|
16892
17076
|
|
17077
|
+
int max_tasks = 1;
|
17078
|
+
|
16893
17079
|
// thread scheduling for the different operations + work buffer size estimation
|
16894
17080
|
for (int i = 0; i < cgraph->n_nodes; i++) {
|
16895
17081
|
struct ggml_tensor * node = cgraph->nodes[i];
|
16896
17082
|
|
16897
17083
|
const int n_tasks = ggml_get_n_tasks(node, n_threads);
|
16898
17084
|
|
17085
|
+
max_tasks = MAX(max_tasks, n_tasks);
|
17086
|
+
|
16899
17087
|
size_t cur = 0;
|
16900
17088
|
|
16901
17089
|
switch (node->op) {
|
@@ -17062,7 +17250,7 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
|
|
17062
17250
|
work_size += CACHE_LINE_SIZE*(n_threads - 1);
|
17063
17251
|
}
|
17064
17252
|
|
17065
|
-
cplan.n_threads = n_threads;
|
17253
|
+
cplan.n_threads = MIN(max_tasks, n_threads);
|
17066
17254
|
cplan.work_size = work_size;
|
17067
17255
|
cplan.work_data = NULL;
|
17068
17256
|
|
@@ -17079,6 +17267,17 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
|
|
17079
17267
|
}
|
17080
17268
|
}
|
17081
17269
|
|
17270
|
+
#ifdef GGML_USE_VULKAN
|
17271
|
+
for (int i = 0; i < cgraph->n_nodes; i++) {
|
17272
|
+
ggml_vk_preallocate_buffers_graph_cpu_assist(cgraph->nodes[i]);
|
17273
|
+
}
|
17274
|
+
ggml_vk_preallocate_buffers_cpu_assist();
|
17275
|
+
|
17276
|
+
for (int i = 0; i < cgraph->n_nodes; i++) {
|
17277
|
+
ggml_vk_build_graph_cpu_assist(cgraph->nodes[i], i == cgraph->n_nodes - 1);
|
17278
|
+
}
|
17279
|
+
#endif
|
17280
|
+
|
17082
17281
|
const int n_threads = cplan->n_threads;
|
17083
17282
|
|
17084
17283
|
struct ggml_compute_state_shared state_shared = {
|
@@ -17130,6 +17329,10 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
|
|
17130
17329
|
}
|
17131
17330
|
}
|
17132
17331
|
|
17332
|
+
#ifdef GGML_USE_VULKAN
|
17333
|
+
ggml_vk_graph_cleanup_cpu_assist();
|
17334
|
+
#endif
|
17335
|
+
|
17133
17336
|
// performance stats (graph)
|
17134
17337
|
{
|
17135
17338
|
int64_t perf_cycles_cur = ggml_perf_cycles() - perf_start_cycles;
|
@@ -18770,6 +18973,7 @@ void ggml_quantize_init(enum ggml_type type) {
|
|
18770
18973
|
switch (type) {
|
18771
18974
|
case GGML_TYPE_IQ2_XXS: iq2xs_init_impl(256); break;
|
18772
18975
|
case GGML_TYPE_IQ2_XS: iq2xs_init_impl(512); break;
|
18976
|
+
case GGML_TYPE_IQ3_XXS: iq3xs_init_impl(256); break;
|
18773
18977
|
default: // nothing
|
18774
18978
|
break;
|
18775
18979
|
}
|
@@ -19032,6 +19236,15 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i
|
|
19032
19236
|
result = quantize_iq2_xs(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
|
19033
19237
|
GGML_ASSERT(result == row_size * nrows);
|
19034
19238
|
} break;
|
19239
|
+
case GGML_TYPE_IQ3_XXS:
|
19240
|
+
{
|
19241
|
+
GGML_ASSERT(start % QK_K == 0);
|
19242
|
+
GGML_ASSERT(start % n_per_row == 0);
|
19243
|
+
size_t start_row = start / n_per_row;
|
19244
|
+
size_t row_size = ggml_row_size(type, n_per_row);
|
19245
|
+
result = quantize_iq3_xxs(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
|
19246
|
+
GGML_ASSERT(result == row_size * nrows);
|
19247
|
+
} break;
|
19035
19248
|
case GGML_TYPE_F16:
|
19036
19249
|
{
|
19037
19250
|
size_t elemsize = sizeof(ggml_fp16_t);
|
@@ -19158,6 +19371,25 @@ struct gguf_context {
|
|
19158
19371
|
void * data;
|
19159
19372
|
};
|
19160
19373
|
|
19374
|
+
static size_t gguf_type_size(enum gguf_type type) {
|
19375
|
+
GGML_ASSERT(0 <= type && type < GGUF_TYPE_COUNT);
|
19376
|
+
return GGUF_TYPE_SIZE[type];
|
19377
|
+
}
|
19378
|
+
|
19379
|
+
static void gguf_tensor_info_sanitize(struct gguf_tensor_info * info) {
|
19380
|
+
GGML_ASSERT(info->n_dims <= GGML_MAX_DIMS);
|
19381
|
+
GGML_ASSERT(0 <= info->type && info->type < GGML_TYPE_COUNT);
|
19382
|
+
|
19383
|
+
for (uint32_t i = 0; i < info->n_dims; ++i) {
|
19384
|
+
GGML_ASSERT(info->ne[i] > 0);
|
19385
|
+
}
|
19386
|
+
|
19387
|
+
// prevent overflow for total number of elements
|
19388
|
+
GGML_ASSERT(INT64_MAX/info->ne[1] > info->ne[0]);
|
19389
|
+
GGML_ASSERT(INT64_MAX/info->ne[2] > info->ne[0]*info->ne[1]);
|
19390
|
+
GGML_ASSERT(INT64_MAX/info->ne[3] > info->ne[0]*info->ne[1]*info->ne[2]);
|
19391
|
+
}
|
19392
|
+
|
19161
19393
|
static bool gguf_fread_el(FILE * file, void * dst, size_t size, size_t * offset) {
|
19162
19394
|
const size_t n = fread(dst, 1, size, file);
|
19163
19395
|
*offset += n;
|
@@ -19170,8 +19402,17 @@ static bool gguf_fread_str(FILE * file, struct gguf_str * p, size_t * offset) {
|
|
19170
19402
|
|
19171
19403
|
bool ok = true;
|
19172
19404
|
|
19173
|
-
ok = ok && gguf_fread_el(file, &p->n,
|
19174
|
-
|
19405
|
+
ok = ok && gguf_fread_el(file, &p->n, sizeof(p->n), offset);
|
19406
|
+
|
19407
|
+
// early exit if the string length is invalid; this prevents an integer overflow in the allocation below
|
19408
|
+
if (p->n == SIZE_MAX) {
|
19409
|
+
fprintf(stderr, "%s: invalid string length (%" PRIu64 ")\n", __func__, p->n);
|
19410
|
+
return false;
|
19411
|
+
}
|
19412
|
+
|
19413
|
+
p->data = GGML_CALLOC(p->n + 1, 1);
|
19414
|
+
|
19415
|
+
ok = ok && gguf_fread_el(file, p->data, p->n, offset);
|
19175
19416
|
|
19176
19417
|
return ok;
|
19177
19418
|
}
|
@@ -19243,6 +19484,12 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
|
|
19243
19484
|
return NULL;
|
19244
19485
|
}
|
19245
19486
|
|
19487
|
+
// sanity checks to prevent integer/buffer overflows
|
19488
|
+
|
19489
|
+
ok = ok && (ctx->header.n_tensors < (SIZE_MAX/2)/sizeof(struct gguf_tensor_info));
|
19490
|
+
ok = ok && (ctx->header.n_tensors < (SIZE_MAX/2)/ggml_tensor_overhead());
|
19491
|
+
ok = ok && (ctx->header.n_kv < (SIZE_MAX/2)/sizeof(struct gguf_kv));
|
19492
|
+
|
19246
19493
|
if (!ok) {
|
19247
19494
|
fprintf(stderr, "%s: failed to read header\n", __func__);
|
19248
19495
|
fclose(file);
|
@@ -19253,7 +19500,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
|
|
19253
19500
|
|
19254
19501
|
// read the kv pairs
|
19255
19502
|
{
|
19256
|
-
ctx->kv =
|
19503
|
+
ctx->kv = GGML_MALLOC(ctx->header.n_kv * sizeof(struct gguf_kv));
|
19257
19504
|
|
19258
19505
|
for (uint64_t i = 0; i < ctx->header.n_kv; ++i) {
|
19259
19506
|
struct gguf_kv * kv = &ctx->kv[i];
|
@@ -19281,7 +19528,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
|
|
19281
19528
|
case GGUF_TYPE_ARRAY:
|
19282
19529
|
{
|
19283
19530
|
ok = ok && gguf_fread_el(file, &kv->value.arr.type, sizeof(kv->value.arr.type), &offset);
|
19284
|
-
ok = ok && gguf_fread_el(file, &kv->value.arr.n, sizeof(kv->value.arr.n),
|
19531
|
+
ok = ok && gguf_fread_el(file, &kv->value.arr.n, sizeof(kv->value.arr.n), &offset);
|
19285
19532
|
|
19286
19533
|
switch (kv->value.arr.type) {
|
19287
19534
|
case GGUF_TYPE_UINT8:
|
@@ -19296,21 +19543,39 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
|
|
19296
19543
|
case GGUF_TYPE_FLOAT64:
|
19297
19544
|
case GGUF_TYPE_BOOL:
|
19298
19545
|
{
|
19299
|
-
|
19300
|
-
|
19546
|
+
// prevent integer overflow in the malloc below
|
19547
|
+
if (kv->value.arr.n >= SIZE_MAX/gguf_type_size(kv->value.arr.type)) {
|
19548
|
+
fprintf(stderr, "%s: array size is too large (%" PRIu64 ")\n", __func__, kv->value.arr.n);
|
19549
|
+
fclose(file);
|
19550
|
+
gguf_free(ctx);
|
19551
|
+
return NULL;
|
19552
|
+
}
|
19553
|
+
|
19554
|
+
kv->value.arr.data = GGML_MALLOC(kv->value.arr.n * gguf_type_size(kv->value.arr.type));
|
19555
|
+
|
19556
|
+
ok = ok && gguf_fread_el(file, kv->value.arr.data, kv->value.arr.n * gguf_type_size(kv->value.arr.type), &offset);
|
19301
19557
|
} break;
|
19302
19558
|
case GGUF_TYPE_STRING:
|
19303
19559
|
{
|
19304
|
-
|
19560
|
+
// prevent integer overflow in the malloc below
|
19561
|
+
if (kv->value.arr.n >= SIZE_MAX/sizeof(struct gguf_str)) {
|
19562
|
+
fprintf(stderr, "%s: array size is too large (%" PRIu64 ")\n", __func__, kv->value.arr.n);
|
19563
|
+
fclose(file);
|
19564
|
+
gguf_free(ctx);
|
19565
|
+
return NULL;
|
19566
|
+
}
|
19567
|
+
|
19568
|
+
kv->value.arr.data = GGML_MALLOC(kv->value.arr.n * sizeof(struct gguf_str));
|
19569
|
+
|
19305
19570
|
for (uint64_t j = 0; j < kv->value.arr.n; ++j) {
|
19306
19571
|
ok = ok && gguf_fread_str(file, &((struct gguf_str *) kv->value.arr.data)[j], &offset);
|
19307
19572
|
}
|
19308
19573
|
} break;
|
19309
19574
|
case GGUF_TYPE_ARRAY:
|
19310
|
-
|
19575
|
+
default: GGML_ASSERT(false && "invalid type"); break;
|
19311
19576
|
}
|
19312
19577
|
} break;
|
19313
|
-
|
19578
|
+
default: GGML_ASSERT(false && "invalid type");
|
19314
19579
|
}
|
19315
19580
|
|
19316
19581
|
if (!ok) {
|
@@ -19328,7 +19593,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
|
|
19328
19593
|
|
19329
19594
|
// read the tensor infos
|
19330
19595
|
{
|
19331
|
-
ctx->infos =
|
19596
|
+
ctx->infos = GGML_MALLOC(ctx->header.n_tensors * sizeof(struct gguf_tensor_info));
|
19332
19597
|
|
19333
19598
|
for (uint64_t i = 0; i < ctx->header.n_tensors; ++i) {
|
19334
19599
|
struct gguf_tensor_info * info = &ctx->infos[i];
|
@@ -19339,12 +19604,18 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
|
|
19339
19604
|
|
19340
19605
|
ok = ok && gguf_fread_str(file, &info->name, &offset);
|
19341
19606
|
ok = ok && gguf_fread_el (file, &info->n_dims, sizeof(info->n_dims), &offset);
|
19607
|
+
|
19608
|
+
ok = ok && (info->n_dims <= GGML_MAX_DIMS);
|
19609
|
+
|
19342
19610
|
for (uint32_t j = 0; j < info->n_dims; ++j) {
|
19343
19611
|
ok = ok && gguf_fread_el(file, &info->ne[j], sizeof(info->ne[j]), &offset);
|
19344
19612
|
}
|
19613
|
+
|
19345
19614
|
ok = ok && gguf_fread_el (file, &info->type, sizeof(info->type), &offset);
|
19346
19615
|
ok = ok && gguf_fread_el (file, &info->offset, sizeof(info->offset), &offset);
|
19347
19616
|
|
19617
|
+
gguf_tensor_info_sanitize(info);
|
19618
|
+
|
19348
19619
|
if (!ok) {
|
19349
19620
|
fprintf(stderr, "%s: failed to read tensor info\n", __func__);
|
19350
19621
|
fclose(file);
|
@@ -19498,12 +19769,12 @@ void gguf_free(struct gguf_context * ctx) {
|
|
19498
19769
|
struct gguf_kv * kv = &ctx->kv[i];
|
19499
19770
|
|
19500
19771
|
if (kv->key.data) {
|
19501
|
-
|
19772
|
+
GGML_FREE(kv->key.data);
|
19502
19773
|
}
|
19503
19774
|
|
19504
19775
|
if (kv->type == GGUF_TYPE_STRING) {
|
19505
19776
|
if (kv->value.str.data) {
|
19506
|
-
|
19777
|
+
GGML_FREE(kv->value.str.data);
|
19507
19778
|
}
|
19508
19779
|
}
|
19509
19780
|
|
@@ -19513,16 +19784,16 @@ void gguf_free(struct gguf_context * ctx) {
|
|
19513
19784
|
for (uint64_t j = 0; j < kv->value.arr.n; ++j) {
|
19514
19785
|
struct gguf_str * str = &((struct gguf_str *) kv->value.arr.data)[j];
|
19515
19786
|
if (str->data) {
|
19516
|
-
|
19787
|
+
GGML_FREE(str->data);
|
19517
19788
|
}
|
19518
19789
|
}
|
19519
19790
|
}
|
19520
|
-
|
19791
|
+
GGML_FREE(kv->value.arr.data);
|
19521
19792
|
}
|
19522
19793
|
}
|
19523
19794
|
}
|
19524
19795
|
|
19525
|
-
|
19796
|
+
GGML_FREE(ctx->kv);
|
19526
19797
|
}
|
19527
19798
|
|
19528
19799
|
if (ctx->infos) {
|
@@ -19530,11 +19801,11 @@ void gguf_free(struct gguf_context * ctx) {
|
|
19530
19801
|
struct gguf_tensor_info * info = &ctx->infos[i];
|
19531
19802
|
|
19532
19803
|
if (info->name.data) {
|
19533
|
-
|
19804
|
+
GGML_FREE(info->name.data);
|
19534
19805
|
}
|
19535
19806
|
}
|
19536
19807
|
|
19537
|
-
|
19808
|
+
GGML_FREE(ctx->infos);
|
19538
19809
|
}
|
19539
19810
|
|
19540
19811
|
GGML_ALIGNED_FREE(ctx);
|
@@ -19835,8 +20106,8 @@ void gguf_set_arr_data(struct gguf_context * ctx, const char * key, enum gguf_ty
|
|
19835
20106
|
ctx->kv[idx].type = GGUF_TYPE_ARRAY;
|
19836
20107
|
ctx->kv[idx].value.arr.type = type;
|
19837
20108
|
ctx->kv[idx].value.arr.n = n;
|
19838
|
-
ctx->kv[idx].value.arr.data =
|
19839
|
-
memcpy(ctx->kv[idx].value.arr.data, data, n*
|
20109
|
+
ctx->kv[idx].value.arr.data = GGML_MALLOC(n*gguf_type_size(type));
|
20110
|
+
memcpy(ctx->kv[idx].value.arr.data, data, n*gguf_type_size(type));
|
19840
20111
|
}
|
19841
20112
|
|
19842
20113
|
void gguf_set_arr_str(struct gguf_context * ctx, const char * key, const char ** data, int n) {
|
@@ -19845,7 +20116,7 @@ void gguf_set_arr_str(struct gguf_context * ctx, const char * key, const char **
|
|
19845
20116
|
ctx->kv[idx].type = GGUF_TYPE_ARRAY;
|
19846
20117
|
ctx->kv[idx].value.arr.type = GGUF_TYPE_STRING;
|
19847
20118
|
ctx->kv[idx].value.arr.n = n;
|
19848
|
-
ctx->kv[idx].value.arr.data =
|
20119
|
+
ctx->kv[idx].value.arr.data = GGML_MALLOC(n*sizeof(struct gguf_str));
|
19849
20120
|
for (int i = 0; i < n; i++) {
|
19850
20121
|
struct gguf_str * str = &((struct gguf_str *)ctx->kv[idx].value.arr.data)[i];
|
19851
20122
|
str->n = strlen(data[i]);
|
@@ -19872,19 +20143,19 @@ void gguf_set_kv(struct gguf_context * ctx, struct gguf_context * src) {
|
|
19872
20143
|
case GGUF_TYPE_ARRAY:
|
19873
20144
|
{
|
19874
20145
|
if (src->kv[i].value.arr.type == GGUF_TYPE_STRING) {
|
19875
|
-
const char ** data =
|
20146
|
+
const char ** data = GGML_MALLOC(src->kv[i].value.arr.n*sizeof(char *));
|
19876
20147
|
for (uint32_t j = 0; j < src->kv[i].value.arr.n; j++) {
|
19877
20148
|
data[j] = ((struct gguf_str *)src->kv[i].value.arr.data)[j].data;
|
19878
20149
|
}
|
19879
20150
|
gguf_set_arr_str(ctx, src->kv[i].key.data, data, src->kv[i].value.arr.n);
|
19880
|
-
|
20151
|
+
GGML_FREE((void *)data);
|
19881
20152
|
} else if (src->kv[i].value.arr.type == GGUF_TYPE_ARRAY) {
|
19882
20153
|
GGML_ASSERT(false && "nested arrays not supported");
|
19883
20154
|
} else {
|
19884
20155
|
gguf_set_arr_data(ctx, src->kv[i].key.data, src->kv[i].value.arr.type, src->kv[i].value.arr.data, src->kv[i].value.arr.n);
|
19885
20156
|
}
|
19886
20157
|
} break;
|
19887
|
-
|
20158
|
+
default: GGML_ASSERT(false && "invalid type"); break;
|
19888
20159
|
}
|
19889
20160
|
}
|
19890
20161
|
}
|
@@ -19960,7 +20231,7 @@ struct gguf_buf {
|
|
19960
20231
|
|
19961
20232
|
static struct gguf_buf gguf_buf_init(size_t size) {
|
19962
20233
|
struct gguf_buf buf = {
|
19963
|
-
/*buf.data =*/ size == 0 ? NULL :
|
20234
|
+
/*buf.data =*/ size == 0 ? NULL : GGML_MALLOC(size),
|
19964
20235
|
/*buf.size =*/ size,
|
19965
20236
|
/*buf.offset =*/ 0,
|
19966
20237
|
};
|
@@ -19970,7 +20241,7 @@ static struct gguf_buf gguf_buf_init(size_t size) {
|
|
19970
20241
|
|
19971
20242
|
static void gguf_buf_free(struct gguf_buf buf) {
|
19972
20243
|
if (buf.data) {
|
19973
|
-
|
20244
|
+
GGML_FREE(buf.data);
|
19974
20245
|
}
|
19975
20246
|
}
|
19976
20247
|
|
@@ -20051,7 +20322,7 @@ static void gguf_write_to_buf(const struct gguf_context * ctx, struct gguf_buf *
|
|
20051
20322
|
case GGUF_TYPE_FLOAT64:
|
20052
20323
|
case GGUF_TYPE_BOOL:
|
20053
20324
|
{
|
20054
|
-
gguf_bwrite_el(buf, kv->value.arr.data, kv->value.arr.n *
|
20325
|
+
gguf_bwrite_el(buf, kv->value.arr.data, kv->value.arr.n * gguf_type_size(kv->value.arr.type));
|
20055
20326
|
} break;
|
20056
20327
|
case GGUF_TYPE_STRING:
|
20057
20328
|
{
|
@@ -20060,10 +20331,10 @@ static void gguf_write_to_buf(const struct gguf_context * ctx, struct gguf_buf *
|
|
20060
20331
|
}
|
20061
20332
|
} break;
|
20062
20333
|
case GGUF_TYPE_ARRAY:
|
20063
|
-
|
20334
|
+
default: GGML_ASSERT(false && "invalid type"); break;
|
20064
20335
|
}
|
20065
20336
|
} break;
|
20066
|
-
|
20337
|
+
default: GGML_ASSERT(false && "invalid type");
|
20067
20338
|
}
|
20068
20339
|
}
|
20069
20340
|
|
@@ -20264,7 +20535,7 @@ int ggml_cpu_has_wasm_simd(void) {
|
|
20264
20535
|
}
|
20265
20536
|
|
20266
20537
|
int ggml_cpu_has_blas(void) {
|
20267
|
-
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
|
20538
|
+
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_VULKAN) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_SYCL)
|
20268
20539
|
return 1;
|
20269
20540
|
#else
|
20270
20541
|
return 0;
|
@@ -20287,8 +20558,33 @@ int ggml_cpu_has_clblast(void) {
|
|
20287
20558
|
#endif
|
20288
20559
|
}
|
20289
20560
|
|
20561
|
+
int ggml_cpu_has_vulkan(void) {
|
20562
|
+
#if defined(GGML_USE_VULKAN)
|
20563
|
+
return 1;
|
20564
|
+
#else
|
20565
|
+
return 0;
|
20566
|
+
#endif
|
20567
|
+
}
|
20568
|
+
|
20569
|
+
int ggml_cpu_has_kompute(void) {
|
20570
|
+
#if defined(GGML_USE_KOMPUTE)
|
20571
|
+
return 1;
|
20572
|
+
#else
|
20573
|
+
return 0;
|
20574
|
+
#endif
|
20575
|
+
}
|
20576
|
+
|
20577
|
+
int ggml_cpu_has_sycl(void) {
|
20578
|
+
#if defined(GGML_USE_SYCL)
|
20579
|
+
return 1;
|
20580
|
+
#else
|
20581
|
+
return 0;
|
20582
|
+
#endif
|
20583
|
+
}
|
20584
|
+
|
20290
20585
|
int ggml_cpu_has_gpublas(void) {
|
20291
|
-
return ggml_cpu_has_cublas() || ggml_cpu_has_clblast()
|
20586
|
+
return ggml_cpu_has_cublas() || ggml_cpu_has_clblast() || ggml_cpu_has_vulkan() || ggml_cpu_has_kompute() ||
|
20587
|
+
ggml_cpu_has_sycl();
|
20292
20588
|
}
|
20293
20589
|
|
20294
20590
|
int ggml_cpu_has_sse3(void) {
|