llama_cpp 0.15.4 → 0.16.1
- checksums.yaml +4 -4
- data/CHANGELOG.md +16 -0
- data/ext/llama_cpp/extconf.rb +3 -2
- data/ext/llama_cpp/llama_cpp.cpp +17 -3
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +15 -1
- data/vendor/tmp/llama.cpp/Makefile +166 -82
- data/vendor/tmp/llama.cpp/ggml-alloc.c +82 -26
- data/vendor/tmp/llama.cpp/ggml-backend-impl.h +20 -8
- data/vendor/tmp/llama.cpp/ggml-backend.c +183 -69
- data/vendor/tmp/llama.cpp/ggml-backend.h +4 -4
- data/vendor/tmp/llama.cpp/ggml-blas.cpp +363 -0
- data/vendor/tmp/llama.cpp/ggml-blas.h +23 -0
- data/vendor/tmp/llama.cpp/ggml-common.h +6 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/acc.cu +47 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/arange.cu +34 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/argsort.cu +104 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/binbcast.cu +280 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/clamp.cu +34 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/concat.cu +196 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/convert.cu +686 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/cpy.cu +490 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/diagmask.cu +40 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/dmmv.cu +674 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/fattn-tile-f16.cu +319 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/fattn-tile-f32.cu +312 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/fattn.cu +345 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/getrows.cu +178 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/im2col.cu +104 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/mmq.cu +88 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/mmvq.cu +419 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/norm.cu +221 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/pad.cu +49 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/pool2d.cu +94 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/quantize.cu +112 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/rope.cu +271 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/scale.cu +31 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/softmax.cu +206 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/sumrows.cu +40 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb16.cu +10 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb32.cu +9 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb16.cu +10 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb32.cu +10 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb8.cu +8 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q2_k.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q3_k.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_k.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_k.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q6_k.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/tsembd.cu +47 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/unary.cu +286 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/upscale.cu +51 -0
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +103 -135
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +29 -13
- data/vendor/tmp/llama.cpp/ggml-metal.h +1 -1
- data/vendor/tmp/llama.cpp/ggml-metal.m +45 -33
- data/vendor/tmp/llama.cpp/ggml-metal.metal +83 -59
- data/vendor/tmp/llama.cpp/ggml-rpc.cpp +15 -14
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +26 -90
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +74522 -14913
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +631 -471
- data/vendor/tmp/llama.cpp/ggml.c +278 -603
- data/vendor/tmp/llama.cpp/ggml.h +9 -28
- data/vendor/tmp/llama.cpp/llama.cpp +345 -473
- data/vendor/tmp/llama.cpp/llama.h +21 -43
- metadata +134 -7
- data/vendor/tmp/llama.cpp/ggml-mpi.c +0 -216
- data/vendor/tmp/llama.cpp/ggml-mpi.h +0 -39
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +0 -2305
- data/vendor/tmp/llama.cpp/ggml-opencl.h +0 -36
data/vendor/tmp/llama.cpp/ggml-alloc.c

@@ -339,6 +339,7 @@ struct hash_node {
 };
 
 struct tensor_alloc {
+    int buffer_id;
     size_t offset;
     size_t size_max; // 0 = pre-allocated, unused, or view
 };
@@ -349,7 +350,6 @@ struct leaf_alloc {
 };
 
 struct node_alloc {
-    int buffer_id;
     struct tensor_alloc dst;
     struct tensor_alloc src[GGML_MAX_SRC];
 };
@@ -377,7 +377,7 @@ ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs
     galloc->bufts = calloc(n_bufs, sizeof(ggml_backend_buffer_type_t));
     GGML_ASSERT(galloc->bufts != NULL);
 
-    galloc->buffers = calloc(n_bufs, sizeof(ggml_backend_buffer_t) * n_bufs);
+    galloc->buffers = calloc(n_bufs, sizeof(ggml_backend_buffer_t));
     GGML_ASSERT(galloc->buffers != NULL);
 
     galloc->buf_tallocs = calloc(n_bufs, sizeof(struct ggml_dyn_tallocr *));
@@ -386,8 +386,19 @@ ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs
     for (int i = 0; i < n_bufs; i++) {
         galloc->bufts[i] = bufts[i];
         galloc->buffers[i] = NULL;
-        size_t alignment = ggml_backend_buft_get_alignment(bufts[i]);
-        galloc->buf_tallocs[i] = ggml_dyn_tallocr_new(alignment);
+
+        // check if the same buffer type is used multiple times and reuse the same allocator
+        for (int j = 0; j < i; j++) {
+            if (bufts[i] == bufts[j]) {
+                galloc->buf_tallocs[i] = galloc->buf_tallocs[j];
+                break;
+            }
+        }
+
+        if (galloc->buf_tallocs[i] == NULL) {
+            size_t alignment = ggml_backend_buft_get_alignment(bufts[i]);
+            galloc->buf_tallocs[i] = ggml_dyn_tallocr_new(alignment);
+        }
     }
     galloc->n_buffers = n_bufs;
 
@@ -405,10 +416,30 @@ void ggml_gallocr_free(ggml_gallocr_t galloc) {
 
     for (int i = 0; i < galloc->n_buffers; i++) {
         if (galloc->buffers != NULL) {
-            ggml_backend_buffer_free(galloc->buffers[i]);
+            // skip if already freed
+            bool freed = false;
+            for (int j = 0; j < i; j++) {
+                if (galloc->buffers[j] == galloc->buffers[i]) {
+                    freed = true;
+                    break;
+                }
+            }
+            if (!freed) {
+                ggml_backend_buffer_free(galloc->buffers[i]);
+            }
         }
         if (galloc->buf_tallocs != NULL) {
-            ggml_dyn_tallocr_free(galloc->buf_tallocs[i]);
+            // skip if already freed
+            bool freed = false;
+            for (int j = 0; j < i; j++) {
+                if (galloc->buf_tallocs[j] == galloc->buf_tallocs[i]) {
+                    freed = true;
+                    break;
+                }
+            }
+            if (!freed) {
+                ggml_dyn_tallocr_free(galloc->buf_tallocs[i]);
+            }
         }
     }
 
@@ -511,17 +542,18 @@ static void ggml_gallocr_allocate_node(ggml_gallocr_t galloc, struct ggml_tensor
     }
 }
 
-static void ggml_gallocr_free_node(ggml_gallocr_t galloc, struct ggml_tensor * node, int buffer_id) {
+static void ggml_gallocr_free_node(ggml_gallocr_t galloc, struct ggml_tensor * node) {
     // graph outputs are never freed
     if (node->flags & GGML_TENSOR_FLAG_OUTPUT) {
         AT_PRINTF("not freeing output %s\n", node->name);
         return;
     }
 
-    struct ggml_dyn_tallocr * alloc = galloc->buf_tallocs[buffer_id];
-    ggml_backend_buffer_type_t buft = galloc->bufts[buffer_id];
     struct hash_node * hn = ggml_gallocr_hash_get(galloc, node);
     size_t offset = hn->offset;
+    int buffer_id = hn->buffer_id;
+    struct ggml_dyn_tallocr * alloc = galloc->buf_tallocs[buffer_id];
+    ggml_backend_buffer_type_t buft = galloc->bufts[buffer_id];
     size_t size = ggml_backend_buft_get_alloc_size(buft, node);
     ggml_dyn_tallocr_free_tensor(alloc, offset, size, node);
     hn->allocated = false;
@@ -626,11 +658,11 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
                     AT_PRINTF("view_src %s: %d children, %d views\n",
                         view_src->name, view_src_hn->n_children, view_src_hn->n_views);
                     if (view_src_hn->n_views == 0 && view_src_hn->n_children == 0 && view_src_hn->allocated) {
-                        ggml_gallocr_free_node(galloc, view_src, buffer_id);
+                        ggml_gallocr_free_node(galloc, view_src);
                     }
                 }
                 else if (p_hn->allocated) {
-                    ggml_gallocr_free_node(galloc, parent, buffer_id);
+                    ggml_gallocr_free_node(galloc, parent);
                 }
             }
         }
         AT_PRINTF("\n");
@@ -674,22 +706,25 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
     for (int i = 0; i < graph->n_nodes; i++) {
         struct ggml_tensor * node = graph->nodes[i];
         struct node_alloc * node_alloc = &galloc->node_allocs[i];
-        node_alloc->buffer_id = get_node_buffer_id(node_buffer_ids, i);
         if (node->view_src || node->data) {
+            node_alloc->dst.buffer_id = -1;
             node_alloc->dst.offset = SIZE_MAX;
            node_alloc->dst.size_max = 0;
         } else {
            struct hash_node * hn = ggml_gallocr_hash_get(galloc, node);
-            node_alloc->dst.offset = hn->offset;
-            node_alloc->dst.size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], node);
+            node_alloc->dst.buffer_id = hn->buffer_id;
+            node_alloc->dst.offset = hn->offset;
+            node_alloc->dst.size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], node);
         }
         for (int j = 0; j < GGML_MAX_SRC; j++) {
             struct ggml_tensor * src = node->src[j];
             if (!src || src->view_src || src->data) {
+                node_alloc->src[j].buffer_id = -1;
                 node_alloc->src[j].offset = SIZE_MAX;
                 node_alloc->src[j].size_max = 0;
             } else {
                 struct hash_node * hn = ggml_gallocr_hash_get(galloc, src);
+                node_alloc->src[j].buffer_id = hn->buffer_id;
                 node_alloc->src[j].offset = hn->offset;
                 node_alloc->src[j].size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], src);
             }
@@ -706,9 +741,11 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
         struct hash_node * hn = ggml_gallocr_hash_get(galloc, leaf);
         galloc->leaf_allocs[i].buffer_id = hn->buffer_id;
         if (leaf->view_src || leaf->data) {
+            galloc->leaf_allocs[i].leaf.buffer_id = -1;
             galloc->leaf_allocs[i].leaf.offset = SIZE_MAX;
             galloc->leaf_allocs[i].leaf.size_max = 0;
         } else {
+            galloc->leaf_allocs[i].leaf.buffer_id = hn->buffer_id;
             galloc->leaf_allocs[i].leaf.offset = hn->offset;
             galloc->leaf_allocs[i].leaf.size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], leaf);
         }
@@ -716,6 +753,14 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
 
     // reallocate buffers if needed
     for (int i = 0; i < galloc->n_buffers; i++) {
+        // if the buffer type is used multiple times, we reuse the same buffer
+        for (int j = 0; j < i; j++) {
+            if (galloc->buf_tallocs[j] == galloc->buf_tallocs[i]) {
+                galloc->buffers[i] = galloc->buffers[j];
+                break;
+            }
+        }
+
         size_t cur_size = galloc->buffers[i] ? ggml_backend_buffer_get_size(galloc->buffers[i]) : 0;
         size_t new_size = ggml_dyn_tallocr_max_size(galloc->buf_tallocs[i]);
 
@@ -724,6 +769,7 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
 #ifndef NDEBUG
             fprintf(stderr, "%s: reallocating %s buffer from size %.02f MiB to %.02f MiB\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), cur_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
 #endif
+
             ggml_backend_buffer_free(galloc->buffers[i]);
             galloc->buffers[i] = ggml_backend_buft_alloc_buffer(galloc->bufts[i], new_size);
             if (galloc->buffers[i] == NULL) {
@@ -740,7 +786,8 @@ bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph *graph) {
     return ggml_gallocr_reserve_n(galloc, graph, NULL, NULL);
 }
 
-static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor * tensor, int buffer_id, struct tensor_alloc * tensor_alloc) {
+static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor * tensor, struct tensor_alloc * tensor_alloc) {
+    int buffer_id = tensor_alloc->buffer_id;
     assert(tensor->data || tensor->view_src || ggml_backend_buffer_get_alloc_size(galloc->buffers[buffer_id], tensor) <= tensor_alloc->size_max);
 
     if (tensor->view_src != NULL) {
@@ -750,7 +797,7 @@ static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor *
                 // this tensor was allocated without ggml-backend
                 return;
             }
-            ggml_backend_view_init(galloc->buffers[buffer_id], tensor);
+            ggml_backend_view_init(tensor);
         }
     } else {
         if (tensor->data == NULL) {
@@ -768,8 +815,8 @@ static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor *
     }
 }
 
-static bool ggml_gallocr_node_needs_realloc(ggml_gallocr_t galloc, struct ggml_tensor * node, struct node_alloc * nalloc, struct tensor_alloc * talloc) {
-    ggml_backend_buffer_type_t buft = galloc->bufts[nalloc->buffer_id];
+static bool ggml_gallocr_node_needs_realloc(ggml_gallocr_t galloc, struct ggml_tensor * node, struct tensor_alloc * talloc) {
+    ggml_backend_buffer_type_t buft = talloc->buffer_id != -1 ? galloc->bufts[talloc->buffer_id] : NULL;
     size_t node_size = (node->data || node->view_src) ? 0 : ggml_backend_buft_get_alloc_size(buft, node);
     return talloc->size_max >= node_size;
 }
@@ -793,7 +840,7 @@ static bool ggml_gallocr_needs_realloc(ggml_gallocr_t galloc, struct ggml_cgraph
         struct ggml_tensor * node = graph->nodes[i];
         struct node_alloc * node_alloc = &galloc->node_allocs[i];
 
-        if (!ggml_gallocr_node_needs_realloc(galloc, node, node_alloc, &node_alloc->dst)) {
+        if (!ggml_gallocr_node_needs_realloc(galloc, node, &node_alloc->dst)) {
 #ifndef NDEBUG
             fprintf(stderr, "%s: node %s is not valid\n", __func__, node->name);
 #endif
@@ -805,7 +852,7 @@ static bool ggml_gallocr_needs_realloc(ggml_gallocr_t galloc, struct ggml_cgraph
             if (src == NULL) {
                 continue;
             }
-            if (!ggml_gallocr_node_needs_realloc(galloc, src, node_alloc, &node_alloc->src[j])) {
+            if (!ggml_gallocr_node_needs_realloc(galloc, src, &node_alloc->src[j])) {
 #ifndef NDEBUG
                 fprintf(stderr, "%s: src %d (%s) of node %s is not valid\n", __func__, j, src->name, node->name);
 #endif
@@ -846,7 +893,7 @@ bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph)
     for (int i = 0; i < graph->n_leafs; i++) {
         struct ggml_tensor * leaf = graph->leafs[i];
         struct leaf_alloc * leaf_alloc = &galloc->leaf_allocs[i];
-        ggml_gallocr_init_tensor(galloc, leaf, leaf_alloc->buffer_id, &leaf_alloc->leaf);
+        ggml_gallocr_init_tensor(galloc, leaf, &leaf_alloc->leaf);
     }
     // nodes
     for (int i = 0; i < graph->n_nodes; i++) {
@@ -857,9 +904,9 @@ bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph)
             if (src == NULL) {
                 continue;
             }
-            ggml_gallocr_init_tensor(galloc, src, node_alloc->buffer_id, &node_alloc->src[j]);
+            ggml_gallocr_init_tensor(galloc, src, &node_alloc->src[j]);
         }
-        ggml_gallocr_init_tensor(galloc, node, node_alloc->buffer_id, &node_alloc->dst);
+        ggml_gallocr_init_tensor(galloc, node, &node_alloc->dst);
     }
 
     return true;
@@ -871,6 +918,15 @@ size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id) {
     if (galloc->buffers[buffer_id] == NULL) {
         return 0;
     }
+
+    for (int i = 0; i < buffer_id; i++) {
+        if (galloc->buffers[i] == galloc->buffers[buffer_id]) {
+            // this buffer is the same as a previous one due to the same buffer type being used multiple times
+            // only return the buffer size the first time it appears to avoid double counting
+            return 0;
+        }
+    }
+
     return ggml_backend_buffer_get_size(galloc->buffers[buffer_id]);
 }
 
@@ -886,7 +942,7 @@ static bool alloc_tensor_range(struct ggml_context * ctx,
         fprintf(stderr, "%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(buft), size);
 #endif
         for (size_t i = 0; i < *n_buffers; i++) {
-            ggml_backend_buffer_free(*buffers[i]);
+            ggml_backend_buffer_free((*buffers)[i]);
         }
         free(*buffers);
         return false;
@@ -899,12 +955,12 @@ static bool alloc_tensor_range(struct ggml_context * ctx,
             if (t->view_src == NULL) {
                 ggml_tallocr_alloc(&tallocr, t);
             } else if (t->buffer == NULL) {
-                ggml_backend_view_init(buffer, t);
+                ggml_backend_view_init(t);
             }
         } else {
             if (t->view_src != NULL && t->buffer == NULL) {
                 // view of a pre-allocated tensor
-                ggml_backend_view_init(buffer, t);
+                ggml_backend_view_init(t);
             }
         }
     }
data/vendor/tmp/llama.cpp/ggml-backend-impl.h

@@ -17,13 +17,15 @@ extern "C" {
 
     struct ggml_backend_buffer_type_i {
         const char * (*GGML_CALL get_name) (ggml_backend_buffer_type_t buft);
+        // allocate a buffer of this type
         ggml_backend_buffer_t (*GGML_CALL alloc_buffer) (ggml_backend_buffer_type_t buft, size_t size);
-        size_t (*GGML_CALL get_alignment) (ggml_backend_buffer_type_t buft); // tensor alignment
-        size_t (*GGML_CALL get_max_size) (ggml_backend_buffer_type_t buft); // allocation max size
-        size_t (*GGML_CALL get_alloc_size) (ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor); // data size needed to allocate the tensor, including padding
-        bool (*GGML_CALL supports_backend) (ggml_backend_buffer_type_t buft, ggml_backend_t backend); // check if the buffer type is usable by the backend
+        // tensor alignment
+        size_t (*GGML_CALL get_alignment) (ggml_backend_buffer_type_t buft);
+        // max buffer size that can be allocated
+        size_t (*GGML_CALL get_max_size) (ggml_backend_buffer_type_t buft);
+        // data size needed to allocate the tensor, including padding
+        size_t (*GGML_CALL get_alloc_size) (ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor);
         // check if tensor data is in host memory
-        // should be equivalent to supports_backend(buft, ggml_backend_cpu_init())
         bool (*GGML_CALL is_host) (ggml_backend_buffer_type_t buft);
     };
 
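For code that implements its own buffer type against this header, the visible change is that the supports_backend callback is gone; the backend now answers that question itself through the new supports_buft hook shown in the next hunk. A hypothetical buffer type written against the updated struct is sketched below. Every example_* name is illustrative and not part of the gem or of upstream ggml, and the NULL entries rely on how this vendored ggml-backend.c treats missing callbacks (get_max_size falling back to SIZE_MAX, get_alloc_size to ggml_nbytes).

#include <stddef.h>
#include "ggml-backend-impl.h"

// Hypothetical buffer type against the updated ggml_backend_buffer_type_i.
static const char * GGML_CALL example_buft_get_name(ggml_backend_buffer_type_t buft) {
    (void) buft;
    return "EXAMPLE";
}

static ggml_backend_buffer_t GGML_CALL example_buft_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
    // a real backend would allocate device memory and wrap it in a ggml_backend_buffer_t
    (void) buft; (void) size;
    return NULL;
}

static size_t GGML_CALL example_buft_get_alignment(ggml_backend_buffer_type_t buft) {
    (void) buft;
    return 32; // alignment required by the device
}

static bool GGML_CALL example_buft_is_host(ggml_backend_buffer_type_t buft) {
    (void) buft;
    return true;
}

// note: there is no supports_backend entry any more
static struct ggml_backend_buffer_type_i example_buft_iface = {
    /* .get_name       = */ example_buft_get_name,
    /* .alloc_buffer   = */ example_buft_alloc_buffer,
    /* .get_alignment  = */ example_buft_get_alignment,
    /* .get_max_size   = */ NULL, // assumed default: no maximum (SIZE_MAX)
    /* .get_alloc_size = */ NULL, // assumed default: ggml_nbytes(tensor)
    /* .is_host        = */ example_buft_is_host,
};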
@@ -92,27 +94,37 @@ extern "C" {
         void (*GGML_CALL synchronize)(ggml_backend_t backend);
 
         // compute graph with a plan (not used currently)
+        // create a new plan for a graph
         ggml_backend_graph_plan_t (*GGML_CALL graph_plan_create) (ggml_backend_t backend, const struct ggml_cgraph * cgraph);
         void (*GGML_CALL graph_plan_free) (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
+        // update the plan with a new graph - this should be faster than creating a new plan when the graph has the same topology
+        void (*GGML_CALL graph_plan_update) (ggml_backend_t backend, ggml_backend_graph_plan_t plan, const struct ggml_cgraph * cgraph);
+        // compute the graph with the plan
+        enum ggml_status (*GGML_CALL graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
 
-        // compute graph with a plan
-        enum ggml_status (*GGML_CALL graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
         // compute graph without a plan (async)
         enum ggml_status (*GGML_CALL graph_compute) (ggml_backend_t backend, struct ggml_cgraph * cgraph);
 
-        // check if the backend supports an operation
+        // check if the backend can compute an operation
         bool (*GGML_CALL supports_op)(ggml_backend_t backend, const struct ggml_tensor * op);
 
+        // check if the backend can use tensors allocated in a buffer type
+        bool (*GGML_CALL supports_buft)(ggml_backend_t backend, ggml_backend_buffer_type_t buft);
+
         // check if the backend wants to run an operation, even if the weights are allocated in a CPU buffer
         // these should be expensive operations with large batch sizes that may benefit from running on this backend
         // even if the weight has to be copied from the CPU temporarily
         bool (*GGML_CALL offload_op)(ggml_backend_t backend, const struct ggml_tensor * op);
 
         // (optional) event synchronization
+        // create a new event that can record events on this backend instance
         ggml_backend_event_t (*GGML_CALL event_new) (ggml_backend_t backend);
         void (*GGML_CALL event_free) (ggml_backend_event_t event);
+        // record an event on the backend instance that created it
         void (*GGML_CALL event_record) (ggml_backend_event_t event);
+        // wait for an event on on a different backend instance
         void (*GGML_CALL event_wait) (ggml_backend_t backend, ggml_backend_event_t event);
+        // block until an event is recorded
         void (*GGML_CALL event_synchronize) (ggml_backend_event_t event);
     };
 
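The newly added supports_buft callback is how a backend declares which buffer types it can read tensors from directly; together with the removal of supports_backend from the buffer-type interface above, that decision now lives in the backend. A minimal sketch of the callback for a backend that works on host memory is shown below; the example_ name is illustrative, while ggml_backend_buft_is_host is the existing public helper from ggml-backend.h.

#include "ggml-backend-impl.h"

// Sketch of a supports_buft implementation for a host-memory backend
// (roughly what a CPU/BLAS-style backend would report).
static bool GGML_CALL example_backend_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
    (void) backend;
    // ggml_backend_buft_is_host() calls the buffer type's is_host callback
    return ggml_backend_buft_is_host(buft);
}

A real backend would install this function in its struct ggml_backend_i next to supports_op.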