llama_cpp 0.15.4 → 0.16.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +16 -0
- data/ext/llama_cpp/extconf.rb +3 -2
- data/ext/llama_cpp/llama_cpp.cpp +17 -3
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +15 -1
- data/vendor/tmp/llama.cpp/Makefile +166 -82
- data/vendor/tmp/llama.cpp/ggml-alloc.c +82 -26
- data/vendor/tmp/llama.cpp/ggml-backend-impl.h +20 -8
- data/vendor/tmp/llama.cpp/ggml-backend.c +183 -69
- data/vendor/tmp/llama.cpp/ggml-backend.h +4 -4
- data/vendor/tmp/llama.cpp/ggml-blas.cpp +363 -0
- data/vendor/tmp/llama.cpp/ggml-blas.h +23 -0
- data/vendor/tmp/llama.cpp/ggml-common.h +6 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/acc.cu +47 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/arange.cu +34 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/argsort.cu +104 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/binbcast.cu +280 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/clamp.cu +34 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/concat.cu +196 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/convert.cu +686 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/cpy.cu +490 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/diagmask.cu +40 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/dmmv.cu +674 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/fattn-tile-f16.cu +319 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/fattn-tile-f32.cu +312 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/fattn.cu +345 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/getrows.cu +178 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/im2col.cu +104 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/mmq.cu +88 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/mmvq.cu +419 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/norm.cu +221 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/pad.cu +49 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/pool2d.cu +94 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/quantize.cu +112 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/rope.cu +271 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/scale.cu +31 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/softmax.cu +206 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/sumrows.cu +40 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb16.cu +10 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb32.cu +9 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb16.cu +10 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb32.cu +10 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb8.cu +8 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q2_k.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q3_k.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_k.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_k.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q6_k.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/tsembd.cu +47 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/unary.cu +286 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/upscale.cu +51 -0
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +103 -135
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +29 -13
- data/vendor/tmp/llama.cpp/ggml-metal.h +1 -1
- data/vendor/tmp/llama.cpp/ggml-metal.m +45 -33
- data/vendor/tmp/llama.cpp/ggml-metal.metal +83 -59
- data/vendor/tmp/llama.cpp/ggml-rpc.cpp +15 -14
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +26 -90
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +74522 -14913
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +631 -471
- data/vendor/tmp/llama.cpp/ggml.c +278 -603
- data/vendor/tmp/llama.cpp/ggml.h +9 -28
- data/vendor/tmp/llama.cpp/llama.cpp +345 -473
- data/vendor/tmp/llama.cpp/llama.h +21 -43
- metadata +134 -7
- data/vendor/tmp/llama.cpp/ggml-mpi.c +0 -216
- data/vendor/tmp/llama.cpp/ggml-mpi.h +0 -39
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +0 -2305
- data/vendor/tmp/llama.cpp/ggml-opencl.h +0 -36
data/vendor/tmp/llama.cpp/ggml-alloc.c:

```diff
@@ -339,6 +339,7 @@ struct hash_node {
 };
 
 struct tensor_alloc {
+    int buffer_id;
     size_t offset;
     size_t size_max; // 0 = pre-allocated, unused, or view
 };
@@ -349,7 +350,6 @@ struct leaf_alloc {
 };
 
 struct node_alloc {
-    int buffer_id;
     struct tensor_alloc dst;
     struct tensor_alloc src[GGML_MAX_SRC];
 };
@@ -377,7 +377,7 @@ ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs
     galloc->bufts = calloc(n_bufs, sizeof(ggml_backend_buffer_type_t));
     GGML_ASSERT(galloc->bufts != NULL);
 
-    galloc->buffers = calloc(n_bufs, sizeof(ggml_backend_buffer_t) * n_bufs);
+    galloc->buffers = calloc(n_bufs, sizeof(ggml_backend_buffer_t));
     GGML_ASSERT(galloc->buffers != NULL);
 
     galloc->buf_tallocs = calloc(n_bufs, sizeof(struct ggml_dyn_tallocr *));
@@ -386,8 +386,19 @@ ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs
     for (int i = 0; i < n_bufs; i++) {
         galloc->bufts[i] = bufts[i];
         galloc->buffers[i] = NULL;
-        size_t alignment = ggml_backend_buft_get_alignment(bufts[i]);
-        galloc->buf_tallocs[i] = ggml_dyn_tallocr_new(alignment);
+
+        // check if the same buffer type is used multiple times and reuse the same allocator
+        for (int j = 0; j < i; j++) {
+            if (bufts[i] == bufts[j]) {
+                galloc->buf_tallocs[i] = galloc->buf_tallocs[j];
+                break;
+            }
+        }
+
+        if (galloc->buf_tallocs[i] == NULL) {
+            size_t alignment = ggml_backend_buft_get_alignment(bufts[i]);
+            galloc->buf_tallocs[i] = ggml_dyn_tallocr_new(alignment);
+        }
     }
     galloc->n_buffers = n_bufs;
 
@@ -405,10 +416,30 @@ void ggml_gallocr_free(ggml_gallocr_t galloc) {
 
     for (int i = 0; i < galloc->n_buffers; i++) {
         if (galloc->buffers != NULL) {
-            ggml_backend_buffer_free(galloc->buffers[i]);
+            // skip if already freed
+            bool freed = false;
+            for (int j = 0; j < i; j++) {
+                if (galloc->buffers[j] == galloc->buffers[i]) {
+                    freed = true;
+                    break;
+                }
+            }
+            if (!freed) {
+                ggml_backend_buffer_free(galloc->buffers[i]);
+            }
        }
        if (galloc->buf_tallocs != NULL) {
-            ggml_dyn_tallocr_free(galloc->buf_tallocs[i]);
+            // skip if already freed
+            bool freed = false;
+            for (int j = 0; j < i; j++) {
+                if (galloc->buf_tallocs[j] == galloc->buf_tallocs[i]) {
+                    freed = true;
+                    break;
+                }
+            }
+            if (!freed) {
+                ggml_dyn_tallocr_free(galloc->buf_tallocs[i]);
+            }
        }
    }
 
```
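Both reuse hunks above apply the same pattern: before creating or freeing a per-slot resource, earlier slots are scanned for an identical entry, so each distinct resource is created and freed exactly once. A minimal self-contained sketch of that pattern (the `resource` type and its helpers are hypothetical stand-ins for `ggml_dyn_tallocr`, not part of the diff):

```c
#include <stdbool.h>
#include <stdlib.h>

// Hypothetical stand-ins for ggml_dyn_tallocr_new / ggml_dyn_tallocr_free.
typedef struct { void * key; } resource;
static resource * resource_new (void * key) { resource * r = malloc(sizeof(*r)); r->key = key; return r; }
static void       resource_free(resource * r) { free(r); }

// Create one resource per distinct key; duplicate keys share the same pointer,
// mirroring how ggml_gallocr_new_n shares allocators between equal buffer types.
static void create_shared(void ** keys, resource ** out, int n) {
    for (int i = 0; i < n; i++) {
        out[i] = NULL;
        for (int j = 0; j < i; j++) {
            if (keys[i] == keys[j]) {
                out[i] = out[j];
                break;
            }
        }
        if (out[i] == NULL) {
            out[i] = resource_new(keys[i]);
        }
    }
}

// Free each distinct resource exactly once, mirroring ggml_gallocr_free.
static void free_shared(resource ** res, int n) {
    for (int i = 0; i < n; i++) {
        bool seen = false;
        for (int j = 0; j < i; j++) {
            if (res[j] == res[i]) {
                seen = true;
                break;
            }
        }
        if (!seen) {
            resource_free(res[i]);
        }
    }
}
```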
data/vendor/tmp/llama.cpp/ggml-alloc.c (continued):

```diff
@@ -511,17 +542,18 @@ static void ggml_gallocr_allocate_node(ggml_gallocr_t galloc, struct ggml_tensor
     }
 }
 
-static void ggml_gallocr_free_node(ggml_gallocr_t galloc, struct ggml_tensor * node, int buffer_id) {
+static void ggml_gallocr_free_node(ggml_gallocr_t galloc, struct ggml_tensor * node) {
     // graph outputs are never freed
     if (node->flags & GGML_TENSOR_FLAG_OUTPUT) {
         AT_PRINTF("not freeing output %s\n", node->name);
         return;
     }
 
-    struct ggml_dyn_tallocr * alloc = galloc->buf_tallocs[buffer_id];
-    ggml_backend_buffer_type_t buft = galloc->bufts[buffer_id];
     struct hash_node * hn = ggml_gallocr_hash_get(galloc, node);
     size_t offset = hn->offset;
+    int buffer_id = hn->buffer_id;
+    struct ggml_dyn_tallocr * alloc = galloc->buf_tallocs[buffer_id];
+    ggml_backend_buffer_type_t buft = galloc->bufts[buffer_id];
     size_t size = ggml_backend_buft_get_alloc_size(buft, node);
     ggml_dyn_tallocr_free_tensor(alloc, offset, size, node);
     hn->allocated = false;
@@ -626,11 +658,11 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
                 AT_PRINTF("view_src %s: %d children, %d views\n",
                     view_src->name, view_src_hn->n_children, view_src_hn->n_views);
                 if (view_src_hn->n_views == 0 && view_src_hn->n_children == 0 && view_src_hn->allocated) {
-                    ggml_gallocr_free_node(galloc, view_src, buffer_id);
+                    ggml_gallocr_free_node(galloc, view_src);
                 }
             }
             else if (p_hn->allocated) {
-                ggml_gallocr_free_node(galloc, parent, buffer_id);
+                ggml_gallocr_free_node(galloc, parent);
             }
         }
         AT_PRINTF("\n");
@@ -674,22 +706,25 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
     for (int i = 0; i < graph->n_nodes; i++) {
         struct ggml_tensor * node = graph->nodes[i];
         struct node_alloc * node_alloc = &galloc->node_allocs[i];
-        node_alloc->buffer_id = get_node_buffer_id(node_buffer_ids, i);
         if (node->view_src || node->data) {
+            node_alloc->dst.buffer_id = -1;
             node_alloc->dst.offset = SIZE_MAX;
             node_alloc->dst.size_max = 0;
         } else {
             struct hash_node * hn = ggml_gallocr_hash_get(galloc, node);
-            node_alloc->dst.offset   = hn->offset;
-            node_alloc->dst.size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], node);
+            node_alloc->dst.buffer_id = hn->buffer_id;
+            node_alloc->dst.offset    = hn->offset;
+            node_alloc->dst.size_max  = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], node);
         }
         for (int j = 0; j < GGML_MAX_SRC; j++) {
             struct ggml_tensor * src = node->src[j];
             if (!src || src->view_src || src->data) {
+                node_alloc->src[j].buffer_id = -1;
                 node_alloc->src[j].offset = SIZE_MAX;
                 node_alloc->src[j].size_max = 0;
             } else {
                 struct hash_node * hn = ggml_gallocr_hash_get(galloc, src);
+                node_alloc->src[j].buffer_id = hn->buffer_id;
                 node_alloc->src[j].offset = hn->offset;
                 node_alloc->src[j].size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], src);
             }
@@ -706,9 +741,11 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
         struct hash_node * hn = ggml_gallocr_hash_get(galloc, leaf);
         galloc->leaf_allocs[i].buffer_id = hn->buffer_id;
         if (leaf->view_src || leaf->data) {
+            galloc->leaf_allocs[i].leaf.buffer_id = -1;
             galloc->leaf_allocs[i].leaf.offset = SIZE_MAX;
             galloc->leaf_allocs[i].leaf.size_max = 0;
         } else {
+            galloc->leaf_allocs[i].leaf.buffer_id = hn->buffer_id;
             galloc->leaf_allocs[i].leaf.offset = hn->offset;
             galloc->leaf_allocs[i].leaf.size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], leaf);
         }
@@ -716,6 +753,14 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
 
     // reallocate buffers if needed
     for (int i = 0; i < galloc->n_buffers; i++) {
+        // if the buffer type is used multiple times, we reuse the same buffer
+        for (int j = 0; j < i; j++) {
+            if (galloc->buf_tallocs[j] == galloc->buf_tallocs[i]) {
+                galloc->buffers[i] = galloc->buffers[j];
+                break;
+            }
+        }
+
         size_t cur_size = galloc->buffers[i] ? ggml_backend_buffer_get_size(galloc->buffers[i]) : 0;
         size_t new_size = ggml_dyn_tallocr_max_size(galloc->buf_tallocs[i]);
 
@@ -724,6 +769,7 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
 #ifndef NDEBUG
             fprintf(stderr, "%s: reallocating %s buffer from size %.02f MiB to %.02f MiB\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), cur_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
 #endif
+
             ggml_backend_buffer_free(galloc->buffers[i]);
             galloc->buffers[i] = ggml_backend_buft_alloc_buffer(galloc->bufts[i], new_size);
             if (galloc->buffers[i] == NULL) {
@@ -740,7 +786,8 @@ bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph *graph) {
     return ggml_gallocr_reserve_n(galloc, graph, NULL, NULL);
 }
 
-static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor * tensor, int buffer_id, struct tensor_alloc * tensor_alloc) {
+static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor * tensor, struct tensor_alloc * tensor_alloc) {
+    int buffer_id = tensor_alloc->buffer_id;
     assert(tensor->data || tensor->view_src || ggml_backend_buffer_get_alloc_size(galloc->buffers[buffer_id], tensor) <= tensor_alloc->size_max);
 
     if (tensor->view_src != NULL) {
@@ -750,7 +797,7 @@ static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor *
                 // this tensor was allocated without ggml-backend
                 return;
             }
-            ggml_backend_view_init(galloc->buffers[buffer_id], tensor);
+            ggml_backend_view_init(tensor);
         }
     } else {
         if (tensor->data == NULL) {
@@ -768,8 +815,8 @@ static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor *
     }
 }
 
-static bool ggml_gallocr_node_needs_realloc(ggml_gallocr_t galloc, struct ggml_tensor * node, struct node_alloc * nalloc, struct tensor_alloc * talloc) {
-    ggml_backend_buffer_type_t buft = galloc->bufts[nalloc->buffer_id];
+static bool ggml_gallocr_node_needs_realloc(ggml_gallocr_t galloc, struct ggml_tensor * node, struct tensor_alloc * talloc) {
+    ggml_backend_buffer_type_t buft = talloc->buffer_id != -1 ? galloc->bufts[talloc->buffer_id] : NULL;
     size_t node_size = (node->data || node->view_src) ? 0 : ggml_backend_buft_get_alloc_size(buft, node);
     return talloc->size_max >= node_size;
 }
@@ -793,7 +840,7 @@ static bool ggml_gallocr_needs_realloc(ggml_gallocr_t galloc, struct ggml_cgraph
         struct ggml_tensor * node = graph->nodes[i];
         struct node_alloc * node_alloc = &galloc->node_allocs[i];
 
-        if (!ggml_gallocr_node_needs_realloc(galloc, node, node_alloc, &node_alloc->dst)) {
+        if (!ggml_gallocr_node_needs_realloc(galloc, node, &node_alloc->dst)) {
 #ifndef NDEBUG
             fprintf(stderr, "%s: node %s is not valid\n", __func__, node->name);
 #endif
@@ -805,7 +852,7 @@ static bool ggml_gallocr_needs_realloc(ggml_gallocr_t galloc, struct ggml_cgraph
             if (src == NULL) {
                 continue;
             }
-            if (!ggml_gallocr_node_needs_realloc(galloc, src, node_alloc, &node_alloc->src[j])) {
+            if (!ggml_gallocr_node_needs_realloc(galloc, src, &node_alloc->src[j])) {
 #ifndef NDEBUG
                 fprintf(stderr, "%s: src %d (%s) of node %s is not valid\n", __func__, j, src->name, node->name);
 #endif
@@ -846,7 +893,7 @@ bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph)
     for (int i = 0; i < graph->n_leafs; i++) {
         struct ggml_tensor * leaf = graph->leafs[i];
         struct leaf_alloc * leaf_alloc = &galloc->leaf_allocs[i];
-        ggml_gallocr_init_tensor(galloc, leaf, leaf_alloc->buffer_id, &leaf_alloc->leaf);
+        ggml_gallocr_init_tensor(galloc, leaf, &leaf_alloc->leaf);
     }
     // nodes
     for (int i = 0; i < graph->n_nodes; i++) {
@@ -857,9 +904,9 @@ bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph)
             if (src == NULL) {
                 continue;
             }
-            ggml_gallocr_init_tensor(galloc, src, node_alloc->buffer_id, &node_alloc->src[j]);
+            ggml_gallocr_init_tensor(galloc, src, &node_alloc->src[j]);
         }
-        ggml_gallocr_init_tensor(galloc, node, node_alloc->buffer_id, &node_alloc->dst);
+        ggml_gallocr_init_tensor(galloc, node, &node_alloc->dst);
     }
 
     return true;
```
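The net effect of the hunks above is that the buffer id now travels with each `tensor_alloc` instead of the enclosing `node_alloc`, with `buffer_id == -1` marking views and pre-allocated tensors that the allocator does not own. A hedged sketch of that convention (`tensor_alloc_required_size` is a hypothetical helper, not part of the diff):

```c
// Hypothetical helper mirroring the sentinel handling in
// ggml_gallocr_node_needs_realloc: a tensor_alloc with buffer_id == -1
// belongs to a view or pre-allocated tensor, so it has no buffer type and
// requires no space from the allocator.
static size_t tensor_alloc_required_size(ggml_gallocr_t galloc, struct ggml_tensor * t,
                                         const struct tensor_alloc * talloc) {
    if (talloc->buffer_id == -1) {
        return 0; // not owned by gallocr
    }
    return ggml_backend_buft_get_alloc_size(galloc->bufts[talloc->buffer_id], t);
}
```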
data/vendor/tmp/llama.cpp/ggml-alloc.c (continued):

```diff
@@ -871,6 +918,15 @@ size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id) {
     if (galloc->buffers[buffer_id] == NULL) {
         return 0;
     }
+
+    for (int i = 0; i < buffer_id; i++) {
+        if (galloc->buffers[i] == galloc->buffers[buffer_id]) {
+            // this buffer is the same as a previous one due to the same buffer type being used multiple times
+            // only return the buffer size the first time it appears to avoid double counting
+            return 0;
+        }
+    }
+
     return ggml_backend_buffer_get_size(galloc->buffers[buffer_id]);
 }
 
```
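With the guard above, summing `ggml_gallocr_get_buffer_size` over all buffer ids counts each shared buffer exactly once. A usage sketch, assuming two buffer-type handles `buft_a` and `buft_b` that may compare equal (both names are illustrative):

```c
// Total memory reserved by a gallocr created over two buffer type slots.
// If both slots resolve to the same buffer, it is counted only once.
ggml_backend_buffer_type_t bufts[2] = { buft_a, buft_b };
ggml_gallocr_t galloc = ggml_gallocr_new_n(bufts, 2);
// ... ggml_gallocr_reserve(galloc, graph) or ggml_gallocr_alloc_graph(galloc, graph) ...
size_t total = 0;
for (int i = 0; i < 2; i++) {
    total += ggml_gallocr_get_buffer_size(galloc, i);
}
```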
data/vendor/tmp/llama.cpp/ggml-alloc.c (continued):

```diff
@@ -886,7 +942,7 @@ static bool alloc_tensor_range(struct ggml_context * ctx,
         fprintf(stderr, "%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(buft), size);
 #endif
         for (size_t i = 0; i < *n_buffers; i++) {
-            ggml_backend_buffer_free(*buffers[i]);
+            ggml_backend_buffer_free((*buffers)[i]);
         }
         free(*buffers);
         return false;
@@ -899,12 +955,12 @@ static bool alloc_tensor_range(struct ggml_context * ctx,
             if (t->view_src == NULL) {
                 ggml_tallocr_alloc(&tallocr, t);
             } else if (t->buffer == NULL) {
-                ggml_backend_view_init(buffer, t);
+                ggml_backend_view_init(t);
             }
         } else {
             if (t->view_src != NULL && t->buffer == NULL) {
                 // view of a pre-allocated tensor
-                ggml_backend_view_init(buffer, t);
+                ggml_backend_view_init(t);
             }
         }
     }
```
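The first hunk above is an operator-precedence fix: with `ggml_backend_buffer_t ** buffers`, the subscript binds tighter than the dereference, so the old expression freed the wrong pointer.

```c
// *buffers[i]   parses as *(buffers[i]): indexes the outer pointer, then dereferences
// (*buffers)[i] dereferences first, then indexes the array that *buffers points to
```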
data/vendor/tmp/llama.cpp/ggml-backend-impl.h:

```diff
@@ -17,13 +17,15 @@ extern "C" {
 
     struct ggml_backend_buffer_type_i {
         const char *          (*GGML_CALL get_name)        (ggml_backend_buffer_type_t buft);
+        // allocate a buffer of this type
         ggml_backend_buffer_t (*GGML_CALL alloc_buffer)    (ggml_backend_buffer_type_t buft, size_t size);
-        size_t                (*GGML_CALL get_alignment)   (ggml_backend_buffer_type_t buft); // tensor alignment
-        size_t                (*GGML_CALL get_max_size)    (ggml_backend_buffer_type_t buft); // allocation max size
-        size_t                (*GGML_CALL get_alloc_size)  (ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor); // data size needed to allocate the tensor, including padding
-        bool                  (*GGML_CALL supports_backend)(ggml_backend_buffer_type_t buft, ggml_backend_t backend); // check if the buffer type is usable by the backend
+        // tensor alignment
+        size_t                (*GGML_CALL get_alignment)   (ggml_backend_buffer_type_t buft);
+        // max buffer size that can be allocated
+        size_t                (*GGML_CALL get_max_size)    (ggml_backend_buffer_type_t buft);
+        // data size needed to allocate the tensor, including padding
+        size_t                (*GGML_CALL get_alloc_size)  (ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor);
         // check if tensor data is in host memory
-        // should be equivalent to supports_backend(buft, ggml_backend_cpu_init())
         bool                  (*GGML_CALL is_host)         (ggml_backend_buffer_type_t buft);
     };
 
```
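`get_alloc_size` may return more than `ggml_nbytes(tensor)` when a backend pads allocations. A hypothetical implementation sketch (`my_buft_get_alloc_size` and the 64-byte alignment are illustrative; `ggml_nbytes` is the real ggml helper for a tensor's raw data size):

```c
static size_t my_buft_get_alloc_size(ggml_backend_buffer_type_t buft,
                                     const struct ggml_tensor * tensor) {
    (void) buft;                               // unused in this sketch
    const size_t align = 64;                   // assumed device alignment
    size_t size = ggml_nbytes(tensor);         // raw tensor data size
    return (size + align - 1) & ~(align - 1);  // round up to the alignment
}
```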
data/vendor/tmp/llama.cpp/ggml-backend-impl.h (continued):

```diff
@@ -92,27 +94,37 @@ extern "C" {
         void (*GGML_CALL synchronize)(ggml_backend_t backend);
 
         // compute graph with a plan (not used currently)
+        // create a new plan for a graph
         ggml_backend_graph_plan_t (*GGML_CALL graph_plan_create) (ggml_backend_t backend, const struct ggml_cgraph * cgraph);
         void                      (*GGML_CALL graph_plan_free)   (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
+        // update the plan with a new graph - this should be faster than creating a new plan when the graph has the same topology
+        void                      (*GGML_CALL graph_plan_update) (ggml_backend_t backend, ggml_backend_graph_plan_t plan, const struct ggml_cgraph * cgraph);
+        // compute the graph with the plan
+        enum ggml_status          (*GGML_CALL graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
 
-        // compute graph with a plan
-        enum ggml_status (*GGML_CALL graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
         // compute graph without a plan (async)
         enum ggml_status (*GGML_CALL graph_compute) (ggml_backend_t backend, struct ggml_cgraph * cgraph);
 
-        // check if the backend supports an operation
+        // check if the backend can compute an operation
         bool (*GGML_CALL supports_op)(ggml_backend_t backend, const struct ggml_tensor * op);
 
+        // check if the backend can use tensors allocated in a buffer type
+        bool (*GGML_CALL supports_buft)(ggml_backend_t backend, ggml_backend_buffer_type_t buft);
+
         // check if the backend wants to run an operation, even if the weights are allocated in a CPU buffer
         // these should be expensive operations with large batch sizes that may benefit from running on this backend
         // even if the weight has to be copied from the CPU temporarily
         bool (*GGML_CALL offload_op)(ggml_backend_t backend, const struct ggml_tensor * op);
 
         // (optional) event synchronization
+        // create a new event that can record events on this backend instance
         ggml_backend_event_t (*GGML_CALL event_new)         (ggml_backend_t backend);
         void                 (*GGML_CALL event_free)        (ggml_backend_event_t event);
+        // record an event on the backend instance that created it
         void                 (*GGML_CALL event_record)      (ggml_backend_event_t event);
+        // wait for an event on on a different backend instance
         void                 (*GGML_CALL event_wait)        (ggml_backend_t backend, ggml_backend_event_t event);
+        // block until an event is recorded
         void                 (*GGML_CALL event_synchronize) (ggml_backend_event_t event);
     };
 
```
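`supports_buft` lets the scheduler ask whether a backend can read tensors allocated through a given buffer type. A hedged sketch of a typical implementation for a CPU-like backend (`my_backend_supports_buft` is illustrative; `ggml_backend_buft_is_host` is the public helper from ggml-backend.h):

```c
// A CPU-like backend can use any buffer type whose tensors live in host memory.
static bool my_backend_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
    (void) backend;  // the decision depends only on the buffer type here
    return ggml_backend_buft_is_host(buft);
}
```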