llama_cpp 0.15.2 → 0.15.3
This diff shows the changes between package versions that have been publicly released to one of the supported registries, as they appear in those registries; it is provided for informational purposes only.
- checksums.yaml +4 -4
- data/CHANGELOG.md +8 -0
- data/ext/llama_cpp/llama_cpp.cpp +49 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +4 -0
- data/vendor/tmp/llama.cpp/Makefile +6 -17
- data/vendor/tmp/llama.cpp/ggml-common.h +0 -54
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +72 -30
- data/vendor/tmp/llama.cpp/ggml-cuda.h +1 -0
- data/vendor/tmp/llama.cpp/ggml-impl.h +40 -0
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +4 -0
- data/vendor/tmp/llama.cpp/ggml-metal.m +68 -70
- data/vendor/tmp/llama.cpp/ggml-metal.metal +24 -409
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +4 -1
- data/vendor/tmp/llama.cpp/ggml-quants.c +1879 -2450
- data/vendor/tmp/llama.cpp/ggml-rpc.cpp +176 -53
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +40 -500
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +9351 -5627
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +202 -225
- data/vendor/tmp/llama.cpp/ggml.c +376 -758
- data/vendor/tmp/llama.cpp/ggml.h +39 -27
- data/vendor/tmp/llama.cpp/llama.cpp +823 -593
- data/vendor/tmp/llama.cpp/llama.h +10 -3
- metadata +3 -3
--- a/data/vendor/tmp/llama.cpp/ggml-rpc.cpp
+++ b/data/vendor/tmp/llama.cpp/ggml-rpc.cpp
@@ -56,6 +56,7 @@ struct socket_t {
 };
 
 // ggml_tensor is serialized into rpc_tensor
+#pragma pack(push, 1)
 struct rpc_tensor {
     uint64_t id;
     uint32_t type;
@@ -71,6 +72,7 @@ struct rpc_tensor {
     uint64_t data;
     char name[GGML_MAX_NAME];
 };
+#pragma pack(pop)
 
 // RPC commands
 enum rpc_cmd {
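The two `#pragma pack` lines added above make `rpc_tensor` a padding-free, fixed-size record, so its raw bytes can be memcpy'd straight into an RPC payload and both peers agree on its size. A minimal stand-alone illustration of what packing changes; the structs and field names below are invented for the example, not taken from this package:

```cpp
#include <cstdint>

// Invented structs for illustration only.
struct unpacked_example {
    uint32_t type;   // 4 bytes
    uint64_t data;   // usually 8-byte aligned, so the compiler may insert 4 bytes of padding
};

#pragma pack(push, 1)
struct packed_example {
    uint32_t type;
    uint64_t data;
};
#pragma pack(pop)

// With pack(1) the size is exactly the sum of the fields on compilers that honor the pragma.
static_assert(sizeof(packed_example) == sizeof(uint32_t) + sizeof(uint64_t),
              "packed layout has no padding");
// sizeof(unpacked_example) is typically 16 on 64-bit targets, but it is ABI-dependent,
// which is exactly the ambiguity a wire format cannot tolerate.
```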
@@ -340,23 +342,6 @@ static rpc_tensor serialize_tensor(const ggml_tensor * tensor) {
     return result;
 }
 
-static ggml_tensor * deserialize_tensor(struct ggml_context * ctx, const rpc_tensor * tensor) {
-    ggml_tensor * result = ggml_new_tensor_4d(ctx, (ggml_type) tensor->type,
-        tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3]);
-    for (uint32_t i = 0; i < GGML_MAX_DIMS; i++) {
-        result->nb[i] = tensor->nb[i];
-    }
-    result->buffer = reinterpret_cast<ggml_backend_buffer_t>(tensor->buffer);
-    result->op = (ggml_op) tensor->op;
-    for (uint32_t i = 0; i < GGML_MAX_OP_PARAMS / sizeof(int32_t); i++) {
-        result->op_params[i] = tensor->op_params[i];
-    }
-    result->flags = tensor->flags;
-    result->data = reinterpret_cast<void *>(tensor->data);
-    ggml_set_name(result, tensor->name);
-    return result;
-}
-
 GGML_CALL static void ggml_backend_rpc_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
     UNUSED(buffer);
     if (ggml_is_quantized(tensor->type)) {
@@ -465,13 +450,15 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_rpc_buffer_type_alloc_buffer
     memcpy(&remote_ptr, output.data(), sizeof(remote_ptr));
     size_t remote_size;
     memcpy(&remote_size, output.data() + sizeof(uint64_t), sizeof(remote_size));
-
-
-
-
-
-
-
+    if (remote_ptr != 0) {
+        ggml_backend_buffer_t buffer = ggml_backend_buffer_init(buft,
+            ggml_backend_rpc_buffer_interface,
+            new ggml_backend_rpc_buffer_context{buft_ctx->sock, {}, remote_ptr, "RPC"},
+            remote_size);
+        return buffer;
+    } else {
+        return nullptr;
+    }
 }
 
 static size_t get_alignment(const std::shared_ptr<socket_t> & sock) {
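The rewritten client-side ALLOC_BUFFER path above now treats a zero `remote_ptr` as an allocation failure and returns `nullptr` instead of wrapping it in a buffer. A hedged sketch of how a caller could pack the request and unpack the reply, following only the byte layouts documented in the handler comments later in this diff; the helper names are hypothetical, not functions from the gem or from llama.cpp:

```cpp
#include <cstdint>
#include <cstring>
#include <vector>

// Hypothetical helpers that only mirror the documented byte layouts.
static std::vector<uint8_t> make_alloc_buffer_request(uint64_t size) {
    // request: | size (8 bytes) |
    std::vector<uint8_t> input(sizeof(uint64_t), 0);
    std::memcpy(input.data(), &size, sizeof(size));
    return input;
}

static bool parse_alloc_buffer_response(const std::vector<uint8_t> & output,
                                        uint64_t & remote_ptr, uint64_t & remote_size) {
    // reply: | remote_ptr (8 bytes) | remote_size (8 bytes) |
    if (output.size() != 2*sizeof(uint64_t)) {
        return false;
    }
    std::memcpy(&remote_ptr,  output.data(),                    sizeof(remote_ptr));
    std::memcpy(&remote_size, output.data() + sizeof(uint64_t), sizeof(remote_size));
    return remote_ptr != 0;   // 0 now signals that the server-side allocation failed
}
```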
@@ -658,7 +645,7 @@ GGML_CALL ggml_backend_t ggml_backend_rpc_init(const char * endpoint) {
         }
     }
 #endif
-
+    fprintf(stderr, "Connecting to %s\n", endpoint);
     std::string host;
     int port;
     if (!parse_endpoint(endpoint, host, port)) {
@@ -731,22 +718,61 @@ GGML_API GGML_CALL void ggml_backend_rpc_get_device_memory(const char * endpoint
 
 // RPC server-side implementation
 
-
+class rpc_server {
+public:
+    rpc_server(ggml_backend_t backend) : backend(backend) {}
+    ~rpc_server();
+
+    bool alloc_buffer(const std::vector<uint8_t> & input, std::vector<uint8_t> & output);
+    void get_alignment(std::vector<uint8_t> & output);
+    void get_max_size(std::vector<uint8_t> & output);
+    bool buffer_get_base(const std::vector<uint8_t> & input, std::vector<uint8_t> & output);
+    bool free_buffer(const std::vector<uint8_t> & input);
+    bool buffer_clear(const std::vector<uint8_t> & input);
+    bool set_tensor(const std::vector<uint8_t> & input);
+    bool get_tensor(const std::vector<uint8_t> & input, std::vector<uint8_t> & output);
+    bool copy_tensor(const std::vector<uint8_t> & input, std::vector<uint8_t> & output);
+    bool graph_compute(const std::vector<uint8_t> & input, std::vector<uint8_t> & output);
+
+private:
+    ggml_tensor * deserialize_tensor(struct ggml_context * ctx, const rpc_tensor * tensor);
+    ggml_tensor * create_node(uint64_t id,
+                              struct ggml_context * ctx,
+                              const std::unordered_map<uint64_t, const rpc_tensor*> & tensor_ptrs,
+                              std::unordered_map<uint64_t, struct ggml_tensor*> & tensor_map);
+
+
+    ggml_backend_t backend;
+    std::unordered_set<ggml_backend_buffer_t> buffers;
+};
+
+bool rpc_server::alloc_buffer(const std::vector<uint8_t> & input, std::vector<uint8_t> & output) {
     // input serialization format: | size (8 bytes) |
+    if (input.size() != sizeof(uint64_t)) {
+        return false;
+    }
     uint64_t size;
     memcpy(&size, input.data(), sizeof(size));
     ggml_backend_buffer_type_t buft = ggml_backend_get_default_buffer_type(backend);
     ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, size);
-    uint64_t remote_ptr =
-    uint64_t remote_size =
-
+    uint64_t remote_ptr = 0;
+    uint64_t remote_size = 0;
+    if (buffer != nullptr) {
+        remote_ptr = reinterpret_cast<uint64_t>(buffer);
+        remote_size = buffer->size;
+        GGML_PRINT_DEBUG("[%s] size: %" PRIu64 " -> remote_ptr: %" PRIx64 ", remote_size: %" PRIu64 "\n", __func__, size, remote_ptr, remote_size);
+        buffers.insert(buffer);
+    } else {
+        GGML_PRINT_DEBUG("[%s] size: %" PRIu64 " -> failed\n", __func__, size);
+    }
     // output serialization format: | remote_ptr (8 bytes) | remote_size (8 bytes) |
     output.resize(2*sizeof(uint64_t), 0);
     memcpy(output.data(), &remote_ptr, sizeof(remote_ptr));
     memcpy(output.data() + sizeof(uint64_t), &remote_size, sizeof(remote_size));
+    return true;
 }
 
-
+void rpc_server::get_alignment(std::vector<uint8_t> & output) {
     ggml_backend_buffer_type_t buft = ggml_backend_get_default_buffer_type(backend);
     size_t alignment = ggml_backend_buft_get_alignment(buft);
     GGML_PRINT_DEBUG("[%s] alignment: %lu\n", __func__, alignment);
@@ -755,7 +781,7 @@ static void rpc_get_alignment(ggml_backend_t backend, std::vector<uint8_t> & out
     memcpy(output.data(), &alignment, sizeof(alignment));
 }
 
-
+void rpc_server::get_max_size(std::vector<uint8_t> & output) {
     ggml_backend_buffer_type_t buft = ggml_backend_get_default_buffer_type(backend);
     size_t max_size = ggml_backend_buft_get_max_size(buft);
     GGML_PRINT_DEBUG("[%s] max_size: %lu\n", __func__, max_size);
@@ -764,41 +790,90 @@ static void rpc_get_max_size(ggml_backend_t backend, std::vector<uint8_t> & outp
     memcpy(output.data(), &max_size, sizeof(max_size));
 }
 
-
+bool rpc_server::buffer_get_base(const std::vector<uint8_t> & input, std::vector<uint8_t> & output) {
     // input serialization format: | remote_ptr (8 bytes) |
+    if (input.size() != sizeof(uint64_t)) {
+        return false;
+    }
     uint64_t remote_ptr;
     memcpy(&remote_ptr, input.data(), sizeof(remote_ptr));
     GGML_PRINT_DEBUG("[%s] remote_ptr: %" PRIx64 "\n", __func__, remote_ptr);
     ggml_backend_buffer_t buffer = reinterpret_cast<ggml_backend_buffer_t>(remote_ptr);
+    if (buffers.find(buffer) == buffers.end()) {
+        GGML_PRINT_DEBUG("[%s] buffer not found\n", __func__);
+        return false;
+    }
     void * base = ggml_backend_buffer_get_base(buffer);
     // output serialization format: | base_ptr (8 bytes) |
     uint64_t base_ptr = reinterpret_cast<uint64_t>(base);
     output.resize(sizeof(uint64_t), 0);
     memcpy(output.data(), &base_ptr, sizeof(base_ptr));
+    return true;
 }
 
-
+bool rpc_server::free_buffer(const std::vector<uint8_t> & input) {
     // input serialization format: | remote_ptr (8 bytes) |
+    if (input.size() != sizeof(uint64_t)) {
+        return false;
+    }
     uint64_t remote_ptr;
     memcpy(&remote_ptr, input.data(), sizeof(remote_ptr));
     GGML_PRINT_DEBUG("[%s] remote_ptr: %" PRIx64 "\n", __func__, remote_ptr);
     ggml_backend_buffer_t buffer = reinterpret_cast<ggml_backend_buffer_t>(remote_ptr);
+    if (buffers.find(buffer) == buffers.end()) {
+        GGML_PRINT_DEBUG("[%s] buffer not found\n", __func__);
+        return false;
+    }
     ggml_backend_buffer_free(buffer);
+    buffers.erase(buffer);
+    return true;
 }
 
-
+bool rpc_server::buffer_clear(const std::vector<uint8_t> & input) {
     // input serialization format: | remote_ptr (8 bytes) | value (1 byte) |
+    if (input.size() != sizeof(uint64_t) + sizeof(uint8_t)) {
+        return false;
+    }
     uint64_t remote_ptr;
     memcpy(&remote_ptr, input.data(), sizeof(remote_ptr));
     uint8_t value;
     memcpy(&value, input.data() + sizeof(uint64_t), sizeof(value));
     GGML_PRINT_DEBUG("[%s] remote_ptr: %" PRIx64 ", value: %u\n", __func__, remote_ptr, value);
     ggml_backend_buffer_t buffer = reinterpret_cast<ggml_backend_buffer_t>(remote_ptr);
+    if (buffers.find(buffer) == buffers.end()) {
+        GGML_PRINT_DEBUG("[%s] buffer not found\n", __func__);
+        return false;
+    }
     ggml_backend_buffer_clear(buffer, value);
+    return true;
 }
 
-
+ggml_tensor * rpc_server::deserialize_tensor(struct ggml_context * ctx, const rpc_tensor * tensor) {
+    ggml_tensor * result = ggml_new_tensor_4d(ctx, (ggml_type) tensor->type,
+        tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3]);
+    for (uint32_t i = 0; i < GGML_MAX_DIMS; i++) {
+        result->nb[i] = tensor->nb[i];
+    }
+    result->buffer = reinterpret_cast<ggml_backend_buffer_t>(tensor->buffer);
+    if (result->buffer && buffers.find(result->buffer) == buffers.end()) {
+        return nullptr;
+    }
+    result->op = (ggml_op) tensor->op;
+    for (uint32_t i = 0; i < GGML_MAX_OP_PARAMS / sizeof(int32_t); i++) {
+        result->op_params[i] = tensor->op_params[i];
+    }
+    result->flags = tensor->flags;
+    result->data = reinterpret_cast<void *>(tensor->data);
+    ggml_set_name(result, tensor->name);
+    return result;
+}
+
+
+bool rpc_server::set_tensor(const std::vector<uint8_t> & input) {
     // serialization format: | rpc_tensor | offset (8 bytes) | data (size bytes) |
+    if (input.size() < sizeof(rpc_tensor) + sizeof(uint64_t)) {
+        return false;
+    }
     const rpc_tensor * in_tensor = (const rpc_tensor *)input.data();
     uint64_t offset;
     memcpy(&offset, input.data() + sizeof(rpc_tensor), sizeof(offset));
@@ -811,14 +886,23 @@ static void rpc_set_tensor(const std::vector<uint8_t> & input) {
     };
     struct ggml_context * ctx = ggml_init(params);
     ggml_tensor * tensor = deserialize_tensor(ctx, in_tensor);
+    if (tensor == nullptr) {
+        GGML_PRINT_DEBUG("[%s] error deserializing tensor\n", __func__);
+        ggml_free(ctx);
+        return false;
+    }
     GGML_PRINT_DEBUG("[%s] buffer: %p, data: %p, offset: %" PRIu64 ", size: %zu\n", __func__, (void*)tensor->buffer, tensor->data, offset, size);
     const void * data = input.data() + sizeof(rpc_tensor) + sizeof(offset);
     ggml_backend_tensor_set(tensor, data, offset, size);
     ggml_free(ctx);
+    return true;
 }
 
-
+bool rpc_server::get_tensor(const std::vector<uint8_t> & input, std::vector<uint8_t> & output) {
     // serialization format: | rpc_tensor | offset (8 bytes) | size (8 bytes) |
+    if (input.size() != sizeof(rpc_tensor) + 2*sizeof(uint64_t)) {
+        return false;
+    }
     const rpc_tensor * in_tensor = (const rpc_tensor *)input.data();
     uint64_t offset;
     memcpy(&offset, input.data() + sizeof(rpc_tensor), sizeof(offset));
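The new size checks in `set_tensor` and `get_tensor` fall straight out of the documented layouts: a packed `rpc_tensor` descriptor, an 8-byte offset, then either inline data (SET_TENSOR) or an 8-byte length (GET_TENSOR). A hypothetical client-side packing routine for SET_TENSOR written against that layout only; it is templated so it does not depend on the real `rpc_tensor` definition, and it is not code from the gem:

```cpp
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <vector>

// Mirrors | rpc_tensor | offset (8 bytes) | data (size bytes) |.
// RpcTensor stands in for the packed descriptor defined earlier in the diff.
template <typename RpcTensor>
static std::vector<uint8_t> make_set_tensor_request(const RpcTensor & desc,
                                                    uint64_t offset,
                                                    const void * data, size_t size) {
    std::vector<uint8_t> input(sizeof(RpcTensor) + sizeof(uint64_t) + size);
    std::memcpy(input.data(), &desc, sizeof(RpcTensor));
    std::memcpy(input.data() + sizeof(RpcTensor), &offset, sizeof(offset));
    std::memcpy(input.data() + sizeof(RpcTensor) + sizeof(offset), data, size);
    return input;   // the server recovers 'size' as input.size() - sizeof(rpc_tensor) - 8
}
```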
@@ -832,15 +916,24 @@ static void rpc_get_tensor(const std::vector<uint8_t> & input, std::vector<uint8
     };
     struct ggml_context * ctx = ggml_init(params);
     ggml_tensor * tensor = deserialize_tensor(ctx, in_tensor);
+    if (tensor == nullptr) {
+        GGML_PRINT_DEBUG("[%s] error deserializing tensor\n", __func__);
+        ggml_free(ctx);
+        return false;
+    }
     GGML_PRINT_DEBUG("[%s] buffer: %p, data: %p, offset: %" PRIu64 ", size: %" PRIu64 "\n", __func__, (void*)tensor->buffer, tensor->data, offset, size);
     // output serialization format: | data (size bytes) |
     output.resize(size, 0);
     ggml_backend_tensor_get(tensor, output.data(), offset, size);
     ggml_free(ctx);
+    return true;
 }
 
-
+bool rpc_server::copy_tensor(const std::vector<uint8_t> & input, std::vector<uint8_t> & output) {
     // serialization format: | rpc_tensor src | rpc_tensor dst |
+    if (input.size() != 2*sizeof(rpc_tensor)) {
+        return false;
+    }
     const rpc_tensor * rpc_src = (const rpc_tensor *)input.data();
     const rpc_tensor * rpc_dst = (const rpc_tensor *)(input.data() + sizeof(rpc_src));
 
@@ -852,18 +945,24 @@ static void rpc_copy_tensor(const std::vector<uint8_t> & input, std::vector<uint
     struct ggml_context * ctx = ggml_init(params);
     ggml_tensor * src = deserialize_tensor(ctx, rpc_src);
     ggml_tensor * dst = deserialize_tensor(ctx, rpc_dst);
+    if (src == nullptr || dst == nullptr) {
+        GGML_PRINT_DEBUG("[%s] error deserializing tensors\n", __func__);
+        ggml_free(ctx);
+        return false;
+    }
     GGML_PRINT_DEBUG("[%s] src->buffer: %p, dst->buffer: %p\n", __func__, (void*)src->buffer, (void*)dst->buffer);
     bool result = ggml_backend_buffer_copy_tensor(src, dst);
     // output serialization format: | result (1 byte) |
     output.resize(1, 0);
     output[0] = result;
     ggml_free(ctx);
+    return true;
 }
 
-
-
-
-
+ggml_tensor * rpc_server::create_node(uint64_t id,
+                                      struct ggml_context * ctx,
+                                      const std::unordered_map<uint64_t, const rpc_tensor*> & tensor_ptrs,
+                                      std::unordered_map<uint64_t, struct ggml_tensor*> & tensor_map) {
     if (id == 0) {
         return nullptr;
     }
@@ -872,6 +971,9 @@ static struct ggml_tensor * create_node(uint64_t id,
     }
     const rpc_tensor * tensor = tensor_ptrs.at(id);
     struct ggml_tensor * result = deserialize_tensor(ctx, tensor);
+    if (result == nullptr) {
+        return nullptr;
+    }
     tensor_map[id] = result;
     for (int i = 0; i < GGML_MAX_SRC; i++) {
         result->src[i] = create_node(tensor->src[i], ctx, tensor_ptrs, tensor_map);
@@ -881,14 +983,23 @@ static struct ggml_tensor * create_node(uint64_t id,
     return result;
 }
 
-
+bool rpc_server::graph_compute(const std::vector<uint8_t> & input, std::vector<uint8_t> & output) {
     // serialization format:
     // | n_nodes (4 bytes) | nodes (n_nodes * sizeof(uint64_t) | n_tensors (4 bytes) | tensors (n_tensors * sizeof(rpc_tensor)) |
+    if (input.size() < sizeof(uint32_t)) {
+        return false;
+    }
     uint32_t n_nodes;
     memcpy(&n_nodes, input.data(), sizeof(n_nodes));
+    if (input.size() < sizeof(uint32_t) + n_nodes*sizeof(uint64_t) + sizeof(uint32_t)) {
+        return false;
+    }
     const uint64_t * nodes = (const uint64_t *)(input.data() + sizeof(n_nodes));
     uint32_t n_tensors;
     memcpy(&n_tensors, input.data() + sizeof(n_nodes) + n_nodes*sizeof(uint64_t), sizeof(n_tensors));
+    if (input.size() < sizeof(uint32_t) + n_nodes*sizeof(uint64_t) + sizeof(uint32_t) + n_tensors*sizeof(rpc_tensor)) {
+        return false;
+    }
     const rpc_tensor * tensors = (const rpc_tensor *)(input.data() + sizeof(n_nodes) + n_nodes*sizeof(uint64_t) + sizeof(n_tensors));
     GGML_PRINT_DEBUG("[%s] n_nodes: %u, n_tensors: %u\n", __func__, n_nodes, n_tensors);
 
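The three added length checks guard the GRAPH_COMPUTE payload whose layout the comment spells out. Restated as a stand-alone predicate under the same assumptions; the function below is illustrative, not part of the gem:

```cpp
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <vector>

// Same bounds checks as rpc_server::graph_compute, assuming the payload layout
// | n_nodes (4 bytes) | nodes (n_nodes * 8 bytes) | n_tensors (4 bytes) | tensors (n_tensors * sizeof(rpc_tensor)) |
static bool graph_compute_payload_is_well_formed(const std::vector<uint8_t> & input,
                                                 size_t sizeof_rpc_tensor) {
    if (input.size() < sizeof(uint32_t)) {
        return false;                                 // too short to even hold n_nodes
    }
    uint32_t n_nodes;
    std::memcpy(&n_nodes, input.data(), sizeof(n_nodes));
    const size_t header = sizeof(uint32_t) + (size_t) n_nodes*sizeof(uint64_t) + sizeof(uint32_t);
    if (input.size() < header) {
        return false;                                 // node ids or n_tensors truncated
    }
    uint32_t n_tensors;
    std::memcpy(&n_tensors, input.data() + sizeof(uint32_t) + (size_t) n_nodes*sizeof(uint64_t), sizeof(n_tensors));
    if (input.size() < header + (size_t) n_tensors*sizeof_rpc_tensor) {
        return false;                                 // tensor table truncated
    }
    return true;
}
```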
@@ -914,9 +1025,17 @@ static void rpc_graph_compute(ggml_backend_t backend, const std::vector<uint8_t>
     output.resize(1, 0);
     output[0] = status;
     ggml_free(ctx);
+    return true;
+}
+
+rpc_server::~rpc_server() {
+    for (auto buffer : buffers) {
+        ggml_backend_buffer_free(buffer);
+    }
 }
 
 static void rpc_serve_client(ggml_backend_t backend, sockfd_t sockfd, size_t free_mem, size_t total_mem) {
+    rpc_server server(backend);
     while (true) {
         uint8_t cmd;
         if (!recv_data(sockfd, &cmd, 1)) {
@@ -932,45 +1051,46 @@ static void rpc_serve_client(ggml_backend_t backend, sockfd_t sockfd, size_t fre
         if (!recv_data(sockfd, input.data(), input_size)) {
             break;
         }
+        bool ok = true;
         switch (cmd) {
             case ALLOC_BUFFER: {
-
+                ok = server.alloc_buffer(input, output);
                 break;
             }
             case GET_ALIGNMENT: {
-
+                server.get_alignment(output);
                 break;
             }
             case GET_MAX_SIZE: {
-
+                server.get_max_size(output);
                 break;
             }
             case BUFFER_GET_BASE: {
-
+                ok = server.buffer_get_base(input, output);
                 break;
             }
             case FREE_BUFFER: {
-
+                ok = server.free_buffer(input);
                 break;
             }
             case BUFFER_CLEAR: {
-
+                ok = server.buffer_clear(input);
                 break;
             }
             case SET_TENSOR: {
-
+                ok = server.set_tensor(input);
                 break;
            }
             case GET_TENSOR: {
-
+                ok = server.get_tensor(input, output);
                 break;
             }
             case COPY_TENSOR: {
-
+                ok = server.copy_tensor(input, output);
                 break;
             }
             case GRAPH_COMPUTE: {
-
+                ok = server.graph_compute(input, output);
                 break;
             }
             case GET_DEVICE_MEMORY: {
@@ -982,9 +1102,12 @@ static void rpc_serve_client(ggml_backend_t backend, sockfd_t sockfd, size_t fre
             }
             default: {
                 fprintf(stderr, "Unknown command: %d\n", cmd);
-
+                ok = false;
             }
         }
+        if (!ok) {
+            break;
+        }
         uint64_t output_size = output.size();
         if (!send_data(sockfd, &output_size, sizeof(output_size))) {
             break;
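Taken together, the server loop above reads one command byte and a length-prefixed request, dispatches it through `rpc_server`, and writes back a length-prefixed reply; after this change a malformed request makes the server break out of the loop and drop the connection rather than answer. A hedged client-side sketch of that round trip, assuming the request length prefix is an 8-byte value like the `output_size` shown above; `send_all`/`recv_all` are assumed wrappers that loop until the full byte count is transferred, not functions from the gem:

```cpp
#include <cstddef>
#include <cstdint>
#include <vector>

// Hypothetical client-side round trip over an already-connected socket.
static bool rpc_round_trip(int sockfd, uint8_t cmd,
                           const std::vector<uint8_t> & input, std::vector<uint8_t> & output,
                           bool (*send_all)(int, const void *, size_t),
                           bool (*recv_all)(int, void *, size_t)) {
    uint64_t input_size = input.size();
    if (!send_all(sockfd, &cmd, 1))                           return false;  // command byte
    if (!send_all(sockfd, &input_size, sizeof(input_size)))   return false;  // request length
    if (!send_all(sockfd, input.data(), input.size()))        return false;  // request payload

    uint64_t output_size = 0;
    if (!recv_all(sockfd, &output_size, sizeof(output_size))) return false;  // reply length
    output.resize((size_t) output_size);
    if (!recv_all(sockfd, output.data(), output.size()))      return false;  // reply payload
    return true;   // a request the server rejects now surfaces here as a closed connection
}
```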