@fugood/llama.node 0.2.1 → 0.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/darwin/arm64/default.metallib +0 -0
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/default.metallib +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/package.json +1 -1
- package/src/LlamaContext.cpp +2 -2
- package/src/llama.cpp/CMakeLists.txt +72 -46
- package/src/llama.cpp/cmake/arm64-windows-llvm.cmake +16 -0
- package/src/llama.cpp/cmake/arm64-windows-msvc.cmake +6 -0
- package/src/llama.cpp/common/common.cpp +732 -752
- package/src/llama.cpp/common/common.h +47 -41
- package/src/llama.cpp/common/grammar-parser.cpp +1 -1
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +6 -6
- package/src/llama.cpp/common/log.h +5 -5
- package/src/llama.cpp/common/sampling.cpp +89 -7
- package/src/llama.cpp/common/sampling.h +5 -0
- package/src/llama.cpp/common/train.cpp +2 -2
- package/src/llama.cpp/examples/batched/batched.cpp +1 -1
- package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +1 -1
- package/src/llama.cpp/examples/embedding/embedding.cpp +3 -2
- package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +2 -2
- package/src/llama.cpp/examples/finetune/finetune.cpp +4 -3
- package/src/llama.cpp/examples/imatrix/imatrix.cpp +2 -2
- package/src/llama.cpp/examples/infill/infill.cpp +8 -8
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +2 -2
- package/src/llama.cpp/examples/llama.android/llama/CMakeLists.txt +13 -8
- package/src/llama.cpp/examples/llava/clip.h +1 -1
- package/src/llama.cpp/examples/llava/llava-cli.cpp +1 -1
- package/src/llama.cpp/examples/llava/llava.cpp +0 -15
- package/src/llama.cpp/examples/lookahead/lookahead.cpp +1 -1
- package/src/llama.cpp/examples/lookup/lookup.cpp +1 -1
- package/src/llama.cpp/examples/main/main.cpp +24 -16
- package/src/llama.cpp/examples/parallel/parallel.cpp +1 -1
- package/src/llama.cpp/examples/perplexity/perplexity.cpp +9 -9
- package/src/llama.cpp/examples/quantize/quantize.cpp +2 -2
- package/src/llama.cpp/examples/retrieval/retrieval.cpp +2 -2
- package/src/llama.cpp/examples/rpc/rpc-server.cpp +78 -14
- package/src/llama.cpp/examples/server/server.cpp +21 -9
- package/src/llama.cpp/examples/tokenize/tokenize.cpp +359 -9
- package/src/llama.cpp/examples/train-text-from-scratch/train-text-from-scratch.cpp +4 -3
- package/src/llama.cpp/ggml-backend.c +0 -1
- package/src/llama.cpp/ggml-common.h +0 -54
- package/src/llama.cpp/ggml-cuda.h +1 -0
- package/src/llama.cpp/ggml-impl.h +51 -0
- package/src/llama.cpp/ggml-kompute.cpp +4 -0
- package/src/llama.cpp/ggml-opencl.cpp +4 -1
- package/src/llama.cpp/ggml-quants.c +3700 -2041
- package/src/llama.cpp/ggml-rpc.cpp +188 -56
- package/src/llama.cpp/ggml-sycl.cpp +99 -530
- package/src/llama.cpp/ggml-vulkan-shaders.hpp +9351 -5627
- package/src/llama.cpp/ggml-vulkan.cpp +202 -225
- package/src/llama.cpp/ggml.c +1034 -1154
- package/src/llama.cpp/ggml.h +59 -31
- package/src/llama.cpp/llama.cpp +859 -609
- package/src/llama.cpp/llama.h +19 -6
- package/src/llama.cpp/requirements.txt +0 -1
- package/src/llama.cpp/tests/test-backend-ops.cpp +113 -47
- package/src/llama.cpp/tests/test-chat-template.cpp +16 -4
- package/src/llama.cpp/tests/test-grad0.cpp +43 -83
- package/src/llama.cpp/unicode-data.cpp +6969 -2169
- package/src/llama.cpp/unicode-data.h +15 -12
- package/src/llama.cpp/unicode.cpp +89 -111
- package/src/llama.cpp/unicode.h +44 -12
- package/src/llama.cpp/build.zig +0 -172
- package/src/llama.cpp/ggml-mpi.c +0 -216
- package/src/llama.cpp/ggml-mpi.h +0 -39
- package/src/llama.cpp/requirements/requirements-convert-persimmon-to-gguf.txt +0 -2
|
@@ -28,7 +28,7 @@
|
|
|
28
28
|
|
|
29
29
|
#define UNUSED GGML_UNUSED
|
|
30
30
|
|
|
31
|
-
#define GGML_DEBUG
|
|
31
|
+
#define GGML_DEBUG 0
|
|
32
32
|
#if (GGML_DEBUG >= 1)
|
|
33
33
|
#define GGML_PRINT_DEBUG(...) printf(__VA_ARGS__)
|
|
34
34
|
#else
|
|
@@ -56,6 +56,7 @@ struct socket_t {
|
|
|
56
56
|
};
|
|
57
57
|
|
|
58
58
|
// ggml_tensor is serialized into rpc_tensor
|
|
59
|
+
#pragma pack(push, 1)
|
|
59
60
|
struct rpc_tensor {
|
|
60
61
|
uint64_t id;
|
|
61
62
|
uint32_t type;
|
|
@@ -71,6 +72,7 @@ struct rpc_tensor {
|
|
|
71
72
|
uint64_t data;
|
|
72
73
|
char name[GGML_MAX_NAME];
|
|
73
74
|
};
|
|
75
|
+
#pragma pack(pop)
|
|
74
76
|
|
|
75
77
|
// RPC commands
|
|
76
78
|
enum rpc_cmd {
|
|
@@ -134,7 +136,13 @@ static bool set_no_delay(sockfd_t sockfd) {
|
|
|
134
136
|
int flag = 1;
|
|
135
137
|
// set TCP_NODELAY to disable Nagle's algorithm
|
|
136
138
|
int ret = setsockopt(sockfd, IPPROTO_TCP, TCP_NODELAY, (char *)&flag, sizeof(int));
|
|
137
|
-
return ret
|
|
139
|
+
return ret == 0;
|
|
140
|
+
}
|
|
141
|
+
|
|
142
|
+
static bool set_reuse_addr(sockfd_t sockfd) {
|
|
143
|
+
int flag = 1;
|
|
144
|
+
int ret = setsockopt(sockfd, SOL_SOCKET, SO_REUSEADDR, (char *)&flag, sizeof(int));
|
|
145
|
+
return ret == 0;
|
|
138
146
|
}
|
|
139
147
|
|
|
140
148
|
static std::shared_ptr<socket_t> socket_connect(const char * host, int port) {
|
|
@@ -181,7 +189,10 @@ static std::shared_ptr<socket_t> create_server_socket(const char * host, int por
|
|
|
181
189
|
if (sock == nullptr) {
|
|
182
190
|
return nullptr;
|
|
183
191
|
}
|
|
184
|
-
|
|
192
|
+
if (!set_reuse_addr(sockfd)) {
|
|
193
|
+
fprintf(stderr, "Failed to set SO_REUSEADDR\n");
|
|
194
|
+
return nullptr;
|
|
195
|
+
}
|
|
185
196
|
struct sockaddr_in serv_addr;
|
|
186
197
|
serv_addr.sin_family = AF_INET;
|
|
187
198
|
serv_addr.sin_addr.s_addr = inet_addr(host);
|
|
@@ -331,23 +342,6 @@ static rpc_tensor serialize_tensor(const ggml_tensor * tensor) {
|
|
|
331
342
|
return result;
|
|
332
343
|
}
|
|
333
344
|
|
|
334
|
-
static ggml_tensor * deserialize_tensor(struct ggml_context * ctx, const rpc_tensor * tensor) {
|
|
335
|
-
ggml_tensor * result = ggml_new_tensor_4d(ctx, (ggml_type) tensor->type,
|
|
336
|
-
tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3]);
|
|
337
|
-
for (uint32_t i = 0; i < GGML_MAX_DIMS; i++) {
|
|
338
|
-
result->nb[i] = tensor->nb[i];
|
|
339
|
-
}
|
|
340
|
-
result->buffer = reinterpret_cast<ggml_backend_buffer_t>(tensor->buffer);
|
|
341
|
-
result->op = (ggml_op) tensor->op;
|
|
342
|
-
for (uint32_t i = 0; i < GGML_MAX_OP_PARAMS / sizeof(int32_t); i++) {
|
|
343
|
-
result->op_params[i] = tensor->op_params[i];
|
|
344
|
-
}
|
|
345
|
-
result->flags = tensor->flags;
|
|
346
|
-
result->data = reinterpret_cast<void *>(tensor->data);
|
|
347
|
-
ggml_set_name(result, tensor->name);
|
|
348
|
-
return result;
|
|
349
|
-
}
|
|
350
|
-
|
|
351
345
|
GGML_CALL static void ggml_backend_rpc_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
|
|
352
346
|
UNUSED(buffer);
|
|
353
347
|
if (ggml_is_quantized(tensor->type)) {
|
|
@@ -456,13 +450,15 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_rpc_buffer_type_alloc_buffer
|
|
|
456
450
|
memcpy(&remote_ptr, output.data(), sizeof(remote_ptr));
|
|
457
451
|
size_t remote_size;
|
|
458
452
|
memcpy(&remote_size, output.data() + sizeof(uint64_t), sizeof(remote_size));
|
|
459
|
-
|
|
460
|
-
|
|
461
|
-
|
|
462
|
-
|
|
463
|
-
|
|
464
|
-
|
|
465
|
-
|
|
453
|
+
if (remote_ptr != 0) {
|
|
454
|
+
ggml_backend_buffer_t buffer = ggml_backend_buffer_init(buft,
|
|
455
|
+
ggml_backend_rpc_buffer_interface,
|
|
456
|
+
new ggml_backend_rpc_buffer_context{buft_ctx->sock, {}, remote_ptr, "RPC"},
|
|
457
|
+
remote_size);
|
|
458
|
+
return buffer;
|
|
459
|
+
} else {
|
|
460
|
+
return nullptr;
|
|
461
|
+
}
|
|
466
462
|
}
|
|
467
463
|
|
|
468
464
|
static size_t get_alignment(const std::shared_ptr<socket_t> & sock) {
|
|
@@ -649,7 +645,7 @@ GGML_CALL ggml_backend_t ggml_backend_rpc_init(const char * endpoint) {
|
|
|
649
645
|
}
|
|
650
646
|
}
|
|
651
647
|
#endif
|
|
652
|
-
|
|
648
|
+
fprintf(stderr, "Connecting to %s\n", endpoint);
|
|
653
649
|
std::string host;
|
|
654
650
|
int port;
|
|
655
651
|
if (!parse_endpoint(endpoint, host, port)) {
|
|
@@ -722,22 +718,61 @@ GGML_API GGML_CALL void ggml_backend_rpc_get_device_memory(const char * endpoint
|
|
|
722
718
|
|
|
723
719
|
// RPC server-side implementation
|
|
724
720
|
|
|
725
|
-
|
|
721
|
+
class rpc_server {
|
|
722
|
+
public:
|
|
723
|
+
rpc_server(ggml_backend_t backend) : backend(backend) {}
|
|
724
|
+
~rpc_server();
|
|
725
|
+
|
|
726
|
+
bool alloc_buffer(const std::vector<uint8_t> & input, std::vector<uint8_t> & output);
|
|
727
|
+
void get_alignment(std::vector<uint8_t> & output);
|
|
728
|
+
void get_max_size(std::vector<uint8_t> & output);
|
|
729
|
+
bool buffer_get_base(const std::vector<uint8_t> & input, std::vector<uint8_t> & output);
|
|
730
|
+
bool free_buffer(const std::vector<uint8_t> & input);
|
|
731
|
+
bool buffer_clear(const std::vector<uint8_t> & input);
|
|
732
|
+
bool set_tensor(const std::vector<uint8_t> & input);
|
|
733
|
+
bool get_tensor(const std::vector<uint8_t> & input, std::vector<uint8_t> & output);
|
|
734
|
+
bool copy_tensor(const std::vector<uint8_t> & input, std::vector<uint8_t> & output);
|
|
735
|
+
bool graph_compute(const std::vector<uint8_t> & input, std::vector<uint8_t> & output);
|
|
736
|
+
|
|
737
|
+
private:
|
|
738
|
+
ggml_tensor * deserialize_tensor(struct ggml_context * ctx, const rpc_tensor * tensor);
|
|
739
|
+
ggml_tensor * create_node(uint64_t id,
|
|
740
|
+
struct ggml_context * ctx,
|
|
741
|
+
const std::unordered_map<uint64_t, const rpc_tensor*> & tensor_ptrs,
|
|
742
|
+
std::unordered_map<uint64_t, struct ggml_tensor*> & tensor_map);
|
|
743
|
+
|
|
744
|
+
|
|
745
|
+
ggml_backend_t backend;
|
|
746
|
+
std::unordered_set<ggml_backend_buffer_t> buffers;
|
|
747
|
+
};
|
|
748
|
+
|
|
749
|
+
bool rpc_server::alloc_buffer(const std::vector<uint8_t> & input, std::vector<uint8_t> & output) {
|
|
726
750
|
// input serialization format: | size (8 bytes) |
|
|
751
|
+
if (input.size() != sizeof(uint64_t)) {
|
|
752
|
+
return false;
|
|
753
|
+
}
|
|
727
754
|
uint64_t size;
|
|
728
755
|
memcpy(&size, input.data(), sizeof(size));
|
|
729
756
|
ggml_backend_buffer_type_t buft = ggml_backend_get_default_buffer_type(backend);
|
|
730
757
|
ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, size);
|
|
731
|
-
uint64_t remote_ptr =
|
|
732
|
-
uint64_t remote_size =
|
|
733
|
-
|
|
758
|
+
uint64_t remote_ptr = 0;
|
|
759
|
+
uint64_t remote_size = 0;
|
|
760
|
+
if (buffer != nullptr) {
|
|
761
|
+
remote_ptr = reinterpret_cast<uint64_t>(buffer);
|
|
762
|
+
remote_size = buffer->size;
|
|
763
|
+
GGML_PRINT_DEBUG("[%s] size: %" PRIu64 " -> remote_ptr: %" PRIx64 ", remote_size: %" PRIu64 "\n", __func__, size, remote_ptr, remote_size);
|
|
764
|
+
buffers.insert(buffer);
|
|
765
|
+
} else {
|
|
766
|
+
GGML_PRINT_DEBUG("[%s] size: %" PRIu64 " -> failed\n", __func__, size);
|
|
767
|
+
}
|
|
734
768
|
// output serialization format: | remote_ptr (8 bytes) | remote_size (8 bytes) |
|
|
735
769
|
output.resize(2*sizeof(uint64_t), 0);
|
|
736
770
|
memcpy(output.data(), &remote_ptr, sizeof(remote_ptr));
|
|
737
771
|
memcpy(output.data() + sizeof(uint64_t), &remote_size, sizeof(remote_size));
|
|
772
|
+
return true;
|
|
738
773
|
}
|
|
739
774
|
|
|
740
|
-
|
|
775
|
+
void rpc_server::get_alignment(std::vector<uint8_t> & output) {
|
|
741
776
|
ggml_backend_buffer_type_t buft = ggml_backend_get_default_buffer_type(backend);
|
|
742
777
|
size_t alignment = ggml_backend_buft_get_alignment(buft);
|
|
743
778
|
GGML_PRINT_DEBUG("[%s] alignment: %lu\n", __func__, alignment);
|
|
@@ -746,7 +781,7 @@ static void rpc_get_alignment(ggml_backend_t backend, std::vector<uint8_t> & out
|
|
|
746
781
|
memcpy(output.data(), &alignment, sizeof(alignment));
|
|
747
782
|
}
|
|
748
783
|
|
|
749
|
-
|
|
784
|
+
void rpc_server::get_max_size(std::vector<uint8_t> & output) {
|
|
750
785
|
ggml_backend_buffer_type_t buft = ggml_backend_get_default_buffer_type(backend);
|
|
751
786
|
size_t max_size = ggml_backend_buft_get_max_size(buft);
|
|
752
787
|
GGML_PRINT_DEBUG("[%s] max_size: %lu\n", __func__, max_size);
|
|
@@ -755,41 +790,90 @@ static void rpc_get_max_size(ggml_backend_t backend, std::vector<uint8_t> & outp
|
|
|
755
790
|
memcpy(output.data(), &max_size, sizeof(max_size));
|
|
756
791
|
}
|
|
757
792
|
|
|
758
|
-
|
|
793
|
+
bool rpc_server::buffer_get_base(const std::vector<uint8_t> & input, std::vector<uint8_t> & output) {
|
|
759
794
|
// input serialization format: | remote_ptr (8 bytes) |
|
|
795
|
+
if (input.size() != sizeof(uint64_t)) {
|
|
796
|
+
return false;
|
|
797
|
+
}
|
|
760
798
|
uint64_t remote_ptr;
|
|
761
799
|
memcpy(&remote_ptr, input.data(), sizeof(remote_ptr));
|
|
762
800
|
GGML_PRINT_DEBUG("[%s] remote_ptr: %" PRIx64 "\n", __func__, remote_ptr);
|
|
763
801
|
ggml_backend_buffer_t buffer = reinterpret_cast<ggml_backend_buffer_t>(remote_ptr);
|
|
802
|
+
if (buffers.find(buffer) == buffers.end()) {
|
|
803
|
+
GGML_PRINT_DEBUG("[%s] buffer not found\n", __func__);
|
|
804
|
+
return false;
|
|
805
|
+
}
|
|
764
806
|
void * base = ggml_backend_buffer_get_base(buffer);
|
|
765
807
|
// output serialization format: | base_ptr (8 bytes) |
|
|
766
808
|
uint64_t base_ptr = reinterpret_cast<uint64_t>(base);
|
|
767
809
|
output.resize(sizeof(uint64_t), 0);
|
|
768
810
|
memcpy(output.data(), &base_ptr, sizeof(base_ptr));
|
|
811
|
+
return true;
|
|
769
812
|
}
|
|
770
813
|
|
|
771
|
-
|
|
814
|
+
bool rpc_server::free_buffer(const std::vector<uint8_t> & input) {
|
|
772
815
|
// input serialization format: | remote_ptr (8 bytes) |
|
|
816
|
+
if (input.size() != sizeof(uint64_t)) {
|
|
817
|
+
return false;
|
|
818
|
+
}
|
|
773
819
|
uint64_t remote_ptr;
|
|
774
820
|
memcpy(&remote_ptr, input.data(), sizeof(remote_ptr));
|
|
775
821
|
GGML_PRINT_DEBUG("[%s] remote_ptr: %" PRIx64 "\n", __func__, remote_ptr);
|
|
776
822
|
ggml_backend_buffer_t buffer = reinterpret_cast<ggml_backend_buffer_t>(remote_ptr);
|
|
823
|
+
if (buffers.find(buffer) == buffers.end()) {
|
|
824
|
+
GGML_PRINT_DEBUG("[%s] buffer not found\n", __func__);
|
|
825
|
+
return false;
|
|
826
|
+
}
|
|
777
827
|
ggml_backend_buffer_free(buffer);
|
|
828
|
+
buffers.erase(buffer);
|
|
829
|
+
return true;
|
|
778
830
|
}
|
|
779
831
|
|
|
780
|
-
|
|
832
|
+
bool rpc_server::buffer_clear(const std::vector<uint8_t> & input) {
|
|
781
833
|
// input serialization format: | remote_ptr (8 bytes) | value (1 byte) |
|
|
834
|
+
if (input.size() != sizeof(uint64_t) + sizeof(uint8_t)) {
|
|
835
|
+
return false;
|
|
836
|
+
}
|
|
782
837
|
uint64_t remote_ptr;
|
|
783
838
|
memcpy(&remote_ptr, input.data(), sizeof(remote_ptr));
|
|
784
839
|
uint8_t value;
|
|
785
840
|
memcpy(&value, input.data() + sizeof(uint64_t), sizeof(value));
|
|
786
841
|
GGML_PRINT_DEBUG("[%s] remote_ptr: %" PRIx64 ", value: %u\n", __func__, remote_ptr, value);
|
|
787
842
|
ggml_backend_buffer_t buffer = reinterpret_cast<ggml_backend_buffer_t>(remote_ptr);
|
|
843
|
+
if (buffers.find(buffer) == buffers.end()) {
|
|
844
|
+
GGML_PRINT_DEBUG("[%s] buffer not found\n", __func__);
|
|
845
|
+
return false;
|
|
846
|
+
}
|
|
788
847
|
ggml_backend_buffer_clear(buffer, value);
|
|
848
|
+
return true;
|
|
789
849
|
}
|
|
790
850
|
|
|
791
|
-
|
|
851
|
+
ggml_tensor * rpc_server::deserialize_tensor(struct ggml_context * ctx, const rpc_tensor * tensor) {
|
|
852
|
+
ggml_tensor * result = ggml_new_tensor_4d(ctx, (ggml_type) tensor->type,
|
|
853
|
+
tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3]);
|
|
854
|
+
for (uint32_t i = 0; i < GGML_MAX_DIMS; i++) {
|
|
855
|
+
result->nb[i] = tensor->nb[i];
|
|
856
|
+
}
|
|
857
|
+
result->buffer = reinterpret_cast<ggml_backend_buffer_t>(tensor->buffer);
|
|
858
|
+
if (result->buffer && buffers.find(result->buffer) == buffers.end()) {
|
|
859
|
+
return nullptr;
|
|
860
|
+
}
|
|
861
|
+
result->op = (ggml_op) tensor->op;
|
|
862
|
+
for (uint32_t i = 0; i < GGML_MAX_OP_PARAMS / sizeof(int32_t); i++) {
|
|
863
|
+
result->op_params[i] = tensor->op_params[i];
|
|
864
|
+
}
|
|
865
|
+
result->flags = tensor->flags;
|
|
866
|
+
result->data = reinterpret_cast<void *>(tensor->data);
|
|
867
|
+
ggml_set_name(result, tensor->name);
|
|
868
|
+
return result;
|
|
869
|
+
}
|
|
870
|
+
|
|
871
|
+
|
|
872
|
+
bool rpc_server::set_tensor(const std::vector<uint8_t> & input) {
|
|
792
873
|
// serialization format: | rpc_tensor | offset (8 bytes) | data (size bytes) |
|
|
874
|
+
if (input.size() < sizeof(rpc_tensor) + sizeof(uint64_t)) {
|
|
875
|
+
return false;
|
|
876
|
+
}
|
|
793
877
|
const rpc_tensor * in_tensor = (const rpc_tensor *)input.data();
|
|
794
878
|
uint64_t offset;
|
|
795
879
|
memcpy(&offset, input.data() + sizeof(rpc_tensor), sizeof(offset));
|
|
@@ -802,14 +886,23 @@ static void rpc_set_tensor(const std::vector<uint8_t> & input) {
|
|
|
802
886
|
};
|
|
803
887
|
struct ggml_context * ctx = ggml_init(params);
|
|
804
888
|
ggml_tensor * tensor = deserialize_tensor(ctx, in_tensor);
|
|
889
|
+
if (tensor == nullptr) {
|
|
890
|
+
GGML_PRINT_DEBUG("[%s] error deserializing tensor\n", __func__);
|
|
891
|
+
ggml_free(ctx);
|
|
892
|
+
return false;
|
|
893
|
+
}
|
|
805
894
|
GGML_PRINT_DEBUG("[%s] buffer: %p, data: %p, offset: %" PRIu64 ", size: %zu\n", __func__, (void*)tensor->buffer, tensor->data, offset, size);
|
|
806
895
|
const void * data = input.data() + sizeof(rpc_tensor) + sizeof(offset);
|
|
807
896
|
ggml_backend_tensor_set(tensor, data, offset, size);
|
|
808
897
|
ggml_free(ctx);
|
|
898
|
+
return true;
|
|
809
899
|
}
|
|
810
900
|
|
|
811
|
-
|
|
901
|
+
bool rpc_server::get_tensor(const std::vector<uint8_t> & input, std::vector<uint8_t> & output) {
|
|
812
902
|
// serialization format: | rpc_tensor | offset (8 bytes) | size (8 bytes) |
|
|
903
|
+
if (input.size() != sizeof(rpc_tensor) + 2*sizeof(uint64_t)) {
|
|
904
|
+
return false;
|
|
905
|
+
}
|
|
813
906
|
const rpc_tensor * in_tensor = (const rpc_tensor *)input.data();
|
|
814
907
|
uint64_t offset;
|
|
815
908
|
memcpy(&offset, input.data() + sizeof(rpc_tensor), sizeof(offset));
|
|
@@ -823,15 +916,24 @@ static void rpc_get_tensor(const std::vector<uint8_t> & input, std::vector<uint8
|
|
|
823
916
|
};
|
|
824
917
|
struct ggml_context * ctx = ggml_init(params);
|
|
825
918
|
ggml_tensor * tensor = deserialize_tensor(ctx, in_tensor);
|
|
919
|
+
if (tensor == nullptr) {
|
|
920
|
+
GGML_PRINT_DEBUG("[%s] error deserializing tensor\n", __func__);
|
|
921
|
+
ggml_free(ctx);
|
|
922
|
+
return false;
|
|
923
|
+
}
|
|
826
924
|
GGML_PRINT_DEBUG("[%s] buffer: %p, data: %p, offset: %" PRIu64 ", size: %" PRIu64 "\n", __func__, (void*)tensor->buffer, tensor->data, offset, size);
|
|
827
925
|
// output serialization format: | data (size bytes) |
|
|
828
926
|
output.resize(size, 0);
|
|
829
927
|
ggml_backend_tensor_get(tensor, output.data(), offset, size);
|
|
830
928
|
ggml_free(ctx);
|
|
929
|
+
return true;
|
|
831
930
|
}
|
|
832
931
|
|
|
833
|
-
|
|
932
|
+
bool rpc_server::copy_tensor(const std::vector<uint8_t> & input, std::vector<uint8_t> & output) {
|
|
834
933
|
// serialization format: | rpc_tensor src | rpc_tensor dst |
|
|
934
|
+
if (input.size() != 2*sizeof(rpc_tensor)) {
|
|
935
|
+
return false;
|
|
936
|
+
}
|
|
835
937
|
const rpc_tensor * rpc_src = (const rpc_tensor *)input.data();
|
|
836
938
|
const rpc_tensor * rpc_dst = (const rpc_tensor *)(input.data() + sizeof(rpc_src));
|
|
837
939
|
|
|
@@ -843,18 +945,24 @@ static void rpc_copy_tensor(const std::vector<uint8_t> & input, std::vector<uint
|
|
|
843
945
|
struct ggml_context * ctx = ggml_init(params);
|
|
844
946
|
ggml_tensor * src = deserialize_tensor(ctx, rpc_src);
|
|
845
947
|
ggml_tensor * dst = deserialize_tensor(ctx, rpc_dst);
|
|
948
|
+
if (src == nullptr || dst == nullptr) {
|
|
949
|
+
GGML_PRINT_DEBUG("[%s] error deserializing tensors\n", __func__);
|
|
950
|
+
ggml_free(ctx);
|
|
951
|
+
return false;
|
|
952
|
+
}
|
|
846
953
|
GGML_PRINT_DEBUG("[%s] src->buffer: %p, dst->buffer: %p\n", __func__, (void*)src->buffer, (void*)dst->buffer);
|
|
847
954
|
bool result = ggml_backend_buffer_copy_tensor(src, dst);
|
|
848
955
|
// output serialization format: | result (1 byte) |
|
|
849
956
|
output.resize(1, 0);
|
|
850
957
|
output[0] = result;
|
|
851
958
|
ggml_free(ctx);
|
|
959
|
+
return true;
|
|
852
960
|
}
|
|
853
961
|
|
|
854
|
-
|
|
855
|
-
|
|
856
|
-
|
|
857
|
-
|
|
962
|
+
ggml_tensor * rpc_server::create_node(uint64_t id,
|
|
963
|
+
struct ggml_context * ctx,
|
|
964
|
+
const std::unordered_map<uint64_t, const rpc_tensor*> & tensor_ptrs,
|
|
965
|
+
std::unordered_map<uint64_t, struct ggml_tensor*> & tensor_map) {
|
|
858
966
|
if (id == 0) {
|
|
859
967
|
return nullptr;
|
|
860
968
|
}
|
|
@@ -863,6 +971,9 @@ static struct ggml_tensor * create_node(uint64_t id,
|
|
|
863
971
|
}
|
|
864
972
|
const rpc_tensor * tensor = tensor_ptrs.at(id);
|
|
865
973
|
struct ggml_tensor * result = deserialize_tensor(ctx, tensor);
|
|
974
|
+
if (result == nullptr) {
|
|
975
|
+
return nullptr;
|
|
976
|
+
}
|
|
866
977
|
tensor_map[id] = result;
|
|
867
978
|
for (int i = 0; i < GGML_MAX_SRC; i++) {
|
|
868
979
|
result->src[i] = create_node(tensor->src[i], ctx, tensor_ptrs, tensor_map);
|
|
@@ -872,14 +983,23 @@ static struct ggml_tensor * create_node(uint64_t id,
|
|
|
872
983
|
return result;
|
|
873
984
|
}
|
|
874
985
|
|
|
875
|
-
|
|
986
|
+
bool rpc_server::graph_compute(const std::vector<uint8_t> & input, std::vector<uint8_t> & output) {
|
|
876
987
|
// serialization format:
|
|
877
988
|
// | n_nodes (4 bytes) | nodes (n_nodes * sizeof(uint64_t) | n_tensors (4 bytes) | tensors (n_tensors * sizeof(rpc_tensor)) |
|
|
989
|
+
if (input.size() < sizeof(uint32_t)) {
|
|
990
|
+
return false;
|
|
991
|
+
}
|
|
878
992
|
uint32_t n_nodes;
|
|
879
993
|
memcpy(&n_nodes, input.data(), sizeof(n_nodes));
|
|
994
|
+
if (input.size() < sizeof(uint32_t) + n_nodes*sizeof(uint64_t) + sizeof(uint32_t)) {
|
|
995
|
+
return false;
|
|
996
|
+
}
|
|
880
997
|
const uint64_t * nodes = (const uint64_t *)(input.data() + sizeof(n_nodes));
|
|
881
998
|
uint32_t n_tensors;
|
|
882
999
|
memcpy(&n_tensors, input.data() + sizeof(n_nodes) + n_nodes*sizeof(uint64_t), sizeof(n_tensors));
|
|
1000
|
+
if (input.size() < sizeof(uint32_t) + n_nodes*sizeof(uint64_t) + sizeof(uint32_t) + n_tensors*sizeof(rpc_tensor)) {
|
|
1001
|
+
return false;
|
|
1002
|
+
}
|
|
883
1003
|
const rpc_tensor * tensors = (const rpc_tensor *)(input.data() + sizeof(n_nodes) + n_nodes*sizeof(uint64_t) + sizeof(n_tensors));
|
|
884
1004
|
GGML_PRINT_DEBUG("[%s] n_nodes: %u, n_tensors: %u\n", __func__, n_nodes, n_tensors);
|
|
885
1005
|
|
|
@@ -905,9 +1025,17 @@ static void rpc_graph_compute(ggml_backend_t backend, const std::vector<uint8_t>
|
|
|
905
1025
|
output.resize(1, 0);
|
|
906
1026
|
output[0] = status;
|
|
907
1027
|
ggml_free(ctx);
|
|
1028
|
+
return true;
|
|
1029
|
+
}
|
|
1030
|
+
|
|
1031
|
+
rpc_server::~rpc_server() {
|
|
1032
|
+
for (auto buffer : buffers) {
|
|
1033
|
+
ggml_backend_buffer_free(buffer);
|
|
1034
|
+
}
|
|
908
1035
|
}
|
|
909
1036
|
|
|
910
1037
|
static void rpc_serve_client(ggml_backend_t backend, sockfd_t sockfd, size_t free_mem, size_t total_mem) {
|
|
1038
|
+
rpc_server server(backend);
|
|
911
1039
|
while (true) {
|
|
912
1040
|
uint8_t cmd;
|
|
913
1041
|
if (!recv_data(sockfd, &cmd, 1)) {
|
|
@@ -923,45 +1051,46 @@ static void rpc_serve_client(ggml_backend_t backend, sockfd_t sockfd, size_t fre
|
|
|
923
1051
|
if (!recv_data(sockfd, input.data(), input_size)) {
|
|
924
1052
|
break;
|
|
925
1053
|
}
|
|
1054
|
+
bool ok = true;
|
|
926
1055
|
switch (cmd) {
|
|
927
1056
|
case ALLOC_BUFFER: {
|
|
928
|
-
|
|
1057
|
+
ok = server.alloc_buffer(input, output);
|
|
929
1058
|
break;
|
|
930
1059
|
}
|
|
931
1060
|
case GET_ALIGNMENT: {
|
|
932
|
-
|
|
1061
|
+
server.get_alignment(output);
|
|
933
1062
|
break;
|
|
934
1063
|
}
|
|
935
1064
|
case GET_MAX_SIZE: {
|
|
936
|
-
|
|
1065
|
+
server.get_max_size(output);
|
|
937
1066
|
break;
|
|
938
1067
|
}
|
|
939
1068
|
case BUFFER_GET_BASE: {
|
|
940
|
-
|
|
1069
|
+
ok = server.buffer_get_base(input, output);
|
|
941
1070
|
break;
|
|
942
1071
|
}
|
|
943
1072
|
case FREE_BUFFER: {
|
|
944
|
-
|
|
1073
|
+
ok = server.free_buffer(input);
|
|
945
1074
|
break;
|
|
946
1075
|
}
|
|
947
1076
|
case BUFFER_CLEAR: {
|
|
948
|
-
|
|
1077
|
+
ok = server.buffer_clear(input);
|
|
949
1078
|
break;
|
|
950
1079
|
}
|
|
951
1080
|
case SET_TENSOR: {
|
|
952
|
-
|
|
1081
|
+
ok = server.set_tensor(input);
|
|
953
1082
|
break;
|
|
954
1083
|
}
|
|
955
1084
|
case GET_TENSOR: {
|
|
956
|
-
|
|
1085
|
+
ok = server.get_tensor(input, output);
|
|
957
1086
|
break;
|
|
958
1087
|
}
|
|
959
1088
|
case COPY_TENSOR: {
|
|
960
|
-
|
|
1089
|
+
ok = server.copy_tensor(input, output);
|
|
961
1090
|
break;
|
|
962
1091
|
}
|
|
963
1092
|
case GRAPH_COMPUTE: {
|
|
964
|
-
|
|
1093
|
+
ok = server.graph_compute(input, output);
|
|
965
1094
|
break;
|
|
966
1095
|
}
|
|
967
1096
|
case GET_DEVICE_MEMORY: {
|
|
@@ -973,9 +1102,12 @@ static void rpc_serve_client(ggml_backend_t backend, sockfd_t sockfd, size_t fre
|
|
|
973
1102
|
}
|
|
974
1103
|
default: {
|
|
975
1104
|
fprintf(stderr, "Unknown command: %d\n", cmd);
|
|
976
|
-
|
|
1105
|
+
ok = false;
|
|
977
1106
|
}
|
|
978
1107
|
}
|
|
1108
|
+
if (!ok) {
|
|
1109
|
+
break;
|
|
1110
|
+
}
|
|
979
1111
|
uint64_t output_size = output.size();
|
|
980
1112
|
if (!send_data(sockfd, &output_size, sizeof(output_size))) {
|
|
981
1113
|
break;
|