llama_cpp 0.15.2 → 0.15.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +14 -0
- data/ext/llama_cpp/llama_cpp.cpp +61 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +6 -0
- data/vendor/tmp/llama.cpp/Makefile +8 -16
- data/vendor/tmp/llama.cpp/ggml-common.h +0 -54
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +99 -40
- data/vendor/tmp/llama.cpp/ggml-cuda.h +1 -0
- data/vendor/tmp/llama.cpp/ggml-impl.h +44 -0
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +4 -1
- data/vendor/tmp/llama.cpp/ggml-metal.m +133 -81
- data/vendor/tmp/llama.cpp/ggml-metal.metal +91 -434
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +4 -1
- data/vendor/tmp/llama.cpp/ggml-quants.c +1962 -2443
- data/vendor/tmp/llama.cpp/ggml-rpc.cpp +248 -108
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +375 -657
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +9351 -5627
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +204 -225
- data/vendor/tmp/llama.cpp/ggml.c +498 -836
- data/vendor/tmp/llama.cpp/ggml.h +57 -30
- data/vendor/tmp/llama.cpp/llama.cpp +1477 -859
- data/vendor/tmp/llama.cpp/llama.h +21 -8
- metadata +3 -3
@@ -6,6 +6,7 @@
|
|
6
6
|
#include <string>
|
7
7
|
#include <vector>
|
8
8
|
#include <memory>
|
9
|
+
#include <mutex>
|
9
10
|
#include <unordered_map>
|
10
11
|
#include <unordered_set>
|
11
12
|
#ifdef _WIN32
|
@@ -47,6 +48,7 @@ struct socket_t {
|
|
47
48
|
sockfd_t fd;
|
48
49
|
socket_t(sockfd_t fd) : fd(fd) {}
|
49
50
|
~socket_t() {
|
51
|
+
GGML_PRINT_DEBUG("[%s] closing socket %d\n", __func__, this->fd);
|
50
52
|
#ifdef _WIN32
|
51
53
|
closesocket(this->fd);
|
52
54
|
#else
|
@@ -56,6 +58,7 @@ struct socket_t {
|
|
56
58
|
};
|
57
59
|
|
58
60
|
// ggml_tensor is serialized into rpc_tensor
|
61
|
+
#pragma pack(push, 1)
|
59
62
|
struct rpc_tensor {
|
60
63
|
uint64_t id;
|
61
64
|
uint32_t type;
|
@@ -71,6 +74,7 @@ struct rpc_tensor {
|
|
71
74
|
uint64_t data;
|
72
75
|
char name[GGML_MAX_NAME];
|
73
76
|
};
|
77
|
+
#pragma pack(pop)
|
74
78
|
|
75
79
|
// RPC commands
|
76
80
|
enum rpc_cmd {
|
@@ -95,7 +99,7 @@ static ggml_guid_t ggml_backend_rpc_guid() {
|
|
95
99
|
}
|
96
100
|
|
97
101
|
struct ggml_backend_rpc_buffer_type_context {
|
98
|
-
std::
|
102
|
+
std::string endpoint;
|
99
103
|
std::string name;
|
100
104
|
size_t alignment;
|
101
105
|
size_t max_size;
|
@@ -104,8 +108,6 @@ struct ggml_backend_rpc_buffer_type_context {
|
|
104
108
|
struct ggml_backend_rpc_context {
|
105
109
|
std::string endpoint;
|
106
110
|
std::string name;
|
107
|
-
std::shared_ptr<socket_t> sock;
|
108
|
-
ggml_backend_buffer_type_t buft;
|
109
111
|
};
|
110
112
|
|
111
113
|
struct ggml_backend_rpc_buffer_context {
|
@@ -229,14 +231,13 @@ static bool recv_data(sockfd_t sockfd, void * data, size_t size) {
|
|
229
231
|
return true;
|
230
232
|
}
|
231
233
|
|
232
|
-
static bool parse_endpoint(const
|
233
|
-
|
234
|
-
size_t pos = str.find(':');
|
234
|
+
static bool parse_endpoint(const std::string & endpoint, std::string & host, int & port) {
|
235
|
+
size_t pos = endpoint.find(':');
|
235
236
|
if (pos == std::string::npos) {
|
236
237
|
return false;
|
237
238
|
}
|
238
|
-
host =
|
239
|
-
port = std::stoi(
|
239
|
+
host = endpoint.substr(0, pos);
|
240
|
+
port = std::stoi(endpoint.substr(pos + 1));
|
240
241
|
return true;
|
241
242
|
}
|
242
243
|
|
@@ -271,6 +272,44 @@ static bool send_rpc_cmd(const std::shared_ptr<socket_t> & sock, enum rpc_cmd cm
|
|
271
272
|
|
272
273
|
// RPC client-side implementation
|
273
274
|
|
275
|
+
static std::shared_ptr<socket_t> get_socket(const std::string & endpoint) {
|
276
|
+
static std::mutex mutex;
|
277
|
+
std::lock_guard<std::mutex> lock(mutex);
|
278
|
+
static std::unordered_map<std::string, std::weak_ptr<socket_t>> sockets;
|
279
|
+
static bool initialized = false;
|
280
|
+
|
281
|
+
auto it = sockets.find(endpoint);
|
282
|
+
if (it != sockets.end()) {
|
283
|
+
if (auto sock = it->second.lock()) {
|
284
|
+
return sock;
|
285
|
+
}
|
286
|
+
}
|
287
|
+
std::string host;
|
288
|
+
int port;
|
289
|
+
if (!parse_endpoint(endpoint, host, port)) {
|
290
|
+
return nullptr;
|
291
|
+
}
|
292
|
+
#ifdef _WIN32
|
293
|
+
if (!initialized) {
|
294
|
+
WSADATA wsaData;
|
295
|
+
int res = WSAStartup(MAKEWORD(2, 2), &wsaData);
|
296
|
+
if (res != 0) {
|
297
|
+
return nullptr;
|
298
|
+
}
|
299
|
+
initialized = true;
|
300
|
+
}
|
301
|
+
#else
|
302
|
+
UNUSED(initialized);
|
303
|
+
#endif
|
304
|
+
auto sock = socket_connect(host.c_str(), port);
|
305
|
+
if (sock == nullptr) {
|
306
|
+
return nullptr;
|
307
|
+
}
|
308
|
+
GGML_PRINT_DEBUG("[%s] connected to %s, sockfd=%d\n", __func__, endpoint.c_str(), sock->fd);
|
309
|
+
sockets[endpoint] = sock;
|
310
|
+
return sock;
|
311
|
+
}
|
312
|
+
|
274
313
|
GGML_CALL static const char * ggml_backend_rpc_buffer_get_name(ggml_backend_buffer_t buffer) {
|
275
314
|
ggml_backend_rpc_buffer_context * ctx = (ggml_backend_rpc_buffer_context *)buffer->context;
|
276
315
|
return ctx->name.c_str();
|
@@ -340,23 +379,6 @@ static rpc_tensor serialize_tensor(const ggml_tensor * tensor) {
|
|
340
379
|
return result;
|
341
380
|
}
|
342
381
|
|
343
|
-
static ggml_tensor * deserialize_tensor(struct ggml_context * ctx, const rpc_tensor * tensor) {
|
344
|
-
ggml_tensor * result = ggml_new_tensor_4d(ctx, (ggml_type) tensor->type,
|
345
|
-
tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3]);
|
346
|
-
for (uint32_t i = 0; i < GGML_MAX_DIMS; i++) {
|
347
|
-
result->nb[i] = tensor->nb[i];
|
348
|
-
}
|
349
|
-
result->buffer = reinterpret_cast<ggml_backend_buffer_t>(tensor->buffer);
|
350
|
-
result->op = (ggml_op) tensor->op;
|
351
|
-
for (uint32_t i = 0; i < GGML_MAX_OP_PARAMS / sizeof(int32_t); i++) {
|
352
|
-
result->op_params[i] = tensor->op_params[i];
|
353
|
-
}
|
354
|
-
result->flags = tensor->flags;
|
355
|
-
result->data = reinterpret_cast<void *>(tensor->data);
|
356
|
-
ggml_set_name(result, tensor->name);
|
357
|
-
return result;
|
358
|
-
}
|
359
|
-
|
360
382
|
GGML_CALL static void ggml_backend_rpc_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
|
361
383
|
UNUSED(buffer);
|
362
384
|
if (ggml_is_quantized(tensor->type)) {
|
@@ -457,7 +479,8 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_rpc_buffer_type_alloc_buffer
|
|
457
479
|
std::vector<uint8_t> input(input_size, 0);
|
458
480
|
memcpy(input.data(), &size, sizeof(size));
|
459
481
|
std::vector<uint8_t> output;
|
460
|
-
|
482
|
+
auto sock = get_socket(buft_ctx->endpoint);
|
483
|
+
bool status = send_rpc_cmd(sock, ALLOC_BUFFER, input, output);
|
461
484
|
GGML_ASSERT(status);
|
462
485
|
GGML_ASSERT(output.size() == 2*sizeof(uint64_t));
|
463
486
|
// output serialization format: | remote_ptr (8 bytes) | remote_size (8 bytes) |
|
@@ -465,13 +488,15 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_rpc_buffer_type_alloc_buffer
|
|
465
488
|
memcpy(&remote_ptr, output.data(), sizeof(remote_ptr));
|
466
489
|
size_t remote_size;
|
467
490
|
memcpy(&remote_size, output.data() + sizeof(uint64_t), sizeof(remote_size));
|
468
|
-
|
469
|
-
|
470
|
-
|
471
|
-
|
472
|
-
|
473
|
-
|
474
|
-
|
491
|
+
if (remote_ptr != 0) {
|
492
|
+
ggml_backend_buffer_t buffer = ggml_backend_buffer_init(buft,
|
493
|
+
ggml_backend_rpc_buffer_interface,
|
494
|
+
new ggml_backend_rpc_buffer_context{sock, {}, remote_ptr, "RPC"},
|
495
|
+
remote_size);
|
496
|
+
return buffer;
|
497
|
+
} else {
|
498
|
+
return nullptr;
|
499
|
+
}
|
475
500
|
}
|
476
501
|
|
477
502
|
static size_t get_alignment(const std::shared_ptr<socket_t> & sock) {
|
@@ -521,7 +546,7 @@ GGML_CALL static bool ggml_backend_rpc_buffer_type_supports_backend(ggml_backend
|
|
521
546
|
}
|
522
547
|
ggml_backend_rpc_buffer_type_context * buft_ctx = (ggml_backend_rpc_buffer_type_context *)buft->context;
|
523
548
|
ggml_backend_rpc_context * rpc_ctx = (ggml_backend_rpc_context *)backend->context;
|
524
|
-
return buft_ctx->
|
549
|
+
return buft_ctx->endpoint == rpc_ctx->endpoint;
|
525
550
|
}
|
526
551
|
|
527
552
|
static ggml_backend_buffer_type_i ggml_backend_rpc_buffer_type_interface = {
|
@@ -534,7 +559,6 @@ static ggml_backend_buffer_type_i ggml_backend_rpc_buffer_type_interface = {
|
|
534
559
|
/* .is_host = */ NULL,
|
535
560
|
};
|
536
561
|
|
537
|
-
|
538
562
|
GGML_CALL static const char * ggml_backend_rpc_name(ggml_backend_t backend) {
|
539
563
|
ggml_backend_rpc_context * rpc_ctx = (ggml_backend_rpc_context *)backend->context;
|
540
564
|
|
@@ -543,16 +567,13 @@ GGML_CALL static const char * ggml_backend_rpc_name(ggml_backend_t backend) {
|
|
543
567
|
|
544
568
|
GGML_CALL static void ggml_backend_rpc_free(ggml_backend_t backend) {
|
545
569
|
ggml_backend_rpc_context * rpc_ctx = (ggml_backend_rpc_context *)backend->context;
|
546
|
-
ggml_backend_rpc_buffer_type_context * buft_ctx = (ggml_backend_rpc_buffer_type_context *)rpc_ctx->buft->context;
|
547
|
-
delete buft_ctx;
|
548
|
-
delete rpc_ctx->buft;
|
549
570
|
delete rpc_ctx;
|
550
571
|
delete backend;
|
551
572
|
}
|
552
573
|
|
553
574
|
GGML_CALL static ggml_backend_buffer_type_t ggml_backend_rpc_get_default_buffer_type(ggml_backend_t backend) {
|
554
575
|
ggml_backend_rpc_context * ctx = (ggml_backend_rpc_context *)backend->context;
|
555
|
-
return ctx->
|
576
|
+
return ggml_backend_rpc_buffer_type(ctx->endpoint.c_str());
|
556
577
|
}
|
557
578
|
|
558
579
|
GGML_CALL static void ggml_backend_rpc_synchronize(ggml_backend_t backend) {
|
@@ -603,7 +624,8 @@ GGML_CALL static enum ggml_status ggml_backend_rpc_graph_compute(ggml_backend_t
|
|
603
624
|
std::vector<uint8_t> input;
|
604
625
|
serialize_graph(cgraph, input);
|
605
626
|
std::vector<uint8_t> output;
|
606
|
-
|
627
|
+
auto sock = get_socket(rpc_ctx->endpoint);
|
628
|
+
bool status = send_rpc_cmd(sock, GRAPH_COMPUTE, input, output);
|
607
629
|
GGML_ASSERT(status);
|
608
630
|
GGML_ASSERT(output.size() == 1);
|
609
631
|
return (enum ggml_status)output[0];
|
@@ -637,65 +659,48 @@ static ggml_backend_i ggml_backend_rpc_interface = {
|
|
637
659
|
/* .event_synchronize = */ NULL,
|
638
660
|
};
|
639
661
|
|
640
|
-
static std::unordered_map<std::string, ggml_backend_t> instances;
|
641
|
-
|
642
662
|
GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const char * endpoint) {
|
643
|
-
|
644
|
-
|
645
|
-
|
646
|
-
|
647
|
-
|
648
|
-
|
649
|
-
|
650
|
-
return instances[endpoint_str];
|
651
|
-
}
|
652
|
-
#ifdef _WIN32
|
653
|
-
{
|
654
|
-
WSADATA wsaData;
|
655
|
-
int res = WSAStartup(MAKEWORD(2, 2), &wsaData);
|
656
|
-
if (res != 0) {
|
657
|
-
return nullptr;
|
658
|
-
}
|
659
|
-
}
|
660
|
-
#endif
|
661
|
-
GGML_PRINT_DEBUG("Connecting to %s\n", endpoint);
|
662
|
-
std::string host;
|
663
|
-
int port;
|
664
|
-
if (!parse_endpoint(endpoint, host, port)) {
|
665
|
-
return nullptr;
|
663
|
+
static std::mutex mutex;
|
664
|
+
std::lock_guard<std::mutex> lock(mutex);
|
665
|
+
// NOTE: buffer types are allocated and never freed; this is by design
|
666
|
+
static std::unordered_map<std::string, ggml_backend_buffer_type_t> buft_map;
|
667
|
+
auto it = buft_map.find(endpoint);
|
668
|
+
if (it != buft_map.end()) {
|
669
|
+
return it->second;
|
666
670
|
}
|
667
|
-
auto sock =
|
671
|
+
auto sock = get_socket(endpoint);
|
668
672
|
if (sock == nullptr) {
|
669
673
|
return nullptr;
|
670
674
|
}
|
671
675
|
size_t alignment = get_alignment(sock);
|
672
676
|
size_t max_size = get_max_size(sock);
|
673
677
|
ggml_backend_rpc_buffer_type_context * buft_ctx = new ggml_backend_rpc_buffer_type_context {
|
674
|
-
/* .
|
675
|
-
/* .name
|
678
|
+
/* .endpoint = */ endpoint,
|
679
|
+
/* .name = */ "RPC[" + std::string(endpoint) + "]",
|
676
680
|
/* .alignment = */ alignment,
|
677
|
-
/* .max_size
|
681
|
+
/* .max_size = */ max_size
|
678
682
|
};
|
679
683
|
|
680
684
|
ggml_backend_buffer_type_t buft = new ggml_backend_buffer_type {
|
681
685
|
/* .iface = */ ggml_backend_rpc_buffer_type_interface,
|
682
686
|
/* .context = */ buft_ctx
|
683
687
|
};
|
688
|
+
buft_map[endpoint] = buft;
|
689
|
+
return buft;
|
690
|
+
}
|
684
691
|
|
692
|
+
GGML_CALL ggml_backend_t ggml_backend_rpc_init(const char * endpoint) {
|
685
693
|
ggml_backend_rpc_context * ctx = new ggml_backend_rpc_context {
|
686
|
-
/* .endpoint
|
687
|
-
/* .name
|
688
|
-
/* .sock = */ sock,
|
689
|
-
/* .buft = */ buft
|
694
|
+
/* .endpoint = */ endpoint,
|
695
|
+
/* .name = */ "RPC",
|
690
696
|
};
|
691
697
|
|
692
|
-
|
698
|
+
ggml_backend_t backend = new ggml_backend {
|
693
699
|
/* .guid = */ ggml_backend_rpc_guid(),
|
694
700
|
/* .interface = */ ggml_backend_rpc_interface,
|
695
701
|
/* .context = */ ctx
|
696
702
|
};
|
697
|
-
|
698
|
-
return instances[endpoint];
|
703
|
+
return backend;
|
699
704
|
}
|
700
705
|
|
701
706
|
GGML_API GGML_CALL bool ggml_backend_is_rpc(ggml_backend_t backend) {
|
@@ -719,34 +724,72 @@ static void get_device_memory(const std::shared_ptr<socket_t> & sock, size_t * f
|
|
719
724
|
}
|
720
725
|
|
721
726
|
GGML_API GGML_CALL void ggml_backend_rpc_get_device_memory(const char * endpoint, size_t * free, size_t * total) {
|
722
|
-
|
723
|
-
if (
|
727
|
+
auto sock = get_socket(endpoint);
|
728
|
+
if (sock == nullptr) {
|
724
729
|
*free = 0;
|
725
730
|
*total = 0;
|
726
731
|
return;
|
727
732
|
}
|
728
|
-
|
729
|
-
get_device_memory(ctx->sock, free, total);
|
733
|
+
get_device_memory(sock, free, total);
|
730
734
|
}
|
731
735
|
|
732
736
|
// RPC server-side implementation
|
733
737
|
|
734
|
-
|
738
|
+
class rpc_server {
|
739
|
+
public:
|
740
|
+
rpc_server(ggml_backend_t backend) : backend(backend) {}
|
741
|
+
~rpc_server();
|
742
|
+
|
743
|
+
bool alloc_buffer(const std::vector<uint8_t> & input, std::vector<uint8_t> & output);
|
744
|
+
void get_alignment(std::vector<uint8_t> & output);
|
745
|
+
void get_max_size(std::vector<uint8_t> & output);
|
746
|
+
bool buffer_get_base(const std::vector<uint8_t> & input, std::vector<uint8_t> & output);
|
747
|
+
bool free_buffer(const std::vector<uint8_t> & input);
|
748
|
+
bool buffer_clear(const std::vector<uint8_t> & input);
|
749
|
+
bool set_tensor(const std::vector<uint8_t> & input);
|
750
|
+
bool get_tensor(const std::vector<uint8_t> & input, std::vector<uint8_t> & output);
|
751
|
+
bool copy_tensor(const std::vector<uint8_t> & input, std::vector<uint8_t> & output);
|
752
|
+
bool graph_compute(const std::vector<uint8_t> & input, std::vector<uint8_t> & output);
|
753
|
+
|
754
|
+
private:
|
755
|
+
ggml_tensor * deserialize_tensor(struct ggml_context * ctx, const rpc_tensor * tensor);
|
756
|
+
ggml_tensor * create_node(uint64_t id,
|
757
|
+
struct ggml_context * ctx,
|
758
|
+
const std::unordered_map<uint64_t, const rpc_tensor*> & tensor_ptrs,
|
759
|
+
std::unordered_map<uint64_t, struct ggml_tensor*> & tensor_map);
|
760
|
+
|
761
|
+
|
762
|
+
ggml_backend_t backend;
|
763
|
+
std::unordered_set<ggml_backend_buffer_t> buffers;
|
764
|
+
};
|
765
|
+
|
766
|
+
bool rpc_server::alloc_buffer(const std::vector<uint8_t> & input, std::vector<uint8_t> & output) {
|
735
767
|
// input serialization format: | size (8 bytes) |
|
768
|
+
if (input.size() != sizeof(uint64_t)) {
|
769
|
+
return false;
|
770
|
+
}
|
736
771
|
uint64_t size;
|
737
772
|
memcpy(&size, input.data(), sizeof(size));
|
738
773
|
ggml_backend_buffer_type_t buft = ggml_backend_get_default_buffer_type(backend);
|
739
774
|
ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, size);
|
740
|
-
uint64_t remote_ptr =
|
741
|
-
uint64_t remote_size =
|
742
|
-
|
775
|
+
uint64_t remote_ptr = 0;
|
776
|
+
uint64_t remote_size = 0;
|
777
|
+
if (buffer != nullptr) {
|
778
|
+
remote_ptr = reinterpret_cast<uint64_t>(buffer);
|
779
|
+
remote_size = buffer->size;
|
780
|
+
GGML_PRINT_DEBUG("[%s] size: %" PRIu64 " -> remote_ptr: %" PRIx64 ", remote_size: %" PRIu64 "\n", __func__, size, remote_ptr, remote_size);
|
781
|
+
buffers.insert(buffer);
|
782
|
+
} else {
|
783
|
+
GGML_PRINT_DEBUG("[%s] size: %" PRIu64 " -> failed\n", __func__, size);
|
784
|
+
}
|
743
785
|
// output serialization format: | remote_ptr (8 bytes) | remote_size (8 bytes) |
|
744
786
|
output.resize(2*sizeof(uint64_t), 0);
|
745
787
|
memcpy(output.data(), &remote_ptr, sizeof(remote_ptr));
|
746
788
|
memcpy(output.data() + sizeof(uint64_t), &remote_size, sizeof(remote_size));
|
789
|
+
return true;
|
747
790
|
}
|
748
791
|
|
749
|
-
|
792
|
+
void rpc_server::get_alignment(std::vector<uint8_t> & output) {
|
750
793
|
ggml_backend_buffer_type_t buft = ggml_backend_get_default_buffer_type(backend);
|
751
794
|
size_t alignment = ggml_backend_buft_get_alignment(buft);
|
752
795
|
GGML_PRINT_DEBUG("[%s] alignment: %lu\n", __func__, alignment);
|
@@ -755,7 +798,7 @@ static void rpc_get_alignment(ggml_backend_t backend, std::vector<uint8_t> & out
|
|
755
798
|
memcpy(output.data(), &alignment, sizeof(alignment));
|
756
799
|
}
|
757
800
|
|
758
|
-
|
801
|
+
void rpc_server::get_max_size(std::vector<uint8_t> & output) {
|
759
802
|
ggml_backend_buffer_type_t buft = ggml_backend_get_default_buffer_type(backend);
|
760
803
|
size_t max_size = ggml_backend_buft_get_max_size(buft);
|
761
804
|
GGML_PRINT_DEBUG("[%s] max_size: %lu\n", __func__, max_size);
|
@@ -764,41 +807,90 @@ static void rpc_get_max_size(ggml_backend_t backend, std::vector<uint8_t> & outp
|
|
764
807
|
memcpy(output.data(), &max_size, sizeof(max_size));
|
765
808
|
}
|
766
809
|
|
767
|
-
|
810
|
+
bool rpc_server::buffer_get_base(const std::vector<uint8_t> & input, std::vector<uint8_t> & output) {
|
768
811
|
// input serialization format: | remote_ptr (8 bytes) |
|
812
|
+
if (input.size() != sizeof(uint64_t)) {
|
813
|
+
return false;
|
814
|
+
}
|
769
815
|
uint64_t remote_ptr;
|
770
816
|
memcpy(&remote_ptr, input.data(), sizeof(remote_ptr));
|
771
817
|
GGML_PRINT_DEBUG("[%s] remote_ptr: %" PRIx64 "\n", __func__, remote_ptr);
|
772
818
|
ggml_backend_buffer_t buffer = reinterpret_cast<ggml_backend_buffer_t>(remote_ptr);
|
819
|
+
if (buffers.find(buffer) == buffers.end()) {
|
820
|
+
GGML_PRINT_DEBUG("[%s] buffer not found\n", __func__);
|
821
|
+
return false;
|
822
|
+
}
|
773
823
|
void * base = ggml_backend_buffer_get_base(buffer);
|
774
824
|
// output serialization format: | base_ptr (8 bytes) |
|
775
825
|
uint64_t base_ptr = reinterpret_cast<uint64_t>(base);
|
776
826
|
output.resize(sizeof(uint64_t), 0);
|
777
827
|
memcpy(output.data(), &base_ptr, sizeof(base_ptr));
|
828
|
+
return true;
|
778
829
|
}
|
779
830
|
|
780
|
-
|
831
|
+
bool rpc_server::free_buffer(const std::vector<uint8_t> & input) {
|
781
832
|
// input serialization format: | remote_ptr (8 bytes) |
|
833
|
+
if (input.size() != sizeof(uint64_t)) {
|
834
|
+
return false;
|
835
|
+
}
|
782
836
|
uint64_t remote_ptr;
|
783
837
|
memcpy(&remote_ptr, input.data(), sizeof(remote_ptr));
|
784
838
|
GGML_PRINT_DEBUG("[%s] remote_ptr: %" PRIx64 "\n", __func__, remote_ptr);
|
785
839
|
ggml_backend_buffer_t buffer = reinterpret_cast<ggml_backend_buffer_t>(remote_ptr);
|
840
|
+
if (buffers.find(buffer) == buffers.end()) {
|
841
|
+
GGML_PRINT_DEBUG("[%s] buffer not found\n", __func__);
|
842
|
+
return false;
|
843
|
+
}
|
786
844
|
ggml_backend_buffer_free(buffer);
|
845
|
+
buffers.erase(buffer);
|
846
|
+
return true;
|
787
847
|
}
|
788
848
|
|
789
|
-
|
849
|
+
bool rpc_server::buffer_clear(const std::vector<uint8_t> & input) {
|
790
850
|
// input serialization format: | remote_ptr (8 bytes) | value (1 byte) |
|
851
|
+
if (input.size() != sizeof(uint64_t) + sizeof(uint8_t)) {
|
852
|
+
return false;
|
853
|
+
}
|
791
854
|
uint64_t remote_ptr;
|
792
855
|
memcpy(&remote_ptr, input.data(), sizeof(remote_ptr));
|
793
856
|
uint8_t value;
|
794
857
|
memcpy(&value, input.data() + sizeof(uint64_t), sizeof(value));
|
795
858
|
GGML_PRINT_DEBUG("[%s] remote_ptr: %" PRIx64 ", value: %u\n", __func__, remote_ptr, value);
|
796
859
|
ggml_backend_buffer_t buffer = reinterpret_cast<ggml_backend_buffer_t>(remote_ptr);
|
860
|
+
if (buffers.find(buffer) == buffers.end()) {
|
861
|
+
GGML_PRINT_DEBUG("[%s] buffer not found\n", __func__);
|
862
|
+
return false;
|
863
|
+
}
|
797
864
|
ggml_backend_buffer_clear(buffer, value);
|
865
|
+
return true;
|
866
|
+
}
|
867
|
+
|
868
|
+
ggml_tensor * rpc_server::deserialize_tensor(struct ggml_context * ctx, const rpc_tensor * tensor) {
|
869
|
+
ggml_tensor * result = ggml_new_tensor_4d(ctx, (ggml_type) tensor->type,
|
870
|
+
tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3]);
|
871
|
+
for (uint32_t i = 0; i < GGML_MAX_DIMS; i++) {
|
872
|
+
result->nb[i] = tensor->nb[i];
|
873
|
+
}
|
874
|
+
result->buffer = reinterpret_cast<ggml_backend_buffer_t>(tensor->buffer);
|
875
|
+
if (result->buffer && buffers.find(result->buffer) == buffers.end()) {
|
876
|
+
return nullptr;
|
877
|
+
}
|
878
|
+
result->op = (ggml_op) tensor->op;
|
879
|
+
for (uint32_t i = 0; i < GGML_MAX_OP_PARAMS / sizeof(int32_t); i++) {
|
880
|
+
result->op_params[i] = tensor->op_params[i];
|
881
|
+
}
|
882
|
+
result->flags = tensor->flags;
|
883
|
+
result->data = reinterpret_cast<void *>(tensor->data);
|
884
|
+
ggml_set_name(result, tensor->name);
|
885
|
+
return result;
|
798
886
|
}
|
799
887
|
|
800
|
-
|
888
|
+
|
889
|
+
bool rpc_server::set_tensor(const std::vector<uint8_t> & input) {
|
801
890
|
// serialization format: | rpc_tensor | offset (8 bytes) | data (size bytes) |
|
891
|
+
if (input.size() < sizeof(rpc_tensor) + sizeof(uint64_t)) {
|
892
|
+
return false;
|
893
|
+
}
|
802
894
|
const rpc_tensor * in_tensor = (const rpc_tensor *)input.data();
|
803
895
|
uint64_t offset;
|
804
896
|
memcpy(&offset, input.data() + sizeof(rpc_tensor), sizeof(offset));
|
@@ -811,14 +903,23 @@ static void rpc_set_tensor(const std::vector<uint8_t> & input) {
|
|
811
903
|
};
|
812
904
|
struct ggml_context * ctx = ggml_init(params);
|
813
905
|
ggml_tensor * tensor = deserialize_tensor(ctx, in_tensor);
|
906
|
+
if (tensor == nullptr) {
|
907
|
+
GGML_PRINT_DEBUG("[%s] error deserializing tensor\n", __func__);
|
908
|
+
ggml_free(ctx);
|
909
|
+
return false;
|
910
|
+
}
|
814
911
|
GGML_PRINT_DEBUG("[%s] buffer: %p, data: %p, offset: %" PRIu64 ", size: %zu\n", __func__, (void*)tensor->buffer, tensor->data, offset, size);
|
815
912
|
const void * data = input.data() + sizeof(rpc_tensor) + sizeof(offset);
|
816
913
|
ggml_backend_tensor_set(tensor, data, offset, size);
|
817
914
|
ggml_free(ctx);
|
915
|
+
return true;
|
818
916
|
}
|
819
917
|
|
820
|
-
|
918
|
+
bool rpc_server::get_tensor(const std::vector<uint8_t> & input, std::vector<uint8_t> & output) {
|
821
919
|
// serialization format: | rpc_tensor | offset (8 bytes) | size (8 bytes) |
|
920
|
+
if (input.size() != sizeof(rpc_tensor) + 2*sizeof(uint64_t)) {
|
921
|
+
return false;
|
922
|
+
}
|
822
923
|
const rpc_tensor * in_tensor = (const rpc_tensor *)input.data();
|
823
924
|
uint64_t offset;
|
824
925
|
memcpy(&offset, input.data() + sizeof(rpc_tensor), sizeof(offset));
|
@@ -832,15 +933,24 @@ static void rpc_get_tensor(const std::vector<uint8_t> & input, std::vector<uint8
|
|
832
933
|
};
|
833
934
|
struct ggml_context * ctx = ggml_init(params);
|
834
935
|
ggml_tensor * tensor = deserialize_tensor(ctx, in_tensor);
|
936
|
+
if (tensor == nullptr) {
|
937
|
+
GGML_PRINT_DEBUG("[%s] error deserializing tensor\n", __func__);
|
938
|
+
ggml_free(ctx);
|
939
|
+
return false;
|
940
|
+
}
|
835
941
|
GGML_PRINT_DEBUG("[%s] buffer: %p, data: %p, offset: %" PRIu64 ", size: %" PRIu64 "\n", __func__, (void*)tensor->buffer, tensor->data, offset, size);
|
836
942
|
// output serialization format: | data (size bytes) |
|
837
943
|
output.resize(size, 0);
|
838
944
|
ggml_backend_tensor_get(tensor, output.data(), offset, size);
|
839
945
|
ggml_free(ctx);
|
946
|
+
return true;
|
840
947
|
}
|
841
948
|
|
842
|
-
|
949
|
+
bool rpc_server::copy_tensor(const std::vector<uint8_t> & input, std::vector<uint8_t> & output) {
|
843
950
|
// serialization format: | rpc_tensor src | rpc_tensor dst |
|
951
|
+
if (input.size() != 2*sizeof(rpc_tensor)) {
|
952
|
+
return false;
|
953
|
+
}
|
844
954
|
const rpc_tensor * rpc_src = (const rpc_tensor *)input.data();
|
845
955
|
const rpc_tensor * rpc_dst = (const rpc_tensor *)(input.data() + sizeof(rpc_src));
|
846
956
|
|
@@ -852,18 +962,24 @@ static void rpc_copy_tensor(const std::vector<uint8_t> & input, std::vector<uint
|
|
852
962
|
struct ggml_context * ctx = ggml_init(params);
|
853
963
|
ggml_tensor * src = deserialize_tensor(ctx, rpc_src);
|
854
964
|
ggml_tensor * dst = deserialize_tensor(ctx, rpc_dst);
|
965
|
+
if (src == nullptr || dst == nullptr) {
|
966
|
+
GGML_PRINT_DEBUG("[%s] error deserializing tensors\n", __func__);
|
967
|
+
ggml_free(ctx);
|
968
|
+
return false;
|
969
|
+
}
|
855
970
|
GGML_PRINT_DEBUG("[%s] src->buffer: %p, dst->buffer: %p\n", __func__, (void*)src->buffer, (void*)dst->buffer);
|
856
971
|
bool result = ggml_backend_buffer_copy_tensor(src, dst);
|
857
972
|
// output serialization format: | result (1 byte) |
|
858
973
|
output.resize(1, 0);
|
859
974
|
output[0] = result;
|
860
975
|
ggml_free(ctx);
|
976
|
+
return true;
|
861
977
|
}
|
862
978
|
|
863
|
-
|
864
|
-
|
865
|
-
|
866
|
-
|
979
|
+
ggml_tensor * rpc_server::create_node(uint64_t id,
|
980
|
+
struct ggml_context * ctx,
|
981
|
+
const std::unordered_map<uint64_t, const rpc_tensor*> & tensor_ptrs,
|
982
|
+
std::unordered_map<uint64_t, struct ggml_tensor*> & tensor_map) {
|
867
983
|
if (id == 0) {
|
868
984
|
return nullptr;
|
869
985
|
}
|
@@ -872,6 +988,9 @@ static struct ggml_tensor * create_node(uint64_t id,
|
|
872
988
|
}
|
873
989
|
const rpc_tensor * tensor = tensor_ptrs.at(id);
|
874
990
|
struct ggml_tensor * result = deserialize_tensor(ctx, tensor);
|
991
|
+
if (result == nullptr) {
|
992
|
+
return nullptr;
|
993
|
+
}
|
875
994
|
tensor_map[id] = result;
|
876
995
|
for (int i = 0; i < GGML_MAX_SRC; i++) {
|
877
996
|
result->src[i] = create_node(tensor->src[i], ctx, tensor_ptrs, tensor_map);
|
@@ -881,14 +1000,23 @@ static struct ggml_tensor * create_node(uint64_t id,
|
|
881
1000
|
return result;
|
882
1001
|
}
|
883
1002
|
|
884
|
-
|
1003
|
+
bool rpc_server::graph_compute(const std::vector<uint8_t> & input, std::vector<uint8_t> & output) {
|
885
1004
|
// serialization format:
|
886
1005
|
// | n_nodes (4 bytes) | nodes (n_nodes * sizeof(uint64_t) | n_tensors (4 bytes) | tensors (n_tensors * sizeof(rpc_tensor)) |
|
1006
|
+
if (input.size() < sizeof(uint32_t)) {
|
1007
|
+
return false;
|
1008
|
+
}
|
887
1009
|
uint32_t n_nodes;
|
888
1010
|
memcpy(&n_nodes, input.data(), sizeof(n_nodes));
|
1011
|
+
if (input.size() < sizeof(uint32_t) + n_nodes*sizeof(uint64_t) + sizeof(uint32_t)) {
|
1012
|
+
return false;
|
1013
|
+
}
|
889
1014
|
const uint64_t * nodes = (const uint64_t *)(input.data() + sizeof(n_nodes));
|
890
1015
|
uint32_t n_tensors;
|
891
1016
|
memcpy(&n_tensors, input.data() + sizeof(n_nodes) + n_nodes*sizeof(uint64_t), sizeof(n_tensors));
|
1017
|
+
if (input.size() < sizeof(uint32_t) + n_nodes*sizeof(uint64_t) + sizeof(uint32_t) + n_tensors*sizeof(rpc_tensor)) {
|
1018
|
+
return false;
|
1019
|
+
}
|
892
1020
|
const rpc_tensor * tensors = (const rpc_tensor *)(input.data() + sizeof(n_nodes) + n_nodes*sizeof(uint64_t) + sizeof(n_tensors));
|
893
1021
|
GGML_PRINT_DEBUG("[%s] n_nodes: %u, n_tensors: %u\n", __func__, n_nodes, n_tensors);
|
894
1022
|
|
@@ -914,9 +1042,17 @@ static void rpc_graph_compute(ggml_backend_t backend, const std::vector<uint8_t>
|
|
914
1042
|
output.resize(1, 0);
|
915
1043
|
output[0] = status;
|
916
1044
|
ggml_free(ctx);
|
1045
|
+
return true;
|
1046
|
+
}
|
1047
|
+
|
1048
|
+
rpc_server::~rpc_server() {
|
1049
|
+
for (auto buffer : buffers) {
|
1050
|
+
ggml_backend_buffer_free(buffer);
|
1051
|
+
}
|
917
1052
|
}
|
918
1053
|
|
919
1054
|
static void rpc_serve_client(ggml_backend_t backend, sockfd_t sockfd, size_t free_mem, size_t total_mem) {
|
1055
|
+
rpc_server server(backend);
|
920
1056
|
while (true) {
|
921
1057
|
uint8_t cmd;
|
922
1058
|
if (!recv_data(sockfd, &cmd, 1)) {
|
@@ -932,45 +1068,46 @@ static void rpc_serve_client(ggml_backend_t backend, sockfd_t sockfd, size_t fre
|
|
932
1068
|
if (!recv_data(sockfd, input.data(), input_size)) {
|
933
1069
|
break;
|
934
1070
|
}
|
1071
|
+
bool ok = true;
|
935
1072
|
switch (cmd) {
|
936
1073
|
case ALLOC_BUFFER: {
|
937
|
-
|
1074
|
+
ok = server.alloc_buffer(input, output);
|
938
1075
|
break;
|
939
1076
|
}
|
940
1077
|
case GET_ALIGNMENT: {
|
941
|
-
|
1078
|
+
server.get_alignment(output);
|
942
1079
|
break;
|
943
1080
|
}
|
944
1081
|
case GET_MAX_SIZE: {
|
945
|
-
|
1082
|
+
server.get_max_size(output);
|
946
1083
|
break;
|
947
1084
|
}
|
948
1085
|
case BUFFER_GET_BASE: {
|
949
|
-
|
1086
|
+
ok = server.buffer_get_base(input, output);
|
950
1087
|
break;
|
951
1088
|
}
|
952
1089
|
case FREE_BUFFER: {
|
953
|
-
|
1090
|
+
ok = server.free_buffer(input);
|
954
1091
|
break;
|
955
1092
|
}
|
956
1093
|
case BUFFER_CLEAR: {
|
957
|
-
|
1094
|
+
ok = server.buffer_clear(input);
|
958
1095
|
break;
|
959
1096
|
}
|
960
1097
|
case SET_TENSOR: {
|
961
|
-
|
1098
|
+
ok = server.set_tensor(input);
|
962
1099
|
break;
|
963
1100
|
}
|
964
1101
|
case GET_TENSOR: {
|
965
|
-
|
1102
|
+
ok = server.get_tensor(input, output);
|
966
1103
|
break;
|
967
1104
|
}
|
968
1105
|
case COPY_TENSOR: {
|
969
|
-
|
1106
|
+
ok = server.copy_tensor(input, output);
|
970
1107
|
break;
|
971
1108
|
}
|
972
1109
|
case GRAPH_COMPUTE: {
|
973
|
-
|
1110
|
+
ok = server.graph_compute(input, output);
|
974
1111
|
break;
|
975
1112
|
}
|
976
1113
|
case GET_DEVICE_MEMORY: {
|
@@ -982,9 +1119,12 @@ static void rpc_serve_client(ggml_backend_t backend, sockfd_t sockfd, size_t fre
|
|
982
1119
|
}
|
983
1120
|
default: {
|
984
1121
|
fprintf(stderr, "Unknown command: %d\n", cmd);
|
985
|
-
|
1122
|
+
ok = false;
|
986
1123
|
}
|
987
1124
|
}
|
1125
|
+
if (!ok) {
|
1126
|
+
break;
|
1127
|
+
}
|
988
1128
|
uint64_t output_size = output.size();
|
989
1129
|
if (!send_data(sockfd, &output_size, sizeof(output_size))) {
|
990
1130
|
break;
|