@fugood/llama.node 0.2.1 → 0.2.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (80)
  1. package/bin/darwin/arm64/default.metallib +0 -0
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/default.metallib +0 -0
  4. package/bin/darwin/x64/llama-node.node +0 -0
  5. package/bin/linux/arm64/llama-node.node +0 -0
  6. package/bin/linux/x64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  8. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  9. package/bin/win32/arm64/llama-node.node +0 -0
  10. package/bin/win32/arm64/node.lib +0 -0
  11. package/bin/win32/x64/llama-node.node +0 -0
  12. package/bin/win32/x64/node.lib +0 -0
  13. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  14. package/bin/win32-vulkan/arm64/node.lib +0 -0
  15. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  16. package/bin/win32-vulkan/x64/node.lib +0 -0
  17. package/package.json +1 -1
  18. package/src/LlamaContext.cpp +2 -2
  19. package/src/LoadSessionWorker.cpp +1 -0
  20. package/src/llama.cpp/CMakeLists.txt +72 -46
  21. package/src/llama.cpp/cmake/arm64-windows-llvm.cmake +16 -0
  22. package/src/llama.cpp/cmake/arm64-windows-msvc.cmake +6 -0
  23. package/src/llama.cpp/common/common.cpp +732 -752
  24. package/src/llama.cpp/common/common.h +47 -41
  25. package/src/llama.cpp/common/grammar-parser.cpp +1 -1
  26. package/src/llama.cpp/common/json-schema-to-grammar.cpp +6 -6
  27. package/src/llama.cpp/common/log.h +5 -5
  28. package/src/llama.cpp/common/sampling.cpp +89 -7
  29. package/src/llama.cpp/common/sampling.h +5 -0
  30. package/src/llama.cpp/common/train.cpp +2 -2
  31. package/src/llama.cpp/examples/batched/batched.cpp +1 -1
  32. package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +1 -1
  33. package/src/llama.cpp/examples/embedding/embedding.cpp +3 -2
  34. package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +2 -2
  35. package/src/llama.cpp/examples/finetune/finetune.cpp +4 -3
  36. package/src/llama.cpp/examples/imatrix/imatrix.cpp +2 -2
  37. package/src/llama.cpp/examples/infill/infill.cpp +8 -8
  38. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +2 -2
  39. package/src/llama.cpp/examples/llama.android/llama/CMakeLists.txt +13 -8
  40. package/src/llama.cpp/examples/llava/clip.h +1 -1
  41. package/src/llama.cpp/examples/llava/llava-cli.cpp +1 -1
  42. package/src/llama.cpp/examples/llava/llava.cpp +0 -15
  43. package/src/llama.cpp/examples/lookahead/lookahead.cpp +1 -1
  44. package/src/llama.cpp/examples/lookup/lookup.cpp +1 -1
  45. package/src/llama.cpp/examples/main/main.cpp +24 -16
  46. package/src/llama.cpp/examples/parallel/parallel.cpp +1 -1
  47. package/src/llama.cpp/examples/perplexity/perplexity.cpp +9 -9
  48. package/src/llama.cpp/examples/quantize/quantize.cpp +2 -2
  49. package/src/llama.cpp/examples/retrieval/retrieval.cpp +2 -2
  50. package/src/llama.cpp/examples/rpc/rpc-server.cpp +78 -14
  51. package/src/llama.cpp/examples/server/server.cpp +21 -9
  52. package/src/llama.cpp/examples/tokenize/tokenize.cpp +359 -9
  53. package/src/llama.cpp/examples/train-text-from-scratch/train-text-from-scratch.cpp +4 -3
  54. package/src/llama.cpp/ggml-backend.c +0 -1
  55. package/src/llama.cpp/ggml-common.h +0 -54
  56. package/src/llama.cpp/ggml-cuda.h +1 -0
  57. package/src/llama.cpp/ggml-impl.h +51 -0
  58. package/src/llama.cpp/ggml-kompute.cpp +4 -0
  59. package/src/llama.cpp/ggml-opencl.cpp +4 -1
  60. package/src/llama.cpp/ggml-quants.c +3700 -2041
  61. package/src/llama.cpp/ggml-rpc.cpp +188 -56
  62. package/src/llama.cpp/ggml-sycl.cpp +99 -530
  63. package/src/llama.cpp/ggml-vulkan-shaders.hpp +9351 -5627
  64. package/src/llama.cpp/ggml-vulkan.cpp +202 -225
  65. package/src/llama.cpp/ggml.c +1034 -1154
  66. package/src/llama.cpp/ggml.h +59 -31
  67. package/src/llama.cpp/llama.cpp +859 -609
  68. package/src/llama.cpp/llama.h +19 -6
  69. package/src/llama.cpp/requirements.txt +0 -1
  70. package/src/llama.cpp/tests/test-backend-ops.cpp +113 -47
  71. package/src/llama.cpp/tests/test-chat-template.cpp +16 -4
  72. package/src/llama.cpp/tests/test-grad0.cpp +43 -83
  73. package/src/llama.cpp/unicode-data.cpp +6969 -2169
  74. package/src/llama.cpp/unicode-data.h +15 -12
  75. package/src/llama.cpp/unicode.cpp +89 -111
  76. package/src/llama.cpp/unicode.h +44 -12
  77. package/src/llama.cpp/build.zig +0 -172
  78. package/src/llama.cpp/ggml-mpi.c +0 -216
  79. package/src/llama.cpp/ggml-mpi.h +0 -39
  80. package/src/llama.cpp/requirements/requirements-convert-persimmon-to-gguf.txt +0 -2
package/src/llama.cpp/ggml-rpc.cpp

@@ -28,7 +28,7 @@
 
 #define UNUSED GGML_UNUSED
 
-#define GGML_DEBUG 1
+#define GGML_DEBUG 0
 #if (GGML_DEBUG >= 1)
 #define GGML_PRINT_DEBUG(...) printf(__VA_ARGS__)
 #else
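The only change in this hunk flips GGML_DEBUG from 1 to 0, so the GGML_PRINT_DEBUG calls compile to nothing and release builds pay no runtime cost. A minimal sketch of the same compile-time gating pattern (MY_DEBUG and MY_PRINT_DEBUG are hypothetical names, not part of ggml):

#include <cstdio>

#define MY_DEBUG 0  // hypothetical knob, same idea as GGML_DEBUG

#if (MY_DEBUG >= 1)
#define MY_PRINT_DEBUG(...) std::printf(__VA_ARGS__)
#else
#define MY_PRINT_DEBUG(...)  // expands to an empty statement
#endif

int main() {
    MY_PRINT_DEBUG("compiled out entirely when MY_DEBUG is 0: %d\n", 42);
    std::printf("release path still runs\n");
    return 0;
}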
@@ -56,6 +56,7 @@ struct socket_t {
 };
 
 // ggml_tensor is serialized into rpc_tensor
+#pragma pack(push, 1)
 struct rpc_tensor {
     uint64_t id;
     uint32_t type;
@@ -71,6 +72,7 @@ struct rpc_tensor {
     uint64_t data;
     char name[GGML_MAX_NAME];
 };
+#pragma pack(pop)
 
 // RPC commands
 enum rpc_cmd {
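The pack(push, 1)/pack(pop) pair pins rpc_tensor to a byte-exact layout: since the struct is memcpy'd over the socket, compiler-inserted padding would make its size and field offsets differ between client and server builds. A standalone sketch of why that matters (wire_example is a hypothetical struct standing in for rpc_tensor, not the package's code):

#include <cstdint>
#include <cstdio>

#pragma pack(push, 1)
struct wire_example {      // hypothetical stand-in for rpc_tensor
    uint64_t id;           // 8 bytes
    uint32_t type;         // 4 bytes
    uint8_t  flag;         // 1 byte
};
#pragma pack(pop)

struct unpacked_example {  // identical members, default padding
    uint64_t id;
    uint32_t type;
    uint8_t  flag;
};

// Packed size is the exact sum of the members on every compiler/ABI,
// so memcpy-ing the raw struct over a socket gives a stable wire format.
static_assert(sizeof(wire_example) == 13, "wire layout must be byte-exact");

int main() {
    // unpacked_example is typically 16 bytes on 64-bit ABIs (3 padding bytes)
    std::printf("packed: %zu bytes, unpacked: %zu bytes\n",
                sizeof(wire_example), sizeof(unpacked_example));
    return 0;
}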
@@ -134,7 +136,13 @@ static bool set_no_delay(sockfd_t sockfd) {
     int flag = 1;
     // set TCP_NODELAY to disable Nagle's algorithm
     int ret = setsockopt(sockfd, IPPROTO_TCP, TCP_NODELAY, (char *)&flag, sizeof(int));
-    return ret >= 0;
+    return ret == 0;
+}
+
+static bool set_reuse_addr(sockfd_t sockfd) {
+    int flag = 1;
+    int ret = setsockopt(sockfd, SOL_SOCKET, SO_REUSEADDR, (char *)&flag, sizeof(int));
+    return ret == 0;
 }
 
 static std::shared_ptr<socket_t> socket_connect(const char * host, int port) {
@@ -181,7 +189,10 @@ static std::shared_ptr<socket_t> create_server_socket(const char * host, int por
     if (sock == nullptr) {
         return nullptr;
     }
-
+    if (!set_reuse_addr(sockfd)) {
+        fprintf(stderr, "Failed to set SO_REUSEADDR\n");
+        return nullptr;
+    }
     struct sockaddr_in serv_addr;
     serv_addr.sin_family = AF_INET;
     serv_addr.sin_addr.s_addr = inet_addr(host);
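The new set_reuse_addr sets SO_REUSEADDR before bind, so a restarted rpc-server can rebind a port whose previous socket is still in TIME_WAIT; note that setsockopt returns 0 on success and -1 on error, which is also why set_no_delay's check was tightened from ret >= 0 to ret == 0. A minimal POSIX sketch of the same setup order (illustrative only; the port number is arbitrary):

#include <arpa/inet.h>
#include <netinet/in.h>
#include <sys/socket.h>
#include <unistd.h>
#include <cstdio>

int main() {
    int fd = socket(AF_INET, SOCK_STREAM, 0);
    if (fd < 0) { perror("socket"); return 1; }

    // Set the option after socket() but before bind(),
    // otherwise it has no effect on this bind.
    int flag = 1;
    if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &flag, sizeof(flag)) != 0) {
        perror("setsockopt");
        close(fd);
        return 1;
    }

    sockaddr_in addr{};
    addr.sin_family      = AF_INET;
    addr.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
    addr.sin_port        = htons(50052);  // arbitrary example port

    if (bind(fd, (sockaddr *)&addr, sizeof(addr)) != 0) { perror("bind"); close(fd); return 1; }
    if (listen(fd, 1) != 0) { perror("listen"); close(fd); return 1; }

    std::printf("listening on 127.0.0.1:50052\n");
    close(fd);
    return 0;
}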
@@ -331,23 +342,6 @@ static rpc_tensor serialize_tensor(const ggml_tensor * tensor) {
     return result;
 }
 
-static ggml_tensor * deserialize_tensor(struct ggml_context * ctx, const rpc_tensor * tensor) {
-    ggml_tensor * result = ggml_new_tensor_4d(ctx, (ggml_type) tensor->type,
-        tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3]);
-    for (uint32_t i = 0; i < GGML_MAX_DIMS; i++) {
-        result->nb[i] = tensor->nb[i];
-    }
-    result->buffer = reinterpret_cast<ggml_backend_buffer_t>(tensor->buffer);
-    result->op = (ggml_op) tensor->op;
-    for (uint32_t i = 0; i < GGML_MAX_OP_PARAMS / sizeof(int32_t); i++) {
-        result->op_params[i] = tensor->op_params[i];
-    }
-    result->flags = tensor->flags;
-    result->data = reinterpret_cast<void *>(tensor->data);
-    ggml_set_name(result, tensor->name);
-    return result;
-}
-
 GGML_CALL static void ggml_backend_rpc_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
     UNUSED(buffer);
     if (ggml_is_quantized(tensor->type)) {
@@ -456,13 +450,15 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_rpc_buffer_type_alloc_buffer
     memcpy(&remote_ptr, output.data(), sizeof(remote_ptr));
     size_t remote_size;
     memcpy(&remote_size, output.data() + sizeof(uint64_t), sizeof(remote_size));
-
-    ggml_backend_buffer_t buffer = ggml_backend_buffer_init(buft,
-        ggml_backend_rpc_buffer_interface,
-        new ggml_backend_rpc_buffer_context{buft_ctx->sock, {}, remote_ptr, "RPC"},
-        remote_size);
-
-    return buffer;
+    if (remote_ptr != 0) {
+        ggml_backend_buffer_t buffer = ggml_backend_buffer_init(buft,
+            ggml_backend_rpc_buffer_interface,
+            new ggml_backend_rpc_buffer_context{buft_ctx->sock, {}, remote_ptr, "RPC"},
+            remote_size);
+        return buffer;
+    } else {
+        return nullptr;
+    }
 }
 
 static size_t get_alignment(const std::shared_ptr<socket_t> & sock) {
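On the client side, a remote_ptr of 0 in the fixed 16-byte ALLOC_BUFFER reply now signals that the server-side allocation failed, and the client propagates that as a nullptr buffer instead of wrapping a dead handle. A sketch of decoding that reply convention (alloc_reply and decode_alloc_reply are hypothetical names, not the package's API):

#include <cstdint>
#include <cstdio>
#include <cstring>
#include <vector>

struct alloc_reply {        // hypothetical helper
    uint64_t remote_ptr;    // 0 signals that the server-side allocation failed
    uint64_t remote_size;
};

// Mirrors the fixed reply layout: | remote_ptr (8 bytes) | remote_size (8 bytes) |
static bool decode_alloc_reply(const std::vector<uint8_t> & output, alloc_reply & r) {
    if (output.size() != 2 * sizeof(uint64_t)) {
        return false;  // malformed reply
    }
    memcpy(&r.remote_ptr,  output.data(),                    sizeof(r.remote_ptr));
    memcpy(&r.remote_size, output.data() + sizeof(uint64_t), sizeof(r.remote_size));
    return r.remote_ptr != 0;  // null handle -> caller should return nullptr
}

int main() {
    std::vector<uint8_t> ok_reply(16, 0), fail_reply(16, 0);
    uint64_t ptr = 0x1000, size = 4096;
    memcpy(ok_reply.data(),     &ptr,  sizeof(ptr));
    memcpy(ok_reply.data() + 8, &size, sizeof(size));

    alloc_reply r;
    std::printf("ok reply:   %d\n", decode_alloc_reply(ok_reply, r));    // 1
    std::printf("fail reply: %d\n", decode_alloc_reply(fail_reply, r));  // 0
    return 0;
}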
@@ -649,7 +645,7 @@ GGML_CALL ggml_backend_t ggml_backend_rpc_init(const char * endpoint) {
         }
     }
 #endif
-    GGML_PRINT_DEBUG("Connecting to %s\n", endpoint);
+    fprintf(stderr, "Connecting to %s\n", endpoint);
     std::string host;
     int port;
     if (!parse_endpoint(endpoint, host, port)) {
@@ -722,22 +718,61 @@ GGML_API GGML_CALL void ggml_backend_rpc_get_device_memory(const char * endpoint
 
 // RPC server-side implementation
 
-static void rpc_alloc_buffer(ggml_backend_t backend, const std::vector<uint8_t> & input, std::vector<uint8_t> & output) {
+class rpc_server {
+public:
+    rpc_server(ggml_backend_t backend) : backend(backend) {}
+    ~rpc_server();
+
+    bool alloc_buffer(const std::vector<uint8_t> & input, std::vector<uint8_t> & output);
+    void get_alignment(std::vector<uint8_t> & output);
+    void get_max_size(std::vector<uint8_t> & output);
+    bool buffer_get_base(const std::vector<uint8_t> & input, std::vector<uint8_t> & output);
+    bool free_buffer(const std::vector<uint8_t> & input);
+    bool buffer_clear(const std::vector<uint8_t> & input);
+    bool set_tensor(const std::vector<uint8_t> & input);
+    bool get_tensor(const std::vector<uint8_t> & input, std::vector<uint8_t> & output);
+    bool copy_tensor(const std::vector<uint8_t> & input, std::vector<uint8_t> & output);
+    bool graph_compute(const std::vector<uint8_t> & input, std::vector<uint8_t> & output);
+
+private:
+    ggml_tensor * deserialize_tensor(struct ggml_context * ctx, const rpc_tensor * tensor);
+    ggml_tensor * create_node(uint64_t id,
+                              struct ggml_context * ctx,
+                              const std::unordered_map<uint64_t, const rpc_tensor*> & tensor_ptrs,
+                              std::unordered_map<uint64_t, struct ggml_tensor*> & tensor_map);
+
+
+    ggml_backend_t backend;
+    std::unordered_set<ggml_backend_buffer_t> buffers;
+};
+
+bool rpc_server::alloc_buffer(const std::vector<uint8_t> & input, std::vector<uint8_t> & output) {
     // input serialization format: | size (8 bytes) |
+    if (input.size() != sizeof(uint64_t)) {
+        return false;
+    }
     uint64_t size;
     memcpy(&size, input.data(), sizeof(size));
     ggml_backend_buffer_type_t buft = ggml_backend_get_default_buffer_type(backend);
     ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, size);
-    uint64_t remote_ptr = reinterpret_cast<uint64_t>(buffer);
-    uint64_t remote_size = buffer->size;
-    GGML_PRINT_DEBUG("[%s] size: %" PRIu64 " -> remote_ptr: %" PRIx64 ", remote_size: %" PRIu64 "\n", __func__, size, remote_ptr, remote_size);
+    uint64_t remote_ptr = 0;
+    uint64_t remote_size = 0;
+    if (buffer != nullptr) {
+        remote_ptr = reinterpret_cast<uint64_t>(buffer);
+        remote_size = buffer->size;
+        GGML_PRINT_DEBUG("[%s] size: %" PRIu64 " -> remote_ptr: %" PRIx64 ", remote_size: %" PRIu64 "\n", __func__, size, remote_ptr, remote_size);
+        buffers.insert(buffer);
+    } else {
+        GGML_PRINT_DEBUG("[%s] size: %" PRIu64 " -> failed\n", __func__, size);
+    }
     // output serialization format: | remote_ptr (8 bytes) | remote_size (8 bytes) |
     output.resize(2*sizeof(uint64_t), 0);
     memcpy(output.data(), &remote_ptr, sizeof(remote_ptr));
     memcpy(output.data() + sizeof(uint64_t), &remote_size, sizeof(remote_size));
+    return true;
 }
 
-static void rpc_get_alignment(ggml_backend_t backend, std::vector<uint8_t> & output) {
+void rpc_server::get_alignment(std::vector<uint8_t> & output) {
     ggml_backend_buffer_type_t buft = ggml_backend_get_default_buffer_type(backend);
     size_t alignment = ggml_backend_buft_get_alignment(buft);
     GGML_PRINT_DEBUG("[%s] alignment: %lu\n", __func__, alignment);
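The new rpc_server class owns an unordered_set of every buffer it has allocated, so later commands can reject remote pointers the server never handed out, and the destructor frees whatever a disconnected client leaked. The same pattern in isolation (handle_registry is a hypothetical name; rpc_server tracks ggml_backend_buffer_t, not ints):

#include <cstdint>
#include <cstdio>
#include <unordered_set>

// Only handles this registry itself created may be freed through it.
class handle_registry {
public:
    uint64_t alloc() {
        int * p = new int(42);
        live.insert(reinterpret_cast<uint64_t>(p));
        return reinterpret_cast<uint64_t>(p);
    }
    bool release(uint64_t h) {
        if (live.find(h) == live.end()) {
            return false;  // unknown or already-freed handle: reject, don't crash
        }
        delete reinterpret_cast<int *>(h);
        live.erase(h);
        return true;
    }
    ~handle_registry() {  // like rpc_server::~rpc_server: free what the client leaked
        for (uint64_t h : live) {
            delete reinterpret_cast<int *>(h);
        }
    }
private:
    std::unordered_set<uint64_t> live;
};

int main() {
    handle_registry reg;
    uint64_t h = reg.alloc();
    std::printf("known handle freed: %d\n", reg.release(h));       // 1
    std::printf("bogus handle freed: %d\n", reg.release(0xdead));  // 0, rejected
    return 0;
}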
@@ -746,7 +781,7 @@ static void rpc_get_alignment(ggml_backend_t backend, std::vector<uint8_t> & out
     memcpy(output.data(), &alignment, sizeof(alignment));
 }
 
-static void rpc_get_max_size(ggml_backend_t backend, std::vector<uint8_t> & output) {
+void rpc_server::get_max_size(std::vector<uint8_t> & output) {
     ggml_backend_buffer_type_t buft = ggml_backend_get_default_buffer_type(backend);
     size_t max_size = ggml_backend_buft_get_max_size(buft);
     GGML_PRINT_DEBUG("[%s] max_size: %lu\n", __func__, max_size);
@@ -755,41 +790,90 @@ static void rpc_get_max_size(ggml_backend_t backend, std::vector<uint8_t> & outp
     memcpy(output.data(), &max_size, sizeof(max_size));
 }
 
-static void rpc_buffer_get_base(const std::vector<uint8_t> & input, std::vector<uint8_t> & output) {
+bool rpc_server::buffer_get_base(const std::vector<uint8_t> & input, std::vector<uint8_t> & output) {
     // input serialization format: | remote_ptr (8 bytes) |
+    if (input.size() != sizeof(uint64_t)) {
+        return false;
+    }
     uint64_t remote_ptr;
     memcpy(&remote_ptr, input.data(), sizeof(remote_ptr));
     GGML_PRINT_DEBUG("[%s] remote_ptr: %" PRIx64 "\n", __func__, remote_ptr);
     ggml_backend_buffer_t buffer = reinterpret_cast<ggml_backend_buffer_t>(remote_ptr);
+    if (buffers.find(buffer) == buffers.end()) {
+        GGML_PRINT_DEBUG("[%s] buffer not found\n", __func__);
+        return false;
+    }
     void * base = ggml_backend_buffer_get_base(buffer);
     // output serialization format: | base_ptr (8 bytes) |
     uint64_t base_ptr = reinterpret_cast<uint64_t>(base);
     output.resize(sizeof(uint64_t), 0);
     memcpy(output.data(), &base_ptr, sizeof(base_ptr));
+    return true;
 }
 
-static void rpc_free_buffer(const std::vector<uint8_t> & input) {
+bool rpc_server::free_buffer(const std::vector<uint8_t> & input) {
     // input serialization format: | remote_ptr (8 bytes) |
+    if (input.size() != sizeof(uint64_t)) {
+        return false;
+    }
     uint64_t remote_ptr;
     memcpy(&remote_ptr, input.data(), sizeof(remote_ptr));
     GGML_PRINT_DEBUG("[%s] remote_ptr: %" PRIx64 "\n", __func__, remote_ptr);
     ggml_backend_buffer_t buffer = reinterpret_cast<ggml_backend_buffer_t>(remote_ptr);
+    if (buffers.find(buffer) == buffers.end()) {
+        GGML_PRINT_DEBUG("[%s] buffer not found\n", __func__);
+        return false;
+    }
     ggml_backend_buffer_free(buffer);
+    buffers.erase(buffer);
+    return true;
 }
 
-static void rpc_buffer_clear(const std::vector<uint8_t> & input) {
+bool rpc_server::buffer_clear(const std::vector<uint8_t> & input) {
     // input serialization format: | remote_ptr (8 bytes) | value (1 byte) |
+    if (input.size() != sizeof(uint64_t) + sizeof(uint8_t)) {
+        return false;
+    }
     uint64_t remote_ptr;
     memcpy(&remote_ptr, input.data(), sizeof(remote_ptr));
     uint8_t value;
     memcpy(&value, input.data() + sizeof(uint64_t), sizeof(value));
     GGML_PRINT_DEBUG("[%s] remote_ptr: %" PRIx64 ", value: %u\n", __func__, remote_ptr, value);
     ggml_backend_buffer_t buffer = reinterpret_cast<ggml_backend_buffer_t>(remote_ptr);
+    if (buffers.find(buffer) == buffers.end()) {
+        GGML_PRINT_DEBUG("[%s] buffer not found\n", __func__);
+        return false;
+    }
     ggml_backend_buffer_clear(buffer, value);
+    return true;
 }
 
-static void rpc_set_tensor(const std::vector<uint8_t> & input) {
+ggml_tensor * rpc_server::deserialize_tensor(struct ggml_context * ctx, const rpc_tensor * tensor) {
+    ggml_tensor * result = ggml_new_tensor_4d(ctx, (ggml_type) tensor->type,
+        tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3]);
+    for (uint32_t i = 0; i < GGML_MAX_DIMS; i++) {
+        result->nb[i] = tensor->nb[i];
+    }
+    result->buffer = reinterpret_cast<ggml_backend_buffer_t>(tensor->buffer);
+    if (result->buffer && buffers.find(result->buffer) == buffers.end()) {
+        return nullptr;
+    }
+    result->op = (ggml_op) tensor->op;
+    for (uint32_t i = 0; i < GGML_MAX_OP_PARAMS / sizeof(int32_t); i++) {
+        result->op_params[i] = tensor->op_params[i];
+    }
+    result->flags = tensor->flags;
+    result->data = reinterpret_cast<void *>(tensor->data);
+    ggml_set_name(result, tensor->name);
+    return result;
+}
+
+
+bool rpc_server::set_tensor(const std::vector<uint8_t> & input) {
     // serialization format: | rpc_tensor | offset (8 bytes) | data (size bytes) |
+    if (input.size() < sizeof(rpc_tensor) + sizeof(uint64_t)) {
+        return false;
+    }
     const rpc_tensor * in_tensor = (const rpc_tensor *)input.data();
     uint64_t offset;
     memcpy(&offset, input.data() + sizeof(rpc_tensor), sizeof(offset));
802
886
  };
803
887
  struct ggml_context * ctx = ggml_init(params);
804
888
  ggml_tensor * tensor = deserialize_tensor(ctx, in_tensor);
889
+ if (tensor == nullptr) {
890
+ GGML_PRINT_DEBUG("[%s] error deserializing tensor\n", __func__);
891
+ ggml_free(ctx);
892
+ return false;
893
+ }
805
894
  GGML_PRINT_DEBUG("[%s] buffer: %p, data: %p, offset: %" PRIu64 ", size: %zu\n", __func__, (void*)tensor->buffer, tensor->data, offset, size);
806
895
  const void * data = input.data() + sizeof(rpc_tensor) + sizeof(offset);
807
896
  ggml_backend_tensor_set(tensor, data, offset, size);
808
897
  ggml_free(ctx);
898
+ return true;
809
899
  }
810
900
 
811
- static void rpc_get_tensor(const std::vector<uint8_t> & input, std::vector<uint8_t> & output) {
901
+ bool rpc_server::get_tensor(const std::vector<uint8_t> & input, std::vector<uint8_t> & output) {
812
902
  // serialization format: | rpc_tensor | offset (8 bytes) | size (8 bytes) |
903
+ if (input.size() != sizeof(rpc_tensor) + 2*sizeof(uint64_t)) {
904
+ return false;
905
+ }
813
906
  const rpc_tensor * in_tensor = (const rpc_tensor *)input.data();
814
907
  uint64_t offset;
815
908
  memcpy(&offset, input.data() + sizeof(rpc_tensor), sizeof(offset));
@@ -823,15 +916,24 @@ static void rpc_get_tensor(const std::vector<uint8_t> & input, std::vector<uint8
823
916
  };
824
917
  struct ggml_context * ctx = ggml_init(params);
825
918
  ggml_tensor * tensor = deserialize_tensor(ctx, in_tensor);
919
+ if (tensor == nullptr) {
920
+ GGML_PRINT_DEBUG("[%s] error deserializing tensor\n", __func__);
921
+ ggml_free(ctx);
922
+ return false;
923
+ }
826
924
  GGML_PRINT_DEBUG("[%s] buffer: %p, data: %p, offset: %" PRIu64 ", size: %" PRIu64 "\n", __func__, (void*)tensor->buffer, tensor->data, offset, size);
827
925
  // output serialization format: | data (size bytes) |
828
926
  output.resize(size, 0);
829
927
  ggml_backend_tensor_get(tensor, output.data(), offset, size);
830
928
  ggml_free(ctx);
929
+ return true;
831
930
  }
832
931
 
833
- static void rpc_copy_tensor(const std::vector<uint8_t> & input, std::vector<uint8_t> & output) {
932
+ bool rpc_server::copy_tensor(const std::vector<uint8_t> & input, std::vector<uint8_t> & output) {
834
933
  // serialization format: | rpc_tensor src | rpc_tensor dst |
934
+ if (input.size() != 2*sizeof(rpc_tensor)) {
935
+ return false;
936
+ }
835
937
  const rpc_tensor * rpc_src = (const rpc_tensor *)input.data();
836
938
  const rpc_tensor * rpc_dst = (const rpc_tensor *)(input.data() + sizeof(rpc_src));
837
939
 
@@ -843,18 +945,24 @@ static void rpc_copy_tensor(const std::vector<uint8_t> & input, std::vector<uint
843
945
  struct ggml_context * ctx = ggml_init(params);
844
946
  ggml_tensor * src = deserialize_tensor(ctx, rpc_src);
845
947
  ggml_tensor * dst = deserialize_tensor(ctx, rpc_dst);
948
+ if (src == nullptr || dst == nullptr) {
949
+ GGML_PRINT_DEBUG("[%s] error deserializing tensors\n", __func__);
950
+ ggml_free(ctx);
951
+ return false;
952
+ }
846
953
  GGML_PRINT_DEBUG("[%s] src->buffer: %p, dst->buffer: %p\n", __func__, (void*)src->buffer, (void*)dst->buffer);
847
954
  bool result = ggml_backend_buffer_copy_tensor(src, dst);
848
955
  // output serialization format: | result (1 byte) |
849
956
  output.resize(1, 0);
850
957
  output[0] = result;
851
958
  ggml_free(ctx);
959
+ return true;
852
960
  }
853
961
 
854
- static struct ggml_tensor * create_node(uint64_t id,
855
- struct ggml_context * ctx,
856
- const std::unordered_map<uint64_t, const rpc_tensor*> & tensor_ptrs,
857
- std::unordered_map<uint64_t, struct ggml_tensor*> & tensor_map) {
962
+ ggml_tensor * rpc_server::create_node(uint64_t id,
963
+ struct ggml_context * ctx,
964
+ const std::unordered_map<uint64_t, const rpc_tensor*> & tensor_ptrs,
965
+ std::unordered_map<uint64_t, struct ggml_tensor*> & tensor_map) {
858
966
  if (id == 0) {
859
967
  return nullptr;
860
968
  }
@@ -863,6 +971,9 @@ static struct ggml_tensor * create_node(uint64_t id,
863
971
  }
864
972
  const rpc_tensor * tensor = tensor_ptrs.at(id);
865
973
  struct ggml_tensor * result = deserialize_tensor(ctx, tensor);
974
+ if (result == nullptr) {
975
+ return nullptr;
976
+ }
866
977
  tensor_map[id] = result;
867
978
  for (int i = 0; i < GGML_MAX_SRC; i++) {
868
979
  result->src[i] = create_node(tensor->src[i], ctx, tensor_ptrs, tensor_map);
@@ -872,14 +983,23 @@ static struct ggml_tensor * create_node(uint64_t id,
872
983
  return result;
873
984
  }
874
985
 
875
- static void rpc_graph_compute(ggml_backend_t backend, const std::vector<uint8_t> & input, std::vector<uint8_t> & output) {
986
+ bool rpc_server::graph_compute(const std::vector<uint8_t> & input, std::vector<uint8_t> & output) {
876
987
  // serialization format:
877
988
  // | n_nodes (4 bytes) | nodes (n_nodes * sizeof(uint64_t) | n_tensors (4 bytes) | tensors (n_tensors * sizeof(rpc_tensor)) |
989
+ if (input.size() < sizeof(uint32_t)) {
990
+ return false;
991
+ }
878
992
  uint32_t n_nodes;
879
993
  memcpy(&n_nodes, input.data(), sizeof(n_nodes));
994
+ if (input.size() < sizeof(uint32_t) + n_nodes*sizeof(uint64_t) + sizeof(uint32_t)) {
995
+ return false;
996
+ }
880
997
  const uint64_t * nodes = (const uint64_t *)(input.data() + sizeof(n_nodes));
881
998
  uint32_t n_tensors;
882
999
  memcpy(&n_tensors, input.data() + sizeof(n_nodes) + n_nodes*sizeof(uint64_t), sizeof(n_tensors));
1000
+ if (input.size() < sizeof(uint32_t) + n_nodes*sizeof(uint64_t) + sizeof(uint32_t) + n_tensors*sizeof(rpc_tensor)) {
1001
+ return false;
1002
+ }
883
1003
  const rpc_tensor * tensors = (const rpc_tensor *)(input.data() + sizeof(n_nodes) + n_nodes*sizeof(uint64_t) + sizeof(n_tensors));
884
1004
  GGML_PRINT_DEBUG("[%s] n_nodes: %u, n_tensors: %u\n", __func__, n_nodes, n_tensors);
885
1005
 
@@ -905,9 +1025,17 @@ static void rpc_graph_compute(ggml_backend_t backend, const std::vector<uint8_t>
     output.resize(1, 0);
     output[0] = status;
     ggml_free(ctx);
+    return true;
+}
+
+rpc_server::~rpc_server() {
+    for (auto buffer : buffers) {
+        ggml_backend_buffer_free(buffer);
+    }
 }
 
 static void rpc_serve_client(ggml_backend_t backend, sockfd_t sockfd, size_t free_mem, size_t total_mem) {
+    rpc_server server(backend);
     while (true) {
         uint8_t cmd;
         if (!recv_data(sockfd, &cmd, 1)) {
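The length checks added to graph_compute two hunks up validate each length prefix against input.size() before touching the payload it describes, so a client-supplied n_nodes or n_tensors cannot drive reads past the end of the received buffer. A self-contained sketch of that incremental validation (parse_counted is a hypothetical helper, not the package's code):

#include <cstdint>
#include <cstdio>
#include <cstring>
#include <vector>

// Check that each length prefix fits in the received buffer before reading
// the payload it describes; a malicious count must not cause an OOB read.
static bool parse_counted(const std::vector<uint8_t> & buf, std::vector<uint64_t> & out) {
    if (buf.size() < sizeof(uint32_t)) {
        return false;  // not even room for the count
    }
    uint32_t n;
    memcpy(&n, buf.data(), sizeof(n));
    if (buf.size() < sizeof(uint32_t) + (size_t)n * sizeof(uint64_t)) {
        return false;  // claimed payload exceeds what was actually received
    }
    out.resize(n);
    memcpy(out.data(), buf.data() + sizeof(n), (size_t)n * sizeof(uint64_t));
    return true;
}

int main() {
    std::vector<uint8_t> msg(sizeof(uint32_t) + 2 * sizeof(uint64_t));
    uint32_t n = 2;
    uint64_t vals[2] = {7, 9};
    memcpy(msg.data(), &n, sizeof(n));
    memcpy(msg.data() + sizeof(n), vals, sizeof(vals));

    std::vector<uint64_t> out;
    bool ok = parse_counted(msg, out);
    std::printf("well-formed: %d (out[1] = %llu)\n", ok, (unsigned long long) out[1]);

    msg.resize(6);  // truncate: the count survives but the payload does not
    std::printf("truncated:   %d\n", parse_counted(msg, out));  // rejected, not over-read
    return 0;
}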
@@ -923,45 +1051,46 @@ static void rpc_serve_client(ggml_backend_t backend, sockfd_t sockfd, size_t fre
         if (!recv_data(sockfd, input.data(), input_size)) {
             break;
         }
+        bool ok = true;
         switch (cmd) {
             case ALLOC_BUFFER: {
-                rpc_alloc_buffer(backend, input, output);
+                ok = server.alloc_buffer(input, output);
                 break;
             }
             case GET_ALIGNMENT: {
-                rpc_get_alignment(backend, output);
+                server.get_alignment(output);
                 break;
             }
             case GET_MAX_SIZE: {
-                rpc_get_max_size(backend, output);
+                server.get_max_size(output);
                 break;
             }
             case BUFFER_GET_BASE: {
-                rpc_buffer_get_base(input, output);
+                ok = server.buffer_get_base(input, output);
                 break;
            }
             case FREE_BUFFER: {
-                rpc_free_buffer(input);
+                ok = server.free_buffer(input);
                 break;
             }
             case BUFFER_CLEAR: {
-                rpc_buffer_clear(input);
+                ok = server.buffer_clear(input);
                 break;
             }
             case SET_TENSOR: {
-                rpc_set_tensor(input);
+                ok = server.set_tensor(input);
                 break;
             }
             case GET_TENSOR: {
-                rpc_get_tensor(input, output);
+                ok = server.get_tensor(input, output);
                 break;
             }
             case COPY_TENSOR: {
-                rpc_copy_tensor(input, output);
+                ok = server.copy_tensor(input, output);
                 break;
             }
             case GRAPH_COMPUTE: {
-                rpc_graph_compute(backend, input, output);
+                ok = server.graph_compute(input, output);
                 break;
             }
             case GET_DEVICE_MEMORY: {
@@ -973,9 +1102,12 @@ static void rpc_serve_client(ggml_backend_t backend, sockfd_t sockfd, size_t fre
             }
             default: {
                 fprintf(stderr, "Unknown command: %d\n", cmd);
-                return;
+                ok = false;
             }
         }
+        if (!ok) {
+            break;
+        }
         uint64_t output_size = output.size();
         if (!send_data(sockfd, &output_size, sizeof(output_size))) {
             break;