llama_cpp 0.16.1 → 0.16.2
- checksums.yaml +4 -4
- data/CHANGELOG.md +7 -0
- data/ext/llama_cpp/extconf.rb +1 -0
- data/ext/llama_cpp/llama_cpp.cpp +12 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +2 -0
- data/vendor/tmp/llama.cpp/Makefile +10 -2
- data/vendor/tmp/llama.cpp/ggml-backend.c +14 -3
- data/vendor/tmp/llama.cpp/ggml-backend.h +3 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/mmq.cu +10 -10
- data/vendor/tmp/llama.cpp/ggml-cuda/mmvq.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/unary.cu +28 -0
- data/vendor/tmp/llama.cpp/ggml-impl.h +1 -1
- data/vendor/tmp/llama.cpp/ggml-metal.m +6 -0
- data/vendor/tmp/llama.cpp/ggml-quants.c +982 -368
- data/vendor/tmp/llama.cpp/ggml-rpc.cpp +8 -3
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +2124 -13202
- data/vendor/tmp/llama.cpp/ggml-sycl.h +1 -10
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +27564 -23876
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +278 -366
- data/vendor/tmp/llama.cpp/ggml.c +67 -150
- data/vendor/tmp/llama.cpp/ggml.h +6 -0
- data/vendor/tmp/llama.cpp/llama.cpp +530 -237
- data/vendor/tmp/llama.cpp/llama.h +5 -1
- data/vendor/tmp/llama.cpp/sgemm.cpp +2 -0
- data/vendor/tmp/llama.cpp/unicode-data.cpp +851 -801
- data/vendor/tmp/llama.cpp/unicode.cpp +33 -19
- data/vendor/tmp/llama.cpp/unicode.h +1 -1
- metadata +2 -2
data/vendor/tmp/llama.cpp/ggml-rpc.cpp

@@ -73,9 +73,13 @@ struct rpc_tensor {
     uint64_t view_offs;
     uint64_t data;
     char name[GGML_MAX_NAME];
+
+    char padding[4];
 };
 #pragma pack(pop)
 
+static_assert(sizeof(rpc_tensor) % 8 == 0, "rpc_tensor size must be multiple of 8");
+
 // RPC commands
 enum rpc_cmd {
     ALLOC_BUFFER = 0,
@@ -599,9 +603,8 @@ static void serialize_graph(const ggml_cgraph * cgraph, std::vector<uint8_t> & output)
     int output_size = sizeof(uint32_t) + n_nodes * sizeof(uint64_t) + sizeof(uint32_t) + n_tensors * sizeof(rpc_tensor);
     output.resize(output_size, 0);
     memcpy(output.data(), &n_nodes, sizeof(n_nodes));
-    uint64_t * out_nodes = (uint64_t *)(output.data() + sizeof(n_nodes));
     for (uint32_t i = 0; i < n_nodes; i++) {
-
+        memcpy(output.data() + sizeof(n_nodes) + i * sizeof(uint64_t), &cgraph->nodes[i], sizeof(uint64_t));
     }
     uint32_t * out_ntensors = (uint32_t *)(output.data() + sizeof(n_nodes) + n_nodes * sizeof(uint64_t));
     *out_ntensors = n_tensors;
@@ -1036,7 +1039,9 @@ bool rpc_server::graph_compute(const std::vector<uint8_t> & input, std::vector<uint8_t> & output)
     }
     std::unordered_map<uint64_t, ggml_tensor*> tensor_map;
     for (uint32_t i = 0; i < n_nodes; i++) {
-
+        int64_t id;
+        memcpy(&id, &nodes[i], sizeof(id));
+        graph->nodes[i] = create_node(id, ctx, tensor_ptrs, tensor_map);
     }
     ggml_status status = ggml_backend_graph_compute(backend, graph);
     // output serialization format: | status (1 byte) |