@fugood/llama.node 0.3.6 → 0.3.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (186)
  1. package/README.md +17 -2
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-cuda/arm64/llama-node.node +0 -0
  7. package/bin/linux-cuda/x64/llama-node.node +0 -0
  8. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  9. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  10. package/bin/win32/arm64/llama-node.node +0 -0
  11. package/bin/win32/arm64/node.lib +0 -0
  12. package/bin/win32/x64/llama-node.node +0 -0
  13. package/bin/win32/x64/node.lib +0 -0
  14. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/arm64/node.lib +0 -0
  16. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  17. package/bin/win32-vulkan/x64/node.lib +0 -0
  18. package/lib/binding.ts +3 -1
  19. package/lib/index.js +16 -1
  20. package/lib/index.ts +16 -0
  21. package/package.json +1 -1
  22. package/src/EmbeddingWorker.cpp +4 -3
  23. package/src/LlamaCompletionWorker.cpp +4 -2
  24. package/src/LlamaContext.cpp +61 -6
  25. package/src/LlamaContext.h +1 -0
  26. package/src/common.hpp +6 -11
  27. package/src/llama.cpp/.github/workflows/build.yml +19 -17
  28. package/src/llama.cpp/.github/workflows/docker.yml +77 -30
  29. package/src/llama.cpp/.github/workflows/editorconfig.yml +3 -1
  30. package/src/llama.cpp/.github/workflows/server.yml +22 -3
  31. package/src/llama.cpp/CMakeLists.txt +49 -24
  32. package/src/llama.cpp/common/arg.cpp +82 -26
  33. package/src/llama.cpp/common/arg.h +3 -0
  34. package/src/llama.cpp/common/common.cpp +192 -72
  35. package/src/llama.cpp/common/common.h +51 -18
  36. package/src/llama.cpp/common/ngram-cache.cpp +12 -12
  37. package/src/llama.cpp/common/ngram-cache.h +2 -2
  38. package/src/llama.cpp/common/sampling.cpp +11 -6
  39. package/src/llama.cpp/common/speculative.cpp +18 -15
  40. package/src/llama.cpp/docs/build.md +2 -0
  41. package/src/llama.cpp/examples/batched/batched.cpp +9 -7
  42. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +3 -3
  43. package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +10 -8
  44. package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +11 -8
  45. package/src/llama.cpp/examples/cvector-generator/mean.hpp +1 -1
  46. package/src/llama.cpp/examples/cvector-generator/pca.hpp +1 -1
  47. package/src/llama.cpp/examples/embedding/embedding.cpp +8 -7
  48. package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +7 -6
  49. package/src/llama.cpp/examples/export-lora/export-lora.cpp +8 -7
  50. package/src/llama.cpp/examples/gguf/gguf.cpp +10 -6
  51. package/src/llama.cpp/examples/gguf-hash/gguf-hash.cpp +1 -0
  52. package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +8 -7
  53. package/src/llama.cpp/examples/gritlm/gritlm.cpp +13 -10
  54. package/src/llama.cpp/examples/imatrix/imatrix.cpp +13 -12
  55. package/src/llama.cpp/examples/infill/infill.cpp +23 -24
  56. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +44 -13
  57. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +11 -6
  58. package/src/llama.cpp/examples/llava/clip.cpp +4 -2
  59. package/src/llama.cpp/examples/llava/llava-cli.cpp +9 -6
  60. package/src/llama.cpp/examples/llava/llava.cpp +2 -2
  61. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +8 -4
  62. package/src/llama.cpp/examples/llava/qwen2vl-cli.cpp +11 -8
  63. package/src/llama.cpp/examples/lookahead/lookahead.cpp +6 -7
  64. package/src/llama.cpp/examples/lookup/lookup-create.cpp +4 -9
  65. package/src/llama.cpp/examples/lookup/lookup-stats.cpp +3 -7
  66. package/src/llama.cpp/examples/lookup/lookup.cpp +5 -6
  67. package/src/llama.cpp/examples/main/main.cpp +51 -29
  68. package/src/llama.cpp/examples/parallel/parallel.cpp +5 -6
  69. package/src/llama.cpp/examples/passkey/passkey.cpp +7 -5
  70. package/src/llama.cpp/examples/perplexity/perplexity.cpp +37 -23
  71. package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +12 -14
  72. package/src/llama.cpp/examples/retrieval/retrieval.cpp +8 -8
  73. package/src/llama.cpp/examples/rpc/rpc-server.cpp +12 -0
  74. package/src/llama.cpp/examples/run/CMakeLists.txt +1 -1
  75. package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.cpp +1351 -0
  76. package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.h +114 -0
  77. package/src/llama.cpp/examples/run/run.cpp +175 -61
  78. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +4 -25
  79. package/src/llama.cpp/examples/server/CMakeLists.txt +1 -0
  80. package/src/llama.cpp/examples/server/httplib.h +1295 -409
  81. package/src/llama.cpp/examples/server/server.cpp +387 -181
  82. package/src/llama.cpp/examples/server/tests/requirements.txt +1 -0
  83. package/src/llama.cpp/examples/server/utils.hpp +170 -58
  84. package/src/llama.cpp/examples/simple/simple.cpp +9 -8
  85. package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +16 -12
  86. package/src/llama.cpp/examples/speculative/speculative.cpp +22 -23
  87. package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +8 -12
  88. package/src/llama.cpp/examples/tokenize/tokenize.cpp +17 -5
  89. package/src/llama.cpp/examples/tts/tts.cpp +64 -23
  90. package/src/llama.cpp/ggml/CMakeLists.txt +5 -21
  91. package/src/llama.cpp/ggml/include/ggml-backend.h +2 -0
  92. package/src/llama.cpp/ggml/include/ggml-cpp.h +1 -0
  93. package/src/llama.cpp/ggml/include/ggml.h +36 -145
  94. package/src/llama.cpp/ggml/include/gguf.h +202 -0
  95. package/src/llama.cpp/ggml/src/CMakeLists.txt +6 -3
  96. package/src/llama.cpp/ggml/src/ggml-alloc.c +5 -0
  97. package/src/llama.cpp/ggml/src/ggml-backend-impl.h +0 -1
  98. package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +79 -49
  99. package/src/llama.cpp/ggml/src/ggml-backend.cpp +5 -2
  100. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +33 -23
  101. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +57 -72
  102. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +87 -2
  103. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +335 -66
  104. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +10 -2
  105. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +1090 -378
  106. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.h +2 -2
  107. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/cuda.h +1 -0
  108. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +3 -0
  109. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +3 -0
  110. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +3 -1
  111. package/src/llama.cpp/ggml/src/ggml-impl.h +11 -16
  112. package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +16 -0
  113. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +6 -6
  114. package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +154 -35
  115. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +1 -0
  116. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +9 -3
  117. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +18 -0
  118. package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +3 -2
  119. package/src/llama.cpp/ggml/src/ggml-sycl/concat.hpp +1 -2
  120. package/src/llama.cpp/ggml/src/ggml-sycl/conv.cpp +3 -2
  121. package/src/llama.cpp/ggml/src/ggml-sycl/conv.hpp +1 -2
  122. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +40 -95
  123. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +48 -48
  124. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +24 -24
  125. package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +238 -164
  126. package/src/llama.cpp/ggml/src/ggml-sycl/gla.cpp +105 -0
  127. package/src/llama.cpp/ggml/src/ggml-sycl/gla.hpp +8 -0
  128. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +3 -3
  129. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.hpp +1 -2
  130. package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +3 -2
  131. package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.hpp +1 -2
  132. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +7 -5
  133. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.hpp +1 -2
  134. package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +74 -4
  135. package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +314 -116
  136. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +4 -2
  137. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +9 -3
  138. package/src/llama.cpp/ggml/src/ggml.c +117 -1327
  139. package/src/llama.cpp/ggml/src/gguf.cpp +1329 -0
  140. package/src/llama.cpp/include/llama-cpp.h +6 -1
  141. package/src/llama.cpp/include/llama.h +138 -75
  142. package/src/llama.cpp/src/CMakeLists.txt +13 -1
  143. package/src/llama.cpp/src/llama-adapter.cpp +347 -0
  144. package/src/llama.cpp/src/llama-adapter.h +74 -0
  145. package/src/llama.cpp/src/llama-arch.cpp +1487 -0
  146. package/src/llama.cpp/src/llama-arch.h +400 -0
  147. package/src/llama.cpp/src/llama-batch.cpp +368 -0
  148. package/src/llama.cpp/src/llama-batch.h +88 -0
  149. package/src/llama.cpp/src/llama-chat.cpp +578 -0
  150. package/src/llama.cpp/src/llama-chat.h +52 -0
  151. package/src/llama.cpp/src/llama-context.cpp +1775 -0
  152. package/src/llama.cpp/src/llama-context.h +128 -0
  153. package/src/llama.cpp/src/llama-cparams.cpp +1 -0
  154. package/src/llama.cpp/src/llama-cparams.h +37 -0
  155. package/src/llama.cpp/src/llama-grammar.cpp +5 -4
  156. package/src/llama.cpp/src/llama-grammar.h +3 -1
  157. package/src/llama.cpp/src/llama-hparams.cpp +71 -0
  158. package/src/llama.cpp/src/llama-hparams.h +139 -0
  159. package/src/llama.cpp/src/llama-impl.cpp +167 -0
  160. package/src/llama.cpp/src/llama-impl.h +16 -136
  161. package/src/llama.cpp/src/llama-kv-cache.cpp +718 -0
  162. package/src/llama.cpp/src/llama-kv-cache.h +218 -0
  163. package/src/llama.cpp/src/llama-mmap.cpp +589 -0
  164. package/src/llama.cpp/src/llama-mmap.h +67 -0
  165. package/src/llama.cpp/src/llama-model-loader.cpp +1124 -0
  166. package/src/llama.cpp/src/llama-model-loader.h +167 -0
  167. package/src/llama.cpp/src/llama-model.cpp +3953 -0
  168. package/src/llama.cpp/src/llama-model.h +370 -0
  169. package/src/llama.cpp/src/llama-quant.cpp +934 -0
  170. package/src/llama.cpp/src/llama-quant.h +1 -0
  171. package/src/llama.cpp/src/llama-sampling.cpp +147 -32
  172. package/src/llama.cpp/src/llama-sampling.h +3 -19
  173. package/src/llama.cpp/src/llama-vocab.cpp +1832 -575
  174. package/src/llama.cpp/src/llama-vocab.h +97 -142
  175. package/src/llama.cpp/src/llama.cpp +7160 -20314
  176. package/src/llama.cpp/src/unicode.cpp +8 -3
  177. package/src/llama.cpp/tests/CMakeLists.txt +2 -0
  178. package/src/llama.cpp/tests/test-autorelease.cpp +3 -3
  179. package/src/llama.cpp/tests/test-backend-ops.cpp +370 -59
  180. package/src/llama.cpp/tests/test-chat-template.cpp +162 -125
  181. package/src/llama.cpp/tests/test-gguf.cpp +222 -187
  182. package/src/llama.cpp/tests/test-model-load-cancel.cpp +1 -1
  183. package/src/llama.cpp/tests/test-sampling.cpp +0 -1
  184. package/src/llama.cpp/tests/test-tokenizer-0.cpp +4 -4
  185. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +9 -7
  186. package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +8 -6
package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.h
@@ -5,8 +5,8 @@
  extern "C" {
  #endif

- bool llamafile_sgemm(int64_t, int64_t, int64_t, const void *, int64_t,
-                      const void *, int64_t, void *, int64_t, int, int,
+ bool llamafile_sgemm(const struct ggml_compute_params * params, int64_t, int64_t, int64_t,
+                      const void *, int64_t, const void *, int64_t, void *, int64_t,
                       int, int, int);

  #ifdef __cplusplus
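The header change above threads the ggml compute-params pointer (which carries the worker-thread index and count) through llamafile_sgemm instead of passing two bare ints. A minimal sketch of a call under the new declaration; parameter names, include paths, and the leading-dimension choices are assumptions, since the diff only shows the unnamed prototype:

    // Sketch only: names and include paths are illustrative, not taken from the package.
    #include "ggml.h"      // ggml type ids such as GGML_TYPE_F32
    #include "sgemm.h"     // new llamafile_sgemm declaration shown above

    static bool sgemm_f32(const struct ggml_compute_params * params,   // thread info now travels in here
                          int64_t m, int64_t n, int64_t k,
                          const float * A, const float * B, float * C) {
        return llamafile_sgemm(params, m, n, k,
                               A, k,   // left operand and an (illustrative) leading dimension
                               B, k,   // right operand and an (illustrative) leading dimension
                               C, m,   // output and an (illustrative) leading dimension
                               GGML_TYPE_F32, GGML_TYPE_F32, GGML_TYPE_F32);   // trailing ints: element types of A, B, C
    }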
package/src/llama.cpp/ggml/src/ggml-cuda/vendors/cuda.h
@@ -3,6 +3,7 @@
  #include <cuda_runtime.h>
  #include <cuda.h>
  #include <cublas_v2.h>
+ #include <cuda_bf16.h>
  #include <cuda_fp16.h>

  #if CUDART_VERSION < 11020
package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h
@@ -3,6 +3,7 @@
  #include <hip/hip_runtime.h>
  #include <hipblas/hipblas.h>
  #include <hip/hip_fp16.h>
+ #include <hip/hip_bfloat16.h>
  #ifdef __HIP_PLATFORM_AMD__
  // for rocblas_initialize()
  #include "rocblas/rocblas.h"
@@ -121,6 +122,8 @@
  #define __has_builtin(x) 0
  #endif

+ typedef hip_bfloat16 nv_bfloat16;
+
  typedef int8_t int8x4_t __attribute__((ext_vector_type(4)));
  typedef uint8_t uint8x4_t __attribute__((ext_vector_type(4)));
  static __device__ __forceinline__ int __vsubss4(const int a, const int b) {
package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h
@@ -3,6 +3,7 @@
  #include <musa_runtime.h>
  #include <musa.h>
  #include <mublas.h>
+ #include <musa_bf16.h>
  #include <musa_fp16.h>
  #define CUBLAS_COMPUTE_16F CUDA_R_16F
  #define CUBLAS_COMPUTE_32F CUDA_R_32F
@@ -132,3 +133,5 @@
  #define cudaKernelNodeParams musaKernelNodeParams
  #define cudaStreamCaptureModeRelaxed musaStreamCaptureModeRelaxed
  #define cudaStreamEndCapture musaStreamEndCapture
+
+ typedef mt_bfloat16 nv_bfloat16;
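Together with the cuda_bf16.h include above, the HIP and MUSA vendor headers now alias their native bfloat16 types to the CUDA spelling, so source shared across the three builds can name the type uniformly. A one-line sketch (not part of the diff) of what that enables:

    // Illustrative only: resolves to __nv_bfloat16 on CUDA, hip_bfloat16 on HIP,
    // and mt_bfloat16 on MUSA, depending on which vendor header is in effect.
    nv_bfloat16 bf16_value;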
package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt
@@ -70,7 +70,9 @@ ggml_add_backend_library(ggml-hip
  )

  # TODO: do not use CUDA definitions for HIP
- target_compile_definitions(ggml PUBLIC GGML_USE_CUDA)
+ if (NOT GGML_BACKEND_DL)
+     target_compile_definitions(ggml PUBLIC GGML_USE_CUDA)
+ endif()

  add_compile_definitions(GGML_USE_HIP)

package/src/llama.cpp/ggml/src/ggml-impl.h
@@ -3,6 +3,8 @@
  // GGML internal header

  #include "ggml.h"
+ #include "gguf.h"
+
  #include <assert.h>
  #include <math.h>
  #include <stdlib.h> // load `stdlib.h` before other headers to work around MinGW bug: https://sourceforge.net/p/mingw-w64/bugs/192/
@@ -551,22 +553,15 @@ static inline ggml_bf16_t ggml_compute_fp32_to_bf16(float s) {
  #define GGML_FP32_TO_BF16(x) ggml_compute_fp32_to_bf16(x)
  #define GGML_BF16_TO_FP32(x) ggml_compute_bf16_to_fp32(x)

- // expose GGUF internals for test code
-
- GGML_API size_t gguf_type_size(enum gguf_type type);
-
- GGML_API struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_params params);
-
- struct gguf_buf {
-     void * data;
-     size_t size;
-     size_t offset;
- };
- GGML_API struct gguf_buf gguf_buf_init(size_t size);
- GGML_API void gguf_buf_free(struct gguf_buf buf);
-
- GGML_API void gguf_write_to_buf(const struct gguf_context * ctx, struct gguf_buf * buf, bool only_meta);
-
  #ifdef __cplusplus
  }
  #endif
+
+ #ifdef __cplusplus
+ #include <vector>
+
+ // expose GGUF internals for test code
+ GGML_API size_t gguf_type_size(enum gguf_type type);
+ GGML_API struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_params params);
+ GGML_API void gguf_write_to_buf(const struct gguf_context * ctx, std::vector<int8_t> & buf, bool only_meta);
+ #endif // __cplusplus
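After this hunk the GGUF test hooks are C++-only and the in-memory writer fills a std::vector<int8_t> rather than the removed gguf_buf struct. A test-style sketch of how those internals could be exercised; the include paths and helper name are assumptions, only the three declarations come from the diff:

    // Sketch only: mirrors the declarations above, not code from the package.
    #include <cstdio>
    #include <vector>
    #include "ggml.h"
    #include "gguf.h"
    #include "ggml-impl.h"   // internal header exposing the hooks (path is an assumption)

    static std::vector<int8_t> read_back_metadata(const char * path) {
        std::vector<int8_t> buf;
        FILE * f = std::fopen(path, "rb");
        if (f == nullptr) {
            return buf;
        }
        struct gguf_init_params params = { /*.no_alloc =*/ true, /*.ctx =*/ nullptr };
        struct gguf_context * gguf = gguf_init_from_file_impl(f, params);
        if (gguf != nullptr) {
            gguf_write_to_buf(gguf, buf, /*only_meta =*/ true);   // vector grows as needed
            gguf_free(gguf);
        }
        std::fclose(f);
        return buf;
    }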
package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt
@@ -103,3 +103,19 @@ else()
      DEPENDS ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib
  )
  endif() # GGML_METAL_EMBED_LIBRARY
+
+ if (NOT GGML_METAL_EMBED_LIBRARY)
+     install(
+         FILES src/ggml-metal/ggml-metal.metal
+         PERMISSIONS
+             OWNER_READ
+             OWNER_WRITE
+             GROUP_READ
+             WORLD_READ
+         DESTINATION ${CMAKE_INSTALL_BINDIR})
+
+     install(
+         FILES ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib
+         DESTINATION ${CMAKE_INSTALL_BINDIR}
+     )
+ endif()
package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp
@@ -2744,13 +2744,13 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
      cl_image_format img_fmt_1d;
      cl_image_desc img_desc_1d;
      cl_buffer_region region;
-     cl_mem A_image1d;
-     cl_mem B_image1d;
-     cl_mem B_sub_buffer;
-     cl_mem C_d;
+     cl_mem A_image1d = nullptr;
+     cl_mem B_image1d = nullptr;
+     cl_mem B_sub_buffer = nullptr;
+     cl_mem C_d = nullptr;
      // for B transpose
-     cl_mem B_d;
-     cl_mem B_d_input_image;
+     cl_mem B_d = nullptr;
+     cl_mem B_d_input_image = nullptr;
      // <--------------------------------------------> //

      // define matrix dimensions
package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp
@@ -27,15 +27,6 @@
  #endif
  #include <cstring>

- #define UNUSED GGML_UNUSED
-
- #define GGML_DEBUG 0
- #if (GGML_DEBUG >= 1)
- #define GGML_PRINT_DEBUG(...) printf(__VA_ARGS__)
- #else
- #define GGML_PRINT_DEBUG(...)
- #endif
-
  #ifdef _WIN32
  typedef SOCKET sockfd_t;
  using ssize_t = __int64;
@@ -93,9 +84,23 @@ enum rpc_cmd {
      RPC_CMD_COPY_TENSOR,
      RPC_CMD_GRAPH_COMPUTE,
      RPC_CMD_GET_DEVICE_MEMORY,
+     RPC_CMD_INIT_TENSOR,
+     RPC_CMD_GET_ALLOC_SIZE,
      RPC_CMD_COUNT,
  };

+ struct rpc_msg_get_alloc_size_req {
+     rpc_tensor tensor;
+ };
+
+ struct rpc_msg_get_alloc_size_rsp {
+     uint64_t alloc_size;
+ };
+
+ struct rpc_msg_init_tensor_req {
+     rpc_tensor tensor;
+ };
+
  struct rpc_msg_alloc_buffer_req {
      uint64_t size;
  };
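Both new commands are fixed-size request/response exchanges built from the structs above. A client-side sketch of the GET_ALLOC_SIZE round trip, using the same send_rpc_cmd helper that the hunks further down in ggml-rpc.cpp use; the wrapper name is hypothetical, the calls mirror the buffer-type hunk below:

    // Sketch only: the function name is illustrative.
    static uint64_t remote_alloc_size(const std::shared_ptr<socket_t> & sock, const ggml_tensor * tensor) {
        rpc_msg_get_alloc_size_req request;
        request.tensor = serialize_tensor(tensor);            // pack the tensor descriptor

        rpc_msg_get_alloc_size_rsp response;
        bool status = send_rpc_cmd(sock, RPC_CMD_GET_ALLOC_SIZE,
                                   &request, sizeof(request),      // fixed-size request
                                   &response, sizeof(response));   // fixed-size reply
        GGML_ASSERT(status);

        return response.alloc_size;                           // padded size computed on the server
    }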
@@ -397,7 +402,7 @@ static std::shared_ptr<socket_t> get_socket(const std::string & endpoint) {
          initialized = true;
      }
  #else
-     UNUSED(initialized);
+     GGML_UNUSED(initialized);
  #endif
      auto sock = socket_connect(host.c_str(), port);
      if (sock == nullptr) {
@@ -461,10 +466,18 @@ static rpc_tensor serialize_tensor(const ggml_tensor * tensor) {
  }

  static void ggml_backend_rpc_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
-     UNUSED(buffer);
-     if (ggml_is_quantized(tensor->type)) {
-         // TODO: this check is due to MATRIX_ROW_PADDING in CUDA and should be generalized
-         GGML_ASSERT(tensor->ne[0] % 512 == 0 && "unsupported quantized tensor");
+     ggml_backend_rpc_buffer_context * ctx = (ggml_backend_rpc_buffer_context *)buffer->context;
+
+     // CUDA backend on the server pads everything to 512 due to CUDA limitations.
+     // Due to bandwidth constraints, we only call the server init tensor functions if necessary.
+     // In particular, only quantized tensors need padding
+     if (ggml_is_quantized(tensor->type) && (tensor->ne[0] % 512 != 0) && (tensor->view_src == nullptr)) {
+         rpc_msg_init_tensor_req request;
+
+         request.tensor = serialize_tensor(tensor);
+
+         bool status = send_rpc_cmd(ctx->sock, RPC_CMD_INIT_TENSOR, &request, sizeof(request), nullptr, 0);
+         GGML_ASSERT(status);
      }
  }

@@ -577,8 +590,23 @@ static size_t ggml_backend_rpc_get_max_size(ggml_backend_buffer_type_t buft) {
  }

  static size_t ggml_backend_rpc_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
-     UNUSED(buft);
-     return ggml_nbytes(tensor);
+     // See comments in init_tensor.
+     if (ggml_is_quantized(tensor->type) && (tensor->ne[0] % 512 != 0) && (tensor->view_src == nullptr)) {
+         ggml_backend_rpc_buffer_type_context * buft_ctx = (ggml_backend_rpc_buffer_type_context *)buft->context;
+         auto sock = get_socket(buft_ctx->endpoint);
+
+         rpc_msg_get_alloc_size_req request;
+
+         request.tensor = serialize_tensor(tensor);
+
+         rpc_msg_get_alloc_size_rsp response;
+         bool status = send_rpc_cmd(sock, RPC_CMD_GET_ALLOC_SIZE, &request, sizeof(request), &response, sizeof(response));
+         GGML_ASSERT(status);
+
+         return response.alloc_size;
+     } else {
+         return ggml_nbytes(tensor);
+     }
  }

  static ggml_backend_buffer_type_i ggml_backend_rpc_buffer_type_interface = {
@@ -603,7 +631,7 @@ static void ggml_backend_rpc_free(ggml_backend_t backend) {
  }

  static void ggml_backend_rpc_synchronize(ggml_backend_t backend) {
-     UNUSED(backend);
+     GGML_UNUSED(backend);
      // this is no-op because we don't have any async operations
  }

@@ -757,6 +785,8 @@ public:
      bool get_tensor(const rpc_msg_get_tensor_req & request, std::vector<uint8_t> & response);
      bool copy_tensor(const rpc_msg_copy_tensor_req & request, rpc_msg_copy_tensor_rsp & response);
      bool graph_compute(const std::vector<uint8_t> & input, rpc_msg_graph_compute_rsp & response);
+     bool init_tensor(const rpc_msg_init_tensor_req & request);
+     bool get_alloc_size(const rpc_msg_get_alloc_size_req & request, rpc_msg_get_alloc_size_rsp & response);

  private:
      ggml_tensor * deserialize_tensor(struct ggml_context * ctx, const rpc_tensor * tensor);
@@ -770,6 +800,36 @@ private:
      std::unordered_set<ggml_backend_buffer_t> buffers;
  };

+ bool rpc_server::get_alloc_size(const rpc_msg_get_alloc_size_req & request, rpc_msg_get_alloc_size_rsp & response) {
+     ggml_backend_buffer_type_t buft;
+     struct ggml_init_params params {
+         /*.mem_size =*/ ggml_tensor_overhead(),
+         /*.mem_buffer =*/ NULL,
+         /*.no_alloc =*/ true,
+     };
+
+     struct ggml_context * ctx = ggml_init(params);
+     ggml_tensor * tensor = deserialize_tensor(ctx, &request.tensor);
+
+     if (tensor == nullptr) {
+         GGML_LOG_ERROR("Null tensor pointer passed to server get_alloc_size function.\n");
+         ggml_free(ctx);
+         return false;
+     }
+
+     if (tensor->buffer == nullptr) {
+         //No buffer allocated.
+         buft = ggml_backend_get_default_buffer_type(backend);
+     } else {
+         buft = tensor->buffer->buft;
+     }
+
+     response.alloc_size = ggml_backend_buft_get_alloc_size(buft,tensor);
+
+     ggml_free(ctx);
+     return true;
+ }
+
  void rpc_server::alloc_buffer(const rpc_msg_alloc_buffer_req & request, rpc_msg_alloc_buffer_rsp & response) {
      ggml_backend_buffer_type_t buft = ggml_backend_get_default_buffer_type(backend);
      ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, request.size);
@@ -781,7 +841,7 @@ void rpc_server::alloc_buffer(const rpc_msg_alloc_buffer_req & request, rpc_msg_
          GGML_PRINT_DEBUG("[%s] size: %" PRIu64 " -> remote_ptr: %" PRIx64 ", remote_size: %" PRIu64 "\n", __func__, request.size, response.remote_ptr, response.remote_size);
          buffers.insert(buffer);
      } else {
-         GGML_PRINT_DEBUG("[%s] size: %" PRIu64 " -> failed\n", __func__, request.size);
+         GGML_LOG_ERROR("[%s] size: %" PRIu64 " -> failed\n", __func__, request.size);
      }
  }

@@ -803,7 +863,7 @@ bool rpc_server::buffer_get_base(const rpc_msg_buffer_get_base_req & request, rp
      GGML_PRINT_DEBUG("[%s] remote_ptr: %" PRIx64 "\n", __func__, request.remote_ptr);
      ggml_backend_buffer_t buffer = reinterpret_cast<ggml_backend_buffer_t>(request.remote_ptr);
      if (buffers.find(buffer) == buffers.end()) {
-         GGML_PRINT_DEBUG("[%s] buffer not found\n", __func__);
+         GGML_LOG_ERROR("[%s] buffer not found\n", __func__);
          return false;
      }
      void * base = ggml_backend_buffer_get_base(buffer);
@@ -815,7 +875,7 @@ bool rpc_server::free_buffer(const rpc_msg_free_buffer_req & request) {
      GGML_PRINT_DEBUG("[%s] remote_ptr: %" PRIx64 "\n", __func__, request.remote_ptr);
      ggml_backend_buffer_t buffer = reinterpret_cast<ggml_backend_buffer_t>(request.remote_ptr);
      if (buffers.find(buffer) == buffers.end()) {
-         GGML_PRINT_DEBUG("[%s] buffer not found\n", __func__);
+         GGML_LOG_ERROR("[%s] buffer not found\n", __func__);
          return false;
      }
      ggml_backend_buffer_free(buffer);
@@ -827,7 +887,7 @@ bool rpc_server::buffer_clear(const rpc_msg_buffer_clear_req & request) {
      GGML_PRINT_DEBUG("[%s] remote_ptr: %" PRIx64 ", value: %u\n", __func__, request.remote_ptr, request.value);
      ggml_backend_buffer_t buffer = reinterpret_cast<ggml_backend_buffer_t>(request.remote_ptr);
      if (buffers.find(buffer) == buffers.end()) {
-         GGML_PRINT_DEBUG("[%s] buffer not found\n", __func__);
+         GGML_LOG_ERROR("[%s] buffer not found\n", __func__);
          return false;
      }
      ggml_backend_buffer_clear(buffer, request.value);
@@ -883,7 +943,7 @@ bool rpc_server::set_tensor(const std::vector<uint8_t> & input) {
      struct ggml_context * ctx = ggml_init(params);
      ggml_tensor * tensor = deserialize_tensor(ctx, in_tensor);
      if (tensor == nullptr) {
-         GGML_PRINT_DEBUG("[%s] error deserializing tensor\n", __func__);
+         GGML_LOG_ERROR("[%s] error deserializing tensor\n", __func__);
          ggml_free(ctx);
          return false;
      }
@@ -905,6 +965,40 @@
      return true;
  }

+ bool rpc_server::init_tensor(const rpc_msg_init_tensor_req & request) {
+     struct ggml_init_params params {
+         /*.mem_size =*/ ggml_tensor_overhead(),
+         /*.mem_buffer =*/ NULL,
+         /*.no_alloc =*/ true,
+     };
+     struct ggml_context * ctx = ggml_init(params);
+     ggml_tensor * tensor = deserialize_tensor(ctx, &request.tensor);
+     if (tensor == nullptr) {
+         GGML_LOG_ERROR("Null tensor pointer passed to server init_tensor function.\n");
+         ggml_free(ctx);
+         return false;
+     }
+
+     // Call the backend's buffer_init_tensor function
+     ggml_backend_buffer_t buffer = tensor->buffer;
+     if (buffer && buffer->iface.init_tensor) {
+         buffer->iface.init_tensor(buffer, tensor);
+     } else {
+         GGML_LOG_ERROR("Null buffer for tensor passed to init_tensor function\n");
+     }
+
+     if (tensor->extra != nullptr) {
+         // This pointer can either be passed around client/server, or probably better stored server-side and kept track of.
+         // Currently unimplemented.
+         GGML_LOG_ERROR("tensor->extra populated by the backend, this is currently unsupported.\n");
+         ggml_free(ctx);
+         return false;
+     }
+
+     ggml_free(ctx);
+     return true;
+ }
+
  bool rpc_server::get_tensor(const rpc_msg_get_tensor_req & request, std::vector<uint8_t> & response) {
      struct ggml_init_params params {
          /*.mem_size =*/ ggml_tensor_overhead(),
@@ -914,7 +1008,7 @@ bool rpc_server::get_tensor(const rpc_msg_get_tensor_req & request, std::vector<
      struct ggml_context * ctx = ggml_init(params);
      ggml_tensor * tensor = deserialize_tensor(ctx, &request.tensor);
      if (tensor == nullptr) {
-         GGML_PRINT_DEBUG("[%s] error deserializing tensor\n", __func__);
+         GGML_LOG_ERROR("[%s] error deserializing tensor\n", __func__);
          ggml_free(ctx);
          return false;
      }
@@ -948,7 +1042,7 @@ bool rpc_server::copy_tensor(const rpc_msg_copy_tensor_req & request, rpc_msg_co
      ggml_tensor * src = deserialize_tensor(ctx, &request.src);
      ggml_tensor * dst = deserialize_tensor(ctx, &request.dst);
      if (src == nullptr || dst == nullptr) {
-         GGML_PRINT_DEBUG("[%s] error deserializing tensors\n", __func__);
+         GGML_LOG_ERROR("[%s] error deserializing tensors\n", __func__);
          ggml_free(ctx);
          return false;
      }
@@ -1058,6 +1152,18 @@ static void rpc_serve_client(ggml_backend_t backend, sockfd_t sockfd, size_t fre
          }
          break;
      }
+     case RPC_CMD_GET_ALLOC_SIZE: {
+         rpc_msg_get_alloc_size_req request;
+         if (!recv_msg(sockfd, &request, sizeof(request))) {
+             return;
+         }
+         rpc_msg_get_alloc_size_rsp response;
+         server.get_alloc_size(request, response);
+         if (!send_msg(sockfd, &response, sizeof(response))) {
+             return;
+         }
+         break;
+     }
      case RPC_CMD_GET_ALIGNMENT: {
          if (!recv_msg(sockfd, nullptr, 0)) {
              return;
@@ -1133,6 +1239,19 @@ static void rpc_serve_client(ggml_backend_t backend, sockfd_t sockfd, size_t fre
          }
          break;
      }
+     case RPC_CMD_INIT_TENSOR: {
+         rpc_msg_init_tensor_req request;
+         if (!recv_msg(sockfd, &request,sizeof(request))) {
+             return;
+         }
+         if (!server.init_tensor(request)) {
+             return;
+         }
+         if (!send_msg(sockfd, nullptr, 0)) {
+             return;
+         }
+         break;
+     }
      case RPC_CMD_GET_TENSOR: {
          rpc_msg_get_tensor_req request;
          if (!recv_msg(sockfd, &request, sizeof(request))) {
@@ -1257,14 +1376,14 @@ static void ggml_backend_rpc_device_get_memory(ggml_backend_dev_t dev, size_t *

      ggml_backend_rpc_get_device_memory(ctx->endpoint.c_str(), free, total);

-     UNUSED(dev);
+     GGML_UNUSED(dev);
  }

  static enum ggml_backend_dev_type ggml_backend_rpc_device_get_type(ggml_backend_dev_t dev) {
      // TODO: obtain value from the server
      return GGML_BACKEND_DEVICE_TYPE_GPU;

-     UNUSED(dev);
+     GGML_UNUSED(dev);
  }

  static void ggml_backend_rpc_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) {
@@ -1285,7 +1404,7 @@ static ggml_backend_t ggml_backend_rpc_device_init(ggml_backend_dev_t dev, const

      return ggml_backend_rpc_init(ctx->endpoint.c_str());

-     UNUSED(params);
+     GGML_UNUSED(params);
  }

  static ggml_backend_buffer_type_t ggml_backend_rpc_device_get_buffer_type(ggml_backend_dev_t dev) {
@@ -1293,12 +1412,12 @@ static ggml_backend_buffer_type_t ggml_backend_rpc_device_get_buffer_type(ggml_b

      return ggml_backend_rpc_buffer_type(ctx->endpoint.c_str());

-     UNUSED(dev);
+     GGML_UNUSED(dev);
  }

  static bool ggml_backend_rpc_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) {
-     UNUSED(dev);
-     UNUSED(op);
+     GGML_UNUSED(dev);
+     GGML_UNUSED(op);
      //TODO: call the remote backend and cache the results
      return true;
  }
@@ -1335,20 +1454,20 @@ static const struct ggml_backend_device_i ggml_backend_rpc_device_i = {
  static const char * ggml_backend_rpc_reg_get_name(ggml_backend_reg_t reg) {
      return "RPC";

-     UNUSED(reg);
+     GGML_UNUSED(reg);
  }

  static size_t ggml_backend_rpc_reg_get_device_count(ggml_backend_reg_t reg) {
      return 0;

-     UNUSED(reg);
+     GGML_UNUSED(reg);
  }

  static ggml_backend_dev_t ggml_backend_rpc_reg_get_device(ggml_backend_reg_t reg, size_t index) {
      GGML_ABORT("The RPC backend does not have enumerated devices - use ggml_backend_add_device instead");

-     UNUSED(reg);
-     UNUSED(index);
+     GGML_UNUSED(reg);
+     GGML_UNUSED(index);
  }

  static void * ggml_backend_rpc_get_proc_address(ggml_backend_reg_t reg, const char * name) {
@@ -1357,7 +1476,7 @@ static void * ggml_backend_rpc_get_proc_address(ggml_backend_reg_t reg, const ch
      }
      return NULL;

-     UNUSED(reg);
+     GGML_UNUSED(reg);
  }

  static const struct ggml_backend_reg_i ggml_backend_rpc_reg_i = {
package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp
@@ -29,5 +29,6 @@
  #include "wkv6.hpp"
  #include "outprod.hpp"
  #include "element_wise.hpp"
+ #include "gla.hpp"

  #endif // GGML_SYCL_BACKEND_HPP
package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp
@@ -11,6 +11,8 @@
  //

  #include "common.hpp"
+
+ #include "ggml-backend-impl.h"
  #include "ggml-impl.h"

  int get_current_device_id() {
@@ -49,6 +51,10 @@ void ggml_sycl_host_free(void* ptr) try {
      std::exit(1);
  }

+ bool gpu_has_xmx(sycl::device &dev) {
+     return dev.has(sycl::aspect::ext_intel_matrix);
+ }
+
  int64_t downsample_sycl_global_range(int64_t accumulate_block_num, int64_t block_size) {
      const int64_t max_range = std::numeric_limits<int>::max();
      int64_t sycl_down_blk_size = block_size;
@@ -65,9 +71,9 @@ void ggml_sycl_op_flatten(ggml_backend_sycl_context & ctx, const ggml_tensor *sr
                            const ggml_sycl_op_flatten_t op) try {

      const bool use_src1 = src1 != nullptr;
-
-     GGML_ASSERT(!use_src1 || src1->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
-     GGML_ASSERT( dst->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
+     if(use_src1)
+         GGML_ASSERT(strcmp(src1->buffer->buft->iface.get_name(src1->buffer->buft), GGML_SYCL_NAME "_Split") != 0);
+     GGML_ASSERT(strcmp(dst->buffer->buft->iface.get_name(dst->buffer->buft), GGML_SYCL_NAME "_Split") != 0);

      // dd = data device
      float * src0_ddf = (float *) src0->data;
package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp
@@ -26,7 +26,11 @@

  #define GGML_COMMON_DECL_SYCL
  #define GGML_COMMON_IMPL_SYCL
+ /* suppress warning spam */
+ #pragma clang diagnostic push
+ #pragma clang diagnostic ignored "-Wnested-anon-types"
  #include "ggml-common.h"
+ #pragma clang diagnostic pop

  void* ggml_sycl_host_malloc(size_t size);
  void ggml_sycl_host_free(void* ptr);
@@ -329,8 +333,12 @@ struct ggml_backend_sycl_context {
      // pool
      std::unique_ptr<ggml_sycl_pool> pools[GGML_SYCL_MAX_DEVICES];

+     std::unique_ptr<ggml_sycl_pool> host_pools[GGML_SYCL_MAX_DEVICES];
+
      static std::unique_ptr<ggml_sycl_pool> new_pool_for_device(queue_ptr qptr, int device);

+     static std::unique_ptr<ggml_sycl_pool> new_pool_for_host(queue_ptr qptr, int device);
+
      ggml_sycl_pool & pool(int device) {
          if (pools[device] == nullptr) {
              pools[device] = new_pool_for_device(stream(device,0), device);
@@ -341,6 +349,15 @@
      ggml_sycl_pool & pool() {
          return pool(device);
      }
+
+     ggml_sycl_pool & host_pool(int device) {
+         if (host_pools[device] == nullptr) {
+             host_pools[device] = new_pool_for_host(stream(device, 0), device);
+         }
+         return *host_pools[device];
+     }
+
+     ggml_sycl_pool & host_pool() { return host_pool(device); }
  };

  // common device functions
@@ -658,6 +675,7 @@ inline void ggml_sycl_op_bin_bcast(ggml_backend_sycl_context & ctx, const ggml_t
      }
  }

+ bool gpu_has_xmx(sycl::device &dev);

  void ggml_sycl_op_flatten(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
                            const ggml_tensor *src1, ggml_tensor *dst,
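The context above now carries a second, lazily created pool for host allocations, mirroring the existing device-pool accessor. A sketch of the accessor pattern (names other than pool() and host_pool() are illustrative):

    // Sketch only: demonstrates the lazily-initialized accessors declared above.
    static void touch_pools(ggml_backend_sycl_context & ctx) {
        ggml_sycl_pool & device_pool = ctx.pool();        // existing per-device pool
        ggml_sycl_pool & host_pool   = ctx.host_pool();   // created on first use via new_pool_for_host()
        (void) device_pool;
        (void) host_pool;
    }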
package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp
@@ -158,8 +158,9 @@ static void concat_f32_sycl_non_cont(
      });
  }

- void ggml_sycl_op_concat(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
-                          const ggml_tensor *src1, ggml_tensor *dst) {
+ void ggml_sycl_op_concat(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
+     const ggml_tensor *src0 = dst->src[0];
+     const ggml_tensor *src1 = dst->src[1];
      queue_ptr stream = ctx.stream();

      const int32_t dim = ((int32_t *)dst->op_params)[0];
package/src/llama.cpp/ggml/src/ggml-sycl/concat.hpp
@@ -15,7 +15,6 @@

  #include "common.hpp"

- void ggml_sycl_op_concat(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
-                          const ggml_tensor *src1, ggml_tensor *dst);
+ void ggml_sycl_op_concat(ggml_backend_sycl_context & ctx, ggml_tensor *dst);

  #endif // GGML_SYCL_CONCAT_HPP
package/src/llama.cpp/ggml/src/ggml-sycl/conv.cpp
@@ -71,8 +71,9 @@ static void conv_transpose_1d_f32_f32_sycl(
      });
  }

- void ggml_sycl_op_conv_transpose_1d(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
-                                     const ggml_tensor *src1, ggml_tensor *dst) {
+ void ggml_sycl_op_conv_transpose_1d(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
+     const ggml_tensor *src0 = dst->src[0];
+     const ggml_tensor *src1 = dst->src[1];
      const float * src0_d = (const float *)src0->data;
      const float * src1_d = (const float *)src1->data;

package/src/llama.cpp/ggml/src/ggml-sycl/conv.hpp
@@ -15,7 +15,6 @@

  #include "common.hpp"

- void ggml_sycl_op_conv_transpose_1d(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
-                                     const ggml_tensor *src1, ggml_tensor *dst);
+ void ggml_sycl_op_conv_transpose_1d(ggml_backend_sycl_context & ctx, ggml_tensor *dst);

  #endif // GGML_SYCL_CONV_HPP
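Across the four SYCL hunks above, concat and conv-transpose now receive only the destination tensor and recover their inputs from dst->src[], so call sites shrink to a single argument. A sketch of what a dispatcher call reduces to under the new signatures (the wrapper name is illustrative; the convention itself is what the diffs show):

    // Sketch only: the ops read their inputs from the destination's source list.
    static void run_concat(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
        // dst->src[0] and dst->src[1] are the tensors that used to be passed explicitly
        ggml_sycl_op_concat(ctx, dst);
    }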