@fugood/llama.node 0.3.6 → 0.3.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +17 -2
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-cuda/arm64/llama-node.node +0 -0
- package/bin/linux-cuda/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/lib/binding.ts +3 -1
- package/lib/index.js +16 -1
- package/lib/index.ts +16 -0
- package/package.json +1 -1
- package/src/EmbeddingWorker.cpp +4 -3
- package/src/LlamaCompletionWorker.cpp +4 -2
- package/src/LlamaContext.cpp +61 -6
- package/src/LlamaContext.h +1 -0
- package/src/common.hpp +6 -11
- package/src/llama.cpp/.github/workflows/build.yml +19 -17
- package/src/llama.cpp/.github/workflows/docker.yml +77 -30
- package/src/llama.cpp/.github/workflows/editorconfig.yml +3 -1
- package/src/llama.cpp/.github/workflows/server.yml +22 -3
- package/src/llama.cpp/CMakeLists.txt +49 -24
- package/src/llama.cpp/common/arg.cpp +82 -26
- package/src/llama.cpp/common/arg.h +3 -0
- package/src/llama.cpp/common/common.cpp +192 -72
- package/src/llama.cpp/common/common.h +51 -18
- package/src/llama.cpp/common/ngram-cache.cpp +12 -12
- package/src/llama.cpp/common/ngram-cache.h +2 -2
- package/src/llama.cpp/common/sampling.cpp +11 -6
- package/src/llama.cpp/common/speculative.cpp +18 -15
- package/src/llama.cpp/docs/build.md +2 -0
- package/src/llama.cpp/examples/batched/batched.cpp +9 -7
- package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +3 -3
- package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +10 -8
- package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +11 -8
- package/src/llama.cpp/examples/cvector-generator/mean.hpp +1 -1
- package/src/llama.cpp/examples/cvector-generator/pca.hpp +1 -1
- package/src/llama.cpp/examples/embedding/embedding.cpp +8 -7
- package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +7 -6
- package/src/llama.cpp/examples/export-lora/export-lora.cpp +8 -7
- package/src/llama.cpp/examples/gguf/gguf.cpp +10 -6
- package/src/llama.cpp/examples/gguf-hash/gguf-hash.cpp +1 -0
- package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +8 -7
- package/src/llama.cpp/examples/gritlm/gritlm.cpp +13 -10
- package/src/llama.cpp/examples/imatrix/imatrix.cpp +13 -12
- package/src/llama.cpp/examples/infill/infill.cpp +23 -24
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +44 -13
- package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +11 -6
- package/src/llama.cpp/examples/llava/clip.cpp +4 -2
- package/src/llama.cpp/examples/llava/llava-cli.cpp +9 -6
- package/src/llama.cpp/examples/llava/llava.cpp +2 -2
- package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +8 -4
- package/src/llama.cpp/examples/llava/qwen2vl-cli.cpp +11 -8
- package/src/llama.cpp/examples/lookahead/lookahead.cpp +6 -7
- package/src/llama.cpp/examples/lookup/lookup-create.cpp +4 -9
- package/src/llama.cpp/examples/lookup/lookup-stats.cpp +3 -7
- package/src/llama.cpp/examples/lookup/lookup.cpp +5 -6
- package/src/llama.cpp/examples/main/main.cpp +51 -29
- package/src/llama.cpp/examples/parallel/parallel.cpp +5 -6
- package/src/llama.cpp/examples/passkey/passkey.cpp +7 -5
- package/src/llama.cpp/examples/perplexity/perplexity.cpp +37 -23
- package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +12 -14
- package/src/llama.cpp/examples/retrieval/retrieval.cpp +8 -8
- package/src/llama.cpp/examples/rpc/rpc-server.cpp +12 -0
- package/src/llama.cpp/examples/run/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.cpp +1351 -0
- package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.h +114 -0
- package/src/llama.cpp/examples/run/run.cpp +175 -61
- package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +4 -25
- package/src/llama.cpp/examples/server/CMakeLists.txt +1 -0
- package/src/llama.cpp/examples/server/httplib.h +1295 -409
- package/src/llama.cpp/examples/server/server.cpp +387 -181
- package/src/llama.cpp/examples/server/tests/requirements.txt +1 -0
- package/src/llama.cpp/examples/server/utils.hpp +170 -58
- package/src/llama.cpp/examples/simple/simple.cpp +9 -8
- package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +16 -12
- package/src/llama.cpp/examples/speculative/speculative.cpp +22 -23
- package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +8 -12
- package/src/llama.cpp/examples/tokenize/tokenize.cpp +17 -5
- package/src/llama.cpp/examples/tts/tts.cpp +64 -23
- package/src/llama.cpp/ggml/CMakeLists.txt +5 -21
- package/src/llama.cpp/ggml/include/ggml-backend.h +2 -0
- package/src/llama.cpp/ggml/include/ggml-cpp.h +1 -0
- package/src/llama.cpp/ggml/include/ggml.h +36 -145
- package/src/llama.cpp/ggml/include/gguf.h +202 -0
- package/src/llama.cpp/ggml/src/CMakeLists.txt +6 -3
- package/src/llama.cpp/ggml/src/ggml-alloc.c +5 -0
- package/src/llama.cpp/ggml/src/ggml-backend-impl.h +0 -1
- package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +79 -49
- package/src/llama.cpp/ggml/src/ggml-backend.cpp +5 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +33 -23
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +57 -72
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +87 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +335 -66
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +10 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +1090 -378
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.h +2 -2
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/cuda.h +1 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +3 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +3 -0
- package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +3 -1
- package/src/llama.cpp/ggml/src/ggml-impl.h +11 -16
- package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +16 -0
- package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +6 -6
- package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +154 -35
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +1 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +9 -3
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +18 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +3 -2
- package/src/llama.cpp/ggml/src/ggml-sycl/concat.hpp +1 -2
- package/src/llama.cpp/ggml/src/ggml-sycl/conv.cpp +3 -2
- package/src/llama.cpp/ggml/src/ggml-sycl/conv.hpp +1 -2
- package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +40 -95
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +48 -48
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +24 -24
- package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +238 -164
- package/src/llama.cpp/ggml/src/ggml-sycl/gla.cpp +105 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/gla.hpp +8 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +3 -3
- package/src/llama.cpp/ggml/src/ggml-sycl/outprod.hpp +1 -2
- package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +3 -2
- package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.hpp +1 -2
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +7 -5
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.hpp +1 -2
- package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +74 -4
- package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +314 -116
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +4 -2
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +9 -3
- package/src/llama.cpp/ggml/src/ggml.c +117 -1327
- package/src/llama.cpp/ggml/src/gguf.cpp +1329 -0
- package/src/llama.cpp/include/llama-cpp.h +6 -1
- package/src/llama.cpp/include/llama.h +138 -75
- package/src/llama.cpp/src/CMakeLists.txt +13 -1
- package/src/llama.cpp/src/llama-adapter.cpp +347 -0
- package/src/llama.cpp/src/llama-adapter.h +74 -0
- package/src/llama.cpp/src/llama-arch.cpp +1487 -0
- package/src/llama.cpp/src/llama-arch.h +400 -0
- package/src/llama.cpp/src/llama-batch.cpp +368 -0
- package/src/llama.cpp/src/llama-batch.h +88 -0
- package/src/llama.cpp/src/llama-chat.cpp +578 -0
- package/src/llama.cpp/src/llama-chat.h +52 -0
- package/src/llama.cpp/src/llama-context.cpp +1775 -0
- package/src/llama.cpp/src/llama-context.h +128 -0
- package/src/llama.cpp/src/llama-cparams.cpp +1 -0
- package/src/llama.cpp/src/llama-cparams.h +37 -0
- package/src/llama.cpp/src/llama-grammar.cpp +5 -4
- package/src/llama.cpp/src/llama-grammar.h +3 -1
- package/src/llama.cpp/src/llama-hparams.cpp +71 -0
- package/src/llama.cpp/src/llama-hparams.h +139 -0
- package/src/llama.cpp/src/llama-impl.cpp +167 -0
- package/src/llama.cpp/src/llama-impl.h +16 -136
- package/src/llama.cpp/src/llama-kv-cache.cpp +718 -0
- package/src/llama.cpp/src/llama-kv-cache.h +218 -0
- package/src/llama.cpp/src/llama-mmap.cpp +589 -0
- package/src/llama.cpp/src/llama-mmap.h +67 -0
- package/src/llama.cpp/src/llama-model-loader.cpp +1124 -0
- package/src/llama.cpp/src/llama-model-loader.h +167 -0
- package/src/llama.cpp/src/llama-model.cpp +3953 -0
- package/src/llama.cpp/src/llama-model.h +370 -0
- package/src/llama.cpp/src/llama-quant.cpp +934 -0
- package/src/llama.cpp/src/llama-quant.h +1 -0
- package/src/llama.cpp/src/llama-sampling.cpp +147 -32
- package/src/llama.cpp/src/llama-sampling.h +3 -19
- package/src/llama.cpp/src/llama-vocab.cpp +1832 -575
- package/src/llama.cpp/src/llama-vocab.h +97 -142
- package/src/llama.cpp/src/llama.cpp +7160 -20314
- package/src/llama.cpp/src/unicode.cpp +8 -3
- package/src/llama.cpp/tests/CMakeLists.txt +2 -0
- package/src/llama.cpp/tests/test-autorelease.cpp +3 -3
- package/src/llama.cpp/tests/test-backend-ops.cpp +370 -59
- package/src/llama.cpp/tests/test-chat-template.cpp +162 -125
- package/src/llama.cpp/tests/test-gguf.cpp +222 -187
- package/src/llama.cpp/tests/test-model-load-cancel.cpp +1 -1
- package/src/llama.cpp/tests/test-sampling.cpp +0 -1
- package/src/llama.cpp/tests/test-tokenizer-0.cpp +4 -4
- package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +9 -7
- package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +8 -6
|
@@ -5,8 +5,8 @@
|
|
|
5
5
|
extern "C" {
|
|
6
6
|
#endif
|
|
7
7
|
|
|
8
|
-
bool llamafile_sgemm(
|
|
9
|
-
const void *, int64_t, void *, int64_t,
|
|
8
|
+
bool llamafile_sgemm(const struct ggml_compute_params * params, int64_t, int64_t, int64_t,
|
|
9
|
+
const void *, int64_t, const void *, int64_t, void *, int64_t,
|
|
10
10
|
int, int, int);
|
|
11
11
|
|
|
12
12
|
#ifdef __cplusplus
|
|
@@ -3,6 +3,7 @@
|
|
|
3
3
|
#include <hip/hip_runtime.h>
|
|
4
4
|
#include <hipblas/hipblas.h>
|
|
5
5
|
#include <hip/hip_fp16.h>
|
|
6
|
+
#include <hip/hip_bfloat16.h>
|
|
6
7
|
#ifdef __HIP_PLATFORM_AMD__
|
|
7
8
|
// for rocblas_initialize()
|
|
8
9
|
#include "rocblas/rocblas.h"
|
|
@@ -121,6 +122,8 @@
|
|
|
121
122
|
#define __has_builtin(x) 0
|
|
122
123
|
#endif
|
|
123
124
|
|
|
125
|
+
typedef hip_bfloat16 nv_bfloat16;
|
|
126
|
+
|
|
124
127
|
typedef int8_t int8x4_t __attribute__((ext_vector_type(4)));
|
|
125
128
|
typedef uint8_t uint8x4_t __attribute__((ext_vector_type(4)));
|
|
126
129
|
static __device__ __forceinline__ int __vsubss4(const int a, const int b) {
|
|
@@ -3,6 +3,7 @@
|
|
|
3
3
|
#include <musa_runtime.h>
|
|
4
4
|
#include <musa.h>
|
|
5
5
|
#include <mublas.h>
|
|
6
|
+
#include <musa_bf16.h>
|
|
6
7
|
#include <musa_fp16.h>
|
|
7
8
|
#define CUBLAS_COMPUTE_16F CUDA_R_16F
|
|
8
9
|
#define CUBLAS_COMPUTE_32F CUDA_R_32F
|
|
@@ -132,3 +133,5 @@
|
|
|
132
133
|
#define cudaKernelNodeParams musaKernelNodeParams
|
|
133
134
|
#define cudaStreamCaptureModeRelaxed musaStreamCaptureModeRelaxed
|
|
134
135
|
#define cudaStreamEndCapture musaStreamEndCapture
|
|
136
|
+
|
|
137
|
+
typedef mt_bfloat16 nv_bfloat16;
|
|
@@ -70,7 +70,9 @@ ggml_add_backend_library(ggml-hip
|
|
|
70
70
|
)
|
|
71
71
|
|
|
72
72
|
# TODO: do not use CUDA definitions for HIP
|
|
73
|
-
|
|
73
|
+
if (NOT GGML_BACKEND_DL)
|
|
74
|
+
target_compile_definitions(ggml PUBLIC GGML_USE_CUDA)
|
|
75
|
+
endif()
|
|
74
76
|
|
|
75
77
|
add_compile_definitions(GGML_USE_HIP)
|
|
76
78
|
|
|
@@ -3,6 +3,8 @@
|
|
|
3
3
|
// GGML internal header
|
|
4
4
|
|
|
5
5
|
#include "ggml.h"
|
|
6
|
+
#include "gguf.h"
|
|
7
|
+
|
|
6
8
|
#include <assert.h>
|
|
7
9
|
#include <math.h>
|
|
8
10
|
#include <stdlib.h> // load `stdlib.h` before other headers to work around MinGW bug: https://sourceforge.net/p/mingw-w64/bugs/192/
|
|
@@ -551,22 +553,15 @@ static inline ggml_bf16_t ggml_compute_fp32_to_bf16(float s) {
|
|
|
551
553
|
#define GGML_FP32_TO_BF16(x) ggml_compute_fp32_to_bf16(x)
|
|
552
554
|
#define GGML_BF16_TO_FP32(x) ggml_compute_bf16_to_fp32(x)
|
|
553
555
|
|
|
554
|
-
// expose GGUF internals for test code
|
|
555
|
-
|
|
556
|
-
GGML_API size_t gguf_type_size(enum gguf_type type);
|
|
557
|
-
|
|
558
|
-
GGML_API struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_params params);
|
|
559
|
-
|
|
560
|
-
struct gguf_buf {
|
|
561
|
-
void * data;
|
|
562
|
-
size_t size;
|
|
563
|
-
size_t offset;
|
|
564
|
-
};
|
|
565
|
-
GGML_API struct gguf_buf gguf_buf_init(size_t size);
|
|
566
|
-
GGML_API void gguf_buf_free(struct gguf_buf buf);
|
|
567
|
-
|
|
568
|
-
GGML_API void gguf_write_to_buf(const struct gguf_context * ctx, struct gguf_buf * buf, bool only_meta);
|
|
569
|
-
|
|
570
556
|
#ifdef __cplusplus
|
|
571
557
|
}
|
|
572
558
|
#endif
|
|
559
|
+
|
|
560
|
+
#ifdef __cplusplus
|
|
561
|
+
#include <vector>
|
|
562
|
+
|
|
563
|
+
// expose GGUF internals for test code
|
|
564
|
+
GGML_API size_t gguf_type_size(enum gguf_type type);
|
|
565
|
+
GGML_API struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_params params);
|
|
566
|
+
GGML_API void gguf_write_to_buf(const struct gguf_context * ctx, std::vector<int8_t> & buf, bool only_meta);
|
|
567
|
+
#endif // __cplusplus
|
|
@@ -103,3 +103,19 @@ else()
|
|
|
103
103
|
DEPENDS ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib
|
|
104
104
|
)
|
|
105
105
|
endif() # GGML_METAL_EMBED_LIBRARY
|
|
106
|
+
|
|
107
|
+
if (NOT GGML_METAL_EMBED_LIBRARY)
|
|
108
|
+
install(
|
|
109
|
+
FILES src/ggml-metal/ggml-metal.metal
|
|
110
|
+
PERMISSIONS
|
|
111
|
+
OWNER_READ
|
|
112
|
+
OWNER_WRITE
|
|
113
|
+
GROUP_READ
|
|
114
|
+
WORLD_READ
|
|
115
|
+
DESTINATION ${CMAKE_INSTALL_BINDIR})
|
|
116
|
+
|
|
117
|
+
install(
|
|
118
|
+
FILES ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib
|
|
119
|
+
DESTINATION ${CMAKE_INSTALL_BINDIR}
|
|
120
|
+
)
|
|
121
|
+
endif()
|
|
@@ -2744,13 +2744,13 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
|
|
|
2744
2744
|
cl_image_format img_fmt_1d;
|
|
2745
2745
|
cl_image_desc img_desc_1d;
|
|
2746
2746
|
cl_buffer_region region;
|
|
2747
|
-
cl_mem A_image1d;
|
|
2748
|
-
cl_mem B_image1d;
|
|
2749
|
-
cl_mem B_sub_buffer;
|
|
2750
|
-
cl_mem C_d;
|
|
2747
|
+
cl_mem A_image1d = nullptr;
|
|
2748
|
+
cl_mem B_image1d = nullptr;
|
|
2749
|
+
cl_mem B_sub_buffer = nullptr;
|
|
2750
|
+
cl_mem C_d = nullptr;
|
|
2751
2751
|
// for B transpose
|
|
2752
|
-
cl_mem B_d;
|
|
2753
|
-
cl_mem B_d_input_image;
|
|
2752
|
+
cl_mem B_d = nullptr;
|
|
2753
|
+
cl_mem B_d_input_image = nullptr;
|
|
2754
2754
|
// <--------------------------------------------> //
|
|
2755
2755
|
|
|
2756
2756
|
// define matrix dimensions
|
|
@@ -27,15 +27,6 @@
|
|
|
27
27
|
#endif
|
|
28
28
|
#include <cstring>
|
|
29
29
|
|
|
30
|
-
#define UNUSED GGML_UNUSED
|
|
31
|
-
|
|
32
|
-
#define GGML_DEBUG 0
|
|
33
|
-
#if (GGML_DEBUG >= 1)
|
|
34
|
-
#define GGML_PRINT_DEBUG(...) printf(__VA_ARGS__)
|
|
35
|
-
#else
|
|
36
|
-
#define GGML_PRINT_DEBUG(...)
|
|
37
|
-
#endif
|
|
38
|
-
|
|
39
30
|
#ifdef _WIN32
|
|
40
31
|
typedef SOCKET sockfd_t;
|
|
41
32
|
using ssize_t = __int64;
|
|
@@ -93,9 +84,23 @@ enum rpc_cmd {
|
|
|
93
84
|
RPC_CMD_COPY_TENSOR,
|
|
94
85
|
RPC_CMD_GRAPH_COMPUTE,
|
|
95
86
|
RPC_CMD_GET_DEVICE_MEMORY,
|
|
87
|
+
RPC_CMD_INIT_TENSOR,
|
|
88
|
+
RPC_CMD_GET_ALLOC_SIZE,
|
|
96
89
|
RPC_CMD_COUNT,
|
|
97
90
|
};
|
|
98
91
|
|
|
92
|
+
struct rpc_msg_get_alloc_size_req {
|
|
93
|
+
rpc_tensor tensor;
|
|
94
|
+
};
|
|
95
|
+
|
|
96
|
+
struct rpc_msg_get_alloc_size_rsp {
|
|
97
|
+
uint64_t alloc_size;
|
|
98
|
+
};
|
|
99
|
+
|
|
100
|
+
struct rpc_msg_init_tensor_req {
|
|
101
|
+
rpc_tensor tensor;
|
|
102
|
+
};
|
|
103
|
+
|
|
99
104
|
struct rpc_msg_alloc_buffer_req {
|
|
100
105
|
uint64_t size;
|
|
101
106
|
};
|
|
@@ -397,7 +402,7 @@ static std::shared_ptr<socket_t> get_socket(const std::string & endpoint) {
|
|
|
397
402
|
initialized = true;
|
|
398
403
|
}
|
|
399
404
|
#else
|
|
400
|
-
|
|
405
|
+
GGML_UNUSED(initialized);
|
|
401
406
|
#endif
|
|
402
407
|
auto sock = socket_connect(host.c_str(), port);
|
|
403
408
|
if (sock == nullptr) {
|
|
@@ -461,10 +466,18 @@ static rpc_tensor serialize_tensor(const ggml_tensor * tensor) {
|
|
|
461
466
|
}
|
|
462
467
|
|
|
463
468
|
static void ggml_backend_rpc_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
|
|
464
|
-
|
|
465
|
-
|
|
466
|
-
|
|
467
|
-
|
|
469
|
+
ggml_backend_rpc_buffer_context * ctx = (ggml_backend_rpc_buffer_context *)buffer->context;
|
|
470
|
+
|
|
471
|
+
// CUDA backend on the server pads everything to 512 due to CUDA limitations.
|
|
472
|
+
// Due to bandwidth constraints, we only call the server init tensor functions if necessary.
|
|
473
|
+
// In particular, only quantized tensors need padding
|
|
474
|
+
if (ggml_is_quantized(tensor->type) && (tensor->ne[0] % 512 != 0) && (tensor->view_src == nullptr)) {
|
|
475
|
+
rpc_msg_init_tensor_req request;
|
|
476
|
+
|
|
477
|
+
request.tensor = serialize_tensor(tensor);
|
|
478
|
+
|
|
479
|
+
bool status = send_rpc_cmd(ctx->sock, RPC_CMD_INIT_TENSOR, &request, sizeof(request), nullptr, 0);
|
|
480
|
+
GGML_ASSERT(status);
|
|
468
481
|
}
|
|
469
482
|
}
|
|
470
483
|
|
|
@@ -577,8 +590,23 @@ static size_t ggml_backend_rpc_get_max_size(ggml_backend_buffer_type_t buft) {
|
|
|
577
590
|
}
|
|
578
591
|
|
|
579
592
|
static size_t ggml_backend_rpc_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
|
|
580
|
-
|
|
581
|
-
|
|
593
|
+
// See comments in init_tensor.
|
|
594
|
+
if (ggml_is_quantized(tensor->type) && (tensor->ne[0] % 512 != 0) && (tensor->view_src == nullptr)) {
|
|
595
|
+
ggml_backend_rpc_buffer_type_context * buft_ctx = (ggml_backend_rpc_buffer_type_context *)buft->context;
|
|
596
|
+
auto sock = get_socket(buft_ctx->endpoint);
|
|
597
|
+
|
|
598
|
+
rpc_msg_get_alloc_size_req request;
|
|
599
|
+
|
|
600
|
+
request.tensor = serialize_tensor(tensor);
|
|
601
|
+
|
|
602
|
+
rpc_msg_get_alloc_size_rsp response;
|
|
603
|
+
bool status = send_rpc_cmd(sock, RPC_CMD_GET_ALLOC_SIZE, &request, sizeof(request), &response, sizeof(response));
|
|
604
|
+
GGML_ASSERT(status);
|
|
605
|
+
|
|
606
|
+
return response.alloc_size;
|
|
607
|
+
} else {
|
|
608
|
+
return ggml_nbytes(tensor);
|
|
609
|
+
}
|
|
582
610
|
}
|
|
583
611
|
|
|
584
612
|
static ggml_backend_buffer_type_i ggml_backend_rpc_buffer_type_interface = {
|
|
@@ -603,7 +631,7 @@ static void ggml_backend_rpc_free(ggml_backend_t backend) {
|
|
|
603
631
|
}
|
|
604
632
|
|
|
605
633
|
static void ggml_backend_rpc_synchronize(ggml_backend_t backend) {
|
|
606
|
-
|
|
634
|
+
GGML_UNUSED(backend);
|
|
607
635
|
// this is no-op because we don't have any async operations
|
|
608
636
|
}
|
|
609
637
|
|
|
@@ -757,6 +785,8 @@ public:
|
|
|
757
785
|
bool get_tensor(const rpc_msg_get_tensor_req & request, std::vector<uint8_t> & response);
|
|
758
786
|
bool copy_tensor(const rpc_msg_copy_tensor_req & request, rpc_msg_copy_tensor_rsp & response);
|
|
759
787
|
bool graph_compute(const std::vector<uint8_t> & input, rpc_msg_graph_compute_rsp & response);
|
|
788
|
+
bool init_tensor(const rpc_msg_init_tensor_req & request);
|
|
789
|
+
bool get_alloc_size(const rpc_msg_get_alloc_size_req & request, rpc_msg_get_alloc_size_rsp & response);
|
|
760
790
|
|
|
761
791
|
private:
|
|
762
792
|
ggml_tensor * deserialize_tensor(struct ggml_context * ctx, const rpc_tensor * tensor);
|
|
@@ -770,6 +800,36 @@ private:
|
|
|
770
800
|
std::unordered_set<ggml_backend_buffer_t> buffers;
|
|
771
801
|
};
|
|
772
802
|
|
|
803
|
+
bool rpc_server::get_alloc_size(const rpc_msg_get_alloc_size_req & request, rpc_msg_get_alloc_size_rsp & response) {
|
|
804
|
+
ggml_backend_buffer_type_t buft;
|
|
805
|
+
struct ggml_init_params params {
|
|
806
|
+
/*.mem_size =*/ ggml_tensor_overhead(),
|
|
807
|
+
/*.mem_buffer =*/ NULL,
|
|
808
|
+
/*.no_alloc =*/ true,
|
|
809
|
+
};
|
|
810
|
+
|
|
811
|
+
struct ggml_context * ctx = ggml_init(params);
|
|
812
|
+
ggml_tensor * tensor = deserialize_tensor(ctx, &request.tensor);
|
|
813
|
+
|
|
814
|
+
if (tensor == nullptr) {
|
|
815
|
+
GGML_LOG_ERROR("Null tensor pointer passed to server get_alloc_size function.\n");
|
|
816
|
+
ggml_free(ctx);
|
|
817
|
+
return false;
|
|
818
|
+
}
|
|
819
|
+
|
|
820
|
+
if (tensor->buffer == nullptr) {
|
|
821
|
+
//No buffer allocated.
|
|
822
|
+
buft = ggml_backend_get_default_buffer_type(backend);
|
|
823
|
+
} else {
|
|
824
|
+
buft = tensor->buffer->buft;
|
|
825
|
+
}
|
|
826
|
+
|
|
827
|
+
response.alloc_size = ggml_backend_buft_get_alloc_size(buft,tensor);
|
|
828
|
+
|
|
829
|
+
ggml_free(ctx);
|
|
830
|
+
return true;
|
|
831
|
+
}
|
|
832
|
+
|
|
773
833
|
void rpc_server::alloc_buffer(const rpc_msg_alloc_buffer_req & request, rpc_msg_alloc_buffer_rsp & response) {
|
|
774
834
|
ggml_backend_buffer_type_t buft = ggml_backend_get_default_buffer_type(backend);
|
|
775
835
|
ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, request.size);
|
|
@@ -781,7 +841,7 @@ void rpc_server::alloc_buffer(const rpc_msg_alloc_buffer_req & request, rpc_msg_
|
|
|
781
841
|
GGML_PRINT_DEBUG("[%s] size: %" PRIu64 " -> remote_ptr: %" PRIx64 ", remote_size: %" PRIu64 "\n", __func__, request.size, response.remote_ptr, response.remote_size);
|
|
782
842
|
buffers.insert(buffer);
|
|
783
843
|
} else {
|
|
784
|
-
|
|
844
|
+
GGML_LOG_ERROR("[%s] size: %" PRIu64 " -> failed\n", __func__, request.size);
|
|
785
845
|
}
|
|
786
846
|
}
|
|
787
847
|
|
|
@@ -803,7 +863,7 @@ bool rpc_server::buffer_get_base(const rpc_msg_buffer_get_base_req & request, rp
|
|
|
803
863
|
GGML_PRINT_DEBUG("[%s] remote_ptr: %" PRIx64 "\n", __func__, request.remote_ptr);
|
|
804
864
|
ggml_backend_buffer_t buffer = reinterpret_cast<ggml_backend_buffer_t>(request.remote_ptr);
|
|
805
865
|
if (buffers.find(buffer) == buffers.end()) {
|
|
806
|
-
|
|
866
|
+
GGML_LOG_ERROR("[%s] buffer not found\n", __func__);
|
|
807
867
|
return false;
|
|
808
868
|
}
|
|
809
869
|
void * base = ggml_backend_buffer_get_base(buffer);
|
|
@@ -815,7 +875,7 @@ bool rpc_server::free_buffer(const rpc_msg_free_buffer_req & request) {
|
|
|
815
875
|
GGML_PRINT_DEBUG("[%s] remote_ptr: %" PRIx64 "\n", __func__, request.remote_ptr);
|
|
816
876
|
ggml_backend_buffer_t buffer = reinterpret_cast<ggml_backend_buffer_t>(request.remote_ptr);
|
|
817
877
|
if (buffers.find(buffer) == buffers.end()) {
|
|
818
|
-
|
|
878
|
+
GGML_LOG_ERROR("[%s] buffer not found\n", __func__);
|
|
819
879
|
return false;
|
|
820
880
|
}
|
|
821
881
|
ggml_backend_buffer_free(buffer);
|
|
@@ -827,7 +887,7 @@ bool rpc_server::buffer_clear(const rpc_msg_buffer_clear_req & request) {
|
|
|
827
887
|
GGML_PRINT_DEBUG("[%s] remote_ptr: %" PRIx64 ", value: %u\n", __func__, request.remote_ptr, request.value);
|
|
828
888
|
ggml_backend_buffer_t buffer = reinterpret_cast<ggml_backend_buffer_t>(request.remote_ptr);
|
|
829
889
|
if (buffers.find(buffer) == buffers.end()) {
|
|
830
|
-
|
|
890
|
+
GGML_LOG_ERROR("[%s] buffer not found\n", __func__);
|
|
831
891
|
return false;
|
|
832
892
|
}
|
|
833
893
|
ggml_backend_buffer_clear(buffer, request.value);
|
|
@@ -883,7 +943,7 @@ bool rpc_server::set_tensor(const std::vector<uint8_t> & input) {
|
|
|
883
943
|
struct ggml_context * ctx = ggml_init(params);
|
|
884
944
|
ggml_tensor * tensor = deserialize_tensor(ctx, in_tensor);
|
|
885
945
|
if (tensor == nullptr) {
|
|
886
|
-
|
|
946
|
+
GGML_LOG_ERROR("[%s] error deserializing tensor\n", __func__);
|
|
887
947
|
ggml_free(ctx);
|
|
888
948
|
return false;
|
|
889
949
|
}
|
|
@@ -905,6 +965,40 @@ bool rpc_server::set_tensor(const std::vector<uint8_t> & input) {
|
|
|
905
965
|
return true;
|
|
906
966
|
}
|
|
907
967
|
|
|
968
|
+
bool rpc_server::init_tensor(const rpc_msg_init_tensor_req & request) {
|
|
969
|
+
struct ggml_init_params params {
|
|
970
|
+
/*.mem_size =*/ ggml_tensor_overhead(),
|
|
971
|
+
/*.mem_buffer =*/ NULL,
|
|
972
|
+
/*.no_alloc =*/ true,
|
|
973
|
+
};
|
|
974
|
+
struct ggml_context * ctx = ggml_init(params);
|
|
975
|
+
ggml_tensor * tensor = deserialize_tensor(ctx, &request.tensor);
|
|
976
|
+
if (tensor == nullptr) {
|
|
977
|
+
GGML_LOG_ERROR("Null tensor pointer passed to server init_tensor function.\n");
|
|
978
|
+
ggml_free(ctx);
|
|
979
|
+
return false;
|
|
980
|
+
}
|
|
981
|
+
|
|
982
|
+
// Call the backend's buffer_init_tensor function
|
|
983
|
+
ggml_backend_buffer_t buffer = tensor->buffer;
|
|
984
|
+
if (buffer && buffer->iface.init_tensor) {
|
|
985
|
+
buffer->iface.init_tensor(buffer, tensor);
|
|
986
|
+
} else {
|
|
987
|
+
GGML_LOG_ERROR("Null buffer for tensor passed to init_tensor function\n");
|
|
988
|
+
}
|
|
989
|
+
|
|
990
|
+
if (tensor->extra != nullptr) {
|
|
991
|
+
// This pointer can either be passed around client/server, or probably better stored server-side and kept track of.
|
|
992
|
+
// Currently unimplemented.
|
|
993
|
+
GGML_LOG_ERROR("tensor->extra populated by the backend, this is currently unsupported.\n");
|
|
994
|
+
ggml_free(ctx);
|
|
995
|
+
return false;
|
|
996
|
+
}
|
|
997
|
+
|
|
998
|
+
ggml_free(ctx);
|
|
999
|
+
return true;
|
|
1000
|
+
}
|
|
1001
|
+
|
|
908
1002
|
bool rpc_server::get_tensor(const rpc_msg_get_tensor_req & request, std::vector<uint8_t> & response) {
|
|
909
1003
|
struct ggml_init_params params {
|
|
910
1004
|
/*.mem_size =*/ ggml_tensor_overhead(),
|
|
@@ -914,7 +1008,7 @@ bool rpc_server::get_tensor(const rpc_msg_get_tensor_req & request, std::vector<
|
|
|
914
1008
|
struct ggml_context * ctx = ggml_init(params);
|
|
915
1009
|
ggml_tensor * tensor = deserialize_tensor(ctx, &request.tensor);
|
|
916
1010
|
if (tensor == nullptr) {
|
|
917
|
-
|
|
1011
|
+
GGML_LOG_ERROR("[%s] error deserializing tensor\n", __func__);
|
|
918
1012
|
ggml_free(ctx);
|
|
919
1013
|
return false;
|
|
920
1014
|
}
|
|
@@ -948,7 +1042,7 @@ bool rpc_server::copy_tensor(const rpc_msg_copy_tensor_req & request, rpc_msg_co
|
|
|
948
1042
|
ggml_tensor * src = deserialize_tensor(ctx, &request.src);
|
|
949
1043
|
ggml_tensor * dst = deserialize_tensor(ctx, &request.dst);
|
|
950
1044
|
if (src == nullptr || dst == nullptr) {
|
|
951
|
-
|
|
1045
|
+
GGML_LOG_ERROR("[%s] error deserializing tensors\n", __func__);
|
|
952
1046
|
ggml_free(ctx);
|
|
953
1047
|
return false;
|
|
954
1048
|
}
|
|
@@ -1058,6 +1152,18 @@ static void rpc_serve_client(ggml_backend_t backend, sockfd_t sockfd, size_t fre
|
|
|
1058
1152
|
}
|
|
1059
1153
|
break;
|
|
1060
1154
|
}
|
|
1155
|
+
case RPC_CMD_GET_ALLOC_SIZE: {
|
|
1156
|
+
rpc_msg_get_alloc_size_req request;
|
|
1157
|
+
if (!recv_msg(sockfd, &request, sizeof(request))) {
|
|
1158
|
+
return;
|
|
1159
|
+
}
|
|
1160
|
+
rpc_msg_get_alloc_size_rsp response;
|
|
1161
|
+
server.get_alloc_size(request, response);
|
|
1162
|
+
if (!send_msg(sockfd, &response, sizeof(response))) {
|
|
1163
|
+
return;
|
|
1164
|
+
}
|
|
1165
|
+
break;
|
|
1166
|
+
}
|
|
1061
1167
|
case RPC_CMD_GET_ALIGNMENT: {
|
|
1062
1168
|
if (!recv_msg(sockfd, nullptr, 0)) {
|
|
1063
1169
|
return;
|
|
@@ -1133,6 +1239,19 @@ static void rpc_serve_client(ggml_backend_t backend, sockfd_t sockfd, size_t fre
|
|
|
1133
1239
|
}
|
|
1134
1240
|
break;
|
|
1135
1241
|
}
|
|
1242
|
+
case RPC_CMD_INIT_TENSOR: {
|
|
1243
|
+
rpc_msg_init_tensor_req request;
|
|
1244
|
+
if (!recv_msg(sockfd, &request,sizeof(request))) {
|
|
1245
|
+
return;
|
|
1246
|
+
}
|
|
1247
|
+
if (!server.init_tensor(request)) {
|
|
1248
|
+
return;
|
|
1249
|
+
}
|
|
1250
|
+
if (!send_msg(sockfd, nullptr, 0)) {
|
|
1251
|
+
return;
|
|
1252
|
+
}
|
|
1253
|
+
break;
|
|
1254
|
+
}
|
|
1136
1255
|
case RPC_CMD_GET_TENSOR: {
|
|
1137
1256
|
rpc_msg_get_tensor_req request;
|
|
1138
1257
|
if (!recv_msg(sockfd, &request, sizeof(request))) {
|
|
@@ -1257,14 +1376,14 @@ static void ggml_backend_rpc_device_get_memory(ggml_backend_dev_t dev, size_t *
|
|
|
1257
1376
|
|
|
1258
1377
|
ggml_backend_rpc_get_device_memory(ctx->endpoint.c_str(), free, total);
|
|
1259
1378
|
|
|
1260
|
-
|
|
1379
|
+
GGML_UNUSED(dev);
|
|
1261
1380
|
}
|
|
1262
1381
|
|
|
1263
1382
|
static enum ggml_backend_dev_type ggml_backend_rpc_device_get_type(ggml_backend_dev_t dev) {
|
|
1264
1383
|
// TODO: obtain value from the server
|
|
1265
1384
|
return GGML_BACKEND_DEVICE_TYPE_GPU;
|
|
1266
1385
|
|
|
1267
|
-
|
|
1386
|
+
GGML_UNUSED(dev);
|
|
1268
1387
|
}
|
|
1269
1388
|
|
|
1270
1389
|
static void ggml_backend_rpc_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) {
|
|
@@ -1285,7 +1404,7 @@ static ggml_backend_t ggml_backend_rpc_device_init(ggml_backend_dev_t dev, const
|
|
|
1285
1404
|
|
|
1286
1405
|
return ggml_backend_rpc_init(ctx->endpoint.c_str());
|
|
1287
1406
|
|
|
1288
|
-
|
|
1407
|
+
GGML_UNUSED(params);
|
|
1289
1408
|
}
|
|
1290
1409
|
|
|
1291
1410
|
static ggml_backend_buffer_type_t ggml_backend_rpc_device_get_buffer_type(ggml_backend_dev_t dev) {
|
|
@@ -1293,12 +1412,12 @@ static ggml_backend_buffer_type_t ggml_backend_rpc_device_get_buffer_type(ggml_b
|
|
|
1293
1412
|
|
|
1294
1413
|
return ggml_backend_rpc_buffer_type(ctx->endpoint.c_str());
|
|
1295
1414
|
|
|
1296
|
-
|
|
1415
|
+
GGML_UNUSED(dev);
|
|
1297
1416
|
}
|
|
1298
1417
|
|
|
1299
1418
|
static bool ggml_backend_rpc_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) {
|
|
1300
|
-
|
|
1301
|
-
|
|
1419
|
+
GGML_UNUSED(dev);
|
|
1420
|
+
GGML_UNUSED(op);
|
|
1302
1421
|
//TODO: call the remote backend and cache the results
|
|
1303
1422
|
return true;
|
|
1304
1423
|
}
|
|
@@ -1335,20 +1454,20 @@ static const struct ggml_backend_device_i ggml_backend_rpc_device_i = {
|
|
|
1335
1454
|
static const char * ggml_backend_rpc_reg_get_name(ggml_backend_reg_t reg) {
|
|
1336
1455
|
return "RPC";
|
|
1337
1456
|
|
|
1338
|
-
|
|
1457
|
+
GGML_UNUSED(reg);
|
|
1339
1458
|
}
|
|
1340
1459
|
|
|
1341
1460
|
static size_t ggml_backend_rpc_reg_get_device_count(ggml_backend_reg_t reg) {
|
|
1342
1461
|
return 0;
|
|
1343
1462
|
|
|
1344
|
-
|
|
1463
|
+
GGML_UNUSED(reg);
|
|
1345
1464
|
}
|
|
1346
1465
|
|
|
1347
1466
|
static ggml_backend_dev_t ggml_backend_rpc_reg_get_device(ggml_backend_reg_t reg, size_t index) {
|
|
1348
1467
|
GGML_ABORT("The RPC backend does not have enumerated devices - use ggml_backend_add_device instead");
|
|
1349
1468
|
|
|
1350
|
-
|
|
1351
|
-
|
|
1469
|
+
GGML_UNUSED(reg);
|
|
1470
|
+
GGML_UNUSED(index);
|
|
1352
1471
|
}
|
|
1353
1472
|
|
|
1354
1473
|
static void * ggml_backend_rpc_get_proc_address(ggml_backend_reg_t reg, const char * name) {
|
|
@@ -1357,7 +1476,7 @@ static void * ggml_backend_rpc_get_proc_address(ggml_backend_reg_t reg, const ch
|
|
|
1357
1476
|
}
|
|
1358
1477
|
return NULL;
|
|
1359
1478
|
|
|
1360
|
-
|
|
1479
|
+
GGML_UNUSED(reg);
|
|
1361
1480
|
}
|
|
1362
1481
|
|
|
1363
1482
|
static const struct ggml_backend_reg_i ggml_backend_rpc_reg_i = {
|
|
@@ -11,6 +11,8 @@
|
|
|
11
11
|
//
|
|
12
12
|
|
|
13
13
|
#include "common.hpp"
|
|
14
|
+
|
|
15
|
+
#include "ggml-backend-impl.h"
|
|
14
16
|
#include "ggml-impl.h"
|
|
15
17
|
|
|
16
18
|
int get_current_device_id() {
|
|
@@ -49,6 +51,10 @@ void ggml_sycl_host_free(void* ptr) try {
|
|
|
49
51
|
std::exit(1);
|
|
50
52
|
}
|
|
51
53
|
|
|
54
|
+
bool gpu_has_xmx(sycl::device &dev) {
|
|
55
|
+
return dev.has(sycl::aspect::ext_intel_matrix);
|
|
56
|
+
}
|
|
57
|
+
|
|
52
58
|
int64_t downsample_sycl_global_range(int64_t accumulate_block_num, int64_t block_size) {
|
|
53
59
|
const int64_t max_range = std::numeric_limits<int>::max();
|
|
54
60
|
int64_t sycl_down_blk_size = block_size;
|
|
@@ -65,9 +71,9 @@ void ggml_sycl_op_flatten(ggml_backend_sycl_context & ctx, const ggml_tensor *sr
|
|
|
65
71
|
const ggml_sycl_op_flatten_t op) try {
|
|
66
72
|
|
|
67
73
|
const bool use_src1 = src1 != nullptr;
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
GGML_ASSERT(
|
|
74
|
+
if(use_src1)
|
|
75
|
+
GGML_ASSERT(strcmp(src1->buffer->buft->iface.get_name(src1->buffer->buft), GGML_SYCL_NAME "_Split") != 0);
|
|
76
|
+
GGML_ASSERT(strcmp(dst->buffer->buft->iface.get_name(dst->buffer->buft), GGML_SYCL_NAME "_Split") != 0);
|
|
71
77
|
|
|
72
78
|
// dd = data device
|
|
73
79
|
float * src0_ddf = (float *) src0->data;
|
|
@@ -26,7 +26,11 @@
|
|
|
26
26
|
|
|
27
27
|
#define GGML_COMMON_DECL_SYCL
|
|
28
28
|
#define GGML_COMMON_IMPL_SYCL
|
|
29
|
+
/* suppress warning spam */
|
|
30
|
+
#pragma clang diagnostic push
|
|
31
|
+
#pragma clang diagnostic ignored "-Wnested-anon-types"
|
|
29
32
|
#include "ggml-common.h"
|
|
33
|
+
#pragma clang diagnostic pop
|
|
30
34
|
|
|
31
35
|
void* ggml_sycl_host_malloc(size_t size);
|
|
32
36
|
void ggml_sycl_host_free(void* ptr);
|
|
@@ -329,8 +333,12 @@ struct ggml_backend_sycl_context {
|
|
|
329
333
|
// pool
|
|
330
334
|
std::unique_ptr<ggml_sycl_pool> pools[GGML_SYCL_MAX_DEVICES];
|
|
331
335
|
|
|
336
|
+
std::unique_ptr<ggml_sycl_pool> host_pools[GGML_SYCL_MAX_DEVICES];
|
|
337
|
+
|
|
332
338
|
static std::unique_ptr<ggml_sycl_pool> new_pool_for_device(queue_ptr qptr, int device);
|
|
333
339
|
|
|
340
|
+
static std::unique_ptr<ggml_sycl_pool> new_pool_for_host(queue_ptr qptr, int device);
|
|
341
|
+
|
|
334
342
|
ggml_sycl_pool & pool(int device) {
|
|
335
343
|
if (pools[device] == nullptr) {
|
|
336
344
|
pools[device] = new_pool_for_device(stream(device,0), device);
|
|
@@ -341,6 +349,15 @@ struct ggml_backend_sycl_context {
|
|
|
341
349
|
ggml_sycl_pool & pool() {
|
|
342
350
|
return pool(device);
|
|
343
351
|
}
|
|
352
|
+
|
|
353
|
+
ggml_sycl_pool & host_pool(int device) {
|
|
354
|
+
if (host_pools[device] == nullptr) {
|
|
355
|
+
host_pools[device] = new_pool_for_host(stream(device, 0), device);
|
|
356
|
+
}
|
|
357
|
+
return *host_pools[device];
|
|
358
|
+
}
|
|
359
|
+
|
|
360
|
+
ggml_sycl_pool & host_pool() { return host_pool(device); }
|
|
344
361
|
};
|
|
345
362
|
|
|
346
363
|
// common device functions
|
|
@@ -658,6 +675,7 @@ inline void ggml_sycl_op_bin_bcast(ggml_backend_sycl_context & ctx, const ggml_t
|
|
|
658
675
|
}
|
|
659
676
|
}
|
|
660
677
|
|
|
678
|
+
bool gpu_has_xmx(sycl::device &dev);
|
|
661
679
|
|
|
662
680
|
void ggml_sycl_op_flatten(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
|
|
663
681
|
const ggml_tensor *src1, ggml_tensor *dst,
|
|
@@ -158,8 +158,9 @@ static void concat_f32_sycl_non_cont(
|
|
|
158
158
|
});
|
|
159
159
|
}
|
|
160
160
|
|
|
161
|
-
void ggml_sycl_op_concat(ggml_backend_sycl_context & ctx,
|
|
162
|
-
|
|
161
|
+
void ggml_sycl_op_concat(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
|
|
162
|
+
const ggml_tensor *src0 = dst->src[0];
|
|
163
|
+
const ggml_tensor *src1 = dst->src[1];
|
|
163
164
|
queue_ptr stream = ctx.stream();
|
|
164
165
|
|
|
165
166
|
const int32_t dim = ((int32_t *)dst->op_params)[0];
|
|
@@ -15,7 +15,6 @@
|
|
|
15
15
|
|
|
16
16
|
#include "common.hpp"
|
|
17
17
|
|
|
18
|
-
void ggml_sycl_op_concat(ggml_backend_sycl_context & ctx,
|
|
19
|
-
const ggml_tensor *src1, ggml_tensor *dst);
|
|
18
|
+
void ggml_sycl_op_concat(ggml_backend_sycl_context & ctx, ggml_tensor *dst);
|
|
20
19
|
|
|
21
20
|
#endif // GGML_SYCL_CONCAT_HPP
|
|
@@ -71,8 +71,9 @@ static void conv_transpose_1d_f32_f32_sycl(
|
|
|
71
71
|
});
|
|
72
72
|
}
|
|
73
73
|
|
|
74
|
-
void ggml_sycl_op_conv_transpose_1d(ggml_backend_sycl_context & ctx,
|
|
75
|
-
const ggml_tensor *
|
|
74
|
+
void ggml_sycl_op_conv_transpose_1d(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
|
|
75
|
+
const ggml_tensor *src0 = dst->src[0];
|
|
76
|
+
const ggml_tensor *src1 = dst->src[1];
|
|
76
77
|
const float * src0_d = (const float *)src0->data;
|
|
77
78
|
const float * src1_d = (const float *)src1->data;
|
|
78
79
|
|
|
@@ -15,7 +15,6 @@
|
|
|
15
15
|
|
|
16
16
|
#include "common.hpp"
|
|
17
17
|
|
|
18
|
-
void ggml_sycl_op_conv_transpose_1d(ggml_backend_sycl_context & ctx,
|
|
19
|
-
const ggml_tensor *src1, ggml_tensor *dst);
|
|
18
|
+
void ggml_sycl_op_conv_transpose_1d(ggml_backend_sycl_context & ctx, ggml_tensor *dst);
|
|
20
19
|
|
|
21
20
|
#endif // GGML_SYCL_CONV_HPP
|