@fugood/llama.node 0.3.12 → 0.3.14
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-cuda/arm64/llama-node.node +0 -0
- package/bin/linux-cuda/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/lib/binding.ts +2 -1
- package/package.json +1 -1
- package/src/LlamaCompletionWorker.cpp +14 -0
- package/src/LlamaContext.cpp +110 -79
- package/src/LlamaContext.h +1 -1
- package/src/common.hpp +1 -2
- package/src/llama.cpp/.github/workflows/build.yml +95 -13
- package/src/llama.cpp/.github/workflows/docker.yml +2 -0
- package/src/llama.cpp/.github/workflows/labeler.yml +1 -1
- package/src/llama.cpp/.github/workflows/server.yml +2 -0
- package/src/llama.cpp/common/CMakeLists.txt +23 -6
- package/src/llama.cpp/common/arg.cpp +292 -14
- package/src/llama.cpp/common/chat.cpp +1128 -315
- package/src/llama.cpp/common/chat.h +135 -0
- package/src/llama.cpp/common/common.cpp +27 -171
- package/src/llama.cpp/common/common.h +41 -73
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +4 -5
- package/src/llama.cpp/common/json-schema-to-grammar.h +0 -1
- package/src/llama.cpp/common/llguidance.cpp +3 -3
- package/src/llama.cpp/common/log.cpp +1 -0
- package/src/llama.cpp/common/log.h +2 -1
- package/src/llama.cpp/common/{chat-template.hpp → minja/chat-template.hpp} +21 -7
- package/src/llama.cpp/common/{minja.hpp → minja/minja.hpp} +61 -14
- package/src/llama.cpp/common/ngram-cache.cpp +1 -0
- package/src/llama.cpp/common/sampling.cpp +93 -49
- package/src/llama.cpp/common/speculative.cpp +6 -5
- package/src/llama.cpp/common/speculative.h +1 -1
- package/src/llama.cpp/docs/build.md +47 -9
- package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +3 -1
- package/src/llama.cpp/examples/embedding/embedding.cpp +1 -0
- package/src/llama.cpp/examples/export-lora/export-lora.cpp +4 -2
- package/src/llama.cpp/examples/imatrix/imatrix.cpp +4 -4
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +6 -5
- package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +1 -1
- package/src/llama.cpp/examples/llava/CMakeLists.txt +7 -0
- package/src/llama.cpp/examples/llava/clip.cpp +373 -107
- package/src/llama.cpp/examples/llava/clip.h +19 -3
- package/src/llama.cpp/examples/llava/gemma3-cli.cpp +341 -0
- package/src/llama.cpp/examples/llava/llava.cpp +4 -2
- package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +30 -11
- package/src/llama.cpp/examples/lookahead/lookahead.cpp +1 -0
- package/src/llama.cpp/examples/main/main.cpp +73 -28
- package/src/llama.cpp/examples/parallel/parallel.cpp +1 -0
- package/src/llama.cpp/examples/passkey/passkey.cpp +1 -0
- package/src/llama.cpp/examples/perplexity/perplexity.cpp +1 -0
- package/src/llama.cpp/examples/quantize/quantize.cpp +1 -0
- package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.cpp +882 -237
- package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.h +35 -26
- package/src/llama.cpp/examples/run/run.cpp +115 -79
- package/src/llama.cpp/examples/server/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/server/httplib.h +381 -292
- package/src/llama.cpp/examples/server/server.cpp +134 -128
- package/src/llama.cpp/examples/server/utils.hpp +95 -106
- package/src/llama.cpp/examples/sycl/run-llama2.sh +2 -2
- package/src/llama.cpp/examples/tts/tts.cpp +251 -142
- package/src/llama.cpp/ggml/CMakeLists.txt +13 -1
- package/src/llama.cpp/ggml/include/ggml-alloc.h +1 -1
- package/src/llama.cpp/ggml/include/ggml-backend.h +3 -3
- package/src/llama.cpp/ggml/include/ggml-cpu.h +4 -1
- package/src/llama.cpp/ggml/include/ggml-metal.h +1 -1
- package/src/llama.cpp/ggml/include/ggml-vulkan.h +0 -2
- package/src/llama.cpp/ggml/include/ggml.h +6 -2
- package/src/llama.cpp/ggml/src/CMakeLists.txt +10 -7
- package/src/llama.cpp/ggml/src/ggml-alloc.c +24 -15
- package/src/llama.cpp/ggml/src/ggml-backend-impl.h +1 -1
- package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +58 -54
- package/src/llama.cpp/ggml/src/ggml-backend.cpp +10 -8
- package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +3 -2
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +3 -5
- package/src/llama.cpp/ggml/src/ggml-common.h +0 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +132 -17
- package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +2 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +4 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +2 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +156 -11
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +2235 -641
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1572 -198
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +24 -5
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +259 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +61 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +288 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.h +17 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +9 -8
- package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +16 -3
- package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +14 -0
- package/src/llama.cpp/ggml/src/ggml-impl.h +1 -1
- package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +4 -5
- package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +235 -0
- package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +6 -2
- package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +1 -0
- package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +246 -120
- package/src/llama.cpp/ggml/src/ggml-quants.c +114 -114
- package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +2 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +2 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +1 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +17 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +51 -10
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +33 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +2 -2
- package/src/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +701 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/cpy.hpp +11 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +55 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +136 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +308 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/getrows.hpp +23 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +174 -728
- package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +75 -77
- package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +3 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/sycl_hw.cpp +13 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/sycl_hw.hpp +23 -0
- package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +949 -602
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +37 -3
- package/src/llama.cpp/ggml/src/ggml.c +9 -4
- package/src/llama.cpp/include/llama.h +32 -14
- package/src/llama.cpp/models/ggml-vocab-gpt-4o.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-gpt-4o.gguf.out +46 -0
- package/src/llama.cpp/requirements/requirements-all.txt +1 -0
- package/src/llama.cpp/requirements/requirements-tool_bench.txt +12 -0
- package/src/llama.cpp/requirements.txt +1 -0
- package/src/llama.cpp/src/llama-arch.cpp +21 -0
- package/src/llama.cpp/src/llama-arch.h +1 -0
- package/src/llama.cpp/src/llama-chat.cpp +1 -0
- package/src/llama.cpp/src/llama-grammar.cpp +183 -183
- package/src/llama.cpp/src/llama-grammar.h +13 -4
- package/src/llama.cpp/src/llama-impl.h +6 -6
- package/src/llama.cpp/src/llama-kv-cache.h +2 -1
- package/src/llama.cpp/src/llama-mmap.cpp +11 -1
- package/src/llama.cpp/src/llama-mmap.h +1 -0
- package/src/llama.cpp/src/llama-model.cpp +70 -6
- package/src/llama.cpp/src/llama-sampling.cpp +174 -67
- package/src/llama.cpp/src/llama-vocab.cpp +12 -0
- package/src/llama.cpp/src/llama.cpp +154 -5
- package/src/llama.cpp/src/unicode.cpp +9 -2
- package/src/llama.cpp/tests/test-backend-ops.cpp +171 -115
- package/src/llama.cpp/tests/test-chat-template.cpp +32 -22
- package/src/llama.cpp/tests/test-chat.cpp +691 -325
- package/src/llama.cpp/tests/test-gguf.cpp +4 -4
- package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +63 -63
- package/src/llama.cpp/tests/test-quantize-fns.cpp +1 -9
- package/src/llama.cpp/tests/test-sampling.cpp +15 -0
- package/src/llama.cpp/Sources/llama/llama.h +0 -4
- package/src/llama.cpp/common/chat.hpp +0 -52
Selected hunks:

package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp

```diff
@@ -464,7 +464,7 @@ static rpc_tensor serialize_tensor(const ggml_tensor * tensor) {
     return result;
 }
 
-static void ggml_backend_rpc_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
+static enum ggml_status ggml_backend_rpc_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
     ggml_backend_rpc_buffer_context * ctx = (ggml_backend_rpc_buffer_context *)buffer->context;
 
     // CUDA backend on the server pads everything to 512 due to CUDA limitations.
@@ -478,6 +478,7 @@ static void ggml_backend_rpc_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
         bool status = send_rpc_cmd(ctx->sock, RPC_CMD_INIT_TENSOR, &request, sizeof(request), nullptr, 0);
         GGML_ASSERT(status);
     }
+    return GGML_STATUS_SUCCESS;
 }
 
 static void ggml_backend_rpc_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
```
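The RPC hunk tracks an upstream interface change (see `ggml-backend-impl.h +1 -1` in the file list): the buffer `init_tensor` callback now returns `enum ggml_status` instead of `void`, so backends can report per-tensor initialization failures. A minimal sketch of a conforming callback under the new signature; the `my_*` names are hypothetical, the `GGML_STATUS_*` values are from `ggml.h`:

```cpp
#include "ggml-backend-impl.h"

// Hedged sketch: a backend init_tensor callback under the new interface.
static enum ggml_status my_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
    if (buffer == nullptr || tensor == nullptr) {
        return GGML_STATUS_FAILED;   // report failure instead of asserting
    }
    // ... backend-specific per-tensor setup would go here ...
    return GGML_STATUS_SUCCESS;      // success is now reported explicitly
}
```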
package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp

```diff
@@ -99,3 +99,20 @@ catch (sycl::exception const &exc) {
             << ", line:" << __LINE__ << std::endl;
   std::exit(1);
 }
+
+
+void release_extra_gpu(ggml_tensor_extra_gpu * extra, std::vector<queue_ptr> streams) {
+    for (int i = 0; i < ggml_sycl_info().device_count; ++i) {
+        for (int64_t is = 0; is < GGML_SYCL_MAX_STREAMS; ++is) {
+            if (extra->events[i][is] != nullptr) {
+                SYCL_CHECK(CHECK_TRY_ERROR(dpct::destroy_event(extra->events[i][is])));
+            }
+        }
+        if (extra->data_device[i] != nullptr && streams.size()>0) {
+            ggml_sycl_set_device(i);
+            SYCL_CHECK(
+                CHECK_TRY_ERROR(sycl::free(extra->data_device[i], *(streams[i]))));
+        }
+    }
+    delete extra;
+}
```
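`release_extra_gpu` has two cleanup modes, keyed on whether `streams` is passed: with the empty default (declared in `common.hpp` below) it only destroys the per-device sync events and deletes the struct; given one queue per device it additionally frees `data_device[i]` on the matching context. A hedged usage sketch (the wrapper function is illustrative, not the package's code):

```cpp
// Hedged sketch: the two cleanup modes of release_extra_gpu() shown above.
static void free_tensor_extra(ggml_tensor * tensor, std::vector<queue_ptr> streams) {
    auto * extra = static_cast<ggml_tensor_extra_gpu *>(tensor->extra);
    if (streams.empty()) {
        release_extra_gpu(extra);            // destroy sync events only;
                                             // device memory owned elsewhere
    } else {
        release_extra_gpu(extra, streams);   // also sycl::free() each
                                             // extra->data_device[i]
    }
    tensor->extra = nullptr;
}
```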
package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp

```diff
@@ -19,6 +19,9 @@
 #include "dpct/helper.hpp"
 #include "ggml-sycl.h"
 #include "presets.hpp"
+#include "sycl_hw.hpp"
+
+
 #if GGML_SYCL_DNNL
 #include "dnnl.hpp"
 #include "dnnl_sycl.hpp"
@@ -31,11 +34,15 @@
 #pragma clang diagnostic ignored "-Wnested-anon-types"
 #include "ggml-common.h"
 #pragma clang diagnostic pop
+#include "ggml-impl.h"
 
 void* ggml_sycl_host_malloc(size_t size);
 void ggml_sycl_host_free(void* ptr);
 
-
+
+extern int g_ggml_sycl_debug;
+extern int g_ggml_sycl_disable_optimize;
+
 #define GGML_SYCL_DEBUG(...) \
   do { \
     if (g_ggml_sycl_debug) \
@@ -182,18 +189,24 @@ inline dpct::err0 ggml_sycl_set_device(const int device) try {
 }
 
 //////////////////////
+struct optimize_feature {
+    bool reorder=false;
+};
+
+struct sycl_device_info {
+    int cc;                 // compute capability
+    // int nsm;             // number of streaming multiprocessors
+    // size_t smpb;         // max. shared memory per block
+    bool vmm;               // virtual memory support
+    size_t total_vram;
+    sycl_hw_info hw_info;
+    optimize_feature opt_feature;
+};
+
 
 struct ggml_sycl_device_info {
     int device_count;
 
-    struct sycl_device_info {
-        int cc;                 // compute capability
-        // int nsm;             // number of streaming multiprocessors
-        // size_t smpb;         // max. shared memory per block
-        bool vmm;               // virtual memory support
-        size_t total_vram;
-    };
-
     sycl_device_info devices[GGML_SYCL_MAX_DEVICES] = {};
 
     std::array<float, GGML_SYCL_MAX_DEVICES> default_tensor_split = {};
@@ -260,17 +273,46 @@ struct ggml_tensor_extra_gpu {
     // tensors
     dpct::event_ptr events[GGML_SYCL_MAX_DEVICES]
                           [GGML_SYCL_MAX_STREAMS]; // events for synchronizing multiple GPUs
+    optimize_feature optimized_feature;
 };
 
+void release_extra_gpu(ggml_tensor_extra_gpu * extra, std::vector<queue_ptr> streams={});
+
+inline optimize_feature check_gpu_optimize_feature(syclex::architecture &arch) {
+    optimize_feature opt;
+
+    opt.reorder =
+        (arch == syclex::architecture::intel_gpu_dg1 ||
+         arch == syclex::architecture::intel_gpu_acm_g10 ||
+         arch == syclex::architecture::intel_gpu_acm_g11 ||
+         arch == syclex::architecture::intel_gpu_acm_g12 ||
+         arch == syclex::architecture::intel_gpu_pvc ||
+         arch == syclex::architecture::intel_gpu_pvc_vg ||
+         arch == syclex::architecture::intel_gpu_mtl_u ||
+         arch == syclex::architecture::intel_gpu_mtl_s ||
+         arch == syclex::architecture::intel_gpu_mtl_h ||
+         arch == syclex::architecture::intel_gpu_arl_u ||
+         arch == syclex::architecture::intel_gpu_arl_s ||
+         arch == syclex::architecture::intel_gpu_arl_h ||
+         arch == syclex::architecture::intel_gpu_bmg_g21 ||
+         arch == syclex::architecture::intel_gpu_lnl_m
+        );
+
+    return opt;
+}
+
 struct ggml_backend_sycl_context {
     int device;
     std::string name;
+    optimize_feature opt_feature;
+    bool optimized_graph=false;
 
     queue_ptr qptrs[GGML_SYCL_MAX_DEVICES][GGML_SYCL_MAX_STREAMS] = { { nullptr } };
 
     explicit ggml_backend_sycl_context(int device) :
         device(device),
         name(GGML_SYCL_NAME + std::to_string(device)) {
+        opt_feature = ggml_sycl_info().devices[device].opt_feature;
    }
 
     queue_ptr stream(int device, int stream) {
@@ -680,5 +722,4 @@ bool gpu_has_xmx(sycl::device &dev);
 void ggml_sycl_op_flatten(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
                           const ggml_tensor *src1, ggml_tensor *dst,
                           const ggml_sycl_op_flatten_t op);
-
 #endif // GGML_SYCL_COMMON_HPP
```
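Taken together, these header changes thread a per-architecture feature flag through three layers: `check_gpu_optimize_feature()` maps a SYCL architecture to an `optimize_feature`, device enumeration stores it in `ggml_sycl_device_info::devices[i].opt_feature`, and each `ggml_backend_sycl_context` copies it in its constructor. A hedged sketch of the enumeration side; the loop is illustrative (not the package's exact code), and the architecture query assumes the `sycl_ext_oneapi_device_architecture` extension:

```cpp
// Assumes the declarations from ggml-sycl/common.hpp above.
namespace syclex = sycl::ext::oneapi::experimental;

// Illustrative only: populate opt_feature for each device at startup.
static void init_opt_features(ggml_sycl_device_info & info,
                              const std::vector<sycl::device> & devs) {
    for (int i = 0; i < info.device_count; ++i) {
        syclex::architecture arch =
            devs[i].get_info<syclex::info::device::architecture>();
        // Q4_0 reorder is enabled only for the Intel GPU families listed
        // in check_gpu_optimize_feature() above.
        info.devices[i].opt_feature = check_gpu_optimize_feature(arch);
    }
}
```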
package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp

```diff
@@ -125,6 +125,25 @@ static void dequantize_row_q4_0_sycl(const void *vx, dst_t *y, const int64_t k,
     }
 }
 
+template <typename dst_t>
+static void dequantize_row_q4_0_sycl_reorder(const void *vx, dst_t *y, const int64_t k,
+                                             dpct::queue_ptr stream) {
+
+    dpct::has_capability_or_fail(stream->get_device(),
+                                 {sycl::aspect::fp16});
+
+    int constexpr WARP_K = WARP_SIZE * QK4_0;
+    const int n_warp = (k + WARP_K - 1) / WARP_K;
+    GGML_ASSERT(k % 2 == 0);
+    stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, n_warp) *
+                                               sycl::range<3>(1, 1, WARP_SIZE),
+                                           sycl::range<3>(1, 1, WARP_SIZE)),
+                         [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(WARP_SIZE)]]{
+                             dequantize_block_q4_0_reorder(vx, y, k, item_ct1);
+                         });
+
+}
+
 template <typename dst_t>
 static void dequantize_row_q4_1_sycl(const void *vx, dst_t *y, const int64_t k,
                                      dpct::queue_ptr stream) {
```
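Launch geometry of the reorder kernel: each work-group is a single sub-group of `WARP_SIZE` work-items, and one such warp covers `WARP_K = WARP_SIZE * QK4_0` quantized values, so `n_warp = ceil(k / WARP_K)` work-groups span the row. Assuming the backend's usual `WARP_SIZE` of 32 and ggml's `QK4_0 = 32`, a row of k = 4096 values gives WARP_K = 1024 and launches ceil(4096 / 1024) = 4 work-groups of 32 work-items each.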
```diff
@@ -452,10 +471,15 @@ static void convert_unary_sycl(const void *__restrict__ vx,
     }
 }
 
-to_fp16_sycl_t ggml_get_to_fp16_sycl(ggml_type type) {
+to_fp16_sycl_t ggml_get_to_fp16_sycl(ggml_type type, ggml_tensor *dst) {
     switch (type) {
         case GGML_TYPE_Q4_0:
-            return dequantize_block_sycl<QK4_0, QR4_0, dequantize_q4_0>;
+            if (dst->src[0]->extra &&
+                ((ggml_tensor_extra_gpu*)dst->src[0]->extra)->optimized_feature.reorder) {
+                return dequantize_row_q4_0_sycl_reorder;
+            } else {
+                return dequantize_block_sycl<QK4_0, QR4_0, dequantize_q4_0>;
+            }
         case GGML_TYPE_Q4_1:
             return dequantize_block_sycl<QK4_1, QR4_1, dequantize_q4_1>;
         case GGML_TYPE_Q5_0:
@@ -499,10 +523,15 @@ to_fp16_sycl_t ggml_get_to_fp16_sycl(ggml_type type) {
     }
 }
 
-to_fp32_sycl_t ggml_get_to_fp32_sycl(ggml_type type) {
+to_fp32_sycl_t ggml_get_to_fp32_sycl(ggml_type type, ggml_tensor *dst) {
     switch (type) {
         case GGML_TYPE_Q4_0:
-            return dequantize_row_q4_0_sycl;
+            if (dst->src[0]->extra &&
+                ((ggml_tensor_extra_gpu*)dst->src[0]->extra)->optimized_feature.reorder) {
+                return dequantize_row_q4_0_sycl_reorder;
+            } else {
+                return dequantize_row_q4_0_sycl;
+            }
         case GGML_TYPE_Q4_1:
             return dequantize_row_q4_1_sycl;
         case GGML_TYPE_Q5_0:
```
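Since converter selection now depends on the destination tensor, callers must pass `dst` through. A hedged caller sketch; the wrapper function is illustrative, while `ggml_get_to_fp16_sycl` and `ctx.stream()` come from the headers in this diff:

```cpp
// Illustrative wrapper: pick and run the fp16 converter for dst's first source.
static void convert_src0_to_fp16(ggml_backend_sycl_context & ctx,
                                 ggml_tensor * dst, sycl::half * out, int64_t k) {
    const ggml_tensor * src0 = dst->src[0];
    // The getter inspects dst->src[0]->extra: a Q4_0 tensor whose extra has
    // optimized_feature.reorder set gets dequantize_row_q4_0_sycl_reorder.
    const to_fp16_sycl_t to_fp16 = ggml_get_to_fp16_sycl(src0->type, dst);
    to_fp16(src0->data, out, k, ctx.stream(ctx.device, 0));
}
```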
package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp

```diff
@@ -21,7 +21,7 @@ using to_t_sycl_t = void (*)(const void *__restrict__ x, T *__restrict__ y,
 typedef to_t_sycl_t<float> to_fp32_sycl_t;
 typedef to_t_sycl_t<sycl::half> to_fp16_sycl_t;
 
-to_fp16_sycl_t ggml_get_to_fp16_sycl(ggml_type type);
-to_fp32_sycl_t ggml_get_to_fp32_sycl(ggml_type type);
+to_fp16_sycl_t ggml_get_to_fp16_sycl(ggml_type type, ggml_tensor *dst);
+to_fp32_sycl_t ggml_get_to_fp32_sycl(ggml_type type, ggml_tensor *dst);
 
 #endif // GGML_SYCL_CONVERT_HPP
```