@novastera-oss/llamarn 0.2.7 → 0.2.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/android/src/main/cpp/include/llama.h +8 -3
- package/android/src/main/jniLibs/arm64-v8a/libggml-base.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libggml.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libllama.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml-base.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml.so +0 -0
- package/android/src/main/jniLibs/x86_64/libllama.so +0 -0
- package/cpp/LlamaCppModel.cpp +56 -22
- package/cpp/build-info.cpp +2 -2
- package/cpp/llama.cpp/CMakeLists.txt +1 -1
- package/cpp/llama.cpp/common/arg.cpp +7 -0
- package/cpp/llama.cpp/common/common.cpp +3 -0
- package/cpp/llama.cpp/common/common.h +1 -0
- package/cpp/llama.cpp/common/json-schema-to-grammar.cpp +3 -46
- package/cpp/llama.cpp/convert_hf_to_gguf.py +118 -20
- package/cpp/llama.cpp/ggml/CMakeLists.txt +1 -0
- package/cpp/llama.cpp/ggml/include/ggml-cpu.h +2 -0
- package/cpp/llama.cpp/ggml/include/ggml.h +33 -0
- package/cpp/llama.cpp/ggml/src/CMakeLists.txt +17 -0
- package/cpp/llama.cpp/ggml/src/ggml-cann/common.h +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +31 -2
- package/cpp/llama.cpp/ggml/src/ggml-cpu/amx/mmq.cpp +10 -9
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +109 -108
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +1027 -1038
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +53 -52
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/cpu-feats.cpp +82 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/quants.c +56 -55
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +42 -41
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/repack.cpp +24 -23
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +29 -28
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/wasm/quants.c +30 -29
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/x86/quants.c +83 -82
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp +20 -19
- package/cpp/llama.cpp/ggml/src/ggml-cpu/common.h +3 -2
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +9 -3
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +83 -102
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +4 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +3 -2
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.cpp +192 -67
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.h +2 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/quants.c +25 -24
- package/cpp/llama.cpp/ggml/src/ggml-cpu/repack.cpp +56 -40
- package/cpp/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +211 -33
- package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.cpp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.h +45 -45
- package/cpp/llama.cpp/ggml/src/ggml-cuda/common.cuh +54 -29
- package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-dw.cu +161 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-dw.cuh +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-transpose.cu +91 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-transpose.cuh +4 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-wmma-f16.cu +4 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +84 -31
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mean.cu +19 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mean.cuh +3 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmv.cu +257 -87
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmv.cuh +2 -3
- package/cpp/llama.cpp/ggml/src/ggml-cuda/sumrows.cu +5 -18
- package/cpp/llama.cpp/ggml/src/ggml-cuda/sumrows.cuh +0 -1
- package/cpp/llama.cpp/ggml/src/ggml-impl.h +61 -183
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +16 -0
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.m +227 -41
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.metal +362 -182
- package/cpp/llama.cpp/ggml/src/ggml-musa/mudnn.cuh +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +240 -535
- package/cpp/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +5 -6
- package/cpp/llama.cpp/ggml/src/ggml-sycl/common.hpp +1 -24
- package/cpp/llama.cpp/ggml/src/ggml-sycl/concat.cpp +28 -41
- package/cpp/llama.cpp/ggml/src/ggml-sycl/conv.cpp +4 -10
- package/cpp/llama.cpp/ggml/src/ggml-sycl/convert.cpp +99 -166
- package/cpp/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +94 -72
- package/cpp/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +49 -67
- package/cpp/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +31 -1
- package/cpp/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +99 -159
- package/cpp/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +6 -9
- package/cpp/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +45 -54
- package/cpp/llama.cpp/ggml/src/ggml-sycl/gla.cpp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +60 -80
- package/cpp/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +132 -201
- package/cpp/llama.cpp/ggml/src/ggml-sycl/norm.cpp +55 -74
- package/cpp/llama.cpp/ggml/src/ggml-sycl/rope.cpp +24 -20
- package/cpp/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +3 -3
- package/cpp/llama.cpp/ggml/src/ggml-sycl/sycl_hw.cpp +3 -1
- package/cpp/llama.cpp/ggml/src/ggml-sycl/sycl_hw.hpp +3 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +3 -8
- package/cpp/llama.cpp/ggml/src/ggml-sycl/wkv.cpp +12 -16
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +12 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +57 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +4 -0
- package/cpp/llama.cpp/ggml/src/ggml.c +69 -13
- package/cpp/llama.cpp/ggml/src/gguf.cpp +5 -1
- package/cpp/llama.cpp/gguf-py/gguf/constants.py +76 -0
- package/cpp/llama.cpp/gguf-py/gguf/gguf_writer.py +21 -0
- package/cpp/llama.cpp/gguf-py/gguf/tensor_mapping.py +64 -0
- package/cpp/llama.cpp/gguf-py/gguf/vocab.py +97 -4
- package/cpp/llama.cpp/gguf-py/pyproject.toml +2 -2
- package/cpp/llama.cpp/include/llama.h +8 -3
- package/cpp/llama.cpp/models/templates/Mistral-Small-3.2-24B-Instruct-2506.jinja +124 -0
- package/cpp/llama.cpp/src/llama-arch.cpp +55 -0
- package/cpp/llama.cpp/src/llama-arch.h +18 -0
- package/cpp/llama.cpp/src/llama-batch.cpp +570 -359
- package/cpp/llama.cpp/src/llama-batch.h +98 -70
- package/cpp/llama.cpp/src/llama-chat.cpp +11 -6
- package/cpp/llama.cpp/src/llama-context.cpp +101 -107
- package/cpp/llama.cpp/src/llama-context.h +13 -13
- package/cpp/llama.cpp/src/llama-graph.cpp +199 -252
- package/cpp/llama.cpp/src/llama-graph.h +44 -32
- package/cpp/llama.cpp/src/llama-hparams.cpp +4 -0
- package/cpp/llama.cpp/src/llama-hparams.h +8 -0
- package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.cpp +51 -53
- package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.h +19 -24
- package/cpp/llama.cpp/src/llama-kv-cache-unified.cpp +110 -104
- package/cpp/llama.cpp/src/llama-kv-cache-unified.h +17 -22
- package/cpp/llama.cpp/src/llama-kv-cells.h +35 -11
- package/cpp/llama.cpp/src/llama-memory-hybrid.cpp +66 -67
- package/cpp/llama.cpp/src/llama-memory-hybrid.h +16 -21
- package/cpp/llama.cpp/src/llama-memory-recurrent.cpp +69 -68
- package/cpp/llama.cpp/src/llama-memory-recurrent.h +15 -20
- package/cpp/llama.cpp/src/llama-memory.h +18 -22
- package/cpp/llama.cpp/src/llama-model-saver.cpp +1 -0
- package/cpp/llama.cpp/src/llama-model.cpp +1006 -472
- package/cpp/llama.cpp/src/llama-model.h +22 -0
- package/cpp/llama.cpp/src/llama-quant.cpp +87 -5
- package/cpp/llama.cpp/src/llama-vocab.cpp +26 -3
- package/cpp/llama.cpp/src/llama-vocab.h +1 -0
- package/cpp/rn-utils.h +3 -0
- package/ios/include/common.h +1 -0
- package/ios/include/llama.h +8 -3
- package/ios/libs/llama.xcframework/Info.plist +19 -19
- package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4890 -4863
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/llama.h +8 -3
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4861 -4834
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3764 -3742
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/llama.h +8 -3
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4861 -4834
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3766 -3744
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/llama.h +8 -3
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/llama.h +8 -3
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/llama.h +8 -3
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4890 -4863
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/llama.h +8 -3
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4861 -4834
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3764 -3742
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/llama.h +8 -3
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4926 -4900
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/llama.h +8 -3
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4897 -4871
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3794 -3773
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/llama.h +8 -3
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/package.json +1 -1
|
@@ -225,9 +225,9 @@ struct bin_bcast_sycl {
|
|
|
225
225
|
dpct::has_capability_or_fail(stream->get_device(),
|
|
226
226
|
{sycl::aspect::fp16});
|
|
227
227
|
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
228
|
+
sycl_parallel_for(
|
|
229
|
+
stream,
|
|
230
|
+
sycl::nd_range<3>(sycl::range<3>(1, 1, block_num) * sycl::range<3>(1, 1, block_size),
|
|
231
231
|
sycl::range<3>(1, 1, block_size)),
|
|
232
232
|
[=](sycl::nd_item<3> item_ct1) {
|
|
233
233
|
k_bin_bcast_unravel<bin_op>(
|
|
@@ -246,9 +246,8 @@ struct bin_bcast_sycl {
|
|
|
246
246
|
dpct::has_capability_or_fail(stream->get_device(),
|
|
247
247
|
{sycl::aspect::fp16});
|
|
248
248
|
|
|
249
|
-
|
|
250
|
-
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
|
251
|
-
[=](sycl::nd_item<3> item_ct1) {
|
|
249
|
+
sycl_parallel_for(
|
|
250
|
+
stream, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
|
|
252
251
|
k_bin_bcast<bin_op>(src0_dd, src1_dd, dst_dd, ne0, ne1,
|
|
253
252
|
ne2, ne3, ne10, ne11, ne12, ne13,
|
|
254
253
|
s1, s2, s3, s01, s02, s03, s11, s12, s13,
|
|
@@ -199,7 +199,7 @@ struct sycl_device_info {
|
|
|
199
199
|
// size_t smpb; // max. shared memory per block
|
|
200
200
|
bool vmm; // virtual memory support
|
|
201
201
|
size_t total_vram;
|
|
202
|
-
sycl_hw_info hw_info;
|
|
202
|
+
//sycl_hw_info hw_info; \\ device id and aarch, currently not used
|
|
203
203
|
optimize_feature opt_feature;
|
|
204
204
|
};
|
|
205
205
|
|
|
@@ -286,29 +286,6 @@ struct ggml_tensor_extra_gpu {
|
|
|
286
286
|
|
|
287
287
|
void release_extra_gpu(ggml_tensor_extra_gpu * extra, std::vector<queue_ptr> streams={});
|
|
288
288
|
|
|
289
|
-
inline optimize_feature check_gpu_optimize_feature(syclex::architecture &arch) {
|
|
290
|
-
optimize_feature opt;
|
|
291
|
-
|
|
292
|
-
opt.reorder =
|
|
293
|
-
(arch == syclex::architecture::intel_gpu_dg1 ||
|
|
294
|
-
arch == syclex::architecture::intel_gpu_acm_g10 ||
|
|
295
|
-
arch == syclex::architecture::intel_gpu_acm_g11 ||
|
|
296
|
-
arch == syclex::architecture::intel_gpu_acm_g12 ||
|
|
297
|
-
arch == syclex::architecture::intel_gpu_pvc ||
|
|
298
|
-
arch == syclex::architecture::intel_gpu_pvc_vg ||
|
|
299
|
-
arch == syclex::architecture::intel_gpu_mtl_u ||
|
|
300
|
-
arch == syclex::architecture::intel_gpu_mtl_s ||
|
|
301
|
-
arch == syclex::architecture::intel_gpu_mtl_h ||
|
|
302
|
-
arch == syclex::architecture::intel_gpu_arl_u ||
|
|
303
|
-
arch == syclex::architecture::intel_gpu_arl_s ||
|
|
304
|
-
arch == syclex::architecture::intel_gpu_arl_h ||
|
|
305
|
-
arch == syclex::architecture::intel_gpu_bmg_g21 ||
|
|
306
|
-
arch == syclex::architecture::intel_gpu_lnl_m
|
|
307
|
-
);
|
|
308
|
-
|
|
309
|
-
return opt;
|
|
310
|
-
}
|
|
311
|
-
|
|
312
289
|
namespace sycl_ex = sycl::ext::oneapi::experimental;
|
|
313
290
|
struct ggml_backend_sycl_context {
|
|
314
291
|
int device;
|
|
@@ -89,33 +89,24 @@ static void concat_f32_sycl(const float *x, const float *y, float *dst,
|
|
|
89
89
|
sycl::range<3> gridDim(ne2, ne1, num_blocks);
|
|
90
90
|
switch (dim) {
|
|
91
91
|
case 0:
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
concat_f32_dim0(x, y, dst, ne0, ne00, item_ct1);
|
|
98
|
-
});
|
|
99
|
-
break;
|
|
92
|
+
sycl_parallel_for(stream,
|
|
93
|
+
sycl::nd_range<3>(gridDim * sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE),
|
|
94
|
+
sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE)),
|
|
95
|
+
[=](sycl::nd_item<3> item_ct1) { concat_f32_dim0(x, y, dst, ne0, ne00, item_ct1); });
|
|
96
|
+
break;
|
|
100
97
|
case 1:
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
concat_f32_dim1(x, y, dst, ne0, ne01, item_ct1);
|
|
107
|
-
});
|
|
108
|
-
break;
|
|
98
|
+
sycl_parallel_for(stream,
|
|
99
|
+
sycl::nd_range<3>(gridDim * sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE),
|
|
100
|
+
sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE)),
|
|
101
|
+
[=](sycl::nd_item<3> item_ct1) { concat_f32_dim1(x, y, dst, ne0, ne01, item_ct1); });
|
|
102
|
+
break;
|
|
109
103
|
// dim >=2 will be dispatched to the default path
|
|
110
104
|
default:
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
concat_f32_dim2(x, y, dst, ne0, ne02, item_ct1);
|
|
117
|
-
});
|
|
118
|
-
break;
|
|
105
|
+
sycl_parallel_for(stream,
|
|
106
|
+
sycl::nd_range<3>(gridDim * sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE),
|
|
107
|
+
sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE)),
|
|
108
|
+
[=](sycl::nd_item<3> item_ct1) { concat_f32_dim2(x, y, dst, ne0, ne02, item_ct1); });
|
|
109
|
+
break;
|
|
119
110
|
}
|
|
120
111
|
}
|
|
121
112
|
|
|
@@ -129,33 +120,29 @@ static void concat_f32_sycl_non_cont(
|
|
|
129
120
|
int64_t ne2, int64_t ne3, uint64_t nb0, uint64_t nb1, uint64_t nb2,
|
|
130
121
|
uint64_t nb3, int32_t dim) {
|
|
131
122
|
sycl::range<3> gridDim(ne3, ne2, ne1);
|
|
132
|
-
stream
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
int64_t i2 = item_ct1.get_group(1);
|
|
137
|
-
int64_t i1 = item_ct1.get_group(2);
|
|
123
|
+
sycl_parallel_for(stream, sycl::nd_range<3>(gridDim, sycl::range<3>(1, 1, 1)), [=](sycl::nd_item<3> item_ct1) {
|
|
124
|
+
int64_t i3 = item_ct1.get_group(0);
|
|
125
|
+
int64_t i2 = item_ct1.get_group(1);
|
|
126
|
+
int64_t i1 = item_ct1.get_group(2);
|
|
138
127
|
|
|
139
|
-
|
|
140
|
-
|
|
128
|
+
int64_t o[4] = { 0, 0, 0, 0 };
|
|
129
|
+
o[dim] = dim == 0 ? ne00 : (dim == 1 ? ne01 : (dim == 2 ? ne02 : ne03));
|
|
141
130
|
|
|
142
|
-
|
|
131
|
+
const float * x;
|
|
143
132
|
|
|
144
|
-
|
|
145
|
-
i0 += item_ct1.get_local_range(2)) {
|
|
133
|
+
for (int i0 = item_ct1.get_local_id(2); i0 < ne0; i0 += item_ct1.get_local_range(2)) {
|
|
146
134
|
if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) {
|
|
147
|
-
|
|
148
|
-
(i0)*nb00);
|
|
135
|
+
x = (const float *) (src0 + (i3) *nb03 + (i2) *nb02 + (i1) *nb01 + (i0) *nb00);
|
|
149
136
|
} else {
|
|
150
|
-
|
|
151
|
-
|
|
137
|
+
x = (const float *) (src1 + (i3 - o[3]) * nb13 + (i2 - o[2]) * nb12 + (i1 - o[1]) * nb11 +
|
|
138
|
+
(i0 - o[0]) * nb10);
|
|
152
139
|
}
|
|
153
140
|
|
|
154
141
|
float *y = (float *)(dst + i3 * nb3 + i2 * nb2 + i1 * nb1 + i0 * nb0);
|
|
155
142
|
|
|
156
143
|
*y = *x;
|
|
157
|
-
|
|
158
|
-
|
|
144
|
+
}
|
|
145
|
+
});
|
|
159
146
|
}
|
|
160
147
|
|
|
161
148
|
void ggml_sycl_op_concat(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
|
|
@@ -59,16 +59,10 @@ static void conv_transpose_1d_f32_f32_sycl(
|
|
|
59
59
|
const int num_blocks = (output_size + SYCL_CONV_TRANPOSE_1D_BLOCK_SIZE - 1) / SYCL_CONV_TRANPOSE_1D_BLOCK_SIZE;
|
|
60
60
|
const sycl::range<3> block_dims(1, 1, SYCL_CONV_TRANPOSE_1D_BLOCK_SIZE);
|
|
61
61
|
const sycl::range<3> block_nums(1, 1, num_blocks);
|
|
62
|
-
stream
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
conv_transpose_1d_kernel(
|
|
67
|
-
s0, output_size,
|
|
68
|
-
src0_ne0, src0_ne1, src0_ne2,
|
|
69
|
-
src1_ne0, dst_ne0,
|
|
70
|
-
src0, src1, dst, item_ct1);
|
|
71
|
-
});
|
|
62
|
+
sycl_parallel_for(stream, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
|
|
63
|
+
conv_transpose_1d_kernel(s0, output_size, src0_ne0, src0_ne1, src0_ne2, src1_ne0, dst_ne0, src0, src1, dst,
|
|
64
|
+
item_ct1);
|
|
65
|
+
});
|
|
72
66
|
}
|
|
73
67
|
|
|
74
68
|
void ggml_sycl_op_conv_transpose_1d(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
|
|
@@ -33,14 +33,11 @@ static void dequantize_block_sycl(const void *__restrict__ vx,
|
|
|
33
33
|
{
|
|
34
34
|
dpct::has_capability_or_fail(stream->get_device(),
|
|
35
35
|
{sycl::aspect::fp16});
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
[=](sycl::nd_item<3> item_ct1) {
|
|
42
|
-
dequantize_block<qk, qr, dequantize_kernel>(vx, y, k, item_ct1);
|
|
43
|
-
});
|
|
36
|
+
sycl_parallel_for(
|
|
37
|
+
stream,
|
|
38
|
+
sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_DEQUANTIZE_BLOCK_SIZE),
|
|
39
|
+
sycl::range<3>(1, 1, SYCL_DEQUANTIZE_BLOCK_SIZE)),
|
|
40
|
+
[=](sycl::nd_item<3> item_ct1) { dequantize_block<qk, qr, dequantize_kernel>(vx, y, k, item_ct1); });
|
|
44
41
|
}
|
|
45
42
|
}
|
|
46
43
|
|
|
@@ -53,24 +50,18 @@ static void dequantize_row_q2_K_sycl(const void *vx, dst_t *y, const int64_t k,
|
|
|
53
50
|
dpct::has_capability_or_fail(stream->get_device(),
|
|
54
51
|
{sycl::aspect::fp16});
|
|
55
52
|
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
[=](sycl::nd_item<3> item_ct1) {
|
|
60
|
-
dequantize_block_q2_K(vx, y, item_ct1);
|
|
61
|
-
});
|
|
53
|
+
sycl_parallel_for(
|
|
54
|
+
stream, sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 64), sycl::range<3>(1, 1, 64)),
|
|
55
|
+
[=](sycl::nd_item<3> item_ct1) { dequantize_block_q2_K(vx, y, item_ct1); });
|
|
62
56
|
}
|
|
63
57
|
#else
|
|
64
58
|
{
|
|
65
59
|
dpct::has_capability_or_fail(stream->get_device(),
|
|
66
60
|
{sycl::aspect::fp16});
|
|
67
61
|
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
[=](sycl::nd_item<3> item_ct1) {
|
|
72
|
-
dequantize_block_q2_K(vx, y, item_ct1);
|
|
73
|
-
});
|
|
62
|
+
sycl_parallel_for(
|
|
63
|
+
stream, sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 32), sycl::range<3>(1, 1, 32)),
|
|
64
|
+
[=](sycl::nd_item<3> item_ct1) { dequantize_block_q2_K(vx, y, item_ct1); });
|
|
74
65
|
}
|
|
75
66
|
|
|
76
67
|
#endif
|
|
@@ -85,24 +76,18 @@ static void dequantize_row_q3_K_sycl(const void *vx, dst_t *y, const int64_t k,
|
|
|
85
76
|
dpct::has_capability_or_fail(stream->get_device(),
|
|
86
77
|
{sycl::aspect::fp16});
|
|
87
78
|
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
[=](sycl::nd_item<3> item_ct1) {
|
|
92
|
-
dequantize_block_q3_K(vx, y, item_ct1);
|
|
93
|
-
});
|
|
79
|
+
sycl_parallel_for(
|
|
80
|
+
stream, sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 64), sycl::range<3>(1, 1, 64)),
|
|
81
|
+
[=](sycl::nd_item<3> item_ct1) { dequantize_block_q3_K(vx, y, item_ct1); });
|
|
94
82
|
}
|
|
95
83
|
#else
|
|
96
84
|
{
|
|
97
85
|
dpct::has_capability_or_fail(stream->get_device(),
|
|
98
86
|
{sycl::aspect::fp16});
|
|
99
87
|
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
[=](sycl::nd_item<3> item_ct1) {
|
|
104
|
-
dequantize_block_q3_K(vx, y, item_ct1);
|
|
105
|
-
});
|
|
88
|
+
sycl_parallel_for(
|
|
89
|
+
stream, sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 32), sycl::range<3>(1, 1, 32)),
|
|
90
|
+
[=](sycl::nd_item<3> item_ct1) { dequantize_block_q3_K(vx, y, item_ct1); });
|
|
106
91
|
}
|
|
107
92
|
#endif
|
|
108
93
|
}
|
|
@@ -116,12 +101,9 @@ static void dequantize_row_q4_0_sycl(const void *vx, dst_t *y, const int64_t k,
|
|
|
116
101
|
dpct::has_capability_or_fail(stream->get_device(),
|
|
117
102
|
{sycl::aspect::fp16});
|
|
118
103
|
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
[=](sycl::nd_item<3> item_ct1) {
|
|
123
|
-
dequantize_block_q4_0(vx, y, nb32, item_ct1);
|
|
124
|
-
});
|
|
104
|
+
sycl_parallel_for(
|
|
105
|
+
stream, sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 32), sycl::range<3>(1, 1, 32)),
|
|
106
|
+
[=](sycl::nd_item<3> item_ct1) { dequantize_block_q4_0(vx, y, nb32, item_ct1); });
|
|
125
107
|
}
|
|
126
108
|
}
|
|
127
109
|
|
|
@@ -135,13 +117,12 @@ static void dequantize_row_q4_0_sycl_reorder(const void *vx, dst_t *y, const int
|
|
|
135
117
|
int constexpr WARP_K = WARP_SIZE * QK4_0;
|
|
136
118
|
const int n_warp = (k + WARP_K - 1) / WARP_K;
|
|
137
119
|
GGML_ASSERT(k % 2 == 0);
|
|
138
|
-
stream
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
120
|
+
sycl_parallel_for(stream,
|
|
121
|
+
sycl::nd_range<3>(sycl::range<3>(1, 1, n_warp) * sycl::range<3>(1, 1, WARP_SIZE),
|
|
122
|
+
sycl::range<3>(1, 1, WARP_SIZE)),
|
|
123
|
+
[=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
|
124
|
+
dequantize_block_q4_0_reorder(vx, y, k, item_ct1);
|
|
125
|
+
});
|
|
145
126
|
}
|
|
146
127
|
|
|
147
128
|
template <typename dst_t>
|
|
@@ -153,12 +134,9 @@ static void dequantize_row_q4_1_sycl(const void *vx, dst_t *y, const int64_t k,
|
|
|
153
134
|
dpct::has_capability_or_fail(stream->get_device(),
|
|
154
135
|
{sycl::aspect::fp16});
|
|
155
136
|
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
[=](sycl::nd_item<3> item_ct1) {
|
|
160
|
-
dequantize_block_q4_1(vx, y, nb32, item_ct1);
|
|
161
|
-
});
|
|
137
|
+
sycl_parallel_for(
|
|
138
|
+
stream, sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 32), sycl::range<3>(1, 1, 32)),
|
|
139
|
+
[=](sycl::nd_item<3> item_ct1) { dequantize_block_q4_1(vx, y, nb32, item_ct1); });
|
|
162
140
|
}
|
|
163
141
|
}
|
|
164
142
|
|
|
@@ -171,14 +149,13 @@ static void dequantize_row_q4_K_sycl(const void *vx, dst_t *y, const int64_t k,
|
|
|
171
149
|
dpct::has_capability_or_fail(stream->get_device(),
|
|
172
150
|
{sycl::aspect::fp16});
|
|
173
151
|
|
|
174
|
-
stream
|
|
152
|
+
sycl_launch(stream, [&](sycl::handler & cgh) {
|
|
175
153
|
sycl::local_accessor<uint8_t, 1> scale_local_acc(sycl::range<1>(12), cgh);
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
});
|
|
154
|
+
sycl_parallel_for(
|
|
155
|
+
cgh, sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 32), sycl::range<3>(1, 1, 32)),
|
|
156
|
+
[=](sycl::nd_item<3> item_ct1) {
|
|
157
|
+
dequantize_block_q4_K(vx, y, get_pointer(scale_local_acc), item_ct1);
|
|
158
|
+
});
|
|
182
159
|
});
|
|
183
160
|
}
|
|
184
161
|
}
|
|
@@ -191,13 +168,13 @@ static void dequantize_row_q4_K_sycl_reorder(const void * vx, dst_t * y, const i
|
|
|
191
168
|
|
|
192
169
|
dpct::has_capability_or_fail(stream->get_device(), { sycl::aspect::fp16 });
|
|
193
170
|
|
|
194
|
-
stream
|
|
171
|
+
sycl_launch(stream, [&](sycl::handler & cgh) {
|
|
195
172
|
sycl::local_accessor<uint8_t, 1> scale_local_acc(sycl::range<1>(12), cgh);
|
|
196
173
|
|
|
197
|
-
cgh
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
174
|
+
sycl_parallel_for<1>(cgh, sycl::nd_range<1>(sycl::range<1>(global_size), sycl::range<1>(local_size)),
|
|
175
|
+
[=](sycl::nd_item<1> item_ct1) {
|
|
176
|
+
dequantize_block_q4_K_reorder(vx, y, get_pointer(scale_local_acc), item_ct1, nb);
|
|
177
|
+
});
|
|
201
178
|
});
|
|
202
179
|
}
|
|
203
180
|
|
|
@@ -210,24 +187,18 @@ static void dequantize_row_q5_K_sycl(const void *vx, dst_t *y, const int64_t k,
|
|
|
210
187
|
dpct::has_capability_or_fail(stream->get_device(),
|
|
211
188
|
{sycl::aspect::fp16});
|
|
212
189
|
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
[=](sycl::nd_item<3> item_ct1) {
|
|
217
|
-
dequantize_block_q5_K(vx, y, item_ct1);
|
|
218
|
-
});
|
|
190
|
+
sycl_parallel_for(
|
|
191
|
+
stream, sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 64), sycl::range<3>(1, 1, 64)),
|
|
192
|
+
[=](sycl::nd_item<3> item_ct1) { dequantize_block_q5_K(vx, y, item_ct1); });
|
|
219
193
|
}
|
|
220
194
|
#else
|
|
221
195
|
{
|
|
222
196
|
dpct::has_capability_or_fail(stream->get_device(),
|
|
223
197
|
{sycl::aspect::fp16});
|
|
224
198
|
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
[=](sycl::nd_item<3> item_ct1) {
|
|
229
|
-
dequantize_block_q5_K(vx, y, item_ct1);
|
|
230
|
-
});
|
|
199
|
+
sycl_parallel_for(
|
|
200
|
+
stream, sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 32), sycl::range<3>(1, 1, 32)),
|
|
201
|
+
[=](sycl::nd_item<3> item_ct1) { dequantize_block_q5_K(vx, y, item_ct1); });
|
|
231
202
|
}
|
|
232
203
|
|
|
233
204
|
#endif
|
|
@@ -242,24 +213,18 @@ static void dequantize_row_q6_K_sycl(const void *vx, dst_t *y, const int64_t k,
|
|
|
242
213
|
dpct::has_capability_or_fail(stream->get_device(),
|
|
243
214
|
{sycl::aspect::fp16});
|
|
244
215
|
|
|
245
|
-
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
[=](sycl::nd_item<3> item_ct1) {
|
|
249
|
-
dequantize_block_q6_K(vx, y, item_ct1);
|
|
250
|
-
});
|
|
216
|
+
sycl_parallel_for(
|
|
217
|
+
stream, sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 64), sycl::range<3>(1, 1, 64)),
|
|
218
|
+
[=](sycl::nd_item<3> item_ct1) { dequantize_block_q6_K(vx, y, item_ct1); });
|
|
251
219
|
}
|
|
252
220
|
#else
|
|
253
221
|
{
|
|
254
222
|
dpct::has_capability_or_fail(stream->get_device(),
|
|
255
223
|
{sycl::aspect::fp16});
|
|
256
224
|
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
[=](sycl::nd_item<3> item_ct1) {
|
|
261
|
-
dequantize_block_q6_K(vx, y, item_ct1);
|
|
262
|
-
});
|
|
225
|
+
sycl_parallel_for(
|
|
226
|
+
stream, sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 32), sycl::range<3>(1, 1, 32)),
|
|
227
|
+
[=](sycl::nd_item<3> item_ct1) { dequantize_block_q6_K(vx, y, item_ct1); });
|
|
263
228
|
}
|
|
264
229
|
|
|
265
230
|
#endif
|
|
@@ -271,9 +236,9 @@ static void dequantize_row_q6_K_sycl_reorder(const void * vx, dst_t * y, const i
|
|
|
271
236
|
|
|
272
237
|
dpct::has_capability_or_fail(stream->get_device(), { sycl::aspect::fp16 });
|
|
273
238
|
|
|
274
|
-
stream
|
|
275
|
-
|
|
276
|
-
|
|
239
|
+
sycl_parallel_for(stream,
|
|
240
|
+
sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 64), sycl::range<3>(1, 1, 64)),
|
|
241
|
+
[=](sycl::nd_item<3> item_ct1) { dequantize_block_q6_K_reorder(vx, y, item_ct1, nb); });
|
|
277
242
|
}
|
|
278
243
|
|
|
279
244
|
template <typename dst_t>
|
|
@@ -284,15 +249,10 @@ static void dequantize_row_iq1_s_sycl(const void *vx, dst_t *y, const int64_t k,
|
|
|
284
249
|
dpct::has_capability_or_fail(stream->get_device(),
|
|
285
250
|
{sycl::aspect::fp16});
|
|
286
251
|
|
|
287
|
-
stream
|
|
288
|
-
|
|
289
|
-
|
|
290
|
-
|
|
291
|
-
[=](sycl::nd_item<3> item_ct1) {
|
|
292
|
-
dequantize_block_iq1_s(
|
|
293
|
-
vx, y, item_ct1, iq1s_grid_gpu
|
|
294
|
-
);
|
|
295
|
-
});
|
|
252
|
+
sycl_launch(stream, [&](sycl::handler & cgh) {
|
|
253
|
+
sycl_parallel_for(
|
|
254
|
+
cgh, sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 32), sycl::range<3>(1, 1, 32)),
|
|
255
|
+
[=](sycl::nd_item<3> item_ct1) { dequantize_block_iq1_s(vx, y, item_ct1, iq1s_grid_gpu); });
|
|
296
256
|
});
|
|
297
257
|
}
|
|
298
258
|
}
|
|
@@ -305,15 +265,10 @@ static void dequantize_row_iq1_m_sycl(const void *vx, dst_t *y, const int64_t k,
|
|
|
305
265
|
dpct::has_capability_or_fail(stream->get_device(),
|
|
306
266
|
{sycl::aspect::fp16});
|
|
307
267
|
|
|
308
|
-
stream
|
|
309
|
-
|
|
310
|
-
|
|
311
|
-
|
|
312
|
-
[=](sycl::nd_item<3> item_ct1) {
|
|
313
|
-
dequantize_block_iq1_m(
|
|
314
|
-
vx, y, item_ct1, iq1s_grid_gpu
|
|
315
|
-
);
|
|
316
|
-
});
|
|
268
|
+
sycl_launch(stream, [&](sycl::handler & cgh) {
|
|
269
|
+
sycl_parallel_for(
|
|
270
|
+
cgh, sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 32), sycl::range<3>(1, 1, 32)),
|
|
271
|
+
[=](sycl::nd_item<3> item_ct1) { dequantize_block_iq1_m(vx, y, item_ct1, iq1s_grid_gpu); });
|
|
317
272
|
});
|
|
318
273
|
}
|
|
319
274
|
}
|
|
@@ -326,15 +281,12 @@ static void dequantize_row_iq2_xxs_sycl(const void *vx, dst_t *y, const int64_t
|
|
|
326
281
|
dpct::has_capability_or_fail(stream->get_device(),
|
|
327
282
|
{sycl::aspect::fp16});
|
|
328
283
|
|
|
329
|
-
stream
|
|
330
|
-
|
|
331
|
-
|
|
332
|
-
|
|
333
|
-
|
|
334
|
-
|
|
335
|
-
vx, y, item_ct1, iq2xxs_grid,
|
|
336
|
-
ksigns_iq2xs, kmask_iq2xs);
|
|
337
|
-
});
|
|
284
|
+
sycl_launch(stream, [&](sycl::handler & cgh) {
|
|
285
|
+
sycl_parallel_for(
|
|
286
|
+
cgh, sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 32), sycl::range<3>(1, 1, 32)),
|
|
287
|
+
[=](sycl::nd_item<3> item_ct1) {
|
|
288
|
+
dequantize_block_iq2_xxs(vx, y, item_ct1, iq2xxs_grid, ksigns_iq2xs, kmask_iq2xs);
|
|
289
|
+
});
|
|
338
290
|
});
|
|
339
291
|
}
|
|
340
292
|
}
|
|
@@ -347,15 +299,12 @@ static void dequantize_row_iq2_xs_sycl(const void *vx, dst_t *y, const int64_t k
|
|
|
347
299
|
dpct::has_capability_or_fail(stream->get_device(),
|
|
348
300
|
{sycl::aspect::fp16});
|
|
349
301
|
|
|
350
|
-
stream
|
|
351
|
-
|
|
352
|
-
|
|
353
|
-
|
|
354
|
-
|
|
355
|
-
|
|
356
|
-
vx, y, item_ct1, iq2xs_grid,
|
|
357
|
-
ksigns_iq2xs, kmask_iq2xs);
|
|
358
|
-
});
|
|
302
|
+
sycl_launch(stream, [&](sycl::handler & cgh) {
|
|
303
|
+
sycl_parallel_for(
|
|
304
|
+
cgh, sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 32), sycl::range<3>(1, 1, 32)),
|
|
305
|
+
[=](sycl::nd_item<3> item_ct1) {
|
|
306
|
+
dequantize_block_iq2_xs(vx, y, item_ct1, iq2xs_grid, ksigns_iq2xs, kmask_iq2xs);
|
|
307
|
+
});
|
|
359
308
|
});
|
|
360
309
|
}
|
|
361
310
|
}
|
|
@@ -368,13 +317,10 @@ static void dequantize_row_iq2_s_sycl(const void *vx, dst_t *y, const int64_t k,
|
|
|
368
317
|
dpct::has_capability_or_fail(stream->get_device(),
|
|
369
318
|
{sycl::aspect::fp16});
|
|
370
319
|
|
|
371
|
-
stream
|
|
372
|
-
|
|
373
|
-
|
|
374
|
-
|
|
375
|
-
[=](sycl::nd_item<3> item_ct1) {
|
|
376
|
-
dequantize_block_iq2_s(vx, y, item_ct1);
|
|
377
|
-
});
|
|
320
|
+
sycl_launch(stream, [&](sycl::handler & cgh) {
|
|
321
|
+
sycl_parallel_for(
|
|
322
|
+
cgh, sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 32), sycl::range<3>(1, 1, 32)),
|
|
323
|
+
[=](sycl::nd_item<3> item_ct1) { dequantize_block_iq2_s(vx, y, item_ct1); });
|
|
378
324
|
});
|
|
379
325
|
}
|
|
380
326
|
}
|
|
@@ -388,15 +334,12 @@ static void dequantize_row_iq3_xxs_sycl(const void *vx, dst_t *y, const int64_t
|
|
|
388
334
|
dpct::has_capability_or_fail(stream->get_device(),
|
|
389
335
|
{sycl::aspect::fp16});
|
|
390
336
|
|
|
391
|
-
stream
|
|
392
|
-
|
|
393
|
-
|
|
394
|
-
|
|
395
|
-
|
|
396
|
-
|
|
397
|
-
vx, y, item_ct1, iq3xxs_grid,
|
|
398
|
-
ksigns_iq2xs, kmask_iq2xs);
|
|
399
|
-
});
|
|
337
|
+
sycl_launch(stream, [&](sycl::handler & cgh) {
|
|
338
|
+
sycl_parallel_for(
|
|
339
|
+
cgh, sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 32), sycl::range<3>(1, 1, 32)),
|
|
340
|
+
[=](sycl::nd_item<3> item_ct1) {
|
|
341
|
+
dequantize_block_iq3_xxs(vx, y, item_ct1, iq3xxs_grid, ksigns_iq2xs, kmask_iq2xs);
|
|
342
|
+
});
|
|
400
343
|
});
|
|
401
344
|
}
|
|
402
345
|
}
|
|
@@ -409,14 +352,10 @@ static void dequantize_row_iq3_s_sycl(const void *vx, dst_t *y, const int64_t k,
|
|
|
409
352
|
dpct::has_capability_or_fail(stream->get_device(),
|
|
410
353
|
{sycl::aspect::fp16});
|
|
411
354
|
|
|
412
|
-
stream
|
|
413
|
-
|
|
414
|
-
|
|
415
|
-
|
|
416
|
-
[=](sycl::nd_item<3> item_ct1) {
|
|
417
|
-
dequantize_block_iq3_s(
|
|
418
|
-
vx, y, item_ct1, kmask_iq2xs, iq3s_grid);
|
|
419
|
-
});
|
|
355
|
+
sycl_launch(stream, [&](sycl::handler & cgh) {
|
|
356
|
+
sycl_parallel_for(
|
|
357
|
+
cgh, sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 32), sycl::range<3>(1, 1, 32)),
|
|
358
|
+
[=](sycl::nd_item<3> item_ct1) { dequantize_block_iq3_s(vx, y, item_ct1, kmask_iq2xs, iq3s_grid); });
|
|
420
359
|
});
|
|
421
360
|
}
|
|
422
361
|
}
|
|
@@ -432,14 +371,11 @@ static void dequantize_row_iq4_xs_sycl(const void *vx, dst_t *y, const int64_t k
|
|
|
432
371
|
dpct::has_capability_or_fail(stream->get_device(),
|
|
433
372
|
{sycl::aspect::fp16});
|
|
434
373
|
|
|
435
|
-
stream
|
|
436
|
-
|
|
437
|
-
|
|
438
|
-
|
|
439
|
-
|
|
440
|
-
[=](sycl::nd_item<3> item_ct1) {
|
|
441
|
-
dequantize_block_iq4_xs(vx, y, item_ct1);
|
|
442
|
-
});
|
|
374
|
+
sycl_launch(stream, [&](sycl::handler & cgh) {
|
|
375
|
+
sycl_parallel_for(
|
|
376
|
+
cgh,
|
|
377
|
+
sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 32), sycl::range<3>(1, 1, 32)),
|
|
378
|
+
[=](sycl::nd_item<3> item_ct1) { dequantize_block_iq4_xs(vx, y, item_ct1); });
|
|
443
379
|
});
|
|
444
380
|
}
|
|
445
381
|
#endif
|
|
@@ -453,14 +389,11 @@ static void dequantize_row_iq4_nl_sycl(const void *vx, dst_t *y, const int64_t k
|
|
|
453
389
|
dpct::has_capability_or_fail(stream->get_device(),
|
|
454
390
|
{sycl::aspect::fp16});
|
|
455
391
|
|
|
456
|
-
stream
|
|
457
|
-
|
|
458
|
-
|
|
459
|
-
|
|
460
|
-
|
|
461
|
-
[=](sycl::nd_item<3> item_ct1) {
|
|
462
|
-
dequantize_block_iq4_nl(vx, y, item_ct1);
|
|
463
|
-
});
|
|
392
|
+
sycl_launch(stream, [&](sycl::handler & cgh) {
|
|
393
|
+
sycl_parallel_for(
|
|
394
|
+
cgh,
|
|
395
|
+
sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 32), sycl::range<3>(1, 1, 32)),
|
|
396
|
+
[=](sycl::nd_item<3> item_ct1) { dequantize_block_iq4_nl(vx, y, item_ct1); });
|
|
464
397
|
});
|
|
465
398
|
}
|
|
466
399
|
}
|