@novastera-oss/llamarn 0.3.0 → 0.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/android/build.gradle +2 -1
- package/android/proguard-rules.pro +12 -0
- package/android/src/main/cpp/include/llama.h +15 -47
- package/android/src/main/jniLibs/arm64-v8a/libggml-base.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libggml.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libllama.so +0 -0
- package/android/src/main/jniLibs/armeabi-v7a/libggml-base.so +0 -0
- package/android/src/main/jniLibs/armeabi-v7a/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/armeabi-v7a/libggml.so +0 -0
- package/android/src/main/jniLibs/armeabi-v7a/libllama.so +0 -0
- package/android/src/main/jniLibs/x86/libggml-base.so +0 -0
- package/android/src/main/jniLibs/x86/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/x86/libggml.so +0 -0
- package/android/src/main/jniLibs/x86/libllama.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml-base.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml.so +0 -0
- package/android/src/main/jniLibs/x86_64/libllama.so +0 -0
- package/cpp/build-info.cpp +2 -2
- package/cpp/llama.cpp/CMakePresets.json +11 -0
- package/cpp/llama.cpp/CODEOWNERS +1 -0
- package/cpp/llama.cpp/README.md +4 -3
- package/cpp/llama.cpp/common/arg.cpp +45 -1
- package/cpp/llama.cpp/common/common.cpp +22 -6
- package/cpp/llama.cpp/common/common.h +18 -4
- package/cpp/llama.cpp/convert_hf_to_gguf.py +500 -32
- package/cpp/llama.cpp/convert_hf_to_gguf_update.py +12 -13
- package/cpp/llama.cpp/ggml/CMakeLists.txt +6 -1
- package/cpp/llama.cpp/ggml/cmake/ggml-config.cmake.in +85 -47
- package/cpp/llama.cpp/ggml/include/ggml-webgpu.h +19 -0
- package/cpp/llama.cpp/ggml/src/CMakeLists.txt +1 -0
- package/cpp/llama.cpp/ggml/src/ggml-alloc.c +0 -15
- package/cpp/llama.cpp/ggml/src/ggml-backend-reg.cpp +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-backend.cpp +8 -20
- package/cpp/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +3 -1
- package/cpp/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +58 -3
- package/cpp/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +130 -22
- package/cpp/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +122 -16
- package/cpp/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +5 -2
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +109 -12
- package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +3 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +88 -10
- package/cpp/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +343 -1094
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.cpp +3 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/repack.cpp +0 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.cpp +3 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +3 -3
- package/cpp/llama.cpp/ggml/src/ggml-cuda/common.cuh +14 -4
- package/cpp/llama.cpp/ggml/src/ggml-cuda/convert.cu +64 -17
- package/cpp/llama.cpp/ggml/src/ggml-cuda/cpy-utils.cuh +225 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/cpy.cu +41 -301
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-common.cuh +85 -67
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-mma-f16.cuh +45 -62
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile-f16.cu +28 -43
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile-f32.cu +41 -56
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec-f16.cuh +36 -47
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec-f32.cuh +31 -43
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-wmma-f16.cu +22 -37
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn.cu +3 -13
- package/cpp/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +73 -23
- package/cpp/llama.cpp/ggml/src/ggml-cuda/im2col.cu +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mma.cuh +111 -3
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmq.cu +6 -4
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmq.cuh +1152 -689
- package/cpp/llama.cpp/ggml/src/ggml-cuda/norm.cu +92 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/norm.cuh +2 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/set-rows.cu +275 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/set-rows.cuh +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/unary.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/unary.cuh +2 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +13 -1
- package/cpp/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-impl.h +16 -0
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +13 -3
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.m +407 -69
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.metal +380 -83
- package/cpp/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +18 -4
- package/cpp/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +2 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +295 -2
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/conv2d.cl +185 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/conv2d_f16_f32.cl +176 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/im2col_f16.cl +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/im2col_f32.cl +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/rms_norm.cl +79 -0
- package/cpp/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +4 -4
- package/cpp/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +14 -26
- package/cpp/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +131 -46
- package/cpp/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-sycl/quants.hpp +8 -9
- package/cpp/llama.cpp/ggml/src/ggml-sycl/set_rows.cpp +43 -43
- package/cpp/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +2 -6
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +287 -22
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp +265 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp +1 -5
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q2_k.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q3_k.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_k.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_k.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q6_k.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/generic_binary_head.comp +2 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/glu_head.comp +2 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp +3 -8
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +8 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.comp +1 -4
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rte.comp +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +71 -16
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/CMakeLists.txt +54 -0
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/ggml-webgpu.cpp +907 -0
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/cpy.wgsl +60 -0
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py +35 -0
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/memset.wgsl +40 -0
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.wgsl +56 -0
- package/cpp/llama.cpp/ggml/src/ggml.c +4 -6
- package/cpp/llama.cpp/gguf-py/gguf/constants.py +98 -0
- package/cpp/llama.cpp/gguf-py/gguf/metadata.py +4 -0
- package/cpp/llama.cpp/gguf-py/gguf/scripts/gguf_dump.py +24 -1
- package/cpp/llama.cpp/gguf-py/gguf/tensor_mapping.py +75 -52
- package/cpp/llama.cpp/include/llama.h +15 -7
- package/cpp/llama.cpp/models/templates/llama-cpp-rwkv-world.jinja +34 -0
- package/cpp/llama.cpp/models/templates/moonshotai-Kimi-K2.jinja +43 -0
- package/cpp/llama.cpp/requirements/requirements-all.txt +1 -0
- package/cpp/llama.cpp/requirements/requirements-server-bench.txt +5 -0
- package/cpp/llama.cpp/src/llama-arch.cpp +106 -0
- package/cpp/llama.cpp/src/llama-arch.h +5 -0
- package/cpp/llama.cpp/src/llama-batch.cpp +76 -70
- package/cpp/llama.cpp/src/llama-batch.h +24 -18
- package/cpp/llama.cpp/src/llama-chat.cpp +43 -1
- package/cpp/llama.cpp/src/llama-chat.h +2 -0
- package/cpp/llama.cpp/src/llama-context.cpp +180 -106
- package/cpp/llama.cpp/src/llama-context.h +26 -16
- package/cpp/llama.cpp/src/llama-cparams.h +3 -2
- package/cpp/llama.cpp/src/llama-graph.cpp +203 -39
- package/cpp/llama.cpp/src/llama-graph.h +147 -72
- package/cpp/llama.cpp/src/llama-hparams.cpp +40 -0
- package/cpp/llama.cpp/src/llama-hparams.h +10 -2
- package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.cpp +11 -5
- package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.h +3 -0
- package/cpp/llama.cpp/src/llama-kv-cache-unified.cpp +698 -302
- package/cpp/llama.cpp/src/llama-kv-cache-unified.h +89 -31
- package/cpp/llama.cpp/src/llama-memory-hybrid.cpp +1 -0
- package/cpp/llama.cpp/src/llama-memory-recurrent.cpp +16 -1
- package/cpp/llama.cpp/src/llama-model.cpp +1293 -312
- package/cpp/llama.cpp/src/llama-model.h +3 -4
- package/cpp/llama.cpp/src/llama-quant.cpp +1 -2
- package/cpp/llama.cpp/src/llama-vocab.cpp +363 -8
- package/cpp/llama.cpp/src/llama-vocab.h +2 -0
- package/cpp/llama.cpp/src/unicode.cpp +207 -0
- package/cpp/llama.cpp/src/unicode.h +2 -0
- package/ios/include/common.h +18 -4
- package/ios/include/llama.h +15 -7
- package/ios/libs/llama.xcframework/Info.plist +15 -15
- package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5267 -5059
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/llama.h +15 -7
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5238 -5030
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +4014 -3889
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/llama.h +15 -7
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5238 -5030
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +4016 -3891
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/llama.h +15 -7
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/llama.h +15 -7
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/llama.h +15 -7
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5267 -5059
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/llama.h +15 -7
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5238 -5030
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +4014 -3889
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/llama.h +15 -7
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5303 -5095
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/llama.h +15 -7
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5274 -5066
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +4044 -3919
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/llama.h +15 -7
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/package.json +4 -4
|
@@ -4015,6 +4015,9 @@ static void ggml_compute_forward_rms_norm_f32(
|
|
|
4015
4015
|
|
|
4016
4016
|
const float scale = 1.0f/sqrtf(mean + eps);
|
|
4017
4017
|
|
|
4018
|
+
// if you hit this, you likely got an inf somewhere earlier
|
|
4019
|
+
assert(scale > 0.0f);
|
|
4020
|
+
|
|
4018
4021
|
ggml_vec_scale_f32(ne00, y, scale);
|
|
4019
4022
|
}
|
|
4020
4023
|
}
|
|
@@ -221,6 +221,9 @@ void ggml_vec_dot_f16(int n, float * GGML_RESTRICT s, size_t bs, ggml_fp16_t * G
|
|
|
221
221
|
for (int i = np; i < n; ++i) {
|
|
222
222
|
sumf += (ggml_float)(GGML_CPU_FP16_TO_FP32(x[i])*GGML_CPU_FP16_TO_FP32(y[i]));
|
|
223
223
|
}
|
|
224
|
+
|
|
225
|
+
// if you hit this, you are likely running outside the FP range
|
|
226
|
+
assert(!isnan(sumf) && !isinf(sumf));
|
|
224
227
|
#else
|
|
225
228
|
for (int i = 0; i < n; ++i) {
|
|
226
229
|
sumf += (ggml_float)(GGML_CPU_FP16_TO_FP32(x[i])*GGML_CPU_FP16_TO_FP32(y[i]));
|
|
@@ -102,12 +102,12 @@ if (CUDAToolkit_FOUND)
|
|
|
102
102
|
if (GGML_STATIC)
|
|
103
103
|
if (WIN32)
|
|
104
104
|
# As of 12.3.1 CUDA Toolkit for Windows does not offer a static cublas library
|
|
105
|
-
target_link_libraries(ggml-cuda PRIVATE CUDA::cudart_static CUDA::cublas
|
|
105
|
+
target_link_libraries(ggml-cuda PRIVATE CUDA::cudart_static CUDA::cublas)
|
|
106
106
|
else ()
|
|
107
|
-
target_link_libraries(ggml-cuda PRIVATE CUDA::cudart_static CUDA::cublas_static
|
|
107
|
+
target_link_libraries(ggml-cuda PRIVATE CUDA::cudart_static CUDA::cublas_static)
|
|
108
108
|
endif()
|
|
109
109
|
else()
|
|
110
|
-
target_link_libraries(ggml-cuda PRIVATE CUDA::cudart CUDA::cublas
|
|
110
|
+
target_link_libraries(ggml-cuda PRIVATE CUDA::cudart CUDA::cublas)
|
|
111
111
|
endif()
|
|
112
112
|
|
|
113
113
|
if (GGML_CUDA_NO_VMM)
|
|
@@ -56,7 +56,7 @@
|
|
|
56
56
|
#define GGML_CUDA_CC_GCN4 (GGML_CUDA_CC_OFFSET_AMD + 0x803) // Tonga, Fiji, Polaris, minimum for fast fp16
|
|
57
57
|
#define GGML_CUDA_CC_VEGA (GGML_CUDA_CC_OFFSET_AMD + 0x900) // Vega56/64, minimum for fp16 dual issue
|
|
58
58
|
#define GGML_CUDA_CC_VEGA20 (GGML_CUDA_CC_OFFSET_AMD + 0x906) // MI50/Radeon VII, minimum for dp4a
|
|
59
|
-
#define
|
|
59
|
+
#define GGML_CUDA_CC_CDNA1 (GGML_CUDA_CC_OFFSET_AMD + 0x908) // MI100, minimum for MFMA, acc registers
|
|
60
60
|
#define GGML_CUDA_CC_CDNA2 (GGML_CUDA_CC_OFFSET_AMD + 0x910) // MI210, minimum acc register renaming
|
|
61
61
|
#define GGML_CUDA_CC_CDNA3 (GGML_CUDA_CC_OFFSET_AMD + 0x942) // MI300
|
|
62
62
|
|
|
@@ -72,8 +72,9 @@
|
|
|
72
72
|
#define GGML_CUDA_CC_IS_RDNA2(cc) (cc >= GGML_CUDA_CC_RDNA2 && cc < GGML_CUDA_CC_RDNA3)
|
|
73
73
|
#define GGML_CUDA_CC_IS_RDNA3(cc) (cc >= GGML_CUDA_CC_RDNA3 && cc < GGML_CUDA_CC_RDNA4)
|
|
74
74
|
#define GGML_CUDA_CC_IS_RDNA4(cc) (cc >= GGML_CUDA_CC_RDNA4)
|
|
75
|
-
#define GGML_CUDA_CC_IS_GCN(cc) (cc > GGML_CUDA_CC_OFFSET_AMD && cc <
|
|
76
|
-
#define GGML_CUDA_CC_IS_CDNA(cc) (cc >=
|
|
75
|
+
#define GGML_CUDA_CC_IS_GCN(cc) (cc > GGML_CUDA_CC_OFFSET_AMD && cc < GGML_CUDA_CC_CDNA1)
|
|
76
|
+
#define GGML_CUDA_CC_IS_CDNA(cc) (cc >= GGML_CUDA_CC_CDNA1 && cc < GGML_CUDA_CC_RDNA1)
|
|
77
|
+
#define GGML_CUDA_CC_IS_CDNA3(cc) (cc >= GGML_CUDA_CC_CDNA3 && cc < GGML_CUDA_CC_RDNA1)
|
|
77
78
|
|
|
78
79
|
// Moore Threads
|
|
79
80
|
#define GGML_CUDA_CC_QY1 (GGML_CUDA_CC_OFFSET_MTHREADS + 0x210) // MTT S80, MTT S3000
|
|
@@ -226,6 +227,10 @@ typedef float2 dfloat2;
|
|
|
226
227
|
#define FP16_MMA_AVAILABLE
|
|
227
228
|
#endif // defined(GGML_HIP_ROCWMMA_FATTN) && (defined(CDNA) || defined(RDNA3) || (defined(GGML_HIP_ROCWMMA_FATTN_GFX12) && defined(RDNA4)))
|
|
228
229
|
|
|
230
|
+
#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && defined(CDNA3)
|
|
231
|
+
#define AMD_MFMA_AVAILABLE
|
|
232
|
+
#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && defined(CDNA3)
|
|
233
|
+
|
|
229
234
|
#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_TURING
|
|
230
235
|
#define NEW_MMA_AVAILABLE
|
|
231
236
|
#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_TURING
|
|
@@ -288,6 +293,11 @@ static bool fp32_mma_hardware_available(const int cc) {
|
|
|
288
293
|
return GGML_CUDA_CC_IS_CDNA(cc);
|
|
289
294
|
}
|
|
290
295
|
|
|
296
|
+
// AMD CDNA3 matrix cores. Support for other CDNA generations will be added later.
|
|
297
|
+
static bool amd_mfma_available(const int cc) {
|
|
298
|
+
return cc >= GGML_CUDA_CC_OFFSET_AMD && GGML_CUDA_CC_IS_CDNA3(cc);
|
|
299
|
+
}
|
|
300
|
+
|
|
291
301
|
// Volta technically had FP16 tensor cores but they work very differently compared to Turing and later.
|
|
292
302
|
static bool new_mma_available(const int cc) {
|
|
293
303
|
return GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_TURING;
|
|
@@ -765,7 +775,7 @@ struct ggml_tensor_extra_gpu {
|
|
|
765
775
|
};
|
|
766
776
|
|
|
767
777
|
|
|
768
|
-
#if (defined(GGML_CUDA_USE_GRAPHS) || defined(GGML_HIP_GRAPHS))
|
|
778
|
+
#if (defined(GGML_CUDA_USE_GRAPHS) || defined(GGML_HIP_GRAPHS)) || defined(GGML_MUSA_GRAPHS)
|
|
769
779
|
#define USE_CUDA_GRAPH
|
|
770
780
|
#endif
|
|
771
781
|
|
|
@@ -6,24 +6,33 @@
|
|
|
6
6
|
#define CUDA_Q8_0_NE_ALIGN 2048
|
|
7
7
|
|
|
8
8
|
template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
|
|
9
|
-
static __global__ void dequantize_block(const void * __restrict__ vx, dst_t * __restrict__ y,
|
|
10
|
-
|
|
9
|
+
static __global__ void dequantize_block(const void * __restrict__ vx, dst_t * __restrict__ y,
|
|
10
|
+
const int64_t ne00, const int64_t ne01, const int64_t ne02,
|
|
11
|
+
const int64_t s01, const int64_t s02, const int64_t s03) {
|
|
12
|
+
const int64_t i00 = 2 * (int64_t(blockDim.x)*blockIdx.x + threadIdx.x);
|
|
11
13
|
|
|
12
|
-
if (
|
|
14
|
+
if (i00 >= ne00) {
|
|
13
15
|
return;
|
|
14
16
|
}
|
|
15
17
|
|
|
16
|
-
const int64_t
|
|
17
|
-
const int64_t
|
|
18
|
-
const int64_t
|
|
18
|
+
const int64_t i01 = blockIdx.y;
|
|
19
|
+
const int64_t i02 = blockIdx.z % ne02;
|
|
20
|
+
const int64_t i03 = blockIdx.z / ne02;
|
|
21
|
+
|
|
22
|
+
const int64_t ibx0 = i03*s03 + i02*s02 + i01*s01;
|
|
23
|
+
|
|
24
|
+
const int64_t ib = ibx0 + i00/qk; // block index
|
|
25
|
+
const int64_t iqs = (i00%qk)/qr; // quant index
|
|
26
|
+
const int64_t iybs = i00 - i00%qk; // y block start index
|
|
19
27
|
const int64_t y_offset = qr == 1 ? 1 : qk/2;
|
|
20
28
|
|
|
21
29
|
// dequantize
|
|
22
30
|
dfloat2 v;
|
|
23
31
|
dequantize_kernel(vx, ib, iqs, v);
|
|
24
32
|
|
|
25
|
-
|
|
26
|
-
y[
|
|
33
|
+
const int64_t iy0 = ((i03*ne02 + i02)*ne01 + i01)*ne00 + iybs + iqs;
|
|
34
|
+
y[iy0 + 0] = float(v.x);
|
|
35
|
+
y[iy0 + y_offset] = float(v.y);
|
|
27
36
|
}
|
|
28
37
|
|
|
29
38
|
template <bool need_check>
|
|
@@ -457,9 +466,17 @@ static __global__ void dequantize_block_iq4_xs(const void * __restrict__ vx, dst
|
|
|
457
466
|
}
|
|
458
467
|
|
|
459
468
|
template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
|
|
460
|
-
static void dequantize_block_cuda(const void *
|
|
461
|
-
|
|
462
|
-
|
|
469
|
+
static void dequantize_block_cuda(const void * vx, dst_t * y,
|
|
470
|
+
const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t ne03,
|
|
471
|
+
const int64_t s01, const int64_t s02, const int64_t s03, cudaStream_t stream) {
|
|
472
|
+
const dim3 num_blocks((ne00 + 2*CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / (2*CUDA_DEQUANTIZE_BLOCK_SIZE), ne01, ne02*ne03);
|
|
473
|
+
dequantize_block<qk, qr, dequantize_kernel><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>
|
|
474
|
+
(vx, y, ne00, ne01, ne02, s01, s02, s03);
|
|
475
|
+
}
|
|
476
|
+
|
|
477
|
+
template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
|
|
478
|
+
static void dequantize_block_cont_cuda(const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t k, cudaStream_t stream) {
|
|
479
|
+
dequantize_block_cuda<qk, qr, dequantize_kernel, dst_t>(vx, y, k, 1, 1, 1, k/qk, k/qk, k/qk, stream);
|
|
463
480
|
}
|
|
464
481
|
|
|
465
482
|
static void dequantize_block_q8_0_f16_cuda(const void * __restrict__ vx, half * __restrict__ y, const int64_t k, cudaStream_t stream) {
|
|
@@ -624,14 +641,14 @@ to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
|
|
|
624
641
|
case GGML_TYPE_Q4_1:
|
|
625
642
|
return dequantize_row_q4_1_cuda;
|
|
626
643
|
case GGML_TYPE_Q5_0:
|
|
627
|
-
return
|
|
644
|
+
return dequantize_block_cont_cuda<QK5_0, QR5_0, dequantize_q5_0>;
|
|
628
645
|
case GGML_TYPE_Q5_1:
|
|
629
|
-
return
|
|
646
|
+
return dequantize_block_cont_cuda<QK5_1, QR5_1, dequantize_q5_1>;
|
|
630
647
|
case GGML_TYPE_Q8_0:
|
|
631
648
|
if (fp16_available(ggml_cuda_info().devices[ggml_cuda_get_device()].cc)) {
|
|
632
649
|
return dequantize_block_q8_0_f16_cuda;
|
|
633
650
|
}
|
|
634
|
-
return
|
|
651
|
+
return dequantize_block_cont_cuda<QK8_0, QR8_0, dequantize_q8_0>;
|
|
635
652
|
case GGML_TYPE_Q2_K:
|
|
636
653
|
return dequantize_row_q2_K_cuda;
|
|
637
654
|
case GGML_TYPE_Q3_K:
|
|
@@ -676,11 +693,11 @@ to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
|
|
|
676
693
|
case GGML_TYPE_Q4_1:
|
|
677
694
|
return dequantize_row_q4_1_cuda;
|
|
678
695
|
case GGML_TYPE_Q5_0:
|
|
679
|
-
return
|
|
696
|
+
return dequantize_block_cont_cuda<QK5_0, QR5_0, dequantize_q5_0>;
|
|
680
697
|
case GGML_TYPE_Q5_1:
|
|
681
|
-
return
|
|
698
|
+
return dequantize_block_cont_cuda<QK5_1, QR5_1, dequantize_q5_1>;
|
|
682
699
|
case GGML_TYPE_Q8_0:
|
|
683
|
-
return
|
|
700
|
+
return dequantize_block_cont_cuda<QK8_0, QR8_0, dequantize_q8_0>;
|
|
684
701
|
case GGML_TYPE_Q2_K:
|
|
685
702
|
return dequantize_row_q2_K_cuda;
|
|
686
703
|
case GGML_TYPE_Q3_K:
|
|
@@ -722,6 +739,16 @@ to_fp16_nc_cuda_t ggml_get_to_fp16_nc_cuda(ggml_type type) {
|
|
|
722
739
|
switch (type) {
|
|
723
740
|
case GGML_TYPE_F32:
|
|
724
741
|
return convert_unary_cuda<float>;
|
|
742
|
+
case GGML_TYPE_Q4_0:
|
|
743
|
+
return dequantize_block_cuda<QK4_0, QR4_0, dequantize_q4_0>;
|
|
744
|
+
case GGML_TYPE_Q4_1:
|
|
745
|
+
return dequantize_block_cuda<QK4_1, QR4_1, dequantize_q4_1>;
|
|
746
|
+
case GGML_TYPE_Q5_0:
|
|
747
|
+
return dequantize_block_cuda<QK5_0, QR5_0, dequantize_q5_0>;
|
|
748
|
+
case GGML_TYPE_Q5_1:
|
|
749
|
+
return dequantize_block_cuda<QK5_1, QR5_1, dequantize_q5_1>;
|
|
750
|
+
case GGML_TYPE_Q8_0:
|
|
751
|
+
return dequantize_block_cuda<QK8_0, QR8_0, dequantize_q8_0>;
|
|
725
752
|
case GGML_TYPE_BF16:
|
|
726
753
|
return convert_unary_cuda<nv_bfloat16>;
|
|
727
754
|
default:
|
|
@@ -733,6 +760,16 @@ to_bf16_nc_cuda_t ggml_get_to_bf16_nc_cuda(ggml_type type) {
|
|
|
733
760
|
switch (type) {
|
|
734
761
|
case GGML_TYPE_F32:
|
|
735
762
|
return convert_unary_cuda<float, nv_bfloat16>;
|
|
763
|
+
case GGML_TYPE_Q4_0:
|
|
764
|
+
return dequantize_block_cuda<QK4_0, QR4_0, dequantize_q4_0>;
|
|
765
|
+
case GGML_TYPE_Q4_1:
|
|
766
|
+
return dequantize_block_cuda<QK4_1, QR4_1, dequantize_q4_1>;
|
|
767
|
+
case GGML_TYPE_Q5_0:
|
|
768
|
+
return dequantize_block_cuda<QK5_0, QR5_0, dequantize_q5_0>;
|
|
769
|
+
case GGML_TYPE_Q5_1:
|
|
770
|
+
return dequantize_block_cuda<QK5_1, QR5_1, dequantize_q5_1>;
|
|
771
|
+
case GGML_TYPE_Q8_0:
|
|
772
|
+
return dequantize_block_cuda<QK8_0, QR8_0, dequantize_q8_0>;
|
|
736
773
|
case GGML_TYPE_F16:
|
|
737
774
|
return convert_unary_cuda<half, nv_bfloat16>;
|
|
738
775
|
default:
|
|
@@ -744,6 +781,16 @@ to_fp32_nc_cuda_t ggml_get_to_fp32_nc_cuda(ggml_type type) {
|
|
|
744
781
|
switch (type) {
|
|
745
782
|
case GGML_TYPE_F16:
|
|
746
783
|
return convert_unary_cuda<half, float>;
|
|
784
|
+
case GGML_TYPE_Q4_0:
|
|
785
|
+
return dequantize_block_cuda<QK4_0, QR4_0, dequantize_q4_0>;
|
|
786
|
+
case GGML_TYPE_Q4_1:
|
|
787
|
+
return dequantize_block_cuda<QK4_1, QR4_1, dequantize_q4_1>;
|
|
788
|
+
case GGML_TYPE_Q5_0:
|
|
789
|
+
return dequantize_block_cuda<QK5_0, QR5_0, dequantize_q5_0>;
|
|
790
|
+
case GGML_TYPE_Q5_1:
|
|
791
|
+
return dequantize_block_cuda<QK5_1, QR5_1, dequantize_q5_1>;
|
|
792
|
+
case GGML_TYPE_Q8_0:
|
|
793
|
+
return dequantize_block_cuda<QK8_0, QR8_0, dequantize_q8_0>;
|
|
747
794
|
case GGML_TYPE_BF16:
|
|
748
795
|
return convert_unary_cuda<nv_bfloat16, float>;
|
|
749
796
|
default:
|
|
@@ -0,0 +1,225 @@
|
|
|
1
|
+
#pragma once
|
|
2
|
+
|
|
3
|
+
#include "ggml-common.h"
|
|
4
|
+
|
|
5
|
+
template<typename src_t, typename dst_t>
|
|
6
|
+
static __device__ __forceinline__ void convert_flt(const src_t * src, dst_t * dst) {
|
|
7
|
+
if constexpr (std::is_same_v<src_t, dst_t>) {
|
|
8
|
+
*dst = *src;
|
|
9
|
+
} else {
|
|
10
|
+
*dst = float(*src);
|
|
11
|
+
}
|
|
12
|
+
}
|
|
13
|
+
|
|
14
|
+
static __device__ __forceinline__ int best_index_int8(int n, const int8_t * val, float x) {
|
|
15
|
+
if (x <= val[0]) return 0;
|
|
16
|
+
if (x >= val[n-1]) return n-1;
|
|
17
|
+
int ml = 0, mu = n-1;
|
|
18
|
+
while (mu-ml > 1) {
|
|
19
|
+
int mav = (ml+mu)/2;
|
|
20
|
+
if (x < val[mav]) mu = mav; else ml = mav;
|
|
21
|
+
}
|
|
22
|
+
return x - val[mu-1] < val[mu] - x ? mu-1 : mu;
|
|
23
|
+
}
|
|
24
|
+
|
|
25
|
+
static __device__ void quantize_f32_q4_0_block(const float * __restrict__ x, block_q4_0 * __restrict__ y) {
|
|
26
|
+
float amax = 0.0f;
|
|
27
|
+
float vmax = 0.0f;
|
|
28
|
+
|
|
29
|
+
for (int j = 0; j < QK4_0; ++j) {
|
|
30
|
+
const float v = x[j];
|
|
31
|
+
if (amax < fabsf(v)) {
|
|
32
|
+
amax = fabsf(v);
|
|
33
|
+
vmax = v;
|
|
34
|
+
}
|
|
35
|
+
}
|
|
36
|
+
|
|
37
|
+
const float d = vmax / -8;
|
|
38
|
+
const float id = d ? 1.0f/d : 0.0f;
|
|
39
|
+
|
|
40
|
+
y->d = d;
|
|
41
|
+
|
|
42
|
+
for (int j = 0; j < QK4_0/2; ++j) {
|
|
43
|
+
const float x0 = x[0 + j]*id;
|
|
44
|
+
const float x1 = x[QK4_0/2 + j]*id;
|
|
45
|
+
|
|
46
|
+
const uint8_t xi0 = min(15, (int8_t)(x0 + 8.5f));
|
|
47
|
+
const uint8_t xi1 = min(15, (int8_t)(x1 + 8.5f));
|
|
48
|
+
|
|
49
|
+
y->qs[j] = xi0;
|
|
50
|
+
y->qs[j] |= xi1 << 4;
|
|
51
|
+
}
|
|
52
|
+
}
|
|
53
|
+
|
|
54
|
+
static __device__ void quantize_f32_q4_1_block(const float * __restrict__ x, block_q4_1 * __restrict__ y) {
|
|
55
|
+
float vmin = FLT_MAX;
|
|
56
|
+
float vmax = -FLT_MAX;
|
|
57
|
+
|
|
58
|
+
for (int j = 0; j < QK4_1; ++j) {
|
|
59
|
+
const float v = x[j];
|
|
60
|
+
if (v < vmin) vmin = v;
|
|
61
|
+
if (v > vmax) vmax = v;
|
|
62
|
+
}
|
|
63
|
+
|
|
64
|
+
const float d = (vmax - vmin) / ((1 << 4) - 1);
|
|
65
|
+
const float id = d ? 1.0f/d : 0.0f;
|
|
66
|
+
|
|
67
|
+
y->dm.x = d;
|
|
68
|
+
y->dm.y = vmin;
|
|
69
|
+
|
|
70
|
+
for (int j = 0; j < QK4_1/2; ++j) {
|
|
71
|
+
const float x0 = (x[0 + j] - vmin)*id;
|
|
72
|
+
const float x1 = (x[QK4_1/2 + j] - vmin)*id;
|
|
73
|
+
|
|
74
|
+
const uint8_t xi0 = min(15, (int8_t)(x0 + 0.5f));
|
|
75
|
+
const uint8_t xi1 = min(15, (int8_t)(x1 + 0.5f));
|
|
76
|
+
|
|
77
|
+
y->qs[j] = xi0;
|
|
78
|
+
y->qs[j] |= xi1 << 4;
|
|
79
|
+
}
|
|
80
|
+
}
|
|
81
|
+
|
|
82
|
+
static __device__ void quantize_f32_q5_0_block(const float * __restrict__ x, block_q5_0 * __restrict__ y) {
|
|
83
|
+
float amax = 0.0f;
|
|
84
|
+
float vmax = 0.0f;
|
|
85
|
+
|
|
86
|
+
for (int j = 0; j < QK5_0; ++j) {
|
|
87
|
+
const float v = x[j];
|
|
88
|
+
if (amax < fabsf(v)) {
|
|
89
|
+
amax = fabsf(v);
|
|
90
|
+
vmax = v;
|
|
91
|
+
}
|
|
92
|
+
}
|
|
93
|
+
|
|
94
|
+
const float d = vmax / -16;
|
|
95
|
+
const float id = d ? 1.0f/d : 0.0f;
|
|
96
|
+
|
|
97
|
+
y->d = d;
|
|
98
|
+
|
|
99
|
+
uint32_t qh = 0;
|
|
100
|
+
for (int j = 0; j < QK5_0/2; ++j) {
|
|
101
|
+
const float x0 = x[0 + j]*id;
|
|
102
|
+
const float x1 = x[QK5_0/2 + j]*id;
|
|
103
|
+
|
|
104
|
+
const uint8_t xi0 = min(31, (int8_t)(x0 + 16.5f));
|
|
105
|
+
const uint8_t xi1 = min(31, (int8_t)(x1 + 16.5f));
|
|
106
|
+
|
|
107
|
+
y->qs[j] = (xi0 & 0xf) | ((xi1 & 0xf) << 4);
|
|
108
|
+
qh |= ((xi0 & 0x10u) >> 4) << (j + 0);
|
|
109
|
+
qh |= ((xi1 & 0x10u) >> 4) << (j + QK5_0/2);
|
|
110
|
+
}
|
|
111
|
+
memcpy(y->qh, &qh, sizeof(qh));
|
|
112
|
+
}
|
|
113
|
+
|
|
114
|
+
static __device__ void quantize_f32_q5_1_block(const float * __restrict__ x, block_q5_1 * __restrict__ y) {
|
|
115
|
+
float min = x[0];
|
|
116
|
+
float max = x[0];
|
|
117
|
+
|
|
118
|
+
for (int j = 1; j < QK5_1; ++j) {
|
|
119
|
+
const float v = x[j];
|
|
120
|
+
min = v < min ? v : min;
|
|
121
|
+
max = v > max ? v : max;
|
|
122
|
+
}
|
|
123
|
+
|
|
124
|
+
const float d = (max - min) / 31;
|
|
125
|
+
const float id = d ? 1.0f/d : 0.0f;
|
|
126
|
+
|
|
127
|
+
y->dm.x = d;
|
|
128
|
+
y->dm.y = min;
|
|
129
|
+
|
|
130
|
+
uint32_t qh = 0;
|
|
131
|
+
for (int j = 0; j < QK5_1/2; ++j) {
|
|
132
|
+
const float x0 = (x[0 + j] - min)*id;
|
|
133
|
+
const float x1 = (x[QK5_1/2 + j] - min)*id;
|
|
134
|
+
|
|
135
|
+
const uint8_t xi0 = (uint8_t)(x0 + 0.5f);
|
|
136
|
+
const uint8_t xi1 = (uint8_t)(x1 + 0.5f);
|
|
137
|
+
|
|
138
|
+
y->qs[j] = (xi0 & 0xf) | ((xi1 & 0xf) << 4);
|
|
139
|
+
qh |= ((xi0 & 0x10u) >> 4) << (j + 0);
|
|
140
|
+
qh |= ((xi1 & 0x10u) >> 4) << (j + QK5_1/2);
|
|
141
|
+
}
|
|
142
|
+
memcpy(y->qh, &qh, sizeof(qh));
|
|
143
|
+
}
|
|
144
|
+
|
|
145
|
+
static __device__ void quantize_f32_q8_0_block(const float * __restrict__ x, block_q8_0 * __restrict__ y) {
|
|
146
|
+
float amax = 0.0f; // absolute max
|
|
147
|
+
|
|
148
|
+
for (int j = 0; j < QK8_0; j++) {
|
|
149
|
+
const float v = x[j];
|
|
150
|
+
amax = fmaxf(amax, fabsf(v));
|
|
151
|
+
}
|
|
152
|
+
|
|
153
|
+
const float d = amax / ((1 << 7) - 1);
|
|
154
|
+
const float id = d ? 1.0f/d : 0.0f;
|
|
155
|
+
|
|
156
|
+
y->d = d;
|
|
157
|
+
|
|
158
|
+
for (int j = 0; j < QK8_0; ++j) {
|
|
159
|
+
const float x0 = x[j]*id;
|
|
160
|
+
y->qs[j] = roundf(x0);
|
|
161
|
+
}
|
|
162
|
+
}
|
|
163
|
+
|
|
164
|
+
static __device__ void quantize_f32_iq4_nl_block(const float * __restrict__ x, block_iq4_nl * __restrict__ y) {
|
|
165
|
+
float amax = 0.0f;
|
|
166
|
+
float vmax = 0.0f;
|
|
167
|
+
|
|
168
|
+
for (int j = 0; j < QK4_NL; ++j) {
|
|
169
|
+
const float v = x[j];
|
|
170
|
+
if (amax < fabsf(v)) {
|
|
171
|
+
amax = fabsf(v);
|
|
172
|
+
vmax = v;
|
|
173
|
+
}
|
|
174
|
+
}
|
|
175
|
+
|
|
176
|
+
float d = vmax / kvalues_iq4nl[0];
|
|
177
|
+
const float id = d ? 1.0f/d : 0.0f;
|
|
178
|
+
|
|
179
|
+
float sumqx = 0, sumq2 = 0;
|
|
180
|
+
for (int j = 0; j < QK4_NL/2; ++j) {
|
|
181
|
+
const float x0 = x[0 + j]*id;
|
|
182
|
+
const float x1 = x[QK4_NL/2 + j]*id;
|
|
183
|
+
const uint8_t xi0 = best_index_int8(16, kvalues_iq4nl, x0);
|
|
184
|
+
const uint8_t xi1 = best_index_int8(16, kvalues_iq4nl, x1);
|
|
185
|
+
y->qs[j] = xi0 | (xi1 << 4);
|
|
186
|
+
const float v0 = kvalues_iq4nl[xi0];
|
|
187
|
+
const float v1 = kvalues_iq4nl[xi1];
|
|
188
|
+
const float w0 = x[0 + j]*x[0 + j];
|
|
189
|
+
const float w1 = x[QK4_NL/2 + j]*x[QK4_NL/2 + j];
|
|
190
|
+
sumqx += w0*v0*x[j] + w1*v1*x[QK4_NL/2 + j];
|
|
191
|
+
sumq2 += w0*v0*v0 + w1*v1*v1;
|
|
192
|
+
}
|
|
193
|
+
|
|
194
|
+
y->d = sumq2 > 0 ? sumqx/sumq2 : d;
|
|
195
|
+
}
|
|
196
|
+
|
|
197
|
+
// Wrapper functions for cpy.cu compatibility
|
|
198
|
+
static __device__ void cpy_blck_f32_q4_0(const char * cxi, char * cdsti) {
|
|
199
|
+
quantize_f32_q4_0_block((const float *)cxi, (block_q4_0 *)cdsti);
|
|
200
|
+
}
|
|
201
|
+
|
|
202
|
+
static __device__ void cpy_blck_f32_q4_1(const char * cxi, char * cdsti) {
|
|
203
|
+
quantize_f32_q4_1_block((const float *)cxi, (block_q4_1 *)cdsti);
|
|
204
|
+
}
|
|
205
|
+
|
|
206
|
+
static __device__ void cpy_blck_f32_q5_0(const char * cxi, char * cdsti) {
|
|
207
|
+
quantize_f32_q5_0_block((const float *)cxi, (block_q5_0 *)cdsti);
|
|
208
|
+
}
|
|
209
|
+
|
|
210
|
+
static __device__ void cpy_blck_f32_q5_1(const char * cxi, char * cdsti) {
|
|
211
|
+
quantize_f32_q5_1_block((const float *)cxi, (block_q5_1 *)cdsti);
|
|
212
|
+
}
|
|
213
|
+
|
|
214
|
+
static __device__ void cpy_blck_f32_q8_0(const char * cxi, char * cdsti) {
|
|
215
|
+
quantize_f32_q8_0_block((const float *)cxi, (block_q8_0 *)cdsti);
|
|
216
|
+
}
|
|
217
|
+
|
|
218
|
+
static __device__ void cpy_blck_f32_iq4_nl(const char * cxi, char * cdsti) {
|
|
219
|
+
quantize_f32_iq4_nl_block((const float *)cxi, (block_iq4_nl *)cdsti);
|
|
220
|
+
}
|
|
221
|
+
|
|
222
|
+
template<typename src_t, typename dst_t>
|
|
223
|
+
static __device__ void cpy_1_flt(const char * cxi, char * cdsti) {
|
|
224
|
+
convert_flt((const src_t *)cxi, (dst_t *)cdsti);
|
|
225
|
+
}
|