@fugood/llama.node 0.3.2 → 0.3.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +7 -0
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/lib/binding.ts +18 -1
- package/package.json +1 -1
- package/src/DetokenizeWorker.cpp +1 -1
- package/src/EmbeddingWorker.cpp +17 -7
- package/src/EmbeddingWorker.h +2 -1
- package/src/LlamaCompletionWorker.cpp +8 -8
- package/src/LlamaCompletionWorker.h +2 -2
- package/src/LlamaContext.cpp +89 -27
- package/src/LlamaContext.h +2 -0
- package/src/TokenizeWorker.cpp +1 -1
- package/src/common.hpp +4 -4
- package/src/llama.cpp/.github/workflows/build.yml +240 -168
- package/src/llama.cpp/.github/workflows/docker.yml +8 -8
- package/src/llama.cpp/.github/workflows/python-lint.yml +8 -1
- package/src/llama.cpp/.github/workflows/server.yml +21 -14
- package/src/llama.cpp/CMakeLists.txt +14 -6
- package/src/llama.cpp/Sources/llama/llama.h +4 -0
- package/src/llama.cpp/cmake/arm64-apple-clang.cmake +16 -0
- package/src/llama.cpp/cmake/common.cmake +33 -0
- package/src/llama.cpp/cmake/x64-windows-llvm.cmake +11 -0
- package/src/llama.cpp/common/CMakeLists.txt +6 -4
- package/src/llama.cpp/common/arg.cpp +986 -770
- package/src/llama.cpp/common/arg.h +22 -22
- package/src/llama.cpp/common/common.cpp +212 -351
- package/src/llama.cpp/common/common.h +204 -117
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +1 -1
- package/src/llama.cpp/common/log.cpp +50 -50
- package/src/llama.cpp/common/log.h +18 -18
- package/src/llama.cpp/common/ngram-cache.cpp +36 -36
- package/src/llama.cpp/common/ngram-cache.h +19 -19
- package/src/llama.cpp/common/sampling.cpp +163 -121
- package/src/llama.cpp/common/sampling.h +41 -20
- package/src/llama.cpp/common/speculative.cpp +274 -0
- package/src/llama.cpp/common/speculative.h +28 -0
- package/src/llama.cpp/docs/build.md +134 -161
- package/src/llama.cpp/examples/CMakeLists.txt +33 -14
- package/src/llama.cpp/examples/batched/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/batched/batched.cpp +19 -18
- package/src/llama.cpp/examples/batched-bench/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +10 -11
- package/src/llama.cpp/examples/convert-llama2c-to-ggml/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +1 -1
- package/src/llama.cpp/examples/cvector-generator/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +9 -9
- package/src/llama.cpp/examples/deprecation-warning/deprecation-warning.cpp +1 -1
- package/src/llama.cpp/examples/embedding/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/embedding/embedding.cpp +12 -12
- package/src/llama.cpp/examples/eval-callback/CMakeLists.txt +3 -2
- package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +8 -8
- package/src/llama.cpp/examples/export-lora/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/export-lora/export-lora.cpp +5 -5
- package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +4 -7
- package/src/llama.cpp/examples/gen-docs/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gen-docs/gen-docs.cpp +7 -7
- package/src/llama.cpp/examples/gguf/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gguf-hash/CMakeLists.txt +8 -1
- package/src/llama.cpp/examples/gguf-split/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +2 -2
- package/src/llama.cpp/examples/gritlm/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gritlm/gritlm.cpp +18 -18
- package/src/llama.cpp/examples/imatrix/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/imatrix/imatrix.cpp +31 -13
- package/src/llama.cpp/examples/infill/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/infill/infill.cpp +41 -87
- package/src/llama.cpp/examples/llama-bench/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +439 -459
- package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +2 -0
- package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +11 -14
- package/src/llama.cpp/examples/llava/CMakeLists.txt +10 -3
- package/src/llama.cpp/examples/llava/clip.cpp +263 -66
- package/src/llama.cpp/examples/llava/clip.h +8 -2
- package/src/llama.cpp/examples/llava/llava-cli.cpp +23 -23
- package/src/llama.cpp/examples/llava/llava.cpp +83 -22
- package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +21 -21
- package/src/llama.cpp/examples/llava/qwen2vl-cli.cpp +581 -0
- package/src/llama.cpp/examples/lookahead/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/lookahead/lookahead.cpp +26 -26
- package/src/llama.cpp/examples/lookup/CMakeLists.txt +4 -4
- package/src/llama.cpp/examples/lookup/lookup-create.cpp +7 -7
- package/src/llama.cpp/examples/lookup/lookup-merge.cpp +4 -4
- package/src/llama.cpp/examples/lookup/lookup-stats.cpp +16 -15
- package/src/llama.cpp/examples/lookup/lookup.cpp +30 -30
- package/src/llama.cpp/examples/main/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/main/main.cpp +73 -114
- package/src/llama.cpp/examples/main-cmake-pkg/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/parallel/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/parallel/parallel.cpp +18 -19
- package/src/llama.cpp/examples/passkey/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/passkey/passkey.cpp +14 -14
- package/src/llama.cpp/examples/perplexity/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/perplexity/perplexity.cpp +99 -120
- package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/quantize/quantize.cpp +0 -3
- package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +10 -9
- package/src/llama.cpp/examples/retrieval/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/retrieval/retrieval.cpp +16 -16
- package/src/llama.cpp/examples/rpc/rpc-server.cpp +3 -1
- package/src/llama.cpp/examples/run/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/run/run.cpp +911 -0
- package/src/llama.cpp/examples/save-load-state/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +38 -21
- package/src/llama.cpp/examples/server/CMakeLists.txt +3 -16
- package/src/llama.cpp/examples/server/server.cpp +2073 -1339
- package/src/llama.cpp/examples/server/tests/requirements.txt +2 -2
- package/src/llama.cpp/examples/server/utils.hpp +354 -277
- package/src/llama.cpp/examples/simple/CMakeLists.txt +2 -2
- package/src/llama.cpp/examples/simple/simple.cpp +130 -94
- package/src/llama.cpp/examples/simple-chat/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +200 -0
- package/src/llama.cpp/examples/speculative/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/speculative/speculative.cpp +68 -64
- package/src/llama.cpp/examples/speculative-simple/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +265 -0
- package/src/llama.cpp/examples/tokenize/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/tokenize/tokenize.cpp +3 -3
- package/src/llama.cpp/examples/tts/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/tts/tts.cpp +932 -0
- package/src/llama.cpp/ggml/CMakeLists.txt +54 -36
- package/src/llama.cpp/ggml/include/ggml-backend.h +63 -34
- package/src/llama.cpp/ggml/include/ggml-blas.h +5 -3
- package/src/llama.cpp/ggml/include/ggml-cann.h +9 -7
- package/src/llama.cpp/ggml/include/ggml-cpp.h +38 -0
- package/src/llama.cpp/ggml/include/ggml-cpu.h +135 -0
- package/src/llama.cpp/ggml/include/ggml-cuda.h +12 -12
- package/src/llama.cpp/ggml/include/ggml-kompute.h +7 -3
- package/src/llama.cpp/ggml/include/ggml-metal.h +11 -7
- package/src/llama.cpp/ggml/include/ggml-opencl.h +26 -0
- package/src/llama.cpp/ggml/include/ggml-opt.h +216 -0
- package/src/llama.cpp/ggml/include/ggml-rpc.h +9 -5
- package/src/llama.cpp/ggml/include/ggml-sycl.h +18 -11
- package/src/llama.cpp/ggml/include/ggml-vulkan.h +10 -8
- package/src/llama.cpp/ggml/include/ggml.h +159 -417
- package/src/llama.cpp/ggml/src/CMakeLists.txt +121 -1155
- package/src/llama.cpp/ggml/src/ggml-alloc.c +23 -28
- package/src/llama.cpp/ggml/src/ggml-backend-impl.h +57 -36
- package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +552 -0
- package/src/llama.cpp/ggml/src/ggml-backend.cpp +306 -867
- package/src/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +87 -0
- package/src/llama.cpp/ggml/src/{ggml-blas.cpp → ggml-blas/ggml-blas.cpp} +216 -65
- package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +76 -0
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +456 -111
- package/src/llama.cpp/ggml/src/ggml-cann/common.h +6 -3
- package/src/llama.cpp/ggml/src/{ggml-cann.cpp → ggml-cann/ggml-cann.cpp} +343 -177
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +2 -5
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +22 -9
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +24 -13
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +23 -13
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +11 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +10 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +10 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +17 -0
- package/src/llama.cpp/ggml/src/ggml-common.h +42 -42
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +336 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +220 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.h +8 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/amx/common.h +91 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/amx/mmq.cpp +2511 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/amx/mmq.h +10 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +323 -0
- package/src/llama.cpp/ggml/src/{ggml-aarch64.c → ggml-cpu/ggml-cpu-aarch64.cpp} +1299 -246
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +8 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-hbm.cpp +55 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-hbm.h +8 -0
- package/src/llama.cpp/ggml/src/{ggml-cpu-impl.h → ggml-cpu/ggml-cpu-impl.h} +14 -242
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +10835 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.h +63 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-traits.cpp +36 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-traits.h +38 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +14123 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +628 -0
- package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.cpp +666 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +152 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +8 -0
- package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +104 -0
- package/src/llama.cpp/ggml/src/ggml-impl.h +393 -22
- package/src/llama.cpp/ggml/src/ggml-kompute/CMakeLists.txt +166 -0
- package/src/llama.cpp/ggml/src/{ggml-kompute.cpp → ggml-kompute/ggml-kompute.cpp} +360 -127
- package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +105 -0
- package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +288 -0
- package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +107 -0
- package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +147 -0
- package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +4004 -0
- package/src/llama.cpp/ggml/src/ggml-opt.cpp +854 -0
- package/src/llama.cpp/ggml/src/ggml-quants.c +188 -10702
- package/src/llama.cpp/ggml/src/ggml-quants.h +78 -125
- package/src/llama.cpp/ggml/src/ggml-rpc/CMakeLists.txt +9 -0
- package/src/llama.cpp/ggml/src/{ggml-rpc.cpp → ggml-rpc/ggml-rpc.cpp} +478 -300
- package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +84 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +3 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +36 -5
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +259 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +3 -2
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +5 -5
- package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +34 -35
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +1030 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +76 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +4 -4
- package/src/llama.cpp/ggml/src/{ggml-sycl.cpp → ggml-sycl/ggml-sycl.cpp} +3638 -4151
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +3 -2
- package/src/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +6 -6
- package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +75 -87
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +7 -6
- package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +56 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/outprod.hpp +11 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +6 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +4 -3
- package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +7 -7
- package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +1 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +4 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +141 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.hpp +10 -0
- package/src/llama.cpp/ggml/src/ggml-threading.cpp +12 -0
- package/src/llama.cpp/ggml/src/ggml-threading.h +14 -0
- package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +92 -0
- package/src/llama.cpp/ggml/src/{ggml-vulkan.cpp → ggml-vulkan/ggml-vulkan.cpp} +2138 -887
- package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/CMakeLists.txt +3 -1
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +593 -0
- package/src/llama.cpp/ggml/src/ggml.c +4427 -20125
- package/src/llama.cpp/include/llama-cpp.h +25 -0
- package/src/llama.cpp/include/llama.h +93 -52
- package/src/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.out +46 -0
- package/src/llama.cpp/pocs/CMakeLists.txt +3 -1
- package/src/llama.cpp/pocs/vdot/CMakeLists.txt +2 -2
- package/src/llama.cpp/pocs/vdot/q8dot.cpp +4 -3
- package/src/llama.cpp/pocs/vdot/vdot.cpp +8 -7
- package/src/llama.cpp/src/CMakeLists.txt +4 -8
- package/src/llama.cpp/src/llama-grammar.cpp +15 -15
- package/src/llama.cpp/src/llama-grammar.h +2 -5
- package/src/llama.cpp/src/llama-sampling.cpp +779 -194
- package/src/llama.cpp/src/llama-sampling.h +21 -2
- package/src/llama.cpp/src/llama-vocab.cpp +55 -10
- package/src/llama.cpp/src/llama-vocab.h +35 -11
- package/src/llama.cpp/src/llama.cpp +4317 -2979
- package/src/llama.cpp/src/unicode-data.cpp +2 -2
- package/src/llama.cpp/src/unicode.cpp +62 -51
- package/src/llama.cpp/src/unicode.h +9 -10
- package/src/llama.cpp/tests/CMakeLists.txt +48 -38
- package/src/llama.cpp/tests/test-arg-parser.cpp +15 -15
- package/src/llama.cpp/tests/test-backend-ops.cpp +324 -80
- package/src/llama.cpp/tests/test-barrier.cpp +1 -0
- package/src/llama.cpp/tests/test-chat-template.cpp +59 -9
- package/src/llama.cpp/tests/test-gguf.cpp +1303 -0
- package/src/llama.cpp/tests/test-grammar-integration.cpp +3 -6
- package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +17 -4
- package/src/llama.cpp/tests/test-llama-grammar.cpp +2 -4
- package/src/llama.cpp/tests/test-log.cpp +2 -2
- package/src/llama.cpp/tests/test-opt.cpp +853 -142
- package/src/llama.cpp/tests/test-quantize-fns.cpp +24 -21
- package/src/llama.cpp/tests/test-quantize-perf.cpp +16 -14
- package/src/llama.cpp/tests/test-rope.cpp +62 -20
- package/src/llama.cpp/tests/test-sampling.cpp +163 -138
- package/src/llama.cpp/tests/test-tokenizer-0.cpp +7 -7
- package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +5 -5
- package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +5 -5
- package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +0 -72
- package/src/llama.cpp/.github/workflows/nix-ci.yml +0 -79
- package/src/llama.cpp/.github/workflows/nix-flake-update.yml +0 -22
- package/src/llama.cpp/.github/workflows/nix-publish-flake.yml +0 -36
- package/src/llama.cpp/common/train.cpp +0 -1515
- package/src/llama.cpp/common/train.h +0 -233
- package/src/llama.cpp/examples/baby-llama/CMakeLists.txt +0 -5
- package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +0 -1639
- package/src/llama.cpp/ggml/src/ggml-aarch64.h +0 -39
- package/src/llama.cpp/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp +0 -600
- package/src/llama.cpp/tests/test-grad0.cpp +0 -1683
- /package/src/llama.cpp/ggml/{cmake → src/ggml-cpu/cmake}/FindSIMD.cmake +0 -0
- /package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.h +0 -0
|
@@ -1,7 +1,3 @@
|
|
|
1
|
-
if (NOT SOC_TYPE)
|
|
2
|
-
set (SOC_TYPE "Ascend910B3")
|
|
3
|
-
endif()
|
|
4
|
-
|
|
5
1
|
file(GLOB SRC_FILES
|
|
6
2
|
get_row_f32.cpp
|
|
7
3
|
get_row_f16.cpp
|
|
@@ -13,7 +9,6 @@ file(GLOB SRC_FILES
|
|
|
13
9
|
dup.cpp
|
|
14
10
|
)
|
|
15
11
|
|
|
16
|
-
string(TOLOWER ${SOC_TYPE} SOC_VERSION)
|
|
17
12
|
set(ASCEND_CANN_PACKAGE_PATH ${CANN_INSTALL_DIR})
|
|
18
13
|
set(RUN_MODE "npu" CACHE STRING "run mode: npu/sim")
|
|
19
14
|
|
|
@@ -30,4 +25,6 @@ ascendc_library(ascendc_kernels STATIC
|
|
|
30
25
|
${SRC_FILES}
|
|
31
26
|
)
|
|
32
27
|
|
|
28
|
+
message(STATUS "CANN: compile ascend kernels witch SOC_TYPE:${SOC_TYPE}, SOC_VERSION:${SOC_VERSION}, compile macro:-D${SOC_TYPE_COMPILE_OPTION}.")
|
|
29
|
+
ascendc_compile_definitions(ascendc_kernels PRIVATE "-D${SOC_TYPE_COMPILE_OPTION}")
|
|
33
30
|
# ascendc_compile_definitions(ascendc_kernels PRIVATE -DASCENDC_DUMP)
|
|
@@ -5,6 +5,7 @@
|
|
|
5
5
|
using namespace AscendC;
|
|
6
6
|
|
|
7
7
|
#define BUFFER_NUM 2
|
|
8
|
+
const int64_t SUPPORTED_MAX_DIM = 65535; // currently the limit of max block dim supported by dup kernel is 65535
|
|
8
9
|
|
|
9
10
|
template <typename SRC_T, typename DST_T>
|
|
10
11
|
class DupByRows {
|
|
@@ -51,24 +52,36 @@ class DupByRows {
|
|
|
51
52
|
|
|
52
53
|
__aicore__ inline void copy_in() {
|
|
53
54
|
LocalTensor<SRC_T> src_local = src_queue.AllocTensor<SRC_T>();
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
DataCopyPadExtParams<SRC_T> padParams;
|
|
59
|
-
DataCopyPad(src_local, src_gm, dataCopyParams, padParams);
|
|
60
|
-
|
|
55
|
+
const size_t elem_per_block = 32 / sizeof(SRC_T);
|
|
56
|
+
size_t tail = num_elem % elem_per_block;
|
|
57
|
+
size_t cpy_elements_len = tail > 0 ? num_elem + 1 : num_elem;
|
|
58
|
+
DataCopy(src_local, src_gm, cpy_elements_len);
|
|
61
59
|
src_queue.EnQue(src_local);
|
|
62
60
|
}
|
|
63
61
|
|
|
64
62
|
__aicore__ inline void copy_out() {
|
|
65
63
|
LocalTensor<DST_T> dst_local = dst_queue.DeQue<DST_T>();
|
|
66
|
-
|
|
64
|
+
#ifdef ASCEND_310P
|
|
65
|
+
const size_t elem_per_block = 32 / sizeof(DST_T);
|
|
66
|
+
size_t tail = num_elem % elem_per_block;
|
|
67
|
+
size_t len = num_elem & ~(elem_per_block - 1);
|
|
68
|
+
if (len > 0) {
|
|
69
|
+
DataCopy(dst_gm, dst_local, len);
|
|
70
|
+
}
|
|
71
|
+
if(tail != 0) {
|
|
72
|
+
for (size_t i = tail; i < elem_per_block; i++) {
|
|
73
|
+
dst_local[len + i].SetValue(0, 0);
|
|
74
|
+
}
|
|
75
|
+
SetAtomicAdd<float>();
|
|
76
|
+
DataCopy(dst_gm[len], dst_local[len], elem_per_block);
|
|
77
|
+
SetAtomicNone();
|
|
78
|
+
}
|
|
79
|
+
#else
|
|
67
80
|
DataCopyExtParams dataCopyParams;
|
|
68
81
|
dataCopyParams.blockCount = 1;
|
|
69
82
|
dataCopyParams.blockLen = num_elem * sizeof(DST_T);
|
|
70
83
|
DataCopyPad(dst_gm, dst_local, dataCopyParams);
|
|
71
|
-
|
|
84
|
+
#endif
|
|
72
85
|
dst_queue.FreeTensor(dst_local);
|
|
73
86
|
}
|
|
74
87
|
|
|
@@ -14,7 +14,7 @@ class GET_ROW_F16 {
|
|
|
14
14
|
int64_t *output_ne_ub, size_t *output_nb_ub) {
|
|
15
15
|
// TODO, use template for F16/f32
|
|
16
16
|
int64_t op_block_num = GetBlockNum();
|
|
17
|
-
|
|
17
|
+
op_block_idx = GetBlockIdx();
|
|
18
18
|
|
|
19
19
|
for (int i = 0; i < 4; i++) {
|
|
20
20
|
input_ne[i] = input_ne_ub[i];
|
|
@@ -59,32 +59,42 @@ class GET_ROW_F16 {
|
|
|
59
59
|
}
|
|
60
60
|
|
|
61
61
|
__aicore__ inline void copy_in(uint32_t offset, size_t len) {
|
|
62
|
+
size_t origin_len = len;
|
|
62
63
|
LocalTensor<half> input_local = input_queue.AllocTensor<half>();
|
|
63
|
-
size_t
|
|
64
|
-
|
|
65
|
-
|
|
64
|
+
const size_t elem_per_block = 32 / sizeof(half);
|
|
65
|
+
size_t tail = len % elem_per_block;
|
|
66
|
+
len = len & ~(elem_per_block - 1);
|
|
66
67
|
if(tail != 0) {
|
|
67
|
-
|
|
68
|
-
dataCopyParams.blockCount = 1;
|
|
69
|
-
dataCopyParams.blockLen = tail * sizeof(half);
|
|
70
|
-
DataCopyPadExtParams<half> padParams;
|
|
71
|
-
DataCopyPad(input_local[len], input_gm[offset + len],
|
|
72
|
-
dataCopyParams, padParams);
|
|
68
|
+
len += elem_per_block;
|
|
73
69
|
}
|
|
70
|
+
DataCopy(input_local, input_gm[offset], len);
|
|
74
71
|
input_queue.EnQue(input_local);
|
|
75
72
|
}
|
|
76
73
|
|
|
77
74
|
__aicore__ inline void copy_out(uint32_t offset, size_t len) {
|
|
78
75
|
LocalTensor<float> output_local = output_queue.DeQue<float>();
|
|
79
|
-
size_t
|
|
80
|
-
|
|
81
|
-
|
|
76
|
+
const size_t elem_per_block = 32 / sizeof(float);
|
|
77
|
+
size_t tail = len % elem_per_block;
|
|
78
|
+
len = len & ~(elem_per_block - 1);
|
|
79
|
+
if (len > 0) {
|
|
80
|
+
DataCopy(output_gm[offset], output_local, len);
|
|
81
|
+
}
|
|
82
|
+
|
|
82
83
|
if(tail != 0) {
|
|
84
|
+
#ifdef ASCEND_310P
|
|
85
|
+
for (size_t i = tail; i < elem_per_block; i++) {
|
|
86
|
+
output_local[len + i].SetValue(0, 0);
|
|
87
|
+
}
|
|
88
|
+
SetAtomicAdd<float>();
|
|
89
|
+
DataCopy(output_gm[offset + len], output_local[len], elem_per_block);
|
|
90
|
+
SetAtomicNone();
|
|
91
|
+
#else
|
|
83
92
|
DataCopyExtParams dataCopyParams;
|
|
84
93
|
dataCopyParams.blockCount = 1;
|
|
85
94
|
dataCopyParams.blockLen = tail * sizeof(float);
|
|
86
95
|
DataCopyPad(output_gm[offset + len], output_local[len],
|
|
87
96
|
dataCopyParams);
|
|
97
|
+
#endif
|
|
88
98
|
}
|
|
89
99
|
output_queue.FreeTensor(output_local);
|
|
90
100
|
}
|
|
@@ -150,6 +160,7 @@ class GET_ROW_F16 {
|
|
|
150
160
|
GlobalTensor<float> output_gm;
|
|
151
161
|
TQue<QuePosition::VECIN, BUFFER_NUM> input_queue;
|
|
152
162
|
TQue<QuePosition::VECOUT, BUFFER_NUM> output_queue;
|
|
163
|
+
int64_t op_block_idx;
|
|
153
164
|
};
|
|
154
165
|
|
|
155
166
|
template <typename T>
|
|
@@ -13,7 +13,7 @@ class GET_ROW_F32 {
|
|
|
13
13
|
int64_t *indices_ne_ub, size_t *indices_nb_ub,
|
|
14
14
|
int64_t *output_ne_ub, size_t *output_nb_ub) {
|
|
15
15
|
int64_t op_block_num = GetBlockNum();
|
|
16
|
-
|
|
16
|
+
op_block_idx = GetBlockIdx();
|
|
17
17
|
|
|
18
18
|
for (int i = 0; i < 4; i++) {
|
|
19
19
|
input_ne[i] = input_ne_ub[i];
|
|
@@ -55,31 +55,40 @@ class GET_ROW_F32 {
|
|
|
55
55
|
|
|
56
56
|
__aicore__ inline void copy_in(uint32_t offset, size_t len) {
|
|
57
57
|
LocalTensor<float> input_local = input_queue.AllocTensor<float>();
|
|
58
|
-
size_t
|
|
59
|
-
|
|
60
|
-
|
|
58
|
+
const size_t elem_per_block = 32 / sizeof(float);
|
|
59
|
+
size_t tail = len % elem_per_block;
|
|
60
|
+
len = len & ~(elem_per_block - 1);
|
|
61
61
|
if(tail != 0) {
|
|
62
|
-
|
|
63
|
-
dataCopyParams.blockCount = 1;
|
|
64
|
-
dataCopyParams.blockLen = tail * sizeof(float);
|
|
65
|
-
DataCopyPadExtParams<float> padParams;
|
|
66
|
-
DataCopyPad(input_local[len], input_gm[offset + len],
|
|
67
|
-
dataCopyParams, padParams);
|
|
62
|
+
len += elem_per_block;
|
|
68
63
|
}
|
|
64
|
+
DataCopy(input_local, input_gm[offset], len);
|
|
69
65
|
input_queue.EnQue(input_local);
|
|
70
66
|
}
|
|
71
67
|
|
|
72
68
|
__aicore__ inline void copy_out(uint32_t offset, size_t len) {
|
|
73
69
|
LocalTensor<float> output_local = output_queue.DeQue<float>();
|
|
74
|
-
size_t
|
|
75
|
-
|
|
76
|
-
|
|
70
|
+
const size_t elem_per_block = 32 / sizeof(float);
|
|
71
|
+
size_t tail = len % elem_per_block;
|
|
72
|
+
len = len & ~(elem_per_block - 1);
|
|
73
|
+
if (len > 0) {
|
|
74
|
+
DataCopy(output_gm[offset], output_local, len);
|
|
75
|
+
}
|
|
76
|
+
|
|
77
77
|
if(tail != 0) {
|
|
78
|
+
#ifdef ASCEND_310P
|
|
79
|
+
for (size_t i = tail; i < elem_per_block; i++) {
|
|
80
|
+
output_local[len + i].SetValue(0, 0);
|
|
81
|
+
}
|
|
82
|
+
SetAtomicAdd<float>();
|
|
83
|
+
DataCopy(output_gm[offset + len], output_local[len], elem_per_block);
|
|
84
|
+
SetAtomicNone();
|
|
85
|
+
#else
|
|
78
86
|
DataCopyExtParams dataCopyParams;
|
|
79
87
|
dataCopyParams.blockCount = 1;
|
|
80
88
|
dataCopyParams.blockLen = tail * sizeof(float);
|
|
81
89
|
DataCopyPad(output_gm[offset + len], output_local[len],
|
|
82
90
|
dataCopyParams);
|
|
91
|
+
#endif
|
|
83
92
|
}
|
|
84
93
|
output_queue.FreeTensor(output_local);
|
|
85
94
|
}
|
|
@@ -144,6 +153,7 @@ class GET_ROW_F32 {
|
|
|
144
153
|
GlobalTensor<float> output_gm;
|
|
145
154
|
TQue<QuePosition::VECIN, BUFFER_NUM> input_queue;
|
|
146
155
|
TQue<QuePosition::VECOUT, BUFFER_NUM> output_queue;
|
|
156
|
+
int64_t op_block_idx;
|
|
147
157
|
};
|
|
148
158
|
|
|
149
159
|
template <typename T>
|
|
@@ -2,6 +2,15 @@
|
|
|
2
2
|
|
|
3
3
|
// optimize me. Use template to avoid copy code.
|
|
4
4
|
using namespace AscendC;
|
|
5
|
+
#ifdef ASCEND_310P // 310P does not support 4-bit get row
|
|
6
|
+
extern "C" __global__ __aicore__ void ascendc_get_row_q4_0(
|
|
7
|
+
GM_ADDR input_gm, GM_ADDR indices_gm, GM_ADDR output_gm,
|
|
8
|
+
GM_ADDR input_ne_gm, GM_ADDR indices_ne_gm, GM_ADDR indices_nb_gm,
|
|
9
|
+
GM_ADDR output_ne_gm, GM_ADDR output_nb_gm) {
|
|
10
|
+
// Only print an error message here so that the following test cases can continue to run. Of course, the test case that calls this operator will fail.
|
|
11
|
+
printf("Ascend310P not support 4bit get row.\n");
|
|
12
|
+
}
|
|
13
|
+
#else
|
|
5
14
|
|
|
6
15
|
#define BUFFER_NUM 2
|
|
7
16
|
|
|
@@ -191,3 +200,5 @@ extern "C" __global__ __aicore__ void ascendc_get_row_q4_0(
|
|
|
191
200
|
indices_nb_ub, output_ne_ub, output_nb_ub);
|
|
192
201
|
op.calculate();
|
|
193
202
|
}
|
|
203
|
+
|
|
204
|
+
#endif // #ifdef ASCEND_310P
|
|
@@ -1,6 +1,14 @@
|
|
|
1
1
|
#include "kernel_operator.h"
|
|
2
2
|
|
|
3
3
|
using namespace AscendC;
|
|
4
|
+
#ifdef ASCEND_310P
|
|
5
|
+
extern "C" __global__ __aicore__ void ascendc_quantize_f16_q8_0(
|
|
6
|
+
GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm,
|
|
7
|
+
GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) {
|
|
8
|
+
// Only print an error message here so that the following test cases can continue to run. Of course, the test case that calls this operator will fail.
|
|
9
|
+
printf("Ascend310P not support f16->8bit quantization.\n");
|
|
10
|
+
}
|
|
11
|
+
#else
|
|
4
12
|
|
|
5
13
|
#define BUFFER_NUM 2
|
|
6
14
|
#define QK8_0 32
|
|
@@ -206,3 +214,5 @@ extern "C" __global__ __aicore__ void ascendc_quantize_f16_q8_0(
|
|
|
206
214
|
op.init(input_gm, output_gm, input_ne_ub, input_nb_ub, output_ne_ub);
|
|
207
215
|
op.calculate();
|
|
208
216
|
}
|
|
217
|
+
|
|
218
|
+
#endif // #ifdef ASCEND_310P
|
|
@@ -1,6 +1,14 @@
|
|
|
1
1
|
#include "kernel_operator.h"
|
|
2
2
|
|
|
3
3
|
using namespace AscendC;
|
|
4
|
+
#ifdef ASCEND_310P // 310P does not support f32->8bit quantization
|
|
5
|
+
extern "C" __global__ __aicore__ void ascendc_quantize_f32_q8_0(
|
|
6
|
+
GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm,
|
|
7
|
+
GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) {
|
|
8
|
+
// Only print an error message here so that the following test cases can continue to run. Of course, the test case that calls this operator will fail.
|
|
9
|
+
printf("Ascend310P not support f32->8bit quantization.\n");
|
|
10
|
+
}
|
|
11
|
+
#else
|
|
4
12
|
|
|
5
13
|
#define BUFFER_NUM 2
|
|
6
14
|
#define QK8_0 32
|
|
@@ -204,3 +212,5 @@ extern "C" __global__ __aicore__ void ascendc_quantize_f32_q8_0(
|
|
|
204
212
|
op.init(input_gm, output_gm, input_ne_ub, input_nb_ub, output_ne_ub);
|
|
205
213
|
op.calculate();
|
|
206
214
|
}
|
|
215
|
+
|
|
216
|
+
#endif // #ifdef ASCEND_310P
|
|
@@ -1,6 +1,21 @@
|
|
|
1
1
|
#include "kernel_operator.h"
|
|
2
2
|
|
|
3
3
|
using namespace AscendC;
|
|
4
|
+
#ifdef ASCEND_310P // 310P does not support float->4bit quantization
|
|
5
|
+
extern "C" __global__ __aicore__ void ascendc_quantize_f32_to_q4_0(
|
|
6
|
+
GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm,
|
|
7
|
+
GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) {
|
|
8
|
+
// Only print an error message here so that the following test cases can continue to run. Of course, the test case that calls this operator will fail.
|
|
9
|
+
printf("Ascend310P not support f32->4bit quantization.\n");
|
|
10
|
+
}
|
|
11
|
+
|
|
12
|
+
extern "C" __global__ __aicore__ void ascendc_quantize_f16_to_q4_0(
|
|
13
|
+
GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm,
|
|
14
|
+
GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) {
|
|
15
|
+
// Only print an error message here so that the following test cases can continue to run. Of course, the test case that calls this operator will fail.
|
|
16
|
+
printf("Ascend310P not support f16->4bit quantization.\n");
|
|
17
|
+
}
|
|
18
|
+
#else
|
|
4
19
|
|
|
5
20
|
#define BUFFER_NUM 2
|
|
6
21
|
#define Group_Size 32
|
|
@@ -276,3 +291,5 @@ extern "C" __global__ __aicore__ void ascendc_quantize_f32_to_q4_0(
|
|
|
276
291
|
op.init(input_gm, output_gm, input_ne_ub, input_nb_ub, output_ne_ub);
|
|
277
292
|
op.calculate();
|
|
278
293
|
}
|
|
294
|
+
|
|
295
|
+
#endif // #ifdef ASCEND_310P
|
|
@@ -6,7 +6,20 @@
|
|
|
6
6
|
typedef uint16_t ggml_half;
|
|
7
7
|
typedef uint32_t ggml_half2;
|
|
8
8
|
|
|
9
|
-
#define
|
|
9
|
+
#define GGML_COMMON_AGGR_U
|
|
10
|
+
#define GGML_COMMON_AGGR_S
|
|
11
|
+
|
|
12
|
+
#define GGML_COMMON_DECL
|
|
13
|
+
#elif defined(GGML_COMMON_DECL_CPP)
|
|
14
|
+
#include <cstdint>
|
|
15
|
+
|
|
16
|
+
typedef uint16_t ggml_half;
|
|
17
|
+
typedef uint32_t ggml_half2;
|
|
18
|
+
|
|
19
|
+
// standard C++ allows anonymous unions, but some compilers warn about them
|
|
20
|
+
#define GGML_COMMON_AGGR_U data
|
|
21
|
+
// standard C++ does not allow anonymous structs.
|
|
22
|
+
#define GGML_COMMON_AGGR_S data
|
|
10
23
|
|
|
11
24
|
#define GGML_COMMON_DECL
|
|
12
25
|
#elif defined(GGML_COMMON_DECL_METAL)
|
|
@@ -15,7 +28,8 @@ typedef uint32_t ggml_half2;
|
|
|
15
28
|
typedef half ggml_half;
|
|
16
29
|
typedef half2 ggml_half2;
|
|
17
30
|
|
|
18
|
-
#define
|
|
31
|
+
#define GGML_COMMON_AGGR_U
|
|
32
|
+
#define GGML_COMMON_AGGR_S
|
|
19
33
|
|
|
20
34
|
#define GGML_COMMON_DECL
|
|
21
35
|
#elif defined(GGML_COMMON_DECL_CUDA)
|
|
@@ -29,7 +43,8 @@ typedef half2 ggml_half2;
|
|
|
29
43
|
typedef half ggml_half;
|
|
30
44
|
typedef half2 ggml_half2;
|
|
31
45
|
|
|
32
|
-
#define
|
|
46
|
+
#define GGML_COMMON_AGGR_U
|
|
47
|
+
#define GGML_COMMON_AGGR_S data
|
|
33
48
|
|
|
34
49
|
#define GGML_COMMON_DECL
|
|
35
50
|
#elif defined(GGML_COMMON_DECL_HIP)
|
|
@@ -39,7 +54,8 @@ typedef half2 ggml_half2;
|
|
|
39
54
|
typedef half ggml_half;
|
|
40
55
|
typedef half2 ggml_half2;
|
|
41
56
|
|
|
42
|
-
#define
|
|
57
|
+
#define GGML_COMMON_AGGR_U
|
|
58
|
+
#define GGML_COMMON_AGGR_S data
|
|
43
59
|
|
|
44
60
|
#define GGML_COMMON_DECL
|
|
45
61
|
#elif defined(GGML_COMMON_DECL_SYCL)
|
|
@@ -49,7 +65,8 @@ typedef half2 ggml_half2;
|
|
|
49
65
|
typedef sycl::half ggml_half;
|
|
50
66
|
typedef sycl::half2 ggml_half2;
|
|
51
67
|
|
|
52
|
-
#define
|
|
68
|
+
#define GGML_COMMON_AGGR_U
|
|
69
|
+
#define GGML_COMMON_AGGR_S data
|
|
53
70
|
|
|
54
71
|
#define GGML_COMMON_DECL
|
|
55
72
|
#endif
|
|
@@ -154,9 +171,9 @@ typedef struct {
|
|
|
154
171
|
struct {
|
|
155
172
|
ggml_half d; // delta
|
|
156
173
|
ggml_half m; // min
|
|
157
|
-
}
|
|
174
|
+
} GGML_COMMON_AGGR_S;
|
|
158
175
|
ggml_half2 dm;
|
|
159
|
-
};
|
|
176
|
+
} GGML_COMMON_AGGR_U;
|
|
160
177
|
uint8_t qs[QK4_1 / 2]; // nibbles / quants
|
|
161
178
|
} block_q4_1;
|
|
162
179
|
static_assert(sizeof(block_q4_1) == 2 * sizeof(ggml_half) + QK4_1 / 2, "wrong q4_1 block size/padding");
|
|
@@ -175,9 +192,9 @@ typedef struct {
|
|
|
175
192
|
struct {
|
|
176
193
|
ggml_half d; // delta
|
|
177
194
|
ggml_half m; // min
|
|
178
|
-
}
|
|
195
|
+
} GGML_COMMON_AGGR_S;
|
|
179
196
|
ggml_half2 dm;
|
|
180
|
-
};
|
|
197
|
+
} GGML_COMMON_AGGR_U;
|
|
181
198
|
uint8_t qh[4]; // 5-th bit of quants
|
|
182
199
|
uint8_t qs[QK5_1 / 2]; // nibbles / quants
|
|
183
200
|
} block_q5_1;
|
|
@@ -196,37 +213,13 @@ typedef struct {
|
|
|
196
213
|
struct {
|
|
197
214
|
ggml_half d; // delta
|
|
198
215
|
ggml_half s; // d * sum(qs[i])
|
|
199
|
-
}
|
|
216
|
+
} GGML_COMMON_AGGR_S;
|
|
200
217
|
ggml_half2 ds;
|
|
201
|
-
};
|
|
218
|
+
} GGML_COMMON_AGGR_U;
|
|
202
219
|
int8_t qs[QK8_1]; // quants
|
|
203
220
|
} block_q8_1;
|
|
204
221
|
static_assert(sizeof(block_q8_1) == 2*sizeof(ggml_half) + QK8_1, "wrong q8_1 block size/padding");
|
|
205
222
|
|
|
206
|
-
typedef struct {
|
|
207
|
-
ggml_half d[4]; // deltas for 4 q4_0 blocks
|
|
208
|
-
uint8_t qs[QK4_0 * 2]; // nibbles / quants for 4 q4_0 blocks
|
|
209
|
-
} block_q4_0x4;
|
|
210
|
-
static_assert(sizeof(block_q4_0x4) == 4 * sizeof(ggml_half) + QK4_0 * 2, "wrong q4_0x4 block size/padding");
|
|
211
|
-
|
|
212
|
-
typedef struct {
|
|
213
|
-
ggml_half d[8]; // deltas for 8 q4_0 blocks
|
|
214
|
-
uint8_t qs[QK4_0 * 4]; // nibbles / quants for 8 q4_0 blocks
|
|
215
|
-
} block_q4_0x8;
|
|
216
|
-
static_assert(sizeof(block_q4_0x8) == 8 * sizeof(ggml_half) + QK4_0 * 4, "wrong q4_0x8 block size/padding");
|
|
217
|
-
|
|
218
|
-
typedef struct {
|
|
219
|
-
ggml_half d[4]; // deltas for 4 q8_0 blocks
|
|
220
|
-
int8_t qs[QK8_0 * 4]; // quants for 4 q8_0 blocks
|
|
221
|
-
} block_q8_0x4;
|
|
222
|
-
static_assert(sizeof(block_q8_0x4) == 4 * sizeof(ggml_half) + QK8_0 * 4, "wrong q8_0x4 block size/padding");
|
|
223
|
-
|
|
224
|
-
typedef struct {
|
|
225
|
-
ggml_half d[8]; // deltas for 8 q8_0 blocks
|
|
226
|
-
int8_t qs[QK8_0 * 8]; // quants for 8 q8_0 blocks
|
|
227
|
-
} block_q8_0x8;
|
|
228
|
-
static_assert(sizeof(block_q8_0x8) == 8 * sizeof(ggml_half) + QK8_0 * 8, "wrong q8_0x8 block size/padding");
|
|
229
|
-
|
|
230
223
|
//
|
|
231
224
|
// Ternary quantization
|
|
232
225
|
//
|
|
@@ -261,9 +254,9 @@ typedef struct {
|
|
|
261
254
|
struct {
|
|
262
255
|
ggml_half d; // super-block scale for quantized scales
|
|
263
256
|
ggml_half dmin; // super-block scale for quantized mins
|
|
264
|
-
}
|
|
257
|
+
} GGML_COMMON_AGGR_S;
|
|
265
258
|
ggml_half2 dm;
|
|
266
|
-
};
|
|
259
|
+
} GGML_COMMON_AGGR_U;
|
|
267
260
|
} block_q2_K;
|
|
268
261
|
static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_half) + QK_K/16 + QK_K/4, "wrong q2_K block size/padding");
|
|
269
262
|
|
|
@@ -288,9 +281,9 @@ typedef struct {
|
|
|
288
281
|
struct {
|
|
289
282
|
ggml_half d; // super-block scale for quantized scales
|
|
290
283
|
ggml_half dmin; // super-block scale for quantized mins
|
|
291
|
-
}
|
|
284
|
+
} GGML_COMMON_AGGR_S;
|
|
292
285
|
ggml_half2 dm;
|
|
293
|
-
};
|
|
286
|
+
} GGML_COMMON_AGGR_U;
|
|
294
287
|
uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
|
|
295
288
|
uint8_t qs[QK_K/2]; // 4--bit quants
|
|
296
289
|
} block_q4_K;
|
|
@@ -305,9 +298,9 @@ typedef struct {
|
|
|
305
298
|
struct {
|
|
306
299
|
ggml_half d; // super-block scale for quantized scales
|
|
307
300
|
ggml_half dmin; // super-block scale for quantized mins
|
|
308
|
-
}
|
|
301
|
+
} GGML_COMMON_AGGR_S;
|
|
309
302
|
ggml_half2 dm;
|
|
310
|
-
};
|
|
303
|
+
} GGML_COMMON_AGGR_U;
|
|
311
304
|
uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
|
|
312
305
|
uint8_t qh[QK_K/8]; // quants, high bit
|
|
313
306
|
uint8_t qs[QK_K/2]; // quants, low 4 bits
|
|
@@ -431,6 +424,13 @@ static_assert(sizeof(block_iq4_xs) == sizeof(ggml_half) + sizeof(uint16_t) + QK_
|
|
|
431
424
|
#define GGML_TABLE_BEGIN(type, name, size) static const type name[size] = {
|
|
432
425
|
#define GGML_TABLE_END() };
|
|
433
426
|
|
|
427
|
+
#define GGML_COMMON_IMPL
|
|
428
|
+
#elif defined(GGML_COMMON_IMPL_CPP)
|
|
429
|
+
#include <cstdint>
|
|
430
|
+
|
|
431
|
+
#define GGML_TABLE_BEGIN(type, name, size) static const type name[size] = {
|
|
432
|
+
#define GGML_TABLE_END() };
|
|
433
|
+
|
|
434
434
|
#define GGML_COMMON_IMPL
|
|
435
435
|
#elif defined(GGML_COMMON_IMPL_METAL)
|
|
436
436
|
#include <metal_stdlib>
|
|
@@ -473,7 +473,7 @@ GGML_TABLE_BEGIN(uint8_t, ksigns_iq2xs, 128)
|
|
|
473
473
|
240, 113, 114, 243, 116, 245, 246, 119, 120, 249, 250, 123, 252, 125, 126, 255,
|
|
474
474
|
GGML_TABLE_END()
|
|
475
475
|
|
|
476
|
-
//#if __CUDA_ARCH__ >=
|
|
476
|
+
//#if __CUDA_ARCH__ >= GGML_CUDA_CC_DP4A // lowest compute capability for integer intrinsics
|
|
477
477
|
GGML_TABLE_BEGIN(uint64_t, ksigns64, 128)
|
|
478
478
|
0x0000000000000000, 0xff000000000000ff, 0xff0000000000ff00, 0x000000000000ffff,
|
|
479
479
|
0xff00000000ff0000, 0x0000000000ff00ff, 0x0000000000ffff00, 0xff00000000ffffff,
|