@fugood/llama.node 0.3.16 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +6 -1
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-cuda/arm64/llama-node.node +0 -0
- package/bin/linux-cuda/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/lib/binding.ts +44 -2
- package/lib/index.js +132 -1
- package/lib/index.ts +203 -3
- package/package.json +2 -1
- package/src/EmbeddingWorker.cpp +1 -1
- package/src/LlamaCompletionWorker.cpp +374 -19
- package/src/LlamaCompletionWorker.h +31 -10
- package/src/LlamaContext.cpp +216 -7
- package/src/LlamaContext.h +12 -0
- package/src/common.hpp +15 -0
- package/src/llama.cpp/.github/workflows/build-linux-cross.yml +233 -0
- package/src/llama.cpp/.github/workflows/build.yml +89 -767
- package/src/llama.cpp/.github/workflows/docker.yml +9 -6
- package/src/llama.cpp/.github/workflows/release.yml +716 -0
- package/src/llama.cpp/.github/workflows/server.yml +19 -23
- package/src/llama.cpp/CMakeLists.txt +11 -1
- package/src/llama.cpp/cmake/build-info.cmake +8 -2
- package/src/llama.cpp/cmake/x64-windows-llvm.cmake +0 -6
- package/src/llama.cpp/common/CMakeLists.txt +35 -4
- package/src/llama.cpp/common/arg.cpp +844 -121
- package/src/llama.cpp/common/arg.h +9 -0
- package/src/llama.cpp/common/chat.cpp +129 -107
- package/src/llama.cpp/common/chat.h +2 -0
- package/src/llama.cpp/common/common.cpp +64 -518
- package/src/llama.cpp/common/common.h +35 -45
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +3 -0
- package/src/llama.cpp/common/llguidance.cpp +31 -47
- package/src/llama.cpp/common/minja/chat-template.hpp +23 -11
- package/src/llama.cpp/common/minja/minja.hpp +186 -127
- package/src/llama.cpp/common/regex-partial.cpp +204 -0
- package/src/llama.cpp/common/regex-partial.h +56 -0
- package/src/llama.cpp/common/sampling.cpp +60 -50
- package/src/llama.cpp/docs/build.md +122 -7
- package/src/llama.cpp/examples/CMakeLists.txt +2 -32
- package/src/llama.cpp/examples/batched/batched.cpp +1 -1
- package/src/llama.cpp/examples/embedding/embedding.cpp +9 -12
- package/src/llama.cpp/examples/gritlm/gritlm.cpp +1 -1
- package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
- package/src/llama.cpp/examples/parallel/parallel.cpp +89 -15
- package/src/llama.cpp/examples/passkey/passkey.cpp +1 -1
- package/src/llama.cpp/examples/speculative/speculative.cpp +1 -1
- package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +1 -1
- package/src/llama.cpp/examples/sycl/build.sh +2 -2
- package/src/llama.cpp/examples/sycl/win-build-sycl.bat +2 -2
- package/src/llama.cpp/examples/training/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/training/finetune.cpp +96 -0
- package/src/llama.cpp/ggml/CMakeLists.txt +35 -2
- package/src/llama.cpp/ggml/cmake/GitVars.cmake +22 -0
- package/src/llama.cpp/ggml/include/ggml-backend.h +4 -4
- package/src/llama.cpp/ggml/include/ggml-cpp.h +1 -1
- package/src/llama.cpp/ggml/include/ggml-cpu.h +5 -0
- package/src/llama.cpp/ggml/include/ggml-opt.h +47 -28
- package/src/llama.cpp/ggml/include/ggml-rpc.h +6 -1
- package/src/llama.cpp/ggml/include/ggml.h +76 -106
- package/src/llama.cpp/ggml/src/CMakeLists.txt +11 -8
- package/src/llama.cpp/ggml/src/ggml-alloc.c +4 -1
- package/src/llama.cpp/ggml/src/ggml-backend.cpp +9 -5
- package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +0 -2
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +8 -4
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +5 -5
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +692 -1534
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +613 -122
- package/src/llama.cpp/ggml/src/ggml-cann/common.h +135 -1
- package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +507 -137
- package/src/llama.cpp/ggml/src/ggml-common.h +12 -6
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +66 -33
- package/src/llama.cpp/ggml/src/ggml-cpu/binary-ops.cpp +158 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/binary-ops.h +16 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/common.h +72 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +896 -194
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +2 -21
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +1060 -410
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1008 -13533
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +31 -16
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +90 -12
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +47 -13
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +266 -72
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +1034 -88
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +8796 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +110 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +892 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.cpp +186 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.h +28 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +252 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +802 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +23 -4
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +7 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +1 -0
- package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +0 -4
- package/src/llama.cpp/ggml/src/ggml-impl.h +52 -18
- package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +106 -14
- package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +67 -119
- package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +1023 -262
- package/src/llama.cpp/ggml/src/ggml-opt.cpp +368 -190
- package/src/llama.cpp/ggml/src/ggml-quants.c +0 -6
- package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +307 -40
- package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +125 -45
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +10 -8
- package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +239 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.hpp +39 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +0 -35
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +9 -307
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +72 -25
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +14 -7
- package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +59 -21
- package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +7 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +79 -90
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +944 -438
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +22 -23
- package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +37 -8
- package/src/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +24 -20
- package/src/llama.cpp/ggml/src/ggml-sycl/getrows.hpp +1 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +507 -411
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +84 -74
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +1 -3
- package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +185 -89
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +37 -49
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.hpp +7 -22
- package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +4 -14
- package/src/llama.cpp/ggml/src/ggml-sycl/quants.hpp +83 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +204 -118
- package/src/llama.cpp/ggml/src/ggml-sycl/rope.hpp +1 -3
- package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +128 -53
- package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +83 -49
- package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +1278 -282
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +32 -0
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +133 -30
- package/src/llama.cpp/ggml/src/ggml.c +170 -265
- package/src/llama.cpp/ggml/src/gguf.cpp +34 -33
- package/src/llama.cpp/include/llama.h +82 -22
- package/src/llama.cpp/models/ggml-vocab-llama4.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-llama4.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.out +46 -0
- package/src/llama.cpp/requirements/requirements-all.txt +5 -3
- package/src/llama.cpp/requirements/requirements-gguf_editor_gui.txt +3 -0
- package/src/llama.cpp/scripts/xxd.cmake +1 -1
- package/src/llama.cpp/src/CMakeLists.txt +4 -2
- package/src/llama.cpp/src/llama-adapter.cpp +43 -1
- package/src/llama.cpp/src/llama-arch.cpp +163 -17
- package/src/llama.cpp/src/llama-arch.h +16 -0
- package/src/llama.cpp/src/llama-batch.cpp +5 -1
- package/src/llama.cpp/src/llama-batch.h +2 -1
- package/src/llama.cpp/src/llama-chat.cpp +91 -16
- package/src/llama.cpp/src/llama-chat.h +7 -2
- package/src/llama.cpp/src/llama-context.cpp +479 -575
- package/src/llama.cpp/src/llama-context.h +44 -33
- package/src/llama.cpp/src/llama-cparams.h +1 -0
- package/src/llama.cpp/src/llama-graph.cpp +209 -157
- package/src/llama.cpp/src/llama-graph.h +38 -14
- package/src/llama.cpp/src/llama-hparams.h +13 -0
- package/src/llama.cpp/src/llama-kv-cache.cpp +1604 -543
- package/src/llama.cpp/src/llama-kv-cache.h +283 -171
- package/src/llama.cpp/src/llama-memory.h +12 -2
- package/src/llama.cpp/src/llama-mmap.cpp +1 -1
- package/src/llama.cpp/src/llama-model-loader.cpp +34 -20
- package/src/llama.cpp/src/llama-model-loader.h +5 -3
- package/src/llama.cpp/src/llama-model-saver.cpp +281 -0
- package/src/llama.cpp/src/llama-model-saver.h +37 -0
- package/src/llama.cpp/src/llama-model.cpp +1803 -330
- package/src/llama.cpp/src/llama-model.h +21 -2
- package/src/llama.cpp/src/llama-quant.cpp +33 -10
- package/src/llama.cpp/src/llama-sampling.cpp +25 -7
- package/src/llama.cpp/src/llama-vocab.cpp +86 -10
- package/src/llama.cpp/src/llama-vocab.h +6 -0
- package/src/llama.cpp/src/llama.cpp +15 -1
- package/src/llama.cpp/tests/CMakeLists.txt +52 -31
- package/src/llama.cpp/tests/test-arg-parser.cpp +51 -4
- package/src/llama.cpp/tests/test-backend-ops.cpp +189 -90
- package/src/llama.cpp/tests/test-chat-template.cpp +26 -6
- package/src/llama.cpp/tests/test-chat.cpp +15 -3
- package/src/llama.cpp/{examples/gbnf-validator/gbnf-validator.cpp → tests/test-gbnf-validator.cpp} +2 -2
- package/src/llama.cpp/tests/test-grammar-integration.cpp +3 -2
- package/src/llama.cpp/tests/test-grammar-llguidance.cpp +63 -2
- package/src/llama.cpp/tests/test-grammar-parser.cpp +3 -1
- package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +17 -1
- package/src/llama.cpp/tests/test-llama-grammar.cpp +2 -1
- package/src/llama.cpp/tests/test-mtmd-c-api.c +63 -0
- package/src/llama.cpp/tests/test-opt.cpp +33 -21
- package/src/llama.cpp/{examples/quantize-stats/quantize-stats.cpp → tests/test-quantize-stats.cpp} +3 -1
- package/src/llama.cpp/tests/test-regex-partial.cpp +288 -0
- package/src/llama.cpp/tests/test-sampling.cpp +1 -1
- package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +2 -1
- package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +2 -1
- package/src/llama.cpp/tools/CMakeLists.txt +39 -0
- package/src/llama.cpp/{examples → tools}/batched-bench/batched-bench.cpp +3 -3
- package/src/llama.cpp/{examples → tools}/export-lora/export-lora.cpp +1 -1
- package/src/llama.cpp/{examples → tools}/gguf-split/gguf-split.cpp +15 -16
- package/src/llama.cpp/{examples → tools}/imatrix/imatrix.cpp +11 -9
- package/src/llama.cpp/{examples → tools}/llama-bench/llama-bench.cpp +623 -274
- package/src/llama.cpp/{examples → tools}/main/main.cpp +22 -14
- package/src/llama.cpp/tools/mtmd/CMakeLists.txt +47 -0
- package/src/llama.cpp/tools/mtmd/clip-impl.h +365 -0
- package/src/llama.cpp/tools/mtmd/clip.cpp +3646 -0
- package/src/llama.cpp/tools/mtmd/clip.h +99 -0
- package/src/llama.cpp/tools/mtmd/deprecation-warning.cpp +22 -0
- package/src/llama.cpp/tools/mtmd/mtmd-cli.cpp +370 -0
- package/src/llama.cpp/tools/mtmd/mtmd-helper.cpp +310 -0
- package/src/llama.cpp/tools/mtmd/mtmd.cpp +678 -0
- package/src/llama.cpp/tools/mtmd/mtmd.h +331 -0
- package/src/llama.cpp/{examples → tools}/perplexity/perplexity.cpp +21 -5
- package/src/llama.cpp/{examples → tools}/quantize/quantize.cpp +53 -3
- package/src/llama.cpp/tools/rpc/CMakeLists.txt +4 -0
- package/src/llama.cpp/tools/rpc/rpc-server.cpp +322 -0
- package/src/llama.cpp/tools/run/CMakeLists.txt +16 -0
- package/src/llama.cpp/{examples → tools}/run/run.cpp +30 -30
- package/src/llama.cpp/{examples → tools}/server/CMakeLists.txt +2 -1
- package/src/llama.cpp/{examples → tools}/server/httplib.h +313 -247
- package/src/llama.cpp/{examples → tools}/server/server.cpp +529 -215
- package/src/llama.cpp/{examples → tools}/server/utils.hpp +427 -6
- package/src/llama.cpp/{examples → tools}/tts/tts.cpp +6 -9
- package/src/llama.cpp/cmake/arm64-windows-msvc.cmake +0 -6
- package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +0 -5
- package/src/llama.cpp/examples/infill/CMakeLists.txt +0 -5
- package/src/llama.cpp/examples/infill/infill.cpp +0 -590
- package/src/llama.cpp/examples/llava/CMakeLists.txt +0 -66
- package/src/llama.cpp/examples/llava/android/build_64.sh +0 -8
- package/src/llama.cpp/examples/llava/clip-quantize-cli.cpp +0 -59
- package/src/llama.cpp/examples/llava/clip.cpp +0 -3206
- package/src/llama.cpp/examples/llava/clip.h +0 -118
- package/src/llama.cpp/examples/llava/gemma3-cli.cpp +0 -341
- package/src/llama.cpp/examples/llava/llava-cli.cpp +0 -332
- package/src/llama.cpp/examples/llava/llava.cpp +0 -574
- package/src/llama.cpp/examples/llava/llava.h +0 -49
- package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +0 -354
- package/src/llama.cpp/examples/llava/qwen2vl-cli.cpp +0 -584
- package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +0 -6
- package/src/llama.cpp/examples/rpc/CMakeLists.txt +0 -2
- package/src/llama.cpp/examples/rpc/rpc-server.cpp +0 -171
- package/src/llama.cpp/examples/run/CMakeLists.txt +0 -5
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +0 -30
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +0 -19
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +0 -234
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +0 -197
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +0 -190
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +0 -204
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +0 -191
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +0 -218
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +0 -216
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +0 -295
- /package/src/llama.cpp/{examples → tools}/batched-bench/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/completions.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/cvector-generator.cpp +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/mean.hpp +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/negative.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/pca.hpp +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/positive.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/export-lora/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/gguf-split/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/imatrix/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/llama-bench/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/main/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples/llava → tools/mtmd}/requirements.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/perplexity/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/quantize/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/run/linenoise.cpp/linenoise.cpp +0 -0
- /package/src/llama.cpp/{examples → tools}/run/linenoise.cpp/linenoise.h +0 -0
- /package/src/llama.cpp/{examples → tools}/server/bench/requirements.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/server/tests/requirements.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/tokenize/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/tokenize/tokenize.cpp +0 -0
- /package/src/llama.cpp/{examples → tools}/tts/CMakeLists.txt +0 -0
|
@@ -1,190 +0,0 @@
|
|
|
1
|
-
#include "kernel_operator.h"
|
|
2
|
-
|
|
3
|
-
// optimize me. Use template to avoid copy code.
|
|
4
|
-
using namespace AscendC;
|
|
5
|
-
|
|
6
|
-
#define BUFFER_NUM 2
|
|
7
|
-
|
|
8
|
-
class GET_ROW_F32 {
|
|
9
|
-
public:
|
|
10
|
-
__aicore__ inline GET_ROW_F32() {}
|
|
11
|
-
__aicore__ inline void init(GM_ADDR input, GM_ADDR indices, GM_ADDR output,
|
|
12
|
-
int64_t *input_ne_ub, size_t *input_nb_ub,
|
|
13
|
-
int64_t *indices_ne_ub, size_t *indices_nb_ub,
|
|
14
|
-
int64_t *output_ne_ub, size_t *output_nb_ub) {
|
|
15
|
-
int64_t op_block_num = GetBlockNum();
|
|
16
|
-
op_block_idx = GetBlockIdx();
|
|
17
|
-
|
|
18
|
-
for (int i = 0; i < 4; i++) {
|
|
19
|
-
input_ne[i] = input_ne_ub[i];
|
|
20
|
-
input_stride[i] = input_nb_ub[i] / input_nb_ub[0];
|
|
21
|
-
|
|
22
|
-
indices_ne[i] = indices_ne_ub[i];
|
|
23
|
-
indices_stride[i] = indices_nb_ub[i] / indices_nb_ub[0];
|
|
24
|
-
|
|
25
|
-
output_ne[i] = output_ne_ub[i];
|
|
26
|
-
output_stride[i] = output_nb_ub[i] / output_nb_ub[0];
|
|
27
|
-
}
|
|
28
|
-
|
|
29
|
-
// Indices has two dims. n_elements = all rows should get.
|
|
30
|
-
// dr, all rows should this thread get.
|
|
31
|
-
uint64_t n_elements =
|
|
32
|
-
indices_ne[0] * indices_ne[1] * indices_ne[2] * indices_ne[3];
|
|
33
|
-
dr = n_elements / op_block_num;
|
|
34
|
-
|
|
35
|
-
uint64_t tails = n_elements % op_block_num;
|
|
36
|
-
if (op_block_idx < tails) {
|
|
37
|
-
dr += 1;
|
|
38
|
-
ir = dr * op_block_idx;
|
|
39
|
-
} else {
|
|
40
|
-
ir = dr * op_block_idx + tails;
|
|
41
|
-
}
|
|
42
|
-
|
|
43
|
-
input_gm.SetGlobalBuffer((__gm__ float *)input);
|
|
44
|
-
indices_gm.SetGlobalBuffer((__gm__ int32_t *)indices);
|
|
45
|
-
output_gm.SetGlobalBuffer((__gm__ float *)output);
|
|
46
|
-
|
|
47
|
-
uint64_t local_buffer_size = ((input_ne[0] * sizeof(float) + 31) & ~31);
|
|
48
|
-
local_buffer_elems = local_buffer_size / sizeof(float);
|
|
49
|
-
|
|
50
|
-
// TODO, consider long row that can't put in UB.
|
|
51
|
-
// All data should asign to 32. It's ok because all data is align to 32.
|
|
52
|
-
pipe.InitBuffer(input_queue, BUFFER_NUM, local_buffer_size);
|
|
53
|
-
pipe.InitBuffer(output_queue, BUFFER_NUM, local_buffer_size);
|
|
54
|
-
}
|
|
55
|
-
|
|
56
|
-
__aicore__ inline void copy_in(uint32_t offset, size_t len) {
|
|
57
|
-
LocalTensor<float> input_local = input_queue.AllocTensor<float>();
|
|
58
|
-
const size_t elem_per_block = 32 / sizeof(float);
|
|
59
|
-
size_t tail = len % elem_per_block;
|
|
60
|
-
len = len & ~(elem_per_block - 1);
|
|
61
|
-
if(tail != 0) {
|
|
62
|
-
len += elem_per_block;
|
|
63
|
-
}
|
|
64
|
-
DataCopy(input_local, input_gm[offset], len);
|
|
65
|
-
input_queue.EnQue(input_local);
|
|
66
|
-
}
|
|
67
|
-
|
|
68
|
-
__aicore__ inline void copy_out(uint32_t offset, size_t len) {
|
|
69
|
-
LocalTensor<float> output_local = output_queue.DeQue<float>();
|
|
70
|
-
const size_t elem_per_block = 32 / sizeof(float);
|
|
71
|
-
size_t tail = len % elem_per_block;
|
|
72
|
-
len = len & ~(elem_per_block - 1);
|
|
73
|
-
if (len > 0) {
|
|
74
|
-
DataCopy(output_gm[offset], output_local, len);
|
|
75
|
-
}
|
|
76
|
-
|
|
77
|
-
if(tail != 0) {
|
|
78
|
-
#ifdef ASCEND_310P
|
|
79
|
-
for (size_t i = tail; i < elem_per_block; i++) {
|
|
80
|
-
output_local[len + i].SetValue(0, 0);
|
|
81
|
-
}
|
|
82
|
-
SetAtomicAdd<float>();
|
|
83
|
-
DataCopy(output_gm[offset + len], output_local[len], elem_per_block);
|
|
84
|
-
SetAtomicNone();
|
|
85
|
-
#else
|
|
86
|
-
DataCopyExtParams dataCopyParams;
|
|
87
|
-
dataCopyParams.blockCount = 1;
|
|
88
|
-
dataCopyParams.blockLen = tail * sizeof(float);
|
|
89
|
-
DataCopyPad(output_gm[offset + len], output_local[len],
|
|
90
|
-
dataCopyParams);
|
|
91
|
-
#endif
|
|
92
|
-
}
|
|
93
|
-
output_queue.FreeTensor(output_local);
|
|
94
|
-
}
|
|
95
|
-
|
|
96
|
-
__aicore__ inline void calculate_row(int64_t idx) {
|
|
97
|
-
const int64_t indices_ne2_idx = idx / (indices_ne[0] * indices_ne[1]);
|
|
98
|
-
const int64_t indices_ne1_idx =
|
|
99
|
-
(idx - indices_ne2_idx * indices_ne[0] * indices_ne[1]) /
|
|
100
|
-
indices_ne[0];
|
|
101
|
-
const int64_t indices_ne0_idx =
|
|
102
|
-
(idx - indices_ne2_idx * indices_ne[0] * indices_ne[1] -
|
|
103
|
-
indices_ne1_idx * indices_ne[0]);
|
|
104
|
-
|
|
105
|
-
const int64_t indices_offset = indices_ne0_idx * indices_stride[0] +
|
|
106
|
-
indices_ne1_idx * indices_stride[1] +
|
|
107
|
-
indices_ne2_idx * indices_stride[2];
|
|
108
|
-
const int32_t selected_row_idx = indices_gm.GetValue(indices_offset);
|
|
109
|
-
|
|
110
|
-
const int64_t input_offset = selected_row_idx * input_stride[1] +
|
|
111
|
-
indices_ne1_idx * input_stride[2] +
|
|
112
|
-
indices_ne2_idx * input_stride[3];
|
|
113
|
-
|
|
114
|
-
const int64_t output_offset = indices_ne0_idx * output_stride[1] +
|
|
115
|
-
indices_ne1_idx * output_stride[2] +
|
|
116
|
-
indices_ne2_idx * output_stride[3];
|
|
117
|
-
|
|
118
|
-
copy_in(input_offset, input_ne[0]);
|
|
119
|
-
LocalTensor<float> input_local = input_queue.DeQue<float>();
|
|
120
|
-
LocalTensor<float> output_local = output_queue.AllocTensor<float>();
|
|
121
|
-
|
|
122
|
-
DataCopy(output_local, input_local, local_buffer_elems);
|
|
123
|
-
output_queue.EnQue(output_local);
|
|
124
|
-
copy_out(output_offset, input_ne[0]);
|
|
125
|
-
|
|
126
|
-
input_queue.FreeTensor(input_local);
|
|
127
|
-
}
|
|
128
|
-
|
|
129
|
-
__aicore__ inline void calculate() {
|
|
130
|
-
for (int64_t i = ir; i < ir + dr; i++) {
|
|
131
|
-
calculate_row(i);
|
|
132
|
-
}
|
|
133
|
-
}
|
|
134
|
-
|
|
135
|
-
private:
|
|
136
|
-
int64_t input_ne[4];
|
|
137
|
-
size_t input_stride[4];
|
|
138
|
-
|
|
139
|
-
int64_t indices_ne[4];
|
|
140
|
-
size_t indices_stride[4];
|
|
141
|
-
|
|
142
|
-
int64_t output_ne[4];
|
|
143
|
-
size_t output_stride[4];
|
|
144
|
-
|
|
145
|
-
size_t local_buffer_elems;
|
|
146
|
-
|
|
147
|
-
int64_t ir;
|
|
148
|
-
int64_t dr;
|
|
149
|
-
|
|
150
|
-
TPipe pipe;
|
|
151
|
-
GlobalTensor<float> input_gm;
|
|
152
|
-
GlobalTensor<int32_t> indices_gm;
|
|
153
|
-
GlobalTensor<float> output_gm;
|
|
154
|
-
TQue<QuePosition::VECIN, BUFFER_NUM> input_queue;
|
|
155
|
-
TQue<QuePosition::VECOUT, BUFFER_NUM> output_queue;
|
|
156
|
-
int64_t op_block_idx;
|
|
157
|
-
};
|
|
158
|
-
|
|
159
|
-
template <typename T>
|
|
160
|
-
__aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) {
|
|
161
|
-
auto gm_ptr = (__gm__ uint8_t *)gm;
|
|
162
|
-
auto ub_ptr = (uint8_t *)(ub);
|
|
163
|
-
for (int32_t i = 0; i < size; ++i, ++ub_ptr, ++gm_ptr) {
|
|
164
|
-
*ub_ptr = *gm_ptr;
|
|
165
|
-
}
|
|
166
|
-
}
|
|
167
|
-
|
|
168
|
-
extern "C" __global__ __aicore__ void ascendc_get_row_f32(
|
|
169
|
-
GM_ADDR input_gm, GM_ADDR indices_gm, GM_ADDR output_gm,
|
|
170
|
-
GM_ADDR input_ne_gm, GM_ADDR input_nb_gm, GM_ADDR indices_ne_gm,
|
|
171
|
-
GM_ADDR indices_nb_gm, GM_ADDR output_ne_gm, GM_ADDR output_nb_gm) {
|
|
172
|
-
int64_t input_ne_ub[4];
|
|
173
|
-
size_t input_nb_ub[4];
|
|
174
|
-
int64_t indices_ne_ub[4];
|
|
175
|
-
size_t indices_nb_ub[4];
|
|
176
|
-
int64_t output_ne_ub[4];
|
|
177
|
-
size_t output_nb_ub[4];
|
|
178
|
-
|
|
179
|
-
copy_to_ub(input_ne_gm, input_ne_ub, 32);
|
|
180
|
-
copy_to_ub(input_nb_gm, input_nb_ub, 32);
|
|
181
|
-
copy_to_ub(indices_ne_gm, indices_ne_ub, 32);
|
|
182
|
-
copy_to_ub(indices_nb_gm, indices_nb_ub, 32);
|
|
183
|
-
copy_to_ub(output_ne_gm, output_ne_ub, 32);
|
|
184
|
-
copy_to_ub(output_nb_gm, output_nb_ub, 32);
|
|
185
|
-
|
|
186
|
-
GET_ROW_F32 op;
|
|
187
|
-
op.init(input_gm, indices_gm, output_gm, input_ne_ub, input_nb_ub,
|
|
188
|
-
indices_ne_ub, indices_nb_ub, output_ne_ub, output_nb_ub);
|
|
189
|
-
op.calculate();
|
|
190
|
-
}
|
|
@@ -1,204 +0,0 @@
|
|
|
1
|
-
#include "kernel_operator.h"
|
|
2
|
-
|
|
3
|
-
// optimize me. Use template to avoid copy code.
|
|
4
|
-
using namespace AscendC;
|
|
5
|
-
#ifdef ASCEND_310P // 310P not support 4bit get row
|
|
6
|
-
extern "C" __global__ __aicore__ void ascendc_get_row_q4_0(
|
|
7
|
-
GM_ADDR input_gm, GM_ADDR indices_gm, GM_ADDR output_gm,
|
|
8
|
-
GM_ADDR input_ne_gm, GM_ADDR indices_ne_gm, GM_ADDR indices_nb_gm,
|
|
9
|
-
GM_ADDR output_ne_gm, GM_ADDR output_nb_gm) {
|
|
10
|
-
// let following test cases can continue run, here just print error information. Of Cource the test case that call this operator is failed.
|
|
11
|
-
printf("Ascend310P not support 4bit get row.\n");
|
|
12
|
-
}
|
|
13
|
-
#else
|
|
14
|
-
|
|
15
|
-
#define BUFFER_NUM 2
|
|
16
|
-
|
|
17
|
-
#define QK4_0 32
|
|
18
|
-
|
|
19
|
-
class GET_ROW_Q4_0 {
|
|
20
|
-
public:
|
|
21
|
-
__aicore__ inline GET_ROW_Q4_0() {}
|
|
22
|
-
__aicore__ inline void init(GM_ADDR input, GM_ADDR indices, GM_ADDR output,
|
|
23
|
-
int64_t *input_ne_ub, int64_t *indices_ne_ub,
|
|
24
|
-
size_t *indices_nb_ub, int64_t *output_ne_ub,
|
|
25
|
-
size_t *output_nb_ub) {
|
|
26
|
-
int64_t op_block_num = GetBlockNum();
|
|
27
|
-
int64_t op_block_idx = GetBlockIdx();
|
|
28
|
-
|
|
29
|
-
for (int i = 0; i < 4; i++) {
|
|
30
|
-
input_ne[i] = input_ne_ub[i];
|
|
31
|
-
indices_ne[i] = indices_ne_ub[i];
|
|
32
|
-
indices_stride[i] = indices_nb_ub[i] / indices_nb_ub[0];
|
|
33
|
-
scale_ne[i] = input_ne_ub[i];
|
|
34
|
-
output_ne[i] = output_ne_ub[i];
|
|
35
|
-
output_stride[i] = output_nb_ub[i] / output_nb_ub[0];
|
|
36
|
-
}
|
|
37
|
-
|
|
38
|
-
// one scale for a group.
|
|
39
|
-
scale_ne[0] /= QK4_0;
|
|
40
|
-
|
|
41
|
-
input_stride[0] = 1;
|
|
42
|
-
scale_stride[0] = 1;
|
|
43
|
-
output_stride[0] = 1;
|
|
44
|
-
for (int i = 1; i < 4; i++) {
|
|
45
|
-
input_stride[i] = input_stride[i - 1] * input_ne[i - 1];
|
|
46
|
-
scale_stride[i] = scale_stride[i - 1] * scale_ne[i - 1];
|
|
47
|
-
}
|
|
48
|
-
|
|
49
|
-
group_size_in_row = input_ne[0] / QK4_0;
|
|
50
|
-
int64_t scale_offset = input_ne[0] * input_ne[1] * input_ne[2] *
|
|
51
|
-
input_ne[3] / 2;
|
|
52
|
-
|
|
53
|
-
// Indices has two dims. n_elements = all rows should get.
|
|
54
|
-
// dr, all rows should this thread get.
|
|
55
|
-
uint64_t n_elements =
|
|
56
|
-
indices_ne[0] * indices_ne[1] * indices_ne[2] * indices_ne[3];
|
|
57
|
-
dr = n_elements / op_block_num;
|
|
58
|
-
|
|
59
|
-
uint64_t tails = n_elements % op_block_num;
|
|
60
|
-
if (op_block_idx < tails) {
|
|
61
|
-
dr += 1;
|
|
62
|
-
ir = dr * op_block_idx;
|
|
63
|
-
} else {
|
|
64
|
-
ir = dr * op_block_idx + tails;
|
|
65
|
-
}
|
|
66
|
-
|
|
67
|
-
input_gm.SetGlobalBuffer((__gm__ int4b_t *)input);
|
|
68
|
-
scale_gm.SetGlobalBuffer((__gm__ half *)(input + scale_offset));
|
|
69
|
-
indices_gm.SetGlobalBuffer((__gm__ int32_t *)indices);
|
|
70
|
-
output_gm.SetGlobalBuffer((__gm__ float *)output);
|
|
71
|
-
|
|
72
|
-
pipe.InitBuffer(input_queue, BUFFER_NUM, QK4_0 * sizeof(int4b_t));
|
|
73
|
-
pipe.InitBuffer(cast_queue, BUFFER_NUM, QK4_0 * sizeof(half));
|
|
74
|
-
pipe.InitBuffer(output_queue, BUFFER_NUM, QK4_0 * sizeof(float));
|
|
75
|
-
}
|
|
76
|
-
|
|
77
|
-
__aicore__ inline void copy_in(uint32_t offset) {
|
|
78
|
-
LocalTensor<int4b_t> input_local = input_queue.AllocTensor<int4b_t>();
|
|
79
|
-
// 32 * sizeof(int4b_t) = 16, which is not aligned to 32, why no error?
|
|
80
|
-
DataCopy(input_local, input_gm[offset], QK4_0);
|
|
81
|
-
input_queue.EnQue(input_local);
|
|
82
|
-
}
|
|
83
|
-
|
|
84
|
-
__aicore__ inline void copy_out(uint32_t offset) {
|
|
85
|
-
LocalTensor<float> output_local = output_queue.DeQue<float>();
|
|
86
|
-
DataCopy(output_gm[offset], output_local, QK4_0);
|
|
87
|
-
output_queue.FreeTensor(output_local);
|
|
88
|
-
}
|
|
89
|
-
|
|
90
|
-
__aicore__ inline void calculate_group(int64_t idx, int64_t group) {
|
|
91
|
-
const int64_t indices_ne2_idx = idx / (indices_ne[0] * indices_ne[1]);
|
|
92
|
-
const int64_t indices_ne1_idx =
|
|
93
|
-
(idx - indices_ne2_idx * indices_ne[0] * indices_ne[1]) /
|
|
94
|
-
indices_ne[0];
|
|
95
|
-
const int64_t indices_ne0_idx =
|
|
96
|
-
(idx - indices_ne2_idx * indices_ne[0] * indices_ne[1] -
|
|
97
|
-
indices_ne1_idx * indices_ne[0]);
|
|
98
|
-
|
|
99
|
-
const int64_t indices_offset = indices_ne0_idx * indices_stride[0] +
|
|
100
|
-
indices_ne1_idx * indices_stride[1] +
|
|
101
|
-
indices_ne2_idx * indices_stride[2];
|
|
102
|
-
const int32_t selected_row_idx = indices_gm.GetValue(indices_offset);
|
|
103
|
-
|
|
104
|
-
const int64_t input_offset = selected_row_idx * input_stride[1] +
|
|
105
|
-
indices_ne1_idx * input_stride[2] +
|
|
106
|
-
indices_ne2_idx * input_stride[3] +
|
|
107
|
-
group * QK4_0;
|
|
108
|
-
const int64_t scale_offset = selected_row_idx * scale_stride[1] +
|
|
109
|
-
indices_ne1_idx * scale_stride[2] +
|
|
110
|
-
indices_ne2_idx * scale_stride[3] + group;
|
|
111
|
-
const int64_t output_offset = indices_ne0_idx * output_stride[1] +
|
|
112
|
-
indices_ne1_idx * output_stride[2] +
|
|
113
|
-
indices_ne2_idx * output_stride[3] +
|
|
114
|
-
group * QK4_0;
|
|
115
|
-
|
|
116
|
-
copy_in(input_offset);
|
|
117
|
-
LocalTensor<int4b_t> input_local = input_queue.DeQue<int4b_t>();
|
|
118
|
-
LocalTensor<half> cast_local = cast_queue.AllocTensor<half>();
|
|
119
|
-
LocalTensor<float> output_local = output_queue.AllocTensor<float>();
|
|
120
|
-
|
|
121
|
-
// TODO: cast more data to speed up.
|
|
122
|
-
Cast(cast_local, input_local, RoundMode::CAST_NONE, QK4_0);
|
|
123
|
-
Cast(output_local, cast_local, RoundMode::CAST_NONE, QK4_0);
|
|
124
|
-
|
|
125
|
-
// Only mul need compile by group.
|
|
126
|
-
half scale = scale_gm.GetValue(scale_offset);
|
|
127
|
-
|
|
128
|
-
Muls(output_local, output_local, (float)scale, QK4_0);
|
|
129
|
-
|
|
130
|
-
input_queue.FreeTensor(input_local);
|
|
131
|
-
cast_queue.FreeTensor(cast_local);
|
|
132
|
-
output_queue.EnQue(output_local);
|
|
133
|
-
|
|
134
|
-
copy_out(output_offset);
|
|
135
|
-
}
|
|
136
|
-
|
|
137
|
-
__aicore__ inline void calculate() {
|
|
138
|
-
for (int64_t i = ir; i < ir + dr; i++) {
|
|
139
|
-
for (int64_t j = 0; j < group_size_in_row; j++) {
|
|
140
|
-
calculate_group(i, j);
|
|
141
|
-
}
|
|
142
|
-
}
|
|
143
|
-
}
|
|
144
|
-
|
|
145
|
-
private:
|
|
146
|
-
int64_t input_ne[4];
|
|
147
|
-
size_t input_stride[4];
|
|
148
|
-
|
|
149
|
-
int64_t scale_ne[4];
|
|
150
|
-
size_t scale_stride[4];
|
|
151
|
-
|
|
152
|
-
int64_t indices_ne[4];
|
|
153
|
-
size_t indices_stride[4];
|
|
154
|
-
|
|
155
|
-
int64_t output_ne[4];
|
|
156
|
-
size_t output_stride[4];
|
|
157
|
-
|
|
158
|
-
int64_t ir;
|
|
159
|
-
int64_t dr;
|
|
160
|
-
|
|
161
|
-
int64_t group_size_in_row;
|
|
162
|
-
|
|
163
|
-
TPipe pipe;
|
|
164
|
-
GlobalTensor<int4b_t> input_gm;
|
|
165
|
-
GlobalTensor<half> scale_gm;
|
|
166
|
-
GlobalTensor<int32_t> indices_gm;
|
|
167
|
-
GlobalTensor<float> output_gm;
|
|
168
|
-
TQue<QuePosition::VECIN, BUFFER_NUM> input_queue;
|
|
169
|
-
TQue<QuePosition::VECOUT, BUFFER_NUM> output_queue;
|
|
170
|
-
TQue<QuePosition::VECIN, BUFFER_NUM> cast_queue;
|
|
171
|
-
};
|
|
172
|
-
|
|
173
|
-
template <typename T>
|
|
174
|
-
__aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) {
|
|
175
|
-
auto gm_ptr = (__gm__ uint8_t *)gm;
|
|
176
|
-
auto ub_ptr = (uint8_t *)(ub);
|
|
177
|
-
for (int32_t i = 0; i < size; ++i, ++ub_ptr, ++gm_ptr) {
|
|
178
|
-
*ub_ptr = *gm_ptr;
|
|
179
|
-
}
|
|
180
|
-
}
|
|
181
|
-
|
|
182
|
-
// Kernel entry point for the Q4_0 get-row operation.
//
// The five metadata arguments (`*_ne_gm`, `*_nb_gm`) are first copied into
// local 4-element arrays with copy_to_ub(); a GET_ROW_Q4_0 object is then
// initialised with the data addresses plus those arrays and run via
// calculate().
extern "C" __global__ __aicore__ void ascendc_get_row_q4_0(
    GM_ADDR input_gm, GM_ADDR indices_gm, GM_ADDR output_gm,
    GM_ADDR input_ne_gm, GM_ADDR indices_ne_gm, GM_ADDR indices_nb_gm,
    GM_ADDR output_ne_gm, GM_ADDR output_nb_gm) {
    // Every metadata array is transferred as 32 bytes (four 8-byte entries).
    // NOTE(review): the size_t arrays rely on size_t being 8 bytes on the
    // target - confirm.
    const size_t meta_bytes = 32;

    int64_t src_ne[4];
    copy_to_ub(input_ne_gm, src_ne, meta_bytes);

    int64_t idx_ne[4];
    copy_to_ub(indices_ne_gm, idx_ne, meta_bytes);

    size_t idx_nb[4];
    copy_to_ub(indices_nb_gm, idx_nb, meta_bytes);

    int64_t dst_ne[4];
    copy_to_ub(output_ne_gm, dst_ne, meta_bytes);

    size_t dst_nb[4];
    copy_to_ub(output_nb_gm, dst_nb, meta_bytes);

    GET_ROW_Q4_0 kernel;
    kernel.init(input_gm, indices_gm, output_gm, src_ne, idx_ne, idx_nb,
                dst_ne, dst_nb);
    kernel.calculate();
}
|
|
203
|
-
|
|
204
|
-
#endif // #ifdef ASCEND_310P
|
|
@@ -1,191 +0,0 @@
|
|
|
1
|
-
#include "kernel_operator.h"
|
|
2
|
-
|
|
3
|
-
// TODO: optimize. Use a template to avoid duplicating code.
|
|
4
|
-
using namespace AscendC;
|
|
5
|
-
|
|
6
|
-
#define BUFFER_NUM 2
|
|
7
|
-
|
|
8
|
-
#define QK8_0 32
|
|
9
|
-
|
|
10
|
-
class GET_ROW_Q8_0 {
|
|
11
|
-
public:
|
|
12
|
-
__aicore__ inline GET_ROW_Q8_0() {}
|
|
13
|
-
__aicore__ inline void init(GM_ADDR input, GM_ADDR indices, GM_ADDR output,
|
|
14
|
-
int64_t *input_ne_ub, int64_t *indices_ne_ub,
|
|
15
|
-
size_t *indices_nb_ub, int64_t *output_ne_ub,
|
|
16
|
-
size_t *output_nb_ub) {
|
|
17
|
-
int64_t op_block_num = GetBlockNum();
|
|
18
|
-
int64_t op_block_idx = GetBlockIdx();
|
|
19
|
-
|
|
20
|
-
for (int i = 0; i < 4; i++) {
|
|
21
|
-
input_ne[i] = input_ne_ub[i];
|
|
22
|
-
indices_ne[i] = indices_ne_ub[i];
|
|
23
|
-
indices_stride[i] = indices_nb_ub[i] / indices_nb_ub[0];
|
|
24
|
-
scale_ne[i] = input_ne_ub[i];
|
|
25
|
-
output_ne[i] = output_ne_ub[i];
|
|
26
|
-
output_stride[i] = output_nb_ub[i] / output_nb_ub[0];
|
|
27
|
-
}
|
|
28
|
-
|
|
29
|
-
// one scale for a group.
|
|
30
|
-
scale_ne[0] /= QK8_0;
|
|
31
|
-
|
|
32
|
-
input_stride[0] = 1;
|
|
33
|
-
scale_stride[0] = 1;
|
|
34
|
-
output_stride[0] = 1;
|
|
35
|
-
for (int i = 1; i < 4; i++) {
|
|
36
|
-
input_stride[i] = input_stride[i - 1] * input_ne[i - 1];
|
|
37
|
-
scale_stride[i] = scale_stride[i - 1] * scale_ne[i - 1];
|
|
38
|
-
}
|
|
39
|
-
|
|
40
|
-
group_size_in_row = input_ne[0] / QK8_0;
|
|
41
|
-
int64_t scale_offset = input_ne[0] * input_ne[1] * input_ne[2] *
|
|
42
|
-
input_ne[3] * sizeof(int8_t);
|
|
43
|
-
|
|
44
|
-
// Indices has two dims. n_elements = all rows should get.
|
|
45
|
-
// dr, all rows should this thread get.
|
|
46
|
-
uint64_t n_elements =
|
|
47
|
-
indices_ne[0] * indices_ne[1] * indices_ne[2] * indices_ne[3];
|
|
48
|
-
dr = n_elements / op_block_num;
|
|
49
|
-
|
|
50
|
-
uint64_t tails = n_elements % op_block_num;
|
|
51
|
-
if (op_block_idx < tails) {
|
|
52
|
-
dr += 1;
|
|
53
|
-
ir = dr * op_block_idx;
|
|
54
|
-
} else {
|
|
55
|
-
ir = dr * op_block_idx + tails;
|
|
56
|
-
}
|
|
57
|
-
|
|
58
|
-
input_gm.SetGlobalBuffer((__gm__ int8_t *)input);
|
|
59
|
-
scale_gm.SetGlobalBuffer((__gm__ half *)(input + scale_offset));
|
|
60
|
-
indices_gm.SetGlobalBuffer((__gm__ int32_t *)indices);
|
|
61
|
-
output_gm.SetGlobalBuffer((__gm__ float *)output);
|
|
62
|
-
|
|
63
|
-
pipe.InitBuffer(input_queue, BUFFER_NUM, QK8_0 * sizeof(int8_t));
|
|
64
|
-
pipe.InitBuffer(cast_queue, BUFFER_NUM, QK8_0 * sizeof(half));
|
|
65
|
-
pipe.InitBuffer(output_queue, BUFFER_NUM, QK8_0 * sizeof(float));
|
|
66
|
-
}
|
|
67
|
-
|
|
68
|
-
__aicore__ inline void copy_in(uint32_t offset) {
|
|
69
|
-
LocalTensor<int8_t> input_local = input_queue.AllocTensor<int8_t>();
|
|
70
|
-
DataCopy(input_local, input_gm[offset], QK8_0);
|
|
71
|
-
input_queue.EnQue(input_local);
|
|
72
|
-
}
|
|
73
|
-
|
|
74
|
-
__aicore__ inline void copy_out(uint32_t offset) {
|
|
75
|
-
LocalTensor<float> output_local = output_queue.DeQue<float>();
|
|
76
|
-
DataCopy(output_gm[offset], output_local, QK8_0);
|
|
77
|
-
output_queue.FreeTensor(output_local);
|
|
78
|
-
}
|
|
79
|
-
|
|
80
|
-
__aicore__ inline void calculate_group(int64_t idx, int64_t group) {
|
|
81
|
-
const int64_t indices_ne2_idx = idx / (indices_ne[0] * indices_ne[1]);
|
|
82
|
-
const int64_t indices_ne1_idx =
|
|
83
|
-
(idx - indices_ne2_idx * indices_ne[0] * indices_ne[1]) /
|
|
84
|
-
indices_ne[0];
|
|
85
|
-
const int64_t indices_ne0_idx =
|
|
86
|
-
(idx - indices_ne2_idx * indices_ne[0] * indices_ne[1] -
|
|
87
|
-
indices_ne1_idx * indices_ne[0]);
|
|
88
|
-
|
|
89
|
-
const int64_t indices_offset = indices_ne0_idx * indices_stride[0] +
|
|
90
|
-
indices_ne1_idx * indices_stride[1] +
|
|
91
|
-
indices_ne2_idx * indices_stride[2];
|
|
92
|
-
const int32_t selected_row_idx = indices_gm.GetValue(indices_offset);
|
|
93
|
-
|
|
94
|
-
const int64_t input_offset = selected_row_idx * input_stride[1] +
|
|
95
|
-
indices_ne1_idx * input_stride[2] +
|
|
96
|
-
indices_ne2_idx * input_stride[3] +
|
|
97
|
-
group * QK8_0;
|
|
98
|
-
const int64_t scale_offset = selected_row_idx * scale_stride[1] +
|
|
99
|
-
indices_ne1_idx * scale_stride[2] +
|
|
100
|
-
indices_ne2_idx * scale_stride[3] + group;
|
|
101
|
-
const int64_t output_offset = indices_ne0_idx * output_stride[1] +
|
|
102
|
-
indices_ne1_idx * output_stride[2] +
|
|
103
|
-
indices_ne2_idx * output_stride[3] +
|
|
104
|
-
group * QK8_0;
|
|
105
|
-
|
|
106
|
-
copy_in(input_offset);
|
|
107
|
-
LocalTensor<int8_t> input_local = input_queue.DeQue<int8_t>();
|
|
108
|
-
LocalTensor<half> cast_local = cast_queue.AllocTensor<half>();
|
|
109
|
-
LocalTensor<float> output_local = output_queue.AllocTensor<float>();
|
|
110
|
-
|
|
111
|
-
// TODO: cast more data to speed up.
|
|
112
|
-
Cast(cast_local, input_local, RoundMode::CAST_NONE, QK8_0);
|
|
113
|
-
Cast(output_local, cast_local, RoundMode::CAST_NONE, QK8_0);
|
|
114
|
-
|
|
115
|
-
// Only mul need compile by group.
|
|
116
|
-
half scale = scale_gm.GetValue(scale_offset);
|
|
117
|
-
Muls(output_local, output_local, (float)scale, QK8_0);
|
|
118
|
-
|
|
119
|
-
input_queue.FreeTensor(input_local);
|
|
120
|
-
cast_queue.FreeTensor(cast_local);
|
|
121
|
-
output_queue.EnQue(output_local);
|
|
122
|
-
|
|
123
|
-
copy_out(output_offset);
|
|
124
|
-
}
|
|
125
|
-
|
|
126
|
-
__aicore__ inline void calculate() {
|
|
127
|
-
for (int64_t i = ir; i < ir + dr; i++) {
|
|
128
|
-
for (int64_t j = 0; j < group_size_in_row; j++) {
|
|
129
|
-
calculate_group(i, j);
|
|
130
|
-
}
|
|
131
|
-
}
|
|
132
|
-
}
|
|
133
|
-
|
|
134
|
-
private:
|
|
135
|
-
int64_t input_ne[4];
|
|
136
|
-
size_t input_stride[4];
|
|
137
|
-
|
|
138
|
-
int64_t scale_ne[4];
|
|
139
|
-
size_t scale_stride[4];
|
|
140
|
-
|
|
141
|
-
int64_t indices_ne[4];
|
|
142
|
-
size_t indices_stride[4];
|
|
143
|
-
|
|
144
|
-
int64_t output_ne[4];
|
|
145
|
-
size_t output_stride[4];
|
|
146
|
-
|
|
147
|
-
int64_t ir;
|
|
148
|
-
int64_t dr;
|
|
149
|
-
|
|
150
|
-
int64_t group_size_in_row;
|
|
151
|
-
|
|
152
|
-
TPipe pipe;
|
|
153
|
-
GlobalTensor<int8_t> input_gm;
|
|
154
|
-
GlobalTensor<half> scale_gm;
|
|
155
|
-
GlobalTensor<int32_t> indices_gm;
|
|
156
|
-
GlobalTensor<float> output_gm;
|
|
157
|
-
TQue<QuePosition::VECIN, BUFFER_NUM> input_queue;
|
|
158
|
-
TQue<QuePosition::VECOUT, BUFFER_NUM> output_queue;
|
|
159
|
-
TQue<QuePosition::VECIN, BUFFER_NUM> cast_queue;
|
|
160
|
-
};
|
|
161
|
-
|
|
162
|
-
template <typename T>
|
|
163
|
-
__aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) {
|
|
164
|
-
auto gm_ptr = (__gm__ uint8_t *)gm;
|
|
165
|
-
auto ub_ptr = (uint8_t *)(ub);
|
|
166
|
-
for (int32_t i = 0; i < size; ++i, ++ub_ptr, ++gm_ptr) {
|
|
167
|
-
*ub_ptr = *gm_ptr;
|
|
168
|
-
}
|
|
169
|
-
}
|
|
170
|
-
|
|
171
|
-
// Kernel entry point for the Q8_0 get-row operation.
//
// The five metadata arguments (`*_ne_gm`, `*_nb_gm`) are first copied into
// local 4-element arrays with copy_to_ub(); a GET_ROW_Q8_0 object is then
// initialised with the data addresses plus those arrays and run via
// calculate().
extern "C" __global__ __aicore__ void ascendc_get_row_q8_0(
    GM_ADDR input_gm, GM_ADDR indices_gm, GM_ADDR output_gm,
    GM_ADDR input_ne_gm, GM_ADDR indices_ne_gm, GM_ADDR indices_nb_gm,
    GM_ADDR output_ne_gm, GM_ADDR output_nb_gm) {
    // Every metadata array is transferred as 32 bytes (four 8-byte entries).
    // NOTE(review): the size_t arrays rely on size_t being 8 bytes on the
    // target - confirm.
    const size_t meta_bytes = 32;

    int64_t src_ne[4];
    copy_to_ub(input_ne_gm, src_ne, meta_bytes);

    int64_t idx_ne[4];
    copy_to_ub(indices_ne_gm, idx_ne, meta_bytes);

    size_t idx_nb[4];
    copy_to_ub(indices_nb_gm, idx_nb, meta_bytes);

    int64_t dst_ne[4];
    copy_to_ub(output_ne_gm, dst_ne, meta_bytes);

    size_t dst_nb[4];
    copy_to_ub(output_nb_gm, dst_nb, meta_bytes);

    GET_ROW_Q8_0 kernel;
    kernel.init(input_gm, indices_gm, output_gm, src_ne, idx_ne, idx_nb,
                dst_ne, dst_nb);
    kernel.calculate();
}
|