llama-cpp-pydist 0.18.0__py3-none-any.whl → 0.20.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- llama_cpp/binaries/{llama-b7376-bin-win-cpu-x64.zip → llama-b7621-bin-win-cpu-x64.zip} +0 -0
- llama_cpp_pydist-0.20.0.dist-info/METADATA +4539 -0
- {llama_cpp_pydist-0.18.0.dist-info → llama_cpp_pydist-0.20.0.dist-info}/RECORD +358 -318
- {llama_cpp_pydist-0.18.0.dist-info → llama_cpp_pydist-0.20.0.dist-info}/WHEEL +1 -1
- scripts/generate_changelog.py +10 -0
- vendor_llama_cpp_pydist/llama.cpp/.devops/cann.Dockerfile +1 -1
- vendor_llama_cpp_pydist/llama.cpp/.devops/cuda-new.Dockerfile +95 -0
- vendor_llama_cpp_pydist/llama.cpp/.devops/llama-cli-cann.Dockerfile +3 -2
- vendor_llama_cpp_pydist/llama.cpp/.devops/llama-cpp-cuda.srpm.spec +2 -0
- vendor_llama_cpp_pydist/llama.cpp/.devops/llama-cpp.srpm.spec +2 -0
- vendor_llama_cpp_pydist/llama.cpp/.gemini/settings.json +1 -0
- vendor_llama_cpp_pydist/llama.cpp/.github/ISSUE_TEMPLATE/010-bug-compilation.yml +2 -1
- vendor_llama_cpp_pydist/llama.cpp/.github/ISSUE_TEMPLATE/011-bug-results.yml +19 -5
- vendor_llama_cpp_pydist/llama.cpp/.github/ISSUE_TEMPLATE/019-bug-misc.yml +14 -2
- vendor_llama_cpp_pydist/llama.cpp/.github/workflows/build.yml +33 -2
- vendor_llama_cpp_pydist/llama.cpp/.github/workflows/docker.yml +25 -13
- vendor_llama_cpp_pydist/llama.cpp/.github/workflows/release.yml +12 -48
- vendor_llama_cpp_pydist/llama.cpp/.github/workflows/server-webui.yml +225 -0
- vendor_llama_cpp_pydist/llama.cpp/.github/workflows/server.yml +0 -264
- vendor_llama_cpp_pydist/llama.cpp/.gitignore +1 -0
- vendor_llama_cpp_pydist/llama.cpp/AGENTS.md +81 -0
- vendor_llama_cpp_pydist/llama.cpp/CLAUDE.md +1 -0
- vendor_llama_cpp_pydist/llama.cpp/CODEOWNERS +3 -2
- vendor_llama_cpp_pydist/llama.cpp/CONTRIBUTING.md +34 -5
- vendor_llama_cpp_pydist/llama.cpp/README.md +4 -2
- vendor_llama_cpp_pydist/llama.cpp/SECURITY.md +3 -0
- vendor_llama_cpp_pydist/llama.cpp/ci/run.sh +6 -0
- vendor_llama_cpp_pydist/llama.cpp/common/CMakeLists.txt +4 -3
- vendor_llama_cpp_pydist/llama.cpp/common/arg.cpp +259 -66
- vendor_llama_cpp_pydist/llama.cpp/common/arg.h +12 -2
- vendor_llama_cpp_pydist/llama.cpp/common/chat-parser.cpp +11 -0
- vendor_llama_cpp_pydist/llama.cpp/common/chat-peg-parser.cpp +12 -2
- vendor_llama_cpp_pydist/llama.cpp/common/chat.cpp +172 -3
- vendor_llama_cpp_pydist/llama.cpp/common/chat.h +1 -0
- vendor_llama_cpp_pydist/llama.cpp/common/common.cpp +151 -88
- vendor_llama_cpp_pydist/llama.cpp/common/common.h +38 -13
- vendor_llama_cpp_pydist/llama.cpp/common/json-schema-to-grammar.cpp +132 -3
- vendor_llama_cpp_pydist/llama.cpp/common/json-schema-to-grammar.h +20 -0
- vendor_llama_cpp_pydist/llama.cpp/common/peg-parser.cpp +1 -1
- vendor_llama_cpp_pydist/llama.cpp/common/preset.cpp +218 -6
- vendor_llama_cpp_pydist/llama.cpp/common/preset.h +45 -3
- vendor_llama_cpp_pydist/llama.cpp/common/sampling.cpp +67 -54
- vendor_llama_cpp_pydist/llama.cpp/common/sampling.h +8 -0
- vendor_llama_cpp_pydist/llama.cpp/convert_hf_to_gguf.py +904 -454
- vendor_llama_cpp_pydist/llama.cpp/convert_hf_to_gguf_update.py +6 -0
- vendor_llama_cpp_pydist/llama.cpp/docs/android/imported-into-android-studio.jpg +0 -0
- vendor_llama_cpp_pydist/llama.cpp/docs/android.md +22 -2
- vendor_llama_cpp_pydist/llama.cpp/docs/backend/OPENCL.md +1 -1
- vendor_llama_cpp_pydist/llama.cpp/docs/backend/SYCL.md +3 -1
- vendor_llama_cpp_pydist/llama.cpp/docs/backend/hexagon/CMakeUserPresets.json +2 -0
- vendor_llama_cpp_pydist/llama.cpp/docs/backend/hexagon/README.md +5 -5
- vendor_llama_cpp_pydist/llama.cpp/docs/backend/hexagon/developer.md +1 -1
- vendor_llama_cpp_pydist/llama.cpp/docs/build.md +21 -2
- vendor_llama_cpp_pydist/llama.cpp/docs/development/HOWTO-add-model.md +3 -2
- vendor_llama_cpp_pydist/llama.cpp/docs/development/parsing.md +2 -2
- vendor_llama_cpp_pydist/llama.cpp/docs/docker.md +15 -11
- vendor_llama_cpp_pydist/llama.cpp/docs/ops/Metal.csv +360 -322
- vendor_llama_cpp_pydist/llama.cpp/docs/ops/SYCL.csv +797 -361
- vendor_llama_cpp_pydist/llama.cpp/docs/ops.md +10 -10
- vendor_llama_cpp_pydist/llama.cpp/ggml/CMakeLists.txt +18 -1
- vendor_llama_cpp_pydist/llama.cpp/ggml/include/ggml-alloc.h +9 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/include/ggml-backend.h +2 -1
- vendor_llama_cpp_pydist/llama.cpp/ggml/include/ggml.h +2 -1
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/CMakeLists.txt +24 -7
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-alloc.c +56 -12
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-backend.cpp +32 -13
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +248 -19
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +3 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cann/common.h +153 -9
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +21 -172
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +16 -2
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +285 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +28 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +1 -1
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +51 -6
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +86 -25
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +15 -8
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +768 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cpu/repack.cpp +286 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cpu/repack.h +8 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +0 -4
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cpu/vec.cpp +41 -1
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cpu/vec.h +125 -22
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +42 -1
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/argmax.cu +2 -2
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/argsort.cu +2 -2
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/common.cuh +36 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/cpy.cu +117 -103
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/cumsum.cu +69 -33
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/fattn-common.cuh +3 -1
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/fattn-mma-f16.cuh +2 -2
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +67 -31
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/mean.cu +3 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/mma.cuh +189 -111
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/mmf.cuh +14 -10
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/mmq.cu +34 -8
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/mmq.cuh +204 -42
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/quantize.cu +151 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/quantize.cuh +14 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/ssm-conv.cu +14 -20
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/topk-moe.cu +17 -2
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/topk-moe.cuh +6 -1
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/vendors/cuda.h +4 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/CMakeLists.txt +3 -1
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/ggml-hexagon.cpp +224 -746
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/htp/CMakeLists.txt +2 -1
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/htp/act-ops.c +369 -129
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/htp/htp-dma.c +5 -11
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/htp/htp-dma.h +46 -15
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/htp/htp-msg.h +6 -5
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/htp/hvx-utils.c +154 -47
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/htp/hvx-utils.h +65 -2
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/htp/main.c +4 -1
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/htp/matmul-ops.c +196 -48
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/htp-utils.h +1 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/op-desc.h +153 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-impl.h +0 -4
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-metal/ggml-metal-device.cpp +57 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-metal/ggml-metal-device.h +2 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-metal/ggml-metal-device.m +12 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +20 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-metal/ggml-metal-ops.cpp +71 -2
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-metal/ggml-metal-ops.h +1 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-metal/ggml-metal.metal +73 -6
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +134 -13
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-opencl/kernels/cvt.cl +21 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-opencl/kernels/transpose.cl +13 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +14 -7
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +42 -1
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-sycl/add-id.cpp +77 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-sycl/add-id.hpp +8 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-sycl/common.hpp +17 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-sycl/convert.cpp +15 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +18 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +56 -3
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +97 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +4 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +17 -6
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +22 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-sycl/pad.cpp +5 -5
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-sycl/ssm_conv.cpp +1 -1
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +58 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +916 -337
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/count_experts.comp +51 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/cumsum.comp +28 -14
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/cumsum_multipass1.comp +60 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/cumsum_multipass2.comp +66 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.glsl +1 -7
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/diag.comp +29 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +3 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +3 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/generic_head.glsl +2 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/get_rows.comp +2 -2
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp +17 -4
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq1_s.comp +40 -24
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_xs.comp +42 -24
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq.comp +11 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq_funcs.glsl +115 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +10 -4
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +29 -18
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_funcs.glsl +27 -21
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_id_funcs.glsl +3 -1
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq.comp +10 -4
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_funcs.glsl +11 -4
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +4 -1
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp +4 -1
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp +4 -1
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_params.glsl +1 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_vision.comp +4 -1
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large1.comp +62 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large2.comp +79 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large3.comp +65 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large_common.glsl +53 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/topk_moe.comp +57 -21
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/types.glsl +312 -6
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/upscale.comp +54 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +24 -5
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/xielu.comp +35 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml.c +5 -0
- vendor_llama_cpp_pydist/llama.cpp/gguf-py/gguf/constants.py +188 -0
- vendor_llama_cpp_pydist/llama.cpp/gguf-py/gguf/gguf_writer.py +38 -2
- vendor_llama_cpp_pydist/llama.cpp/gguf-py/gguf/tensor_mapping.py +122 -3
- vendor_llama_cpp_pydist/llama.cpp/gguf-py/gguf/utility.py +1 -9
- vendor_llama_cpp_pydist/llama.cpp/grammars/README.md +6 -3
- vendor_llama_cpp_pydist/llama.cpp/include/llama.h +28 -2
- vendor_llama_cpp_pydist/llama.cpp/pyrightconfig.json +1 -1
- vendor_llama_cpp_pydist/llama.cpp/scripts/compare-logprobs.py +281 -0
- vendor_llama_cpp_pydist/llama.cpp/scripts/snapdragon/adb/run-cli.sh +9 -9
- vendor_llama_cpp_pydist/llama.cpp/scripts/snapdragon/adb/run-completion.sh +53 -0
- vendor_llama_cpp_pydist/llama.cpp/scripts/snapdragon/adb/run-mtmd.sh +65 -0
- vendor_llama_cpp_pydist/llama.cpp/scripts/sync-ggml.last +1 -1
- vendor_llama_cpp_pydist/llama.cpp/src/CMakeLists.txt +4 -0
- vendor_llama_cpp_pydist/llama.cpp/src/llama-adapter.cpp +12 -3
- vendor_llama_cpp_pydist/llama.cpp/src/llama-adapter.h +7 -1
- vendor_llama_cpp_pydist/llama.cpp/src/llama-arch.cpp +1966 -2248
- vendor_llama_cpp_pydist/llama.cpp/src/llama-arch.h +16 -2
- vendor_llama_cpp_pydist/llama.cpp/src/llama-chat.cpp +11 -0
- vendor_llama_cpp_pydist/llama.cpp/src/llama-chat.h +1 -0
- vendor_llama_cpp_pydist/llama.cpp/src/llama-context.cpp +99 -20
- vendor_llama_cpp_pydist/llama.cpp/src/llama-context.h +8 -2
- vendor_llama_cpp_pydist/llama.cpp/src/llama-graph.cpp +75 -7
- vendor_llama_cpp_pydist/llama.cpp/src/llama-graph.h +17 -4
- vendor_llama_cpp_pydist/llama.cpp/src/llama-hparams.cpp +3 -9
- vendor_llama_cpp_pydist/llama.cpp/src/llama-hparams.h +7 -9
- vendor_llama_cpp_pydist/llama.cpp/src/llama-impl.cpp +4 -0
- vendor_llama_cpp_pydist/llama.cpp/src/llama-kv-cache.cpp +85 -31
- vendor_llama_cpp_pydist/llama.cpp/src/llama-kv-cache.h +20 -3
- vendor_llama_cpp_pydist/llama.cpp/src/llama-memory-hybrid.cpp +1 -1
- vendor_llama_cpp_pydist/llama.cpp/src/llama-mmap.cpp +130 -28
- vendor_llama_cpp_pydist/llama.cpp/src/llama-mmap.h +5 -1
- vendor_llama_cpp_pydist/llama.cpp/src/llama-model-loader.cpp +81 -13
- vendor_llama_cpp_pydist/llama.cpp/src/llama-model-loader.h +4 -0
- vendor_llama_cpp_pydist/llama.cpp/src/llama-model.cpp +380 -68
- vendor_llama_cpp_pydist/llama.cpp/src/llama-model.h +14 -2
- vendor_llama_cpp_pydist/llama.cpp/src/llama-quant.cpp +1 -1
- vendor_llama_cpp_pydist/llama.cpp/src/llama-sampling.cpp +60 -33
- vendor_llama_cpp_pydist/llama.cpp/src/llama-sampling.h +3 -0
- vendor_llama_cpp_pydist/llama.cpp/src/llama-vocab.cpp +103 -34
- vendor_llama_cpp_pydist/llama.cpp/src/llama-vocab.h +2 -0
- vendor_llama_cpp_pydist/llama.cpp/src/llama.cpp +680 -1
- vendor_llama_cpp_pydist/llama.cpp/src/models/bert.cpp +4 -2
- vendor_llama_cpp_pydist/llama.cpp/src/models/cogvlm.cpp +5 -3
- vendor_llama_cpp_pydist/llama.cpp/src/models/deepseek2.cpp +1 -1
- vendor_llama_cpp_pydist/llama.cpp/src/models/gemma-embedding.cpp +2 -6
- vendor_llama_cpp_pydist/llama.cpp/src/models/gemma3.cpp +3 -4
- vendor_llama_cpp_pydist/llama.cpp/src/models/gemma3n-iswa.cpp +4 -7
- vendor_llama_cpp_pydist/llama.cpp/src/models/glm4-moe.cpp +28 -11
- vendor_llama_cpp_pydist/llama.cpp/src/models/glm4.cpp +27 -4
- vendor_llama_cpp_pydist/llama.cpp/src/models/llama.cpp +19 -6
- vendor_llama_cpp_pydist/llama.cpp/src/models/maincoder.cpp +117 -0
- vendor_llama_cpp_pydist/llama.cpp/src/models/mimo2-iswa.cpp +123 -0
- vendor_llama_cpp_pydist/llama.cpp/src/models/models.h +23 -5
- vendor_llama_cpp_pydist/llama.cpp/src/models/modern-bert.cpp +115 -0
- vendor_llama_cpp_pydist/llama.cpp/src/models/nemotron-h.cpp +35 -6
- vendor_llama_cpp_pydist/llama.cpp/src/models/plamo3.cpp +128 -0
- vendor_llama_cpp_pydist/llama.cpp/src/models/qwen2.cpp +12 -3
- vendor_llama_cpp_pydist/llama.cpp/src/models/qwen3next.cpp +81 -266
- vendor_llama_cpp_pydist/llama.cpp/src/unicode.cpp +23 -14
- vendor_llama_cpp_pydist/llama.cpp/tests/CMakeLists.txt +8 -0
- vendor_llama_cpp_pydist/llama.cpp/tests/test-arg-parser.cpp +29 -0
- vendor_llama_cpp_pydist/llama.cpp/tests/test-backend-ops.cpp +200 -61
- vendor_llama_cpp_pydist/llama.cpp/tests/test-chat.cpp +186 -3
- vendor_llama_cpp_pydist/llama.cpp/tests/test-grammar-llguidance.cpp +3 -0
- vendor_llama_cpp_pydist/llama.cpp/tests/test-json-schema-to-grammar.cpp +75 -0
- vendor_llama_cpp_pydist/llama.cpp/tests/test-state-restore-fragmented.cpp +122 -0
- vendor_llama_cpp_pydist/llama.cpp/tests/test-tokenizer-0.cpp +1 -1
- vendor_llama_cpp_pydist/llama.cpp/tests/test-tokenizer-1-bpe.cpp +1 -1
- vendor_llama_cpp_pydist/llama.cpp/tests/test-tokenizer-1-spm.cpp +1 -1
- vendor_llama_cpp_pydist/llama.cpp/tools/CMakeLists.txt +1 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/batched-bench/batched-bench.cpp +11 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/cli/README.md +187 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/cli/cli.cpp +1 -3
- vendor_llama_cpp_pydist/llama.cpp/tools/completion/README.md +195 -23
- vendor_llama_cpp_pydist/llama.cpp/tools/completion/completion.cpp +11 -17
- vendor_llama_cpp_pydist/llama.cpp/tools/cvector-generator/cvector-generator.cpp +3 -3
- vendor_llama_cpp_pydist/llama.cpp/tools/fit-params/CMakeLists.txt +8 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/fit-params/README.md +55 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/fit-params/fit-params.cpp +66 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/imatrix/imatrix.cpp +3 -3
- vendor_llama_cpp_pydist/llama.cpp/tools/llama-bench/README.md +1 -1
- vendor_llama_cpp_pydist/llama.cpp/tools/llama-bench/llama-bench.cpp +18 -1
- vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/CMakeLists.txt +3 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/clip-graph.h +7 -1
- vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/clip-impl.h +41 -8
- vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/clip-model.h +56 -2
- vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/clip.cpp +375 -41
- vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/clip.h +1 -1
- vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/models/conformer.cpp +217 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/models/glm4v.cpp +120 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/models/models.h +20 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/models/whisper-enc.cpp +18 -10
- vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/models/youtuvl.cpp +179 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/mtmd-audio.cpp +371 -550
- vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/mtmd-audio.h +19 -28
- vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/mtmd-cli.cpp +22 -5
- vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/mtmd.cpp +34 -14
- vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/mtmd.h +3 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/tests.sh +1 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/perplexity/perplexity.cpp +3 -3
- vendor_llama_cpp_pydist/llama.cpp/tools/quantize/quantize.cpp +6 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/server/CMakeLists.txt +0 -8
- vendor_llama_cpp_pydist/llama.cpp/tools/server/README-dev.md +2 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/server/README.md +71 -40
- vendor_llama_cpp_pydist/llama.cpp/tools/server/public/index.html.gz +0 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/server/server-common.cpp +10 -17
- vendor_llama_cpp_pydist/llama.cpp/tools/server/server-common.h +2 -3
- vendor_llama_cpp_pydist/llama.cpp/tools/server/server-context.cpp +473 -287
- vendor_llama_cpp_pydist/llama.cpp/tools/server/server-context.h +52 -15
- vendor_llama_cpp_pydist/llama.cpp/tools/server/server-http.cpp +16 -10
- vendor_llama_cpp_pydist/llama.cpp/tools/server/server-models.cpp +256 -315
- vendor_llama_cpp_pydist/llama.cpp/tools/server/server-models.h +35 -28
- vendor_llama_cpp_pydist/llama.cpp/tools/server/server-queue.cpp +78 -21
- vendor_llama_cpp_pydist/llama.cpp/tools/server/server-queue.h +48 -10
- vendor_llama_cpp_pydist/llama.cpp/tools/server/server-task.cpp +33 -11
- vendor_llama_cpp_pydist/llama.cpp/tools/server/server-task.h +28 -35
- vendor_llama_cpp_pydist/llama.cpp/tools/server/server.cpp +26 -12
- vendor_llama_cpp_pydist/llama.cpp/tools/server/tests/unit/test_chat_completion.py +11 -2
- vendor_llama_cpp_pydist/llama.cpp/tools/server/tests/unit/test_compat_anthropic.py +1 -1
- vendor_llama_cpp_pydist/llama.cpp/tools/server/tests/unit/test_sleep.py +39 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/server/tests/utils.py +3 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/README.md +6 -5
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/docs/architecture/high-level-architecture-simplified.md +6 -2
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/docs/architecture/high-level-architecture.md +13 -3
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/package-lock.json +10 -10
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/package.json +3 -4
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/app.d.ts +7 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentPreview.svelte +1 -1
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentThumbnailFile.svelte +1 -1
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatForm/ChatForm.svelte +28 -3
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActionFileAttachments.svelte +1 -1
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActions.svelte +1 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessage.svelte +53 -8
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessageAssistant.svelte +29 -3
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessageEditForm.svelte +391 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessageStatistics.svelte +165 -10
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessageUser.svelte +26 -48
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessages.svelte +9 -4
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatScreen/ChatScreen.svelte +21 -4
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatScreen/ChatScreenHeader.svelte +6 -1
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatSettings/ChatSettings.svelte +16 -1
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatSettings/ChatSettingsImportExportTab.svelte +68 -2
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatSidebar/ChatSidebar.svelte +5 -1
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/components/app/misc/BadgeChatStatistic.svelte +26 -7
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/components/app/misc/MarkdownContent.svelte +263 -167
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/components/app/misc/SyntaxHighlightedCode.svelte +3 -2
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/components/app/models/ModelsSelector.svelte +199 -185
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/components/app/server/ServerErrorSplash.svelte +2 -1
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/components/ui/switch/index.ts +7 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/components/ui/switch/switch.svelte +29 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/constants/settings-config.ts +9 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/enums/chat.ts +4 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/enums/index.ts +2 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/hooks/use-processing-state.svelte.ts +125 -11
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/markdown/enhance-code-blocks.ts +162 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/markdown/enhance-links.ts +33 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/services/chat.ts +12 -8
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/services/parameter-sync.spec.ts +14 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/services/parameter-sync.ts +75 -13
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/stores/chat.svelte.ts +235 -171
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/stores/conversations.svelte.ts +24 -2
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/stores/server.svelte.ts +4 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/stores/settings.svelte.ts +6 -6
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/types/api.d.ts +4 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/types/settings.d.ts +1 -1
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/utils/api-key-validation.ts +2 -1
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/utils/clipboard.ts +259 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/utils/file-preview.ts +0 -9
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/utils/index.ts +11 -2
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/utils/text.ts +7 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/routes/+layout.svelte +26 -10
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/tests/unit/clipboard.test.ts +423 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/{src/lib/utils → tests/unit}/latex-protection.test.ts +1 -1
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/{src/lib/utils → tests/unit}/model-names.test.ts +1 -1
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/vite.config.ts +2 -2
- vendor_llama_cpp_pydist/llama.cpp/tools/tts/tts.cpp +6 -6
- vendor_llama_cpp_pydist/llama.cpp/vendor/cpp-httplib/CMakeLists.txt +2 -1
- llama_cpp_pydist-0.18.0.dist-info/METADATA +0 -2448
- vendor_llama_cpp_pydist/llama.cpp/.github/copilot-instructions.md +0 -262
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/utils/copy.ts +0 -71
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/tests/server/demo.spec.ts +0 -7
- {llama_cpp_pydist-0.18.0.dist-info/licenses → llama_cpp_pydist-0.20.0.dist-info}/LICENSE +0 -0
- {llama_cpp_pydist-0.18.0.dist-info → llama_cpp_pydist-0.20.0.dist-info}/top_level.txt +0 -0
|
@@ -7,9 +7,10 @@
|
|
|
7
7
|
|
|
8
8
|
#include <atomic>
|
|
9
9
|
#include <chrono>
|
|
10
|
+
#include <cstddef>
|
|
10
11
|
#include <mutex>
|
|
11
|
-
#include <string>
|
|
12
12
|
#include <stdexcept>
|
|
13
|
+
#include <string>
|
|
13
14
|
|
|
14
15
|
#ifdef _WIN32
|
|
15
16
|
# include <sal.h>
|
|
@@ -36,6 +37,7 @@
|
|
|
36
37
|
#include "ggml-hexagon.h"
|
|
37
38
|
#include "ggml-impl.h"
|
|
38
39
|
#include "ggml-quants.h"
|
|
40
|
+
#include "op-desc.h"
|
|
39
41
|
#include "htp-msg.h"
|
|
40
42
|
#include "htp_iface.h"
|
|
41
43
|
|
|
@@ -55,9 +57,6 @@ static int opt_opsync = 0; // synchronous ops
|
|
|
55
57
|
#define HEX_VERBOSE(...) \
|
|
56
58
|
if (opt_verbose) GGML_LOG_DEBUG(__VA_ARGS__)
|
|
57
59
|
|
|
58
|
-
#define HEX_PROFILE(...) \
|
|
59
|
-
if (opt_profile) GGML_LOG_INFO(__VA_ARGS__)
|
|
60
|
-
|
|
61
60
|
static inline uint64_t hex_is_aligned(void * addr, uint32_t align) {
|
|
62
61
|
return ((size_t) addr & (align - 1)) == 0;
|
|
63
62
|
}
|
|
@@ -85,128 +84,30 @@ static const char * status_to_str(uint32_t status) {
|
|
|
85
84
|
|
|
86
85
|
// ** debug helpers
|
|
87
86
|
|
|
88
|
-
static
|
|
89
|
-
if (
|
|
90
|
-
return sprintf(str, "%d:%d", (int) t->ne[0], (int) t->ne[1]);
|
|
91
|
-
} else {
|
|
92
|
-
return sprintf(str, "%d:%d:%d:%d", (int) t->ne[0], (int) t->ne[1], (int) t->ne[2], (int) t->ne[3]);
|
|
93
|
-
}
|
|
94
|
-
}
|
|
95
|
-
|
|
96
|
-
static inline void hex_format_op_dims(char * str, const struct ggml_tensor * t) {
|
|
97
|
-
char * p = str;
|
|
98
|
-
|
|
99
|
-
// append src0 and src1 (if any)
|
|
100
|
-
if (t->src[0]) {
|
|
101
|
-
p += hex_format_tensor_dims(p, t->src[0]);
|
|
102
|
-
|
|
103
|
-
for (int i = 1; i < GGML_MAX_SRC && t->src[i]; i++) {
|
|
104
|
-
p += sprintf(p, " x ");
|
|
105
|
-
p += hex_format_tensor_dims(p, t->src[i]);
|
|
106
|
-
}
|
|
87
|
+
static void ggml_hexagon_dump_op_exec(const std::string &sess_name, const ggml_tensor * op, const uint32_t req_flags) {
|
|
88
|
+
if (!opt_verbose) return;
|
|
107
89
|
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
// format self dims separately for better visual alignment
|
|
112
|
-
char self[64];
|
|
113
|
-
hex_format_tensor_dims(self, t);
|
|
114
|
-
|
|
115
|
-
p += sprintf(p, "%s", self);
|
|
90
|
+
op_desc desc(op);
|
|
91
|
+
GGML_LOG_DEBUG("ggml-hex: %s execute-op %s: %s : %s : %s : %s : %s : flags 0x%x\n", sess_name.c_str(),
|
|
92
|
+
ggml_op_name(op->op), desc.names, desc.dims, desc.types, desc.strides, desc.buffs, req_flags);
|
|
116
93
|
}
|
|
117
94
|
|
|
118
|
-
static
|
|
119
|
-
|
|
95
|
+
static void ggml_hexagon_dump_op_supp(const std::string &sess_name, const struct ggml_tensor * op, bool supp) {
|
|
96
|
+
if (!opt_verbose) return;
|
|
120
97
|
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
return sprintf(str, "%zu:%zu:%zu:%zu%s", (size_t) t->nb[0], (size_t) t->nb[1], (size_t) t->nb[2],
|
|
125
|
-
(size_t) t->nb[3], c);
|
|
126
|
-
}
|
|
98
|
+
op_desc desc(op);
|
|
99
|
+
GGML_LOG_DEBUG("ggml-hex: %s supports-op %s : %s : %s : %s : %s : %s : %s\n", sess_name.c_str(),
|
|
100
|
+
ggml_op_name(op->op), desc.names, desc.dims, desc.types, desc.strides, desc.buffs, supp ? "yes" : "no");
|
|
127
101
|
}
|
|
128
102
|
|
|
129
|
-
static
|
|
130
|
-
|
|
103
|
+
static void ggml_hexagon_dump_op_prof(const std::string &sess_name, const ggml_tensor * op,
|
|
104
|
+
uint32_t op_usec, uint32_t op_cycles, uint32_t op_pkts, uint64_t call_usec) {
|
|
105
|
+
if (!opt_profile) return;
|
|
131
106
|
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
for (int i = 1; i < GGML_MAX_SRC && t->src[i]; i++) {
|
|
137
|
-
p += sprintf(p, " x ");
|
|
138
|
-
p += hex_format_tensor_strides(p, t->src[i]);
|
|
139
|
-
}
|
|
140
|
-
|
|
141
|
-
p += sprintf(p, " -> ");
|
|
142
|
-
}
|
|
143
|
-
|
|
144
|
-
// format self dims separately for better visual alignment
|
|
145
|
-
char self[64];
|
|
146
|
-
hex_format_tensor_strides(self, t);
|
|
147
|
-
|
|
148
|
-
p += sprintf(p, "%s", self);
|
|
149
|
-
}
|
|
150
|
-
|
|
151
|
-
static inline void hex_format_op_types(char * str, const struct ggml_tensor * t) {
|
|
152
|
-
char * p = str;
|
|
153
|
-
|
|
154
|
-
// append src0 and src1 (if any)
|
|
155
|
-
if (t->src[0]) {
|
|
156
|
-
p += sprintf(p, "%s", ggml_type_name(t->src[0]->type));
|
|
157
|
-
|
|
158
|
-
for (int i = 1; i < GGML_MAX_SRC && t->src[i]; i++) {
|
|
159
|
-
p += sprintf(p, " x ");
|
|
160
|
-
p += sprintf(p, "%s", ggml_type_name(t->src[i]->type));
|
|
161
|
-
}
|
|
162
|
-
|
|
163
|
-
p += sprintf(p, " -> ");
|
|
164
|
-
}
|
|
165
|
-
|
|
166
|
-
p += sprintf(p, "%s", ggml_type_name(t->type));
|
|
167
|
-
}
|
|
168
|
-
|
|
169
|
-
static inline const char * hex_tensor_buff_name(const struct ggml_tensor * t) {
|
|
170
|
-
if (t->buffer) {
|
|
171
|
-
return ggml_backend_buffer_name(t->buffer);
|
|
172
|
-
}
|
|
173
|
-
return "NONE";
|
|
174
|
-
}
|
|
175
|
-
|
|
176
|
-
static inline void hex_format_op_buffs(char * str, const struct ggml_tensor * t) {
|
|
177
|
-
char * p = str;
|
|
178
|
-
|
|
179
|
-
// append src0 and src1 (if any)
|
|
180
|
-
if (t->src[0]) {
|
|
181
|
-
p += sprintf(p, "%s", hex_tensor_buff_name(t->src[0]));
|
|
182
|
-
|
|
183
|
-
for (int i = 1; i < GGML_MAX_SRC && t->src[i]; i++) {
|
|
184
|
-
p += sprintf(p, " x ");
|
|
185
|
-
p += sprintf(p, "%s", hex_tensor_buff_name(t->src[i]));
|
|
186
|
-
}
|
|
187
|
-
|
|
188
|
-
p += sprintf(p, " -> ");
|
|
189
|
-
}
|
|
190
|
-
|
|
191
|
-
p += sprintf(p, "%s", hex_tensor_buff_name(t));
|
|
192
|
-
}
|
|
193
|
-
|
|
194
|
-
static inline void hex_format_op_names(char * str, const struct ggml_tensor * t) {
|
|
195
|
-
char * p = str;
|
|
196
|
-
|
|
197
|
-
// append src0 and src1 (if any)
|
|
198
|
-
if (t->src[0]) {
|
|
199
|
-
p += sprintf(p, "%s", t->src[0]->name);
|
|
200
|
-
|
|
201
|
-
for (int i = 1; i < GGML_MAX_SRC && t->src[i]; i++) {
|
|
202
|
-
p += sprintf(p, " x ");
|
|
203
|
-
p += sprintf(p, "%s", t->src[i]->name);
|
|
204
|
-
}
|
|
205
|
-
|
|
206
|
-
p += sprintf(p, " -> ");
|
|
207
|
-
}
|
|
208
|
-
|
|
209
|
-
p += sprintf(p, "%s", t->name);
|
|
107
|
+
op_desc desc(op);
|
|
108
|
+
GGML_LOG_DEBUG("ggml-hex: %s profile-op %s: %s : %s : %s : %s : %s : op-usec %u op-cycles %u op-pkts %u (%f) call-usec %llu\n", sess_name.c_str(),
|
|
109
|
+
ggml_op_name(op->op), desc.names, desc.dims, desc.types, desc.strides, desc.buffs,
|
|
110
|
+
op_usec, op_cycles, op_pkts, (float) op_cycles / op_pkts, (unsigned long long) call_usec);
|
|
210
111
|
}
|
|
211
112
|
|
|
212
113
|
// ** backend sessions
|
|
@@ -221,8 +122,8 @@ struct ggml_hexagon_session {
|
|
|
221
122
|
void enqueue(struct htp_general_req &req, struct dspqueue_buffer *bufs, uint32_t n_bufs, bool sync = false);
|
|
222
123
|
void flush();
|
|
223
124
|
|
|
224
|
-
ggml_backend_buffer_type buffer_type;
|
|
225
|
-
ggml_backend_buffer_type repack_buffer_type;
|
|
125
|
+
ggml_backend_buffer_type buffer_type = {};
|
|
126
|
+
ggml_backend_buffer_type repack_buffer_type = {};
|
|
226
127
|
|
|
227
128
|
std::string name;
|
|
228
129
|
remote_handle64 handle;
|
|
@@ -241,23 +142,6 @@ struct ggml_hexagon_session {
|
|
|
241
142
|
uint32_t prof_pkts;
|
|
242
143
|
};
|
|
243
144
|
|
|
244
|
-
static inline void hex_print_op_info(const ggml_tensor * op, ggml_hexagon_session * sess, const uint32_t req_flags) {
|
|
245
|
-
char dims[64 * GGML_MAX_SRC];
|
|
246
|
-
char strides[64 * GGML_MAX_SRC];
|
|
247
|
-
char types[16 * GGML_MAX_SRC];
|
|
248
|
-
char buffs[64 * GGML_MAX_SRC];
|
|
249
|
-
char names[64 * GGML_MAX_SRC];
|
|
250
|
-
|
|
251
|
-
hex_format_op_dims(dims, op);
|
|
252
|
-
hex_format_op_strides(strides, op);
|
|
253
|
-
hex_format_op_types(types, op);
|
|
254
|
-
hex_format_op_buffs(buffs, op);
|
|
255
|
-
hex_format_op_names(names, op);
|
|
256
|
-
|
|
257
|
-
HEX_VERBOSE("ggml-hex: %s %s: %s : %s : %s : %s : %s: flags 0x%x\n", sess->name.c_str(), ggml_op_name(op->op),
|
|
258
|
-
names, dims, types, strides, buffs, req_flags);
|
|
259
|
-
}
|
|
260
|
-
|
|
261
145
|
void ggml_hexagon_session::enqueue(struct htp_general_req &req, struct dspqueue_buffer *bufs, uint32_t n_bufs, bool sync) {
|
|
262
146
|
// Bump pending flag (cleared in the session::flush once we get the responce)
|
|
263
147
|
this->op_pending++; // atomic inc
|
|
@@ -1598,7 +1482,7 @@ static ggml_backend_buffer_t ggml_backend_hexagon_buffer_type_alloc_buffer(
|
|
|
1598
1482
|
try {
|
|
1599
1483
|
ggml_backend_hexagon_buffer_context * ctx = new ggml_backend_hexagon_buffer_context(sess, size, false /*repack*/);
|
|
1600
1484
|
return ggml_backend_buffer_init(buffer_type, ggml_backend_hexagon_buffer_interface, ctx, size);
|
|
1601
|
-
} catch (std::exception
|
|
1485
|
+
} catch (const std::exception & exc) {
|
|
1602
1486
|
GGML_LOG_ERROR("ggml-hex: %s failed to allocate buffer context: %s\n", sess->name.c_str(), exc.what());
|
|
1603
1487
|
return nullptr;
|
|
1604
1488
|
}
|
|
@@ -1610,7 +1494,7 @@ static ggml_backend_buffer_t ggml_backend_hexagon_repack_buffer_type_alloc_buffe
|
|
|
1610
1494
|
try {
|
|
1611
1495
|
ggml_backend_hexagon_buffer_context * ctx = new ggml_backend_hexagon_buffer_context(sess, size, true /*repack*/);
|
|
1612
1496
|
return ggml_backend_buffer_init(buffer_type, ggml_backend_hexagon_buffer_interface, ctx, size);
|
|
1613
|
-
} catch (std::exception
|
|
1497
|
+
} catch (const std::exception & exc) {
|
|
1614
1498
|
GGML_LOG_ERROR("ggml-hex: %s failed to allocate buffer context: %s\n", sess->name.c_str(), exc.what());
|
|
1615
1499
|
return nullptr;
|
|
1616
1500
|
}
|
|
@@ -1697,8 +1581,8 @@ void ggml_hexagon_session::allocate(int dev_id) noexcept(false) {
|
|
|
1697
1581
|
}
|
|
1698
1582
|
|
|
1699
1583
|
// Save the IDs
|
|
1700
|
-
this->session_id
|
|
1701
|
-
this->domain_id
|
|
1584
|
+
this->session_id = n.session_id;
|
|
1585
|
+
this->domain_id = n.effective_domain_id;
|
|
1702
1586
|
this->valid_session = true;
|
|
1703
1587
|
}
|
|
1704
1588
|
|
|
@@ -1751,7 +1635,7 @@ void ggml_hexagon_session::allocate(int dev_id) noexcept(false) {
|
|
|
1751
1635
|
this->valid_handle = true;
|
|
1752
1636
|
|
|
1753
1637
|
GGML_LOG_INFO("ggml-hex: new session: %s : session-id %d domain-id %d uri %s handle 0x%lx\n", this->name.c_str(),
|
|
1754
|
-
|
|
1638
|
+
this->session_id, this->domain_id, session_uri, (unsigned long) this->handle);
|
|
1755
1639
|
|
|
1756
1640
|
// Enable FastRPC QoS mode
|
|
1757
1641
|
{
|
|
@@ -1838,11 +1722,8 @@ void ggml_hexagon_session::release() noexcept(true) {
|
|
|
1838
1722
|
}
|
|
1839
1723
|
|
|
1840
1724
|
ggml_hexagon_session::ggml_hexagon_session(int dev_id, ggml_backend_dev_t dev) noexcept(false) {
|
|
1841
|
-
buffer_type.
|
|
1842
|
-
repack_buffer_type.
|
|
1843
|
-
|
|
1844
|
-
buffer_type.device = dev;
|
|
1845
|
-
repack_buffer_type.device = dev;
|
|
1725
|
+
buffer_type.device = dev;
|
|
1726
|
+
repack_buffer_type.device = dev;
|
|
1846
1727
|
|
|
1847
1728
|
try {
|
|
1848
1729
|
allocate(dev_id);
|
|
@@ -1852,7 +1733,7 @@ ggml_hexagon_session::ggml_hexagon_session(int dev_id, ggml_backend_dev_t dev) n
|
|
|
1852
1733
|
|
|
1853
1734
|
repack_buffer_type.iface = ggml_backend_hexagon_repack_buffer_type_interface;
|
|
1854
1735
|
repack_buffer_type.context = new ggml_backend_hexagon_buffer_type_context(this->name + "-REPACK", this);
|
|
1855
|
-
} catch (std::exception
|
|
1736
|
+
} catch (const std::exception & exc) {
|
|
1856
1737
|
release();
|
|
1857
1738
|
throw;
|
|
1858
1739
|
}
|
|
@@ -1861,8 +1742,8 @@ ggml_hexagon_session::ggml_hexagon_session(int dev_id, ggml_backend_dev_t dev) n
|
|
|
1861
1742
|
ggml_hexagon_session::~ggml_hexagon_session() noexcept(true) {
|
|
1862
1743
|
release();
|
|
1863
1744
|
|
|
1864
|
-
delete static_cast<ggml_backend_hexagon_buffer_type_context*>(buffer_type.context);
|
|
1865
|
-
delete static_cast<ggml_backend_hexagon_buffer_type_context*>(repack_buffer_type.context);
|
|
1745
|
+
delete static_cast<ggml_backend_hexagon_buffer_type_context *>(buffer_type.context);
|
|
1746
|
+
delete static_cast<ggml_backend_hexagon_buffer_type_context *>(repack_buffer_type.context);
|
|
1866
1747
|
}
|
|
1867
1748
|
|
|
1868
1749
|
// ** backend interface
|
|
@@ -1930,15 +1811,6 @@ static bool hex_supported_dims(const struct ggml_tensor * x, const struct ggml_t
|
|
|
1930
1811
|
return true;
|
|
1931
1812
|
}
|
|
1932
1813
|
|
|
1933
|
-
template <typename... _TTensor>
|
|
1934
|
-
static inline bool hex_supported_buffer(const struct ggml_hexagon_session * sess, _TTensor... tensors) {
|
|
1935
|
-
return ([&]() -> bool {
|
|
1936
|
-
return !tensors || !tensors->buffer ||
|
|
1937
|
-
(ggml_backend_buffer_is_hexagon(tensors->buffer) &&
|
|
1938
|
-
ggml_backend_hexagon_buffer_get_sess(tensors->buffer) == sess);
|
|
1939
|
-
}() && ...);
|
|
1940
|
-
}
|
|
1941
|
-
|
|
1942
1814
|
static bool ggml_hexagon_supported_mul_mat(const struct ggml_hexagon_session * sess, const struct ggml_tensor * dst) {
|
|
1943
1815
|
const struct ggml_tensor * src0 = dst->src[0];
|
|
1944
1816
|
const struct ggml_tensor * src1 = dst->src[1];
|
|
@@ -1976,7 +1848,8 @@ static bool ggml_hexagon_supported_mul_mat(const struct ggml_hexagon_session * s
|
|
|
1976
1848
|
break;
|
|
1977
1849
|
|
|
1978
1850
|
case GGML_TYPE_F16:
|
|
1979
|
-
if (
|
|
1851
|
+
if (src0->nb[1] < src0->nb[0]) {
|
|
1852
|
+
GGML_LOG_DEBUG("ggml_hexagon_supported_mul_mat: permuted F16 src0 not supported\n");
|
|
1980
1853
|
return false;
|
|
1981
1854
|
}
|
|
1982
1855
|
break;
|
|
@@ -1985,11 +1858,6 @@ static bool ggml_hexagon_supported_mul_mat(const struct ggml_hexagon_session * s
|
|
|
1985
1858
|
return false;
|
|
1986
1859
|
}
|
|
1987
1860
|
|
|
1988
|
-
// src0 & src1 & dst must be mapped to the same session
|
|
1989
|
-
if (!hex_supported_buffer(sess, src0, src1, dst)) {
|
|
1990
|
-
return false;
|
|
1991
|
-
}
|
|
1992
|
-
|
|
1993
1861
|
return true;
|
|
1994
1862
|
}
|
|
1995
1863
|
|
|
@@ -2032,12 +1900,6 @@ static bool ggml_hexagon_supported_mul_mat_id(const struct ggml_hexagon_session
|
|
|
2032
1900
|
return false;
|
|
2033
1901
|
}
|
|
2034
1902
|
|
|
2035
|
-
// src0 (weights) must be repacked and mapped to the same session
|
|
2036
|
-
// src1 & sr2 & dst must be mapped to the same session
|
|
2037
|
-
if (!hex_supported_buffer(sess, src0, src1, src2, dst)) {
|
|
2038
|
-
return false;
|
|
2039
|
-
}
|
|
2040
|
-
|
|
2041
1903
|
return true;
|
|
2042
1904
|
}
|
|
2043
1905
|
|
|
@@ -2067,18 +1929,12 @@ static bool ggml_hexagon_supported_binary(const struct ggml_hexagon_session * se
|
|
|
2067
1929
|
return false;
|
|
2068
1930
|
}
|
|
2069
1931
|
|
|
2070
|
-
// src0, src1 & dst must be mapped to the same session
|
|
2071
|
-
if (!hex_supported_buffer(sess, src0, src1, dst)) {
|
|
2072
|
-
return false;
|
|
2073
|
-
}
|
|
2074
|
-
|
|
2075
1932
|
return true;
|
|
2076
1933
|
}
|
|
2077
1934
|
|
|
2078
1935
|
static bool ggml_hexagon_supported_add_id(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) {
|
|
2079
1936
|
const struct ggml_tensor * src0 = op->src[0];
|
|
2080
1937
|
const struct ggml_tensor * src1 = op->src[1];
|
|
2081
|
-
const struct ggml_tensor * src2 = op->src[2];
|
|
2082
1938
|
const struct ggml_tensor * dst = op;
|
|
2083
1939
|
|
|
2084
1940
|
if (!hex_supported_src0_type(src0->type)) {
|
|
@@ -2099,11 +1955,6 @@ static bool ggml_hexagon_supported_add_id(const struct ggml_hexagon_session * se
|
|
|
2099
1955
|
return false;
|
|
2100
1956
|
}
|
|
2101
1957
|
|
|
2102
|
-
// src0, src1 & dst must be mapped to the same session
|
|
2103
|
-
if (!hex_supported_buffer(sess, src0, src1, src2, dst)) {
|
|
2104
|
-
return false;
|
|
2105
|
-
}
|
|
2106
|
-
|
|
2107
1958
|
return true;
|
|
2108
1959
|
}
|
|
2109
1960
|
|
|
@@ -2126,11 +1977,6 @@ static bool ggml_hexagon_supported_unary(const struct ggml_hexagon_session * ses
|
|
|
2126
1977
|
return false;
|
|
2127
1978
|
}
|
|
2128
1979
|
|
|
2129
|
-
// src0 & dst must be mapped to the same session
|
|
2130
|
-
if (!hex_supported_buffer(sess, src0, dst)) {
|
|
2131
|
-
return false;
|
|
2132
|
-
}
|
|
2133
|
-
|
|
2134
1980
|
return true;
|
|
2135
1981
|
}
|
|
2136
1982
|
|
|
@@ -2163,11 +2009,6 @@ static bool ggml_hexagon_supported_activations(const struct ggml_hexagon_session
|
|
|
2163
2009
|
}
|
|
2164
2010
|
}
|
|
2165
2011
|
|
|
2166
|
-
// src0, src1 & dst must be mapped to the same session
|
|
2167
|
-
if (!hex_supported_buffer(sess, src0, src1, dst)) {
|
|
2168
|
-
return false;
|
|
2169
|
-
}
|
|
2170
|
-
|
|
2171
2012
|
return true;
|
|
2172
2013
|
}
|
|
2173
2014
|
|
|
@@ -2216,11 +2057,6 @@ static bool ggml_hexagon_supported_softmax(const struct ggml_hexagon_session * s
|
|
|
2216
2057
|
}
|
|
2217
2058
|
}
|
|
2218
2059
|
|
|
2219
|
-
// src0, src1 & dst must be mapped to the same session
|
|
2220
|
-
if (!hex_supported_buffer(sess, src0, src1, dst)) {
|
|
2221
|
-
return false;
|
|
2222
|
-
}
|
|
2223
|
-
|
|
2224
2060
|
return true;
|
|
2225
2061
|
}
|
|
2226
2062
|
|
|
@@ -2271,16 +2107,28 @@ static bool ggml_hexagon_supported_rope(const struct ggml_hexagon_session * sess
|
|
|
2271
2107
|
}
|
|
2272
2108
|
}
|
|
2273
2109
|
|
|
2274
|
-
// src0, src1, src2 & dst must be mapped to the same session
|
|
2275
|
-
if (!hex_supported_buffer(sess, src0, src1, src2, dst)) {
|
|
2276
|
-
return false;
|
|
2277
|
-
}
|
|
2278
|
-
|
|
2279
2110
|
return true;
|
|
2280
2111
|
}
|
|
2281
2112
|
|
|
2113
|
+
enum dspqbuf_type {
|
|
2114
|
+
DSPQBUF_TYPE_DSP_WRITE_CPU_READ = 0,
|
|
2115
|
+
DSPQBUF_TYPE_CPU_WRITE_DSP_READ,
|
|
2116
|
+
DSPQBUF_TYPE_CONSTANT,
|
|
2117
|
+
};
|
|
2118
|
+
|
|
2119
|
+
static void dspqbuf_dump(dspqueue_buffer * d, const struct ggml_tensor * t, dspqbuf_type type) {
|
|
2120
|
+
if (opt_verbose < 2) return;
|
|
2121
|
+
|
|
2122
|
+
auto buf = static_cast<ggml_backend_hexagon_buffer_context *>(t->buffer->context);
|
|
2123
|
+
auto sess = buf->sess;
|
|
2124
|
+
|
|
2125
|
+
GGML_LOG_DEBUG("ggml-hex: %s dspqbuf : %s base-addr %p base-size %zu data %p offset %u size %u\n", sess->name.c_str(),
|
|
2126
|
+
t->name, (void *) buf->base, buf->size, (void *) d->ptr, (unsigned int) d->offset,
|
|
2127
|
+
(unsigned int) d->size);
|
|
2128
|
+
}
|
|
2129
|
+
|
|
2282
2130
|
// Init hexagon tensor from GGML tensor and Hexagon buffer
|
|
2283
|
-
static void
|
|
2131
|
+
static void htp_req_tensor_init(htp_tensor * h, const ggml_tensor * t) {
|
|
2284
2132
|
h->data = 0; // updated by the receiver
|
|
2285
2133
|
h->type = t->type;
|
|
2286
2134
|
h->ne[0] = t->ne[0];
|
|
@@ -2293,125 +2141,52 @@ static void init_htp_tensor(htp_tensor * h, const ggml_tensor * t) {
|
|
|
2293
2141
|
h->nb[3] = t->nb[3];
|
|
2294
2142
|
}
|
|
2295
2143
|
|
|
2296
|
-
static size_t
|
|
2144
|
+
static size_t htp_req_buff_init(htp_tensor *h, dspqueue_buffer * d, const ggml_tensor * t, dspqbuf_type type) {
|
|
2297
2145
|
if (!t) {
|
|
2298
2146
|
return 0;
|
|
2299
2147
|
}
|
|
2300
2148
|
|
|
2301
|
-
|
|
2302
|
-
auto tensor_buf = static_cast<ggml_backend_hexagon_buffer_context *>(t->buffer->context);
|
|
2303
|
-
buf->fd = tensor_buf->fd;
|
|
2304
|
-
buf->ptr = t->data;
|
|
2305
|
-
buf->offset = (uint8_t *) t->data - tensor_buf->base;
|
|
2306
|
-
buf->size = ggml_nbytes(t);
|
|
2307
|
-
buf->flags = (flush_host ? DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER : 0); // Flush CPU
|
|
2308
|
-
buf->flags |= (flush_htp ? DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT : 0); // Invalidate DSP
|
|
2309
|
-
return 1;
|
|
2310
|
-
}
|
|
2311
|
-
|
|
2312
|
-
static ggml_hexagon_session * get_session_from_tensor(const ggml_tensor * t) {
|
|
2313
|
-
return static_cast<ggml_backend_hexagon_buffer_context *>(t->buffer->context)->sess;
|
|
2314
|
-
}
|
|
2315
|
-
|
|
2316
|
-
static void hex_dump_dspbuf(const struct ggml_tensor * t, const dspqueue_buffer * d) {
|
|
2317
|
-
auto buf = static_cast<ggml_backend_hexagon_buffer_context *>(t->buffer->context);
|
|
2318
|
-
auto sess = buf->sess;
|
|
2319
|
-
|
|
2320
|
-
HEX_VERBOSE("ggml-hex: %s dspqbuf : %s base-addr %p base-size %zu data %p offset %u size %u\n", sess->name.c_str(),
|
|
2321
|
-
t->name, (void *) buf->base, buf->size, (void *) d->ptr, (unsigned int) d->offset,
|
|
2322
|
-
(unsigned int) d->size);
|
|
2323
|
-
}
|
|
2324
|
-
|
|
2325
|
-
static void ggml_hexagon_mul_mat(const struct ggml_tensor * op, uint32_t flags) {
|
|
2326
|
-
const struct ggml_tensor * src0 = op->src[0];
|
|
2327
|
-
const struct ggml_tensor * src1 = op->src[1];
|
|
2328
|
-
const struct ggml_tensor * dst = op;
|
|
2329
|
-
|
|
2330
|
-
uint64_t t1, t2;
|
|
2331
|
-
t1 = ggml_time_us();
|
|
2332
|
-
|
|
2333
|
-
// Construct HTP message
|
|
2334
|
-
htp_general_req req;
|
|
2335
|
-
req.op = HTP_OP_MUL_MAT;
|
|
2336
|
-
req.flags = flags;
|
|
2149
|
+
auto buf = static_cast<ggml_backend_hexagon_buffer_context *>(t->buffer->context);
|
|
2337
2150
|
|
|
2338
|
-
|
|
2339
|
-
|
|
2340
|
-
|
|
2151
|
+
memset(d, 0, sizeof(*d));
|
|
2152
|
+
d->fd = buf->fd;
|
|
2153
|
+
d->ptr = t->data;
|
|
2154
|
+
d->offset = (uint8_t *) t->data - buf->base;
|
|
2155
|
+
d->size = ggml_nbytes(t);
|
|
2341
2156
|
|
|
2342
|
-
|
|
2343
|
-
|
|
2344
|
-
|
|
2345
|
-
|
|
2346
|
-
|
|
2347
|
-
|
|
2348
|
-
|
|
2349
|
-
|
|
2350
|
-
|
|
2351
|
-
|
|
2352
|
-
|
|
2353
|
-
|
|
2354
|
-
|
|
2355
|
-
|
|
2356
|
-
// Second buffer Input Activations. This is a buffer that the CPU
|
|
2357
|
-
// writes and the DSP reads, so we'll need to flush CPU caches and
|
|
2358
|
-
// invalidate DSP ones. On platforms with I/O coherency support the
|
|
2359
|
-
// framework will automatically skip cache operations where possible.
|
|
2360
|
-
dspqueue_buffers_init(&bufs[1], src1, true, true);
|
|
2361
|
-
|
|
2362
|
-
// Third buffer Output Activations. We'll handle DSP
|
|
2363
|
-
// cache maintenance in the response message but need to flush
|
|
2364
|
-
// CPU caches to ensure any previously written dirty lines are
|
|
2365
|
-
// written out before writes from the DSP start.
|
|
2366
|
-
dspqueue_buffers_init(&bufs[2], dst, true, false);
|
|
2367
|
-
|
|
2368
|
-
auto * sess = get_session_from_tensor(src0);
|
|
2369
|
-
|
|
2370
|
-
if (opt_verbose) {
|
|
2371
|
-
hex_print_op_info(op, sess, req.flags);
|
|
2372
|
-
if (opt_verbose > 1) {
|
|
2373
|
-
hex_dump_dspbuf(src0, &bufs[0]);
|
|
2374
|
-
hex_dump_dspbuf(src1, &bufs[1]);
|
|
2375
|
-
hex_dump_dspbuf(dst, &bufs[2]);
|
|
2376
|
-
}
|
|
2157
|
+
switch (type) {
|
|
2158
|
+
case DSPQBUF_TYPE_DSP_WRITE_CPU_READ:
|
|
2159
|
+
// Flush CPU
|
|
2160
|
+
d->flags = DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER;
|
|
2161
|
+
break;
|
|
2162
|
+
case DSPQBUF_TYPE_CPU_WRITE_DSP_READ:
|
|
2163
|
+
// Flush CPU, Invalidate DSP
|
|
2164
|
+
d->flags = DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT;
|
|
2165
|
+
break;
|
|
2166
|
+
default:
|
|
2167
|
+
// Constant buffer, no cache maintenance
|
|
2168
|
+
d->flags = 0;
|
|
2169
|
+
break;
|
|
2377
2170
|
}
|
|
2378
2171
|
|
|
2379
|
-
|
|
2380
|
-
sess->enqueue(req, bufs, 3, opt_opsync);
|
|
2381
|
-
}
|
|
2172
|
+
htp_req_tensor_init(h, t);
|
|
2382
2173
|
|
|
2383
|
-
|
|
2174
|
+
dspqbuf_dump(d, t, type);
|
|
2384
2175
|
|
|
2385
|
-
|
|
2386
|
-
"ggml-hex: %s %s %s %u:%u:%u:%u x %s %u:%u:%u:%u -> %s %u:%u:%u:%u : op-usec %u op-cycles %u op-pkts %u (%f) "
|
|
2387
|
-
"call-usec %llu\n",
|
|
2388
|
-
sess->name.c_str(), ggml_op_name(op->op), src0->name, (uint32_t) src0->ne[0], (uint32_t) src0->ne[1],
|
|
2389
|
-
(uint32_t) src0->ne[2], (uint32_t) src0->ne[3], src1->name, (uint32_t) src1->ne[0], (uint32_t) src1->ne[1],
|
|
2390
|
-
(uint32_t) src1->ne[2], (uint32_t) src1->ne[3], dst->name, (uint32_t) dst->ne[0], (uint32_t) dst->ne[1],
|
|
2391
|
-
(uint32_t) dst->ne[2], (uint32_t) dst->ne[3], sess->prof_usecs, sess->prof_cycles, sess->prof_pkts,
|
|
2392
|
-
(float) sess->prof_cycles / sess->prof_pkts, (unsigned long long) t2 - t1);
|
|
2176
|
+
return 1;
|
|
2393
2177
|
}
|
|
2394
2178
|
|
|
2395
|
-
|
|
2396
|
-
const struct ggml_tensor * src0 = op->src[0];
|
|
2397
|
-
const struct ggml_tensor * src1 = op->src[1];
|
|
2398
|
-
const struct ggml_tensor * src2 = op->src[2];
|
|
2399
|
-
const struct ggml_tensor * dst = op;
|
|
2179
|
+
typedef size_t (*htp_req_init_func_t)(htp_general_req * req, dspqueue_buffer * bufs, const ggml_tensor * op);
|
|
2400
2180
|
|
|
2401
|
-
|
|
2402
|
-
|
|
2181
|
+
template <htp_req_init_func_t _init_req_func>
|
|
2182
|
+
static inline void ggml_hexagon_dispatch_op(ggml_hexagon_session *sess, const struct ggml_tensor * op, uint32_t flags) {
|
|
2183
|
+
uint64_t t = ggml_time_us();
|
|
2403
2184
|
|
|
2404
|
-
// Construct HTP
|
|
2185
|
+
// Construct HTP request
|
|
2405
2186
|
htp_general_req req;
|
|
2406
|
-
req
|
|
2407
|
-
req.flags = flags;
|
|
2187
|
+
memset(&req, 0, sizeof(req));
|
|
2408
2188
|
|
|
2409
|
-
|
|
2410
|
-
init_htp_tensor(&req.src1, src1);
|
|
2411
|
-
init_htp_tensor(&req.src2, src2);
|
|
2412
|
-
init_htp_tensor(&req.dst, dst);
|
|
2413
|
-
|
|
2414
|
-
// Use opmask to override flags
|
|
2189
|
+
req.flags = flags;
|
|
2415
2190
|
if (!(opt_opmask & HTP_OPMASK_QUANTIZE)) {
|
|
2416
2191
|
req.flags |= HTP_OPFLAGS_SKIP_QUANTIZE;
|
|
2417
2192
|
}
|
|
@@ -2419,461 +2194,141 @@ static void ggml_hexagon_mul_mat_id(const struct ggml_tensor * op, uint32_t flag
         req.flags |= HTP_OPFLAGS_SKIP_COMPUTE;
     }

-
-    // First buffer Weights.
-    // The content is static, there is no need to do any cache management
-    dspqueue_buffers_init(bufs, src0, false, false);
-
-    // Second buffer Input Activations. This is a buffer that the CPU
-    // writes and the DSP reads, so we'll need to flush CPU caches and
-    // invalidate DSP ones. On platforms with I/O coherency support the
-    // framework will automatically skip cache operations where possible.
-    dspqueue_buffers_init(&bufs[1], src1, true, true);
-
-    // Third buffer expert IDs. This is a buffer that the CPU
-    // writes and the DSP reads, so we'll need to flush CPU caches and
-    // invalidate DSP ones. On platforms with I/O coherency support the
-    // framework will automatically skip cache operations where possible.
-    dspqueue_buffers_init(&bufs[2], src2, true, true);
-
-    // Forth buffer Output Activations. We'll handle DSP
-    // cache maintenance in the response message but need to flush
-    // CPU caches to ensure any previously written dirty lines are
-    // written out before writes from the DSP start.
-    dspqueue_buffers_init(&bufs[3], dst, true, false);
-
-    auto * sess = get_session_from_tensor(src0);
-
-    if (opt_verbose) {
-        hex_print_op_info(op, sess, req.flags);
-        if (opt_verbose > 1) {
-            hex_dump_dspbuf(src0, &bufs[0]);
-            hex_dump_dspbuf(src1, &bufs[1]);
-            hex_dump_dspbuf(src2, &bufs[2]);
-            hex_dump_dspbuf(dst, &bufs[3]);
-        }
-    }
+    ggml_hexagon_dump_op_exec(sess->name, op, req.flags);

     if ((opt_opmask & HTP_OPMASK_QUEUE)) {
-
+        dspqueue_buffer bufs[HTP_MAX_PACKET_BUFFERS];
+        size_t n_bufs = _init_req_func(&req, bufs, op);
+        sess->enqueue(req, bufs, n_bufs, opt_opsync);
     }

-
+    t = ggml_time_us() - t;

-
-        "ggml-hex: %s matmul-id %s %u:%u:%u:%u x %s %u:%u:%u:%u (%s %u:%u:%u:%u) -> %s %u:%u:%u:%u : op-usec %u "
-        "op-cycles %u op-pkts %u (%f) call-usec %llu\n",
-        sess->name.c_str(), src0->name, (uint32_t) src0->ne[0], (uint32_t) src0->ne[1], (uint32_t) src0->ne[2],
-        (uint32_t) src0->ne[3], src1->name, (uint32_t) src1->ne[0], (uint32_t) src1->ne[1], (uint32_t) src1->ne[2],
-        (uint32_t) src1->ne[3], src2->name, (uint32_t) src2->ne[0], (uint32_t) src2->ne[1], (uint32_t) src2->ne[2],
-        (uint32_t) src2->ne[3], dst->name, (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],
-        (uint32_t) dst->ne[3], sess->prof_usecs, sess->prof_cycles, sess->prof_pkts,
-        (float) sess->prof_cycles / sess->prof_pkts, (unsigned long long) t2 - t1);
+    ggml_hexagon_dump_op_prof(sess->name, op, sess->prof_usecs, sess->prof_cycles, sess->prof_pkts, t);
 }

-
-
-
-
-
-
-    uint64_t t1 = 0;
-    uint64_t t2 = 0;
-
-    t1 = ggml_time_us();
-
-    // Construct HTP message
-    htp_general_req req;
-    req.flags = flags;
-
-    // Use opmask to override flags
-    if (!(opt_opmask & HTP_OPMASK_QUANTIZE)) {
-        req.flags |= HTP_OPFLAGS_SKIP_QUANTIZE;
-    }
-    if (!(opt_opmask & HTP_OPMASK_COMPUTE)) {
-        req.flags |= HTP_OPFLAGS_SKIP_COMPUTE;
-    }
-
-    switch (node->op) {
+template <bool _is_src0_constant>
+static inline size_t init_binary_req(htp_general_req * req, dspqueue_buffer * bufs, const ggml_tensor * t) {
+    switch (t->op) {
+        case GGML_OP_MUL_MAT:
+            req->op = HTP_OP_MUL_MAT;
+            break;
         case GGML_OP_MUL:
-            req.op = HTP_OP_MUL;
+            req->op = HTP_OP_MUL;
            break;
         case GGML_OP_ADD:
-            req.op = HTP_OP_ADD;
+            req->op = HTP_OP_ADD;
            break;
         case GGML_OP_SUB:
-            req.op = HTP_OP_SUB;
+            req->op = HTP_OP_SUB;
            break;
         default:
-            GGML_ABORT("ggml-hex: binary : unsupported op
-
-
-    init_htp_tensor(&req.src0, src0);
-    init_htp_tensor(&req.src1, src1);
-    init_htp_tensor(&req.dst, dst);
-
-    dspqueue_buffer bufs[3];
-    // First buffer = First Operand of Binary op
-    // This is a buffer that the CPU writes and the DSP reads, so we'll
-    // need to flush CPU caches and invalidate DSP ones. On platforms
-    // with I/O coherency support the framework will automatically skip
-    // cache operations where possible.
-    dspqueue_buffers_init(bufs, src0, true, true);
-
-    // Second buffer = Second Operand of Binary op
-    // This is a buffer that the CPU writes and the DSP reads, so we'll
-    // need to flush CPU caches and invalidate DSP ones. On platforms
-    // with I/O coherency support the framework will automatically skip
-    // cache operations where possible.
-    dspqueue_buffers_init(&bufs[1], src1, true, true);
-
-    // Third buffer = Output Activations. We'll handle DSP
-    // cache maintenance in the response message but need to flush
-    // CPU caches to ensure any previously written dirty lines are
-    // written out before writes from the DSP start.
-    dspqueue_buffers_init(&bufs[2], dst, true, false);
-
-    auto * sess = get_session_from_tensor(src0);
-
-    if (opt_verbose) {
-        hex_print_op_info(op, sess, req.flags);
-        if (opt_verbose > 1) {
-            hex_dump_dspbuf(src0, &bufs[0]);
-            hex_dump_dspbuf(src1, &bufs[1]);
-            hex_dump_dspbuf(dst, &bufs[2]);
-        }
+            GGML_ABORT("ggml-hex: binary : unsupported op: %d\n", t->op);
+            break;
     }

-
-
-
+    // src0: Weights (mulmat) or First Operand (binary op).
+    // If constant (e.g. weights), no cache management is needed.
+    // src1: Input Activations (mulmat) or Second Operand (binary op).

-
+    size_t n_bufs = 0;
+    n_bufs += htp_req_buff_init(&req->src0, &bufs[n_bufs], t->src[0], _is_src0_constant ? DSPQBUF_TYPE_CONSTANT : DSPQBUF_TYPE_CPU_WRITE_DSP_READ);
+    n_bufs += htp_req_buff_init(&req->src1, &bufs[n_bufs], t->src[1], DSPQBUF_TYPE_CPU_WRITE_DSP_READ);
+    n_bufs += htp_req_buff_init(&req->dst, &bufs[n_bufs], t, DSPQBUF_TYPE_DSP_WRITE_CPU_READ);

-
-        "ggml-hex: %s %s %s %u:%u:%u:%u x %s %u:%u:%u:%u -> %s %u:%u:%u:%u : op-usec %u op-cycles %u op-pkts %u (%f) "
-        "call-usec %llu\n",
-        sess->name.c_str(), ggml_op_name(node->op), src0->name, (uint32_t) src0->ne[0], (uint32_t) src0->ne[1],
-        (uint32_t) src0->ne[2], (uint32_t) src0->ne[3], src1->name, (uint32_t) src1->ne[0], (uint32_t) src1->ne[1],
-        (uint32_t) src1->ne[2], (uint32_t) src1->ne[3], dst->name, (uint32_t) dst->ne[0], (uint32_t) dst->ne[1],
-        (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], sess->prof_usecs, sess->prof_cycles, sess->prof_pkts,
-        (float) sess->prof_cycles / sess->prof_pkts, (unsigned long long) t2 - t1);
+    return n_bufs;
 }

-
-
-
-
-
-
-
-    uint64_t t1 = 0;
-    uint64_t t2 = 0;
-
-    t1 = ggml_time_us();
-
-    // Construct HTP message
-    htp_general_req req;
-    req.flags = flags;
-
-    // Use opmask to override flags
-    if (!(opt_opmask & HTP_OPMASK_QUANTIZE)) {
-        req.flags |= HTP_OPFLAGS_SKIP_QUANTIZE;
-    }
-    if (!(opt_opmask & HTP_OPMASK_COMPUTE)) {
-        req.flags |= HTP_OPFLAGS_SKIP_COMPUTE;
-    }
-
-    switch (node->op) {
+template <bool _is_src0_constant>
+static inline size_t init_binary_id_req(htp_general_req * req, dspqueue_buffer * bufs, const ggml_tensor * t) {
+    switch (t->op) {
+        case GGML_OP_MUL_MAT_ID:
+            req->op = HTP_OP_MUL_MAT_ID;
+            break;
         case GGML_OP_ADD_ID:
-            req.op = HTP_OP_ADD_ID;
+            req->op = HTP_OP_ADD_ID;
            break;
         default:
-            GGML_ABORT("ggml-hex: unsupported op
-    }
-
-    init_htp_tensor(&req.src0, src0);
-    init_htp_tensor(&req.src1, src1);
-    init_htp_tensor(&req.src2, src2);
-    init_htp_tensor(&req.dst, dst);
-
-    dspqueue_buffer bufs[4];
-    // First buffer = input activations
-    dspqueue_buffers_init(bufs, src0, true, true);
-    // Second buffer = experts bias
-    dspqueue_buffers_init(&bufs[1], src1, true, true);
-    // Third buffer = activated experts
-    dspqueue_buffers_init(&bufs[2], src2, true, true);
-    // Forth buffer = output activations
-    dspqueue_buffers_init(&bufs[3], dst, true, true);
-
-    auto * sess = get_session_from_tensor(src0);
-
-    if (opt_verbose) {
-        hex_print_op_info(op, sess, req.flags);
-        if (opt_verbose > 1) {
-            hex_dump_dspbuf(src0, &bufs[0]);
-            hex_dump_dspbuf(src1, &bufs[1]);
-            hex_dump_dspbuf(src2, &bufs[2]);
-            hex_dump_dspbuf(dst, &bufs[3]);
-        }
+            GGML_ABORT("ggml-hex: unsupported op: %d\n", t->op);
     }

-
-
-
+    // src0: Weights (mulmat) or Input Activations (other op).
+    // If constant, no cache management is needed.
+    // src1: Input Activations (mulmat) or Second Operand (binary op).
+    // src2: Expert IDs (mulmat) or Activated Experts (other op).

-
+    size_t n_bufs = 0;
+    n_bufs += htp_req_buff_init(&req->src0, &bufs[n_bufs], t->src[0], _is_src0_constant ? DSPQBUF_TYPE_CONSTANT : DSPQBUF_TYPE_CPU_WRITE_DSP_READ);
+    n_bufs += htp_req_buff_init(&req->src1, &bufs[n_bufs], t->src[1], DSPQBUF_TYPE_CPU_WRITE_DSP_READ);
+    n_bufs += htp_req_buff_init(&req->src2, &bufs[n_bufs], t->src[2], DSPQBUF_TYPE_CPU_WRITE_DSP_READ);
+    n_bufs += htp_req_buff_init(&req->dst, &bufs[n_bufs], t, DSPQBUF_TYPE_DSP_WRITE_CPU_READ);

-
-        "ggml-hex: %s %s %s %u:%u:%u:%u x %s %u:%u:%u:%u -> %s %u:%u:%u:%u : op-usec %u op-cycles %u op-pkts %u (%f) "
-        "call-usec %llu\n",
-        sess->name.c_str(), ggml_op_name(node->op), src0->name, (uint32_t) src0->ne[0], (uint32_t) src0->ne[1],
-        (uint32_t) src0->ne[2], (uint32_t) src0->ne[3], src1->name, (uint32_t) src1->ne[0], (uint32_t) src1->ne[1],
-        (uint32_t) src1->ne[2], (uint32_t) src1->ne[3], dst->name, (uint32_t) dst->ne[0], (uint32_t) dst->ne[1],
-        (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], sess->prof_usecs, sess->prof_cycles, sess->prof_pkts,
-        (float) sess->prof_cycles / sess->prof_pkts, (unsigned long long) t2 - t1);
+    return n_bufs;
 }

-    static
-
-    const struct ggml_tensor * src1 = op->src[1];
-    const struct ggml_tensor * dst = op;
-
-    uint64_t t1 = 0;
-    uint64_t t2 = 0;
-
-    t1 = ggml_time_us();
-
-    // Construct HTP message
-    htp_general_req req;
-
-    memset(&req, 0, sizeof(htp_general_req));
-    memcpy(&req.op_params, &op->op_params, sizeof(op->op_params));
-    req.flags = flags;
+static inline size_t init_unary_req(htp_general_req * req, dspqueue_buffer * bufs, const ggml_tensor * t) {
+    memcpy(&req->op_params, &t->op_params, sizeof(t->op_params));

     bool supported = false;

-    switch (op->op) {
+    switch (t->op) {
         case GGML_OP_RMS_NORM:
-            req.op = HTP_OP_RMS_NORM;
+            req->op = HTP_OP_RMS_NORM;
             supported = true;
             break;

         case GGML_OP_UNARY:
-            if (ggml_get_unary_op(op) == GGML_UNARY_OP_SILU) {
-                req.op = HTP_OP_UNARY_SILU;
+            if (ggml_get_unary_op(t) == GGML_UNARY_OP_SILU) {
+                req->op = HTP_OP_UNARY_SILU;
+                supported = true;
+            } else if (ggml_get_unary_op(t) == GGML_UNARY_OP_GELU) {
+                req->op = HTP_OP_UNARY_GELU;
                 supported = true;
             }
             break;

         case GGML_OP_GLU:
-            if (ggml_get_glu_op(op) == GGML_GLU_OP_SWIGLU) {
-                req.op = HTP_OP_GLU_SWIGLU;
+            if (ggml_get_glu_op(t) == GGML_GLU_OP_SWIGLU) {
+                req->op = HTP_OP_GLU_SWIGLU;
                 supported = true;
-            } else if (ggml_get_glu_op(op) == GGML_GLU_OP_SWIGLU_OAI) {
-                req.op = HTP_OP_GLU_SWIGLU_OAI;
+            } else if (ggml_get_glu_op(t) == GGML_GLU_OP_SWIGLU_OAI) {
+                req->op = HTP_OP_GLU_SWIGLU_OAI;
                 supported = true;
             }
             break;

         case GGML_OP_SOFT_MAX:
-            req.op = HTP_OP_SOFTMAX;
+            req->op = HTP_OP_SOFTMAX;
             supported = true;
+            break;

         default:
             break;
     }

     if (!supported) {
-        GGML_ABORT("ggml-hex: unary : unsupported op
-    }
-
-    init_htp_tensor(&req.dst, dst);
-    init_htp_tensor(&req.src0, src0);
-    if (src1) {
-        init_htp_tensor(&req.src1, src1);
-    }
-
-    // Use opmask to override flags
-    if (!(opt_opmask & HTP_OPMASK_QUANTIZE)) {
-        req.flags |= HTP_OPFLAGS_SKIP_QUANTIZE;
-    }
-    if (!(opt_opmask & HTP_OPMASK_COMPUTE)) {
-        req.flags |= HTP_OPFLAGS_SKIP_COMPUTE;
-    }
-
-    dspqueue_buffer bufs[3];
-
-    // First buffer = Only Operand of Unary op
-    // This is a buffer that the CPU writes and the DSP reads, so we'll
-    // need to flush CPU caches and invalidate DSP ones. On platforms
-    // with I/O coherency support the framework will automatically skip
-    // cache operations where possible.
-    size_t n_bufs = dspqueue_buffers_init(bufs, src0, true, true);
-
-    // Second buffer(nullable) = Second Operand of Binary op
-    // This is a buffer that the CPU writes and the DSP reads, so we'll
-    // need to flush CPU caches and invalidate DSP ones. On platforms
-    // with I/O coherency support the framework will automatically skip
-    // cache operations where possible.
-    n_bufs += dspqueue_buffers_init(&bufs[n_bufs], src1, true, true);
-
-    // Second or third buffer = Output Activations. We'll handle DSP
-    // Second buffer = Output Activations. We'll handle DSP
-    // cache maintenance in the response message but need to flush
-    // CPU caches to ensure any previously written dirty lines are
-    // written out before writes from the DSP start.
-    n_bufs += dspqueue_buffers_init(&bufs[n_bufs], dst, true, false);
-
-    // Primary DSP session from the src0 tensor
-    auto * sess = get_session_from_tensor(src0);
-
-    if (opt_verbose) {
-        hex_print_op_info(op, sess, req.flags);
-        if (opt_verbose > 1) {
-            hex_dump_dspbuf(src0, &bufs[0]);
-            if (src1) {
-                hex_dump_dspbuf(src1, &bufs[1]);
-                hex_dump_dspbuf(dst, &bufs[2]);
-            } else {
-                hex_dump_dspbuf(dst, &bufs[1]);
-            }
-        }
-    }
-
-    if ((opt_opmask & HTP_OPMASK_QUEUE)) {
-        sess->enqueue(req, bufs, n_bufs, opt_opsync);
+        GGML_ABORT("ggml-hex: unary : unsupported op: %d\n", t->op);
     }

-
+    size_t n_bufs = 0;
+    n_bufs += htp_req_buff_init(&req->src0, &bufs[n_bufs], t->src[0], DSPQBUF_TYPE_CPU_WRITE_DSP_READ);
+    n_bufs += htp_req_buff_init(&req->src1, &bufs[n_bufs], t->src[1], DSPQBUF_TYPE_CPU_WRITE_DSP_READ);
+    n_bufs += htp_req_buff_init(&req->dst, &bufs[n_bufs], t, DSPQBUF_TYPE_DSP_WRITE_CPU_READ);

-
-    HEX_PROFILE(
-        "ggml-hex: %s %s %s %u:%u:%u:%u x %s %u:%u:%u:%u -> %s %u:%u:%u:%u : op-usec %u op-cycles %u op-pkts %u "
-        "(%f) call-usec %llu\n",
-        sess->name.c_str(), ggml_op_name(op->op), src0->name, (uint32_t) src0->ne[0], (uint32_t) src0->ne[1],
-        (uint32_t) src0->ne[2], (uint32_t) src0->ne[3], src1->name, (uint32_t) src1->ne[0], (uint32_t) src1->ne[1],
-        (uint32_t) src1->ne[2], (uint32_t) src1->ne[3], dst->name, (uint32_t) dst->ne[0], (uint32_t) dst->ne[1],
-        (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], sess->prof_usecs, sess->prof_cycles, sess->prof_pkts,
-        (float) sess->prof_cycles / sess->prof_pkts, (unsigned long long) t2 - t1);
-    } else {
-        HEX_PROFILE(
-            "ggml-hex: %s %s %s %u:%u:%u:%u -> %s %u:%u:%u:%u : op-usec %u op-cycles %u op-pkts %u (%f) call-usec "
-            "%llu\n",
-            sess->name.c_str(), ggml_op_name(op->op), src0->name, (uint32_t) src0->ne[0], (uint32_t) src0->ne[1],
-            (uint32_t) src0->ne[2], (uint32_t) src0->ne[3], dst->name, (uint32_t) dst->ne[0], (uint32_t) dst->ne[1],
-            (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], sess->prof_usecs, sess->prof_cycles, sess->prof_pkts,
-            (float) sess->prof_cycles / sess->prof_pkts, (unsigned long long) t2 - t1);
-    }
+    return n_bufs;
 }

-    static
-
-
-    const struct ggml_tensor * src2 = op->src[2];
-    const struct ggml_tensor * dst = op;
-
-    uint64_t t1 = 0;
-    uint64_t t2 = 0;
+static inline size_t init_rope_req(htp_general_req * req, dspqueue_buffer * bufs, const ggml_tensor * t) {
+    memcpy(&req->op_params, &t->op_params, sizeof(t->op_params));
+    req->op = HTP_OP_ROPE;

-
-
-
-
+    size_t n_bufs = 0;
+    n_bufs += htp_req_buff_init(&req->src0, &bufs[n_bufs], t->src[0], DSPQBUF_TYPE_CPU_WRITE_DSP_READ);
+    n_bufs += htp_req_buff_init(&req->src1, &bufs[n_bufs], t->src[1], DSPQBUF_TYPE_CPU_WRITE_DSP_READ);
+    n_bufs += htp_req_buff_init(&req->src2, &bufs[n_bufs], t->src[2], DSPQBUF_TYPE_CPU_WRITE_DSP_READ);
+    n_bufs += htp_req_buff_init(&req->dst, &bufs[n_bufs], t, DSPQBUF_TYPE_DSP_WRITE_CPU_READ);

-
-    memcpy(&req.op_params, &op->op_params, sizeof(op->op_params));
-    req.flags = flags;
-    req.op = HTP_OP_ROPE;
-
-    init_htp_tensor(&req.dst, dst);
-    init_htp_tensor(&req.src0, src0);
-    init_htp_tensor(&req.src1, src1);
-    if (src2) {
-        init_htp_tensor(&req.src2, src2);
-    }
-
-    // Use opmask to override flags
-    if (!(opt_opmask & HTP_OPMASK_QUANTIZE)) {
-        req.flags |= HTP_OPFLAGS_SKIP_QUANTIZE;
-    }
-    if (!(opt_opmask & HTP_OPMASK_COMPUTE)) {
-        req.flags |= HTP_OPFLAGS_SKIP_COMPUTE;
-    }
-
-    dspqueue_buffer bufs[4];
-
-    // First buffer
-    // This is a buffer that the CPU writes and the DSP reads, so we'll
-    // need to flush CPU caches and invalidate DSP ones. On platforms
-    // with I/O coherency support the framework will automatically skip
-    // cache operations where possible.
-    size_t n_bufs = dspqueue_buffers_init(bufs, src0, true, true);
-
-    // Second buffer
-    // This is a buffer that the CPU writes and the DSP reads, so we'll
-    // need to flush CPU caches and invalidate DSP ones. On platforms
-    // with I/O coherency support the framework will automatically skip
-    // cache operations where possible.
-    n_bufs += dspqueue_buffers_init(&bufs[n_bufs], src1, true, true);
-
-    // Third buffer(nullable)
-    // This is a buffer that the CPU writes and the DSP reads, so we'll
-    // need to flush CPU caches and invalidate DSP ones. On platforms
-    // with I/O coherency support the framework will automatically skip
-    // cache operations where possible.
-    n_bufs += dspqueue_buffers_init(&bufs[n_bufs], src2, true, true);
-
-    // Final buffer = Output Activations. We'll handle DSP
-    // Second buffer = Output Activations. We'll handle DSP
-    // cache maintenance in the response message but need to flush
-    // CPU caches to ensure any previously written dirty lines are
-    // written out before writes from the DSP start.
-    n_bufs += dspqueue_buffers_init(&bufs[n_bufs], dst, true, false);
-
-    // Primary DSP session from the src0 tensor
-    auto * sess = get_session_from_tensor(src0);
-
-    if (opt_verbose) {
-        hex_print_op_info(op, sess, req.flags);
-        if (opt_verbose > 1) {
-            hex_dump_dspbuf(src0, &bufs[0]);
-            if (src1) {
-                hex_dump_dspbuf(src1, &bufs[1]);
-                hex_dump_dspbuf(dst, &bufs[2]);
-            } else {
-                hex_dump_dspbuf(dst, &bufs[1]);
-            }
-        }
-    }
-
-    if ((opt_opmask & HTP_OPMASK_QUEUE)) {
-        sess->enqueue(req, bufs, n_bufs, opt_opsync);
-    }
-
-    t2 = ggml_time_us();
-
-    if (src2) {
-        HEX_PROFILE(
-            "ggml-hex: %s %s %s %u:%u:%u:%u x %s %u:%u:%u:%u x %s %u:%u:%u:%u -> %s %u:%u:%u:%u : op-usec %u op-cycles "
-            "%u op-pkts %u (%f) call-usec %llu\n",
-            sess->name.c_str(), ggml_op_name(op->op), src0->name, (uint32_t) src0->ne[0], (uint32_t) src0->ne[1],
-            (uint32_t) src0->ne[2], (uint32_t) src0->ne[3], src1->name, (uint32_t) src1->ne[0], (uint32_t) src1->ne[1],
-            (uint32_t) src1->ne[2], (uint32_t) src1->ne[3], src2->name, (uint32_t) src2->ne[0], (uint32_t) src2->ne[1],
-            (uint32_t) src2->ne[2], (uint32_t) src2->ne[3], dst->name, (uint32_t) dst->ne[0], (uint32_t) dst->ne[1],
-            (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], sess->prof_usecs, sess->prof_cycles, sess->prof_pkts,
-            (float) sess->prof_cycles / sess->prof_pkts, (unsigned long long) t2 - t1);
-    } else {
-        HEX_PROFILE(
-            "ggml-hex: %s %s %s %u:%u:%u:%u x %s %u:%u:%u:%u -> %s %u:%u:%u:%u : op-usec %u op-cycles %u op-pkts %u "
-            "(%f) call-usec %llu\n",
-            sess->name.c_str(), ggml_op_name(op->op), src0->name, (uint32_t) src0->ne[0], (uint32_t) src0->ne[1],
-            (uint32_t) src0->ne[2], (uint32_t) src0->ne[3], src1->name, (uint32_t) src1->ne[0], (uint32_t) src1->ne[1],
-            (uint32_t) src1->ne[2], (uint32_t) src1->ne[3], dst->name, (uint32_t) dst->ne[0], (uint32_t) dst->ne[1],
-            (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], sess->prof_usecs, sess->prof_cycles, sess->prof_pkts,
-            (float) sess->prof_cycles / sess->prof_pkts, (unsigned long long) t2 - t1);
-    }
+    return n_bufs;
 }

 static const char * ggml_backend_hexagon_name(ggml_backend_t backend) {
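The hunk above collapses the separate mul_mat_id / binary / unary / rope wrappers into small `init_*_req` builders that fill the request, register each tensor with `htp_req_buff_init` under a buffer role (`DSPQBUF_TYPE_CONSTANT`, `DSPQBUF_TYPE_CPU_WRITE_DSP_READ`, `DSPQBUF_TYPE_DSP_WRITE_CPU_READ`), and return the buffer count. The sketch below only illustrates how such roles can map onto the flush/invalidate pairs the removed code passed as two booleans; the `DEMO_*` names and the `cache_ops_for` helper are illustrative stand-ins, not the vendored API:

```cpp
#include <cstdio>

// Hypothetical mirror of the buffer roles used by the new htp_req_buff_init() calls.
enum dspqbuf_type {
    DEMO_BUF_CONSTANT,            // e.g. weights: static content, no cache maintenance
    DEMO_BUF_CPU_WRITE_DSP_READ,  // inputs: flush CPU caches, invalidate DSP caches
    DEMO_BUF_DSP_WRITE_CPU_READ,  // outputs: flush CPU caches; DSP side handled in the response
};

struct cache_ops {
    bool flush_cpu;
    bool invalidate_dsp;
};

// Assumed mapping from role to the (flush, invalidate) pair the old code spelled out per buffer.
static cache_ops cache_ops_for(dspqbuf_type t) {
    switch (t) {
        case DEMO_BUF_CONSTANT:           return { false, false };
        case DEMO_BUF_CPU_WRITE_DSP_READ: return { true,  true  };
        case DEMO_BUF_DSP_WRITE_CPU_READ: return { true,  false };
    }
    return { true, true }; // conservative default
}

int main() {
    const dspqbuf_type roles[] = { DEMO_BUF_CONSTANT, DEMO_BUF_CPU_WRITE_DSP_READ, DEMO_BUF_DSP_WRITE_CPU_READ };
    for (dspqbuf_type r : roles) {
        cache_ops c = cache_ops_for(r);
        std::printf("role=%d flush_cpu=%d invalidate_dsp=%d\n", (int) r, (int) c.flush_cpu, (int) c.invalidate_dsp);
    }
    return 0;
}
```

Expressing the role once per tensor keeps the cache policy in one place instead of repeating the boolean pair in every op wrapper.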
@@ -2888,7 +2343,7 @@ static void ggml_backend_hexagon_free(ggml_backend_t backend) {
 }

 static inline bool op_reuse_src1(const ggml_tensor * op1, const ggml_tensor * op0) {
-    return (op0 && op0->src[1] == op1->src[1]);
+    return (op0 && op0->src[1] == op1->src[1] && ggml_is_quantized(op0->src[0]->type) && ggml_is_quantized(op1->src[1]->type));
 }

 static inline bool is_compute_op(ggml_tensor *node)
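`op_reuse_src1` now allows reusing a previously prepared src1 only when both ops are on the quantized path. A simplified stand-in for that guard, with a hypothetical `demo_tensor` carrying a plain `quantized` flag in place of `ggml_is_quantized`:

```cpp
#include <cstdio>

// Simplified stand-ins; the real check walks ggml_tensor and ggml_is_quantized().
struct demo_tensor {
    const demo_tensor * src[2];
    bool quantized;
};

// src1 quantized for the previous matmul can be reused by the next op
// only if both ops actually consume quantized data.
static bool demo_reuse_src1(const demo_tensor * op1, const demo_tensor * op0) {
    return op0 && op0->src[1] == op1->src[1] && op0->src[0]->quantized && op1->src[1]->quantized;
}

int main() {
    demo_tensor w = { { nullptr, nullptr }, true  };
    demo_tensor x = { { nullptr, nullptr }, true  };
    demo_tensor a = { { &w, &x }, false };
    demo_tensor b = { { &w, &x }, false };
    std::printf("reuse=%d\n", (int) demo_reuse_src1(&b, &a)); // 1: same src1, quantized path
    return 0;
}
```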
@@ -2938,41 +2393,50 @@ static ggml_status ggml_backend_hexagon_graph_compute(ggml_backend_t backend, gg

         switch (node->op) {
             case GGML_OP_MUL_MAT:
-
+                if (ggml_is_quantized(node->src[0]->type)) {
+                    ggml_hexagon_dispatch_op<init_binary_req<true>>(sess, node, flags);
+                } else {
+                    ggml_hexagon_dispatch_op<init_binary_req<false>>(sess, node, flags);
+                }
                 prev_quant_op = node;
                 break;
             case GGML_OP_MUL_MAT_ID:
-
+                if (ggml_is_quantized(node->src[0]->type)) {
+                    ggml_hexagon_dispatch_op<init_binary_id_req<true>>(sess, node, flags);
+                } else {
+                    ggml_hexagon_dispatch_op<init_binary_id_req<false>>(sess, node, flags);
+                }
                 prev_quant_op = node;
                 break;
             case GGML_OP_MUL:
             case GGML_OP_ADD:
             case GGML_OP_SUB:
-
+                ggml_hexagon_dispatch_op<init_binary_req<false>>(sess, node, flags);
                 break;
             case GGML_OP_ADD_ID:
-
+                ggml_hexagon_dispatch_op<init_binary_id_req<false>>(sess, node, flags);
                 break;
             case GGML_OP_RMS_NORM:
-
+                ggml_hexagon_dispatch_op<init_unary_req>(sess, node, flags);
                 break;
             case GGML_OP_UNARY:
-                if (ggml_get_unary_op(node) == GGML_UNARY_OP_SILU)
-
+                if ((ggml_get_unary_op(node) == GGML_UNARY_OP_SILU) ||
+                    (ggml_get_unary_op(node) == GGML_UNARY_OP_GELU)) {
+                    ggml_hexagon_dispatch_op<init_unary_req>(sess, node, flags);
                 }
                 break;
             case GGML_OP_GLU:
                 if ((ggml_get_glu_op(node) == GGML_GLU_OP_SWIGLU) ||
-
-
+                    (ggml_get_glu_op(node) == GGML_GLU_OP_SWIGLU_OAI)) {
+                    ggml_hexagon_dispatch_op<init_unary_req>(sess, node, flags);
                 }
                 break;
             case GGML_OP_SOFT_MAX:
-
+                ggml_hexagon_dispatch_op<init_unary_req>(sess, node, flags);
                 break;

             case GGML_OP_ROPE:
-
+                ggml_hexagon_dispatch_op<init_rope_req>(sess, node, flags);
                 break;

             default:
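In `ggml_backend_hexagon_graph_compute`, each op is now routed through `ggml_hexagon_dispatch_op<...>`, parameterized by the request builder (and, for matmuls, by whether src0 is constant quantized weight data). A self-contained sketch of that dispatch shape using a `template <auto>` parameter; all `demo_*` names are hypothetical, not the vendored functions:

```cpp
#include <cstddef>
#include <cstdio>

// Simplified stand-ins for the request/op types used by the real dispatcher.
struct demo_req  { int op; };
struct demo_buf  { int id; };
struct demo_node { int op; };

// Request builders fill the request and the buffer table and return the buffer count.
template <bool is_src0_constant>
size_t demo_init_binary_req(demo_req * req, demo_buf * bufs, const demo_node * n) {
    req->op    = n->op;
    bufs[0].id = is_src0_constant ? 0 : 1; // constant weights vs. CPU-written src0
    bufs[1].id = 2;
    bufs[2].id = 3;
    return 3;
}

// One dispatcher, parameterized by the builder: build the request, then hand it off.
template <auto init_req_func>
void demo_dispatch_op(const demo_node * n) {
    demo_req req{};
    demo_buf bufs[4];
    size_t n_bufs = init_req_func(&req, bufs, n);
    std::printf("op=%d n_bufs=%zu\n", req.op, n_bufs); // stands in for sess->enqueue(...)
}

int main() {
    demo_node node{42};
    demo_dispatch_op<demo_init_binary_req<true>>(&node);  // quantized / constant weights path
    demo_dispatch_op<demo_init_binary_req<false>>(&node); // float weights path
    return 0;
}
```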
@@ -3101,8 +2565,8 @@ static void ggml_backend_hexagon_graph_optimize(ggml_backend_t backend, ggml_cgr
     // and perform the reorder over the fused nodes. after the reorder is done, we unfuse
     for (int i = 0; i < n; i++) {
         node_info node = {
-            /*.node =*/
-            /*.fused =*/
+            /*.node =*/gf->nodes[i],
+            /*.fused =*/{},
         };

         // fuse only ops that start with these operations
@@ -3253,11 +2717,39 @@ static ggml_backend_buffer_type_t ggml_backend_hexagon_device_get_repack_buffer_
     return &sess->repack_buffer_type;
 }

+static bool ggml_hexagon_supported_buffer(ggml_hexagon_session *sess, const struct ggml_tensor * t) {
+    if (t && t->buffer) {
+        if (ggml_backend_buffer_is_hexagon(t->buffer) == false) return false;      // not our buffer
+        if (ggml_backend_hexagon_buffer_get_sess(t->buffer) != sess) return false; // wrong session
+    }
+    return true;
+}
+
+static bool ggml_hexagon_supported_buffers(ggml_hexagon_session *sess, const struct ggml_tensor * t) {
+    // all srcs & dsts must be mapped to the same session
+    if (!ggml_hexagon_supported_buffer(sess, t)) {
+        return false;
+    }
+
+    for (int i = 0; i < GGML_MAX_SRC; i++) {
+        if (!ggml_hexagon_supported_buffer(sess, t->src[i])) {
+            return false;
+        }
+    }
+
+    return true;
+}
+
 static bool ggml_backend_hexagon_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) {
     auto sess = static_cast<ggml_hexagon_session *>(dev->context);

-
+    // all srcs & dsts must be mapped to the same session
+    if (!ggml_hexagon_supported_buffers(sess, op)) {
+        ggml_hexagon_dump_op_supp(sess->name, op, false);
+        return false;
+    }

+    bool supp = false;
     switch (op->op) {
         case GGML_OP_NONE:
         case GGML_OP_RESHAPE:
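The new `ggml_hexagon_supported_buffer(s)` helpers reject an op whose tensors are allocated in a non-Hexagon buffer or in a buffer owned by a different session, before any op-specific checks run. A simplified sketch of that ownership walk with stand-in `demo_*` types; the real code relies on `ggml_backend_buffer_is_hexagon` and `ggml_backend_hexagon_buffer_get_sess`:

```cpp
#include <cstdio>

constexpr int DEMO_MAX_SRC = 4; // stand-in for GGML_MAX_SRC

struct demo_session;
struct demo_buffer { const demo_session * sess; }; // owning session; null models a foreign buffer
struct demo_tensor {
    const demo_buffer * buffer;
    const demo_tensor * src[DEMO_MAX_SRC];
};
struct demo_session { int id; };

static bool buffer_ok(const demo_session * sess, const demo_tensor * t) {
    if (t && t->buffer) {
        if (t->buffer->sess == nullptr) return false; // not our buffer type
        if (t->buffer->sess != sess)    return false; // mapped to a different session
    }
    return true; // unallocated tensors are decided later
}

static bool buffers_ok(const demo_session * sess, const demo_tensor * op) {
    if (!buffer_ok(sess, op)) return false;
    for (int i = 0; i < DEMO_MAX_SRC; i++) {
        if (!buffer_ok(sess, op->src[i])) return false;
    }
    return true;
}

int main() {
    demo_session s0{0}, s1{1};
    demo_buffer  b0{&s0}, b1{&s1};
    demo_tensor  src{&b1, {}};
    demo_tensor  op{&b0, {&src}};
    std::printf("supported=%d\n", (int) buffers_ok(&s0, &op)); // 0: src lives in another session
    return 0;
}
```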
@@ -3294,17 +2786,21 @@ static bool ggml_backend_hexagon_device_supports_op(ggml_backend_dev_t dev, cons
             break;

         case GGML_OP_UNARY:
-
-
+            {
+                const auto unary_op = ggml_get_unary_op(op);
+                if (unary_op == GGML_UNARY_OP_SILU || unary_op == GGML_UNARY_OP_GELU) {
+                    supp = ggml_hexagon_supported_activations(sess, op);
+                }
+                break;
             }
-            break;
-
         case GGML_OP_GLU:
-
-
+            {
+                const auto glu_op = ggml_get_glu_op(op);
+                if ((glu_op == GGML_GLU_OP_SWIGLU) || (glu_op == GGML_GLU_OP_SWIGLU_OAI)) {
+                    supp = ggml_hexagon_supported_activations(sess, op);
+                }
+                break;
             }
-            break;
-
         case GGML_OP_ROPE:
             supp = ggml_hexagon_supported_rope(sess, op);
             break;
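The UNARY and GLU cases above now open a braced scope so the `unary_op`/`glu_op` locals can be declared inside the case, with the `break` moved inside that scope. A minimal illustration of the idiom, with illustrative names only:

```cpp
#include <cstdio>

enum demo_op { DEMO_OP_UNARY, DEMO_OP_OTHER };

static bool demo_supports(demo_op op, int sub_op) {
    bool supp = false;
    switch (op) {
        case DEMO_OP_UNARY:
            { // braces open a scope so a local can be declared inside the case
                const int unary_op = sub_op;
                if (unary_op == 1 || unary_op == 2) {
                    supp = true;
                }
                break; // break stays inside the scope, matching the new layout
            }
        default:
            break;
    }
    return supp;
}

int main() {
    std::printf("%d %d\n", (int) demo_supports(DEMO_OP_UNARY, 1), (int) demo_supports(DEMO_OP_UNARY, 3)); // 1 0
    return 0;
}
```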
@@ -3313,26 +2809,8 @@ static bool ggml_backend_hexagon_device_supports_op(ggml_backend_dev_t dev, cons
         break;
     }

-
-        char dims[64 * GGML_MAX_SRC];
-        char strides[64 * GGML_MAX_SRC];
-        char types[16 * GGML_MAX_SRC];
-        char buffs[64 * GGML_MAX_SRC];
-        char names[64 * GGML_MAX_SRC];
-
-        hex_format_op_dims(dims, op);
-        hex_format_op_strides(strides, op);
-        hex_format_op_types(types, op);
-        hex_format_op_buffs(buffs, op);
-        hex_format_op_names(names, op);
-
-        HEX_VERBOSE("ggml-hex: %s device-supports-op %s : %s : %s : %s : %s : %s : (%d)\n", sess->name.c_str(),
-                    ggml_op_name(op->op), names, dims, types, strides, buffs, (int) supp);
-    }
-
+    ggml_hexagon_dump_op_supp(sess->name, op, supp);
     return supp;
-
-    GGML_UNUSED(dev);
 }

 static bool ggml_backend_hexagon_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
@@ -3401,7 +2879,7 @@ ggml_hexagon_registry::ggml_hexagon_registry(ggml_backend_reg_t reg) {
         }
     }

-    if(opt_arch < 75) {
+    if (opt_arch < 75) {
         opt_ndev = 1;
         GGML_LOG_WARN("ggml-hex: forcing ndev to 1 for SoCs archs lower than v75.\n");
     }
@@ -3410,11 +2888,11 @@ ggml_hexagon_registry::ggml_hexagon_registry(ggml_backend_reg_t reg) {

     // Create devices / sessions
     for (size_t i = 0; i < opt_ndev; i++) {
-        devices[i].iface   = ggml_backend_hexagon_device_i;
-        devices[i].reg     = reg;
+        devices[i].iface = ggml_backend_hexagon_device_i;
+        devices[i].reg = reg;
         try {
             devices[i].context = new ggml_hexagon_session(i, &devices[i]);
-        } catch (std::exception
+        } catch (const std::exception & exc) {
             GGML_LOG_ERROR("ggml-hex: failed to create device/session %zu\n", i);
             devices[i].context = nullptr;
         }