whispercpp 1.3.4 → 1.3.5
This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +60 -43
- data/ext/extconf.rb +2 -2
- data/ext/ruby_whisper.c +14 -2
- data/ext/ruby_whisper.h +39 -0
- data/ext/ruby_whisper_context.c +22 -22
- data/ext/ruby_whisper_model.c +12 -12
- data/ext/ruby_whisper_params.c +47 -23
- data/ext/ruby_whisper_segment.c +84 -19
- data/ext/ruby_whisper_token.c +351 -0
- data/ext/ruby_whisper_transcribe.cpp +1 -1
- data/ext/ruby_whisper_vad_context.c +75 -0
- data/ext/ruby_whisper_vad_context_detect.cpp +50 -0
- data/ext/ruby_whisper_vad_segment.c +139 -0
- data/ext/ruby_whisper_vad_segments.c +106 -0
- data/ext/sources/CMakeLists.txt +4 -1
- data/ext/sources/bindings/javascript/package.json +1 -1
- data/ext/sources/cmake/arm64-apple-clang.cmake +16 -0
- data/ext/sources/cmake/arm64-windows-llvm.cmake +16 -0
- data/ext/sources/cmake/riscv64-spacemit-linux-gnu-gcc.cmake +29 -0
- data/ext/sources/cmake/x64-windows-llvm.cmake +5 -0
- data/ext/sources/examples/addon.node/vad-example.js +2 -2
- data/ext/sources/examples/cli/cli.cpp +121 -112
- data/ext/sources/examples/lsp/CMakeLists.txt +2 -1
- data/ext/sources/examples/quantize/CMakeLists.txt +2 -1
- data/ext/sources/examples/server/server.cpp +10 -11
- data/ext/sources/examples/talk-llama/CMakeLists.txt +5 -1
- data/ext/sources/examples/talk-llama/llama-adapter.cpp +12 -3
- data/ext/sources/examples/talk-llama/llama-adapter.h +7 -1
- data/ext/sources/examples/talk-llama/llama-arch.cpp +2046 -1974
- data/ext/sources/examples/talk-llama/llama-arch.h +67 -2
- data/ext/sources/examples/talk-llama/llama-batch.cpp +75 -33
- data/ext/sources/examples/talk-llama/llama-batch.h +17 -4
- data/ext/sources/examples/talk-llama/llama-chat.cpp +79 -3
- data/ext/sources/examples/talk-llama/llama-chat.h +4 -0
- data/ext/sources/examples/talk-llama/llama-context.cpp +775 -78
- data/ext/sources/examples/talk-llama/llama-context.h +57 -9
- data/ext/sources/examples/talk-llama/llama-cparams.h +1 -0
- data/ext/sources/examples/talk-llama/llama-grammar.cpp +288 -53
- data/ext/sources/examples/talk-llama/llama-grammar.h +22 -1
- data/ext/sources/examples/talk-llama/llama-graph.cpp +381 -64
- data/ext/sources/examples/talk-llama/llama-graph.h +103 -13
- data/ext/sources/examples/talk-llama/llama-hparams.cpp +26 -2
- data/ext/sources/examples/talk-llama/llama-hparams.h +41 -10
- data/ext/sources/examples/talk-llama/llama-impl.cpp +7 -3
- data/ext/sources/examples/talk-llama/llama-impl.h +1 -1
- data/ext/sources/examples/talk-llama/llama-kv-cache-iswa.cpp +5 -3
- data/ext/sources/examples/talk-llama/llama-kv-cache.cpp +145 -65
- data/ext/sources/examples/talk-llama/llama-kv-cache.h +22 -7
- data/ext/sources/examples/talk-llama/llama-kv-cells.h +44 -2
- data/ext/sources/examples/talk-llama/llama-memory-hybrid.cpp +12 -10
- data/ext/sources/examples/talk-llama/llama-memory-recurrent.cpp +32 -19
- data/ext/sources/examples/talk-llama/llama-memory-recurrent.h +2 -2
- data/ext/sources/examples/talk-llama/llama-mmap.cpp +172 -37
- data/ext/sources/examples/talk-llama/llama-mmap.h +8 -3
- data/ext/sources/examples/talk-llama/llama-model-loader.cpp +91 -9
- data/ext/sources/examples/talk-llama/llama-model-loader.h +6 -0
- data/ext/sources/examples/talk-llama/llama-model-saver.cpp +3 -0
- data/ext/sources/examples/talk-llama/llama-model.cpp +1529 -13134
- data/ext/sources/examples/talk-llama/llama-model.h +44 -3
- data/ext/sources/examples/talk-llama/llama-quant.cpp +8 -23
- data/ext/sources/examples/talk-llama/llama-sampling.cpp +1294 -198
- data/ext/sources/examples/talk-llama/llama-sampling.h +19 -7
- data/ext/sources/examples/talk-llama/llama-vocab.cpp +133 -37
- data/ext/sources/examples/talk-llama/llama-vocab.h +45 -40
- data/ext/sources/examples/talk-llama/llama.cpp +729 -2
- data/ext/sources/examples/talk-llama/llama.h +152 -14
- data/ext/sources/examples/talk-llama/models/afmoe.cpp +191 -0
- data/ext/sources/examples/talk-llama/models/apertus.cpp +125 -0
- data/ext/sources/examples/talk-llama/models/arcee.cpp +135 -0
- data/ext/sources/examples/talk-llama/models/arctic.cpp +138 -0
- data/ext/sources/examples/talk-llama/models/arwkv7.cpp +86 -0
- data/ext/sources/examples/talk-llama/models/baichuan.cpp +122 -0
- data/ext/sources/examples/talk-llama/models/bailingmoe.cpp +144 -0
- data/ext/sources/examples/talk-llama/models/bailingmoe2.cpp +135 -0
- data/ext/sources/examples/talk-llama/models/bert.cpp +178 -0
- data/ext/sources/examples/talk-llama/models/bitnet.cpp +160 -0
- data/ext/sources/examples/talk-llama/models/bloom.cpp +101 -0
- data/ext/sources/examples/talk-llama/models/chameleon.cpp +178 -0
- data/ext/sources/examples/talk-llama/models/chatglm.cpp +132 -0
- data/ext/sources/examples/talk-llama/models/codeshell.cpp +111 -0
- data/ext/sources/examples/talk-llama/models/cogvlm.cpp +102 -0
- data/ext/sources/examples/talk-llama/models/cohere2-iswa.cpp +134 -0
- data/ext/sources/examples/talk-llama/models/command-r.cpp +122 -0
- data/ext/sources/examples/talk-llama/models/dbrx.cpp +123 -0
- data/ext/sources/examples/talk-llama/models/deci.cpp +135 -0
- data/ext/sources/examples/talk-llama/models/deepseek.cpp +144 -0
- data/ext/sources/examples/talk-llama/models/deepseek2.cpp +259 -0
- data/ext/sources/examples/talk-llama/models/dots1.cpp +134 -0
- data/ext/sources/examples/talk-llama/models/dream.cpp +105 -0
- data/ext/sources/examples/talk-llama/models/ernie4-5-moe.cpp +150 -0
- data/ext/sources/examples/talk-llama/models/ernie4-5.cpp +110 -0
- data/ext/sources/examples/talk-llama/models/exaone.cpp +114 -0
- data/ext/sources/examples/talk-llama/models/exaone4.cpp +123 -0
- data/ext/sources/examples/talk-llama/models/falcon-h1.cpp +113 -0
- data/ext/sources/examples/talk-llama/models/falcon.cpp +120 -0
- data/ext/sources/examples/talk-llama/models/gemma-embedding.cpp +116 -0
- data/ext/sources/examples/talk-llama/models/gemma.cpp +112 -0
- data/ext/sources/examples/talk-llama/models/gemma2-iswa.cpp +128 -0
- data/ext/sources/examples/talk-llama/models/gemma3.cpp +155 -0
- data/ext/sources/examples/talk-llama/models/gemma3n-iswa.cpp +384 -0
- data/ext/sources/examples/talk-llama/models/glm4-moe.cpp +170 -0
- data/ext/sources/examples/talk-llama/models/glm4.cpp +150 -0
- data/ext/sources/examples/talk-llama/models/gpt2.cpp +105 -0
- data/ext/sources/examples/talk-llama/models/gptneox.cpp +144 -0
- data/ext/sources/examples/talk-llama/models/granite-hybrid.cpp +196 -0
- data/ext/sources/examples/talk-llama/models/granite.cpp +211 -0
- data/ext/sources/examples/talk-llama/models/graph-context-mamba.cpp +283 -0
- data/ext/sources/examples/talk-llama/models/grok.cpp +159 -0
- data/ext/sources/examples/talk-llama/models/grovemoe.cpp +141 -0
- data/ext/sources/examples/talk-llama/models/hunyuan-dense.cpp +132 -0
- data/ext/sources/examples/talk-llama/models/hunyuan-moe.cpp +154 -0
- data/ext/sources/examples/talk-llama/models/internlm2.cpp +120 -0
- data/ext/sources/examples/talk-llama/models/jais.cpp +86 -0
- data/ext/sources/examples/talk-llama/models/jamba.cpp +106 -0
- data/ext/sources/examples/talk-llama/models/lfm2.cpp +175 -0
- data/ext/sources/examples/talk-llama/models/llada-moe.cpp +122 -0
- data/ext/sources/examples/talk-llama/models/llada.cpp +99 -0
- data/ext/sources/examples/talk-llama/models/llama-iswa.cpp +178 -0
- data/ext/sources/examples/talk-llama/models/llama.cpp +168 -0
- data/ext/sources/examples/talk-llama/models/maincoder.cpp +117 -0
- data/ext/sources/examples/talk-llama/models/mamba.cpp +55 -0
- data/ext/sources/examples/talk-llama/models/mimo2-iswa.cpp +123 -0
- data/ext/sources/examples/talk-llama/models/minicpm3.cpp +199 -0
- data/ext/sources/examples/talk-llama/models/minimax-m2.cpp +124 -0
- data/ext/sources/examples/talk-llama/models/mistral3.cpp +160 -0
- data/ext/sources/examples/talk-llama/models/models.h +569 -0
- data/ext/sources/examples/talk-llama/models/modern-bert.cpp +116 -0
- data/ext/sources/examples/talk-llama/models/mpt.cpp +126 -0
- data/ext/sources/examples/talk-llama/models/nemotron-h.cpp +150 -0
- data/ext/sources/examples/talk-llama/models/nemotron.cpp +122 -0
- data/ext/sources/examples/talk-llama/models/neo-bert.cpp +104 -0
- data/ext/sources/examples/talk-llama/models/olmo.cpp +121 -0
- data/ext/sources/examples/talk-llama/models/olmo2.cpp +150 -0
- data/ext/sources/examples/talk-llama/models/olmoe.cpp +124 -0
- data/ext/sources/examples/talk-llama/models/openai-moe-iswa.cpp +127 -0
- data/ext/sources/examples/talk-llama/models/openelm.cpp +124 -0
- data/ext/sources/examples/talk-llama/models/orion.cpp +123 -0
- data/ext/sources/examples/talk-llama/models/pangu-embedded.cpp +121 -0
- data/ext/sources/examples/talk-llama/models/phi2.cpp +121 -0
- data/ext/sources/examples/talk-llama/models/phi3.cpp +152 -0
- data/ext/sources/examples/talk-llama/models/plamo.cpp +110 -0
- data/ext/sources/examples/talk-llama/models/plamo2.cpp +316 -0
- data/ext/sources/examples/talk-llama/models/plamo3.cpp +128 -0
- data/ext/sources/examples/talk-llama/models/plm.cpp +168 -0
- data/ext/sources/examples/talk-llama/models/qwen.cpp +108 -0
- data/ext/sources/examples/talk-llama/models/qwen2.cpp +126 -0
- data/ext/sources/examples/talk-llama/models/qwen2moe.cpp +151 -0
- data/ext/sources/examples/talk-llama/models/qwen2vl.cpp +117 -0
- data/ext/sources/examples/talk-llama/models/qwen3.cpp +117 -0
- data/ext/sources/examples/talk-llama/models/qwen3moe.cpp +124 -0
- data/ext/sources/examples/talk-llama/models/qwen3next.cpp +873 -0
- data/ext/sources/examples/talk-llama/models/qwen3vl-moe.cpp +149 -0
- data/ext/sources/examples/talk-llama/models/qwen3vl.cpp +141 -0
- data/ext/sources/examples/talk-llama/models/refact.cpp +94 -0
- data/ext/sources/examples/talk-llama/models/rnd1.cpp +126 -0
- data/ext/sources/examples/talk-llama/models/rwkv6-base.cpp +162 -0
- data/ext/sources/examples/talk-llama/models/rwkv6.cpp +94 -0
- data/ext/sources/examples/talk-llama/models/rwkv6qwen2.cpp +86 -0
- data/ext/sources/examples/talk-llama/models/rwkv7-base.cpp +135 -0
- data/ext/sources/examples/talk-llama/models/rwkv7.cpp +90 -0
- data/ext/sources/examples/talk-llama/models/seed-oss.cpp +124 -0
- data/ext/sources/examples/talk-llama/models/smallthinker.cpp +126 -0
- data/ext/sources/examples/talk-llama/models/smollm3.cpp +128 -0
- data/ext/sources/examples/talk-llama/models/stablelm.cpp +146 -0
- data/ext/sources/examples/talk-llama/models/starcoder.cpp +100 -0
- data/ext/sources/examples/talk-llama/models/starcoder2.cpp +121 -0
- data/ext/sources/examples/talk-llama/models/t5-dec.cpp +166 -0
- data/ext/sources/examples/talk-llama/models/t5-enc.cpp +96 -0
- data/ext/sources/examples/talk-llama/models/wavtokenizer-dec.cpp +149 -0
- data/ext/sources/examples/talk-llama/models/xverse.cpp +108 -0
- data/ext/sources/examples/talk-llama/unicode.cpp +102 -16
- data/ext/sources/examples/vad-speech-segments/CMakeLists.txt +1 -1
- data/ext/sources/examples/whisper.wasm/index-tmpl.html +1 -1
- data/ext/sources/ggml/CMakeLists.txt +82 -54
- data/ext/sources/ggml/include/ggml-alloc.h +9 -0
- data/ext/sources/ggml/include/ggml-backend.h +4 -1
- data/ext/sources/ggml/include/ggml-cpu.h +1 -0
- data/ext/sources/ggml/include/ggml-hexagon.h +19 -0
- data/ext/sources/ggml/include/ggml-rpc.h +8 -11
- data/ext/sources/ggml/include/ggml-zendnn.h +22 -0
- data/ext/sources/ggml/include/ggml.h +190 -12
- data/ext/sources/ggml/src/CMakeLists.txt +82 -11
- data/ext/sources/ggml/src/ggml-alloc.c +124 -41
- data/ext/sources/ggml/src/ggml-backend-impl.h +1 -4
- data/ext/sources/ggml/src/ggml-backend-reg.cpp +27 -3
- data/ext/sources/ggml/src/ggml-backend.cpp +71 -21
- data/ext/sources/ggml/src/ggml-blas/CMakeLists.txt +17 -3
- data/ext/sources/ggml/src/ggml-blas/ggml-blas.cpp +5 -9
- data/ext/sources/ggml/src/ggml-cann/acl_tensor.cpp +57 -45
- data/ext/sources/ggml/src/ggml-cann/acl_tensor.h +138 -47
- data/ext/sources/ggml/src/ggml-cann/aclnn_ops.cpp +2179 -1696
- data/ext/sources/ggml/src/ggml-cann/aclnn_ops.h +238 -317
- data/ext/sources/ggml/src/ggml-cann/common.h +283 -208
- data/ext/sources/ggml/src/ggml-cann/ggml-cann.cpp +626 -776
- data/ext/sources/ggml/src/ggml-cpu/CMakeLists.txt +156 -86
- data/ext/sources/ggml/src/ggml-cpu/amx/amx.cpp +1 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/arm/cpu-feats.cpp +4 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/arm/quants.c +428 -26
- data/ext/sources/ggml/src/ggml-cpu/arch/arm/repack.cpp +1004 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/loongarch/quants.c +4 -5
- data/ext/sources/ggml/src/ggml-cpu/arch/riscv/cpu-feats.cpp +38 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/riscv/quants.c +108 -49
- data/ext/sources/ggml/src/ggml-cpu/arch/s390/cpu-feats.cpp +50 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/x86/repack.cpp +6 -6
- data/ext/sources/ggml/src/ggml-cpu/arch-fallback.h +50 -2
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-impl.h +5 -3
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.c +195 -71
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.cpp +4 -0
- data/ext/sources/ggml/src/ggml-cpu/kleidiai/kernels.cpp +573 -106
- data/ext/sources/ggml/src/ggml-cpu/kleidiai/kernels.h +33 -44
- data/ext/sources/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +298 -112
- data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm-ppc.h +333 -0
- data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.cpp +819 -125
- data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.h +6 -0
- data/ext/sources/ggml/src/ggml-cpu/ops.cpp +708 -431
- data/ext/sources/ggml/src/ggml-cpu/ops.h +5 -4
- data/ext/sources/ggml/src/ggml-cpu/repack.cpp +671 -31
- data/ext/sources/ggml/src/ggml-cpu/repack.h +14 -0
- data/ext/sources/ggml/src/ggml-cpu/simd-mappings.h +41 -43
- data/ext/sources/ggml/src/ggml-cpu/spacemit/ime.cpp +3 -2
- data/ext/sources/ggml/src/ggml-cpu/unary-ops.cpp +151 -0
- data/ext/sources/ggml/src/ggml-cpu/unary-ops.h +7 -0
- data/ext/sources/ggml/src/ggml-cpu/vec.cpp +124 -1
- data/ext/sources/ggml/src/ggml-cpu/vec.h +261 -146
- data/ext/sources/ggml/src/ggml-cuda/CMakeLists.txt +72 -1
- data/ext/sources/ggml/src/ggml-cuda/argmax.cu +2 -2
- data/ext/sources/ggml/src/ggml-cuda/argsort.cu +123 -6
- data/ext/sources/ggml/src/ggml-cuda/argsort.cuh +16 -0
- data/ext/sources/ggml/src/ggml-cuda/binbcast.cu +1 -1
- data/ext/sources/ggml/src/ggml-cuda/common.cuh +353 -80
- data/ext/sources/ggml/src/ggml-cuda/convert.cuh +10 -0
- data/ext/sources/ggml/src/ggml-cuda/cpy-utils.cuh +1 -1
- data/ext/sources/ggml/src/ggml-cuda/cpy.cu +339 -246
- data/ext/sources/ggml/src/ggml-cuda/cpy.cuh +1 -5
- data/ext/sources/ggml/src/ggml-cuda/cumsum.cu +307 -0
- data/ext/sources/ggml/src/ggml-cuda/cumsum.cuh +5 -0
- data/ext/sources/ggml/src/ggml-cuda/diag.cu +77 -0
- data/ext/sources/ggml/src/ggml-cuda/diag.cuh +5 -0
- data/ext/sources/ggml/src/ggml-cuda/fattn-common.cuh +31 -21
- data/ext/sources/ggml/src/ggml-cuda/fattn-mma-f16.cuh +663 -596
- data/ext/sources/ggml/src/ggml-cuda/fattn-tile.cu +35 -741
- data/ext/sources/ggml/src/ggml-cuda/fattn-tile.cuh +1241 -0
- data/ext/sources/ggml/src/ggml-cuda/fattn-vec.cuh +30 -37
- data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cu +14 -13
- data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cuh +48 -0
- data/ext/sources/ggml/src/ggml-cuda/fattn.cu +83 -37
- data/ext/sources/ggml/src/ggml-cuda/fill.cu +37 -0
- data/ext/sources/ggml/src/ggml-cuda/fill.cuh +3 -0
- data/ext/sources/ggml/src/ggml-cuda/ggml-cuda.cu +1155 -164
- data/ext/sources/ggml/src/ggml-cuda/mean.cu +5 -4
- data/ext/sources/ggml/src/ggml-cuda/mma.cuh +741 -48
- data/ext/sources/ggml/src/ggml-cuda/mmf.cu +60 -12
- data/ext/sources/ggml/src/ggml-cuda/mmf.cuh +381 -42
- data/ext/sources/ggml/src/ggml-cuda/mmid.cu +164 -0
- data/ext/sources/ggml/src/ggml-cuda/mmid.cuh +5 -0
- data/ext/sources/ggml/src/ggml-cuda/mmq.cu +69 -176
- data/ext/sources/ggml/src/ggml-cuda/mmq.cuh +498 -171
- data/ext/sources/ggml/src/ggml-cuda/mmvf.cu +375 -79
- data/ext/sources/ggml/src/ggml-cuda/mmvf.cuh +3 -2
- data/ext/sources/ggml/src/ggml-cuda/mmvq.cu +241 -95
- data/ext/sources/ggml/src/ggml-cuda/mmvq.cuh +1 -1
- data/ext/sources/ggml/src/ggml-cuda/pad.cu +64 -33
- data/ext/sources/ggml/src/ggml-cuda/quantize.cu +151 -0
- data/ext/sources/ggml/src/ggml-cuda/quantize.cuh +14 -0
- data/ext/sources/ggml/src/ggml-cuda/rope.cu +192 -77
- data/ext/sources/ggml/src/ggml-cuda/rope.cuh +2 -0
- data/ext/sources/ggml/src/ggml-cuda/set-rows.cu +101 -47
- data/ext/sources/ggml/src/ggml-cuda/set.cu +39 -0
- data/ext/sources/ggml/src/ggml-cuda/set.cuh +7 -0
- data/ext/sources/ggml/src/ggml-cuda/softmax.cu +203 -6
- data/ext/sources/ggml/src/ggml-cuda/solve_tri.cu +275 -0
- data/ext/sources/ggml/src/ggml-cuda/solve_tri.cuh +3 -0
- data/ext/sources/ggml/src/ggml-cuda/ssm-conv.cu +14 -20
- data/ext/sources/ggml/src/ggml-cuda/ssm-scan.cu +49 -84
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq112-dv112.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq128-dv128.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq256-dv256.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq40-dv40.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq576-dv512.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq64-dv64.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq72-dv72.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq80-dv80.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq96-dv96.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/generate_cu_files.py +19 -1
- data/ext/sources/ggml/src/ggml-cuda/top-k.cu +96 -0
- data/ext/sources/ggml/src/ggml-cuda/top-k.cuh +3 -0
- data/ext/sources/ggml/src/ggml-cuda/topk-moe.cu +168 -76
- data/ext/sources/ggml/src/ggml-cuda/topk-moe.cuh +11 -4
- data/ext/sources/ggml/src/ggml-cuda/tri.cu +136 -0
- data/ext/sources/ggml/src/ggml-cuda/tri.cuh +5 -0
- data/ext/sources/ggml/src/ggml-cuda/unary.cu +105 -11
- data/ext/sources/ggml/src/ggml-cuda/unary.cuh +36 -0
- data/ext/sources/ggml/src/ggml-cuda/upscale.cu +163 -7
- data/ext/sources/ggml/src/ggml-cuda/vendors/cuda.h +4 -0
- data/ext/sources/ggml/src/ggml-cuda/vendors/hip.h +12 -1
- data/ext/sources/ggml/src/ggml-cuda/vendors/musa.h +6 -0
- data/ext/sources/ggml/src/ggml-hexagon/CMakeLists.txt +80 -0
- data/ext/sources/ggml/src/ggml-hexagon/ggml-hexagon.cpp +3151 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/CMakeLists.txt +44 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/act-ops.c +682 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/binary-ops.c +360 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/cmake-toolchain.cmake +157 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/flash-attn-ops.c +566 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/get-rows-ops.c +112 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/htp-ctx.h +35 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/htp-dma.c +63 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/htp-dma.h +157 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/htp-msg.h +165 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/htp-ops.h +92 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/htp_iface.idl +16 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-exp.c +94 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-inverse.c +72 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-sigmoid.c +49 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-utils.c +1020 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-utils.h +1353 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/main.c +1001 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/matmul-ops.c +2503 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/ops-utils.h +149 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/rope-ops.c +487 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/set-rows-ops.c +168 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/softmax-ops.c +402 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/unary-ops.c +287 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/worker-pool.c +297 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/worker-pool.h +57 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp-utils.c +454 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp-utils.h +221 -0
- data/ext/sources/ggml/src/ggml-hexagon/op-desc.h +153 -0
- data/ext/sources/ggml/src/ggml-hip/CMakeLists.txt +8 -13
- data/ext/sources/ggml/src/ggml-impl.h +67 -6
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-common.cpp +2 -2
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-context.m +29 -20
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.cpp +652 -285
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.h +103 -56
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.m +496 -118
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-impl.h +231 -9
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.cpp +1227 -224
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.h +12 -0
- data/ext/sources/ggml/src/ggml-metal/ggml-metal.cpp +14 -8
- data/ext/sources/ggml/src/ggml-metal/ggml-metal.metal +1972 -704
- data/ext/sources/ggml/src/ggml-musa/CMakeLists.txt +3 -1
- data/ext/sources/ggml/src/ggml-opencl/CMakeLists.txt +11 -0
- data/ext/sources/ggml/src/ggml-opencl/ggml-opencl.cpp +1430 -120
- data/ext/sources/ggml/src/ggml-opencl/kernels/cvt.cl +63 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/expm1.cl +82 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/fill.cl +17 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/flash_attn_f32.cl +4 -3
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_mxfp4_f32.cl +162 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_mxfp4_f32.cl +156 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/get_rows.cl +36 -12
- data/ext/sources/ggml/src/ggml-opencl/kernels/mean.cl +39 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_f16_f32_kq_kqv.cl +273 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_f16_f32_l4_lm.cl +24 -10
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_f32_f32_l4_lm.cl +24 -10
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q8_0_f32_l4_lm.cl +154 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/pad.cl +29 -20
- data/ext/sources/ggml/src/ggml-opencl/kernels/rms_norm.cl +25 -10
- data/ext/sources/ggml/src/ggml-opencl/kernels/rope.cl +50 -24
- data/ext/sources/ggml/src/ggml-opencl/kernels/set_rows.cl +35 -16
- data/ext/sources/ggml/src/ggml-opencl/kernels/softplus.cl +88 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/sqr.cl +53 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/sqrt.cl +53 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/ssm_conv.cl +77 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/transpose.cl +13 -0
- data/ext/sources/ggml/src/ggml-rpc/ggml-rpc.cpp +438 -156
- data/ext/sources/ggml/src/ggml-sycl/CMakeLists.txt +48 -3
- data/ext/sources/ggml/src/ggml-sycl/add-id.cpp +77 -0
- data/ext/sources/ggml/src/ggml-sycl/add-id.hpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/backend.hpp +6 -0
- data/ext/sources/ggml/src/ggml-sycl/binbcast.cpp +0 -9
- data/ext/sources/ggml/src/ggml-sycl/binbcast.hpp +0 -6
- data/ext/sources/ggml/src/ggml-sycl/common.hpp +117 -15
- data/ext/sources/ggml/src/ggml-sycl/concat.cpp +55 -44
- data/ext/sources/ggml/src/ggml-sycl/convert.cpp +34 -0
- data/ext/sources/ggml/src/ggml-sycl/count-equal.cpp +79 -0
- data/ext/sources/ggml/src/ggml-sycl/count-equal.hpp +9 -0
- data/ext/sources/ggml/src/ggml-sycl/cpy.cpp +0 -3
- data/ext/sources/ggml/src/ggml-sycl/dequantize.hpp +18 -0
- data/ext/sources/ggml/src/ggml-sycl/dpct/helper.hpp +76 -3
- data/ext/sources/ggml/src/ggml-sycl/element_wise.cpp +333 -300
- data/ext/sources/ggml/src/ggml-sycl/element_wise.hpp +10 -2
- data/ext/sources/ggml/src/ggml-sycl/ggml-sycl.cpp +335 -110
- data/ext/sources/ggml/src/ggml-sycl/mmvq.cpp +22 -0
- data/ext/sources/ggml/src/ggml-sycl/norm.cpp +156 -0
- data/ext/sources/ggml/src/ggml-sycl/norm.hpp +2 -0
- data/ext/sources/ggml/src/ggml-sycl/pad.cpp +97 -0
- data/ext/sources/ggml/src/ggml-sycl/pad.hpp +24 -0
- data/ext/sources/ggml/src/ggml-sycl/pad_reflect_1d.cpp +100 -0
- data/ext/sources/ggml/src/ggml-sycl/pad_reflect_1d.hpp +10 -0
- data/ext/sources/ggml/src/ggml-sycl/presets.hpp +2 -0
- data/ext/sources/ggml/src/ggml-sycl/repeat_back.cpp +76 -0
- data/ext/sources/ggml/src/ggml-sycl/repeat_back.hpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/roll.cpp +122 -0
- data/ext/sources/ggml/src/ggml-sycl/roll.hpp +20 -0
- data/ext/sources/ggml/src/ggml-sycl/rope.cpp +30 -17
- data/ext/sources/ggml/src/ggml-sycl/set.cpp +73 -0
- data/ext/sources/ggml/src/ggml-sycl/set.hpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/softmax.cpp +327 -162
- data/ext/sources/ggml/src/ggml-sycl/softmax.hpp +4 -0
- data/ext/sources/ggml/src/ggml-sycl/ssm_conv.cpp +127 -0
- data/ext/sources/ggml/src/ggml-sycl/ssm_conv.hpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/vecdotq.hpp +58 -0
- data/ext/sources/ggml/src/ggml-vulkan/CMakeLists.txt +38 -18
- data/ext/sources/ggml/src/ggml-vulkan/ggml-vulkan.cpp +5013 -2859
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/abs.comp +21 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/acc.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/add.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/add1.comp +28 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/add_id.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/arange.comp +20 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/argmax.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/argsort.comp +33 -26
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/argsort_large.comp +114 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/ceil.comp +22 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/clamp.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/concat.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/contig_copy.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_dw.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp +47 -49
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/conv_transpose_1d.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_from_quant.comp +3 -3
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp +4 -4
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_transpose.comp +67 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/cos.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/count_equal.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/count_experts.comp +51 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/cumsum.comp +83 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/cumsum_multipass1.comp +60 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/cumsum_multipass2.comp +66 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_f32.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{dequant_funcs.comp → dequant_funcs.glsl} +9 -21
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{dequant_funcs_cm2.comp → dequant_funcs_cm2.glsl} +18 -4
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{dequant_head.comp → dequant_head.glsl} +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq1_m.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq1_s.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_s.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_xs.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_xxs.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_s.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_xxs.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq4_nl.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq4_xs.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_mxfp4.comp +3 -3
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q2_k.comp +3 -3
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q3_k.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_0.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_1.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_k.comp +3 -3
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_0.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_1.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_k.comp +3 -3
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q6_k.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q8_0.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/diag.comp +29 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/diag_mask_inf.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/div.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/exp.comp +3 -3
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/fill.comp +19 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +39 -17
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{flash_attn_base.comp → flash_attn_base.glsl} +19 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +45 -7
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +50 -12
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/floor.comp +22 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/geglu.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/geglu_erf.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/geglu_quick.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/gelu.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/gelu_erf.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/gelu_quick.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{generic_binary_head.comp → generic_binary_head.glsl} +17 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{generic_head.comp → generic_head.glsl} +2 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{generic_unary_head.comp → generic_unary_head.glsl} +7 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/get_rows.comp +4 -4
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/get_rows_quant.comp +3 -3
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{glu_head.comp → glu_head.glsl} +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/group_norm.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/hardsigmoid.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/hardswish.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp +19 -7
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/im2col_3d.comp +2 -3
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/l2_norm.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/leaky_relu.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/log.comp +18 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{mul_mat_vec_base.comp → mul_mat_vec_base.glsl} +70 -25
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iface.glsl +35 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq1_m.comp +71 -21
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq1_s.comp +41 -25
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_s.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_xs.comp +44 -26
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_xxs.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq3_s.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq3_xxs.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_nc.comp +9 -7
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_p021.comp +9 -7
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q2_k.comp +4 -6
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q3_k.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q4_k.comp +4 -6
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q5_k.comp +4 -6
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq.comp +39 -36
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq_funcs.glsl +494 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +78 -103
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +34 -23
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{mul_mm_funcs.comp → mul_mm_funcs.glsl} +69 -59
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_id_funcs.glsl +72 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq.comp +88 -228
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.glsl +454 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_shmem_types.glsl +78 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/multi_add.comp +97 -13
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/neg.comp +20 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/norm.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/opt_step_adamw.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/opt_step_sgd.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/pad.comp +21 -6
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/pool2d.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/quantize_q8_1.comp +10 -10
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/reglu.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/relu.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/repeat.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/repeat_back.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +50 -4
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm_back.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm_partials.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/roll.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_funcs.glsl +234 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.glsl +20 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +6 -50
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp +6 -33
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp +6 -33
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_params.glsl +28 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_vision.comp +6 -39
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/round.comp +29 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/scale.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sigmoid.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/silu.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/silu_back.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sin.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_back.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large1.comp +62 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large2.comp +79 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large3.comp +65 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large_common.glsl +53 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/softplus.comp +23 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/solve_tri.comp +81 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sqrt.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/square.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/ssm_conv.comp +44 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/ssm_scan.comp +124 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/step.comp +22 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sub.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sum_rows.comp +2 -25
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sum_rows.glsl +25 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/swiglu.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/swiglu_oai.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/tanh.comp +2 -2
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/timestep_embedding.comp +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/topk_argsort.comp +118 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/topk_moe.comp +213 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/topk_nary_search.comp +246 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/tri.comp +43 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/trunc.comp +22 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{types.comp → types.glsl} +345 -26
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/upscale.comp +90 -12
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +335 -151
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/xielu.comp +35 -0
- data/ext/sources/ggml/src/ggml-webgpu/CMakeLists.txt +28 -2
- data/ext/sources/ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp +169 -0
- data/ext/sources/ggml/src/ggml-webgpu/ggml-webgpu.cpp +1964 -435
- data/ext/sources/ggml/src/ggml-webgpu/pre_wgsl.hpp +778 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/bin_op.tmpl.wgsl +188 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/cpy.tmpl.wgsl +101 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py +33 -10
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn.wgsl +591 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/get_rows.tmpl.wgsl +1 -1
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/glu.tmpl.wgsl +323 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.tmpl.wgsl +6 -6
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_decls.tmpl +97 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_reg_tile.tmpl.wgsl +247 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_subgroup_matrix.tmpl.wgsl +302 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec.tmpl.wgsl +267 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm.wgsl +83 -17
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/rope.tmpl.wgsl +295 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/scale.tmpl.wgsl +90 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.tmpl.wgsl +112 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/soft_max.tmpl.wgsl +345 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/unary_op.wgsl +483 -0
- data/ext/sources/ggml/src/ggml-zendnn/CMakeLists.txt +92 -0
- data/ext/sources/ggml/src/ggml-zendnn/ggml-zendnn.cpp +466 -0
- data/ext/sources/ggml/src/ggml.c +425 -33
- data/ext/sources/include/whisper.h +1 -0
- data/ext/sources/src/CMakeLists.txt +3 -1
- data/ext/sources/src/whisper.cpp +101 -35
- data/ext/sources/tests/CMakeLists.txt +2 -2
- data/ext/sources/tests/test-vad-full.cpp +4 -2
- data/ext/sources/tests/test-vad.cpp +1 -1
- data/extsources.rb +1 -0
- data/lib/whisper/model/uri.rb +17 -18
- data/sig/whisper.rbs +119 -2
- data/test/test_params.rb +16 -8
- data/test/test_segment.rb +0 -1
- data/test/test_token.rb +70 -0
- data/test/test_vad.rb +1 -1
- data/test/test_vad_context.rb +50 -0
- data/test/test_vad_segment.rb +19 -0
- data/test/test_vad_segments.rb +16 -0
- data/test/test_whisper.rb +7 -0
- data/whispercpp.gemspec +1 -1
- metadata +287 -34
- data/ext/sources/build-xcframework.sh +0 -571
- data/ext/sources/ggml/src/ggml-cann/Doxyfile +0 -2579
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.comp +0 -105
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.comp +0 -55
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/add.tmpl.wgsl +0 -44
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/add_in_place.tmpl.wgsl +0 -41
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/cpy.wgsl +0 -60
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul.tmpl.wgsl +0 -44
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_in_place.tmpl.wgsl +0 -41
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm_in_place.wgsl +0 -48
- /data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{test_bfloat16_support.comp → feature-tests/bfloat16.comp} +0 -0
- /data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{test_coopmat_support.comp → feature-tests/coopmat.comp} +0 -0
- /data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{test_coopmat2_support.comp → feature-tests/coopmat2.comp} +0 -0
- /data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{test_integer_dot_support.comp → feature-tests/integer_dot.comp} +0 -0
- /data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{glu_main.comp → glu_main.glsl} +0 -0
- /data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{rte.comp → rte.glsl} +0 -0
- /data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{utils.comp → utils.glsl} +0 -0
|
@@ -7,6 +7,7 @@
|
|
|
7
7
|
|
|
8
8
|
#include <algorithm>
|
|
9
9
|
#include <cassert>
|
|
10
|
+
#include <cstring>
|
|
10
11
|
#include <limits>
|
|
11
12
|
#include <map>
|
|
12
13
|
#include <stdexcept>
|
|
@@ -32,8 +33,15 @@ llama_memory_recurrent::llama_memory_recurrent(
|
|
|
32
33
|
cells.clear();
|
|
33
34
|
cells.resize(mem_size);
|
|
34
35
|
|
|
36
|
+
// define a comparator for the buft -> ctx map to ensure that the order is well-defined:
|
|
37
|
+
struct ggml_backend_buft_comparator {
|
|
38
|
+
bool operator()(const ggml_backend_buffer_type_t & lhs, const ggml_backend_buffer_type_t & rhs) const {
|
|
39
|
+
return strcmp(ggml_backend_buft_name(lhs), ggml_backend_buft_name(rhs)) < 0;
|
|
40
|
+
}
|
|
41
|
+
};
|
|
42
|
+
std::map<ggml_backend_buffer_type_t, ggml_context_ptr, ggml_backend_buft_comparator> ctx_map;
|
|
43
|
+
|
|
35
44
|
// create a context for each buffer type
|
|
36
|
-
std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
|
|
37
45
|
auto ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * {
|
|
38
46
|
auto it = ctx_map.find(buft);
|
|
39
47
|
if (it == ctx_map.end()) {
|
|
@@ -48,13 +56,12 @@ llama_memory_recurrent::llama_memory_recurrent(
|
|
|
48
56
|
return nullptr;
|
|
49
57
|
}
|
|
50
58
|
|
|
51
|
-
ctx_map
|
|
52
|
-
ctxs.emplace_back(ctx);
|
|
59
|
+
ctx_map.emplace(buft, ctx);
|
|
53
60
|
|
|
54
61
|
return ctx;
|
|
55
62
|
}
|
|
56
63
|
|
|
57
|
-
return it->second;
|
|
64
|
+
return it->second.get();
|
|
58
65
|
};
|
|
59
66
|
|
|
60
67
|
r_l.resize(n_layer);
|
|
@@ -93,17 +100,14 @@ llama_memory_recurrent::llama_memory_recurrent(
|
|
|
93
100
|
}
|
|
94
101
|
|
|
95
102
|
// allocate tensors and initialize the buffers to avoid NaNs in the padding
|
|
96
|
-
for (auto
|
|
97
|
-
|
|
98
|
-
auto * ctx = it.second;
|
|
99
|
-
|
|
100
|
-
ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
|
|
103
|
+
for (auto & [buft, ctx] : ctx_map) {
|
|
104
|
+
ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx.get(), buft);
|
|
101
105
|
if (!buf) {
|
|
102
106
|
throw std::runtime_error("failed to allocate buffer for rs cache");
|
|
103
107
|
}
|
|
104
108
|
ggml_backend_buffer_clear(buf, 0);
|
|
105
109
|
LLAMA_LOG_INFO("%s: %10s RS buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf)/1024.0/1024.0);
|
|
106
|
-
|
|
110
|
+
ctxs_bufs.emplace_back(std::move(ctx), buf);
|
|
107
111
|
}
|
|
108
112
|
|
|
109
113
|
{
|
|
@@ -129,13 +133,14 @@ void llama_memory_recurrent::clear(bool data) {
|
|
|
129
133
|
used = 0;
|
|
130
134
|
|
|
131
135
|
if (data) {
|
|
132
|
-
for (auto & buf :
|
|
136
|
+
for (auto & [_, buf] : ctxs_bufs) {
|
|
133
137
|
ggml_backend_buffer_clear(buf.get(), 0);
|
|
134
138
|
}
|
|
135
139
|
}
|
|
136
140
|
}
|
|
137
141
|
|
|
138
142
|
bool llama_memory_recurrent::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos p1) {
|
|
143
|
+
//printf("[DEBUG] calling llama_memory_recurrent::seq_rm` with `seq_id=%d, p0=%d, p1=%d`\n", seq_id, p0, p1);
|
|
139
144
|
uint32_t new_head = size;
|
|
140
145
|
|
|
141
146
|
if (p0 < 0) {
|
|
@@ -146,7 +151,8 @@ bool llama_memory_recurrent::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos
|
|
|
146
151
|
p1 = std::numeric_limits<llama_pos>::max();
|
|
147
152
|
}
|
|
148
153
|
|
|
149
|
-
// models like Mamba or RWKV can't have a state partially erased
|
|
154
|
+
// models like Mamba or RWKV can't have a state partially erased at the end
|
|
155
|
+
// of the sequence because their state isn't preserved for previous tokens
|
|
150
156
|
if (seq_id >= (int64_t) size) {
|
|
151
157
|
// could be fatal
|
|
152
158
|
return false;
|
|
@@ -155,8 +161,9 @@ bool llama_memory_recurrent::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos
|
|
|
155
161
|
int32_t & tail_id = cells[seq_id].tail;
|
|
156
162
|
if (tail_id >= 0) {
|
|
157
163
|
const auto & cell = cells[tail_id];
|
|
158
|
-
// partial intersection is invalid
|
|
159
|
-
if (
|
|
164
|
+
// partial intersection is invalid if it includes the final pos
|
|
165
|
+
if (0 < p0 && p0 <= cell.pos && p1 > cell.pos) {
|
|
166
|
+
//printf("[DEBUG] inside `llama_memory_recurrent::seq_rm`: partial intersection is invalid, so returning false\n");
|
|
160
167
|
return false;
|
|
161
168
|
}
|
|
162
169
|
// invalidate tails which will be cleared
|
|
@@ -167,6 +174,7 @@ bool llama_memory_recurrent::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos
|
|
|
167
174
|
} else {
|
|
168
175
|
// seq_id is negative, then the range should include everything or nothing
|
|
169
176
|
if (p0 != p1 && (p0 != 0 || p1 != std::numeric_limits<llama_pos>::max())) {
|
|
177
|
+
//printf("[DEBUG] inside `llama_memory_recurrent::seq_rm`: `seq_id` is negative, so returning false\n");
|
|
170
178
|
return false;
|
|
171
179
|
}
|
|
172
180
|
}
|
|
@@ -361,8 +369,8 @@ llama_pos llama_memory_recurrent::seq_pos_max(llama_seq_id seq_id) const {
|
|
|
361
369
|
|
|
362
370
|
std::map<ggml_backend_buffer_type_t, size_t> llama_memory_recurrent::memory_breakdown() const {
|
|
363
371
|
std::map<ggml_backend_buffer_type_t, size_t> ret;
|
|
364
|
-
for (const
|
|
365
|
-
ret[ggml_backend_buffer_get_type(
|
|
372
|
+
for (const auto & [_, buf] : ctxs_bufs) {
|
|
373
|
+
ret[ggml_backend_buffer_get_type(buf.get())] += ggml_backend_buffer_get_size(buf.get());
|
|
366
374
|
}
|
|
367
375
|
return ret;
|
|
368
376
|
}
|
|
@@ -379,7 +387,9 @@ llama_memory_context_ptr llama_memory_recurrent::init_batch(llama_batch_allocr &
|
|
|
379
387
|
// if all tokens are output, split by sequence
|
|
380
388
|
ubatch = balloc.split_seq(n_ubatch);
|
|
381
389
|
} else {
|
|
382
|
-
|
|
390
|
+
// TODO: non-sequential equal split can be done if using unified KV cache
|
|
391
|
+
// for simplicity, we always use sequential equal split for now
|
|
392
|
+
ubatch = balloc.split_equal(n_ubatch, true);
|
|
383
393
|
}
|
|
384
394
|
|
|
385
395
|
if (ubatch.n_tokens == 0) {
|
|
@@ -657,7 +667,7 @@ bool llama_memory_recurrent::get_can_shift() const {
|
|
|
657
667
|
|
|
658
668
|
size_t llama_memory_recurrent::total_size() const {
|
|
659
669
|
size_t size = 0;
|
|
660
|
-
for (const auto & buf :
|
|
670
|
+
for (const auto & [_, buf] : ctxs_bufs) {
|
|
661
671
|
size += ggml_backend_buffer_get_size(buf.get());
|
|
662
672
|
}
|
|
663
673
|
|
|
@@ -856,9 +866,12 @@ void llama_memory_recurrent::state_write_data(llama_io_write_i & io, const std::
|
|
|
856
866
|
bool llama_memory_recurrent::state_read_meta(llama_io_read_i & io, uint32_t cell_count, llama_seq_id dest_seq_id) {
|
|
857
867
|
if (dest_seq_id != -1) {
|
|
858
868
|
// single sequence
|
|
859
|
-
|
|
860
869
|
seq_rm(dest_seq_id, -1, -1);
|
|
861
870
|
|
|
871
|
+
if (cell_count == 0) {
|
|
872
|
+
return true;
|
|
873
|
+
}
|
|
874
|
+
|
|
862
875
|
llama_batch_allocr balloc(hparams.n_pos_per_embd());
|
|
863
876
|
|
|
864
877
|
llama_ubatch ubatch = balloc.ubatch_reserve(cell_count, 1);
|
|
@@ -109,8 +109,8 @@ private:
|
|
|
109
109
|
|
|
110
110
|
const uint32_t n_seq_max = 1;
|
|
111
111
|
|
|
112
|
-
|
|
113
|
-
std::vector<ggml_backend_buffer_ptr
|
|
112
|
+
// ggml contexts for the KV cache along with the allocated backend buffers:
|
|
113
|
+
std::vector<std::pair<ggml_context_ptr, ggml_backend_buffer_ptr>> ctxs_bufs;
|
|
114
114
|
|
|
115
115
|
size_t total_size() const;
|
|
116
116
|
|
|
@@ -13,9 +13,10 @@
|
|
|
13
13
|
#ifdef __has_include
|
|
14
14
|
#if __has_include(<unistd.h>)
|
|
15
15
|
#include <unistd.h>
|
|
16
|
+
#include <fcntl.h>
|
|
17
|
+
#include <sys/stat.h>
|
|
16
18
|
#if defined(_POSIX_MAPPED_FILES)
|
|
17
19
|
#include <sys/mman.h>
|
|
18
|
-
#include <fcntl.h>
|
|
19
20
|
#endif
|
|
20
21
|
#if defined(_POSIX_MEMLOCK_RANGE)
|
|
21
22
|
#include <sys/resource.h>
|
|
@@ -74,7 +75,7 @@ struct llama_file::impl {
|
|
|
74
75
|
return ret;
|
|
75
76
|
}
|
|
76
77
|
|
|
77
|
-
impl(const char * fname, const char * mode) {
|
|
78
|
+
impl(const char * fname, const char * mode, [[maybe_unused]] const bool use_direct_io = false) {
|
|
78
79
|
fp = ggml_fopen(fname, mode);
|
|
79
80
|
if (fp == NULL) {
|
|
80
81
|
throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno)));
|
|
@@ -109,7 +110,7 @@ struct llama_file::impl {
|
|
|
109
110
|
}
|
|
110
111
|
}
|
|
111
112
|
|
|
112
|
-
void read_raw(void * ptr, size_t len)
|
|
113
|
+
void read_raw(void * ptr, size_t len) {
|
|
113
114
|
size_t bytes_read = 0;
|
|
114
115
|
while (bytes_read < len) {
|
|
115
116
|
size_t chunk_size = std::min<size_t>(len - bytes_read, 64*1024*1024);
|
|
@@ -126,7 +127,7 @@ struct llama_file::impl {
|
|
|
126
127
|
}
|
|
127
128
|
}
|
|
128
129
|
|
|
129
|
-
uint32_t read_u32()
|
|
130
|
+
uint32_t read_u32() {
|
|
130
131
|
uint32_t val;
|
|
131
132
|
read_raw(&val, sizeof(val));
|
|
132
133
|
return val;
|
|
@@ -153,16 +154,55 @@ struct llama_file::impl {
|
|
|
153
154
|
write_raw(&val, sizeof(val));
|
|
154
155
|
}
|
|
155
156
|
|
|
157
|
+
bool has_direct_io() const {
|
|
158
|
+
return true;
|
|
159
|
+
}
|
|
160
|
+
|
|
156
161
|
~impl() {
|
|
157
162
|
if (fp) {
|
|
158
163
|
std::fclose(fp);
|
|
159
164
|
}
|
|
160
165
|
}
|
|
161
166
|
#else
|
|
162
|
-
impl(const char * fname, const char * mode) {
|
|
163
|
-
|
|
167
|
+
impl(const char * fname, const char * mode, [[maybe_unused]] const bool use_direct_io = false) : fname(fname) {
|
|
168
|
+
#ifdef __linux__
|
|
169
|
+
// Try unbuffered I/O for read only
|
|
170
|
+
if (use_direct_io && std::strcmp(mode, "rb") == 0) {
|
|
171
|
+
if (init_fd()) {
|
|
172
|
+
return;
|
|
173
|
+
}
|
|
174
|
+
LLAMA_LOG_WARN("Failed to open file '%s' with error: %s. Falling back to buffered I/O",
|
|
175
|
+
fname, strerror(errno));
|
|
176
|
+
}
|
|
177
|
+
#endif
|
|
178
|
+
init_fp(mode);
|
|
179
|
+
}
|
|
180
|
+
|
|
181
|
+
#ifdef __linux__
|
|
182
|
+
bool init_fd() {
|
|
183
|
+
fd = open(fname.c_str(), O_RDONLY | O_DIRECT);
|
|
184
|
+
|
|
185
|
+
if (fd != -1) {
|
|
186
|
+
struct stat file_stats{};
|
|
187
|
+
fstat(fd, &file_stats);
|
|
188
|
+
|
|
189
|
+
size = file_stats.st_size;
|
|
190
|
+
alignment = file_stats.st_blksize;
|
|
191
|
+
|
|
192
|
+
off_t ret = lseek(fd, 0, SEEK_SET);
|
|
193
|
+
if (ret == -1) {
|
|
194
|
+
throw std::runtime_error(format("seek error: %s", strerror(errno)));
|
|
195
|
+
}
|
|
196
|
+
return true;
|
|
197
|
+
}
|
|
198
|
+
return false;
|
|
199
|
+
}
|
|
200
|
+
#endif
|
|
201
|
+
|
|
202
|
+
void init_fp(const char * mode) {
|
|
203
|
+
fp = ggml_fopen(fname.c_str(), mode);
|
|
164
204
|
if (fp == NULL) {
|
|
165
|
-
throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno)));
|
|
205
|
+
throw std::runtime_error(format("failed to open %s: %s", fname.c_str(), strerror(errno)));
|
|
166
206
|
}
|
|
167
207
|
seek(0, SEEK_END);
|
|
168
208
|
size = tell();
|
|
@@ -170,46 +210,118 @@ struct llama_file::impl {
|
|
|
170
210
|
}
|
|
171
211
|
|
|
172
212
|
size_t tell() const {
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
throw std::runtime_error(format("ftell error: %s", strerror(errno)));
|
|
213
|
+
if (fd == -1) {
|
|
214
|
+
long ret = std::ftell(fp);
|
|
215
|
+
if (ret == -1) {
|
|
216
|
+
throw std::runtime_error(format("ftell error: %s", strerror(errno)));
|
|
217
|
+
}
|
|
218
|
+
|
|
219
|
+
return (size_t) ret;
|
|
181
220
|
}
|
|
182
221
|
|
|
183
|
-
|
|
222
|
+
off_t pos = lseek(fd, 0, SEEK_CUR);
|
|
223
|
+
if (pos == -1) {
|
|
224
|
+
throw std::runtime_error(format("lseek error: %s", strerror(errno)));
|
|
225
|
+
}
|
|
226
|
+
return (size_t) pos;
|
|
184
227
|
}
|
|
185
228
|
|
|
186
229
|
void seek(size_t offset, int whence) const {
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
if (ret
|
|
230
|
+
off_t ret = 0;
|
|
231
|
+
if (fd == -1) {
|
|
232
|
+
ret = std::fseek(fp, (long) offset, whence);
|
|
233
|
+
} else {
|
|
234
|
+
ret = lseek(fd, offset, whence);
|
|
235
|
+
}
|
|
236
|
+
if (ret == -1) {
|
|
194
237
|
throw std::runtime_error(format("seek error: %s", strerror(errno)));
|
|
195
238
|
}
|
|
196
239
|
}
|
|
197
240
|
|
|
198
|
-
void
|
|
241
|
+
void read_raw_unsafe(void * ptr, size_t len) {
|
|
199
242
|
if (len == 0) {
|
|
200
243
|
return;
|
|
201
244
|
}
|
|
202
245
|
errno = 0;
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
246
|
+
if (fd == -1) {
|
|
247
|
+
std::size_t ret = std::fread(ptr, len, 1, fp);
|
|
248
|
+
if (ferror(fp)) {
|
|
249
|
+
throw std::runtime_error(format("read error: %s", strerror(errno)));
|
|
250
|
+
}
|
|
251
|
+
if (ret != 1) {
|
|
252
|
+
throw std::runtime_error("unexpectedly reached end of file");
|
|
253
|
+
}
|
|
254
|
+
} else {
|
|
255
|
+
size_t bytes_read = 0;
|
|
256
|
+
while (bytes_read < len) {
|
|
257
|
+
const size_t to_read = len - bytes_read;
|
|
258
|
+
ssize_t ret = ::read(fd, reinterpret_cast<char *>(ptr) + bytes_read, to_read);
|
|
259
|
+
|
|
260
|
+
if (ret == -1) {
|
|
261
|
+
if (errno == EINTR) {
|
|
262
|
+
continue; // Interrupted by signal, retry
|
|
263
|
+
}
|
|
264
|
+
// Fallback to std::fread in case the DMA controller cannot access the buffer
|
|
265
|
+
if (errno == EFAULT) {
|
|
266
|
+
auto curr_off = tell();
|
|
267
|
+
close(fd);
|
|
268
|
+
fd = -1;
|
|
269
|
+
alignment = 1;
|
|
270
|
+
init_fp("rb");
|
|
271
|
+
seek(curr_off, SEEK_SET);
|
|
272
|
+
read_raw_unsafe(ptr, len);
|
|
273
|
+
return;
|
|
274
|
+
}
|
|
275
|
+
throw std::runtime_error(format("read error: %s", strerror(errno)));
|
|
276
|
+
}
|
|
277
|
+
if (ret == 0) {
|
|
278
|
+
// EOF: allow if this read was only pulling alignment padding past file end
|
|
279
|
+
off_t pos = lseek(fd, 0, SEEK_CUR);
|
|
280
|
+
if (pos != -1 && (size_t) pos == size) {
|
|
281
|
+
std::memset(reinterpret_cast<char *>(ptr) + bytes_read, 0, len - bytes_read);
|
|
282
|
+
return;
|
|
283
|
+
}
|
|
284
|
+
throw std::runtime_error("unexpectedly reached end of file");
|
|
285
|
+
}
|
|
286
|
+
|
|
287
|
+
bytes_read += (size_t) ret;
|
|
288
|
+
}
|
|
206
289
|
}
|
|
207
|
-
|
|
208
|
-
|
|
290
|
+
}
|
|
291
|
+
|
|
292
|
+
void read_aligned_chunk(void * dest, size_t size) {
|
|
293
|
+
size_t offset = tell();
|
|
294
|
+
off_t aligned_offset = offset & ~(alignment - 1);
|
|
295
|
+
off_t offset_from_alignment = offset - aligned_offset;
|
|
296
|
+
size_t bytes_to_read = (offset_from_alignment + size + alignment - 1) & ~(alignment - 1);
|
|
297
|
+
|
|
298
|
+
void * raw_buffer = nullptr;
|
|
299
|
+
int ret = posix_memalign(&raw_buffer, alignment, bytes_to_read);
|
|
300
|
+
if (ret != 0) {
|
|
301
|
+
throw std::runtime_error(format("posix_memalign failed with error %d", ret));
|
|
209
302
|
}
|
|
303
|
+
|
|
304
|
+
struct aligned_buffer_deleter {
|
|
305
|
+
void operator()(void * p) const { free(p); }
|
|
306
|
+
};
|
|
307
|
+
std::unique_ptr<void, aligned_buffer_deleter> buffer(raw_buffer);
|
|
308
|
+
|
|
309
|
+
seek(aligned_offset, SEEK_SET);
|
|
310
|
+
read_raw_unsafe(buffer.get(), bytes_to_read);
|
|
311
|
+
|
|
312
|
+
uintptr_t actual_data = reinterpret_cast<uintptr_t>(buffer.get()) + offset_from_alignment;
|
|
313
|
+
memcpy(dest, reinterpret_cast<void *>(actual_data), size);
|
|
210
314
|
}
|
|
211
315
|
|
|
212
|
-
|
|
316
|
+
void read_raw(void * ptr, size_t len) {
|
|
317
|
+
if (has_direct_io()) {
|
|
318
|
+
read_aligned_chunk(ptr, len);
|
|
319
|
+
} else {
|
|
320
|
+
read_raw_unsafe(ptr, len);
|
|
321
|
+
}
|
|
322
|
+
}
|
|
323
|
+
|
|
324
|
+
uint32_t read_u32() {
|
|
213
325
|
uint32_t ret;
|
|
214
326
|
read_raw(&ret, sizeof(ret));
|
|
215
327
|
return ret;
|
|
@@ -230,23 +342,41 @@ struct llama_file::impl {
|
|
|
230
342
|
write_raw(&val, sizeof(val));
|
|
231
343
|
}
|
|
232
344
|
|
|
345
|
+
bool has_direct_io() const {
|
|
346
|
+
return fd != -1 && alignment > 1;
|
|
347
|
+
}
|
|
348
|
+
|
|
233
349
|
~impl() {
|
|
234
|
-
if (
|
|
350
|
+
if (fd != -1) {
|
|
351
|
+
close(fd);
|
|
352
|
+
} else {
|
|
235
353
|
std::fclose(fp);
|
|
236
354
|
}
|
|
237
355
|
}
|
|
356
|
+
int fd = -1;
|
|
357
|
+
std::string fname;
|
|
238
358
|
#endif
|
|
239
359
|
|
|
240
|
-
|
|
241
|
-
|
|
360
|
+
size_t read_alignment() const {
|
|
361
|
+
return alignment;
|
|
362
|
+
}
|
|
363
|
+
|
|
364
|
+
size_t alignment = 1;
|
|
365
|
+
|
|
366
|
+
FILE * fp{};
|
|
367
|
+
size_t size{};
|
|
242
368
|
};
|
|
243
369
|
|
|
244
|
-
llama_file::llama_file(const char * fname, const char * mode
|
|
370
|
+
llama_file::llama_file(const char * fname, const char * mode, const bool use_direct_io) :
|
|
371
|
+
pimpl(std::make_unique<impl>(fname, mode, use_direct_io)) {}
|
|
245
372
|
llama_file::~llama_file() = default;
|
|
246
373
|
|
|
247
374
|
size_t llama_file::tell() const { return pimpl->tell(); }
|
|
248
375
|
size_t llama_file::size() const { return pimpl->size; }
|
|
249
376
|
|
|
377
|
+
size_t llama_file::read_alignment() const { return pimpl->read_alignment(); }
|
|
378
|
+
bool llama_file::has_direct_io() const { return pimpl->has_direct_io(); }
|
|
379
|
+
|
|
250
380
|
int llama_file::file_id() const {
|
|
251
381
|
#ifdef _WIN32
|
|
252
382
|
return _fileno(pimpl->fp);
|
|
@@ -260,9 +390,14 @@ int llama_file::file_id() const {
|
|
|
260
390
|
}
|
|
261
391
|
|
|
262
392
|
void llama_file::seek(size_t offset, int whence) const { pimpl->seek(offset, whence); }
|
|
263
|
-
void llama_file::read_raw(void * ptr, size_t len)
|
|
393
|
+
void llama_file::read_raw(void * ptr, size_t len) { pimpl->read_raw(ptr, len); }
|
|
394
|
+
#ifdef _WIN32
|
|
395
|
+
void llama_file::read_raw_unsafe(void * ptr, size_t len) { pimpl->read_raw(ptr, len); }
|
|
396
|
+
#else
|
|
397
|
+
void llama_file::read_raw_unsafe(void * ptr, size_t len) { pimpl->read_raw_unsafe(ptr, len); }
|
|
398
|
+
#endif
|
|
264
399
|
|
|
265
|
-
uint32_t llama_file::read_u32()
|
|
400
|
+
uint32_t llama_file::read_u32() { return pimpl->read_u32(); }
|
|
266
401
|
|
|
267
402
|
void llama_file::write_raw(const void * ptr, size_t len) const { pimpl->write_raw(ptr, len); }
|
|
268
403
|
void llama_file::write_u32(uint32_t val) const { pimpl->write_u32(val); }
|
|
@@ -485,7 +620,7 @@ struct llama_mlock::impl {
|
|
|
485
620
|
if (suggest && getrlimit(RLIMIT_MEMLOCK, &lock_limit)) {
|
|
486
621
|
suggest = false;
|
|
487
622
|
}
|
|
488
|
-
if (suggest && (lock_limit.rlim_max > lock_limit.rlim_cur + size)) {
|
|
623
|
+
if (suggest && ((uint64_t)lock_limit.rlim_max > (uint64_t)lock_limit.rlim_cur + size)) {
|
|
489
624
|
suggest = false;
|
|
490
625
|
}
|
|
491
626
|
#endif
|
|
@@ -3,6 +3,7 @@
|
|
|
3
3
|
#include <cstdint>
|
|
4
4
|
#include <memory>
|
|
5
5
|
#include <vector>
|
|
6
|
+
#include <cstdio>
|
|
6
7
|
|
|
7
8
|
struct llama_file;
|
|
8
9
|
struct llama_mmap;
|
|
@@ -13,7 +14,7 @@ using llama_mmaps = std::vector<std::unique_ptr<llama_mmap>>;
|
|
|
13
14
|
using llama_mlocks = std::vector<std::unique_ptr<llama_mlock>>;
|
|
14
15
|
|
|
15
16
|
struct llama_file {
|
|
16
|
-
llama_file(const char * fname, const char * mode);
|
|
17
|
+
llama_file(const char * fname, const char * mode, bool use_direct_io = false);
|
|
17
18
|
~llama_file();
|
|
18
19
|
|
|
19
20
|
size_t tell() const;
|
|
@@ -23,12 +24,16 @@ struct llama_file {
|
|
|
23
24
|
|
|
24
25
|
void seek(size_t offset, int whence) const;
|
|
25
26
|
|
|
26
|
-
void read_raw(void * ptr, size_t len)
|
|
27
|
-
|
|
27
|
+
void read_raw(void * ptr, size_t len);
|
|
28
|
+
void read_raw_unsafe(void * ptr, size_t len);
|
|
29
|
+
void read_aligned_chunk(void * dest, size_t size);
|
|
30
|
+
uint32_t read_u32();
|
|
28
31
|
|
|
29
32
|
void write_raw(const void * ptr, size_t len) const;
|
|
30
33
|
void write_u32(uint32_t val) const;
|
|
31
34
|
|
|
35
|
+
size_t read_alignment() const;
|
|
36
|
+
bool has_direct_io() const;
|
|
32
37
|
private:
|
|
33
38
|
struct impl;
|
|
34
39
|
std::unique_ptr<impl> pimpl;
|
|
@@ -462,15 +462,42 @@ namespace GGUFMeta {
|
|
|
462
462
|
return get_key_or_arr(llm_kv(kid), result, n, required);
|
|
463
463
|
}
|
|
464
464
|
|
|
465
|
+
bool llama_model_loader::get_key_or_arr(enum llm_kv kid, uint32_t & result, bool required) {
|
|
466
|
+
const std::string key = llm_kv(kid);
|
|
467
|
+
|
|
468
|
+
const int id = gguf_find_key(meta.get(), key.c_str());
|
|
469
|
+
|
|
470
|
+
if (id < 0) {
|
|
471
|
+
if (required) {
|
|
472
|
+
throw std::runtime_error(format("key not found in model: %s", key.c_str()));
|
|
473
|
+
}
|
|
474
|
+
return false;
|
|
475
|
+
}
|
|
476
|
+
|
|
477
|
+
// throw and error if type is an array
|
|
478
|
+
if (gguf_get_kv_type(meta.get(), id) == GGUF_TYPE_ARRAY) {
|
|
479
|
+
if (required) {
|
|
480
|
+
throw std::runtime_error(format("expected scalar, found array for key: %s", key.c_str()));
|
|
481
|
+
}
|
|
482
|
+
return false;
|
|
483
|
+
}
|
|
484
|
+
|
|
485
|
+
return get_key(key, result, required);
|
|
486
|
+
}
|
|
487
|
+
|
|
465
488
|
// TODO: this is not very clever - figure out something better
|
|
466
489
|
template bool llama_model_loader::get_key_or_arr<std::array<int, 4>>(enum llm_kv kid, std::array<int, 4> & result, uint32_t n, bool required);
|
|
467
490
|
template bool llama_model_loader::get_key_or_arr<std::array<uint32_t, 512>>(enum llm_kv kid, std::array<uint32_t, 512> & result, uint32_t n, bool required);
|
|
491
|
+
template bool llama_model_loader::get_key_or_arr<std::array<float, 512>>(enum llm_kv kid, std::array<float, 512> & result, uint32_t n, bool required);
|
|
492
|
+
|
|
468
493
|
|
|
469
494
|
llama_model_loader::llama_model_loader(
|
|
470
495
|
const std::string & fname,
|
|
471
496
|
std::vector<std::string> & splits,
|
|
472
497
|
bool use_mmap,
|
|
498
|
+
bool use_direct_io,
|
|
473
499
|
bool check_tensors,
|
|
500
|
+
bool no_alloc,
|
|
474
501
|
const llama_model_kv_override * param_overrides_p,
|
|
475
502
|
const llama_model_tensor_buft_override * param_tensor_buft_overrides_p) {
|
|
476
503
|
int trace = 0;
|
|
@@ -501,9 +528,17 @@ llama_model_loader::llama_model_loader(
|
|
|
501
528
|
get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false);
|
|
502
529
|
llm_kv = LLM_KV(llm_arch_from_string(arch_name));
|
|
503
530
|
|
|
504
|
-
files.emplace_back(new llama_file(fname.c_str(), "rb"));
|
|
531
|
+
files.emplace_back(new llama_file(fname.c_str(), "rb", use_direct_io));
|
|
505
532
|
contexts.emplace_back(ctx);
|
|
506
533
|
|
|
534
|
+
use_direct_io = use_direct_io && files.back()->has_direct_io();
|
|
535
|
+
|
|
536
|
+
// Disable mmap in case Direct I/O is enabled and available
|
|
537
|
+
if (use_direct_io && use_mmap) {
|
|
538
|
+
use_mmap = false;
|
|
539
|
+
LLAMA_LOG_WARN("%s: direct I/O is enabled, disabling mmap\n", __func__);
|
|
540
|
+
}
|
|
541
|
+
|
|
507
542
|
// Save tensors data offset of the main file.
|
|
508
543
|
// For subsidiary files, `meta` tensor data offset must not be used,
|
|
509
544
|
// so we build a unified tensors index for weights.
|
|
@@ -569,7 +604,7 @@ llama_model_loader::llama_model_loader(
|
|
|
569
604
|
}
|
|
570
605
|
}
|
|
571
606
|
|
|
572
|
-
files.emplace_back(new llama_file(fname_split, "rb"));
|
|
607
|
+
files.emplace_back(new llama_file(fname_split, "rb", use_direct_io));
|
|
573
608
|
contexts.emplace_back(ctx);
|
|
574
609
|
|
|
575
610
|
// Save tensors data offset info of the shard.
|
|
@@ -713,7 +748,9 @@ llama_model_loader::llama_model_loader(
|
|
|
713
748
|
}
|
|
714
749
|
|
|
715
750
|
this->use_mmap = use_mmap;
|
|
751
|
+
this->use_direct_io = use_direct_io;
|
|
716
752
|
this->check_tensors = check_tensors;
|
|
753
|
+
this->no_alloc = no_alloc;
|
|
717
754
|
}
|
|
718
755
|
|
|
719
756
|
std::string llama_model_loader::get_arch_name() const {
|
|
@@ -931,7 +968,15 @@ bool llama_model_loader::load_all_data(
|
|
|
931
968
|
// 4 staging buffers for async uploads, each sized 1MB seems to be a good default for single NVMe drives.
|
|
932
969
|
// NVMe raid configurations might require more / larger buffers.
|
|
933
970
|
constexpr size_t n_buffers = 4;
|
|
934
|
-
|
|
971
|
+
|
|
972
|
+
size_t alignment = 1;
|
|
973
|
+
for (const auto & file : files) {
|
|
974
|
+
alignment = std::max(file->read_alignment(), alignment);
|
|
975
|
+
}
|
|
976
|
+
|
|
977
|
+
// Buffer size: balance between memory usage and I/O efficiency
|
|
978
|
+
// 64MB works well for NVMe drives
|
|
979
|
+
const size_t buffer_size = alignment != 1 ? 64 * 1024 * 1024 + 2 * alignment : 1 * 1024 * 1024;
|
|
935
980
|
|
|
936
981
|
std::vector<ggml_backend_buffer_t> host_buffers;
|
|
937
982
|
std::vector<ggml_backend_event_t> events;
|
|
@@ -981,6 +1026,7 @@ bool llama_model_loader::load_all_data(
|
|
|
981
1026
|
// If the backend is supported, create pinned memory buffers and events for synchronisation.
|
|
982
1027
|
for (size_t idx = 0; idx < n_buffers; ++idx) {
|
|
983
1028
|
auto * buf = ggml_backend_buft_alloc_buffer(host_buft, buffer_size);
|
|
1029
|
+
|
|
984
1030
|
if (!buf) {
|
|
985
1031
|
LLAMA_LOG_DEBUG("%s: failed to allocate host buffer for async uploads for device %s\n", func,
|
|
986
1032
|
ggml_backend_dev_name(dev));
|
|
@@ -1062,6 +1108,7 @@ bool llama_model_loader::load_all_data(
|
|
|
1062
1108
|
}
|
|
1063
1109
|
} else {
|
|
1064
1110
|
const auto & file = files.at(weight->idx);
|
|
1111
|
+
|
|
1065
1112
|
if (ggml_backend_buffer_is_host(cur->buffer)) {
|
|
1066
1113
|
file->seek(weight->offs, SEEK_SET);
|
|
1067
1114
|
file->read_raw(cur->data, n_size);
|
|
@@ -1073,19 +1120,54 @@ bool llama_model_loader::load_all_data(
|
|
|
1073
1120
|
} else {
|
|
1074
1121
|
// If upload_backend is valid load the tensor in chunks to pinned memory and upload the buffers asynchronously to the GPU.
|
|
1075
1122
|
if (upload_backend) {
|
|
1076
|
-
|
|
1123
|
+
size_t offset = weight->offs;
|
|
1124
|
+
alignment = file->read_alignment();
|
|
1125
|
+
size_t aligned_offset = offset & ~(alignment - 1);
|
|
1126
|
+
size_t offset_from_alignment = offset - aligned_offset;
|
|
1127
|
+
file->seek(aligned_offset, SEEK_SET);
|
|
1128
|
+
|
|
1129
|
+
// Calculate aligned read boundaries
|
|
1130
|
+
size_t read_start = aligned_offset;
|
|
1131
|
+
size_t read_end = (offset + n_size + alignment - 1) & ~(alignment - 1);
|
|
1077
1132
|
|
|
1078
1133
|
size_t bytes_read = 0;
|
|
1134
|
+
size_t data_read = 0; // Actual tensor data copied (excluding padding)
|
|
1135
|
+
|
|
1136
|
+
while (bytes_read < read_end - read_start) {
|
|
1137
|
+
size_t read_size = std::min<size_t>(buffer_size, read_end - read_start - bytes_read);
|
|
1079
1138
|
|
|
1080
|
-
|
|
1081
|
-
|
|
1139
|
+
// Align the destination pointer within the pinned buffer
|
|
1140
|
+
uintptr_t ptr_dest_aligned = (reinterpret_cast<uintptr_t>(host_ptrs[buffer_idx]) + alignment - 1) & ~(alignment - 1);
|
|
1082
1141
|
|
|
1142
|
+
// Wait for previous upload to complete before reusing buffer
|
|
1083
1143
|
ggml_backend_event_synchronize(events[buffer_idx]);
|
|
1084
|
-
|
|
1085
|
-
|
|
1144
|
+
|
|
1145
|
+
// Read aligned chunk from file
|
|
1146
|
+
file->read_raw_unsafe(reinterpret_cast<void *>(ptr_dest_aligned), read_size);
|
|
1147
|
+
|
|
1148
|
+
// Calculate actual data portion (excluding alignment padding)
|
|
1149
|
+
uintptr_t ptr_data = ptr_dest_aligned;
|
|
1150
|
+
size_t data_to_copy = read_size;
|
|
1151
|
+
|
|
1152
|
+
// Skip alignment padding at start of first chunk
|
|
1153
|
+
if (bytes_read == 0) {
|
|
1154
|
+
ptr_data += offset_from_alignment;
|
|
1155
|
+
data_to_copy -= offset_from_alignment;
|
|
1156
|
+
}
|
|
1157
|
+
|
|
1158
|
+
// Trim alignment padding at end of last chunk
|
|
1159
|
+
if (aligned_offset + bytes_read + read_size > offset + n_size) {
|
|
1160
|
+
data_to_copy -= (read_end - (offset + n_size));
|
|
1161
|
+
}
|
|
1162
|
+
|
|
1163
|
+
// Async upload actual data to GPU
|
|
1164
|
+
ggml_backend_tensor_set_async(upload_backend, cur,
|
|
1165
|
+
reinterpret_cast<void *>(ptr_data), data_read, data_to_copy);
|
|
1086
1166
|
ggml_backend_event_record(events[buffer_idx], upload_backend);
|
|
1087
1167
|
|
|
1088
|
-
|
|
1168
|
+
data_read += data_to_copy;
|
|
1169
|
+
bytes_read += read_size;
|
|
1170
|
+
|
|
1089
1171
|
++buffer_idx;
|
|
1090
1172
|
buffer_idx %= n_buffers;
|
|
1091
1173
|
}
|