llama-cpp-pydist 0.18.0-py3-none-any.whl → 0.19.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- llama_cpp/binaries/{llama-b7376-bin-win-cpu-x64.zip → llama-b7488-bin-win-cpu-x64.zip} +0 -0
- {llama_cpp_pydist-0.18.0.dist-info → llama_cpp_pydist-0.19.0.dist-info}/METADATA +1216 -1158
- {llama_cpp_pydist-0.18.0.dist-info → llama_cpp_pydist-0.19.0.dist-info}/RECORD +231 -206
- scripts/generate_changelog.py +10 -0
- vendor_llama_cpp_pydist/llama.cpp/.devops/cann.Dockerfile +1 -1
- vendor_llama_cpp_pydist/llama.cpp/.devops/llama-cli-cann.Dockerfile +3 -2
- vendor_llama_cpp_pydist/llama.cpp/.devops/llama-cpp-cuda.srpm.spec +2 -0
- vendor_llama_cpp_pydist/llama.cpp/.devops/llama-cpp.srpm.spec +2 -0
- vendor_llama_cpp_pydist/llama.cpp/.github/ISSUE_TEMPLATE/011-bug-results.yml +6 -3
- vendor_llama_cpp_pydist/llama.cpp/.github/ISSUE_TEMPLATE/019-bug-misc.yml +1 -0
- vendor_llama_cpp_pydist/llama.cpp/.github/workflows/build.yml +33 -2
- vendor_llama_cpp_pydist/llama.cpp/.github/workflows/release.yml +8 -46
- vendor_llama_cpp_pydist/llama.cpp/.github/workflows/server-webui.yml +225 -0
- vendor_llama_cpp_pydist/llama.cpp/.github/workflows/server.yml +0 -264
- vendor_llama_cpp_pydist/llama.cpp/.gitignore +1 -0
- vendor_llama_cpp_pydist/llama.cpp/CODEOWNERS +3 -2
- vendor_llama_cpp_pydist/llama.cpp/README.md +4 -2
- vendor_llama_cpp_pydist/llama.cpp/SECURITY.md +3 -0
- vendor_llama_cpp_pydist/llama.cpp/ci/run.sh +6 -0
- vendor_llama_cpp_pydist/llama.cpp/common/arg.cpp +226 -58
- vendor_llama_cpp_pydist/llama.cpp/common/arg.h +11 -2
- vendor_llama_cpp_pydist/llama.cpp/common/chat-peg-parser.cpp +12 -2
- vendor_llama_cpp_pydist/llama.cpp/common/chat.cpp +140 -0
- vendor_llama_cpp_pydist/llama.cpp/common/common.cpp +130 -67
- vendor_llama_cpp_pydist/llama.cpp/common/common.h +36 -12
- vendor_llama_cpp_pydist/llama.cpp/common/json-schema-to-grammar.cpp +132 -3
- vendor_llama_cpp_pydist/llama.cpp/common/json-schema-to-grammar.h +20 -0
- vendor_llama_cpp_pydist/llama.cpp/common/peg-parser.cpp +1 -1
- vendor_llama_cpp_pydist/llama.cpp/common/preset.cpp +218 -6
- vendor_llama_cpp_pydist/llama.cpp/common/preset.h +45 -3
- vendor_llama_cpp_pydist/llama.cpp/common/sampling.cpp +67 -54
- vendor_llama_cpp_pydist/llama.cpp/common/sampling.h +8 -0
- vendor_llama_cpp_pydist/llama.cpp/convert_hf_to_gguf.py +456 -321
- vendor_llama_cpp_pydist/llama.cpp/convert_hf_to_gguf_update.py +1 -0
- vendor_llama_cpp_pydist/llama.cpp/docs/android/imported-into-android-studio.jpg +0 -0
- vendor_llama_cpp_pydist/llama.cpp/docs/android.md +22 -2
- vendor_llama_cpp_pydist/llama.cpp/docs/backend/SYCL.md +2 -0
- vendor_llama_cpp_pydist/llama.cpp/docs/backend/hexagon/CMakeUserPresets.json +2 -0
- vendor_llama_cpp_pydist/llama.cpp/docs/development/HOWTO-add-model.md +3 -2
- vendor_llama_cpp_pydist/llama.cpp/docs/docker.md +15 -11
- vendor_llama_cpp_pydist/llama.cpp/docs/ops/SYCL.csv +797 -361
- vendor_llama_cpp_pydist/llama.cpp/docs/ops.md +9 -9
- vendor_llama_cpp_pydist/llama.cpp/ggml/CMakeLists.txt +5 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/include/ggml-alloc.h +9 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/include/ggml-backend.h +1 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/include/ggml.h +2 -1
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/CMakeLists.txt +3 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-alloc.c +56 -12
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-backend.cpp +21 -2
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +4 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +285 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +28 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +51 -6
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cpu/repack.cpp +286 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cpu/repack.h +8 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cpu/vec.cpp +41 -1
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cpu/vec.h +125 -22
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/argmax.cu +2 -2
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/mma.cuh +168 -111
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/mmf.cuh +14 -10
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/mmq.cuh +36 -29
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/ssm-conv.cu +14 -20
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/CMakeLists.txt +3 -1
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/ggml-hexagon.cpp +19 -7
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/htp/CMakeLists.txt +2 -1
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/htp/act-ops.c +90 -2
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/htp/htp-msg.h +6 -5
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/htp/hvx-utils.c +154 -47
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/htp/hvx-utils.h +57 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/htp/main.c +2 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/htp/matmul-ops.c +176 -28
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-metal/ggml-metal-device.m +7 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-sycl/add-id.cpp +77 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-sycl/add-id.hpp +8 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-sycl/common.hpp +17 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-sycl/convert.cpp +15 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +18 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +56 -3
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +97 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +4 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +17 -6
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +22 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-sycl/pad.cpp +5 -5
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-sycl/ssm_conv.cpp +1 -1
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +58 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +211 -52
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/diag.comp +29 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +3 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +3 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/get_rows.comp +2 -2
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq1_s.comp +40 -24
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_funcs.glsl +8 -5
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large1.comp +62 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large2.comp +79 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large3.comp +65 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large_common.glsl +53 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/topk_moe.comp +9 -3
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +14 -3
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml.c +5 -0
- vendor_llama_cpp_pydist/llama.cpp/gguf-py/gguf/constants.py +89 -0
- vendor_llama_cpp_pydist/llama.cpp/gguf-py/gguf/tensor_mapping.py +96 -3
- vendor_llama_cpp_pydist/llama.cpp/gguf-py/gguf/utility.py +1 -1
- vendor_llama_cpp_pydist/llama.cpp/grammars/README.md +3 -3
- vendor_llama_cpp_pydist/llama.cpp/include/llama.h +18 -1
- vendor_llama_cpp_pydist/llama.cpp/pyrightconfig.json +1 -1
- vendor_llama_cpp_pydist/llama.cpp/scripts/compare-logprobs.py +281 -0
- vendor_llama_cpp_pydist/llama.cpp/scripts/snapdragon/adb/run-mtmd.sh +65 -0
- vendor_llama_cpp_pydist/llama.cpp/scripts/sync-ggml.last +1 -1
- vendor_llama_cpp_pydist/llama.cpp/src/llama-arch.cpp +1890 -2248
- vendor_llama_cpp_pydist/llama.cpp/src/llama-arch.h +9 -2
- vendor_llama_cpp_pydist/llama.cpp/src/llama-context.cpp +93 -15
- vendor_llama_cpp_pydist/llama.cpp/src/llama-context.h +8 -2
- vendor_llama_cpp_pydist/llama.cpp/src/llama-graph.cpp +75 -7
- vendor_llama_cpp_pydist/llama.cpp/src/llama-graph.h +17 -4
- vendor_llama_cpp_pydist/llama.cpp/src/llama-hparams.cpp +3 -9
- vendor_llama_cpp_pydist/llama.cpp/src/llama-hparams.h +3 -6
- vendor_llama_cpp_pydist/llama.cpp/src/llama-impl.cpp +4 -0
- vendor_llama_cpp_pydist/llama.cpp/src/llama-kv-cache.cpp +85 -31
- vendor_llama_cpp_pydist/llama.cpp/src/llama-kv-cache.h +19 -2
- vendor_llama_cpp_pydist/llama.cpp/src/llama-memory-hybrid.cpp +1 -1
- vendor_llama_cpp_pydist/llama.cpp/src/llama-mmap.cpp +123 -28
- vendor_llama_cpp_pydist/llama.cpp/src/llama-mmap.h +5 -1
- vendor_llama_cpp_pydist/llama.cpp/src/llama-model-loader.cpp +58 -13
- vendor_llama_cpp_pydist/llama.cpp/src/llama-model-loader.h +2 -0
- vendor_llama_cpp_pydist/llama.cpp/src/llama-model.cpp +98 -57
- vendor_llama_cpp_pydist/llama.cpp/src/llama-model.h +1 -0
- vendor_llama_cpp_pydist/llama.cpp/src/llama-quant.cpp +1 -1
- vendor_llama_cpp_pydist/llama.cpp/src/llama-sampling.cpp +16 -0
- vendor_llama_cpp_pydist/llama.cpp/src/llama-vocab.cpp +2 -1
- vendor_llama_cpp_pydist/llama.cpp/src/llama.cpp +665 -1
- vendor_llama_cpp_pydist/llama.cpp/src/models/glm4-moe.cpp +28 -11
- vendor_llama_cpp_pydist/llama.cpp/src/models/glm4.cpp +27 -4
- vendor_llama_cpp_pydist/llama.cpp/src/models/models.h +5 -5
- vendor_llama_cpp_pydist/llama.cpp/src/models/nemotron-h.cpp +35 -6
- vendor_llama_cpp_pydist/llama.cpp/src/models/qwen2.cpp +12 -3
- vendor_llama_cpp_pydist/llama.cpp/src/models/qwen3next.cpp +81 -266
- vendor_llama_cpp_pydist/llama.cpp/tests/CMakeLists.txt +8 -0
- vendor_llama_cpp_pydist/llama.cpp/tests/test-arg-parser.cpp +29 -0
- vendor_llama_cpp_pydist/llama.cpp/tests/test-backend-ops.cpp +11 -4
- vendor_llama_cpp_pydist/llama.cpp/tests/test-chat.cpp +157 -0
- vendor_llama_cpp_pydist/llama.cpp/tests/test-json-schema-to-grammar.cpp +75 -0
- vendor_llama_cpp_pydist/llama.cpp/tests/test-state-restore-fragmented.cpp +122 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/CMakeLists.txt +1 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/cli/README.md +1 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/completion/README.md +16 -16
- vendor_llama_cpp_pydist/llama.cpp/tools/completion/completion.cpp +7 -16
- vendor_llama_cpp_pydist/llama.cpp/tools/cvector-generator/cvector-generator.cpp +3 -3
- vendor_llama_cpp_pydist/llama.cpp/tools/fit-params/CMakeLists.txt +8 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/fit-params/README.md +55 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/fit-params/fit-params.cpp +66 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/imatrix/imatrix.cpp +3 -3
- vendor_llama_cpp_pydist/llama.cpp/tools/llama-bench/README.md +1 -1
- vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/CMakeLists.txt +2 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/clip-graph.h +7 -1
- vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/clip-impl.h +29 -1
- vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/clip-model.h +53 -1
- vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/clip.cpp +265 -37
- vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/clip.h +1 -1
- vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/models/conformer.cpp +217 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/models/glm4v.cpp +120 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/models/models.h +10 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/models/whisper-enc.cpp +9 -10
- vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/mtmd-audio.cpp +371 -550
- vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/mtmd-audio.h +19 -28
- vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/mtmd-cli.cpp +22 -5
- vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/mtmd.cpp +29 -13
- vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/tests.sh +1 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/perplexity/perplexity.cpp +3 -3
- vendor_llama_cpp_pydist/llama.cpp/tools/server/README.md +45 -27
- vendor_llama_cpp_pydist/llama.cpp/tools/server/public/index.html.gz +0 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/server/server-context.cpp +51 -32
- vendor_llama_cpp_pydist/llama.cpp/tools/server/server-models.cpp +137 -266
- vendor_llama_cpp_pydist/llama.cpp/tools/server/server-models.h +24 -26
- vendor_llama_cpp_pydist/llama.cpp/tools/server/server.cpp +19 -9
- vendor_llama_cpp_pydist/llama.cpp/tools/server/tests/unit/test_compat_anthropic.py +1 -1
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/README.md +6 -5
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/docs/architecture/high-level-architecture-simplified.md +6 -2
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/docs/architecture/high-level-architecture.md +13 -3
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/package-lock.json +10 -10
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/package.json +3 -4
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/app.d.ts +7 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentPreview.svelte +1 -1
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentThumbnailFile.svelte +1 -1
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatForm/ChatForm.svelte +28 -3
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActionFileAttachments.svelte +1 -1
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActions.svelte +1 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessage.svelte +53 -8
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessageAssistant.svelte +4 -2
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessageEditForm.svelte +391 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessageStatistics.svelte +108 -6
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessageUser.svelte +26 -48
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessages.svelte +9 -4
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatScreen/ChatScreen.svelte +21 -4
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatScreen/ChatScreenHeader.svelte +6 -1
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatSettings/ChatSettings.svelte +16 -1
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatSettings/ChatSettingsImportExportTab.svelte +68 -2
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatSidebar/ChatSidebar.svelte +5 -1
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/components/app/misc/BadgeChatStatistic.svelte +26 -7
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/components/app/misc/MarkdownContent.svelte +263 -167
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/components/app/misc/SyntaxHighlightedCode.svelte +3 -2
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/components/app/models/ModelsSelector.svelte +199 -185
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/components/app/server/ServerErrorSplash.svelte +2 -1
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/components/ui/switch/index.ts +7 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/components/ui/switch/switch.svelte +29 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/constants/settings-config.ts +9 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/enums/chat.ts +4 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/enums/index.ts +2 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/markdown/enhance-code-blocks.ts +162 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/markdown/enhance-links.ts +33 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/services/parameter-sync.spec.ts +14 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/services/parameter-sync.ts +75 -13
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/stores/chat.svelte.ts +226 -169
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/stores/conversations.svelte.ts +24 -2
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/stores/server.svelte.ts +4 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/stores/settings.svelte.ts +2 -1
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/types/api.d.ts +2 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/utils/api-key-validation.ts +2 -1
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/utils/clipboard.ts +262 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/utils/file-preview.ts +0 -9
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/utils/index.ts +11 -2
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/utils/text.ts +7 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/routes/+layout.svelte +25 -9
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/tests/unit/clipboard.test.ts +423 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/{src/lib/utils → tests/unit}/latex-protection.test.ts +1 -1
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/{src/lib/utils → tests/unit}/model-names.test.ts +1 -1
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/vite.config.ts +2 -2
- vendor_llama_cpp_pydist/llama.cpp/tools/tts/tts.cpp +6 -6
- vendor_llama_cpp_pydist/llama.cpp/vendor/cpp-httplib/CMakeLists.txt +2 -1
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/utils/copy.ts +0 -71
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/tests/server/demo.spec.ts +0 -7
- {llama_cpp_pydist-0.18.0.dist-info → llama_cpp_pydist-0.19.0.dist-info}/WHEEL +0 -0
- {llama_cpp_pydist-0.18.0.dist-info → llama_cpp_pydist-0.19.0.dist-info}/licenses/LICENSE +0 -0
- {llama_cpp_pydist-0.18.0.dist-info → llama_cpp_pydist-0.19.0.dist-info}/top_level.txt +0 -0
vendor_llama_cpp_pydist/llama.cpp/src/llama-kv-cache.cpp

@@ -175,7 +175,15 @@ llama_kv_cache::llama_kv_cache(

     // allocate tensors and initialize the buffers to avoid NaNs in the padding
     for (auto & [buft, ctx] : ctx_map) {
-        ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx.get(), buft);
+        ggml_backend_buffer_t buf;
+        if (model.hparams.no_alloc) {
+            buf = ggml_backend_buft_alloc_buffer(buft, /*size =*/ 0); // dummy buffer
+            for (ggml_tensor * t = ggml_get_first_tensor(ctx.get()); t != nullptr; t = ggml_get_next_tensor(ctx.get(), t)) {
+                t->buffer = buf; // set dummy buffer for KV cache so that the backend scheduler won't try to allocate it
+            }
+        } else {
+            buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx.get(), buft); // real buffer
+        }
         if (!buf) {
             throw std::runtime_error("failed to allocate buffer for kv cache");
         }

@@ -482,9 +490,18 @@ llama_pos llama_kv_cache::seq_pos_max(llama_seq_id seq_id) const {

 std::map<ggml_backend_buffer_type_t, size_t> llama_kv_cache::memory_breakdown() const {
     std::map<ggml_backend_buffer_type_t, size_t> ret;
-    for (const auto & [
-
+    for (const auto & [ctx, buf] : ctxs_bufs) {
+        ggml_backend_buffer_type_t buft = ggml_backend_buffer_get_type(buf.get());
+
+        if (hparams.no_alloc) {
+            GGML_ASSERT(ggml_backend_buffer_get_base(buf.get()) == nullptr);
+            ret[buft] += ggml_backend_alloc_ctx_tensors_from_buft_size(ctx.get(), buft);
+        } else {
+            // GGML_ASSERT(ggml_backend_buffer_get_base(buf.get()) != nullptr); // multi_buffer does not have a defined base
+            ret[buft] += ggml_backend_buffer_get_size(buf.get());
+        }
     }
+
     return ret;
 }

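The two hunks above let a llama_kv_cache be built without any backing memory: every KV tensor is attached to a zero-size dummy buffer, and memory_breakdown() then reports what a real allocation would need via ggml_backend_alloc_ctx_tensors_from_buft_size() instead of measuring an existing buffer. A minimal sketch of that dry-run sizing pattern in isolation is shown below (the tensor shapes are invented for illustration, and the CPU buffer type stands in for whatever backend would actually hold the cache):

    #include "ggml.h"
    #include "ggml-alloc.h"
    #include "ggml-backend.h"
    #include <cstdio>

    // Dry-run sizing sketch: create tensor metadata only (no_alloc context), then
    // ask how large a buffer of a given type would need to be, without allocating it.
    int main() {
        ggml_init_params params = {
            /*.mem_size   =*/ ggml_tensor_overhead() * 8,
            /*.mem_buffer =*/ nullptr,
            /*.no_alloc   =*/ true, // metadata only, no tensor data
        };
        ggml_context * ctx = ggml_init(params);

        // hypothetical K and V tensors for a single layer (shapes are illustrative)
        ggml_new_tensor_2d(ctx, GGML_TYPE_F16, 1024, 4096);
        ggml_new_tensor_2d(ctx, GGML_TYPE_F16, 1024, 4096);

        ggml_backend_buffer_type_t buft = ggml_backend_cpu_buffer_type();

        // same call the new no_alloc path uses in memory_breakdown()
        size_t needed = ggml_backend_alloc_ctx_tensors_from_buft_size(ctx, buft);
        printf("KV buffer would need %zu bytes\n", needed);

        ggml_free(ctx);
        return 0;
    }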
vendor_llama_cpp_pydist/llama.cpp/src/llama-kv-cache.cpp

@@ -1372,7 +1389,7 @@ ggml_tensor * llama_kv_cache::build_rope_shift(
     const auto & yarn_ext_factor = cparams.yarn_ext_factor;
     const auto & yarn_beta_fast  = cparams.yarn_beta_fast;
     const auto & yarn_beta_slow  = cparams.yarn_beta_slow;
-    const auto & yarn_attn_factor =
+    const auto & yarn_attn_factor = cparams.yarn_attn_factor;

     const auto & n_rot     = hparams.n_rot;
     const auto & rope_type = hparams.rope_type == LLAMA_ROPE_TYPE_MROPE || hparams.rope_type == LLAMA_ROPE_TYPE_IMROPE

@@ -1544,9 +1561,11 @@ void llama_kv_cache::state_read(llama_io_read_i & io, llama_seq_id seq_id, llama

         const uint32_t strm = seq_id == -1 ? s : seq_to_stream[seq_id];

+        slot_info sinfo;
+
         bool res = true;
-        res = res && state_read_meta(io, strm, cell_count, seq_id);
-        res = res && state_read_data(io, strm, cell_count);
+        res = res && state_read_meta(io, strm, cell_count, sinfo, seq_id);
+        res = res && state_read_data(io, strm, cell_count, sinfo);

         if (!res) {
             if (seq_id == -1) {

@@ -1685,7 +1704,7 @@ void llama_kv_cache::state_write_data(llama_io_write_i & io, const cell_ranges_t
     }
 }

-bool llama_kv_cache::state_read_meta(llama_io_read_i & io, uint32_t strm, uint32_t cell_count, llama_seq_id dest_seq_id) {
+bool llama_kv_cache::state_read_meta(llama_io_read_i & io, uint32_t strm, uint32_t cell_count, slot_info & sinfo, llama_seq_id dest_seq_id) {
     auto & cells = v_cells[strm];
     auto & head  = v_heads[strm];


@@ -1722,7 +1741,7 @@ bool llama_kv_cache::state_read_meta(llama_io_read_i & io, uint32_t strm, uint32
             ubatch.seq_id[i] = &dest_seq_id;
         }

-
+        sinfo = find_slot(ubatch, false);
         if (sinfo.empty()) {
             LLAMA_LOG_ERROR("%s: failed to find available cells in kv cache\n", __func__);
             return false;

@@ -1732,20 +1751,16 @@ bool llama_kv_cache::state_read_meta(llama_io_read_i & io, uint32_t strm, uint32
         // see: https://github.com/ggml-org/llama.cpp/pull/16825#issuecomment-3460868350
         apply_ubatch(sinfo, ubatch);

-
+        LLAMA_LOG_DEBUG("%s: cell_count = %d, dest_seq_id = %d\n", __func__, cell_count, dest_seq_id);

-        //
-
-
-
-
-
-
-        GGML_ASSERT(cells.pos_get(head_cur) == ubatch.pos[0]);
-        GGML_ASSERT(cells.pos_get(head_cur + cell_count - 1) == ubatch.pos[cell_count - 1]);
-        GGML_ASSERT(cells.seq_has(head_cur, dest_seq_id));
-        GGML_ASSERT(cells.seq_has(head_cur + cell_count - 1, dest_seq_id));
+        // DEBUG CHECK: verify that all cells were allocated and have correct seq_id and pos values
+        GGML_ASSERT(sinfo.n_stream() == 1);
+        GGML_ASSERT(sinfo.idxs[0].size() == cell_count);
+        for (uint32_t i = 0; i < cell_count; ++i) {
+            const uint32_t idx = sinfo.idxs[0][i];
+            GGML_ASSERT(cells.pos_get(idx) == ubatch.pos[i]);
+            GGML_ASSERT(cells.seq_has(idx, dest_seq_id));
+        }
     } else {
         // whole KV cache restore


@@ -1778,15 +1793,24 @@ bool llama_kv_cache::state_read_meta(llama_io_read_i & io, uint32_t strm, uint32
             }
         }

+        // Create contiguous slot_info for whole cache restore
+        sinfo.s0 = strm;
+        sinfo.s1 = strm;
+        sinfo.resize(1);
+        sinfo.strm[0] = strm;
+        sinfo.idxs[0].resize(cell_count);
+        for (uint32_t i = 0; i < cell_count; ++i) {
+            sinfo.idxs[0][i] = i;
+        }
+
         head = 0;
     }

     return true;
 }

-bool llama_kv_cache::state_read_data(llama_io_read_i & io, uint32_t strm, uint32_t cell_count) {
+bool llama_kv_cache::state_read_data(llama_io_read_i & io, uint32_t strm, uint32_t cell_count, const slot_info & sinfo) {
     auto & cells = v_cells[strm];
-    auto & head  = v_heads[strm];

     uint32_t v_trans;
     uint32_t n_layer;

@@ -1836,8 +1860,17 @@ bool llama_kv_cache::state_read_data(llama_io_read_i & io, uint32_t strm, uint32
         }

         if (cell_count) {
-
-
+            if (sinfo.is_contiguous()) {
+                // Fast path: contiguous cells, single memcpy
+                ggml_backend_tensor_set(k, io.read(cell_count * k_size_row), sinfo.head() * k_size_row, cell_count * k_size_row);
+            } else {
+                // Slow path: scatter to non-contiguous positions
+                const void * src = io.read(cell_count * k_size_row);
+                for (uint32_t i = 0; i < cell_count; ++i) {
+                    const size_t dst_offset = sinfo.idxs[0][i] * k_size_row;
+                    ggml_backend_tensor_set(k, (const char*)src + i * k_size_row, dst_offset, k_size_row);
+                }
+            }
         }
     }


@@ -1868,8 +1901,17 @@ bool llama_kv_cache::state_read_data(llama_io_read_i & io, uint32_t strm, uint32
             }

             if (cell_count) {
-
-
+                if (sinfo.is_contiguous()) {
+                    // Fast path: contiguous cells, single memcpy
+                    ggml_backend_tensor_set(v, io.read(cell_count * v_size_row), sinfo.head() * v_size_row, cell_count * v_size_row);
+                } else {
+                    // Slow path: scatter to non-contiguous positions
+                    const void * src = io.read(cell_count * v_size_row);
+                    for (uint32_t i = 0; i < cell_count; ++i) {
+                        const size_t dst_offset = sinfo.idxs[0][i] * v_size_row;
+                        ggml_backend_tensor_set(v, (const char*)src + i * v_size_row, dst_offset, v_size_row);
+                    }
+                }
             }
         }
     } else {

@@ -1908,10 +1950,22 @@ bool llama_kv_cache::state_read_data(llama_io_read_i & io, uint32_t strm, uint32
             }

             if (cell_count) {
-
-
-                const
-
+                if (sinfo.is_contiguous()) {
+                    // Fast path: contiguous cells
+                    const uint32_t h = sinfo.head();
+                    for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
+                        const size_t dst_offset = (h + j * cells.size()) * v_size_el;
+                        ggml_backend_tensor_set(v, io.read(cell_count * v_size_el), dst_offset, cell_count * v_size_el);
+                    }
+                } else {
+                    // Slow path: scatter to non-contiguous positions
+                    for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
+                        const void * src = io.read(cell_count * v_size_el);
+                        for (uint32_t i = 0; i < cell_count; ++i) {
+                            const size_t dst_offset = (sinfo.idxs[0][i] + j * cells.size()) * v_size_el;
+                            ggml_backend_tensor_set(v, (const char*)src + i * v_size_el, dst_offset, v_size_el);
+                        }
+                    }
                 }
             }
         }
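Taken together, these hunks implement the fragmented state restore covered by the new tests/test-state-restore-fragmented.cpp: state_read_meta() now returns the slot_info chosen by find_slot(), and state_read_data() either restores it with one contiguous copy per row or scatters row by row into the individual cells. From the public API the path is exercised through the ordinary sequence-state calls; a hedged sketch of that scenario follows (the file name is a placeholder and error handling is omitted):

    #include "llama.h"
    #include <vector>

    // Sketch: persist one sequence's KV state and restore it later, possibly into
    // a cache whose free cells are no longer one contiguous block; the new scatter
    // path in llama_kv_cache::state_read_data() is what makes that case work.
    void save_then_restore(llama_context * ctx, const std::vector<llama_token> & tokens) {
        // save sequence 0 together with its tokens
        llama_state_seq_save_file(ctx, "seq0.bin", /*seq_id =*/ 0,
                                  tokens.data(), tokens.size());

        // ... other sequences come and go, fragmenting the KV cache ...

        std::vector<llama_token> restored(tokens.size());
        size_t n_restored = 0;
        llama_state_seq_load_file(ctx, "seq0.bin", /*dest_seq_id =*/ 1,
                                  restored.data(), restored.size(), &n_restored);
    }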
vendor_llama_cpp_pydist/llama.cpp/src/llama-kv-cache.h

@@ -72,6 +72,23 @@ public:
         void clear() {
             idxs.clear();
         }
+
+        // check if indices are contiguous starting from head()
+        bool is_contiguous() const {
+            if (idxs.empty() || idxs[0].empty()) {
+                return true;
+            }
+            if (idxs.size() > 1) {
+                return false;
+            }
+            const uint32_t h = idxs[0][0];
+            for (size_t i = 0; i < idxs[0].size(); ++i) {
+                if (idxs[0][i] != h + i) {
+                    return false;
+                }
+            }
+            return true;
+        }
     };

     using slot_info_vec_t = std::vector<slot_info>;

@@ -264,8 +281,8 @@ private:
     void state_write_meta(llama_io_write_i & io, const cell_ranges_t & cr, llama_seq_id seq_id = -1) const;
     void state_write_data(llama_io_write_i & io, const cell_ranges_t & cr) const;

-    bool state_read_meta(llama_io_read_i & io, uint32_t strm, uint32_t cell_count, llama_seq_id dest_seq_id = -1);
-    bool state_read_data(llama_io_read_i & io, uint32_t strm, uint32_t cell_count);
+    bool state_read_meta(llama_io_read_i & io, uint32_t strm, uint32_t cell_count, slot_info & sinfo, llama_seq_id dest_seq_id = -1);
+    bool state_read_data(llama_io_read_i & io, uint32_t strm, uint32_t cell_count, const slot_info & sinfo);
 };

 class llama_kv_cache_context : public llama_memory_context_i {

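slot_info::is_contiguous() above is the switch between the two restore paths. A stand-alone copy of the check, run on a few hypothetical index sets, makes the rule concrete (single stream, indices increasing by exactly one from the first entry):

    #include <cstdint>
    #include <vector>
    #include <cassert>

    // Stand-alone copy of the contiguity test used by slot_info::is_contiguous():
    // a single stream whose indices increase by exactly 1 from the first index.
    static bool is_contiguous(const std::vector<std::vector<uint32_t>> & idxs) {
        if (idxs.empty() || idxs[0].empty()) {
            return true;
        }
        if (idxs.size() > 1) {
            return false;
        }
        const uint32_t h = idxs[0][0];
        for (size_t i = 0; i < idxs[0].size(); ++i) {
            if (idxs[0][i] != h + i) {
                return false;
            }
        }
        return true;
    }

    int main() {
        assert( is_contiguous({{5, 6, 7, 8}}));   // fast path: one copy per row
        assert(!is_contiguous({{5, 7, 8, 9}}));   // slow path: per-cell scatter
        assert(!is_contiguous({{0, 1}, {2, 3}})); // more than one stream
        return 0;
    }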
vendor_llama_cpp_pydist/llama.cpp/src/llama-memory-hybrid.cpp

@@ -222,7 +222,7 @@ llama_memory_hybrid_context::llama_memory_hybrid_context(
     ubatches(std::move(ubatches)),
     // note: here we copy the ubatches. not sure if this is ideal
     ctx_attn(new llama_kv_cache_context(mem->get_mem_attn(), std::move(sinfos_attn), this->ubatches)),
-    ctx_recr(new llama_memory_recurrent_context(mem->get_mem_recr(),
+    ctx_recr(new llama_memory_recurrent_context(mem->get_mem_recr(), this->ubatches)),
     status(llama_memory_status_combine(ctx_attn->get_status(), ctx_recr->get_status())) {
 }

vendor_llama_cpp_pydist/llama.cpp/src/llama-mmap.cpp

@@ -13,9 +13,10 @@
 #ifdef __has_include
     #if __has_include(<unistd.h>)
         #include <unistd.h>
+        #include <fcntl.h>
+        #include <sys/stat.h>
         #if defined(_POSIX_MAPPED_FILES)
             #include <sys/mman.h>
-            #include <fcntl.h>
         #endif
         #if defined(_POSIX_MEMLOCK_RANGE)
             #include <sys/resource.h>

@@ -74,7 +75,7 @@ struct llama_file::impl {
         return ret;
     }

-    impl(const char * fname, const char * mode) {
+    impl(const char * fname, const char * mode, [[maybe_unused]] const bool use_direct_io = false) {
        fp = ggml_fopen(fname, mode);
        if (fp == NULL) {
            throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno)));

@@ -153,13 +154,40 @@
        write_raw(&val, sizeof(val));
    }

+    void read_aligned_chunk(size_t offset, void * dest, size_t size) const {
+        throw std::runtime_error("DirectIO is not implemented on Windows.");
+    }
+
    ~impl() {
        if (fp) {
            std::fclose(fp);
        }
    }
 #else
-    impl(const char * fname, const char * mode) {
+    impl(const char * fname, const char * mode, [[maybe_unused]] const bool use_direct_io = false) {
+#ifdef __linux__
+        // Try unbuffered I/O for read only
+        if (use_direct_io && std::strcmp(mode, "rb") == 0) {
+            fd = open(fname, O_RDONLY | O_DIRECT);
+
+            if (fd != -1) {
+                struct stat file_stats{};
+                fstat(fd, &file_stats);
+
+                size = file_stats.st_size;
+                alignment = file_stats.st_blksize;
+
+                off_t ret = lseek(fd, 0, SEEK_SET);
+                if (ret == -1) {
+                    throw std::runtime_error(format("seek error: %s", strerror(errno)));
+                }
+                return;
+            }
+
+            LLAMA_LOG_WARN("Failed to open model %s with error: %s. Falling back to buffered I/O",
+                fname, strerror(errno));
+        }
+#endif
        fp = ggml_fopen(fname, mode);
        if (fp == NULL) {
            throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno)));

@@ -170,27 +198,30 @@
    }

    size_t tell() const {
-
-
-
-
-
-
-
-        throw std::runtime_error(format("ftell error: %s", strerror(errno)));
+        if (fd == -1) {
+            long ret = std::ftell(fp);
+            if (ret == -1) {
+                throw std::runtime_error(format("ftell error: %s", strerror(errno)));
+            }
+
+            return (size_t) ret;
        }

-
+        off_t pos = lseek(fd, 0, SEEK_CUR);
+        if (pos == -1) {
+            throw std::runtime_error(format("lseek error: %s", strerror(errno)));
+        }
+        return (size_t) pos;
    }

    void seek(size_t offset, int whence) const {
-
-
-
-
-
-
-        if (ret
+        off_t ret = 0;
+        if (fd == -1) {
+            ret = std::fseek(fp, (long) offset, whence);
+        } else {
+            ret = lseek(fd, offset, whence);
+        }
+        if (ret == -1) {
            throw std::runtime_error(format("seek error: %s", strerror(errno)));
        }
    }

@@ -200,13 +231,55 @@
            return;
        }
        errno = 0;
-
-
-
+        if (fd == -1) {
+            std::size_t ret = std::fread(ptr, len, 1, fp);
+            if (ferror(fp)) {
+                throw std::runtime_error(format("read error: %s", strerror(errno)));
+            }
+            if (ret != 1) {
+                throw std::runtime_error("unexpectedly reached end of file");
+            }
+        } else {
+            bool successful = false;
+            while (!successful) {
+                off_t ret = read(fd, ptr, len);
+
+                if (ret == -1) {
+                    if (errno == EINTR) {
+                        continue; // Interrupted by signal, retry
+                    }
+                    throw std::runtime_error(format("read error: %s", strerror(errno)));
+                }
+                if (ret == 0) {
+                    throw std::runtime_error("unexpectedly reached end of file");
+                }
+
+                successful = true;
+            }
        }
-
-
+    }
+
+    void read_aligned_chunk(size_t offset, void * dest, size_t size) const {
+        off_t aligned_offset = offset & ~(alignment - 1);
+        off_t offset_from_alignment = offset - aligned_offset;
+        size_t bytes_to_read = (offset_from_alignment + size + alignment - 1) & ~(alignment - 1);
+
+        void * raw_buffer = nullptr;
+        int ret = posix_memalign(&raw_buffer, alignment, bytes_to_read);
+        if (ret != 0) {
+            throw std::runtime_error(format("posix_memalign failed with error %d", ret));
        }
+
+        struct aligned_buffer_deleter {
+            void operator()(void * p) const { free(p); }
+        };
+        std::unique_ptr<void, aligned_buffer_deleter> buffer(raw_buffer);
+
+        seek(aligned_offset, SEEK_SET);
+        read_raw(buffer.get(), bytes_to_read);
+
+        uintptr_t actual_data = reinterpret_cast<uintptr_t>(buffer.get()) + offset_from_alignment;
+        memcpy(dest, reinterpret_cast<void *>(actual_data), size);
    }

    uint32_t read_u32() const {

@@ -231,22 +304,43 @@
    }

    ~impl() {
-        if (fp) {
+        if (fd != -1) {
+            close(fd);
+        } else {
            std::fclose(fp);
        }
    }
+    int fd = -1;
 #endif

-
-
+    void read_raw_at(void * ptr, size_t len, size_t offset) const {
+        if (alignment != 1) {
+            read_aligned_chunk(offset, ptr, len);
+        } else {
+            seek(offset, SEEK_SET);
+            read_raw(ptr, len);
+        }
+    }
+
+    size_t read_alignment() const {
+        return alignment;
+    }
+
+    size_t alignment = 1;
+
+    FILE * fp{};
+    size_t size{};
 };

-llama_file::llama_file(const char * fname, const char * mode) : pimpl(std::make_unique<impl>(fname, mode)) {}
+llama_file::llama_file(const char * fname, const char * mode, const bool use_direct_io) :
+    pimpl(std::make_unique<impl>(fname, mode, use_direct_io)) {}
 llama_file::~llama_file() = default;

 size_t llama_file::tell() const { return pimpl->tell(); }
 size_t llama_file::size() const { return pimpl->size; }

+size_t llama_file::read_alignment() const { return pimpl->read_alignment(); }
+
 int llama_file::file_id() const {
 #ifdef _WIN32
     return _fileno(pimpl->fp);

@@ -261,6 +355,7 @@ int llama_file::file_id() const {

 void llama_file::seek(size_t offset, int whence) const { pimpl->seek(offset, whence); }
 void llama_file::read_raw(void * ptr, size_t len) const { pimpl->read_raw(ptr, len); }
+void llama_file::read_raw_at(void * ptr, size_t len, size_t offset) const { pimpl->read_raw_at(ptr, len, offset); }

 uint32_t llama_file::read_u32() const { return pimpl->read_u32(); }

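read_aligned_chunk() above is the heart of the O_DIRECT path: it widens the requested [offset, offset + size) range to the block size reported by fstat(), reads the widened range into a posix_memalign'd buffer, and copies only the requested bytes out. A small worked example of that arithmetic (the 4096-byte block size and the offsets are illustrative):

    #include <cstddef>
    #include <cstdio>

    // Reproduces the offset arithmetic from read_aligned_chunk() for one example:
    // alignment = 4096, request = 5000 bytes starting at offset 10000.
    int main() {
        const size_t alignment = 4096;
        const size_t offset    = 10000;
        const size_t size      = 5000;

        const size_t aligned_offset        = offset & ~(alignment - 1);              // 8192
        const size_t offset_from_alignment = offset - aligned_offset;                // 1808
        const size_t bytes_to_read         = (offset_from_alignment + size + alignment - 1)
                                             & ~(alignment - 1);                     // 8192

        // The O_DIRECT read covers [8192, 16384); the caller only sees bytes
        // [10000, 15000), i.e. dest = buffer + offset_from_alignment, length = size.
        printf("read %zu bytes at %zu, copy out %zu starting at +%zu\n",
               bytes_to_read, aligned_offset, size, offset_from_alignment);
        return 0;
    }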
vendor_llama_cpp_pydist/llama.cpp/src/llama-mmap.h

@@ -3,6 +3,7 @@
 #include <cstdint>
 #include <memory>
 #include <vector>
+#include <cstdio>

 struct llama_file;
 struct llama_mmap;

@@ -13,7 +14,7 @@ using llama_mmaps = std::vector<std::unique_ptr<llama_mmap>>;
 using llama_mlocks = std::vector<std::unique_ptr<llama_mlock>>;

 struct llama_file {
-    llama_file(const char * fname, const char * mode);
+    llama_file(const char * fname, const char * mode, bool use_direct_io = false);
     ~llama_file();

     size_t tell() const;

@@ -24,11 +25,14 @@ struct llama_file {
     void seek(size_t offset, int whence) const;

     void read_raw(void * ptr, size_t len) const;
+    void read_raw_at(void * ptr, size_t len, size_t offset) const;
+    void read_aligned_chunk(size_t offset, void * dest, size_t size) const;
     uint32_t read_u32() const;

     void write_raw(const void * ptr, size_t len) const;
     void write_u32(uint32_t val) const;

+    size_t read_alignment() const;
 private:
     struct impl;
     std::unique_ptr<impl> pimpl;

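With these declarations a caller opts into direct I/O per file. A short usage sketch under the new interface (the path is a placeholder; on non-Linux builds, or if the O_DIRECT open fails, the object silently falls back to buffered stdio and read_alignment() stays at 1):

    #include "llama-mmap.h"
    #include <vector>

    // Sketch of the new llama_file interface: open read-only with direct I/O
    // requested, then do offset-based reads that are transparently block-aligned.
    void read_some_bytes() {
        llama_file f("model.gguf", "rb", /*use_direct_io =*/ true);

        std::vector<char> buf(1024);
        // reads 1024 bytes starting at file offset 12345; with O_DIRECT active this
        // goes through read_aligned_chunk(), otherwise through seek() + read_raw()
        f.read_raw_at(buf.data(), buf.size(), 12345);
    }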
vendor_llama_cpp_pydist/llama.cpp/src/llama-model-loader.cpp

@@ -473,6 +473,7 @@ llama_model_loader::llama_model_loader(
         std::vector<std::string> & splits,
         bool use_mmap,
         bool check_tensors,
+        bool no_alloc,
         const llama_model_kv_override * param_overrides_p,
         const llama_model_tensor_buft_override * param_tensor_buft_overrides_p) {
     int trace = 0;

@@ -503,7 +504,7 @@ llama_model_loader::llama_model_loader(
     get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false);
     llm_kv = LLM_KV(llm_arch_from_string(arch_name));

-    files.emplace_back(new llama_file(fname.c_str(), "rb"));
+    files.emplace_back(new llama_file(fname.c_str(), "rb", !use_mmap));
     contexts.emplace_back(ctx);

     // Save tensors data offset of the main file.

@@ -571,7 +572,7 @@ llama_model_loader::llama_model_loader(
         }
     }

-    files.emplace_back(new llama_file(fname_split, "rb"));
+    files.emplace_back(new llama_file(fname_split, "rb", !use_mmap));
     contexts.emplace_back(ctx);

     // Save tensors data offset info of the shard.

@@ -716,6 +717,7 @@ llama_model_loader::llama_model_loader(

     this->use_mmap = use_mmap;
     this->check_tensors = check_tensors;
+    this->no_alloc = no_alloc;
 }

 std::string llama_model_loader::get_arch_name() const {

@@ -933,7 +935,15 @@ bool llama_model_loader::load_all_data(
     // 4 staging buffers for async uploads, each sized 1MB seems to be a good default for single NVMe drives.
     // NVMe raid configurations might require more / larger buffers.
     constexpr size_t n_buffers = 4;
-
+
+    size_t alignment = 1;
+    for (const auto & file : files) {
+        alignment = std::max(file->read_alignment(), alignment);
+    }
+
+    // Buffer size: balance between memory usage and I/O efficiency
+    // 64MB works well for NVMe drives
+    const size_t buffer_size = alignment != 1 ? 64 * 1024 * 1024 + 2 * alignment : 1 * 1024 * 1024;

     std::vector<ggml_backend_buffer_t> host_buffers;
     std::vector<ggml_backend_event_t> events;

@@ -983,6 +993,7 @@ bool llama_model_loader::load_all_data(
         // If the backend is supported, create pinned memory buffers and events for synchronisation.
         for (size_t idx = 0; idx < n_buffers; ++idx) {
             auto * buf = ggml_backend_buft_alloc_buffer(host_buft, buffer_size);
+
             if (!buf) {
                 LLAMA_LOG_DEBUG("%s: failed to allocate host buffer for async uploads for device %s\n", func,
                     ggml_backend_dev_name(dev));

@@ -1064,9 +1075,9 @@ bool llama_model_loader::load_all_data(
             }
         } else {
             const auto & file = files.at(weight->idx);
+
             if (ggml_backend_buffer_is_host(cur->buffer)) {
-                file->seek(weight->offs, SEEK_SET);
-                file->read_raw(cur->data, n_size);
+                file->read_raw_at(cur->data, n_size, weight->offs);
                 if (check_tensors) {
                     validation_result.emplace_back(std::async(std::launch::async, [cur, n_size] {
                         return std::make_pair(cur, ggml_validate_row_data(cur->type, cur->data, n_size));

@@ -1075,26 +1086,60 @@ bool llama_model_loader::load_all_data(
             } else {
                 // If upload_backend is valid load the tensor in chunks to pinned memory and upload the buffers asynchronously to the GPU.
                 if (upload_backend) {
-
+                    size_t offset = weight->offs;
+                    alignment = file->read_alignment();
+                    size_t aligned_offset = offset & ~(alignment - 1);
+                    size_t offset_from_alignment = offset - aligned_offset;
+                    file->seek(aligned_offset, SEEK_SET);
+
+                    // Calculate aligned read boundaries
+                    size_t read_start = aligned_offset;
+                    size_t read_end = (offset + n_size + alignment - 1) & ~(alignment - 1);

                     size_t bytes_read = 0;
+                    size_t data_read = 0; // Actual tensor data copied (excluding padding)
+
+                    while (bytes_read < read_end - read_start) {
+                        size_t read_size = std::min<size_t>(buffer_size, read_end - read_start - bytes_read);

-
-
+                        // Align the destination pointer within the pinned buffer
+                        uintptr_t ptr_dest_aligned = (reinterpret_cast<uintptr_t>(host_ptrs[buffer_idx]) + alignment - 1) & ~(alignment - 1);

+                        // Wait for previous upload to complete before reusing buffer
                         ggml_backend_event_synchronize(events[buffer_idx]);
-
-
+
+                        // Read aligned chunk from file
+                        file->read_raw(reinterpret_cast<void *>(ptr_dest_aligned), read_size);
+
+                        // Calculate actual data portion (excluding alignment padding)
+                        uintptr_t ptr_data = ptr_dest_aligned;
+                        size_t data_to_copy = read_size;
+
+                        // Skip alignment padding at start of first chunk
+                        if (bytes_read == 0) {
+                            ptr_data += offset_from_alignment;
+                            data_to_copy -= offset_from_alignment;
+                        }
+
+                        // Trim alignment padding at end of last chunk
+                        if (aligned_offset + bytes_read + read_size > offset + n_size) {
+                            data_to_copy -= (read_end - (offset + n_size));
+                        }
+
+                        // Async upload actual data to GPU
+                        ggml_backend_tensor_set_async(upload_backend, cur,
+                            reinterpret_cast<void *>(ptr_data), data_read, data_to_copy);
                         ggml_backend_event_record(events[buffer_idx], upload_backend);

-
+                        data_read += data_to_copy;
+                        bytes_read += read_size;
+
                         ++buffer_idx;
                         buffer_idx %= n_buffers;
                     }
                 } else {
                     read_buf.resize(n_size);
-                    file->seek(weight->offs, SEEK_SET);
-                    file->read_raw(read_buf.data(), n_size);
+                    file->read_raw_at(read_buf.data(), n_size, weight->offs);
                     ggml_backend_tensor_set(cur, read_buf.data(), 0, n_size);
                     if (check_tensors && !ggml_validate_row_data(cur->type, read_buf.data(), n_size)) {
                         throw std::runtime_error(format("tensor '%s' has invalid data", ggml_get_name(cur)));

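In load_all_data() the async-upload path now has to respect the same block alignment: the pinned staging buffers grow to 64 MiB plus two alignment slacks, each tensor's read window is widened to block boundaries, and the first and last chunks drop the leading and trailing padding before the ggml_backend_tensor_set_async() upload. The following dry run reproduces just that bookkeeping on made-up numbers (4 KiB blocks, 64 KiB chunks instead of 64 MiB) and checks that every tensor byte is uploaded exactly once:

    #include <algorithm>
    #include <cassert>
    #include <cstddef>
    #include <cstdio>

    // Dry-run of the chunking logic from load_all_data(): walk the widened, aligned
    // read window in fixed-size chunks and count how many tensor bytes each chunk
    // contributes once the leading/trailing alignment padding is dropped.
    // Numbers are illustrative only (a real run uses 64 MiB staging buffers).
    int main() {
        const size_t alignment   = 4096;
        const size_t buffer_size = 64 * 1024;   // staging chunk size for this example
        const size_t offset      = 10000;       // tensor offset in the file
        const size_t n_size      = 100000;      // tensor size in bytes

        const size_t aligned_offset        = offset & ~(alignment - 1);
        const size_t offset_from_alignment = offset - aligned_offset;
        const size_t read_start            = aligned_offset;
        const size_t read_end              = (offset + n_size + alignment - 1) & ~(alignment - 1);

        size_t bytes_read = 0; // file bytes consumed (including padding)
        size_t data_read  = 0; // tensor bytes uploaded (excluding padding)

        while (bytes_read < read_end - read_start) {
            const size_t read_size = std::min(buffer_size, read_end - read_start - bytes_read);

            size_t data_to_copy = read_size;
            if (bytes_read == 0) {
                data_to_copy -= offset_from_alignment;          // skip padding before the tensor
            }
            if (aligned_offset + bytes_read + read_size > offset + n_size) {
                data_to_copy -= read_end - (offset + n_size);   // trim padding after the tensor
            }

            printf("chunk at +%zu: read %zu, upload %zu (dest offset %zu)\n",
                   bytes_read, read_size, data_to_copy, data_read);

            data_read  += data_to_copy;
            bytes_read += read_size;
        }

        assert(data_read == n_size); // every tensor byte was uploaded exactly once
        return 0;
    }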
vendor_llama_cpp_pydist/llama.cpp/src/llama-model-loader.h

@@ -71,6 +71,7 @@ struct llama_model_loader {

     bool use_mmap = false;
     bool check_tensors;
+    bool no_alloc;

     llama_files files;
     llama_ftype ftype;

@@ -97,6 +98,7 @@ struct llama_model_loader {
         std::vector<std::string> & splits, // optional, only need if the split does not follow naming scheme
         bool use_mmap,
         bool check_tensors,
+        bool no_alloc,
         const llama_model_kv_override * param_overrides_p,
         const llama_model_tensor_buft_override * param_tensor_buft_overrides_p);
