llama-cpp-pydist 0.18.0__py3-none-any.whl → 0.19.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (233)
  1. llama_cpp/binaries/{llama-b7376-bin-win-cpu-x64.zip → llama-b7488-bin-win-cpu-x64.zip} +0 -0
  2. {llama_cpp_pydist-0.18.0.dist-info → llama_cpp_pydist-0.19.0.dist-info}/METADATA +1216 -1158
  3. {llama_cpp_pydist-0.18.0.dist-info → llama_cpp_pydist-0.19.0.dist-info}/RECORD +231 -206
  4. scripts/generate_changelog.py +10 -0
  5. vendor_llama_cpp_pydist/llama.cpp/.devops/cann.Dockerfile +1 -1
  6. vendor_llama_cpp_pydist/llama.cpp/.devops/llama-cli-cann.Dockerfile +3 -2
  7. vendor_llama_cpp_pydist/llama.cpp/.devops/llama-cpp-cuda.srpm.spec +2 -0
  8. vendor_llama_cpp_pydist/llama.cpp/.devops/llama-cpp.srpm.spec +2 -0
  9. vendor_llama_cpp_pydist/llama.cpp/.github/ISSUE_TEMPLATE/011-bug-results.yml +6 -3
  10. vendor_llama_cpp_pydist/llama.cpp/.github/ISSUE_TEMPLATE/019-bug-misc.yml +1 -0
  11. vendor_llama_cpp_pydist/llama.cpp/.github/workflows/build.yml +33 -2
  12. vendor_llama_cpp_pydist/llama.cpp/.github/workflows/release.yml +8 -46
  13. vendor_llama_cpp_pydist/llama.cpp/.github/workflows/server-webui.yml +225 -0
  14. vendor_llama_cpp_pydist/llama.cpp/.github/workflows/server.yml +0 -264
  15. vendor_llama_cpp_pydist/llama.cpp/.gitignore +1 -0
  16. vendor_llama_cpp_pydist/llama.cpp/CODEOWNERS +3 -2
  17. vendor_llama_cpp_pydist/llama.cpp/README.md +4 -2
  18. vendor_llama_cpp_pydist/llama.cpp/SECURITY.md +3 -0
  19. vendor_llama_cpp_pydist/llama.cpp/ci/run.sh +6 -0
  20. vendor_llama_cpp_pydist/llama.cpp/common/arg.cpp +226 -58
  21. vendor_llama_cpp_pydist/llama.cpp/common/arg.h +11 -2
  22. vendor_llama_cpp_pydist/llama.cpp/common/chat-peg-parser.cpp +12 -2
  23. vendor_llama_cpp_pydist/llama.cpp/common/chat.cpp +140 -0
  24. vendor_llama_cpp_pydist/llama.cpp/common/common.cpp +130 -67
  25. vendor_llama_cpp_pydist/llama.cpp/common/common.h +36 -12
  26. vendor_llama_cpp_pydist/llama.cpp/common/json-schema-to-grammar.cpp +132 -3
  27. vendor_llama_cpp_pydist/llama.cpp/common/json-schema-to-grammar.h +20 -0
  28. vendor_llama_cpp_pydist/llama.cpp/common/peg-parser.cpp +1 -1
  29. vendor_llama_cpp_pydist/llama.cpp/common/preset.cpp +218 -6
  30. vendor_llama_cpp_pydist/llama.cpp/common/preset.h +45 -3
  31. vendor_llama_cpp_pydist/llama.cpp/common/sampling.cpp +67 -54
  32. vendor_llama_cpp_pydist/llama.cpp/common/sampling.h +8 -0
  33. vendor_llama_cpp_pydist/llama.cpp/convert_hf_to_gguf.py +456 -321
  34. vendor_llama_cpp_pydist/llama.cpp/convert_hf_to_gguf_update.py +1 -0
  35. vendor_llama_cpp_pydist/llama.cpp/docs/android/imported-into-android-studio.jpg +0 -0
  36. vendor_llama_cpp_pydist/llama.cpp/docs/android.md +22 -2
  37. vendor_llama_cpp_pydist/llama.cpp/docs/backend/SYCL.md +2 -0
  38. vendor_llama_cpp_pydist/llama.cpp/docs/backend/hexagon/CMakeUserPresets.json +2 -0
  39. vendor_llama_cpp_pydist/llama.cpp/docs/development/HOWTO-add-model.md +3 -2
  40. vendor_llama_cpp_pydist/llama.cpp/docs/docker.md +15 -11
  41. vendor_llama_cpp_pydist/llama.cpp/docs/ops/SYCL.csv +797 -361
  42. vendor_llama_cpp_pydist/llama.cpp/docs/ops.md +9 -9
  43. vendor_llama_cpp_pydist/llama.cpp/ggml/CMakeLists.txt +5 -0
  44. vendor_llama_cpp_pydist/llama.cpp/ggml/include/ggml-alloc.h +9 -0
  45. vendor_llama_cpp_pydist/llama.cpp/ggml/include/ggml-backend.h +1 -0
  46. vendor_llama_cpp_pydist/llama.cpp/ggml/include/ggml.h +2 -1
  47. vendor_llama_cpp_pydist/llama.cpp/ggml/src/CMakeLists.txt +3 -0
  48. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-alloc.c +56 -12
  49. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-backend.cpp +21 -2
  50. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +4 -0
  51. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +285 -0
  52. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +28 -0
  53. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +51 -6
  54. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cpu/repack.cpp +286 -0
  55. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cpu/repack.h +8 -0
  56. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cpu/vec.cpp +41 -1
  57. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cpu/vec.h +125 -22
  58. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/argmax.cu +2 -2
  59. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/mma.cuh +168 -111
  60. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/mmf.cuh +14 -10
  61. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/mmq.cuh +36 -29
  62. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/ssm-conv.cu +14 -20
  63. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/CMakeLists.txt +3 -1
  64. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/ggml-hexagon.cpp +19 -7
  65. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/htp/CMakeLists.txt +2 -1
  66. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/htp/act-ops.c +90 -2
  67. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/htp/htp-msg.h +6 -5
  68. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/htp/hvx-utils.c +154 -47
  69. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/htp/hvx-utils.h +57 -0
  70. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/htp/main.c +2 -0
  71. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/htp/matmul-ops.c +176 -28
  72. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-metal/ggml-metal-device.m +7 -0
  73. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-sycl/add-id.cpp +77 -0
  74. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-sycl/add-id.hpp +8 -0
  75. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-sycl/common.hpp +17 -0
  76. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-sycl/convert.cpp +15 -0
  77. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +18 -0
  78. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +56 -3
  79. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +97 -0
  80. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +4 -0
  81. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +17 -6
  82. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +22 -0
  83. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-sycl/pad.cpp +5 -5
  84. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-sycl/ssm_conv.cpp +1 -1
  85. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +58 -0
  86. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +211 -52
  87. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/diag.comp +29 -0
  88. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +3 -0
  89. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +3 -0
  90. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/get_rows.comp +2 -2
  91. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq1_s.comp +40 -24
  92. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_funcs.glsl +8 -5
  93. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large1.comp +62 -0
  94. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large2.comp +79 -0
  95. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large3.comp +65 -0
  96. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large_common.glsl +53 -0
  97. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/topk_moe.comp +9 -3
  98. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +14 -3
  99. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml.c +5 -0
  100. vendor_llama_cpp_pydist/llama.cpp/gguf-py/gguf/constants.py +89 -0
  101. vendor_llama_cpp_pydist/llama.cpp/gguf-py/gguf/tensor_mapping.py +96 -3
  102. vendor_llama_cpp_pydist/llama.cpp/gguf-py/gguf/utility.py +1 -1
  103. vendor_llama_cpp_pydist/llama.cpp/grammars/README.md +3 -3
  104. vendor_llama_cpp_pydist/llama.cpp/include/llama.h +18 -1
  105. vendor_llama_cpp_pydist/llama.cpp/pyrightconfig.json +1 -1
  106. vendor_llama_cpp_pydist/llama.cpp/scripts/compare-logprobs.py +281 -0
  107. vendor_llama_cpp_pydist/llama.cpp/scripts/snapdragon/adb/run-mtmd.sh +65 -0
  108. vendor_llama_cpp_pydist/llama.cpp/scripts/sync-ggml.last +1 -1
  109. vendor_llama_cpp_pydist/llama.cpp/src/llama-arch.cpp +1890 -2248
  110. vendor_llama_cpp_pydist/llama.cpp/src/llama-arch.h +9 -2
  111. vendor_llama_cpp_pydist/llama.cpp/src/llama-context.cpp +93 -15
  112. vendor_llama_cpp_pydist/llama.cpp/src/llama-context.h +8 -2
  113. vendor_llama_cpp_pydist/llama.cpp/src/llama-graph.cpp +75 -7
  114. vendor_llama_cpp_pydist/llama.cpp/src/llama-graph.h +17 -4
  115. vendor_llama_cpp_pydist/llama.cpp/src/llama-hparams.cpp +3 -9
  116. vendor_llama_cpp_pydist/llama.cpp/src/llama-hparams.h +3 -6
  117. vendor_llama_cpp_pydist/llama.cpp/src/llama-impl.cpp +4 -0
  118. vendor_llama_cpp_pydist/llama.cpp/src/llama-kv-cache.cpp +85 -31
  119. vendor_llama_cpp_pydist/llama.cpp/src/llama-kv-cache.h +19 -2
  120. vendor_llama_cpp_pydist/llama.cpp/src/llama-memory-hybrid.cpp +1 -1
  121. vendor_llama_cpp_pydist/llama.cpp/src/llama-mmap.cpp +123 -28
  122. vendor_llama_cpp_pydist/llama.cpp/src/llama-mmap.h +5 -1
  123. vendor_llama_cpp_pydist/llama.cpp/src/llama-model-loader.cpp +58 -13
  124. vendor_llama_cpp_pydist/llama.cpp/src/llama-model-loader.h +2 -0
  125. vendor_llama_cpp_pydist/llama.cpp/src/llama-model.cpp +98 -57
  126. vendor_llama_cpp_pydist/llama.cpp/src/llama-model.h +1 -0
  127. vendor_llama_cpp_pydist/llama.cpp/src/llama-quant.cpp +1 -1
  128. vendor_llama_cpp_pydist/llama.cpp/src/llama-sampling.cpp +16 -0
  129. vendor_llama_cpp_pydist/llama.cpp/src/llama-vocab.cpp +2 -1
  130. vendor_llama_cpp_pydist/llama.cpp/src/llama.cpp +665 -1
  131. vendor_llama_cpp_pydist/llama.cpp/src/models/glm4-moe.cpp +28 -11
  132. vendor_llama_cpp_pydist/llama.cpp/src/models/glm4.cpp +27 -4
  133. vendor_llama_cpp_pydist/llama.cpp/src/models/models.h +5 -5
  134. vendor_llama_cpp_pydist/llama.cpp/src/models/nemotron-h.cpp +35 -6
  135. vendor_llama_cpp_pydist/llama.cpp/src/models/qwen2.cpp +12 -3
  136. vendor_llama_cpp_pydist/llama.cpp/src/models/qwen3next.cpp +81 -266
  137. vendor_llama_cpp_pydist/llama.cpp/tests/CMakeLists.txt +8 -0
  138. vendor_llama_cpp_pydist/llama.cpp/tests/test-arg-parser.cpp +29 -0
  139. vendor_llama_cpp_pydist/llama.cpp/tests/test-backend-ops.cpp +11 -4
  140. vendor_llama_cpp_pydist/llama.cpp/tests/test-chat.cpp +157 -0
  141. vendor_llama_cpp_pydist/llama.cpp/tests/test-json-schema-to-grammar.cpp +75 -0
  142. vendor_llama_cpp_pydist/llama.cpp/tests/test-state-restore-fragmented.cpp +122 -0
  143. vendor_llama_cpp_pydist/llama.cpp/tools/CMakeLists.txt +1 -0
  144. vendor_llama_cpp_pydist/llama.cpp/tools/cli/README.md +1 -0
  145. vendor_llama_cpp_pydist/llama.cpp/tools/completion/README.md +16 -16
  146. vendor_llama_cpp_pydist/llama.cpp/tools/completion/completion.cpp +7 -16
  147. vendor_llama_cpp_pydist/llama.cpp/tools/cvector-generator/cvector-generator.cpp +3 -3
  148. vendor_llama_cpp_pydist/llama.cpp/tools/fit-params/CMakeLists.txt +8 -0
  149. vendor_llama_cpp_pydist/llama.cpp/tools/fit-params/README.md +55 -0
  150. vendor_llama_cpp_pydist/llama.cpp/tools/fit-params/fit-params.cpp +66 -0
  151. vendor_llama_cpp_pydist/llama.cpp/tools/imatrix/imatrix.cpp +3 -3
  152. vendor_llama_cpp_pydist/llama.cpp/tools/llama-bench/README.md +1 -1
  153. vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/CMakeLists.txt +2 -0
  154. vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/clip-graph.h +7 -1
  155. vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/clip-impl.h +29 -1
  156. vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/clip-model.h +53 -1
  157. vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/clip.cpp +265 -37
  158. vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/clip.h +1 -1
  159. vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/models/conformer.cpp +217 -0
  160. vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/models/glm4v.cpp +120 -0
  161. vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/models/models.h +10 -0
  162. vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/models/whisper-enc.cpp +9 -10
  163. vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/mtmd-audio.cpp +371 -550
  164. vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/mtmd-audio.h +19 -28
  165. vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/mtmd-cli.cpp +22 -5
  166. vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/mtmd.cpp +29 -13
  167. vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/tests.sh +1 -0
  168. vendor_llama_cpp_pydist/llama.cpp/tools/perplexity/perplexity.cpp +3 -3
  169. vendor_llama_cpp_pydist/llama.cpp/tools/server/README.md +45 -27
  170. vendor_llama_cpp_pydist/llama.cpp/tools/server/public/index.html.gz +0 -0
  171. vendor_llama_cpp_pydist/llama.cpp/tools/server/server-context.cpp +51 -32
  172. vendor_llama_cpp_pydist/llama.cpp/tools/server/server-models.cpp +137 -266
  173. vendor_llama_cpp_pydist/llama.cpp/tools/server/server-models.h +24 -26
  174. vendor_llama_cpp_pydist/llama.cpp/tools/server/server.cpp +19 -9
  175. vendor_llama_cpp_pydist/llama.cpp/tools/server/tests/unit/test_compat_anthropic.py +1 -1
  176. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/README.md +6 -5
  177. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/docs/architecture/high-level-architecture-simplified.md +6 -2
  178. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/docs/architecture/high-level-architecture.md +13 -3
  179. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/package-lock.json +10 -10
  180. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/package.json +3 -4
  181. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/app.d.ts +7 -0
  182. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentPreview.svelte +1 -1
  183. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentThumbnailFile.svelte +1 -1
  184. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatForm/ChatForm.svelte +28 -3
  185. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActionFileAttachments.svelte +1 -1
  186. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActions.svelte +1 -0
  187. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessage.svelte +53 -8
  188. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessageAssistant.svelte +4 -2
  189. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessageEditForm.svelte +391 -0
  190. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessageStatistics.svelte +108 -6
  191. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessageUser.svelte +26 -48
  192. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessages.svelte +9 -4
  193. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatScreen/ChatScreen.svelte +21 -4
  194. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatScreen/ChatScreenHeader.svelte +6 -1
  195. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatSettings/ChatSettings.svelte +16 -1
  196. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatSettings/ChatSettingsImportExportTab.svelte +68 -2
  197. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatSidebar/ChatSidebar.svelte +5 -1
  198. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/components/app/misc/BadgeChatStatistic.svelte +26 -7
  199. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/components/app/misc/MarkdownContent.svelte +263 -167
  200. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/components/app/misc/SyntaxHighlightedCode.svelte +3 -2
  201. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/components/app/models/ModelsSelector.svelte +199 -185
  202. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/components/app/server/ServerErrorSplash.svelte +2 -1
  203. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/components/ui/switch/index.ts +7 -0
  204. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/components/ui/switch/switch.svelte +29 -0
  205. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/constants/settings-config.ts +9 -0
  206. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/enums/chat.ts +4 -0
  207. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/enums/index.ts +2 -0
  208. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/markdown/enhance-code-blocks.ts +162 -0
  209. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/markdown/enhance-links.ts +33 -0
  210. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/services/parameter-sync.spec.ts +14 -0
  211. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/services/parameter-sync.ts +75 -13
  212. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/stores/chat.svelte.ts +226 -169
  213. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/stores/conversations.svelte.ts +24 -2
  214. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/stores/server.svelte.ts +4 -0
  215. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/stores/settings.svelte.ts +2 -1
  216. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/types/api.d.ts +2 -0
  217. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/utils/api-key-validation.ts +2 -1
  218. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/utils/clipboard.ts +262 -0
  219. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/utils/file-preview.ts +0 -9
  220. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/utils/index.ts +11 -2
  221. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/utils/text.ts +7 -0
  222. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/routes/+layout.svelte +25 -9
  223. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/tests/unit/clipboard.test.ts +423 -0
  224. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/{src/lib/utils → tests/unit}/latex-protection.test.ts +1 -1
  225. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/{src/lib/utils → tests/unit}/model-names.test.ts +1 -1
  226. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/vite.config.ts +2 -2
  227. vendor_llama_cpp_pydist/llama.cpp/tools/tts/tts.cpp +6 -6
  228. vendor_llama_cpp_pydist/llama.cpp/vendor/cpp-httplib/CMakeLists.txt +2 -1
  229. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/utils/copy.ts +0 -71
  230. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/tests/server/demo.spec.ts +0 -7
  231. {llama_cpp_pydist-0.18.0.dist-info → llama_cpp_pydist-0.19.0.dist-info}/WHEEL +0 -0
  232. {llama_cpp_pydist-0.18.0.dist-info → llama_cpp_pydist-0.19.0.dist-info}/licenses/LICENSE +0 -0
  233. {llama_cpp_pydist-0.18.0.dist-info → llama_cpp_pydist-0.19.0.dist-info}/top_level.txt +0 -0
vendor_llama_cpp_pydist/llama.cpp/src/llama-kv-cache.cpp
@@ -175,7 +175,15 @@ llama_kv_cache::llama_kv_cache(
 
  // allocate tensors and initialize the buffers to avoid NaNs in the padding
  for (auto & [buft, ctx] : ctx_map) {
- ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx.get(), buft);
+ ggml_backend_buffer_t buf;
+ if (model.hparams.no_alloc) {
+ buf = ggml_backend_buft_alloc_buffer(buft, /*size =*/ 0); // dummy buffer
+ for (ggml_tensor * t = ggml_get_first_tensor(ctx.get()); t != nullptr; t = ggml_get_next_tensor(ctx.get(), t)) {
+ t->buffer = buf; // set dummy buffer for KV cache so that the backend scheduler won't try to allocate it
+ }
+ } else {
+ buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx.get(), buft); // real buffer
+ }
  if (!buf) {
  throw std::runtime_error("failed to allocate buffer for kv cache");
  }
@@ -482,9 +490,18 @@ llama_pos llama_kv_cache::seq_pos_max(llama_seq_id seq_id) const {
 
  std::map<ggml_backend_buffer_type_t, size_t> llama_kv_cache::memory_breakdown() const {
  std::map<ggml_backend_buffer_type_t, size_t> ret;
- for (const auto & [_, buf] : ctxs_bufs) {
- ret[ggml_backend_buffer_get_type(buf.get())] += ggml_backend_buffer_get_size(buf.get());
+ for (const auto & [ctx, buf] : ctxs_bufs) {
+ ggml_backend_buffer_type_t buft = ggml_backend_buffer_get_type(buf.get());
+
+ if (hparams.no_alloc) {
+ GGML_ASSERT(ggml_backend_buffer_get_base(buf.get()) == nullptr);
+ ret[buft] += ggml_backend_alloc_ctx_tensors_from_buft_size(ctx.get(), buft);
+ } else {
+ // GGML_ASSERT(ggml_backend_buffer_get_base(buf.get()) != nullptr); // multi_buffer does not have a defined base
+ ret[buft] += ggml_backend_buffer_get_size(buf.get());
+ }
  }
+
  return ret;
  }
 
@@ -1372,7 +1389,7 @@ ggml_tensor * llama_kv_cache::build_rope_shift(
  const auto & yarn_ext_factor = cparams.yarn_ext_factor;
  const auto & yarn_beta_fast = cparams.yarn_beta_fast;
  const auto & yarn_beta_slow = cparams.yarn_beta_slow;
- const auto & yarn_attn_factor = llama_hparams::yarn_attn_factor_adjust(cparams.yarn_attn_factor, cparams.rope_freq_scale, cparams.yarn_ext_factor);
+ const auto & yarn_attn_factor = cparams.yarn_attn_factor;
 
  const auto & n_rot = hparams.n_rot;
  const auto & rope_type = hparams.rope_type == LLAMA_ROPE_TYPE_MROPE || hparams.rope_type == LLAMA_ROPE_TYPE_IMROPE
@@ -1544,9 +1561,11 @@ void llama_kv_cache::state_read(llama_io_read_i & io, llama_seq_id seq_id, llama
 
  const uint32_t strm = seq_id == -1 ? s : seq_to_stream[seq_id];
 
+ slot_info sinfo;
+
  bool res = true;
- res = res && state_read_meta(io, strm, cell_count, seq_id);
- res = res && state_read_data(io, strm, cell_count);
+ res = res && state_read_meta(io, strm, cell_count, sinfo, seq_id);
+ res = res && state_read_data(io, strm, cell_count, sinfo);
 
  if (!res) {
  if (seq_id == -1) {
@@ -1685,7 +1704,7 @@ void llama_kv_cache::state_write_data(llama_io_write_i & io, const cell_ranges_t
  }
  }
 
- bool llama_kv_cache::state_read_meta(llama_io_read_i & io, uint32_t strm, uint32_t cell_count, llama_seq_id dest_seq_id) {
+ bool llama_kv_cache::state_read_meta(llama_io_read_i & io, uint32_t strm, uint32_t cell_count, slot_info & sinfo, llama_seq_id dest_seq_id) {
  auto & cells = v_cells[strm];
  auto & head = v_heads[strm];
 
@@ -1722,7 +1741,7 @@ bool llama_kv_cache::state_read_meta(llama_io_read_i & io, uint32_t strm, uint32
  ubatch.seq_id[i] = &dest_seq_id;
  }
 
- const auto sinfo = find_slot(ubatch, true);
+ sinfo = find_slot(ubatch, false);
  if (sinfo.empty()) {
  LLAMA_LOG_ERROR("%s: failed to find available cells in kv cache\n", __func__);
  return false;
@@ -1732,20 +1751,16 @@ bool llama_kv_cache::state_read_meta(llama_io_read_i & io, uint32_t strm, uint32
  // see: https://github.com/ggml-org/llama.cpp/pull/16825#issuecomment-3460868350
  apply_ubatch(sinfo, ubatch);
 
- const auto head_cur = sinfo.head();
+ LLAMA_LOG_DEBUG("%s: cell_count = %d, dest_seq_id = %d\n", __func__, cell_count, dest_seq_id);
 
- // keep the head at the old position because we will read the KV data into it in state_read_data()
- head = head_cur;
-
- LLAMA_LOG_DEBUG("%s: head_cur = %d, head = %d, cell_count = %d, dest_seq_id = %d\n", __func__, head_cur, head, cell_count, dest_seq_id);
-
- // DEBUG CHECK: head_cur should be our first cell, head_cur + cell_count - 1 should be our last cell (verify seq_id and pos values)
- // Assume that this is one contiguous block of cells
- GGML_ASSERT(head_cur + cell_count <= cells.size());
- GGML_ASSERT(cells.pos_get(head_cur) == ubatch.pos[0]);
- GGML_ASSERT(cells.pos_get(head_cur + cell_count - 1) == ubatch.pos[cell_count - 1]);
- GGML_ASSERT(cells.seq_has(head_cur, dest_seq_id));
- GGML_ASSERT(cells.seq_has(head_cur + cell_count - 1, dest_seq_id));
+ // DEBUG CHECK: verify that all cells were allocated and have correct seq_id and pos values
+ GGML_ASSERT(sinfo.n_stream() == 1);
+ GGML_ASSERT(sinfo.idxs[0].size() == cell_count);
+ for (uint32_t i = 0; i < cell_count; ++i) {
+ const uint32_t idx = sinfo.idxs[0][i];
+ GGML_ASSERT(cells.pos_get(idx) == ubatch.pos[i]);
+ GGML_ASSERT(cells.seq_has(idx, dest_seq_id));
+ }
  } else {
  // whole KV cache restore
 
@@ -1778,15 +1793,24 @@ bool llama_kv_cache::state_read_meta(llama_io_read_i & io, uint32_t strm, uint32
  }
  }
 
+ // Create contiguous slot_info for whole cache restore
+ sinfo.s0 = strm;
+ sinfo.s1 = strm;
+ sinfo.resize(1);
+ sinfo.strm[0] = strm;
+ sinfo.idxs[0].resize(cell_count);
+ for (uint32_t i = 0; i < cell_count; ++i) {
+ sinfo.idxs[0][i] = i;
+ }
+
  head = 0;
  }
 
  return true;
  }
 
- bool llama_kv_cache::state_read_data(llama_io_read_i & io, uint32_t strm, uint32_t cell_count) {
+ bool llama_kv_cache::state_read_data(llama_io_read_i & io, uint32_t strm, uint32_t cell_count, const slot_info & sinfo) {
  auto & cells = v_cells[strm];
- auto & head = v_heads[strm];
 
  uint32_t v_trans;
  uint32_t n_layer;
@@ -1836,8 +1860,17 @@ bool llama_kv_cache::state_read_data(llama_io_read_i & io, uint32_t strm, uint32
  }
 
  if (cell_count) {
- // Read and set the keys for the whole cell range
- ggml_backend_tensor_set(k, io.read(cell_count * k_size_row), head * k_size_row, cell_count * k_size_row);
+ if (sinfo.is_contiguous()) {
+ // Fast path: contiguous cells, single memcpy
+ ggml_backend_tensor_set(k, io.read(cell_count * k_size_row), sinfo.head() * k_size_row, cell_count * k_size_row);
+ } else {
+ // Slow path: scatter to non-contiguous positions
+ const void * src = io.read(cell_count * k_size_row);
+ for (uint32_t i = 0; i < cell_count; ++i) {
+ const size_t dst_offset = sinfo.idxs[0][i] * k_size_row;
+ ggml_backend_tensor_set(k, (const char*)src + i * k_size_row, dst_offset, k_size_row);
+ }
+ }
  }
  }
 
@@ -1868,8 +1901,17 @@ bool llama_kv_cache::state_read_data(llama_io_read_i & io, uint32_t strm, uint32
  }
 
  if (cell_count) {
- // Read and set the values for the whole cell range
- ggml_backend_tensor_set(v, io.read(cell_count * v_size_row), head * v_size_row, cell_count * v_size_row);
+ if (sinfo.is_contiguous()) {
+ // Fast path: contiguous cells, single memcpy
+ ggml_backend_tensor_set(v, io.read(cell_count * v_size_row), sinfo.head() * v_size_row, cell_count * v_size_row);
+ } else {
+ // Slow path: scatter to non-contiguous positions
+ const void * src = io.read(cell_count * v_size_row);
+ for (uint32_t i = 0; i < cell_count; ++i) {
+ const size_t dst_offset = sinfo.idxs[0][i] * v_size_row;
+ ggml_backend_tensor_set(v, (const char*)src + i * v_size_row, dst_offset, v_size_row);
+ }
+ }
  }
  }
  } else {
@@ -1908,10 +1950,22 @@ bool llama_kv_cache::state_read_data(llama_io_read_i & io, uint32_t strm, uint32
  }
 
  if (cell_count) {
- // For each row in the transposed matrix, read the values for the whole cell range
- for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
- const size_t dst_offset = (head + j * cells.size()) * v_size_el;
- ggml_backend_tensor_set(v, io.read(cell_count * v_size_el), dst_offset, cell_count * v_size_el);
+ if (sinfo.is_contiguous()) {
+ // Fast path: contiguous cells
+ const uint32_t h = sinfo.head();
+ for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
+ const size_t dst_offset = (h + j * cells.size()) * v_size_el;
+ ggml_backend_tensor_set(v, io.read(cell_count * v_size_el), dst_offset, cell_count * v_size_el);
+ }
+ } else {
+ // Slow path: scatter to non-contiguous positions
+ for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
+ const void * src = io.read(cell_count * v_size_el);
+ for (uint32_t i = 0; i < cell_count; ++i) {
+ const size_t dst_offset = (sinfo.idxs[0][i] + j * cells.size()) * v_size_el;
+ ggml_backend_tensor_set(v, (const char*)src + i * v_size_el, dst_offset, v_size_el);
+ }
+ }
  }
  }
  }

vendor_llama_cpp_pydist/llama.cpp/src/llama-kv-cache.h
@@ -72,6 +72,23 @@ public:
  void clear() {
  idxs.clear();
  }
+
+ // check if indices are contiguous starting from head()
+ bool is_contiguous() const {
+ if (idxs.empty() || idxs[0].empty()) {
+ return true;
+ }
+ if (idxs.size() > 1) {
+ return false;
+ }
+ const uint32_t h = idxs[0][0];
+ for (size_t i = 0; i < idxs[0].size(); ++i) {
+ if (idxs[0][i] != h + i) {
+ return false;
+ }
+ }
+ return true;
+ }
  };
 
  using slot_info_vec_t = std::vector<slot_info>;
@@ -264,8 +281,8 @@ private:
  void state_write_meta(llama_io_write_i & io, const cell_ranges_t & cr, llama_seq_id seq_id = -1) const;
  void state_write_data(llama_io_write_i & io, const cell_ranges_t & cr) const;
 
- bool state_read_meta(llama_io_read_i & io, uint32_t strm, uint32_t cell_count, llama_seq_id dest_seq_id = -1);
- bool state_read_data(llama_io_read_i & io, uint32_t strm, uint32_t cell_count);
+ bool state_read_meta(llama_io_read_i & io, uint32_t strm, uint32_t cell_count, slot_info & sinfo, llama_seq_id dest_seq_id = -1);
+ bool state_read_data(llama_io_read_i & io, uint32_t strm, uint32_t cell_count, const slot_info & sinfo);
  };
 
  class llama_kv_cache_context : public llama_memory_context_i {
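
Note on the KV-cache hunks above: the old restore path assumed the destination cells formed one contiguous block starting at the cache head, while the new code carries the destination cell indices in a slot_info and dispatches between a single bulk copy (contiguous indices) and a per-row scatter. A minimal standalone sketch of that dispatch, using plain std::vector buffers in place of ggml tensors (the is_contiguous_idx and restore_rows helpers here are illustrative, not the llama.cpp API):

    #include <cstdint>
    #include <cstring>
    #include <vector>

    // Illustrative only: dst stands in for a backend tensor laid out as fixed-size rows.
    static bool is_contiguous_idx(const std::vector<uint32_t> & idxs) {
        for (size_t i = 1; i < idxs.size(); ++i) {
            if (idxs[i] != idxs[0] + i) {
                return false;
            }
        }
        return true;
    }

    static void restore_rows(std::vector<uint8_t> & dst, const uint8_t * src,
                             const std::vector<uint32_t> & idxs, size_t row_size) {
        if (idxs.empty()) {
            return;
        }
        if (is_contiguous_idx(idxs)) {
            // fast path: one bulk copy starting at the first destination row
            std::memcpy(dst.data() + idxs[0] * row_size, src, idxs.size() * row_size);
        } else {
            // slow path: scatter each source row into its destination cell
            for (size_t i = 0; i < idxs.size(); ++i) {
                std::memcpy(dst.data() + idxs[i] * row_size, src + i * row_size, row_size);
            }
        }
    }
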
vendor_llama_cpp_pydist/llama.cpp/src/llama-memory-hybrid.cpp
@@ -222,7 +222,7 @@ llama_memory_hybrid_context::llama_memory_hybrid_context(
  ubatches(std::move(ubatches)),
  // note: here we copy the ubatches. not sure if this is ideal
  ctx_attn(new llama_kv_cache_context(mem->get_mem_attn(), std::move(sinfos_attn), this->ubatches)),
- ctx_recr(new llama_memory_recurrent_context(mem->get_mem_recr(), this->ubatches)),
+ ctx_recr(new llama_memory_recurrent_context(mem->get_mem_recr(), this->ubatches)),
  status(llama_memory_status_combine(ctx_attn->get_status(), ctx_recr->get_status())) {
  }
 
vendor_llama_cpp_pydist/llama.cpp/src/llama-mmap.cpp
@@ -13,9 +13,10 @@
  #ifdef __has_include
  #if __has_include(<unistd.h>)
  #include <unistd.h>
+ #include <fcntl.h>
+ #include <sys/stat.h>
  #if defined(_POSIX_MAPPED_FILES)
  #include <sys/mman.h>
- #include <fcntl.h>
  #endif
  #if defined(_POSIX_MEMLOCK_RANGE)
  #include <sys/resource.h>
@@ -74,7 +75,7 @@ struct llama_file::impl {
  return ret;
  }
 
- impl(const char * fname, const char * mode) {
+ impl(const char * fname, const char * mode, [[maybe_unused]] const bool use_direct_io = false) {
  fp = ggml_fopen(fname, mode);
  if (fp == NULL) {
  throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno)));
@@ -153,13 +154,40 @@
  write_raw(&val, sizeof(val));
  }
 
+ void read_aligned_chunk(size_t offset, void * dest, size_t size) const {
+ throw std::runtime_error("DirectIO is not implemented on Windows.");
+ }
+
  ~impl() {
  if (fp) {
  std::fclose(fp);
  }
  }
  #else
- impl(const char * fname, const char * mode) {
+ impl(const char * fname, const char * mode, [[maybe_unused]] const bool use_direct_io = false) {
+ #ifdef __linux__
+ // Try unbuffered I/O for read only
+ if (use_direct_io && std::strcmp(mode, "rb") == 0) {
+ fd = open(fname, O_RDONLY | O_DIRECT);
+
+ if (fd != -1) {
+ struct stat file_stats{};
+ fstat(fd, &file_stats);
+
+ size = file_stats.st_size;
+ alignment = file_stats.st_blksize;
+
+ off_t ret = lseek(fd, 0, SEEK_SET);
+ if (ret == -1) {
+ throw std::runtime_error(format("seek error: %s", strerror(errno)));
+ }
+ return;
+ }
+
+ LLAMA_LOG_WARN("Failed to open model %s with error: %s. Falling back to buffered I/O",
+ fname, strerror(errno));
+ }
+ #endif
  fp = ggml_fopen(fname, mode);
  if (fp == NULL) {
  throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno)));
@@ -170,27 +198,30 @@ struct llama_file::impl {
  }
 
  size_t tell() const {
- // TODO: this ifdef is never true?
- #ifdef _WIN32
- __int64 ret = _ftelli64(fp);
- #else
- long ret = std::ftell(fp);
- #endif
- if (ret == -1) {
- throw std::runtime_error(format("ftell error: %s", strerror(errno)));
+ if (fd == -1) {
+ long ret = std::ftell(fp);
+ if (ret == -1) {
+ throw std::runtime_error(format("ftell error: %s", strerror(errno)));
+ }
+
+ return (size_t) ret;
  }
 
- return (size_t) ret;
+ off_t pos = lseek(fd, 0, SEEK_CUR);
+ if (pos == -1) {
+ throw std::runtime_error(format("lseek error: %s", strerror(errno)));
+ }
+ return (size_t) pos;
  }
 
  void seek(size_t offset, int whence) const {
- // TODO: this ifdef is never true?
- #ifdef _WIN32
- int ret = _fseeki64(fp, (__int64) offset, whence);
- #else
- int ret = std::fseek(fp, (long) offset, whence);
- #endif
- if (ret != 0) {
+ off_t ret = 0;
+ if (fd == -1) {
+ ret = std::fseek(fp, (long) offset, whence);
+ } else {
+ ret = lseek(fd, offset, whence);
+ }
+ if (ret == -1) {
  throw std::runtime_error(format("seek error: %s", strerror(errno)));
  }
  }
@@ -200,13 +231,55 @@ struct llama_file::impl {
  return;
  }
  errno = 0;
- std::size_t ret = std::fread(ptr, len, 1, fp);
- if (ferror(fp)) {
- throw std::runtime_error(format("read error: %s", strerror(errno)));
+ if (fd == -1) {
+ std::size_t ret = std::fread(ptr, len, 1, fp);
+ if (ferror(fp)) {
+ throw std::runtime_error(format("read error: %s", strerror(errno)));
+ }
+ if (ret != 1) {
+ throw std::runtime_error("unexpectedly reached end of file");
+ }
+ } else {
+ bool successful = false;
+ while (!successful) {
+ off_t ret = read(fd, ptr, len);
+
+ if (ret == -1) {
+ if (errno == EINTR) {
+ continue; // Interrupted by signal, retry
+ }
+ throw std::runtime_error(format("read error: %s", strerror(errno)));
+ }
+ if (ret == 0) {
+ throw std::runtime_error("unexpectedly reached end of file");
+ }
+
+ successful = true;
+ }
  }
- if (ret != 1) {
- throw std::runtime_error("unexpectedly reached end of file");
+ }
+
+ void read_aligned_chunk(size_t offset, void * dest, size_t size) const {
+ off_t aligned_offset = offset & ~(alignment - 1);
+ off_t offset_from_alignment = offset - aligned_offset;
+ size_t bytes_to_read = (offset_from_alignment + size + alignment - 1) & ~(alignment - 1);
+
+ void * raw_buffer = nullptr;
+ int ret = posix_memalign(&raw_buffer, alignment, bytes_to_read);
+ if (ret != 0) {
+ throw std::runtime_error(format("posix_memalign failed with error %d", ret));
  }
+
+ struct aligned_buffer_deleter {
+ void operator()(void * p) const { free(p); }
+ };
+ std::unique_ptr<void, aligned_buffer_deleter> buffer(raw_buffer);
+
+ seek(aligned_offset, SEEK_SET);
+ read_raw(buffer.get(), bytes_to_read);
+
+ uintptr_t actual_data = reinterpret_cast<uintptr_t>(buffer.get()) + offset_from_alignment;
+ memcpy(dest, reinterpret_cast<void *>(actual_data), size);
  }
 
  uint32_t read_u32() const {
@@ -231,22 +304,43 @@ struct llama_file::impl {
  }
 
  ~impl() {
- if (fp) {
+ if (fd != -1) {
+ close(fd);
+ } else {
  std::fclose(fp);
  }
  }
+ int fd = -1;
  #endif
 
- FILE * fp;
- size_t size;
+ void read_raw_at(void * ptr, size_t len, size_t offset) const {
+ if (alignment != 1) {
+ read_aligned_chunk(offset, ptr, len);
+ } else {
+ seek(offset, SEEK_SET);
+ read_raw(ptr, len);
+ }
+ }
+
+ size_t read_alignment() const {
+ return alignment;
+ }
+
+ size_t alignment = 1;
+
+ FILE * fp{};
+ size_t size{};
  };
 
- llama_file::llama_file(const char * fname, const char * mode) : pimpl(std::make_unique<impl>(fname, mode)) {}
+ llama_file::llama_file(const char * fname, const char * mode, const bool use_direct_io) :
+ pimpl(std::make_unique<impl>(fname, mode, use_direct_io)) {}
  llama_file::~llama_file() = default;
 
  size_t llama_file::tell() const { return pimpl->tell(); }
  size_t llama_file::size() const { return pimpl->size; }
 
+ size_t llama_file::read_alignment() const { return pimpl->read_alignment(); }
+
  int llama_file::file_id() const {
  #ifdef _WIN32
  return _fileno(pimpl->fp);
@@ -261,6 +355,7 @@ int llama_file::file_id() const {
 
  void llama_file::seek(size_t offset, int whence) const { pimpl->seek(offset, whence); }
  void llama_file::read_raw(void * ptr, size_t len) const { pimpl->read_raw(ptr, len); }
+ void llama_file::read_raw_at(void * ptr, size_t len, size_t offset) const { pimpl->read_raw_at(ptr, len, offset); }
 
  uint32_t llama_file::read_u32() const { return pimpl->read_u32(); }
 
vendor_llama_cpp_pydist/llama.cpp/src/llama-mmap.h
@@ -3,6 +3,7 @@
  #include <cstdint>
  #include <memory>
  #include <vector>
+ #include <cstdio>
 
  struct llama_file;
  struct llama_mmap;
@@ -13,7 +14,7 @@ using llama_mmaps = std::vector<std::unique_ptr<llama_mmap>>;
  using llama_mlocks = std::vector<std::unique_ptr<llama_mlock>>;
 
  struct llama_file {
- llama_file(const char * fname, const char * mode);
+ llama_file(const char * fname, const char * mode, bool use_direct_io = false);
  ~llama_file();
 
  size_t tell() const;
@@ -24,11 +25,14 @@ struct llama_file {
  void seek(size_t offset, int whence) const;
 
  void read_raw(void * ptr, size_t len) const;
+ void read_raw_at(void * ptr, size_t len, size_t offset) const;
+ void read_aligned_chunk(size_t offset, void * dest, size_t size) const;
  uint32_t read_u32() const;
 
  void write_raw(const void * ptr, size_t len) const;
  void write_u32(uint32_t val) const;
 
+ size_t read_alignment() const;
  private:
  struct impl;
  std::unique_ptr<impl> pimpl;
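
Note on the direct-I/O hunks above: read_aligned_chunk uses the usual power-of-two rounding that O_DIRECT reads typically require, rounding the requested offset down to a block boundary, rounding the end of the range up to the next boundary, and reading the whole window into a posix_memalign'd bounce buffer before copying out the requested bytes. A small standalone check of that rounding arithmetic (assuming, as the code does via st_blksize, that the alignment is a power of two; the align_window helper is illustrative, not part of llama.cpp):

    #include <cassert>
    #include <cstddef>

    // Round the request [offset, offset + size) out to block boundaries.
    // alignment must be a power of two.
    struct aligned_window {
        size_t start; // first byte to read (block aligned)
        size_t bytes; // total bytes to read (multiple of alignment)
        size_t skip;  // leading bytes to discard before the requested data
    };

    static aligned_window align_window(size_t offset, size_t size, size_t alignment) {
        const size_t start = offset & ~(alignment - 1);                          // round down
        const size_t skip  = offset - start;
        const size_t bytes = (skip + size + alignment - 1) & ~(alignment - 1);   // round up
        return { start, bytes, skip };
    }

    int main() {
        // a 3000-byte read at offset 10000 with 4096-byte blocks covers two whole blocks
        const aligned_window w = align_window(10000, 3000, 4096);
        assert(w.start == 8192 && w.skip == 1808 && w.bytes == 8192);
        return 0;
    }
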
vendor_llama_cpp_pydist/llama.cpp/src/llama-model-loader.cpp
@@ -473,6 +473,7 @@ llama_model_loader::llama_model_loader(
  std::vector<std::string> & splits,
  bool use_mmap,
  bool check_tensors,
+ bool no_alloc,
  const llama_model_kv_override * param_overrides_p,
  const llama_model_tensor_buft_override * param_tensor_buft_overrides_p) {
  int trace = 0;
@@ -503,7 +504,7 @@ llama_model_loader::llama_model_loader(
  get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false);
  llm_kv = LLM_KV(llm_arch_from_string(arch_name));
 
- files.emplace_back(new llama_file(fname.c_str(), "rb"));
+ files.emplace_back(new llama_file(fname.c_str(), "rb", !use_mmap));
  contexts.emplace_back(ctx);
 
  // Save tensors data offset of the main file.
@@ -571,7 +572,7 @@ llama_model_loader::llama_model_loader(
  }
  }
 
- files.emplace_back(new llama_file(fname_split, "rb"));
+ files.emplace_back(new llama_file(fname_split, "rb", !use_mmap));
  contexts.emplace_back(ctx);
 
  // Save tensors data offset info of the shard.
@@ -716,6 +717,7 @@ llama_model_loader::llama_model_loader(
 
  this->use_mmap = use_mmap;
  this->check_tensors = check_tensors;
+ this->no_alloc = no_alloc;
  }
 
  std::string llama_model_loader::get_arch_name() const {
@@ -933,7 +935,15 @@ bool llama_model_loader::load_all_data(
  // 4 staging buffers for async uploads, each sized 1MB seems to be a good default for single NVMe drives.
  // NVMe raid configurations might require more / larger buffers.
  constexpr size_t n_buffers = 4;
- constexpr size_t buffer_size = 1 * 1024 * 1024; // 1MB
+
+ size_t alignment = 1;
+ for (const auto & file : files) {
+ alignment = std::max(file->read_alignment(), alignment);
+ }
+
+ // Buffer size: balance between memory usage and I/O efficiency
+ // 64MB works well for NVMe drives
+ const size_t buffer_size = alignment != 1 ? 64 * 1024 * 1024 + 2 * alignment : 1 * 1024 * 1024;
 
  std::vector<ggml_backend_buffer_t> host_buffers;
  std::vector<ggml_backend_event_t> events;
@@ -983,6 +993,7 @@ bool llama_model_loader::load_all_data(
  // If the backend is supported, create pinned memory buffers and events for synchronisation.
  for (size_t idx = 0; idx < n_buffers; ++idx) {
  auto * buf = ggml_backend_buft_alloc_buffer(host_buft, buffer_size);
+
  if (!buf) {
  LLAMA_LOG_DEBUG("%s: failed to allocate host buffer for async uploads for device %s\n", func,
  ggml_backend_dev_name(dev));
@@ -1064,9 +1075,9 @@ bool llama_model_loader::load_all_data(
  }
  } else {
  const auto & file = files.at(weight->idx);
+
  if (ggml_backend_buffer_is_host(cur->buffer)) {
- file->seek(weight->offs, SEEK_SET);
- file->read_raw(cur->data, n_size);
+ file->read_raw_at(cur->data, n_size, weight->offs);
  if (check_tensors) {
  validation_result.emplace_back(std::async(std::launch::async, [cur, n_size] {
  return std::make_pair(cur, ggml_validate_row_data(cur->type, cur->data, n_size));
@@ -1075,26 +1086,60 @@ bool llama_model_loader::load_all_data(
  } else {
  // If upload_backend is valid load the tensor in chunks to pinned memory and upload the buffers asynchronously to the GPU.
  if (upload_backend) {
- file->seek(weight->offs, SEEK_SET);
+ size_t offset = weight->offs;
+ alignment = file->read_alignment();
+ size_t aligned_offset = offset & ~(alignment - 1);
+ size_t offset_from_alignment = offset - aligned_offset;
+ file->seek(aligned_offset, SEEK_SET);
+
+ // Calculate aligned read boundaries
+ size_t read_start = aligned_offset;
+ size_t read_end = (offset + n_size + alignment - 1) & ~(alignment - 1);
 
  size_t bytes_read = 0;
+ size_t data_read = 0; // Actual tensor data copied (excluding padding)
+
+ while (bytes_read < read_end - read_start) {
+ size_t read_size = std::min<size_t>(buffer_size, read_end - read_start - bytes_read);
 
- while (bytes_read < n_size) {
- size_t read_iteration = std::min<size_t>(buffer_size, n_size - bytes_read);
+ // Align the destination pointer within the pinned buffer
+ uintptr_t ptr_dest_aligned = (reinterpret_cast<uintptr_t>(host_ptrs[buffer_idx]) + alignment - 1) & ~(alignment - 1);
 
+ // Wait for previous upload to complete before reusing buffer
  ggml_backend_event_synchronize(events[buffer_idx]);
- file->read_raw(host_ptrs[buffer_idx], read_iteration);
- ggml_backend_tensor_set_async(upload_backend, cur, host_ptrs[buffer_idx], bytes_read, read_iteration);
+
+ // Read aligned chunk from file
+ file->read_raw(reinterpret_cast<void *>(ptr_dest_aligned), read_size);
+
+ // Calculate actual data portion (excluding alignment padding)
+ uintptr_t ptr_data = ptr_dest_aligned;
+ size_t data_to_copy = read_size;
+
+ // Skip alignment padding at start of first chunk
+ if (bytes_read == 0) {
+ ptr_data += offset_from_alignment;
+ data_to_copy -= offset_from_alignment;
+ }
+
+ // Trim alignment padding at end of last chunk
+ if (aligned_offset + bytes_read + read_size > offset + n_size) {
+ data_to_copy -= (read_end - (offset + n_size));
+ }
+
+ // Async upload actual data to GPU
+ ggml_backend_tensor_set_async(upload_backend, cur,
+ reinterpret_cast<void *>(ptr_data), data_read, data_to_copy);
  ggml_backend_event_record(events[buffer_idx], upload_backend);
 
- bytes_read += read_iteration;
+ data_read += data_to_copy;
+ bytes_read += read_size;
+
  ++buffer_idx;
  buffer_idx %= n_buffers;
  }
  } else {
  read_buf.resize(n_size);
- file->seek(weight->offs, SEEK_SET);
- file->read_raw(read_buf.data(), n_size);
+ file->read_raw_at(read_buf.data(), n_size, weight->offs);
  ggml_backend_tensor_set(cur, read_buf.data(), 0, n_size);
  if (check_tensors && !ggml_validate_row_data(cur->type, read_buf.data(), n_size)) {
  throw std::runtime_error(format("tensor '%s' has invalid data", ggml_get_name(cur)));
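
Note on the chunked-upload hunk above: each chunk read from the aligned window can carry padding, since the first chunk starts offset_from_alignment bytes before the tensor data and the last chunk may run past its end, so both ends are trimmed before the async upload. A standalone sketch of just that bookkeeping, with the file reads and GPU uploads reduced to byte counting (count_uploaded and its parameters are illustrative, and the sketch assumes the chunk size is at least one alignment block, as the staging-buffer sizing above ensures):

    #include <algorithm>
    #include <cassert>
    #include <cstddef>

    // Walk an aligned read window in fixed-size chunks and count how many bytes of
    // real tensor data each chunk contributes once the alignment padding is trimmed.
    static size_t count_uploaded(size_t offset, size_t n_size, size_t alignment, size_t buffer_size) {
        const size_t read_start = offset & ~(alignment - 1);
        const size_t skip       = offset - read_start;
        const size_t read_end   = (offset + n_size + alignment - 1) & ~(alignment - 1);

        size_t bytes_read = 0; // position within the aligned window
        size_t data_read  = 0; // tensor bytes accounted for so far
        while (bytes_read < read_end - read_start) {
            const size_t read_size = std::min(buffer_size, read_end - read_start - bytes_read);
            size_t data_to_copy = read_size;
            if (bytes_read == 0) {
                data_to_copy -= skip;                         // drop leading padding in the first chunk
            }
            if (read_start + bytes_read + read_size > offset + n_size) {
                data_to_copy -= read_end - (offset + n_size); // drop trailing padding in the last chunk
            }
            data_read  += data_to_copy;
            bytes_read += read_size;
        }
        return data_read;
    }

    int main() {
        assert(count_uploaded(10000, 3000, 4096, 4096) == 3000);          // direct I/O, two 4K chunks
        assert(count_uploaded(10000, 3000, 1, 1 * 1024 * 1024) == 3000);  // buffered fallback, no padding
        return 0;
    }
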
vendor_llama_cpp_pydist/llama.cpp/src/llama-model-loader.h
@@ -71,6 +71,7 @@ struct llama_model_loader {
 
  bool use_mmap = false;
  bool check_tensors;
+ bool no_alloc;
 
  llama_files files;
  llama_ftype ftype;
@@ -97,6 +98,7 @@ struct llama_model_loader {
  std::vector<std::string> & splits, // optional, only need if the split does not follow naming scheme
  bool use_mmap,
  bool check_tensors,
+ bool no_alloc,
  const llama_model_kv_override * param_overrides_p,
  const llama_model_tensor_buft_override * param_tensor_buft_overrides_p);