@fugood/llama.node 0.3.3 → 0.3.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (225)
  1. package/CMakeLists.txt +5 -0
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  8. package/bin/win32/arm64/llama-node.node +0 -0
  9. package/bin/win32/arm64/node.lib +0 -0
  10. package/bin/win32/x64/llama-node.node +0 -0
  11. package/bin/win32/x64/node.lib +0 -0
  12. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  13. package/bin/win32-vulkan/arm64/node.lib +0 -0
  14. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/x64/node.lib +0 -0
  16. package/lib/binding.ts +18 -1
  17. package/package.json +1 -1
  18. package/src/EmbeddingWorker.cpp +15 -5
  19. package/src/EmbeddingWorker.h +2 -1
  20. package/src/LlamaCompletionWorker.cpp +1 -1
  21. package/src/LlamaContext.cpp +81 -18
  22. package/src/LlamaContext.h +2 -0
  23. package/src/llama.cpp/.github/workflows/build.yml +197 -159
  24. package/src/llama.cpp/.github/workflows/docker.yml +5 -8
  25. package/src/llama.cpp/.github/workflows/python-lint.yml +8 -1
  26. package/src/llama.cpp/.github/workflows/server.yml +21 -14
  27. package/src/llama.cpp/CMakeLists.txt +11 -6
  28. package/src/llama.cpp/Sources/llama/llama.h +4 -0
  29. package/src/llama.cpp/cmake/common.cmake +33 -0
  30. package/src/llama.cpp/cmake/x64-windows-llvm.cmake +11 -0
  31. package/src/llama.cpp/common/CMakeLists.txt +6 -2
  32. package/src/llama.cpp/common/arg.cpp +426 -245
  33. package/src/llama.cpp/common/common.cpp +143 -80
  34. package/src/llama.cpp/common/common.h +81 -24
  35. package/src/llama.cpp/common/sampling.cpp +53 -19
  36. package/src/llama.cpp/common/sampling.h +22 -1
  37. package/src/llama.cpp/common/speculative.cpp +274 -0
  38. package/src/llama.cpp/common/speculative.h +28 -0
  39. package/src/llama.cpp/docs/build.md +101 -148
  40. package/src/llama.cpp/examples/CMakeLists.txt +32 -13
  41. package/src/llama.cpp/examples/batched/CMakeLists.txt +1 -1
  42. package/src/llama.cpp/examples/batched/batched.cpp +5 -4
  43. package/src/llama.cpp/examples/batched-bench/CMakeLists.txt +1 -1
  44. package/src/llama.cpp/examples/convert-llama2c-to-ggml/CMakeLists.txt +1 -1
  45. package/src/llama.cpp/examples/cvector-generator/CMakeLists.txt +1 -1
  46. package/src/llama.cpp/examples/deprecation-warning/deprecation-warning.cpp +1 -1
  47. package/src/llama.cpp/examples/embedding/CMakeLists.txt +1 -1
  48. package/src/llama.cpp/examples/eval-callback/CMakeLists.txt +3 -2
  49. package/src/llama.cpp/examples/export-lora/CMakeLists.txt +1 -1
  50. package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +1 -1
  51. package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +4 -7
  52. package/src/llama.cpp/examples/gen-docs/CMakeLists.txt +1 -1
  53. package/src/llama.cpp/examples/gguf/CMakeLists.txt +1 -1
  54. package/src/llama.cpp/examples/gguf-hash/CMakeLists.txt +8 -1
  55. package/src/llama.cpp/examples/gguf-split/CMakeLists.txt +1 -1
  56. package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +2 -2
  57. package/src/llama.cpp/examples/gritlm/CMakeLists.txt +1 -1
  58. package/src/llama.cpp/examples/gritlm/gritlm.cpp +1 -1
  59. package/src/llama.cpp/examples/imatrix/CMakeLists.txt +1 -1
  60. package/src/llama.cpp/examples/imatrix/imatrix.cpp +11 -2
  61. package/src/llama.cpp/examples/infill/CMakeLists.txt +1 -1
  62. package/src/llama.cpp/examples/infill/infill.cpp +1 -1
  63. package/src/llama.cpp/examples/llama-bench/CMakeLists.txt +1 -1
  64. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +405 -316
  65. package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
  66. package/src/llama.cpp/examples/llava/CMakeLists.txt +10 -3
  67. package/src/llama.cpp/examples/llava/clip.cpp +262 -66
  68. package/src/llama.cpp/examples/llava/clip.h +8 -2
  69. package/src/llama.cpp/examples/llava/llava-cli.cpp +1 -1
  70. package/src/llama.cpp/examples/llava/llava.cpp +46 -19
  71. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +1 -1
  72. package/src/llama.cpp/examples/llava/qwen2vl-cli.cpp +581 -0
  73. package/src/llama.cpp/examples/lookahead/CMakeLists.txt +1 -1
  74. package/src/llama.cpp/examples/lookahead/lookahead.cpp +1 -1
  75. package/src/llama.cpp/examples/lookup/CMakeLists.txt +4 -4
  76. package/src/llama.cpp/examples/lookup/lookup-stats.cpp +2 -1
  77. package/src/llama.cpp/examples/lookup/lookup.cpp +2 -2
  78. package/src/llama.cpp/examples/main/CMakeLists.txt +1 -1
  79. package/src/llama.cpp/examples/main/main.cpp +9 -5
  80. package/src/llama.cpp/examples/main-cmake-pkg/CMakeLists.txt +1 -1
  81. package/src/llama.cpp/examples/parallel/CMakeLists.txt +1 -1
  82. package/src/llama.cpp/examples/parallel/parallel.cpp +1 -1
  83. package/src/llama.cpp/examples/passkey/CMakeLists.txt +1 -1
  84. package/src/llama.cpp/examples/perplexity/CMakeLists.txt +1 -1
  85. package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
  86. package/src/llama.cpp/examples/quantize/quantize.cpp +0 -3
  87. package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +1 -1
  88. package/src/llama.cpp/examples/retrieval/CMakeLists.txt +1 -1
  89. package/src/llama.cpp/examples/retrieval/retrieval.cpp +4 -4
  90. package/src/llama.cpp/examples/run/CMakeLists.txt +5 -0
  91. package/src/llama.cpp/examples/run/run.cpp +911 -0
  92. package/src/llama.cpp/examples/save-load-state/CMakeLists.txt +1 -1
  93. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +4 -4
  94. package/src/llama.cpp/examples/server/CMakeLists.txt +3 -7
  95. package/src/llama.cpp/examples/server/server.cpp +1758 -886
  96. package/src/llama.cpp/examples/server/tests/requirements.txt +2 -2
  97. package/src/llama.cpp/examples/server/utils.hpp +94 -304
  98. package/src/llama.cpp/examples/simple/CMakeLists.txt +1 -1
  99. package/src/llama.cpp/examples/simple/simple.cpp +4 -0
  100. package/src/llama.cpp/examples/simple-chat/CMakeLists.txt +1 -1
  101. package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +3 -0
  102. package/src/llama.cpp/examples/speculative/CMakeLists.txt +1 -1
  103. package/src/llama.cpp/examples/speculative/speculative.cpp +16 -15
  104. package/src/llama.cpp/examples/speculative-simple/CMakeLists.txt +5 -0
  105. package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +265 -0
  106. package/src/llama.cpp/examples/tokenize/CMakeLists.txt +1 -1
  107. package/src/llama.cpp/examples/tokenize/tokenize.cpp +1 -1
  108. package/src/llama.cpp/examples/tts/CMakeLists.txt +5 -0
  109. package/src/llama.cpp/examples/tts/tts.cpp +932 -0
  110. package/src/llama.cpp/ggml/CMakeLists.txt +46 -34
  111. package/src/llama.cpp/ggml/include/ggml-backend.h +16 -0
  112. package/src/llama.cpp/ggml/include/ggml-cpu.h +7 -49
  113. package/src/llama.cpp/ggml/include/ggml-opencl.h +26 -0
  114. package/src/llama.cpp/ggml/include/ggml.h +106 -24
  115. package/src/llama.cpp/ggml/src/CMakeLists.txt +73 -24
  116. package/src/llama.cpp/ggml/src/ggml-alloc.c +0 -1
  117. package/src/llama.cpp/ggml/src/ggml-backend-impl.h +51 -11
  118. package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +379 -22
  119. package/src/llama.cpp/ggml/src/ggml-backend.cpp +4 -4
  120. package/src/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +3 -7
  121. package/src/llama.cpp/ggml/src/ggml-blas/ggml-blas.cpp +5 -2
  122. package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +33 -3
  123. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +456 -111
  124. package/src/llama.cpp/ggml/src/ggml-cann/common.h +6 -3
  125. package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +95 -35
  126. package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +2 -5
  127. package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +22 -9
  128. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +24 -13
  129. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +23 -13
  130. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +11 -0
  131. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +10 -0
  132. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +10 -0
  133. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +17 -0
  134. package/src/llama.cpp/ggml/src/ggml-common.h +42 -42
  135. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +288 -213
  136. package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +220 -0
  137. package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.h +8 -0
  138. package/src/llama.cpp/ggml/src/{ggml-amx → ggml-cpu/amx}/common.h +19 -22
  139. package/src/llama.cpp/ggml/src/{ggml-amx → ggml-cpu/amx}/mmq.cpp +93 -92
  140. package/src/llama.cpp/ggml/src/{ggml-amx → ggml-cpu/amx}/mmq.h +2 -9
  141. package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +323 -0
  142. package/src/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-aarch64.c → ggml-cpu-aarch64.cpp} +892 -190
  143. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +2 -24
  144. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-hbm.cpp +55 -0
  145. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-hbm.h +8 -0
  146. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +15 -0
  147. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +38 -25
  148. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-traits.cpp +36 -0
  149. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-traits.h +38 -0
  150. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +552 -399
  151. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +101 -136
  152. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +2 -2
  153. package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +7 -10
  154. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +8 -0
  155. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +4 -6
  156. package/src/llama.cpp/ggml/src/ggml-impl.h +32 -11
  157. package/src/llama.cpp/ggml/src/ggml-kompute/CMakeLists.txt +13 -9
  158. package/src/llama.cpp/ggml/src/ggml-kompute/ggml-kompute.cpp +131 -64
  159. package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +3 -6
  160. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +39 -0
  161. package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +14 -7
  162. package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +147 -0
  163. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +4004 -0
  164. package/src/llama.cpp/ggml/src/ggml-opt.cpp +67 -80
  165. package/src/llama.cpp/ggml/src/ggml-quants.c +0 -9
  166. package/src/llama.cpp/ggml/src/ggml-rpc/CMakeLists.txt +3 -5
  167. package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +5 -2
  168. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +13 -10
  169. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +2 -11
  170. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +1 -0
  171. package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +2 -2
  172. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +1 -1
  173. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +5 -5
  174. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +32 -13
  175. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +80 -61
  176. package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +4 -4
  177. package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +159 -114
  178. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +3 -2
  179. package/src/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +6 -6
  180. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +6 -20
  181. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +4 -3
  182. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +8 -8
  183. package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +4 -3
  184. package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +7 -7
  185. package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +1 -0
  186. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +4 -1
  187. package/src/llama.cpp/ggml/src/ggml-threading.h +4 -2
  188. package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +21 -7
  189. package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +1718 -399
  190. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +3 -1
  191. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +105 -31
  192. package/src/llama.cpp/ggml/src/ggml.c +367 -207
  193. package/src/llama.cpp/include/llama-cpp.h +25 -0
  194. package/src/llama.cpp/include/llama.h +26 -19
  195. package/src/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.inp +112 -0
  196. package/src/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.out +46 -0
  197. package/src/llama.cpp/pocs/CMakeLists.txt +3 -1
  198. package/src/llama.cpp/pocs/vdot/CMakeLists.txt +2 -2
  199. package/src/llama.cpp/src/CMakeLists.txt +2 -7
  200. package/src/llama.cpp/src/llama-grammar.cpp +15 -15
  201. package/src/llama.cpp/src/llama-grammar.h +2 -5
  202. package/src/llama.cpp/src/llama-sampling.cpp +35 -90
  203. package/src/llama.cpp/src/llama-vocab.cpp +6 -1
  204. package/src/llama.cpp/src/llama.cpp +1748 -640
  205. package/src/llama.cpp/src/unicode.cpp +62 -51
  206. package/src/llama.cpp/src/unicode.h +9 -10
  207. package/src/llama.cpp/tests/CMakeLists.txt +48 -37
  208. package/src/llama.cpp/tests/test-arg-parser.cpp +2 -2
  209. package/src/llama.cpp/tests/test-backend-ops.cpp +140 -21
  210. package/src/llama.cpp/tests/test-chat-template.cpp +50 -4
  211. package/src/llama.cpp/tests/test-gguf.cpp +1303 -0
  212. package/src/llama.cpp/tests/test-grammar-integration.cpp +3 -6
  213. package/src/llama.cpp/tests/test-llama-grammar.cpp +2 -4
  214. package/src/llama.cpp/tests/test-quantize-fns.cpp +3 -3
  215. package/src/llama.cpp/tests/test-rope.cpp +61 -20
  216. package/src/llama.cpp/tests/test-sampling.cpp +2 -2
  217. package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +0 -72
  218. package/src/llama.cpp/.github/workflows/nix-ci.yml +0 -79
  219. package/src/llama.cpp/.github/workflows/nix-flake-update.yml +0 -22
  220. package/src/llama.cpp/.github/workflows/nix-publish-flake.yml +0 -36
  221. package/src/llama.cpp/ggml/include/ggml-amx.h +0 -25
  222. package/src/llama.cpp/ggml/src/ggml-aarch64.c +0 -129
  223. package/src/llama.cpp/ggml/src/ggml-aarch64.h +0 -19
  224. package/src/llama.cpp/ggml/src/ggml-amx/CMakeLists.txt +0 -107
  225. package/src/llama.cpp/ggml/src/ggml-amx/ggml-amx.cpp +0 -446
@@ -8,7 +8,10 @@
8
8
 
9
9
  // FIXME: required here for quantization functions
10
10
  #include "ggml-quants.h"
11
- #include "ggml-aarch64.h"
11
+
12
+ #ifdef GGML_USE_CPU_HBM
13
+ #include <hbwmalloc.h>
14
+ #endif
12
15
 
13
16
  #if defined(_MSC_VER) || defined(__MINGW32__)
14
17
  #include <malloc.h> // using malloc.h with MSC/MINGW
@@ -788,32 +791,23 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
788
791
  .to_float = (ggml_to_float_t) ggml_bf16_to_fp32_row,
789
792
  .from_float_ref = (ggml_from_float_t) ggml_fp32_to_bf16_row_ref,
790
793
  },
791
- [GGML_TYPE_Q4_0_4_4] = {
792
- .type_name = "q4_0_4x4",
793
- .blck_size = QK4_0,
794
- .blck_size_interleave = 4,
795
- .type_size = sizeof(block_q4_0),
796
- .is_quantized = true,
797
- .to_float = NULL,
798
- .from_float_ref = NULL,
794
+ [31] = { // GGML_TYPE_Q4_0_4_4
795
+ .type_name = "TYPE_Q4_0_4_4 REMOVED, use Q4_0 with runtime repacking",
796
+ .blck_size = 0,
797
+ .type_size = 0,
798
+ .is_quantized = false,
799
799
  },
800
- [GGML_TYPE_Q4_0_4_8] = {
801
- .type_name = "q4_0_4x8",
802
- .blck_size = QK4_0,
803
- .blck_size_interleave = 8,
804
- .type_size = sizeof(block_q4_0),
805
- .is_quantized = true,
806
- .to_float = NULL,
807
- .from_float_ref = NULL,
800
+ [32] = { // GGML_TYPE_Q4_0_4_8
801
+ .type_name = "TYPE_Q4_0_4_8 REMOVED, use Q4_0 with runtime repacking",
802
+ .blck_size = 0,
803
+ .type_size = 0,
804
+ .is_quantized = false,
808
805
  },
809
- [GGML_TYPE_Q4_0_8_8] = {
810
- .type_name = "q4_0_8x8",
811
- .blck_size = QK4_0,
812
- .blck_size_interleave = 8,
813
- .type_size = sizeof(block_q4_0),
814
- .is_quantized = true,
815
- .to_float = NULL,
816
- .from_float_ref = NULL,
806
+ [33] = { // GGML_TYPE_Q4_0_8_8
807
+ .type_name = "TYPE_Q4_0_8_8 REMOVED, use Q4_0 with runtime repacking",
808
+ .blck_size = 0,
809
+ .type_size = 0,
810
+ .is_quantized = false,
817
811
  },
818
812
  [GGML_TYPE_TQ1_0] = {
819
813
  .type_name = "tq1_0",
@@ -831,6 +825,24 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
831
825
  .to_float = (ggml_to_float_t) dequantize_row_tq2_0,
832
826
  .from_float_ref = (ggml_from_float_t) quantize_row_tq2_0_ref,
833
827
  },
828
+ [36] = { // GGML_TYPE_IQ4_NL_4_4
829
+ .type_name = "TYPE_IQ4_NL_4_4 REMOVED, use IQ4_NL with runtime repacking",
830
+ .blck_size = 0,
831
+ .type_size = 0,
832
+ .is_quantized = false,
833
+ },
834
+ [37] = { // GGML_TYPE_IQ4_NL_4_8
835
+ .type_name = "TYPE_IQ4_NL_4_8 REMOVED, use IQ4_NL with runtime repacking",
836
+ .blck_size = 0,
837
+ .type_size = 0,
838
+ .is_quantized = false,
839
+ },
840
+ [38] = { // GGML_TYPE_IQ4_NL_8_8
841
+ .type_name = "TYPE_IQ4_NL_8_8 REMOVED, use IQ4_NL with runtime repacking",
842
+ .blck_size = 0,
843
+ .type_size = 0,
844
+ .is_quantized = false,
845
+ },
834
846
  };
835
847
 
836
848
  const struct ggml_type_traits * ggml_get_type_traits(enum ggml_type type) {
@@ -941,6 +953,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
941
953
  "POOL_2D_BACK",
942
954
  "UPSCALE",
943
955
  "PAD",
956
+ "PAD_REFLECT_1D",
944
957
  "ARANGE",
945
958
  "TIMESTEP_EMBEDDING",
946
959
  "ARGSORT",
@@ -974,7 +987,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
974
987
  "OPT_STEP_ADAMW",
975
988
  };
976
989
 
977
- static_assert(GGML_OP_COUNT == 81, "GGML_OP_COUNT != 81");
990
+ static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82");
978
991
 
979
992
  static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
980
993
  "none",
@@ -1036,6 +1049,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
1036
1049
  "pool_2d_back(x)",
1037
1050
  "upscale(x)",
1038
1051
  "pad(x)",
1052
+ "pad_reflect_1d(x)",
1039
1053
  "arange(start, stop, step)",
1040
1054
  "timestep_embedding(timesteps, dim, max_period)",
1041
1055
  "argsort(x)",
@@ -1069,7 +1083,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
1069
1083
  "adamw(x)",
1070
1084
  };
1071
1085
 
1072
- static_assert(GGML_OP_COUNT == 81, "GGML_OP_COUNT != 81");
1086
+ static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82");
1073
1087
 
1074
1088
  static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
1075
1089
 
@@ -1259,9 +1273,6 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) {
1259
1273
  case GGML_FTYPE_MOSTLY_IQ4_XS: wtype = GGML_TYPE_IQ4_XS; break;
1260
1274
  case GGML_FTYPE_MOSTLY_IQ3_S: wtype = GGML_TYPE_IQ3_S; break;
1261
1275
  case GGML_FTYPE_MOSTLY_IQ2_S: wtype = GGML_TYPE_IQ2_S; break;
1262
- case GGML_FTYPE_MOSTLY_Q4_0_4_4: wtype = GGML_TYPE_Q4_0_4_4; break;
1263
- case GGML_FTYPE_MOSTLY_Q4_0_4_8: wtype = GGML_TYPE_Q4_0_4_8; break;
1264
- case GGML_FTYPE_MOSTLY_Q4_0_8_8: wtype = GGML_TYPE_Q4_0_8_8; break;
1265
1276
  case GGML_FTYPE_UNKNOWN: wtype = GGML_TYPE_COUNT; break;
1266
1277
  case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16: wtype = GGML_TYPE_COUNT; break;
1267
1278
  }
@@ -2255,6 +2266,7 @@ struct ggml_tensor * ggml_argmax(
2255
2266
  struct ggml_context * ctx,
2256
2267
  struct ggml_tensor * a) {
2257
2268
  GGML_ASSERT(ggml_is_matrix(a));
2269
+ GGML_ASSERT(a->ne[0] <= INT32_MAX);
2258
2270
 
2259
2271
  struct ggml_tensor * result = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, a->ne[1]);
2260
2272
 
@@ -3505,15 +3517,18 @@ static struct ggml_tensor * ggml_rope_impl(
3505
3517
  GGML_ASSERT(c->ne[0] >= n_dims / 2);
3506
3518
  }
3507
3519
 
3520
+ int sections[4] = {0, 0, 0, 0};
3521
+
3508
3522
  struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
3509
3523
 
3510
- int32_t params[11] = { /*n_past*/ 0, n_dims, mode, /*n_ctx*/ 0, n_ctx_orig };
3524
+ int32_t params[15] = { /*n_past*/ 0, n_dims, mode, /*n_ctx*/ 0, n_ctx_orig };
3511
3525
  memcpy(params + 5, &freq_base, sizeof(float));
3512
3526
  memcpy(params + 6, &freq_scale, sizeof(float));
3513
3527
  memcpy(params + 7, &ext_factor, sizeof(float));
3514
3528
  memcpy(params + 8, &attn_factor, sizeof(float));
3515
3529
  memcpy(params + 9, &beta_fast, sizeof(float));
3516
3530
  memcpy(params + 10, &beta_slow, sizeof(float));
3531
+ memcpy(params + 11, &sections, sizeof(int)*4);
3517
3532
  ggml_set_op_params(result, params, sizeof(params));
3518
3533
 
3519
3534
  result->op = GGML_OP_ROPE;
@@ -3535,6 +3550,53 @@ struct ggml_tensor * ggml_rope(
3535
3550
  );
3536
3551
  }
3537
3552
 
3553
+ struct ggml_tensor * ggml_rope_multi(
3554
+ struct ggml_context * ctx,
3555
+ struct ggml_tensor * a,
3556
+ struct ggml_tensor * b,
3557
+ struct ggml_tensor * c,
3558
+ int n_dims,
3559
+ int sections[4],
3560
+ int mode,
3561
+ int n_ctx_orig,
3562
+ float freq_base,
3563
+ float freq_scale,
3564
+ float ext_factor,
3565
+ float attn_factor,
3566
+ float beta_fast,
3567
+ float beta_slow) {
3568
+ // Multimodal Rotary Position Embedding
3569
+ GGML_ASSERT((mode & 1) == 0 && "mode & 1 == 1 is no longer supported");
3570
+
3571
+ GGML_ASSERT(ggml_is_vector(b));
3572
+ GGML_ASSERT(b->type == GGML_TYPE_I32);
3573
+ GGML_ASSERT(a->ne[2] * 4 == b->ne[0]); // mrope expecting 4 position ids per token
3574
+
3575
+ if (c) {
3576
+ GGML_ASSERT(c->type == GGML_TYPE_F32);
3577
+ GGML_ASSERT(c->ne[0] >= n_dims / 2);
3578
+ }
3579
+
3580
+ struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
3581
+
3582
+ int32_t params[11 + 4] = { /*n_past*/ 0, n_dims, mode, /*n_ctx*/ 0, n_ctx_orig };
3583
+ memcpy(params + 5, &freq_base, sizeof(float));
3584
+ memcpy(params + 6, &freq_scale, sizeof(float));
3585
+ memcpy(params + 7, &ext_factor, sizeof(float));
3586
+ memcpy(params + 8, &attn_factor, sizeof(float));
3587
+ memcpy(params + 9, &beta_fast, sizeof(float));
3588
+ memcpy(params + 10, &beta_slow, sizeof(float));
3589
+ memcpy(&params[11], sections, sizeof(int)*4);
3590
+ ggml_set_op_params(result, params, sizeof(params));
3591
+
3592
+ result->op = GGML_OP_ROPE;
3593
+ result->src[0] = a;
3594
+ result->src[1] = b;
3595
+ result->src[2] = c;
3596
+
3597
+ return result;
3598
+ }
3599
+
3538
3600
  struct ggml_tensor * ggml_rope_inplace(
3539
3601
  struct ggml_context * ctx,
3540
3602
  struct ggml_tensor * a,
@@ -3698,13 +3760,84 @@ struct ggml_tensor * ggml_clamp(
3698
3760
  return result;
3699
3761
  }
3700
3762
 
3701
- // ggml_conv_1d
3702
-
3703
3763
  static int64_t ggml_calc_conv_output_size(int64_t ins, int64_t ks, int s, int p, int d) {
3704
3764
  return (ins + 2 * p - d * (ks - 1) - 1) / s + 1;
3705
3765
  }
3706
3766
 
3707
- GGML_API struct ggml_tensor * ggml_conv_1d(
3767
+ // im2col: [N, IC, IH, IW] => [N, OH, OW, IC*KH*KW]
3768
+ // a: [OC,IC, KH, KW]
3769
+ // b: [N, IC, IH, IW]
3770
+ // result: [N, OH, OW, IC*KH*KW]
3771
+ struct ggml_tensor * ggml_im2col(
3772
+ struct ggml_context * ctx,
3773
+ struct ggml_tensor * a,
3774
+ struct ggml_tensor * b,
3775
+ int s0,
3776
+ int s1,
3777
+ int p0,
3778
+ int p1,
3779
+ int d0,
3780
+ int d1,
3781
+ bool is_2D,
3782
+ enum ggml_type dst_type) {
3783
+ if (is_2D) {
3784
+ GGML_ASSERT(a->ne[2] == b->ne[2]);
3785
+ } else {
3786
+ //GGML_ASSERT(b->ne[1] % a->ne[1] == 0);
3787
+ GGML_ASSERT(b->ne[1] == a->ne[1]);
3788
+ GGML_ASSERT(b->ne[3] == 1);
3789
+ }
3790
+
3791
+ const int64_t OH = is_2D ? ggml_calc_conv_output_size(b->ne[1], a->ne[1], s1, p1, d1) : 0;
3792
+ const int64_t OW = ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0);
3793
+
3794
+ GGML_ASSERT((!is_2D || OH > 0) && "b too small compared to a");
3795
+ GGML_ASSERT((OW > 0) && "b too small compared to a");
3796
+
3797
+ const int64_t ne[4] = {
3798
+ is_2D ? (a->ne[2] * a->ne[1] * a->ne[0]) : a->ne[1] * a->ne[0],
3799
+ OW,
3800
+ is_2D ? OH : b->ne[2],
3801
+ is_2D ? b->ne[3] : 1,
3802
+ };
3803
+
3804
+ struct ggml_tensor * result = ggml_new_tensor(ctx, dst_type, 4, ne);
3805
+ int32_t params[] = { s0, s1, p0, p1, d0, d1, (is_2D ? 1 : 0) };
3806
+ ggml_set_op_params(result, params, sizeof(params));
3807
+
3808
+ result->op = GGML_OP_IM2COL;
3809
+ result->src[0] = a;
3810
+ result->src[1] = b;
3811
+
3812
+ return result;
3813
+ }
3814
+
3815
+ struct ggml_tensor * ggml_im2col_back(
3816
+ struct ggml_context * ctx,
3817
+ struct ggml_tensor * a,
3818
+ struct ggml_tensor * b,
3819
+ int64_t * ne,
3820
+ int s0,
3821
+ int s1,
3822
+ int p0,
3823
+ int p1,
3824
+ int d0,
3825
+ int d1,
3826
+ bool is_2D) {
3827
+ struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
3828
+ int32_t params[] = { s0, s1, p0, p1, d0, d1, (is_2D ? 1 : 0) };
3829
+ ggml_set_op_params(result, params, sizeof(params));
3830
+
3831
+ result->op = GGML_OP_IM2COL_BACK;
3832
+ result->src[0] = a;
3833
+ result->src[1] = b;
3834
+
3835
+ return result;
3836
+ }
3837
+
3838
+ // ggml_conv_1d
3839
+
3840
+ struct ggml_tensor * ggml_conv_1d(
3708
3841
  struct ggml_context * ctx,
3709
3842
  struct ggml_tensor * a,
3710
3843
  struct ggml_tensor * b,
@@ -3734,137 +3867,75 @@ struct ggml_tensor* ggml_conv_1d_ph(
3734
3867
  return ggml_conv_1d(ctx, a, b, s, a->ne[0] / 2, d);
3735
3868
  }
3736
3869
 
3737
- // ggml_conv_transpose_1d
3738
-
3739
- static int64_t ggml_calc_conv_transpose_1d_output_size(int64_t ins, int64_t ks, int s, int p, int d) {
3740
- return (ins - 1) * s - 2 * p + d * (ks - 1) + 1;
3741
- }
3870
+ // ggml_conv_1d_dw
3742
3871
 
3743
- GGML_API struct ggml_tensor * ggml_conv_transpose_1d(
3872
+ struct ggml_tensor * ggml_conv_1d_dw(
3744
3873
  struct ggml_context * ctx,
3745
3874
  struct ggml_tensor * a,
3746
3875
  struct ggml_tensor * b,
3747
3876
  int s0,
3748
3877
  int p0,
3749
3878
  int d0) {
3750
- GGML_ASSERT(ggml_is_matrix(b));
3751
- GGML_ASSERT(a->ne[2] == b->ne[1]);
3752
- GGML_ASSERT(a->ne[3] == 1);
3879
+ struct ggml_tensor * new_a = ggml_reshape_4d(ctx, a, a->ne[0], 1, a->ne[1], a->ne[2]);
3880
+ struct ggml_tensor * new_b = ggml_reshape_4d(ctx, b, b->ne[0], 1, b->ne[1], b->ne[2]);
3753
3881
 
3754
- GGML_ASSERT(p0 == 0);
3755
- GGML_ASSERT(d0 == 1);
3882
+ struct ggml_tensor * im2col = ggml_im2col(ctx, new_a, new_b, s0, 0, p0, 0, d0, 0, false, GGML_TYPE_F16);
3756
3883
 
3757
- const int64_t ne[4] = {
3758
- ggml_calc_conv_transpose_1d_output_size(b->ne[0], a->ne[0], s0, 0 /*p0*/, 1 /*d0*/),
3759
- a->ne[1], b->ne[2], 1,
3760
- };
3761
- struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
3884
+ struct ggml_tensor * result = ggml_mul_mat(ctx, im2col, a);
3762
3885
 
3763
- int32_t params[] = { s0, p0, d0 };
3764
- ggml_set_op_params(result, params, sizeof(params));
3765
-
3766
- result->op = GGML_OP_CONV_TRANSPOSE_1D;
3767
- result->src[0] = a;
3768
- result->src[1] = b;
3886
+ result = ggml_reshape_3d(ctx, result, b->ne[0], b->ne[1], 1);
3769
3887
 
3770
3888
  return result;
3771
3889
  }
3772
3890
 
3773
- // ggml_conv_depthwise
3891
+ // ggml_conv_1d_dw_ph
3774
3892
 
3775
- struct ggml_tensor * ggml_conv_depthwise_2d(
3893
+ struct ggml_tensor * ggml_conv_1d_dw_ph(
3776
3894
  struct ggml_context * ctx,
3777
3895
  struct ggml_tensor * a,
3778
3896
  struct ggml_tensor * b,
3779
3897
  int s0,
3780
- int s1,
3781
- int p0,
3782
- int p1,
3783
- int d0,
3784
- int d1) {
3785
- struct ggml_tensor * new_a = ggml_reshape_4d(ctx, a, a->ne[0], a->ne[1], 1, a->ne[2] * a->ne[3]);
3786
- struct ggml_tensor * im2col = ggml_im2col(ctx, new_a,
3787
- ggml_reshape_4d(ctx, b, b->ne[0], b->ne[1], 1, b->ne[2] * b->ne[3]),
3788
- s0, s1, p0, p1, d0, d1, true, GGML_TYPE_F16); // [N * IC, OH, OW, KH * KW]
3789
- struct ggml_tensor * new_b = ggml_reshape_4d(ctx, im2col, im2col->ne[0], im2col->ne[2] * im2col->ne[1], b->ne[2], b->ne[3]); // [N * IC, OH, OW, KH * KW] => [N, IC, OH * OW, KH * KW]
3898
+ int d0) {
3899
+ return ggml_conv_1d_dw(ctx, a, b, s0, a->ne[0] / 2, d0);
3900
+ }
3790
3901
 
3791
- new_a = ggml_reshape_4d(ctx, new_a, (new_a->ne[0] * new_a->ne[1]), new_a->ne[2], new_a->ne[3], 1); // [OC,1, KH, KW] => [1, OC, 1, KH * KW]
3792
- struct ggml_tensor * result = ggml_mul_mat(ctx, new_a, new_b);
3793
- result = ggml_reshape_4d(ctx, result, im2col->ne[1], im2col->ne[2], b->ne[2], b->ne[3]); // [N, OC, OH, OW]
3902
+ // ggml_conv_transpose_1d
3794
3903
 
3795
- return result;
3904
+ static int64_t ggml_calc_conv_transpose_1d_output_size(int64_t ins, int64_t ks, int s, int p, int d) {
3905
+ return (ins - 1) * s - 2 * p + d * (ks - 1) + 1;
3796
3906
  }
3797
- // ggml_conv_2d
3798
3907
 
3799
- // im2col: [N, IC, IH, IW] => [N, OH, OW, IC*KH*KW]
3800
- // a: [OC,IC, KH, KW]
3801
- // b: [N, IC, IH, IW]
3802
- // result: [N, OH, OW, IC*KH*KW]
3803
- struct ggml_tensor * ggml_im2col(
3908
+ GGML_API struct ggml_tensor * ggml_conv_transpose_1d(
3804
3909
  struct ggml_context * ctx,
3805
3910
  struct ggml_tensor * a,
3806
3911
  struct ggml_tensor * b,
3807
3912
  int s0,
3808
- int s1,
3809
3913
  int p0,
3810
- int p1,
3811
- int d0,
3812
- int d1,
3813
- bool is_2D,
3814
- enum ggml_type dst_type) {
3815
- if(is_2D) {
3816
- GGML_ASSERT(a->ne[2] == b->ne[2]);
3817
- } else {
3818
- GGML_ASSERT(a->ne[1] == b->ne[1]);
3819
- GGML_ASSERT(b->ne[3] == 1);
3820
- }
3821
-
3822
- const int64_t OH = is_2D ? ggml_calc_conv_output_size(b->ne[1], a->ne[1], s1, p1, d1) : 0;
3823
- const int64_t OW = ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0);
3914
+ int d0) {
3915
+ GGML_ASSERT(ggml_is_matrix(b));
3916
+ GGML_ASSERT(a->ne[2] == b->ne[1]);
3917
+ GGML_ASSERT(a->ne[3] == 1);
3824
3918
 
3825
- GGML_ASSERT((!is_2D || OH > 0) && "b too small compared to a");
3826
- GGML_ASSERT((OW > 0) && "b too small compared to a");
3919
+ GGML_ASSERT(p0 == 0);
3920
+ GGML_ASSERT(d0 == 1);
3827
3921
 
3828
3922
  const int64_t ne[4] = {
3829
- is_2D ? (a->ne[2] * a->ne[1] * a->ne[0]) : a->ne[1] * a->ne[0],
3830
- OW,
3831
- is_2D ? OH : b->ne[2],
3832
- is_2D ? b->ne[3] : 1,
3923
+ ggml_calc_conv_transpose_1d_output_size(b->ne[0], a->ne[0], s0, 0 /*p0*/, 1 /*d0*/),
3924
+ a->ne[1], b->ne[2], 1,
3833
3925
  };
3926
+ struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
3834
3927
 
3835
- struct ggml_tensor * result = ggml_new_tensor(ctx, dst_type, 4, ne);
3836
- int32_t params[] = { s0, s1, p0, p1, d0, d1, (is_2D ? 1 : 0) };
3928
+ int32_t params[] = { s0, p0, d0 };
3837
3929
  ggml_set_op_params(result, params, sizeof(params));
3838
3930
 
3839
- result->op = GGML_OP_IM2COL;
3931
+ result->op = GGML_OP_CONV_TRANSPOSE_1D;
3840
3932
  result->src[0] = a;
3841
3933
  result->src[1] = b;
3842
3934
 
3843
3935
  return result;
3844
3936
  }
3845
3937
 
3846
- struct ggml_tensor * ggml_im2col_back(
3847
- struct ggml_context * ctx,
3848
- struct ggml_tensor * a,
3849
- struct ggml_tensor * b,
3850
- int64_t * ne,
3851
- int s0,
3852
- int s1,
3853
- int p0,
3854
- int p1,
3855
- int d0,
3856
- int d1,
3857
- bool is_2D) {
3858
- struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
3859
- int32_t params[] = { s0, s1, p0, p1, d0, d1, (is_2D ? 1 : 0) };
3860
- ggml_set_op_params(result, params, sizeof(params));
3861
-
3862
- result->op = GGML_OP_IM2COL_BACK;
3863
- result->src[0] = a;
3864
- result->src[1] = b;
3865
-
3866
- return result;
3867
- }
3938
+ // ggml_conv_2d
3868
3939
 
3869
3940
  // a: [OC,IC, KH, KW]
3870
3941
  // b: [N, IC, IH, IW]
@@ -3911,6 +3982,31 @@ struct ggml_tensor * ggml_conv_2d_s1_ph(
3911
3982
  return ggml_conv_2d(ctx, a, b, 1, 1, a->ne[0] / 2, a->ne[1] / 2, 1, 1);
3912
3983
  }
3913
3984
 
3985
+ // ggml_conv_2d_dw
3986
+
3987
+ struct ggml_tensor * ggml_conv_2d_dw(
3988
+ struct ggml_context * ctx,
3989
+ struct ggml_tensor * a,
3990
+ struct ggml_tensor * b,
3991
+ int s0,
3992
+ int s1,
3993
+ int p0,
3994
+ int p1,
3995
+ int d0,
3996
+ int d1) {
3997
+ struct ggml_tensor * new_a = ggml_reshape_4d(ctx, a, a->ne[0], a->ne[1], 1, a->ne[2] * a->ne[3]);
3998
+ struct ggml_tensor * im2col = ggml_im2col(ctx, new_a,
3999
+ ggml_reshape_4d(ctx, b, b->ne[0], b->ne[1], 1, b->ne[2] * b->ne[3]),
4000
+ s0, s1, p0, p1, d0, d1, true, GGML_TYPE_F16); // [N * IC, OH, OW, KH * KW]
4001
+ struct ggml_tensor * new_b = ggml_reshape_4d(ctx, im2col, im2col->ne[0], im2col->ne[2] * im2col->ne[1], b->ne[2], b->ne[3]); // [N * IC, OH, OW, KH * KW] => [N, IC, OH * OW, KH * KW]
4002
+
4003
+ new_a = ggml_reshape_4d(ctx, new_a, (new_a->ne[0] * new_a->ne[1]), new_a->ne[2], new_a->ne[3], 1); // [OC,1, KH, KW] => [1, OC, 1, KH * KW]
4004
+ struct ggml_tensor * result = ggml_mul_mat(ctx, new_a, new_b);
4005
+ result = ggml_reshape_4d(ctx, result, im2col->ne[1], im2col->ne[2], b->ne[2], b->ne[3]); // [N, OC, OH, OW]
4006
+
4007
+ return result;
4008
+ }
4009
+
3914
4010
  // ggml_conv_transpose_2d_p0
3915
4011
 
3916
4012
  static int64_t ggml_calc_conv_transpose_output_size(int64_t ins, int64_t ks, int s, int p) {
@@ -4087,6 +4183,37 @@ struct ggml_tensor * ggml_pad(
4087
4183
  return result;
4088
4184
  }
4089
4185
 
4186
+ // ggml_pad_reflect_1d
4187
+
4188
+ struct ggml_tensor * ggml_pad_reflect_1d(
4189
+ struct ggml_context * ctx,
4190
+ struct ggml_tensor * a,
4191
+ int p0,
4192
+ int p1) {
4193
+ GGML_ASSERT(p0 >= 0);
4194
+ GGML_ASSERT(p1 >= 0);
4195
+
4196
+ GGML_ASSERT(p0 < a->ne[0]); // padding length on each side must be less than the
4197
+ GGML_ASSERT(p1 < a->ne[0]); // existing length of the dimension being padded
4198
+
4199
+ GGML_ASSERT(ggml_is_contiguous(a));
4200
+ GGML_ASSERT(a->type == GGML_TYPE_F32);
4201
+
4202
+ struct ggml_tensor * result = ggml_new_tensor_4d(ctx, a->type,
4203
+ a->ne[0] + p0 + p1,
4204
+ a->ne[1],
4205
+ a->ne[2],
4206
+ a->ne[3]);
4207
+
4208
+ int32_t params[] = { p0, p1 };
4209
+ ggml_set_op_params(result, params, sizeof(params));
4210
+
4211
+ result->op = GGML_OP_PAD_REFLECT_1D;
4212
+ result->src[0] = a;
4213
+
4214
+ return result;
4215
+ }
4216
+
4090
4217
  // ggml_arange
4091
4218
 
4092
4219
  struct ggml_tensor * ggml_arange(
@@ -4138,6 +4265,7 @@ struct ggml_tensor * ggml_argsort(
4138
4265
  struct ggml_context * ctx,
4139
4266
  struct ggml_tensor * a,
4140
4267
  enum ggml_sort_order order) {
4268
+ GGML_ASSERT(a->ne[0] <= INT32_MAX);
4141
4269
  struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_I32, GGML_MAX_DIMS, a->ne);
4142
4270
 
4143
4271
  ggml_set_op_params_i32(result, 0, (int32_t) order);
@@ -5019,8 +5147,10 @@ static void ggml_hash_map_free(struct hash_map * map) {
5019
5147
  }
5020
5148
 
5021
5149
  // utility functions to change gradients
5022
- // if a is in acc_table, modify gradients in-place and mark result as gradient accumulator
5023
- // else if a is in zero_table, replace a
5150
+ // isrc is the index of tensor in cgraph->visited_hash_set.keys
5151
+ // the corresponding gradient (accumulators) are also at position isrc
5152
+ // if tensor has a gradient accumulator, modify that accumulator in-place
5153
+ // else if there is no gradient for tensor, set the corresponding value
5024
5154
  // else, just add/subtract/etc. the gradients
5025
5155
 
5026
5156
  static void ggml_add_or_set(
@@ -5028,11 +5158,14 @@ static void ggml_add_or_set(
5028
5158
  struct ggml_cgraph * cgraph,
5029
5159
  size_t isrc,
5030
5160
  struct ggml_tensor * tensor) {
5161
+ struct ggml_tensor * src = cgraph->visited_hash_set.keys[isrc];
5162
+ GGML_ASSERT(src);
5031
5163
  if (cgraph->grads[isrc]) {
5032
- cgraph->grads[isrc] = ggml_add_impl(ctx, cgraph->grads[isrc], tensor, cgraph->grad_accs[isrc]);
5164
+ cgraph->grads[isrc] = ggml_add_impl(ctx, cgraph->grads[isrc], tensor, /*inplace =*/ cgraph->grad_accs[isrc]);
5033
5165
  } else {
5034
5166
  cgraph->grads[isrc] = tensor;
5035
5167
  }
5168
+ ggml_format_name(cgraph->grads[isrc], "grad for %s", src->name);
5036
5169
  ggml_build_forward_expand(cgraph, cgraph->grads[isrc]);
5037
5170
  }
5038
5171
 
@@ -5040,18 +5173,20 @@ static void ggml_acc_or_set(
5040
5173
  struct ggml_context * ctx,
5041
5174
  struct ggml_cgraph * cgraph,
5042
5175
  size_t isrc,
5043
- struct ggml_tensor * src,
5044
5176
  struct ggml_tensor * tensor,
5045
5177
  const size_t nb1,
5046
5178
  const size_t nb2,
5047
5179
  const size_t nb3,
5048
5180
  const size_t offset) {
5181
+ struct ggml_tensor * src = cgraph->visited_hash_set.keys[isrc];
5182
+ GGML_ASSERT(src);
5049
5183
  if (cgraph->grads[isrc]) {
5050
5184
  cgraph->grads[isrc] = ggml_acc_impl(ctx, cgraph->grads[isrc], tensor, nb1, nb2, nb3, offset, cgraph->grad_accs[isrc]);
5051
5185
  } else {
5052
5186
  struct ggml_tensor * a_zero = ggml_scale(ctx, src, 0.0f); // FIXME this is going to produce NaN if a contains inf/NaN
5053
5187
  cgraph->grads[isrc] = ggml_acc_impl(ctx, a_zero, tensor, nb1, nb2, nb3, offset, false);
5054
5188
  }
5189
+ ggml_format_name(cgraph->grads[isrc], "grad for %s", cgraph->visited_hash_set.keys[isrc]->name);
5055
5190
  ggml_build_forward_expand(cgraph, cgraph->grads[isrc]);
5056
5191
  }
5057
5192
 
@@ -5059,13 +5194,15 @@ static void ggml_add1_or_set(
5059
5194
  struct ggml_context * ctx,
5060
5195
  struct ggml_cgraph * cgraph,
5061
5196
  size_t isrc,
5062
- struct ggml_tensor * src,
5063
5197
  struct ggml_tensor * tensor) {
5198
+ struct ggml_tensor * src = cgraph->visited_hash_set.keys[isrc];
5199
+ GGML_ASSERT(src);
5064
5200
  if (cgraph->grads[isrc]) {
5065
5201
  cgraph->grads[isrc] = ggml_add1_impl(ctx, cgraph->grads[isrc], tensor, cgraph->grad_accs[isrc]);
5066
5202
  } else {
5067
5203
  cgraph->grads[isrc] = ggml_repeat(ctx, tensor, src);
5068
5204
  }
5205
+ ggml_format_name(cgraph->grads[isrc], "grad for %s", src->name);
5069
5206
  ggml_build_forward_expand(cgraph, cgraph->grads[isrc]);
5070
5207
  }
5071
5208
 
@@ -5074,11 +5211,14 @@ static void ggml_sub_or_set(
5074
5211
  struct ggml_cgraph * cgraph,
5075
5212
  size_t isrc,
5076
5213
  struct ggml_tensor * tensor) {
5214
+ struct ggml_tensor * src = cgraph->visited_hash_set.keys[isrc];
5215
+ GGML_ASSERT(src);
5077
5216
  if (cgraph->grads[isrc]) {
5078
5217
  cgraph->grads[isrc] = ggml_sub_impl(ctx, cgraph->grads[isrc], tensor, cgraph->grad_accs[isrc]);
5079
5218
  } else {
5080
5219
  cgraph->grads[isrc] = ggml_neg(ctx, tensor);
5081
5220
  }
5221
+ ggml_format_name(cgraph->grads[isrc], "grad for %s", src->name);
5082
5222
  ggml_build_forward_expand(cgraph, cgraph->grads[isrc]);
5083
5223
  }
5084
5224
 
@@ -5095,12 +5235,12 @@ static void ggml_compute_backward(
5095
5235
  struct ggml_tensor * src1 = tensor->src[1];
5096
5236
  struct ggml_tensor * src2 = tensor->src[2];
5097
5237
  struct ggml_hash_set * hash_set = &cgraph->visited_hash_set;
5098
- const size_t isrc0 = ggml_hash_find(hash_set, src0);
5099
- const size_t isrc1 = ggml_hash_find(hash_set, src1);
5100
- const size_t isrc2 = ggml_hash_find(hash_set, src2);
5101
- const bool src0_needs_grads = isrc0 != GGML_HASHSET_FULL && ggml_bitset_get(hash_set->used, isrc0) && grads_needed[isrc0];
5102
- const bool src1_needs_grads = isrc1 != GGML_HASHSET_FULL && ggml_bitset_get(hash_set->used, isrc1) && grads_needed[isrc1];
5103
- const bool src2_needs_grads = isrc2 != GGML_HASHSET_FULL && ggml_bitset_get(hash_set->used, isrc2) && grads_needed[isrc2];
5238
+ const size_t isrc0 = src0 ? ggml_hash_find(hash_set, src0) : (size_t) -1;
5239
+ const size_t isrc1 = src1 ? ggml_hash_find(hash_set, src1) : (size_t) -1;
5240
+ const size_t isrc2 = src2 ? ggml_hash_find(hash_set, src2) : (size_t) -1;
5241
+ const bool src0_needs_grads = src0 && isrc0 != GGML_HASHSET_FULL && ggml_bitset_get(hash_set->used, isrc0) && grads_needed[isrc0];
5242
+ const bool src1_needs_grads = src1 && isrc1 != GGML_HASHSET_FULL && ggml_bitset_get(hash_set->used, isrc1) && grads_needed[isrc1];
5243
+ const bool src2_needs_grads = src2 && isrc2 != GGML_HASHSET_FULL && ggml_bitset_get(hash_set->used, isrc2) && grads_needed[isrc2];
5104
5244
 
5105
5245
  switch (tensor->op) {
5106
5246
  case GGML_OP_DUP: {
@@ -5200,7 +5340,7 @@ static void ggml_compute_backward(
5200
5340
  } break;
5201
5341
  case GGML_OP_SUM: {
5202
5342
  if (src0_needs_grads) {
5203
- ggml_add1_or_set(ctx, cgraph, isrc0, src0, grad);
5343
+ ggml_add1_or_set(ctx, cgraph, isrc0, grad);
5204
5344
  }
5205
5345
  } break;
5206
5346
  case GGML_OP_SUM_ROWS: {
@@ -5210,7 +5350,7 @@ static void ggml_compute_backward(
5210
5350
  } break;
5211
5351
  case GGML_OP_MEAN: {
5212
5352
  if (src0_needs_grads) {
5213
- ggml_add1_or_set(ctx, cgraph, isrc0, src0, ggml_scale_impl(ctx, grad, 1.0f/src0->ne[0], false));
5353
+ ggml_add1_or_set(ctx, cgraph, isrc0, ggml_scale_impl(ctx, grad, 1.0f/src0->ne[0], false));
5214
5354
  }
5215
5355
  } break;
5216
5356
  case GGML_OP_REPEAT: {
@@ -5363,7 +5503,7 @@ static void ggml_compute_backward(
5363
5503
  nb3 = (nb3 / n0) * ng;
5364
5504
  }
5365
5505
 
5366
- ggml_acc_or_set(ctx, cgraph, isrc0, src0, grad, nb1, nb2, nb3, offset);
5506
+ ggml_acc_or_set(ctx, cgraph, isrc0, grad, nb1, nb2, nb3, offset);
5367
5507
  }
5368
5508
  } break;
5369
5509
  case GGML_OP_PERMUTE: {
@@ -5597,10 +5737,9 @@ void ggml_build_backward_expand(
5597
5737
 
5598
5738
  const int n_nodes_f = cgraph->n_nodes;
5599
5739
 
5600
- const size_t hash_size = ggml_hash_size(2*cgraph->size);
5601
- memset(cgraph->grads, 0, hash_size*sizeof(struct ggml_tensor *));
5602
- memset(cgraph->grad_accs, 0, hash_size*sizeof(struct ggml_tensor *));
5603
- bool * grads_needed = calloc(hash_size, sizeof(bool));
5740
+ memset(cgraph->grads, 0, cgraph->visited_hash_set.size*sizeof(struct ggml_tensor *));
5741
+ memset(cgraph->grad_accs, 0, cgraph->visited_hash_set.size*sizeof(struct ggml_tensor *));
5742
+ bool * grads_needed = calloc(cgraph->visited_hash_set.size, sizeof(bool));
5604
5743
 
5605
5744
  {
5606
5745
  bool any_params = false;
@@ -5621,7 +5760,7 @@ void ggml_build_backward_expand(
5621
5760
  continue;
5622
5761
  }
5623
5762
 
5624
- bool node_needs_grad = node->flags & GGML_TENSOR_FLAG_PARAM;
5763
+ bool node_needs_grad = (node->flags & GGML_TENSOR_FLAG_PARAM) || (node->flags & GGML_TENSOR_FLAG_LOSS);
5625
5764
  bool ignore_src[GGML_MAX_SRC] = {false};
5626
5765
  switch (node->op) {
5627
5766
  // gradients in node->src[0] for one reason or another have no effect on output gradients
@@ -5638,7 +5777,7 @@ void ggml_build_backward_expand(
5638
5777
  } break;
5639
5778
 
5640
5779
  // gradients in node->src[1] for one reason or another have no effect on output gradients
5641
- case GGML_OP_CPY: // gradients in CPY target are irrelevant
5780
+ case GGML_OP_CPY: // gradients in CPY target are irrelevant
5642
5781
  case GGML_OP_GET_ROWS: // row indices not differentiable
5643
5782
  case GGML_OP_GET_ROWS_BACK: // same as for GET_ROWS
5644
5783
  case GGML_OP_ROPE: // positions not differentiable
@@ -5665,9 +5804,12 @@ void ggml_build_backward_expand(
5665
5804
  node->op == GGML_OP_RESHAPE || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_TRANSPOSE);
5666
5805
 
5667
5806
  const size_t igrad = ggml_hash_find(&cgraph->visited_hash_set, node);
5807
+ GGML_ASSERT(igrad != GGML_HASHSET_FULL);
5808
+ GGML_ASSERT(ggml_bitset_get(cgraph->visited_hash_set.used, igrad));
5668
5809
  if ((accumulate && (node->flags & GGML_TENSOR_FLAG_PARAM)) || (node->flags & GGML_TENSOR_FLAG_LOSS)) {
5669
- cgraph->grads[igrad] = ggml_dup_tensor(ctx_static, node);
5670
- cgraph->grad_accs[igrad] = cgraph->grads[igrad];
5810
+ cgraph->grad_accs[igrad] = ggml_dup_tensor(ctx_static, node);
5811
+ cgraph->grads[igrad] = cgraph->grad_accs[igrad];
5812
+ ggml_format_name(cgraph->grad_accs[igrad], "grad acc for %s", node->name);
5671
5813
  }
5672
5814
  grads_needed[igrad] = true;
5673
5815
  }
@@ -5761,15 +5903,15 @@ struct ggml_cgraph * ggml_new_graph(struct ggml_context * ctx) {
5761
5903
 
5762
5904
  struct ggml_cgraph ggml_graph_view(struct ggml_cgraph * cgraph0, int i0, int i1) {
5763
5905
  struct ggml_cgraph cgraph = {
5764
- /*.size =*/ 0,
5765
- /*.n_nodes =*/ i1 - i0,
5766
- /*.n_leafs =*/ 0,
5767
- /*.nodes =*/ cgraph0->nodes + i0,
5768
- /*.grads =*/ cgraph0->grads ? cgraph0->grads + i0 : NULL,
5769
- /*.grad_accs =*/ cgraph0->grad_accs ? cgraph0->grad_accs + i0 : NULL,
5770
- /*.leafs =*/ NULL,
5771
- /*.hash_table =*/ { 0, NULL, NULL },
5772
- /*.order =*/ cgraph0->order,
5906
+ /*.size =*/ 0,
5907
+ /*.n_nodes =*/ i1 - i0,
5908
+ /*.n_leafs =*/ 0,
5909
+ /*.nodes =*/ cgraph0->nodes + i0,
5910
+ /*.grads =*/ NULL, // gradients would need visited_hash_set
5911
+ /*.grad_accs =*/ NULL,
5912
+ /*.leafs =*/ NULL,
5913
+ /*.visited_hash_set =*/ { 0, NULL, NULL },
5914
+ /*.order =*/ cgraph0->order,
5773
5915
  };
5774
5916
 
5775
5917
  return cgraph;
@@ -5799,12 +5941,22 @@ void ggml_graph_cpy(struct ggml_cgraph * src, struct ggml_cgraph * dst) {
5799
5941
  }
5800
5942
  }
5801
5943
 
5944
+ if (dst->grads) {
5945
+ memset(dst->grads, 0, dst->visited_hash_set.size*sizeof(struct ggml_tensor *));
5946
+ memset(dst->grad_accs, 0, dst->visited_hash_set.size*sizeof(struct ggml_tensor *));
5947
+ }
5802
5948
  if (src->grads) {
5803
5949
  GGML_ASSERT(dst->grads != NULL);
5804
5950
  GGML_ASSERT(dst->grad_accs != NULL);
5805
5951
  for (int i = 0; i < src->n_nodes; ++i) {
5806
5952
  const size_t igrad_src = ggml_hash_find(&src->visited_hash_set, src->nodes[i]);
5807
5953
  const size_t igrad_dst = ggml_hash_find(&dst->visited_hash_set, dst->nodes[i]);
5954
+
5955
+ GGML_ASSERT(igrad_src != GGML_HASHSET_FULL);
5956
+ GGML_ASSERT(ggml_bitset_get(src->visited_hash_set.used, igrad_src));
5957
+ GGML_ASSERT(igrad_dst != GGML_HASHSET_FULL);
5958
+ GGML_ASSERT(ggml_bitset_get(dst->visited_hash_set.used, igrad_dst));
5959
+
5808
5960
  dst->grads[igrad_dst] = src->grads[igrad_src];
5809
5961
  dst->grad_accs[igrad_dst] = src->grad_accs[igrad_src];
5810
5962
  }
@@ -5839,12 +5991,8 @@ void ggml_graph_reset(struct ggml_cgraph * cgraph) {
5839
5991
 
5840
5992
  if (node->op == GGML_OP_OPT_STEP_ADAMW) {
5841
5993
  // clear momenta
5842
- if (node->src[2]->data) {
5843
- ggml_set_zero(node->src[2]);
5844
- }
5845
- if (node->src[3]->data) {
5846
- ggml_set_zero(node->src[3]);
5847
- }
5994
+ ggml_set_zero(node->src[2]);
5995
+ ggml_set_zero(node->src[3]);
5848
5996
  }
5849
5997
 
5850
5998
  // initial gradients of loss should be 1, 0 otherwise
@@ -5923,12 +6071,12 @@ struct ggml_tensor * ggml_graph_get_tensor(const struct ggml_cgraph * cgraph, co
5923
6071
 
5924
6072
  struct ggml_tensor * ggml_graph_get_grad(const struct ggml_cgraph * cgraph, const struct ggml_tensor * node) {
5925
6073
  const size_t igrad = ggml_hash_find(&cgraph->visited_hash_set, node);
5926
- return igrad != GGML_HASHSET_FULL && ggml_bitset_get(cgraph->visited_hash_set.used, igrad) ? cgraph->grads[igrad] : NULL;
6074
+ return igrad != GGML_HASHSET_FULL && ggml_bitset_get(cgraph->visited_hash_set.used, igrad) && cgraph->grads ? cgraph->grads[igrad] : NULL;
5927
6075
  }
5928
6076
 
5929
6077
  struct ggml_tensor * ggml_graph_get_grad_acc(const struct ggml_cgraph * cgraph, const struct ggml_tensor * node) {
5930
6078
  const size_t igrad = ggml_hash_find(&cgraph->visited_hash_set, node);
5931
- return igrad != GGML_HASHSET_FULL && ggml_bitset_get(cgraph->visited_hash_set.used, igrad) ? cgraph->grad_accs[igrad] : NULL;
6079
+ return igrad != GGML_HASHSET_FULL && ggml_bitset_get(cgraph->visited_hash_set.used, igrad) && cgraph->grad_accs ? cgraph->grad_accs[igrad] : NULL;
5932
6080
  }
5933
6081
 
5934
6082
  void ggml_graph_print(const struct ggml_cgraph * cgraph) {
@@ -6240,9 +6388,6 @@ size_t ggml_quantize_chunk(
6240
6388
  case GGML_TYPE_IQ1_M: result = quantize_iq1_m (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
6241
6389
  case GGML_TYPE_IQ4_NL: result = quantize_iq4_nl (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
6242
6390
  case GGML_TYPE_IQ4_XS: result = quantize_iq4_xs (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
6243
- case GGML_TYPE_Q4_0_4_4: result = quantize_q4_0_4x4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
6244
- case GGML_TYPE_Q4_0_4_8: result = quantize_q4_0_4x8(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
6245
- case GGML_TYPE_Q4_0_8_8: result = quantize_q4_0_8x8(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
6246
6391
  case GGML_TYPE_F16:
6247
6392
  {
6248
6393
  size_t elemsize = sizeof(ggml_fp16_t);
@@ -6378,7 +6523,7 @@ struct gguf_context {
6378
6523
  void * data;
6379
6524
  };
6380
6525
 
6381
- static size_t gguf_type_size(enum gguf_type type) {
6526
+ size_t gguf_type_size(enum gguf_type type) {
6382
6527
  GGML_ASSERT(0 <= type && type < GGUF_TYPE_COUNT);
6383
6528
  return GGUF_TYPE_SIZE[type];
6384
6529
  }
@@ -6506,13 +6651,7 @@ struct gguf_context * gguf_init_empty(void) {
6506
6651
  return ctx;
6507
6652
  }
6508
6653
 
6509
- struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params) {
6510
- FILE * file = ggml_fopen(fname, "rb");
6511
- if (!file) {
6512
- fprintf(stderr, "%s: failed to open '%s': '%s'\n", __func__, fname, strerror(errno));
6513
- return NULL;
6514
- }
6515
-
6654
+ struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_params params) {
6516
6655
  // offset from start of file
6517
6656
  size_t offset = 0;
6518
6657
 
@@ -6525,7 +6664,6 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
6525
6664
  for (uint32_t i = 0; i < sizeof(magic); i++) {
6526
6665
  if (magic[i] != GGUF_MAGIC[i]) {
6527
6666
  fprintf(stderr, "%s: invalid magic characters '%c%c%c%c'\n", __func__, magic[0], magic[1], magic[2], magic[3]);
6528
- fclose(file);
6529
6667
  return NULL;
6530
6668
  }
6531
6669
  }
@@ -6536,7 +6674,6 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
6536
6674
  struct gguf_context * ctx = calloc(1, sizeof(struct gguf_context));
6537
6675
  if (!ctx) {
6538
6676
  fprintf(stderr, "%s: failed to allocate memory for context\n", __func__);
6539
- fclose(file);
6540
6677
  return NULL;
6541
6678
  }
6542
6679
 
@@ -6554,7 +6691,6 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
6554
6691
 
6555
6692
  if (ctx->header.version == 1) {
6556
6693
  fprintf(stderr, "%s: GGUFv1 is no longer supported. please use a more up-to-date version\n", __func__);
6557
- fclose(file);
6558
6694
  gguf_free(ctx);
6559
6695
  return NULL;
6560
6696
  }
@@ -6567,7 +6703,6 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
6567
6703
 
6568
6704
  if (!ok) {
6569
6705
  fprintf(stderr, "%s: failed to read header\n", __func__);
6570
- fclose(file);
6571
6706
  gguf_free(ctx);
6572
6707
  return NULL;
6573
6708
  }
@@ -6577,12 +6712,13 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
6577
6712
  {
6578
6713
  const uint64_t n_kv = ctx->header.n_kv;
6579
6714
 
6580
- ctx->kv = calloc(n_kv, sizeof(struct gguf_kv));
6581
- if (!ctx->kv) {
6582
- fprintf(stderr, "%s: failed to allocate memory for kv pairs\n", __func__);
6583
- fclose(file);
6584
- gguf_free(ctx);
6585
- return NULL;
6715
+ if (n_kv > 0) {
6716
+ ctx->kv = calloc(n_kv, sizeof(struct gguf_kv));
6717
+ if (!ctx->kv) {
6718
+ fprintf(stderr, "%s: failed to allocate memory for kv pairs\n", __func__);
6719
+ gguf_free(ctx);
6720
+ return NULL;
6721
+ }
6586
6722
  }
6587
6723
 
6588
6724
  for (uint64_t i = 0; i < n_kv; ++i) {
@@ -6629,7 +6765,6 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
6629
6765
  // prevent from integer overflow in the malloc below
6630
6766
  if (kv->value.arr.n >= SIZE_MAX/gguf_type_size(kv->value.arr.type)) {
6631
6767
  fprintf(stderr, "%s: array size is too large (%" PRIu64 ")\n", __func__, kv->value.arr.n);
6632
- fclose(file);
6633
6768
  gguf_free(ctx);
6634
6769
  return NULL;
6635
6770
  }
@@ -6637,7 +6772,6 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
6637
6772
  kv->value.arr.data = calloc(kv->value.arr.n, gguf_type_size(kv->value.arr.type));
6638
6773
  if (!kv->value.arr.data) {
6639
6774
  fprintf(stderr, "%s: failed to allocate memory for array\n", __func__);
6640
- fclose(file);
6641
6775
  gguf_free(ctx);
6642
6776
  return NULL;
6643
6777
  }
@@ -6649,7 +6783,6 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
6649
6783
  // prevent from integer overflow in the malloc below
6650
6784
  if (kv->value.arr.n >= SIZE_MAX/sizeof(struct gguf_str)) {
6651
6785
  fprintf(stderr, "%s: array size is too large (%" PRIu64 ")\n", __func__, kv->value.arr.n);
6652
- fclose(file);
6653
6786
  gguf_free(ctx);
6654
6787
  return NULL;
6655
6788
  }
@@ -6657,7 +6790,6 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
6657
6790
  kv->value.arr.data = calloc(kv->value.arr.n, sizeof(struct gguf_str));
6658
6791
  if (!kv->value.arr.data) {
6659
6792
  fprintf(stderr, "%s: failed to allocate memory for array\n", __func__);
6660
- fclose(file);
6661
6793
  gguf_free(ctx);
6662
6794
  return NULL;
6663
6795
  }
@@ -6688,7 +6820,6 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
6688
6820
 
6689
6821
  if (!ok) {
6690
6822
  fprintf(stderr, "%s: failed to read key-value pairs\n", __func__);
6691
- fclose(file);
6692
6823
  gguf_free(ctx);
6693
6824
  return NULL;
6694
6825
  }
@@ -6699,7 +6830,6 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
6699
6830
  ctx->infos = calloc(ctx->header.n_tensors, sizeof(struct gguf_tensor_info));
6700
6831
  if (!ctx->infos) {
6701
6832
  fprintf(stderr, "%s: failed to allocate memory for tensor infos\n", __func__);
6702
- fclose(file);
6703
6833
  gguf_free(ctx);
6704
6834
  return NULL;
6705
6835
  }
@@ -6735,7 +6865,6 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
6735
6865
 
6736
6866
  if (!ok) {
6737
6867
  fprintf(stderr, "%s: failed to read tensor info\n", __func__);
6738
- fclose(file);
6739
6868
  gguf_free(ctx);
6740
6869
  return NULL;
6741
6870
  }
@@ -6774,10 +6903,17 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
6774
6903
  (int64_t) info->ne[2] *
6775
6904
  (int64_t) info->ne[3];
6776
6905
 
6777
- if (ggml_blck_size(info->type) == 0 || ne % ggml_blck_size(info->type) != 0) {
6906
+ if (ggml_blck_size(info->type) == 0 ) {
6907
+ // support for this tensor type has been removed:
6908
+ fprintf(stderr, "%s: tensor '%s' of type %d: %s\n",
6909
+ __func__, info->name.data, (int) info->type, ggml_type_name(info->type));
6910
+ gguf_free(ctx);
6911
+ return NULL;
6912
+ }
6913
+
6914
+ if (ne % ggml_blck_size(info->type) != 0) {
6778
6915
  fprintf(stderr, "%s: tensor '%s' of type %d (%s) number of elements (%" PRId64 ") is not a multiple of block size (%" PRId64 ")\n",
6779
6916
  __func__, info->name.data, (int) info->type, ggml_type_name(info->type), ne, ggml_blck_size(info->type));
6780
- fclose(file);
6781
6917
  gguf_free(ctx);
6782
6918
  return NULL;
6783
6919
  }
@@ -6809,7 +6945,6 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
6809
6945
  *params.ctx = ggml_init(pdata);
6810
6946
  if (*params.ctx == NULL) {
6811
6947
  fprintf(stderr, "%s: failed to initialize context\n", __func__);
6812
- fclose(file);
6813
6948
  gguf_free(ctx);
6814
6949
  return NULL;
6815
6950
  }
@@ -6828,7 +6963,6 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
6828
6963
 
6829
6964
  if (!ok) {
6830
6965
  fprintf(stderr, "%s: failed to read tensor data\n", __func__);
6831
- fclose(file);
6832
6966
  ggml_free(ctx_data);
6833
6967
  gguf_free(ctx);
6834
6968
  return NULL;
@@ -6867,7 +7001,6 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
6867
7001
 
6868
7002
  if (!ok) {
6869
7003
  fprintf(stderr, "%s: failed to read the tensor data\n", __func__);
6870
- fclose(file);
6871
7004
  ggml_free(ctx_data);
6872
7005
  gguf_free(ctx);
6873
7006
  return NULL;
@@ -6876,11 +7009,21 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
6876
7009
  ggml_set_no_alloc(ctx_data, params.no_alloc);
6877
7010
  }
6878
7011
 
6879
- fclose(file);
6880
-
6881
7012
  return ctx;
6882
7013
  }
6883
7014
 
7015
+ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params) {
7016
+ FILE * file = ggml_fopen(fname, "rb");
7017
+ if (!file) {
7018
+ fprintf(stderr, "%s: failed to open '%s': '%s'\n", __func__, fname, strerror(errno));
7019
+ return NULL;
7020
+ }
7021
+
7022
+ struct gguf_context * result = gguf_init_from_file_impl(file, params);
7023
+ fclose(file);
7024
+ return result;
7025
+ }
7026
+
6884
7027
  void gguf_free(struct gguf_context * ctx) {
6885
7028
  if (ctx == NULL) {
6886
7029
  return;
@@ -7340,13 +7483,7 @@ void gguf_set_tensor_data(struct gguf_context * ctx, const char * name, const vo
7340
7483
  // fwrite(val, sizeof(char), size, file);
7341
7484
  //}
7342
7485
 
7343
- struct gguf_buf {
7344
- void * data;
7345
- size_t size;
7346
- size_t offset;
7347
- };
7348
-
7349
- static struct gguf_buf gguf_buf_init(size_t size) {
7486
+ struct gguf_buf gguf_buf_init(size_t size) {
7350
7487
  struct gguf_buf buf = {
7351
7488
  /*buf.data =*/ size == 0 ? NULL : GGML_CALLOC(1, size),
7352
7489
  /*buf.size =*/ size,
@@ -7356,7 +7493,7 @@ static struct gguf_buf gguf_buf_init(size_t size) {
7356
7493
  return buf;
7357
7494
  }
7358
7495
 
7359
- static void gguf_buf_free(struct gguf_buf buf) {
7496
+ void gguf_buf_free(struct gguf_buf buf) {
7360
7497
  if (buf.data) {
7361
7498
  GGML_FREE(buf.data);
7362
7499
  }
@@ -7394,7 +7531,7 @@ static void gguf_bwrite_el(struct gguf_buf * buf, const void * val, size_t el_si
7394
7531
  buf->offset += el_size;
7395
7532
  }
7396
7533
 
7397
- static void gguf_write_to_buf(const struct gguf_context * ctx, struct gguf_buf * buf, bool only_meta) {
7534
+ void gguf_write_to_buf(const struct gguf_context * ctx, struct gguf_buf * buf, bool only_meta) {
7398
7535
  // write header
7399
7536
  gguf_bwrite_el(buf, &ctx->header.magic, sizeof(ctx->header.magic));
7400
7537
  gguf_bwrite_el(buf, &ctx->header.version, sizeof(ctx->header.version));
@@ -7549,3 +7686,26 @@ void ggml_log_set(ggml_log_callback log_callback, void * user_data) {
7549
7686
  g_logger_state.log_callback = log_callback ? log_callback : ggml_log_callback_default;
7550
7687
  g_logger_state.log_callback_user_data = user_data;
7551
7688
  }
7689
+
7690
+ void ggml_threadpool_params_init(struct ggml_threadpool_params * p, int n_threads) {
7691
+ p->n_threads = n_threads;
7692
+ p->prio = 0; // default priority (usually means normal or inherited)
7693
+ p->poll = 50; // hybrid-polling enabled
7694
+ p->strict_cpu = false; // no strict placement (all threads share same cpumask)
7695
+ p->paused = false; // threads are ready to go
7696
+ memset(p->cpumask, 0, GGML_MAX_N_THREADS); // all-zero means use the default affinity (usually inherited)
7697
+ }
7698
+
7699
+ struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads) {
7700
+ struct ggml_threadpool_params p;
7701
+ ggml_threadpool_params_init(&p, n_threads);
7702
+ return p;
7703
+ }
7704
+
7705
+ bool ggml_threadpool_params_match(const struct ggml_threadpool_params * p0, const struct ggml_threadpool_params * p1) {
7706
+ if (p0->n_threads != p1->n_threads ) return false;
7707
+ if (p0->prio != p1->prio ) return false;
7708
+ if (p0->poll != p1->poll ) return false;
7709
+ if (p0->strict_cpu != p1->strict_cpu ) return false;
7710
+ return memcmp(p0->cpumask, p1->cpumask, GGML_MAX_N_THREADS) == 0;
7711
+ }