@novastera-oss/llamarn 0.2.7 → 0.2.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (186)
  1. package/android/src/main/cpp/include/llama.h +8 -3
  2. package/android/src/main/jniLibs/arm64-v8a/libggml-base.so +0 -0
  3. package/android/src/main/jniLibs/arm64-v8a/libggml-cpu.so +0 -0
  4. package/android/src/main/jniLibs/arm64-v8a/libggml.so +0 -0
  5. package/android/src/main/jniLibs/arm64-v8a/libllama.so +0 -0
  6. package/android/src/main/jniLibs/x86_64/libggml-base.so +0 -0
  7. package/android/src/main/jniLibs/x86_64/libggml-cpu.so +0 -0
  8. package/android/src/main/jniLibs/x86_64/libggml.so +0 -0
  9. package/android/src/main/jniLibs/x86_64/libllama.so +0 -0
  10. package/cpp/LlamaCppModel.cpp +56 -22
  11. package/cpp/build-info.cpp +2 -2
  12. package/cpp/llama.cpp/CMakeLists.txt +1 -1
  13. package/cpp/llama.cpp/common/arg.cpp +7 -0
  14. package/cpp/llama.cpp/common/common.cpp +3 -0
  15. package/cpp/llama.cpp/common/common.h +1 -0
  16. package/cpp/llama.cpp/common/json-schema-to-grammar.cpp +3 -46
  17. package/cpp/llama.cpp/convert_hf_to_gguf.py +118 -20
  18. package/cpp/llama.cpp/ggml/CMakeLists.txt +1 -0
  19. package/cpp/llama.cpp/ggml/include/ggml-cpu.h +2 -0
  20. package/cpp/llama.cpp/ggml/include/ggml.h +33 -0
  21. package/cpp/llama.cpp/ggml/src/CMakeLists.txt +17 -0
  22. package/cpp/llama.cpp/ggml/src/ggml-cann/common.h +1 -1
  23. package/cpp/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +31 -2
  24. package/cpp/llama.cpp/ggml/src/ggml-cpu/amx/mmq.cpp +10 -9
  25. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +109 -108
  26. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +1027 -1038
  27. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +53 -52
  28. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/cpu-feats.cpp +82 -0
  29. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/quants.c +56 -55
  30. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +42 -41
  31. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/repack.cpp +24 -23
  32. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +29 -28
  33. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/wasm/quants.c +30 -29
  34. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/x86/quants.c +83 -82
  35. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp +20 -19
  36. package/cpp/llama.cpp/ggml/src/ggml-cpu/common.h +3 -2
  37. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +9 -3
  38. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +83 -102
  39. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +4 -0
  40. package/cpp/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +3 -2
  41. package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.cpp +192 -67
  42. package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.h +2 -0
  43. package/cpp/llama.cpp/ggml/src/ggml-cpu/quants.c +25 -24
  44. package/cpp/llama.cpp/ggml/src/ggml-cpu/repack.cpp +56 -40
  45. package/cpp/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +211 -33
  46. package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.cpp +2 -2
  47. package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.h +45 -45
  48. package/cpp/llama.cpp/ggml/src/ggml-cuda/common.cuh +54 -29
  49. package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-dw.cu +161 -0
  50. package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-dw.cuh +5 -0
  51. package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-transpose.cu +91 -0
  52. package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-transpose.cuh +4 -0
  53. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-wmma-f16.cu +4 -0
  54. package/cpp/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +84 -31
  55. package/cpp/llama.cpp/ggml/src/ggml-cuda/mean.cu +19 -0
  56. package/cpp/llama.cpp/ggml/src/ggml-cuda/mean.cuh +3 -0
  57. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmv.cu +257 -87
  58. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmv.cuh +2 -3
  59. package/cpp/llama.cpp/ggml/src/ggml-cuda/sumrows.cu +5 -18
  60. package/cpp/llama.cpp/ggml/src/ggml-cuda/sumrows.cuh +0 -1
  61. package/cpp/llama.cpp/ggml/src/ggml-impl.h +61 -183
  62. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +16 -0
  63. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.m +227 -41
  64. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.metal +362 -182
  65. package/cpp/llama.cpp/ggml/src/ggml-musa/mudnn.cuh +2 -2
  66. package/cpp/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +240 -535
  67. package/cpp/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +5 -6
  68. package/cpp/llama.cpp/ggml/src/ggml-sycl/common.hpp +1 -24
  69. package/cpp/llama.cpp/ggml/src/ggml-sycl/concat.cpp +28 -41
  70. package/cpp/llama.cpp/ggml/src/ggml-sycl/conv.cpp +4 -10
  71. package/cpp/llama.cpp/ggml/src/ggml-sycl/convert.cpp +99 -166
  72. package/cpp/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +94 -72
  73. package/cpp/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +49 -67
  74. package/cpp/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +31 -1
  75. package/cpp/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +99 -159
  76. package/cpp/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +6 -9
  77. package/cpp/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +45 -54
  78. package/cpp/llama.cpp/ggml/src/ggml-sycl/gla.cpp +2 -2
  79. package/cpp/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +1 -1
  80. package/cpp/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +60 -80
  81. package/cpp/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +132 -201
  82. package/cpp/llama.cpp/ggml/src/ggml-sycl/norm.cpp +55 -74
  83. package/cpp/llama.cpp/ggml/src/ggml-sycl/rope.cpp +24 -20
  84. package/cpp/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +3 -3
  85. package/cpp/llama.cpp/ggml/src/ggml-sycl/sycl_hw.cpp +3 -1
  86. package/cpp/llama.cpp/ggml/src/ggml-sycl/sycl_hw.hpp +3 -0
  87. package/cpp/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +3 -8
  88. package/cpp/llama.cpp/ggml/src/ggml-sycl/wkv.cpp +12 -16
  89. package/cpp/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +12 -1
  90. package/cpp/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +57 -1
  91. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +4 -0
  92. package/cpp/llama.cpp/ggml/src/ggml.c +69 -13
  93. package/cpp/llama.cpp/ggml/src/gguf.cpp +5 -1
  94. package/cpp/llama.cpp/gguf-py/gguf/constants.py +76 -0
  95. package/cpp/llama.cpp/gguf-py/gguf/gguf_writer.py +21 -0
  96. package/cpp/llama.cpp/gguf-py/gguf/tensor_mapping.py +64 -0
  97. package/cpp/llama.cpp/gguf-py/gguf/vocab.py +97 -4
  98. package/cpp/llama.cpp/gguf-py/pyproject.toml +2 -2
  99. package/cpp/llama.cpp/include/llama.h +8 -3
  100. package/cpp/llama.cpp/models/templates/Mistral-Small-3.2-24B-Instruct-2506.jinja +124 -0
  101. package/cpp/llama.cpp/src/llama-arch.cpp +55 -0
  102. package/cpp/llama.cpp/src/llama-arch.h +18 -0
  103. package/cpp/llama.cpp/src/llama-batch.cpp +570 -359
  104. package/cpp/llama.cpp/src/llama-batch.h +98 -70
  105. package/cpp/llama.cpp/src/llama-chat.cpp +11 -6
  106. package/cpp/llama.cpp/src/llama-context.cpp +101 -107
  107. package/cpp/llama.cpp/src/llama-context.h +13 -13
  108. package/cpp/llama.cpp/src/llama-graph.cpp +199 -252
  109. package/cpp/llama.cpp/src/llama-graph.h +44 -32
  110. package/cpp/llama.cpp/src/llama-hparams.cpp +4 -0
  111. package/cpp/llama.cpp/src/llama-hparams.h +8 -0
  112. package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.cpp +51 -53
  113. package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.h +19 -24
  114. package/cpp/llama.cpp/src/llama-kv-cache-unified.cpp +110 -104
  115. package/cpp/llama.cpp/src/llama-kv-cache-unified.h +17 -22
  116. package/cpp/llama.cpp/src/llama-kv-cells.h +35 -11
  117. package/cpp/llama.cpp/src/llama-memory-hybrid.cpp +66 -67
  118. package/cpp/llama.cpp/src/llama-memory-hybrid.h +16 -21
  119. package/cpp/llama.cpp/src/llama-memory-recurrent.cpp +69 -68
  120. package/cpp/llama.cpp/src/llama-memory-recurrent.h +15 -20
  121. package/cpp/llama.cpp/src/llama-memory.h +18 -22
  122. package/cpp/llama.cpp/src/llama-model-saver.cpp +1 -0
  123. package/cpp/llama.cpp/src/llama-model.cpp +1006 -472
  124. package/cpp/llama.cpp/src/llama-model.h +22 -0
  125. package/cpp/llama.cpp/src/llama-quant.cpp +87 -5
  126. package/cpp/llama.cpp/src/llama-vocab.cpp +26 -3
  127. package/cpp/llama.cpp/src/llama-vocab.h +1 -0
  128. package/cpp/rn-utils.h +3 -0
  129. package/ios/include/common.h +1 -0
  130. package/ios/include/llama.h +8 -3
  131. package/ios/libs/llama.xcframework/Info.plist +19 -19
  132. package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  133. package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4890 -4863
  134. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml-cpu.h +2 -0
  135. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml.h +33 -0
  136. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/llama.h +8 -3
  137. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/llama +0 -0
  138. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  139. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4861 -4834
  140. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3764 -3742
  141. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +2 -0
  142. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +33 -0
  143. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/llama.h +8 -3
  144. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/llama +0 -0
  145. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  146. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4861 -4834
  147. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3766 -3744
  148. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-cpu.h +2 -0
  149. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml.h +33 -0
  150. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/llama.h +8 -3
  151. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-cpu.h +2 -0
  152. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml.h +33 -0
  153. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/llama.h +8 -3
  154. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/llama +0 -0
  155. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-cpu.h +2 -0
  156. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml.h +33 -0
  157. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/llama.h +8 -3
  158. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/llama +0 -0
  159. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/llama +0 -0
  160. package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  161. package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4890 -4863
  162. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-cpu.h +2 -0
  163. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml.h +33 -0
  164. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/llama.h +8 -3
  165. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/llama +0 -0
  166. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  167. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4861 -4834
  168. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3764 -3742
  169. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +2 -0
  170. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +33 -0
  171. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/llama.h +8 -3
  172. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/llama +0 -0
  173. package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  174. package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4926 -4900
  175. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-cpu.h +2 -0
  176. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml.h +33 -0
  177. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/llama.h +8 -3
  178. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/llama +0 -0
  179. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  180. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4897 -4871
  181. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3794 -3773
  182. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +2 -0
  183. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +33 -0
  184. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/llama.h +8 -3
  185. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/llama +0 -0
  186. package/package.json +1 -1
@@ -101,6 +101,7 @@ extern "C" {
101
101
  GGML_BACKEND_API int ggml_cpu_has_riscv_v (void);
102
102
  GGML_BACKEND_API int ggml_cpu_has_vsx (void);
103
103
  GGML_BACKEND_API int ggml_cpu_has_vxe (void);
104
+ GGML_BACKEND_API int ggml_cpu_has_nnpa (void);
104
105
  GGML_BACKEND_API int ggml_cpu_has_wasm_simd (void);
105
106
  GGML_BACKEND_API int ggml_cpu_has_llamafile (void);
106
107
 
@@ -133,6 +134,7 @@ extern "C" {
133
134
 
134
135
  GGML_BACKEND_API ggml_backend_reg_t ggml_backend_cpu_reg(void);
135
136
 
137
+ GGML_BACKEND_API void ggml_cpu_fp32_to_fp32(const float *, float *, int64_t);
136
138
  GGML_BACKEND_API void ggml_cpu_fp32_to_fp16(const float *, ggml_fp16_t *, int64_t);
137
139
  GGML_BACKEND_API void ggml_cpu_fp16_to_fp32(const ggml_fp16_t *, float *, int64_t);
138
140
  GGML_BACKEND_API void ggml_cpu_fp32_to_bf16(const float *, ggml_bf16_t *, int64_t);
@@ -470,6 +470,7 @@ extern "C" {
470
470
  GGML_OP_TRANSPOSE,
471
471
  GGML_OP_GET_ROWS,
472
472
  GGML_OP_GET_ROWS_BACK,
473
+ GGML_OP_SET_ROWS,
473
474
  GGML_OP_DIAG,
474
475
  GGML_OP_DIAG_MASK_INF,
475
476
  GGML_OP_DIAG_MASK_ZERO,
@@ -489,6 +490,7 @@ extern "C" {
489
490
  GGML_OP_UPSCALE, // nearest interpolate
490
491
  GGML_OP_PAD,
491
492
  GGML_OP_PAD_REFLECT_1D,
493
+ GGML_OP_ROLL,
492
494
  GGML_OP_ARANGE,
493
495
  GGML_OP_TIMESTEP_EMBEDDING,
494
496
  GGML_OP_ARGSORT,
@@ -686,6 +688,9 @@ extern "C" {
686
688
  // true for tensor that is stored in memory as CxWxHxN and has been permuted to WxHxCxN
687
689
  GGML_API bool ggml_is_contiguous_channels(const struct ggml_tensor * tensor);
688
690
 
691
+ // true if the elements in dimension 0 are contiguous, or there is just 1 block of elements
692
+ GGML_API bool ggml_is_contiguous_rows(const struct ggml_tensor * tensor);
693
+
689
694
  GGML_API bool ggml_are_same_shape (const struct ggml_tensor * t0, const struct ggml_tensor * t1);
690
695
  GGML_API bool ggml_are_same_stride(const struct ggml_tensor * t0, const struct ggml_tensor * t1);
691
696
 
@@ -1374,6 +1379,23 @@ extern "C" {
1374
1379
  struct ggml_tensor * b, // row indices
1375
1380
  struct ggml_tensor * c); // data for ggml_get_rows, only used for its shape
1376
1381
 
1382
+ // a TD [n_embd, ne1, ne2, ne3]
1383
+ // b TS [n_embd, n_rows, ne02, ne03] | ne02 == ne2, ne03 == ne3
1384
+ // c I64 [n_rows, ne11, ne12, 1] | c[i] in [0, ne1)
1385
+ //
1386
+ // undefined behavior if destination rows overlap
1387
+ //
1388
+ // broadcast:
1389
+ // ne2 % ne11 == 0
1390
+ // ne3 % ne12 == 0
1391
+ //
1392
+ // return view(a)
1393
+ GGML_API struct ggml_tensor * ggml_set_rows(
1394
+ struct ggml_context * ctx,
1395
+ struct ggml_tensor * a, // destination
1396
+ struct ggml_tensor * b, // source
1397
+ struct ggml_tensor * c); // row indices
1398
+
1377
1399
  GGML_API struct ggml_tensor * ggml_diag(
1378
1400
  struct ggml_context * ctx,
1379
1401
  struct ggml_tensor * a);
@@ -1801,6 +1823,17 @@ extern "C" {
1801
1823
  int p0,
1802
1824
  int p1);
1803
1825
 
1826
+ // Move tensor elements by an offset given for each dimension. Elements that
1827
+ // are shifted beyond the last position are wrapped around to the beginning.
1828
+ GGML_API struct ggml_tensor * ggml_roll(
1829
+ struct ggml_context * ctx,
1830
+ struct ggml_tensor * a,
1831
+ int shift0,
1832
+ int shift1,
1833
+ int shift2,
1834
+ int shift3);
1835
+
1836
+
1804
1837
  // Ref: https://github.com/CompVis/stable-diffusion/blob/main/ldm/modules/diffusionmodules/util.py#L151
1805
1838
  // timesteps: [N,]
1806
1839
  // return: [N, dim]
@@ -390,6 +390,7 @@ extern "C" {
390
390
  void * imatrix; // pointer to importance matrix data
391
391
  void * kv_overrides; // pointer to vector containing overrides
392
392
  void * tensor_types; // pointer to vector containing tensor types
393
+ void * prune_layers; // pointer to vector containing layer indices to prune
393
394
  } llama_model_quantize_params;
394
395
 
395
396
  typedef struct llama_logit_bias {
@@ -943,12 +944,14 @@ extern "C" {
943
944
  // Requires the context to have a memory.
944
945
  // For encode-decoder contexts, processes the batch using the decoder.
945
946
  // Positive return values do not mean a fatal error, but rather a warning.
946
- // Upon non-zero return values, the memory state is restored to the state before this call
947
+ // Upon fatal-error or abort, the ubatches that managed to be processed will remain in the memory state of the context
948
+ // To handle this correctly, query the memory state using llama_memory_seq_pos_min() and llama_memory_seq_pos_max()
949
+ // Upon other return values, the memory state is restored to the state before this call
947
950
  // 0 - success
948
951
  // 1 - could not find a KV slot for the batch (try reducing the size of the batch or increase the context)
949
- // 2 - aborted
952
+ // 2 - aborted (processed ubatches will remain in the context's memory)
950
953
  // -1 - invalid input batch
951
- // < -1 - error
954
+ // < -1 - fatal error (processed ubatches will remain in the context's memory)
952
955
  LLAMA_API int32_t llama_decode(
953
956
  struct llama_context * ctx,
954
957
  struct llama_batch batch);
@@ -1044,6 +1047,7 @@ extern "C" {
1044
1047
 
1045
1048
  LLAMA_API bool llama_vocab_get_add_bos(const struct llama_vocab * vocab);
1046
1049
  LLAMA_API bool llama_vocab_get_add_eos(const struct llama_vocab * vocab);
1050
+ LLAMA_API bool llama_vocab_get_add_sep(const struct llama_vocab * vocab);
1047
1051
 
1048
1052
  LLAMA_API llama_token llama_vocab_fim_pre(const struct llama_vocab * vocab);
1049
1053
  LLAMA_API llama_token llama_vocab_fim_suf(const struct llama_vocab * vocab);
@@ -1087,6 +1091,7 @@ extern "C" {
1087
1091
  /// @param tokens The tokens pointer must be large enough to hold the resulting tokens.
1088
1092
  /// @return Returns the number of tokens on success, no more than n_tokens_max
1089
1093
  /// @return Returns a negative number on failure - the number of tokens that would have been returned
1094
+ /// @return Returns INT32_MIN on overflow (e.g., tokenization result size exceeds int32_t limit)
1090
1095
  /// @param add_special Allow to add BOS and EOS tokens if model is configured to do so.
1091
1096
  /// @param parse_special Allow tokenizing special and/or control tokens which otherwise are not exposed and treated
1092
1097
  /// as plaintext. Does not insert a leading space.
@@ -101,6 +101,7 @@ extern "C" {
101
101
  GGML_BACKEND_API int ggml_cpu_has_riscv_v (void);
102
102
  GGML_BACKEND_API int ggml_cpu_has_vsx (void);
103
103
  GGML_BACKEND_API int ggml_cpu_has_vxe (void);
104
+ GGML_BACKEND_API int ggml_cpu_has_nnpa (void);
104
105
  GGML_BACKEND_API int ggml_cpu_has_wasm_simd (void);
105
106
  GGML_BACKEND_API int ggml_cpu_has_llamafile (void);
106
107
 
@@ -133,6 +134,7 @@ extern "C" {
133
134
 
134
135
  GGML_BACKEND_API ggml_backend_reg_t ggml_backend_cpu_reg(void);
135
136
 
137
+ GGML_BACKEND_API void ggml_cpu_fp32_to_fp32(const float *, float *, int64_t);
136
138
  GGML_BACKEND_API void ggml_cpu_fp32_to_fp16(const float *, ggml_fp16_t *, int64_t);
137
139
  GGML_BACKEND_API void ggml_cpu_fp16_to_fp32(const ggml_fp16_t *, float *, int64_t);
138
140
  GGML_BACKEND_API void ggml_cpu_fp32_to_bf16(const float *, ggml_bf16_t *, int64_t);
@@ -470,6 +470,7 @@ extern "C" {
470
470
  GGML_OP_TRANSPOSE,
471
471
  GGML_OP_GET_ROWS,
472
472
  GGML_OP_GET_ROWS_BACK,
473
+ GGML_OP_SET_ROWS,
473
474
  GGML_OP_DIAG,
474
475
  GGML_OP_DIAG_MASK_INF,
475
476
  GGML_OP_DIAG_MASK_ZERO,
@@ -489,6 +490,7 @@ extern "C" {
489
490
  GGML_OP_UPSCALE, // nearest interpolate
490
491
  GGML_OP_PAD,
491
492
  GGML_OP_PAD_REFLECT_1D,
493
+ GGML_OP_ROLL,
492
494
  GGML_OP_ARANGE,
493
495
  GGML_OP_TIMESTEP_EMBEDDING,
494
496
  GGML_OP_ARGSORT,
@@ -686,6 +688,9 @@ extern "C" {
686
688
  // true for tensor that is stored in memory as CxWxHxN and has been permuted to WxHxCxN
687
689
  GGML_API bool ggml_is_contiguous_channels(const struct ggml_tensor * tensor);
688
690
 
691
+ // true if the elements in dimension 0 are contiguous, or there is just 1 block of elements
692
+ GGML_API bool ggml_is_contiguous_rows(const struct ggml_tensor * tensor);
693
+
689
694
  GGML_API bool ggml_are_same_shape (const struct ggml_tensor * t0, const struct ggml_tensor * t1);
690
695
  GGML_API bool ggml_are_same_stride(const struct ggml_tensor * t0, const struct ggml_tensor * t1);
691
696
 
@@ -1374,6 +1379,23 @@ extern "C" {
1374
1379
  struct ggml_tensor * b, // row indices
1375
1380
  struct ggml_tensor * c); // data for ggml_get_rows, only used for its shape
1376
1381
 
1382
+ // a TD [n_embd, ne1, ne2, ne3]
1383
+ // b TS [n_embd, n_rows, ne02, ne03] | ne02 == ne2, ne03 == ne3
1384
+ // c I64 [n_rows, ne11, ne12, 1] | c[i] in [0, ne1)
1385
+ //
1386
+ // undefined behavior if destination rows overlap
1387
+ //
1388
+ // broadcast:
1389
+ // ne2 % ne11 == 0
1390
+ // ne3 % ne12 == 0
1391
+ //
1392
+ // return view(a)
1393
+ GGML_API struct ggml_tensor * ggml_set_rows(
1394
+ struct ggml_context * ctx,
1395
+ struct ggml_tensor * a, // destination
1396
+ struct ggml_tensor * b, // source
1397
+ struct ggml_tensor * c); // row indices
1398
+
1377
1399
  GGML_API struct ggml_tensor * ggml_diag(
1378
1400
  struct ggml_context * ctx,
1379
1401
  struct ggml_tensor * a);
@@ -1801,6 +1823,17 @@ extern "C" {
1801
1823
  int p0,
1802
1824
  int p1);
1803
1825
 
1826
+ // Move tensor elements by an offset given for each dimension. Elements that
1827
+ // are shifted beyond the last position are wrapped around to the beginning.
1828
+ GGML_API struct ggml_tensor * ggml_roll(
1829
+ struct ggml_context * ctx,
1830
+ struct ggml_tensor * a,
1831
+ int shift0,
1832
+ int shift1,
1833
+ int shift2,
1834
+ int shift3);
1835
+
1836
+
1804
1837
  // Ref: https://github.com/CompVis/stable-diffusion/blob/main/ldm/modules/diffusionmodules/util.py#L151
1805
1838
  // timesteps: [N,]
1806
1839
  // return: [N, dim]
@@ -390,6 +390,7 @@ extern "C" {
390
390
  void * imatrix; // pointer to importance matrix data
391
391
  void * kv_overrides; // pointer to vector containing overrides
392
392
  void * tensor_types; // pointer to vector containing tensor types
393
+ void * prune_layers; // pointer to vector containing layer indices to prune
393
394
  } llama_model_quantize_params;
394
395
 
395
396
  typedef struct llama_logit_bias {
@@ -943,12 +944,14 @@ extern "C" {
943
944
  // Requires the context to have a memory.
944
945
  // For encode-decoder contexts, processes the batch using the decoder.
945
946
  // Positive return values do not mean a fatal error, but rather a warning.
946
- // Upon non-zero return values, the memory state is restored to the state before this call
947
+ // Upon fatal-error or abort, the ubatches that managed to be processed will remain in the memory state of the context
948
+ // To handle this correctly, query the memory state using llama_memory_seq_pos_min() and llama_memory_seq_pos_max()
949
+ // Upon other return values, the memory state is restored to the state before this call
947
950
  // 0 - success
948
951
  // 1 - could not find a KV slot for the batch (try reducing the size of the batch or increase the context)
949
- // 2 - aborted
952
+ // 2 - aborted (processed ubatches will remain in the context's memory)
950
953
  // -1 - invalid input batch
951
- // < -1 - error
954
+ // < -1 - fatal error (processed ubatches will remain in the context's memory)
952
955
  LLAMA_API int32_t llama_decode(
953
956
  struct llama_context * ctx,
954
957
  struct llama_batch batch);
@@ -1044,6 +1047,7 @@ extern "C" {
1044
1047
 
1045
1048
  LLAMA_API bool llama_vocab_get_add_bos(const struct llama_vocab * vocab);
1046
1049
  LLAMA_API bool llama_vocab_get_add_eos(const struct llama_vocab * vocab);
1050
+ LLAMA_API bool llama_vocab_get_add_sep(const struct llama_vocab * vocab);
1047
1051
 
1048
1052
  LLAMA_API llama_token llama_vocab_fim_pre(const struct llama_vocab * vocab);
1049
1053
  LLAMA_API llama_token llama_vocab_fim_suf(const struct llama_vocab * vocab);
@@ -1087,6 +1091,7 @@ extern "C" {
1087
1091
  /// @param tokens The tokens pointer must be large enough to hold the resulting tokens.
1088
1092
  /// @return Returns the number of tokens on success, no more than n_tokens_max
1089
1093
  /// @return Returns a negative number on failure - the number of tokens that would have been returned
1094
+ /// @return Returns INT32_MIN on overflow (e.g., tokenization result size exceeds int32_t limit)
1090
1095
  /// @param add_special Allow to add BOS and EOS tokens if model is configured to do so.
1091
1096
  /// @param parse_special Allow tokenizing special and/or control tokens which otherwise are not exposed and treated
1092
1097
  /// as plaintext. Does not insert a leading space.
@@ -101,6 +101,7 @@ extern "C" {
101
101
  GGML_BACKEND_API int ggml_cpu_has_riscv_v (void);
102
102
  GGML_BACKEND_API int ggml_cpu_has_vsx (void);
103
103
  GGML_BACKEND_API int ggml_cpu_has_vxe (void);
104
+ GGML_BACKEND_API int ggml_cpu_has_nnpa (void);
104
105
  GGML_BACKEND_API int ggml_cpu_has_wasm_simd (void);
105
106
  GGML_BACKEND_API int ggml_cpu_has_llamafile (void);
106
107
 
@@ -133,6 +134,7 @@ extern "C" {
133
134
 
134
135
  GGML_BACKEND_API ggml_backend_reg_t ggml_backend_cpu_reg(void);
135
136
 
137
+ GGML_BACKEND_API void ggml_cpu_fp32_to_fp32(const float *, float *, int64_t);
136
138
  GGML_BACKEND_API void ggml_cpu_fp32_to_fp16(const float *, ggml_fp16_t *, int64_t);
137
139
  GGML_BACKEND_API void ggml_cpu_fp16_to_fp32(const ggml_fp16_t *, float *, int64_t);
138
140
  GGML_BACKEND_API void ggml_cpu_fp32_to_bf16(const float *, ggml_bf16_t *, int64_t);
@@ -470,6 +470,7 @@ extern "C" {
470
470
  GGML_OP_TRANSPOSE,
471
471
  GGML_OP_GET_ROWS,
472
472
  GGML_OP_GET_ROWS_BACK,
473
+ GGML_OP_SET_ROWS,
473
474
  GGML_OP_DIAG,
474
475
  GGML_OP_DIAG_MASK_INF,
475
476
  GGML_OP_DIAG_MASK_ZERO,
@@ -489,6 +490,7 @@ extern "C" {
489
490
  GGML_OP_UPSCALE, // nearest interpolate
490
491
  GGML_OP_PAD,
491
492
  GGML_OP_PAD_REFLECT_1D,
493
+ GGML_OP_ROLL,
492
494
  GGML_OP_ARANGE,
493
495
  GGML_OP_TIMESTEP_EMBEDDING,
494
496
  GGML_OP_ARGSORT,
@@ -686,6 +688,9 @@ extern "C" {
686
688
  // true for tensor that is stored in memory as CxWxHxN and has been permuted to WxHxCxN
687
689
  GGML_API bool ggml_is_contiguous_channels(const struct ggml_tensor * tensor);
688
690
 
691
+ // true if the elements in dimension 0 are contiguous, or there is just 1 block of elements
692
+ GGML_API bool ggml_is_contiguous_rows(const struct ggml_tensor * tensor);
693
+
689
694
  GGML_API bool ggml_are_same_shape (const struct ggml_tensor * t0, const struct ggml_tensor * t1);
690
695
  GGML_API bool ggml_are_same_stride(const struct ggml_tensor * t0, const struct ggml_tensor * t1);
691
696
 
@@ -1374,6 +1379,23 @@ extern "C" {
1374
1379
  struct ggml_tensor * b, // row indices
1375
1380
  struct ggml_tensor * c); // data for ggml_get_rows, only used for its shape
1376
1381
 
1382
+ // a TD [n_embd, ne1, ne2, ne3]
1383
+ // b TS [n_embd, n_rows, ne02, ne03] | ne02 == ne2, ne03 == ne3
1384
+ // c I64 [n_rows, ne11, ne12, 1] | c[i] in [0, ne1)
1385
+ //
1386
+ // undefined behavior if destination rows overlap
1387
+ //
1388
+ // broadcast:
1389
+ // ne2 % ne11 == 0
1390
+ // ne3 % ne12 == 0
1391
+ //
1392
+ // return view(a)
1393
+ GGML_API struct ggml_tensor * ggml_set_rows(
1394
+ struct ggml_context * ctx,
1395
+ struct ggml_tensor * a, // destination
1396
+ struct ggml_tensor * b, // source
1397
+ struct ggml_tensor * c); // row indices
1398
+
1377
1399
  GGML_API struct ggml_tensor * ggml_diag(
1378
1400
  struct ggml_context * ctx,
1379
1401
  struct ggml_tensor * a);
@@ -1801,6 +1823,17 @@ extern "C" {
1801
1823
  int p0,
1802
1824
  int p1);
1803
1825
 
1826
+ // Move tensor elements by an offset given for each dimension. Elements that
1827
+ // are shifted beyond the last position are wrapped around to the beginning.
1828
+ GGML_API struct ggml_tensor * ggml_roll(
1829
+ struct ggml_context * ctx,
1830
+ struct ggml_tensor * a,
1831
+ int shift0,
1832
+ int shift1,
1833
+ int shift2,
1834
+ int shift3);
1835
+
1836
+
1804
1837
  // Ref: https://github.com/CompVis/stable-diffusion/blob/main/ldm/modules/diffusionmodules/util.py#L151
1805
1838
  // timesteps: [N,]
1806
1839
  // return: [N, dim]
@@ -390,6 +390,7 @@ extern "C" {
390
390
  void * imatrix; // pointer to importance matrix data
391
391
  void * kv_overrides; // pointer to vector containing overrides
392
392
  void * tensor_types; // pointer to vector containing tensor types
393
+ void * prune_layers; // pointer to vector containing layer indices to prune
393
394
  } llama_model_quantize_params;
394
395
 
395
396
  typedef struct llama_logit_bias {
@@ -943,12 +944,14 @@ extern "C" {
943
944
  // Requires the context to have a memory.
944
945
  // For encode-decoder contexts, processes the batch using the decoder.
945
946
  // Positive return values do not mean a fatal error, but rather a warning.
946
- // Upon non-zero return values, the memory state is restored to the state before this call
947
+ // Upon fatal-error or abort, the ubatches that managed to be processed will remain in the memory state of the context
948
+ // To handle this correctly, query the memory state using llama_memory_seq_pos_min() and llama_memory_seq_pos_max()
949
+ // Upon other return values, the memory state is restored to the state before this call
947
950
  // 0 - success
948
951
  // 1 - could not find a KV slot for the batch (try reducing the size of the batch or increase the context)
949
- // 2 - aborted
952
+ // 2 - aborted (processed ubatches will remain in the context's memory)
950
953
  // -1 - invalid input batch
951
- // < -1 - error
954
+ // < -1 - fatal error (processed ubatches will remain in the context's memory)
952
955
  LLAMA_API int32_t llama_decode(
953
956
  struct llama_context * ctx,
954
957
  struct llama_batch batch);
@@ -1044,6 +1047,7 @@ extern "C" {
1044
1047
 
1045
1048
  LLAMA_API bool llama_vocab_get_add_bos(const struct llama_vocab * vocab);
1046
1049
  LLAMA_API bool llama_vocab_get_add_eos(const struct llama_vocab * vocab);
1050
+ LLAMA_API bool llama_vocab_get_add_sep(const struct llama_vocab * vocab);
1047
1051
 
1048
1052
  LLAMA_API llama_token llama_vocab_fim_pre(const struct llama_vocab * vocab);
1049
1053
  LLAMA_API llama_token llama_vocab_fim_suf(const struct llama_vocab * vocab);
@@ -1087,6 +1091,7 @@ extern "C" {
1087
1091
  /// @param tokens The tokens pointer must be large enough to hold the resulting tokens.
1088
1092
  /// @return Returns the number of tokens on success, no more than n_tokens_max
1089
1093
  /// @return Returns a negative number on failure - the number of tokens that would have been returned
1094
+ /// @return Returns INT32_MIN on overflow (e.g., tokenization result size exceeds int32_t limit)
1090
1095
  /// @param add_special Allow to add BOS and EOS tokens if model is configured to do so.
1091
1096
  /// @param parse_special Allow tokenizing special and/or control tokens which otherwise are not exposed and treated
1092
1097
  /// as plaintext. Does not insert a leading space.