@novastera-oss/llamarn 0.2.7 → 0.2.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (186)
  1. package/android/src/main/cpp/include/llama.h +8 -3
  2. package/android/src/main/jniLibs/arm64-v8a/libggml-base.so +0 -0
  3. package/android/src/main/jniLibs/arm64-v8a/libggml-cpu.so +0 -0
  4. package/android/src/main/jniLibs/arm64-v8a/libggml.so +0 -0
  5. package/android/src/main/jniLibs/arm64-v8a/libllama.so +0 -0
  6. package/android/src/main/jniLibs/x86_64/libggml-base.so +0 -0
  7. package/android/src/main/jniLibs/x86_64/libggml-cpu.so +0 -0
  8. package/android/src/main/jniLibs/x86_64/libggml.so +0 -0
  9. package/android/src/main/jniLibs/x86_64/libllama.so +0 -0
  10. package/cpp/LlamaCppModel.cpp +56 -22
  11. package/cpp/build-info.cpp +2 -2
  12. package/cpp/llama.cpp/CMakeLists.txt +1 -1
  13. package/cpp/llama.cpp/common/arg.cpp +7 -0
  14. package/cpp/llama.cpp/common/common.cpp +3 -0
  15. package/cpp/llama.cpp/common/common.h +1 -0
  16. package/cpp/llama.cpp/common/json-schema-to-grammar.cpp +3 -46
  17. package/cpp/llama.cpp/convert_hf_to_gguf.py +118 -20
  18. package/cpp/llama.cpp/ggml/CMakeLists.txt +1 -0
  19. package/cpp/llama.cpp/ggml/include/ggml-cpu.h +2 -0
  20. package/cpp/llama.cpp/ggml/include/ggml.h +33 -0
  21. package/cpp/llama.cpp/ggml/src/CMakeLists.txt +17 -0
  22. package/cpp/llama.cpp/ggml/src/ggml-cann/common.h +1 -1
  23. package/cpp/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +31 -2
  24. package/cpp/llama.cpp/ggml/src/ggml-cpu/amx/mmq.cpp +10 -9
  25. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +109 -108
  26. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +1027 -1038
  27. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +53 -52
  28. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/cpu-feats.cpp +82 -0
  29. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/quants.c +56 -55
  30. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +42 -41
  31. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/repack.cpp +24 -23
  32. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +29 -28
  33. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/wasm/quants.c +30 -29
  34. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/x86/quants.c +83 -82
  35. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp +20 -19
  36. package/cpp/llama.cpp/ggml/src/ggml-cpu/common.h +3 -2
  37. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +9 -3
  38. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +83 -102
  39. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +4 -0
  40. package/cpp/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +3 -2
  41. package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.cpp +192 -67
  42. package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.h +2 -0
  43. package/cpp/llama.cpp/ggml/src/ggml-cpu/quants.c +25 -24
  44. package/cpp/llama.cpp/ggml/src/ggml-cpu/repack.cpp +56 -40
  45. package/cpp/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +211 -33
  46. package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.cpp +2 -2
  47. package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.h +45 -45
  48. package/cpp/llama.cpp/ggml/src/ggml-cuda/common.cuh +54 -29
  49. package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-dw.cu +161 -0
  50. package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-dw.cuh +5 -0
  51. package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-transpose.cu +91 -0
  52. package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-transpose.cuh +4 -0
  53. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-wmma-f16.cu +4 -0
  54. package/cpp/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +84 -31
  55. package/cpp/llama.cpp/ggml/src/ggml-cuda/mean.cu +19 -0
  56. package/cpp/llama.cpp/ggml/src/ggml-cuda/mean.cuh +3 -0
  57. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmv.cu +257 -87
  58. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmv.cuh +2 -3
  59. package/cpp/llama.cpp/ggml/src/ggml-cuda/sumrows.cu +5 -18
  60. package/cpp/llama.cpp/ggml/src/ggml-cuda/sumrows.cuh +0 -1
  61. package/cpp/llama.cpp/ggml/src/ggml-impl.h +61 -183
  62. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +16 -0
  63. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.m +227 -41
  64. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.metal +362 -182
  65. package/cpp/llama.cpp/ggml/src/ggml-musa/mudnn.cuh +2 -2
  66. package/cpp/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +240 -535
  67. package/cpp/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +5 -6
  68. package/cpp/llama.cpp/ggml/src/ggml-sycl/common.hpp +1 -24
  69. package/cpp/llama.cpp/ggml/src/ggml-sycl/concat.cpp +28 -41
  70. package/cpp/llama.cpp/ggml/src/ggml-sycl/conv.cpp +4 -10
  71. package/cpp/llama.cpp/ggml/src/ggml-sycl/convert.cpp +99 -166
  72. package/cpp/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +94 -72
  73. package/cpp/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +49 -67
  74. package/cpp/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +31 -1
  75. package/cpp/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +99 -159
  76. package/cpp/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +6 -9
  77. package/cpp/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +45 -54
  78. package/cpp/llama.cpp/ggml/src/ggml-sycl/gla.cpp +2 -2
  79. package/cpp/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +1 -1
  80. package/cpp/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +60 -80
  81. package/cpp/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +132 -201
  82. package/cpp/llama.cpp/ggml/src/ggml-sycl/norm.cpp +55 -74
  83. package/cpp/llama.cpp/ggml/src/ggml-sycl/rope.cpp +24 -20
  84. package/cpp/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +3 -3
  85. package/cpp/llama.cpp/ggml/src/ggml-sycl/sycl_hw.cpp +3 -1
  86. package/cpp/llama.cpp/ggml/src/ggml-sycl/sycl_hw.hpp +3 -0
  87. package/cpp/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +3 -8
  88. package/cpp/llama.cpp/ggml/src/ggml-sycl/wkv.cpp +12 -16
  89. package/cpp/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +12 -1
  90. package/cpp/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +57 -1
  91. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +4 -0
  92. package/cpp/llama.cpp/ggml/src/ggml.c +69 -13
  93. package/cpp/llama.cpp/ggml/src/gguf.cpp +5 -1
  94. package/cpp/llama.cpp/gguf-py/gguf/constants.py +76 -0
  95. package/cpp/llama.cpp/gguf-py/gguf/gguf_writer.py +21 -0
  96. package/cpp/llama.cpp/gguf-py/gguf/tensor_mapping.py +64 -0
  97. package/cpp/llama.cpp/gguf-py/gguf/vocab.py +97 -4
  98. package/cpp/llama.cpp/gguf-py/pyproject.toml +2 -2
  99. package/cpp/llama.cpp/include/llama.h +8 -3
  100. package/cpp/llama.cpp/models/templates/Mistral-Small-3.2-24B-Instruct-2506.jinja +124 -0
  101. package/cpp/llama.cpp/src/llama-arch.cpp +55 -0
  102. package/cpp/llama.cpp/src/llama-arch.h +18 -0
  103. package/cpp/llama.cpp/src/llama-batch.cpp +570 -359
  104. package/cpp/llama.cpp/src/llama-batch.h +98 -70
  105. package/cpp/llama.cpp/src/llama-chat.cpp +11 -6
  106. package/cpp/llama.cpp/src/llama-context.cpp +101 -107
  107. package/cpp/llama.cpp/src/llama-context.h +13 -13
  108. package/cpp/llama.cpp/src/llama-graph.cpp +199 -252
  109. package/cpp/llama.cpp/src/llama-graph.h +44 -32
  110. package/cpp/llama.cpp/src/llama-hparams.cpp +4 -0
  111. package/cpp/llama.cpp/src/llama-hparams.h +8 -0
  112. package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.cpp +51 -53
  113. package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.h +19 -24
  114. package/cpp/llama.cpp/src/llama-kv-cache-unified.cpp +110 -104
  115. package/cpp/llama.cpp/src/llama-kv-cache-unified.h +17 -22
  116. package/cpp/llama.cpp/src/llama-kv-cells.h +35 -11
  117. package/cpp/llama.cpp/src/llama-memory-hybrid.cpp +66 -67
  118. package/cpp/llama.cpp/src/llama-memory-hybrid.h +16 -21
  119. package/cpp/llama.cpp/src/llama-memory-recurrent.cpp +69 -68
  120. package/cpp/llama.cpp/src/llama-memory-recurrent.h +15 -20
  121. package/cpp/llama.cpp/src/llama-memory.h +18 -22
  122. package/cpp/llama.cpp/src/llama-model-saver.cpp +1 -0
  123. package/cpp/llama.cpp/src/llama-model.cpp +1006 -472
  124. package/cpp/llama.cpp/src/llama-model.h +22 -0
  125. package/cpp/llama.cpp/src/llama-quant.cpp +87 -5
  126. package/cpp/llama.cpp/src/llama-vocab.cpp +26 -3
  127. package/cpp/llama.cpp/src/llama-vocab.h +1 -0
  128. package/cpp/rn-utils.h +3 -0
  129. package/ios/include/common.h +1 -0
  130. package/ios/include/llama.h +8 -3
  131. package/ios/libs/llama.xcframework/Info.plist +19 -19
  132. package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  133. package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4890 -4863
  134. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml-cpu.h +2 -0
  135. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml.h +33 -0
  136. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/llama.h +8 -3
  137. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/llama +0 -0
  138. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  139. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4861 -4834
  140. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3764 -3742
  141. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +2 -0
  142. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +33 -0
  143. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/llama.h +8 -3
  144. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/llama +0 -0
  145. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  146. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4861 -4834
  147. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3766 -3744
  148. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-cpu.h +2 -0
  149. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml.h +33 -0
  150. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/llama.h +8 -3
  151. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-cpu.h +2 -0
  152. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml.h +33 -0
  153. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/llama.h +8 -3
  154. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/llama +0 -0
  155. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-cpu.h +2 -0
  156. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml.h +33 -0
  157. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/llama.h +8 -3
  158. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/llama +0 -0
  159. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/llama +0 -0
  160. package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  161. package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4890 -4863
  162. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-cpu.h +2 -0
  163. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml.h +33 -0
  164. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/llama.h +8 -3
  165. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/llama +0 -0
  166. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  167. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4861 -4834
  168. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3764 -3742
  169. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +2 -0
  170. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +33 -0
  171. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/llama.h +8 -3
  172. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/llama +0 -0
  173. package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  174. package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4926 -4900
  175. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-cpu.h +2 -0
  176. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml.h +33 -0
  177. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/llama.h +8 -3
  178. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/llama +0 -0
  179. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  180. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4897 -4871
  181. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3794 -3773
  182. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +2 -0
  183. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +33 -0
  184. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/llama.h +8 -3
  185. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/llama +0 -0
  186. package/package.json +1 -1
@@ -61,9 +61,6 @@
61
61
  #define m512i(p) (__m512i)(p)
62
62
  #endif
63
63
 
64
- // precomputed f32 table for f16 (256 KB) (ggml-impl.h)
65
- float ggml_table_f32_f16[1 << 16];
66
-
67
64
  #if defined(__linux__) || \
68
65
  defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) || \
69
66
  (defined(__APPLE__) && !TARGET_OS_TV && !TARGET_OS_WATCH)
@@ -936,6 +933,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
936
933
  "TRANSPOSE",
937
934
  "GET_ROWS",
938
935
  "GET_ROWS_BACK",
936
+ "SET_ROWS",
939
937
  "DIAG",
940
938
  "DIAG_MASK_INF",
941
939
  "DIAG_MASK_ZERO",
@@ -955,6 +953,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
955
953
  "UPSCALE",
956
954
  "PAD",
957
955
  "PAD_REFLECT_1D",
956
+ "ROLL",
958
957
  "ARANGE",
959
958
  "TIMESTEP_EMBEDDING",
960
959
  "ARGSORT",
@@ -985,7 +984,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
985
984
  "OPT_STEP_ADAMW",
986
985
  };
987
986
 
988
- static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82");
987
+ static_assert(GGML_OP_COUNT == 84, "GGML_OP_COUNT != 84");
989
988
 
990
989
  static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
991
990
  "none",
@@ -1031,6 +1030,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
1031
1030
  "transpose(x)",
1032
1031
  "get_rows(x)",
1033
1032
  "get_rows_back(x)",
1033
+ "set_rows(x)",
1034
1034
  "diag(x)",
1035
1035
  "diag_mask_inf(x)",
1036
1036
  "diag_mask_zero(x)",
@@ -1050,6 +1050,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
1050
1050
  "upscale(x)",
1051
1051
  "pad(x)",
1052
1052
  "pad_reflect_1d(x)",
1053
+ "roll(x)",
1053
1054
  "arange(start, stop, step)",
1054
1055
  "timestep_embedding(timesteps, dim, max_period)",
1055
1056
  "argsort(x)",
@@ -1080,7 +1081,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
1080
1081
  "adamw(x)",
1081
1082
  };
1082
1083
 
1083
- static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82");
1084
+ static_assert(GGML_OP_COUNT == 84, "GGML_OP_COUNT != 84");
1084
1085
 
1085
1086
  static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
1086
1087
 
@@ -1349,6 +1350,12 @@ bool ggml_is_contiguous_channels(const struct ggml_tensor * tensor) {
1349
1350
  tensor->nb[2] == ggml_type_size(tensor->type);
1350
1351
  }
1351
1352
 
1353
+ bool ggml_is_contiguous_rows(const struct ggml_tensor * tensor) {
1354
+ return
1355
+ tensor->ne[0] == ggml_blck_size(tensor->type) ||
1356
+ tensor->nb[0] == ggml_type_size(tensor->type);
1357
+ }
1358
+
1352
1359
  static inline bool ggml_is_padded_1d(const struct ggml_tensor * tensor) {
1353
1360
  static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
1354
1361
 
@@ -1420,14 +1427,6 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
1420
1427
  // initialize time system (required on Windows)
1421
1428
  ggml_time_init();
1422
1429
 
1423
- for (int i = 0; i < (1 << 16); ++i) {
1424
- union {
1425
- uint16_t u16;
1426
- ggml_fp16_t fp16;
1427
- } u = {i};
1428
- ggml_table_f32_f16[i] = GGML_COMPUTE_FP16_TO_FP32(u.fp16);
1429
- }
1430
-
1431
1430
  is_first_call = false;
1432
1431
  }
1433
1432
 
@@ -3393,6 +3392,35 @@ struct ggml_tensor * ggml_get_rows_back(
3393
3392
  return result;
3394
3393
  }
3395
3394
 
3395
+ // ggml_set_rows
3396
+
3397
+ struct ggml_tensor * ggml_set_rows(
3398
+ struct ggml_context * ctx,
3399
+ struct ggml_tensor * a,
3400
+ struct ggml_tensor * b,
3401
+ struct ggml_tensor * c) {
3402
+ GGML_ASSERT(a->ne[0] == b->ne[0]);
3403
+ GGML_ASSERT(a->ne[2] == b->ne[2]);
3404
+ GGML_ASSERT(a->ne[3] == b->ne[3]);
3405
+ GGML_ASSERT(b->ne[1] == c->ne[0]);
3406
+ GGML_ASSERT(b->ne[2] % c->ne[1] == 0);
3407
+ GGML_ASSERT(b->ne[3] % c->ne[2] == 0);
3408
+ GGML_ASSERT(c->ne[3] == 1);
3409
+ GGML_ASSERT(b->type == GGML_TYPE_F32);
3410
+ GGML_ASSERT(c->type == GGML_TYPE_I64);
3411
+
3412
+ GGML_ASSERT(ggml_is_contiguous_rows(a));
3413
+ GGML_ASSERT(ggml_is_contiguous_rows(b));
3414
+
3415
+ struct ggml_tensor * result = ggml_view_tensor(ctx, a);
3416
+
3417
+ result->op = GGML_OP_SET_ROWS;
3418
+ result->src[0] = b;
3419
+ result->src[1] = c;
3420
+
3421
+ return result;
3422
+ }
3423
+
3396
3424
  // ggml_diag
3397
3425
 
3398
3426
  struct ggml_tensor * ggml_diag(
@@ -4341,6 +4369,34 @@ struct ggml_tensor * ggml_pad_reflect_1d(
4341
4369
  return result;
4342
4370
  }
4343
4371
 
4372
+ // ggml_roll
4373
+
4374
+ struct ggml_tensor * ggml_roll(
4375
+ struct ggml_context * ctx,
4376
+ struct ggml_tensor * a,
4377
+ int shift0,
4378
+ int shift1,
4379
+ int shift2,
4380
+ int shift3) {
4381
+ GGML_ASSERT(a->nb[0] == ggml_type_size(a->type));
4382
+ GGML_ASSERT(abs(shift0) < a->ne[0]);
4383
+ GGML_ASSERT(abs(shift1) < a->ne[1]);
4384
+ GGML_ASSERT(abs(shift2) < a->ne[2]);
4385
+ GGML_ASSERT(abs(shift3) < a->ne[3]);
4386
+
4387
+ struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
4388
+
4389
+ ggml_set_op_params_i32(result, 0, shift0);
4390
+ ggml_set_op_params_i32(result, 1, shift1);
4391
+ ggml_set_op_params_i32(result, 2, shift2);
4392
+ ggml_set_op_params_i32(result, 3, shift3);
4393
+
4394
+ result->op = GGML_OP_ROLL;
4395
+ result->src[0] = a;
4396
+
4397
+ return result;
4398
+ }
4399
+
4344
4400
  // ggml_arange
4345
4401
 
4346
4402
  struct ggml_tensor * ggml_arange(
@@ -335,7 +335,11 @@ struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_par
335
335
 
336
336
  for (uint32_t i = 0; i < magic.size(); i++) {
337
337
  if (magic[i] != GGUF_MAGIC[i]) {
338
- GGML_LOG_ERROR("%s: invalid magic characters: '%c%c%c%c', expected 'GGUF'\n", __func__, magic[0], magic[1], magic[2], magic[3]);
338
+ char c0 = isprint(magic[0]) ? magic[0] : '?';
339
+ char c1 = isprint(magic[1]) ? magic[1] : '?';
340
+ char c2 = isprint(magic[2]) ? magic[2] : '?';
341
+ char c3 = isprint(magic[3]) ? magic[3] : '?';
342
+ GGML_LOG_ERROR("%s: invalid magic characters: '%c%c%c%c', expected 'GGUF'\n", __func__, c0, c1, c2, c3);
339
343
  gguf_free(ctx);
340
344
  return nullptr;
341
345
  }
@@ -118,6 +118,10 @@ class Keys:
118
118
  EMBEDDING_SCALE = "{arch}.embedding_scale"
119
119
  TOKEN_SHIFT_COUNT = "{arch}.token_shift_count"
120
120
  INTERLEAVE_MOE_LAYER_STEP = "{arch}.interleave_moe_layer_step"
121
+ ACTIVATION_SPARSITY_SCALE = "{arch}.activation_sparsity_scale"
122
+ ALTUP_ACTIVE_IDX = "{arch}.altup.active_idx"
123
+ ALTUP_NUM_INPUTS = "{arch}.altup.num_inputs"
124
+ EMBD_LENGTH_PER_LAYER_INP = "{arch}.embedding_length_per_layer_input"
121
125
 
122
126
  class Attention:
123
127
  HEAD_COUNT = "{arch}.attention.head_count"
@@ -142,6 +146,8 @@ class Keys:
142
146
  SCALE = "{arch}.attention.scale"
143
147
  KEY_LENGTH_MLA = "{arch}.attention.key_length_mla"
144
148
  VALUE_LENGTH_MLA = "{arch}.attention.value_length_mla"
149
+ SHARED_KV_LAYERS = "{arch}.attention.shared_kv_layers"
150
+ SLIDING_WINDOW_PATTERN = "{arch}.attention.sliding_window_pattern"
145
151
 
146
152
  class Rope:
147
153
  DIMENSION_COUNT = "{arch}.rope.dimension_count"
@@ -198,6 +204,7 @@ class Keys:
198
204
  MASK_ID = "tokenizer.ggml.mask_token_id"
199
205
  ADD_BOS = "tokenizer.ggml.add_bos_token"
200
206
  ADD_EOS = "tokenizer.ggml.add_eos_token"
207
+ ADD_SEP = "tokenizer.ggml.add_sep_token"
201
208
  ADD_PREFIX = "tokenizer.ggml.add_space_prefix"
202
209
  REMOVE_EXTRA_WS = "tokenizer.ggml.remove_extra_whitespaces"
203
210
  PRECOMPILED_CHARSMAP = "tokenizer.ggml.precompiled_charsmap"
@@ -313,6 +320,7 @@ class MODEL_ARCH(IntEnum):
313
320
  GEMMA = auto()
314
321
  GEMMA2 = auto()
315
322
  GEMMA3 = auto()
323
+ GEMMA3N = auto()
316
324
  STARCODER2 = auto()
317
325
  RWKV6 = auto()
318
326
  RWKV6QWEN2 = auto()
@@ -398,6 +406,22 @@ class MODEL_TENSOR(IntEnum):
398
406
  ATTN_Q_NORM = auto()
399
407
  ATTN_K_NORM = auto()
400
408
  LAYER_OUT_NORM = auto()
409
+ PER_LAYER_TOKEN_EMBD = auto() # gemma3n
410
+ PER_LAYER_MODEL_PROJ = auto() # gemma3n
411
+ PER_LAYER_INP_GATE = auto() # gemma3n
412
+ PER_LAYER_PROJ = auto() # gemma3n
413
+ PER_LAYER_PROJ_NORM = auto() # gemma3n
414
+ PER_LAYER_POST_NORM = auto() # gemma3n
415
+ ALTUP_PROJ = auto() # gemma3n
416
+ ALTUP_UNEMBD_PROJ = auto() # gemma3n
417
+ ALTUP_CORRECT_COEF = auto() # gemma3n
418
+ ALTUP_CORRECT_SCALE = auto() # gemma3n
419
+ ALTUP_PREDICT_COEF = auto() # gemma3n
420
+ ALTUP_ROUTER = auto() # gemma3n
421
+ ALTUP_ROUTER_NORM = auto() # gemma3n
422
+ LAUREL_L = auto() # gemma3n
423
+ LAUREL_R = auto() # gemma3n
424
+ LAUREL_POST_NORM = auto() # gemma3n
401
425
  SSM_IN = auto()
402
426
  SSM_CONV1D = auto()
403
427
  SSM_X = auto()
@@ -596,6 +620,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
596
620
  MODEL_ARCH.GEMMA: "gemma",
597
621
  MODEL_ARCH.GEMMA2: "gemma2",
598
622
  MODEL_ARCH.GEMMA3: "gemma3",
623
+ MODEL_ARCH.GEMMA3N: "gemma3n",
599
624
  MODEL_ARCH.STARCODER2: "starcoder2",
600
625
  MODEL_ARCH.RWKV6: "rwkv6",
601
626
  MODEL_ARCH.RWKV6QWEN2: "rwkv6qwen2",
@@ -681,6 +706,22 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
681
706
  MODEL_TENSOR.FFN_UP_EXP: "blk.{bid}.ffn_up_exps",
682
707
  MODEL_TENSOR.FFN_EXP_PROBS_B: "blk.{bid}.exp_probs_b",
683
708
  MODEL_TENSOR.LAYER_OUT_NORM: "blk.{bid}.layer_output_norm",
709
+ MODEL_TENSOR.PER_LAYER_TOKEN_EMBD: "per_layer_token_embd", # gemma3n
710
+ MODEL_TENSOR.PER_LAYER_MODEL_PROJ: "per_layer_model_proj", # gemma3n
711
+ MODEL_TENSOR.PER_LAYER_PROJ_NORM: "per_layer_proj_norm", # gemma3n
712
+ MODEL_TENSOR.ALTUP_UNEMBD_PROJ: "altup_unembd_proj", # gemma3n
713
+ MODEL_TENSOR.ALTUP_PROJ: "altup_proj", # gemma3n
714
+ MODEL_TENSOR.PER_LAYER_INP_GATE: "blk.{bid}.inp_gate", # gemma3n
715
+ MODEL_TENSOR.PER_LAYER_PROJ: "blk.{bid}.proj", # gemma3n
716
+ MODEL_TENSOR.PER_LAYER_POST_NORM: "blk.{bid}.post_norm", # gemma3n
717
+ MODEL_TENSOR.ALTUP_CORRECT_COEF: "blk.{bid}.altup_correct_coef", # gemma3n
718
+ MODEL_TENSOR.ALTUP_CORRECT_SCALE: "blk.{bid}.altup_correct_scale", # gemma3n
719
+ MODEL_TENSOR.ALTUP_PREDICT_COEF: "blk.{bid}.altup_predict_coef", # gemma3n
720
+ MODEL_TENSOR.ALTUP_ROUTER: "blk.{bid}.altup_router", # gemma3n
721
+ MODEL_TENSOR.ALTUP_ROUTER_NORM: "blk.{bid}.altup_router_norm", # gemma3n
722
+ MODEL_TENSOR.LAUREL_L: "blk.{bid}.laurel_l", # gemma3n
723
+ MODEL_TENSOR.LAUREL_R: "blk.{bid}.laurel_r", # gemma3n
724
+ MODEL_TENSOR.LAUREL_POST_NORM: "blk.{bid}.laurel_post_norm", # gemma3n
684
725
  MODEL_TENSOR.SSM_IN: "blk.{bid}.ssm_in",
685
726
  MODEL_TENSOR.SSM_CONV1D: "blk.{bid}.ssm_conv1d",
686
727
  MODEL_TENSOR.SSM_X: "blk.{bid}.ssm_x",
@@ -1485,6 +1526,41 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
1485
1526
  MODEL_TENSOR.FFN_PRE_NORM,
1486
1527
  MODEL_TENSOR.FFN_POST_NORM,
1487
1528
  ],
1529
+ MODEL_ARCH.GEMMA3N: [
1530
+ MODEL_TENSOR.TOKEN_EMBD,
1531
+ MODEL_TENSOR.OUTPUT,
1532
+ MODEL_TENSOR.OUTPUT_NORM,
1533
+ MODEL_TENSOR.ATTN_Q,
1534
+ MODEL_TENSOR.ATTN_Q_NORM,
1535
+ MODEL_TENSOR.ATTN_K,
1536
+ MODEL_TENSOR.ATTN_K_NORM,
1537
+ MODEL_TENSOR.ATTN_V,
1538
+ MODEL_TENSOR.ATTN_OUT,
1539
+ MODEL_TENSOR.FFN_GATE,
1540
+ MODEL_TENSOR.FFN_DOWN,
1541
+ MODEL_TENSOR.FFN_UP,
1542
+ MODEL_TENSOR.ATTN_NORM,
1543
+ MODEL_TENSOR.ATTN_POST_NORM,
1544
+ MODEL_TENSOR.FFN_PRE_NORM,
1545
+ MODEL_TENSOR.FFN_POST_NORM,
1546
+ # altup / laurel
1547
+ MODEL_TENSOR.PER_LAYER_TOKEN_EMBD,
1548
+ MODEL_TENSOR.PER_LAYER_MODEL_PROJ,
1549
+ MODEL_TENSOR.PER_LAYER_INP_GATE,
1550
+ MODEL_TENSOR.PER_LAYER_PROJ,
1551
+ MODEL_TENSOR.PER_LAYER_PROJ_NORM,
1552
+ MODEL_TENSOR.PER_LAYER_POST_NORM,
1553
+ MODEL_TENSOR.ALTUP_PROJ,
1554
+ MODEL_TENSOR.ALTUP_UNEMBD_PROJ,
1555
+ MODEL_TENSOR.ALTUP_CORRECT_COEF,
1556
+ MODEL_TENSOR.ALTUP_CORRECT_SCALE,
1557
+ MODEL_TENSOR.ALTUP_PREDICT_COEF,
1558
+ MODEL_TENSOR.ALTUP_ROUTER,
1559
+ MODEL_TENSOR.ALTUP_ROUTER_NORM,
1560
+ MODEL_TENSOR.LAUREL_L,
1561
+ MODEL_TENSOR.LAUREL_R,
1562
+ MODEL_TENSOR.LAUREL_POST_NORM,
1563
+ ],
1488
1564
  MODEL_ARCH.STARCODER2: [
1489
1565
  MODEL_TENSOR.TOKEN_EMBD,
1490
1566
  MODEL_TENSOR.OUTPUT_NORM,
@@ -672,6 +672,18 @@ class GGUFWriter:
672
672
  def add_decoder_start_token_id(self, id: int) -> None:
673
673
  self.add_uint32(Keys.LLM.DECODER_START_TOKEN_ID.format(arch=self.arch), id)
674
674
 
675
+ def add_embedding_length_per_layer_input(self, value: int) -> None:
676
+ self.add_uint32(Keys.LLM.EMBD_LENGTH_PER_LAYER_INP.format(arch=self.arch), value)
677
+
678
+ def add_altup_active_idx(self, val: int) -> None:
679
+ self.add_uint32(Keys.LLM.ALTUP_ACTIVE_IDX.format(arch=self.arch), val)
680
+
681
+ def add_altup_num_inputs(self, val: int) -> None:
682
+ self.add_uint32(Keys.LLM.ALTUP_NUM_INPUTS.format(arch=self.arch), val)
683
+
684
+ def add_activation_sparsity_scale(self, values: Sequence[float]) -> None:
685
+ self.add_array(Keys.LLM.ACTIVATION_SPARSITY_SCALE.format(arch=self.arch), values)
686
+
675
687
  def add_head_count(self, count: int | Sequence[int]) -> None:
676
688
  if isinstance(count, int):
677
689
  self.add_uint32(Keys.Attention.HEAD_COUNT.format(arch=self.arch), count)
@@ -702,6 +714,12 @@ class GGUFWriter:
702
714
  def add_clamp_kqv(self, value: float) -> None:
703
715
  self.add_float32(Keys.Attention.CLAMP_KQV.format(arch=self.arch), value)
704
716
 
717
+ def add_shared_kv_layers(self, value: float) -> None:
718
+ self.add_float32(Keys.Attention.SHARED_KV_LAYERS.format(arch=self.arch), value)
719
+
720
+ def add_sliding_window_pattern(self, value: Sequence[bool]) -> None:
721
+ self.add_array(Keys.Attention.SLIDING_WINDOW_PATTERN.format(arch=self.arch), value)
722
+
705
723
  def add_logit_scale(self, value: float) -> None:
706
724
  self.add_float32(Keys.LLM.LOGIT_SCALE.format(arch=self.arch), value)
707
725
 
@@ -891,6 +909,9 @@ class GGUFWriter:
891
909
  def add_add_eos_token(self, value: bool) -> None:
892
910
  self.add_bool(Keys.Tokenizer.ADD_EOS, value)
893
911
 
912
+ def add_add_sep_token(self, value: bool) -> None:
913
+ self.add_bool(Keys.Tokenizer.ADD_SEP, value)
914
+
894
915
  def add_add_space_prefix(self, value: bool) -> None:
895
916
  self.add_bool(Keys.Tokenizer.ADD_PREFIX, value)
896
917
 
@@ -480,6 +480,70 @@ class TensorNameMap:
480
480
  "encoder.layer.{bid}.layer_norm_2" # jina-v2-code
481
481
  ),
482
482
 
483
+ MODEL_TENSOR.PER_LAYER_TOKEN_EMBD: (
484
+ "model.embed_tokens_per_layer", # gemma3n
485
+ ),
486
+
487
+ MODEL_TENSOR.PER_LAYER_MODEL_PROJ: (
488
+ "model.per_layer_model_projection", # gemma3n
489
+ ),
490
+
491
+ MODEL_TENSOR.PER_LAYER_PROJ_NORM: (
492
+ "model.per_layer_projection_norm", # gemma3n
493
+ ),
494
+
495
+ MODEL_TENSOR.ALTUP_PROJ: (
496
+ "model.altup_projections", # gemma3n
497
+ ),
498
+
499
+ MODEL_TENSOR.ALTUP_UNEMBD_PROJ: (
500
+ "model.altup_unembed_projections", # gemma3n
501
+ ),
502
+
503
+ MODEL_TENSOR.PER_LAYER_INP_GATE: (
504
+ "model.layers.{bid}.per_layer_input_gate", # gemma3n
505
+ ),
506
+
507
+ MODEL_TENSOR.PER_LAYER_PROJ: (
508
+ "model.layers.{bid}.per_layer_projection", # gemma3n
509
+ ),
510
+
511
+ MODEL_TENSOR.PER_LAYER_POST_NORM: (
512
+ "model.layers.{bid}.post_per_layer_input_norm", # gemma3n
513
+ ),
514
+
515
+ MODEL_TENSOR.ALTUP_CORRECT_COEF: (
516
+ "model.layers.{bid}.altup.correction_coefs", # gemma3n
517
+ ),
518
+
519
+ MODEL_TENSOR.ALTUP_CORRECT_SCALE: (
520
+ "model.layers.{bid}.altup.correct_output_scale", # gemma3n
521
+ ),
522
+
523
+ MODEL_TENSOR.ALTUP_PREDICT_COEF: (
524
+ "model.layers.{bid}.altup.prediction_coefs", # gemma3n
525
+ ),
526
+
527
+ MODEL_TENSOR.ALTUP_ROUTER: (
528
+ "model.layers.{bid}.altup.modality_router", # gemma3n
529
+ ),
530
+
531
+ MODEL_TENSOR.ALTUP_ROUTER_NORM: (
532
+ "model.layers.{bid}.altup.router_norm", # gemma3n
533
+ ),
534
+
535
+ MODEL_TENSOR.LAUREL_L: (
536
+ "model.layers.{bid}.laurel.linear_left", # gemma3n
537
+ ),
538
+
539
+ MODEL_TENSOR.LAUREL_R: (
540
+ "model.layers.{bid}.laurel.linear_right", # gemma3n
541
+ ),
542
+
543
+ MODEL_TENSOR.LAUREL_POST_NORM: (
544
+ "model.layers.{bid}.laurel.post_laurel_norm", # gemma3n
545
+ ),
546
+
483
547
  MODEL_TENSOR.SSM_IN: (
484
548
  "model.layers.{bid}.in_proj",
485
549
  "backbone.layers.{bid}.mixer.in_proj",
@@ -7,7 +7,10 @@ import os
7
7
  from pathlib import Path
8
8
  from typing import Any, Callable, Sequence, Mapping, Iterable, Protocol, ClassVar, runtime_checkable
9
9
 
10
- from sentencepiece import SentencePieceProcessor
10
+ try:
11
+ from sentencepiece import SentencePieceProcessor
12
+ except ImportError:
13
+ SentencePieceProcessor = None
11
14
 
12
15
  import gguf
13
16
 
@@ -116,6 +119,7 @@ class SpecialVocab:
116
119
  logger.warning(f'Special token type {typ}, id {tid} out of range, must be under {self.n_vocab} - skipping')
117
120
 
118
121
  def _try_load_from_tokenizer_json(self, path: Path) -> bool:
122
+ tokenizer = None
119
123
  tokenizer_file = path / 'tokenizer.json'
120
124
  if tokenizer_file.is_file():
121
125
  with open(tokenizer_file, encoding = 'utf-8') as f:
@@ -149,11 +153,97 @@ class SpecialVocab:
149
153
  added_tokens = tokenizer.get('added_tokens', {})
150
154
  else:
151
155
  added_tokens = {}
156
+ tokenizer_config = None
152
157
  tokenizer_config_file = path / 'tokenizer_config.json'
153
- if not tokenizer_config_file.is_file():
158
+ if tokenizer_config_file.is_file():
159
+ with open(tokenizer_config_file, encoding = 'utf-8') as f:
160
+ tokenizer_config = json.load(f)
161
+ if tokenizer:
162
+ special_bos = (tokenizer_config or {}).get('bos_token')
163
+ special_cls = (tokenizer_config or {}).get('cls_token')
164
+ special_eos = (tokenizer_config or {}).get('eos_token')
165
+ special_sep = (tokenizer_config or {}).get('sep_token')
166
+ if not special_bos and special_cls and tokenizer_config:
167
+ tokenizer_config['bos_token'] = special_bos = special_cls
168
+ if not special_eos and special_sep and tokenizer_config:
169
+ tokenizer_config['eos_token'] = special_eos = special_sep
170
+ if post_processor := tokenizer.get('post_processor'):
171
+ for processor in post_processor.get('processors', [post_processor]):
172
+ if processor.get('type') == 'RobertaProcessing':
173
+ self.add_special_token['bos'] = True
174
+ self.add_special_token['eos'] = True
175
+ self.add_special_token['sep'] = True
176
+ if not special_cls and tokenizer_config:
177
+ special_cls = processor.get('cls', [special_bos])[0]
178
+ tokenizer_config['cls_token'] = special_cls
179
+ if not special_sep and tokenizer_config:
180
+ special_sep = processor.get('sep', [special_eos])[0]
181
+ tokenizer_config['sep_token'] = special_sep
182
+ continue
183
+ # Crude parsing of TemplateProcessing to determine if BOS/SEP/EOS should be added
184
+ # Only works with simple templates, **will** get it wrong on unusual sequences
185
+ if processor.get('type') == 'TemplateProcessing':
186
+ tmpl_single = processor.get('single', [])
187
+ tmpl_pair = processor.get('pair', [])
188
+ special_first = None
189
+ special_last = None
190
+ if len(tmpl_single) > 1:
191
+ if special_first := tmpl_single[0].get('SpecialToken', {}).get('id'):
192
+ if not tokenizer_config:
193
+ special_bos = special_first
194
+ self.add_special_token['bos'] = True if special_first in (special_bos, special_cls) else False
195
+ if special_first not in (special_bos, special_cls):
196
+ logger.warning(f'Unknown leading special token {special_first!r} in TemplateProcessing<single>')
197
+ if special_last := tmpl_single[-1].get('SpecialToken', {}).get('id'):
198
+ if not tokenizer_config:
199
+ special_eos = special_last
200
+ elif special_last != special_eos:
201
+ if 'eot' not in self.special_token_types:
202
+ self.special_token_types = tuple(self.special_token_types) + ('eot', )
203
+ tokenizer_config['eot_token'] = special_eos
204
+ elif 'eom' not in self.special_token_types:
205
+ self.special_token_types = tuple(self.special_token_types) + ('eom', )
206
+ tokenizer_config['eom_token'] = special_eos
207
+ else:
208
+ logger.warning(f'Overriding EOS token {special_eos!r} with {special_last!r} without EOT/EOM fallback!')
209
+ tokenizer_config['eos_token'] = special_eos = special_last
210
+ self.add_special_token['eos'] = True if special_last == special_eos else False
211
+ if special_last != special_eos:
212
+ logger.warning(f'Unknown trailing special token {special_last!r} in TemplateProcessing<single>')
213
+ if tmpl_pair:
214
+ seq_start = 1 if special_first and tmpl_pair[0].get('SpecialToken', {}).get('id') == special_first else 0
215
+ seq_stop = -1 if special_last and tmpl_pair[-1].get('SpecialToken', {}).get('id') == special_last else None
216
+ if (special_first and seq_start == 0) or (special_last and seq_stop is None):
217
+ logger.warning('TemplateProcessing<single> leading/trailing special tokens do not match TemplateProcessing<pair>')
218
+ if tmpl_pair := tmpl_pair[slice(seq_start, seq_stop)]:
219
+ tmpl_a = tmpl_pair[0].get('Sequence', {}).get('id')
220
+ tmpl_b = tmpl_pair[-1].get('Sequence', {}).get('id')
221
+ if tmpl_a != 'A' or tmpl_b != 'B':
222
+ logger.warning(f'Unknown sequence {tmpl_a}...{tmpl_b} in TemplateProcessing<pair>')
223
+ # A [sep] [eos] B
224
+ if tmpl_a == 'A' and tmpl_b == 'B' and (tmpl_pair := tmpl_pair[1:-1]):
225
+ add_sep = False
226
+ if special_entry := tmpl_pair[0].get('SpecialToken', {}).get('id'):
227
+ if special_entry in (special_sep, special_eos) and not special_last:
228
+ add_sep = True
229
+ if special_entry not in (special_sep, special_eos):
230
+ logger.warning(f'Unknown separator token {special_entry!r} in TemplateProcessing<pair>')
231
+ else:
232
+ logger.warning(f'Unknown middle sequence {tmpl_pair[0]!r} in TemplateProcessing<pair>')
233
+ if len(tmpl_pair) == 2:
234
+ if special_entry := tmpl_pair[1].get('SpecialToken', {}).get('id'):
235
+ if special_entry in (special_sep, special_eos):
236
+ add_sep = True
237
+ if special_entry not in (special_sep, special_eos):
238
+ logger.warning(f'Unknown second separator token {special_entry!r} in TemplateProcessing<pair>')
239
+ else:
240
+ logger.warning(f'Unknown second middle sequence {tmpl_pair[1]!r} in TemplateProcessing<pair>')
241
+ self.add_special_token['sep'] = add_sep
242
+ if add_sep and not special_sep and tokenizer_config:
243
+ tokenizer_config['sep_token'] = special_eos
244
+ continue
245
+ if not tokenizer_config:
154
246
  return True
155
- with open(tokenizer_config_file, encoding = 'utf-8') as f:
156
- tokenizer_config = json.load(f)
157
247
  chat_template_alt = None
158
248
  chat_template_file = path / 'chat_template.json'
159
249
  if chat_template_file.is_file():
@@ -302,6 +392,9 @@ class SentencePieceVocab(Vocab):
302
392
  name = "spm"
303
393
 
304
394
  def __init__(self, base_path: Path):
395
+ if SentencePieceProcessor is None:
396
+ raise RuntimeError("sentencepiece is not installed")
397
+
305
398
  added_tokens: dict[str, int] = {}
306
399
  if (fname_tokenizer := base_path / 'tokenizer.model').exists():
307
400
  # normal location
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "gguf"
3
- version = "0.17.0"
3
+ version = "0.17.1"
4
4
  description = "Read and write ML models in GGUF for GGML"
5
5
  authors = ["GGML <ggml@ggml.ai>"]
6
6
  packages = [
@@ -22,7 +22,7 @@ python = ">=3.8"
22
22
  numpy = ">=1.17"
23
23
  tqdm = ">=4.27"
24
24
  pyyaml = ">=5.1"
25
- sentencepiece = ">=0.1.98,<=0.2.0"
25
+ sentencepiece = { version = ">=0.1.98,<=0.2.0", optional = true }
26
26
  PySide6 = { version = "^6.9", python = ">=3.9,<3.14", optional = true }
27
27
 
28
28
  [tool.poetry.dev-dependencies]
@@ -390,6 +390,7 @@ extern "C" {
390
390
  void * imatrix; // pointer to importance matrix data
391
391
  void * kv_overrides; // pointer to vector containing overrides
392
392
  void * tensor_types; // pointer to vector containing tensor types
393
+ void * prune_layers; // pointer to vector containing layer indices to prune
393
394
  } llama_model_quantize_params;
394
395
 
395
396
  typedef struct llama_logit_bias {
@@ -943,12 +944,14 @@ extern "C" {
943
944
  // Requires the context to have a memory.
944
945
  // For encode-decoder contexts, processes the batch using the decoder.
945
946
  // Positive return values does not mean a fatal error, but rather a warning.
946
- // Upon non-zero return values, the memory state is restored to the state before this call
947
+ // Upon fatal-error or abort, the ubatches that managed to be been processed will remain in the memory state of the context
948
+ // To handle this correctly, query the memory state using llama_memory_seq_pos_min() and llama_memory_seq_pos_max()
949
+ // Upon other return values, the memory state is restored to the state before this call
947
950
  // 0 - success
948
951
  // 1 - could not find a KV slot for the batch (try reducing the size of the batch or increase the context)
949
- // 2 - aborted
952
+ // 2 - aborted (processed ubatches will remain in the context's memory)
950
953
  // -1 - invalid input batch
951
- // < -1 - error
954
+ // < -1 - fatal error (processed ubatches will remain in the context's memory)
952
955
  LLAMA_API int32_t llama_decode(
953
956
  struct llama_context * ctx,
954
957
  struct llama_batch batch);
@@ -1044,6 +1047,7 @@ extern "C" {
1044
1047
 
1045
1048
  LLAMA_API bool llama_vocab_get_add_bos(const struct llama_vocab * vocab);
1046
1049
  LLAMA_API bool llama_vocab_get_add_eos(const struct llama_vocab * vocab);
1050
+ LLAMA_API bool llama_vocab_get_add_sep(const struct llama_vocab * vocab);
1047
1051
 
1048
1052
  LLAMA_API llama_token llama_vocab_fim_pre(const struct llama_vocab * vocab);
1049
1053
  LLAMA_API llama_token llama_vocab_fim_suf(const struct llama_vocab * vocab);
@@ -1087,6 +1091,7 @@ extern "C" {
1087
1091
  /// @param tokens The tokens pointer must be large enough to hold the resulting tokens.
1088
1092
  /// @return Returns the number of tokens on success, no more than n_tokens_max
1089
1093
  /// @return Returns a negative number on failure - the number of tokens that would have been returned
1094
+ /// @return Returns INT32_MIN on overflow (e.g., tokenization result size exceeds int32_t limit)
1090
1095
  /// @param add_special Allow to add BOS and EOS tokens if model is configured to do so.
1091
1096
  /// @param parse_special Allow tokenizing special and/or control tokens which otherwise are not exposed and treated
1092
1097
  /// as plaintext. Does not insert a leading space.