@novastera-oss/llamarn 0.2.7 → 0.2.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (186)
  1. package/android/src/main/cpp/include/llama.h +8 -3
  2. package/android/src/main/jniLibs/arm64-v8a/libggml-base.so +0 -0
  3. package/android/src/main/jniLibs/arm64-v8a/libggml-cpu.so +0 -0
  4. package/android/src/main/jniLibs/arm64-v8a/libggml.so +0 -0
  5. package/android/src/main/jniLibs/arm64-v8a/libllama.so +0 -0
  6. package/android/src/main/jniLibs/x86_64/libggml-base.so +0 -0
  7. package/android/src/main/jniLibs/x86_64/libggml-cpu.so +0 -0
  8. package/android/src/main/jniLibs/x86_64/libggml.so +0 -0
  9. package/android/src/main/jniLibs/x86_64/libllama.so +0 -0
  10. package/cpp/LlamaCppModel.cpp +56 -22
  11. package/cpp/build-info.cpp +2 -2
  12. package/cpp/llama.cpp/CMakeLists.txt +1 -1
  13. package/cpp/llama.cpp/common/arg.cpp +7 -0
  14. package/cpp/llama.cpp/common/common.cpp +3 -0
  15. package/cpp/llama.cpp/common/common.h +1 -0
  16. package/cpp/llama.cpp/common/json-schema-to-grammar.cpp +3 -46
  17. package/cpp/llama.cpp/convert_hf_to_gguf.py +118 -20
  18. package/cpp/llama.cpp/ggml/CMakeLists.txt +1 -0
  19. package/cpp/llama.cpp/ggml/include/ggml-cpu.h +2 -0
  20. package/cpp/llama.cpp/ggml/include/ggml.h +33 -0
  21. package/cpp/llama.cpp/ggml/src/CMakeLists.txt +17 -0
  22. package/cpp/llama.cpp/ggml/src/ggml-cann/common.h +1 -1
  23. package/cpp/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +31 -2
  24. package/cpp/llama.cpp/ggml/src/ggml-cpu/amx/mmq.cpp +10 -9
  25. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +109 -108
  26. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +1027 -1038
  27. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +53 -52
  28. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/cpu-feats.cpp +82 -0
  29. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/quants.c +56 -55
  30. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +42 -41
  31. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/repack.cpp +24 -23
  32. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +29 -28
  33. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/wasm/quants.c +30 -29
  34. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/x86/quants.c +83 -82
  35. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp +20 -19
  36. package/cpp/llama.cpp/ggml/src/ggml-cpu/common.h +3 -2
  37. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +9 -3
  38. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +83 -102
  39. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +4 -0
  40. package/cpp/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +3 -2
  41. package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.cpp +192 -67
  42. package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.h +2 -0
  43. package/cpp/llama.cpp/ggml/src/ggml-cpu/quants.c +25 -24
  44. package/cpp/llama.cpp/ggml/src/ggml-cpu/repack.cpp +56 -40
  45. package/cpp/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +211 -33
  46. package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.cpp +2 -2
  47. package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.h +45 -45
  48. package/cpp/llama.cpp/ggml/src/ggml-cuda/common.cuh +54 -29
  49. package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-dw.cu +161 -0
  50. package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-dw.cuh +5 -0
  51. package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-transpose.cu +91 -0
  52. package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-transpose.cuh +4 -0
  53. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-wmma-f16.cu +4 -0
  54. package/cpp/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +84 -31
  55. package/cpp/llama.cpp/ggml/src/ggml-cuda/mean.cu +19 -0
  56. package/cpp/llama.cpp/ggml/src/ggml-cuda/mean.cuh +3 -0
  57. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmv.cu +257 -87
  58. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmv.cuh +2 -3
  59. package/cpp/llama.cpp/ggml/src/ggml-cuda/sumrows.cu +5 -18
  60. package/cpp/llama.cpp/ggml/src/ggml-cuda/sumrows.cuh +0 -1
  61. package/cpp/llama.cpp/ggml/src/ggml-impl.h +61 -183
  62. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +16 -0
  63. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.m +227 -41
  64. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.metal +362 -182
  65. package/cpp/llama.cpp/ggml/src/ggml-musa/mudnn.cuh +2 -2
  66. package/cpp/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +240 -535
  67. package/cpp/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +5 -6
  68. package/cpp/llama.cpp/ggml/src/ggml-sycl/common.hpp +1 -24
  69. package/cpp/llama.cpp/ggml/src/ggml-sycl/concat.cpp +28 -41
  70. package/cpp/llama.cpp/ggml/src/ggml-sycl/conv.cpp +4 -10
  71. package/cpp/llama.cpp/ggml/src/ggml-sycl/convert.cpp +99 -166
  72. package/cpp/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +94 -72
  73. package/cpp/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +49 -67
  74. package/cpp/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +31 -1
  75. package/cpp/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +99 -159
  76. package/cpp/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +6 -9
  77. package/cpp/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +45 -54
  78. package/cpp/llama.cpp/ggml/src/ggml-sycl/gla.cpp +2 -2
  79. package/cpp/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +1 -1
  80. package/cpp/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +60 -80
  81. package/cpp/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +132 -201
  82. package/cpp/llama.cpp/ggml/src/ggml-sycl/norm.cpp +55 -74
  83. package/cpp/llama.cpp/ggml/src/ggml-sycl/rope.cpp +24 -20
  84. package/cpp/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +3 -3
  85. package/cpp/llama.cpp/ggml/src/ggml-sycl/sycl_hw.cpp +3 -1
  86. package/cpp/llama.cpp/ggml/src/ggml-sycl/sycl_hw.hpp +3 -0
  87. package/cpp/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +3 -8
  88. package/cpp/llama.cpp/ggml/src/ggml-sycl/wkv.cpp +12 -16
  89. package/cpp/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +12 -1
  90. package/cpp/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +57 -1
  91. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +4 -0
  92. package/cpp/llama.cpp/ggml/src/ggml.c +69 -13
  93. package/cpp/llama.cpp/ggml/src/gguf.cpp +5 -1
  94. package/cpp/llama.cpp/gguf-py/gguf/constants.py +76 -0
  95. package/cpp/llama.cpp/gguf-py/gguf/gguf_writer.py +21 -0
  96. package/cpp/llama.cpp/gguf-py/gguf/tensor_mapping.py +64 -0
  97. package/cpp/llama.cpp/gguf-py/gguf/vocab.py +97 -4
  98. package/cpp/llama.cpp/gguf-py/pyproject.toml +2 -2
  99. package/cpp/llama.cpp/include/llama.h +8 -3
  100. package/cpp/llama.cpp/models/templates/Mistral-Small-3.2-24B-Instruct-2506.jinja +124 -0
  101. package/cpp/llama.cpp/src/llama-arch.cpp +55 -0
  102. package/cpp/llama.cpp/src/llama-arch.h +18 -0
  103. package/cpp/llama.cpp/src/llama-batch.cpp +570 -359
  104. package/cpp/llama.cpp/src/llama-batch.h +98 -70
  105. package/cpp/llama.cpp/src/llama-chat.cpp +11 -6
  106. package/cpp/llama.cpp/src/llama-context.cpp +101 -107
  107. package/cpp/llama.cpp/src/llama-context.h +13 -13
  108. package/cpp/llama.cpp/src/llama-graph.cpp +199 -252
  109. package/cpp/llama.cpp/src/llama-graph.h +44 -32
  110. package/cpp/llama.cpp/src/llama-hparams.cpp +4 -0
  111. package/cpp/llama.cpp/src/llama-hparams.h +8 -0
  112. package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.cpp +51 -53
  113. package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.h +19 -24
  114. package/cpp/llama.cpp/src/llama-kv-cache-unified.cpp +110 -104
  115. package/cpp/llama.cpp/src/llama-kv-cache-unified.h +17 -22
  116. package/cpp/llama.cpp/src/llama-kv-cells.h +35 -11
  117. package/cpp/llama.cpp/src/llama-memory-hybrid.cpp +66 -67
  118. package/cpp/llama.cpp/src/llama-memory-hybrid.h +16 -21
  119. package/cpp/llama.cpp/src/llama-memory-recurrent.cpp +69 -68
  120. package/cpp/llama.cpp/src/llama-memory-recurrent.h +15 -20
  121. package/cpp/llama.cpp/src/llama-memory.h +18 -22
  122. package/cpp/llama.cpp/src/llama-model-saver.cpp +1 -0
  123. package/cpp/llama.cpp/src/llama-model.cpp +1006 -472
  124. package/cpp/llama.cpp/src/llama-model.h +22 -0
  125. package/cpp/llama.cpp/src/llama-quant.cpp +87 -5
  126. package/cpp/llama.cpp/src/llama-vocab.cpp +26 -3
  127. package/cpp/llama.cpp/src/llama-vocab.h +1 -0
  128. package/cpp/rn-utils.h +3 -0
  129. package/ios/include/common.h +1 -0
  130. package/ios/include/llama.h +8 -3
  131. package/ios/libs/llama.xcframework/Info.plist +19 -19
  132. package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  133. package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4890 -4863
  134. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml-cpu.h +2 -0
  135. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml.h +33 -0
  136. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/llama.h +8 -3
  137. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/llama +0 -0
  138. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  139. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4861 -4834
  140. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3764 -3742
  141. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +2 -0
  142. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +33 -0
  143. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/llama.h +8 -3
  144. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/llama +0 -0
  145. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  146. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4861 -4834
  147. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3766 -3744
  148. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-cpu.h +2 -0
  149. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml.h +33 -0
  150. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/llama.h +8 -3
  151. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-cpu.h +2 -0
  152. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml.h +33 -0
  153. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/llama.h +8 -3
  154. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/llama +0 -0
  155. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-cpu.h +2 -0
  156. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml.h +33 -0
  157. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/llama.h +8 -3
  158. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/llama +0 -0
  159. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/llama +0 -0
  160. package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  161. package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4890 -4863
  162. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-cpu.h +2 -0
  163. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml.h +33 -0
  164. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/llama.h +8 -3
  165. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/llama +0 -0
  166. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  167. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4861 -4834
  168. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3764 -3742
  169. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +2 -0
  170. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +33 -0
  171. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/llama.h +8 -3
  172. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/llama +0 -0
  173. package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  174. package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4926 -4900
  175. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-cpu.h +2 -0
  176. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml.h +33 -0
  177. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/llama.h +8 -3
  178. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/llama +0 -0
  179. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  180. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4897 -4871
  181. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3794 -3773
  182. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +2 -0
  183. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +33 -0
  184. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/llama.h +8 -3
  185. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/llama +0 -0
  186. package/package.json +1 -1
@@ -95,6 +95,8 @@ enum llm_type {
95
95
  LLM_TYPE_17B_128E, // llama4 Maverick
96
96
  LLM_TYPE_30B_A3B,
97
97
  LLM_TYPE_235B_A22B,
98
+ LLM_TYPE_E2B,
99
+ LLM_TYPE_E4B,
98
100
  };
99
101
 
100
102
  std::string llama_rope_scaling_type_name(llama_rope_scaling_type rope_scaling_type);
@@ -316,6 +318,19 @@ struct llama_layer {
316
318
  struct ggml_tensor * ffn_up_scale = nullptr;
317
319
  struct ggml_tensor * ffn_down_scale = nullptr;
318
320
 
321
+ // altup & laurel
322
+ struct ggml_tensor * per_layer_inp_gate = nullptr;
323
+ struct ggml_tensor * per_layer_proj = nullptr;
324
+ struct ggml_tensor * per_layer_post_norm = nullptr;
325
+ struct ggml_tensor * altup_correct_coef = nullptr;
326
+ struct ggml_tensor * altup_correct_scale = nullptr;
327
+ struct ggml_tensor * altup_predict_coef = nullptr;
328
+ struct ggml_tensor * altup_router = nullptr;
329
+ struct ggml_tensor * altup_router_norm = nullptr;
330
+ struct ggml_tensor * laurel_l = nullptr;
331
+ struct ggml_tensor * laurel_r = nullptr;
332
+ struct ggml_tensor * laurel_post_norm = nullptr;
333
+
319
334
  struct llama_layer_posnet posnet;
320
335
 
321
336
  struct llama_layer_convnext convnext;
@@ -354,6 +369,13 @@ struct llama_model {
354
369
  struct ggml_tensor * conv1d = nullptr;
355
370
  struct ggml_tensor * conv1d_b = nullptr;
356
371
 
372
+ // gemma3n altup
373
+ struct ggml_tensor * tok_embd_per_layer = nullptr;
374
+ struct ggml_tensor * altup_proj = nullptr;
375
+ struct ggml_tensor * altup_unembd_proj = nullptr;
376
+ struct ggml_tensor * per_layer_model_proj = nullptr;
377
+ struct ggml_tensor * per_layer_proj_norm = nullptr;
378
+
357
379
  std::vector<llama_layer> layers;
358
380
 
359
381
  llama_model_params params;
@@ -1,5 +1,4 @@
1
1
  #include "llama-quant.h"
2
-
3
2
  #include "llama-impl.h"
4
3
  #include "llama-model.h"
5
4
  #include "llama-model-loader.h"
@@ -27,6 +26,56 @@ static void zeros(std::ofstream & file, size_t n) {
27
26
  }
28
27
  }
29
28
 
29
+ static std::string remap_layer(const std::string & orig_name, const std::vector<int> & prune, std::map<int, std::string> & mapped, int & next_id) {
30
+ if (prune.empty()) {
31
+ return orig_name;
32
+ }
33
+
34
+ static const std::regex pattern(R"(blk\.(\d+)\.)");
35
+ if (std::smatch match; std::regex_search(orig_name, match, pattern)) {
36
+ const int blk = std::stoi(match[1]);
37
+ std::string new_name = orig_name;
38
+
39
+ if (mapped.count(blk)) {
40
+ // Already mapped, do nothing
41
+ } else if (std::find(prune.begin(), prune.end(), blk) != prune.end()) {
42
+ mapped[blk] = "";
43
+ } else if (blk < prune.front()) {
44
+ mapped[blk] = std::to_string(blk);
45
+ next_id = blk + 1;
46
+ } else {
47
+ mapped[blk] = std::to_string(next_id);
48
+ ++next_id;
49
+ }
50
+
51
+ return mapped[blk].empty() ? mapped[blk] : new_name.replace(match.position(1), match.length(1), mapped[blk]);
52
+ }
53
+
54
+ return orig_name;
55
+ }
56
+
57
+ static std::string remap_imatrix (const std::string & orig_name, const std::map<int, std::string> & mapped) {
58
+ if (mapped.empty()) {
59
+ return orig_name;
60
+ }
61
+
62
+ static const std::regex pattern(R"(blk\.(\d+)\.)");
63
+ if (std::smatch match; std::regex_search(orig_name, match, pattern)) {
64
+ const std::string blk(match[1]);
65
+ std::string new_name = orig_name;
66
+
67
+ for (const auto & p : mapped) {
68
+ if (p.second == blk) {
69
+ LLAMA_LOG_DEBUG("(blk.%d imatrix) ", p.first);
70
+ return new_name.replace(match.position(1), match.length(1), std::to_string(p.first));
71
+ }
72
+ }
73
+ GGML_ABORT("\n%s: imatrix mapping error for %s\n", __func__, orig_name.c_str());
74
+ }
75
+
76
+ return orig_name;
77
+ }
78
+
30
79
  struct quantize_state_impl {
31
80
  const llama_model & model;
32
81
  const llama_model_quantize_params * params;
@@ -174,7 +223,7 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
174
223
  new_type = GGML_TYPE_Q6_K;
175
224
  }
176
225
  }
177
- } else if (name == "token_embd.weight") {
226
+ } else if (name == "token_embd.weight" || name == "per_layer_token_embd.weight") {
178
227
  if (qs.params->token_embedding_type < GGML_TYPE_COUNT) {
179
228
  new_type = qs.params->token_embedding_type;
180
229
  } else {
@@ -568,6 +617,11 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
568
617
  const size_t align = GGUF_DEFAULT_ALIGNMENT;
569
618
  gguf_context_ptr ctx_out { gguf_init_empty() };
570
619
 
620
+ std::vector<int> prune_list = {};
621
+ if (params->prune_layers) {
622
+ prune_list = *static_cast<const std::vector<int> *>(params->prune_layers);
623
+ }
624
+
571
625
  // copy the KV pairs from the input file
572
626
  gguf_set_kv (ctx_out.get(), ml.meta.get());
573
627
  gguf_set_val_u32(ctx_out.get(), "general.quantization_version", GGML_QNT_VERSION); // TODO: use LLM_KV
@@ -597,12 +651,32 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
597
651
  }
598
652
  }
599
653
 
654
+ std::map<int, std::string> mapped;
655
+ int blk_id = 0;
656
+ int pruned_attention_w = 0;
657
+
600
658
  // make a list of weights
601
659
  std::vector<const llama_model_loader::llama_tensor_weight *> tensors;
602
660
  tensors.reserve(ml.weights_map.size());
603
661
  for (const auto & it : ml.weights_map) {
662
+ const std::string remapped_name(remap_layer(it.first, prune_list, mapped, blk_id));
663
+ if (remapped_name.empty()) {
664
+ if (it.first.find("attn_v.weight") != std::string::npos ||
665
+ it.first.find("attn_qkv.weight") != std::string::npos ||
666
+ it.first.find("attn_kv_b.weight") != std::string::npos) {
667
+ pruned_attention_w++;
668
+ }
669
+ LLAMA_LOG_DEBUG("%s: pruning tensor %s\n", __func__, it.first.c_str());
670
+ continue;
671
+ } else if (remapped_name != it.first) {
672
+ ggml_set_name(it.second.tensor, remapped_name.c_str());
673
+ LLAMA_LOG_DEBUG("%s: tensor %s remapped to %s\n", __func__, it.first.c_str(), ggml_get_name(it.second.tensor));
674
+ }
604
675
  tensors.push_back(&it.second);
605
676
  }
677
+ if (!prune_list.empty()) {
678
+ gguf_set_val_u32(ctx_out.get(), ml.llm_kv(LLM_KV_BLOCK_COUNT).c_str(), blk_id);
679
+ }
606
680
 
607
681
  // keep_split requires that the weights are sorted by split index
608
682
  if (params->keep_split) {
@@ -640,7 +714,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
640
714
  if (llama_model_has_encoder(&model)) {
641
715
  n_attn_layer *= 3;
642
716
  }
643
- GGML_ASSERT((qs.n_attention_wv == n_attn_layer) && "n_attention_wv is unexpected");
717
+ GGML_ASSERT((qs.n_attention_wv == n_attn_layer - pruned_attention_w) && "n_attention_wv is unexpected");
644
718
  }
645
719
 
646
720
  size_t total_size_org = 0;
@@ -681,7 +755,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
681
755
  for (size_t i = 0; i < ctx_outs.size(); ++i) {
682
756
  gguf_set_val_u16(ctx_outs[i].get(), ml.llm_kv(LLM_KV_SPLIT_NO).c_str(), i);
683
757
  gguf_set_val_u16(ctx_outs[i].get(), ml.llm_kv(LLM_KV_SPLIT_COUNT).c_str(), n_split);
684
- gguf_set_val_i32(ctx_outs[i].get(), ml.llm_kv(LLM_KV_SPLIT_TENSORS_COUNT).c_str(), ml.n_tensors);
758
+ gguf_set_val_i32(ctx_outs[i].get(), ml.llm_kv(LLM_KV_SPLIT_TENSORS_COUNT).c_str(), (int32_t)tensors.size());
685
759
  }
686
760
  }
687
761
 
@@ -756,6 +830,13 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
756
830
  // NOTE: can't use LLM_TN here because the layer number is not known
757
831
  quantize &= name.find("ffn_gate_inp.weight") == std::string::npos;
758
832
 
833
+ // these are very small (e.g. 4x4)
834
+ quantize &= name.find("altup") == std::string::npos;
835
+ quantize &= name.find("laurel") == std::string::npos;
836
+
837
+ // these are not too big so keep them as it is
838
+ quantize &= name.find("per_layer_model_proj") == std::string::npos;
839
+
759
840
  // do not quantize positional embeddings and token types (BERT)
760
841
  quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_POS_EMBD, "weight");
761
842
  quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_TOKEN_TYPES, "weight");
@@ -832,7 +913,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
832
913
 
833
914
  const float * imatrix = nullptr;
834
915
  if (imatrix_data) {
835
- auto it = imatrix_data->find(tensor->name);
916
+ auto it = imatrix_data->find(remap_imatrix(tensor->name, mapped));
836
917
  if (it == imatrix_data->end()) {
837
918
  LLAMA_LOG_INFO("\n====== %s: did not find weights for %s\n", __func__, tensor->name);
838
919
  } else {
@@ -947,6 +1028,7 @@ llama_model_quantize_params llama_model_quantize_default_params() {
947
1028
  /*.imatrix =*/ nullptr,
948
1029
  /*.kv_overrides =*/ nullptr,
949
1030
  /*.tensor_type =*/ nullptr,
1031
+ /*.prune_layers =*/ nullptr
950
1032
  };
951
1033
 
952
1034
  return result;
@@ -1269,6 +1269,7 @@ struct llama_vocab::impl {
1269
1269
  bool add_space_prefix = false;
1270
1270
  bool add_bos = false;
1271
1271
  bool add_eos = false;
1272
+ bool add_sep = false;
1272
1273
  bool ignore_merges = false;
1273
1274
  bool clean_spaces = false; // clean_up_tokenization_spaces
1274
1275
  bool remove_extra_whitespaces = false;
@@ -1421,6 +1422,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
1421
1422
  special_sep_id = 102;
1422
1423
  special_pad_id = 0;
1423
1424
  special_mask_id = 103;
1425
+
1426
+ add_sep = true;
1424
1427
  } else if (tokenizer_model == "gpt2") {
1425
1428
  type = LLAMA_VOCAB_TYPE_BPE;
1426
1429
 
@@ -1550,12 +1553,15 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
1550
1553
  tokenizer_pre == "jina-es" ||
1551
1554
  tokenizer_pre == "jina-de" ||
1552
1555
  tokenizer_pre == "gigachat" ||
1553
- tokenizer_pre == "jina-v1-en" ||
1554
1556
  tokenizer_pre == "jina-v2-es" ||
1555
- tokenizer_pre == "jina-v2-de" ||
1557
+ tokenizer_pre == "jina-v2-de") {
1558
+ pre_type = LLAMA_VOCAB_PRE_TYPE_GPT2;
1559
+ } else if (
1560
+ tokenizer_pre == "jina-v1-en" ||
1556
1561
  tokenizer_pre == "jina-v2-code" ||
1557
1562
  tokenizer_pre == "roberta-bpe") {
1558
1563
  pre_type = LLAMA_VOCAB_PRE_TYPE_GPT2;
1564
+ add_sep = true;
1559
1565
  } else if (
1560
1566
  tokenizer_pre == "refact") {
1561
1567
  pre_type = LLAMA_VOCAB_PRE_TYPE_REFACT;
@@ -1665,6 +1671,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
1665
1671
  clean_spaces = true;
1666
1672
  add_bos = true;
1667
1673
  add_eos = false;
1674
+ add_sep = true;
1668
1675
  } else if (type == LLAMA_VOCAB_TYPE_UGM) {
1669
1676
  pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
1670
1677
  add_bos = false;
@@ -1801,7 +1808,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
1801
1808
  }
1802
1809
  }
1803
1810
 
1804
- // Handle add_bos and add_eos
1811
+ // Handle add_bos, add_eos and add_sep
1805
1812
  {
1806
1813
  bool temp = true;
1807
1814
 
@@ -1811,6 +1818,9 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
1811
1818
  if (ml.get_key(LLM_KV_TOKENIZER_ADD_EOS, temp, false)) {
1812
1819
  add_eos = temp;
1813
1820
  }
1821
+ if (ml.get_key(LLM_KV_TOKENIZER_ADD_SEP, temp, false)) {
1822
+ add_sep = temp;
1823
+ }
1814
1824
  }
1815
1825
 
1816
1826
  // auto-detect special tokens by text
@@ -3000,6 +3010,10 @@ bool llama_vocab::get_add_eos() const {
3000
3010
  return pimpl->add_eos;
3001
3011
  }
3002
3012
 
3013
+ bool llama_vocab::get_add_sep() const {
3014
+ return pimpl->add_sep;
3015
+ }
3016
+
3003
3017
  bool llama_vocab::get_ignore_merges() const {
3004
3018
  return pimpl->ignore_merges;
3005
3019
  }
@@ -3060,6 +3074,11 @@ int32_t llama_vocab::tokenize(
3060
3074
  bool add_special,
3061
3075
  bool parse_special) const {
3062
3076
  auto res = tokenize(std::string(text, text_len), add_special, parse_special);
3077
+ if (res.size() >= static_cast<size_t>(std::numeric_limits<int32_t>::max())) {
3078
+ LLAMA_LOG_ERROR("%s: tokenization result size %zu exceeds int32_t limit\n", __func__, res.size());
3079
+ return std::numeric_limits<int32_t>::min();
3080
+ }
3081
+
3063
3082
  if (n_tokens_max < (int) res.size()) {
3064
3083
  // LLAMA_LOG_ERROR("%s: too many tokens\n", __func__);
3065
3084
  return -((int) res.size());
@@ -3191,6 +3210,10 @@ bool llama_vocab_get_add_eos(const struct llama_vocab * vocab) {
3191
3210
  return vocab->get_add_eos();
3192
3211
  }
3193
3212
 
3213
+ bool llama_vocab_get_add_sep(const struct llama_vocab * vocab) {
3214
+ return vocab->get_add_sep();
3215
+ }
3216
+
3194
3217
  llama_token llama_vocab_fim_pre(const struct llama_vocab * vocab) {
3195
3218
  return vocab->token_fim_pre();
3196
3219
  }
@@ -74,6 +74,7 @@ struct llama_vocab {
74
74
  bool get_add_space_prefix () const;
75
75
  bool get_add_bos () const;
76
76
  bool get_add_eos () const;
77
+ bool get_add_sep () const;
77
78
  bool get_ignore_merges () const;
78
79
  bool get_clean_spaces () const;
79
80
  bool get_remove_extra_whitespaces () const;
package/cpp/rn-utils.h CHANGED
@@ -54,6 +54,7 @@ struct CompletionOptions {
54
54
  float top_p = 0.9f;
55
55
  float top_k = 40.0f;
56
56
  float min_p = 0.05f;
57
+ float presence_penalty = 0.0f; // for reducing repetitions (0-2 range)
57
58
  int n_keep = 0;
58
59
  int n_probs = 0; // for log probabilities
59
60
  bool post_sampling_probs = false;
@@ -77,6 +78,7 @@ struct CompletionOptions {
77
78
  {"top_p", top_p},
78
79
  {"top_k", top_k},
79
80
  {"min_p", min_p},
81
+ {"presence_penalty", presence_penalty},
80
82
  {"n_predict", n_predict},
81
83
  {"n_keep", n_keep},
82
84
  {"n_probs", n_probs},
@@ -147,6 +149,7 @@ struct CompletionOptions {
147
149
  data["top_p"] = top_p;
148
150
  data["max_tokens"] = n_predict;
149
151
  data["stream"] = stream;
152
+ data["presence_penalty"] = presence_penalty;
150
153
 
151
154
  if (seed >= 0) {
152
155
  data["seed"] = seed;
@@ -358,6 +358,7 @@ struct common_params {
358
358
  int32_t embd_normalize = 2; // normalisation for embeddings (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)
359
359
  std::string embd_out = ""; // empty = default, "array" = [[],[]...], "json" = openai style, "json+" = same "json" + cosine similarity matrix
360
360
  std::string embd_sep = "\n"; // separator of embeddings
361
+ std::string cls_sep = "\t"; // separator of classification sequences
361
362
 
362
363
  // server params
363
364
  int32_t port = 8080; // server listens on this network port
@@ -390,6 +390,7 @@ extern "C" {
390
390
  void * imatrix; // pointer to importance matrix data
391
391
  void * kv_overrides; // pointer to vector containing overrides
392
392
  void * tensor_types; // pointer to vector containing tensor types
393
+ void * prune_layers; // pointer to vector containing layer indices to prune
393
394
  } llama_model_quantize_params;
394
395
 
395
396
  typedef struct llama_logit_bias {
@@ -943,12 +944,14 @@ extern "C" {
943
944
  // Requires the context to have a memory.
944
945
  // For encode-decoder contexts, processes the batch using the decoder.
945
946
  // Positive return values does not mean a fatal error, but rather a warning.
946
- // Upon non-zero return values, the memory state is restored to the state before this call
947
+ // Upon fatal-error or abort, the ubatches that managed to be processed will remain in the memory state of the context
948
+ // To handle this correctly, query the memory state using llama_memory_seq_pos_min() and llama_memory_seq_pos_max()
949
+ // Upon other return values, the memory state is restored to the state before this call
947
950
  // 0 - success
948
951
  // 1 - could not find a KV slot for the batch (try reducing the size of the batch or increase the context)
949
- // 2 - aborted
952
+ // 2 - aborted (processed ubatches will remain in the context's memory)
950
953
  // -1 - invalid input batch
951
- // < -1 - error
954
+ // < -1 - fatal error (processed ubatches will remain in the context's memory)
952
955
  LLAMA_API int32_t llama_decode(
953
956
  struct llama_context * ctx,
954
957
  struct llama_batch batch);
@@ -1044,6 +1047,7 @@ extern "C" {
1044
1047
 
1045
1048
  LLAMA_API bool llama_vocab_get_add_bos(const struct llama_vocab * vocab);
1046
1049
  LLAMA_API bool llama_vocab_get_add_eos(const struct llama_vocab * vocab);
1050
+ LLAMA_API bool llama_vocab_get_add_sep(const struct llama_vocab * vocab);
1047
1051
 
1048
1052
  LLAMA_API llama_token llama_vocab_fim_pre(const struct llama_vocab * vocab);
1049
1053
  LLAMA_API llama_token llama_vocab_fim_suf(const struct llama_vocab * vocab);
@@ -1087,6 +1091,7 @@ extern "C" {
1087
1091
  /// @param tokens The tokens pointer must be large enough to hold the resulting tokens.
1088
1092
  /// @return Returns the number of tokens on success, no more than n_tokens_max
1089
1093
  /// @return Returns a negative number on failure - the number of tokens that would have been returned
1094
+ /// @return Returns INT32_MIN on overflow (e.g., tokenization result size exceeds int32_t limit)
1090
1095
  /// @param add_special Allow to add BOS and EOS tokens if model is configured to do so.
1091
1096
  /// @param parse_special Allow tokenizing special and/or control tokens which otherwise are not exposed and treated
1092
1097
  /// as plaintext. Does not insert a leading space.
@@ -6,11 +6,11 @@
6
6
  <array>
7
7
  <dict>
8
8
  <key>BinaryPath</key>
9
- <string>llama.framework/Versions/A/llama</string>
9
+ <string>llama.framework/llama</string>
10
10
  <key>DebugSymbolsPath</key>
11
11
  <string>dSYMs</string>
12
12
  <key>LibraryIdentifier</key>
13
- <string>macos-arm64_x86_64</string>
13
+ <string>ios-arm64_x86_64-simulator</string>
14
14
  <key>LibraryPath</key>
15
15
  <string>llama.framework</string>
16
16
  <key>SupportedArchitectures</key>
@@ -19,7 +19,9 @@
19
19
  <string>x86_64</string>
20
20
  </array>
21
21
  <key>SupportedPlatform</key>
22
- <string>macos</string>
22
+ <string>ios</string>
23
+ <key>SupportedPlatformVariant</key>
24
+ <string>simulator</string>
23
25
  </dict>
24
26
  <dict>
25
27
  <key>BinaryPath</key>
@@ -27,7 +29,7 @@
27
29
  <key>DebugSymbolsPath</key>
28
30
  <string>dSYMs</string>
29
31
  <key>LibraryIdentifier</key>
30
- <string>tvos-arm64_x86_64-simulator</string>
32
+ <string>xros-arm64_x86_64-simulator</string>
31
33
  <key>LibraryPath</key>
32
34
  <string>llama.framework</string>
33
35
  <key>SupportedArchitectures</key>
@@ -36,7 +38,7 @@
36
38
  <string>x86_64</string>
37
39
  </array>
38
40
  <key>SupportedPlatform</key>
39
- <string>tvos</string>
41
+ <string>xros</string>
40
42
  <key>SupportedPlatformVariant</key>
41
43
  <string>simulator</string>
42
44
  </dict>
@@ -46,7 +48,7 @@
46
48
  <key>DebugSymbolsPath</key>
47
49
  <string>dSYMs</string>
48
50
  <key>LibraryIdentifier</key>
49
- <string>xros-arm64</string>
51
+ <string>ios-arm64</string>
50
52
  <key>LibraryPath</key>
51
53
  <string>llama.framework</string>
52
54
  <key>SupportedArchitectures</key>
@@ -54,7 +56,7 @@
54
56
  <string>arm64</string>
55
57
  </array>
56
58
  <key>SupportedPlatform</key>
57
- <string>xros</string>
59
+ <string>ios</string>
58
60
  </dict>
59
61
  <dict>
60
62
  <key>BinaryPath</key>
@@ -62,7 +64,7 @@
62
64
  <key>DebugSymbolsPath</key>
63
65
  <string>dSYMs</string>
64
66
  <key>LibraryIdentifier</key>
65
- <string>xros-arm64_x86_64-simulator</string>
67
+ <string>tvos-arm64_x86_64-simulator</string>
66
68
  <key>LibraryPath</key>
67
69
  <string>llama.framework</string>
68
70
  <key>SupportedArchitectures</key>
@@ -71,7 +73,7 @@
71
73
  <string>x86_64</string>
72
74
  </array>
73
75
  <key>SupportedPlatform</key>
74
- <string>xros</string>
76
+ <string>tvos</string>
75
77
  <key>SupportedPlatformVariant</key>
76
78
  <string>simulator</string>
77
79
  </dict>
@@ -81,7 +83,7 @@
81
83
  <key>DebugSymbolsPath</key>
82
84
  <string>dSYMs</string>
83
85
  <key>LibraryIdentifier</key>
84
- <string>tvos-arm64</string>
86
+ <string>xros-arm64</string>
85
87
  <key>LibraryPath</key>
86
88
  <string>llama.framework</string>
87
89
  <key>SupportedArchitectures</key>
@@ -89,23 +91,24 @@
89
91
  <string>arm64</string>
90
92
  </array>
91
93
  <key>SupportedPlatform</key>
92
- <string>tvos</string>
94
+ <string>xros</string>
93
95
  </dict>
94
96
  <dict>
95
97
  <key>BinaryPath</key>
96
- <string>llama.framework/llama</string>
98
+ <string>llama.framework/Versions/A/llama</string>
97
99
  <key>DebugSymbolsPath</key>
98
100
  <string>dSYMs</string>
99
101
  <key>LibraryIdentifier</key>
100
- <string>ios-arm64</string>
102
+ <string>macos-arm64_x86_64</string>
101
103
  <key>LibraryPath</key>
102
104
  <string>llama.framework</string>
103
105
  <key>SupportedArchitectures</key>
104
106
  <array>
105
107
  <string>arm64</string>
108
+ <string>x86_64</string>
106
109
  </array>
107
110
  <key>SupportedPlatform</key>
108
- <string>ios</string>
111
+ <string>macos</string>
109
112
  </dict>
110
113
  <dict>
111
114
  <key>BinaryPath</key>
@@ -113,18 +116,15 @@
113
116
  <key>DebugSymbolsPath</key>
114
117
  <string>dSYMs</string>
115
118
  <key>LibraryIdentifier</key>
116
- <string>ios-arm64_x86_64-simulator</string>
119
+ <string>tvos-arm64</string>
117
120
  <key>LibraryPath</key>
118
121
  <string>llama.framework</string>
119
122
  <key>SupportedArchitectures</key>
120
123
  <array>
121
124
  <string>arm64</string>
122
- <string>x86_64</string>
123
125
  </array>
124
126
  <key>SupportedPlatform</key>
125
- <string>ios</string>
126
- <key>SupportedPlatformVariant</key>
127
- <string>simulator</string>
127
+ <string>tvos</string>
128
128
  </dict>
129
129
  </array>
130
130
  <key>CFBundlePackageType</key>