llama_cpp 0.15.3 → 0.16.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (149)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +16 -0
  3. data/ext/llama_cpp/extconf.rb +1 -2
  4. data/ext/llama_cpp/llama_cpp.cpp +27 -3
  5. data/lib/llama_cpp/version.rb +2 -2
  6. data/sig/llama_cpp.rbs +15 -1
  7. data/vendor/tmp/llama.cpp/Makefile +66 -36
  8. data/vendor/tmp/llama.cpp/ggml-alloc.c +4 -4
  9. data/vendor/tmp/llama.cpp/ggml-backend.c +5 -5
  10. data/vendor/tmp/llama.cpp/ggml-backend.h +1 -1
  11. data/vendor/tmp/llama.cpp/ggml-cuda/acc.cu +47 -0
  12. data/vendor/tmp/llama.cpp/ggml-cuda/arange.cu +34 -0
  13. data/vendor/tmp/llama.cpp/ggml-cuda/argsort.cu +103 -0
  14. data/vendor/tmp/llama.cpp/ggml-cuda/binbcast.cu +280 -0
  15. data/vendor/tmp/llama.cpp/ggml-cuda/clamp.cu +34 -0
  16. data/vendor/tmp/llama.cpp/ggml-cuda/concat.cu +196 -0
  17. data/vendor/tmp/llama.cpp/ggml-cuda/convert.cu +686 -0
  18. data/vendor/tmp/llama.cpp/ggml-cuda/cpy.cu +490 -0
  19. data/vendor/tmp/llama.cpp/ggml-cuda/diagmask.cu +40 -0
  20. data/vendor/tmp/llama.cpp/ggml-cuda/dmmv.cu +662 -0
  21. data/vendor/tmp/llama.cpp/ggml-cuda/fattn-tile-f16.cu +319 -0
  22. data/vendor/tmp/llama.cpp/ggml-cuda/fattn-tile-f32.cu +312 -0
  23. data/vendor/tmp/llama.cpp/ggml-cuda/fattn.cu +345 -0
  24. data/vendor/tmp/llama.cpp/ggml-cuda/getrows.cu +178 -0
  25. data/vendor/tmp/llama.cpp/ggml-cuda/im2col.cu +104 -0
  26. data/vendor/tmp/llama.cpp/ggml-cuda/mmq.cu +1564 -0
  27. data/vendor/tmp/llama.cpp/ggml-cuda/mmvq.cu +404 -0
  28. data/vendor/tmp/llama.cpp/ggml-cuda/norm.cu +221 -0
  29. data/vendor/tmp/llama.cpp/ggml-cuda/pad.cu +49 -0
  30. data/vendor/tmp/llama.cpp/ggml-cuda/pool2d.cu +94 -0
  31. data/vendor/tmp/llama.cpp/ggml-cuda/quantize.cu +45 -0
  32. data/vendor/tmp/llama.cpp/ggml-cuda/rope.cu +271 -0
  33. data/vendor/tmp/llama.cpp/ggml-cuda/scale.cu +31 -0
  34. data/vendor/tmp/llama.cpp/ggml-cuda/softmax.cu +205 -0
  35. data/vendor/tmp/llama.cpp/ggml-cuda/sumrows.cu +40 -0
  36. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu +5 -0
  37. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu +5 -0
  38. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu +5 -0
  39. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu +5 -0
  40. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu +5 -0
  41. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu +5 -0
  42. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu +5 -0
  43. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu +5 -0
  44. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu +5 -0
  45. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu +5 -0
  46. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu +5 -0
  47. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu +5 -0
  48. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu +5 -0
  49. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu +5 -0
  50. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu +5 -0
  51. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu +5 -0
  52. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu +5 -0
  53. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu +5 -0
  54. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu +5 -0
  55. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu +5 -0
  56. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu +5 -0
  57. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu +5 -0
  58. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu +5 -0
  59. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu +5 -0
  60. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu +5 -0
  61. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu +5 -0
  62. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu +5 -0
  63. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu +5 -0
  64. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu +5 -0
  65. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu +5 -0
  66. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu +5 -0
  67. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu +5 -0
  68. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu +5 -0
  69. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu +5 -0
  70. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu +5 -0
  71. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu +5 -0
  72. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu +5 -0
  73. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu +5 -0
  74. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu +5 -0
  75. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu +5 -0
  76. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu +5 -0
  77. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu +5 -0
  78. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu +5 -0
  79. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu +5 -0
  80. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu +5 -0
  81. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu +5 -0
  82. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu +5 -0
  83. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu +5 -0
  84. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu +5 -0
  85. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu +5 -0
  86. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu +5 -0
  87. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu +5 -0
  88. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu +5 -0
  89. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu +5 -0
  90. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu +5 -0
  91. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu +5 -0
  92. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu +5 -0
  93. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu +5 -0
  94. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu +5 -0
  95. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu +5 -0
  96. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu +5 -0
  97. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu +5 -0
  98. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu +5 -0
  99. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu +5 -0
  100. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu +5 -0
  101. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu +5 -0
  102. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu +5 -0
  103. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu +5 -0
  104. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu +5 -0
  105. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu +5 -0
  106. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu +5 -0
  107. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu +5 -0
  108. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu +5 -0
  109. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu +5 -0
  110. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu +5 -0
  111. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu +5 -0
  112. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu +5 -0
  113. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu +5 -0
  114. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu +5 -0
  115. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu +5 -0
  116. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu +5 -0
  117. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu +5 -0
  118. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu +5 -0
  119. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu +5 -0
  120. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu +5 -0
  121. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu +5 -0
  122. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb16.cu +10 -0
  123. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb32.cu +9 -0
  124. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb16.cu +10 -0
  125. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb32.cu +10 -0
  126. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb8.cu +8 -0
  127. data/vendor/tmp/llama.cpp/ggml-cuda/tsembd.cu +47 -0
  128. data/vendor/tmp/llama.cpp/ggml-cuda/unary.cu +266 -0
  129. data/vendor/tmp/llama.cpp/ggml-cuda/upscale.cu +51 -0
  130. data/vendor/tmp/llama.cpp/ggml-cuda.cu +35 -16
  131. data/vendor/tmp/llama.cpp/ggml-impl.h +4 -0
  132. data/vendor/tmp/llama.cpp/ggml-kompute.cpp +21 -7
  133. data/vendor/tmp/llama.cpp/ggml-metal.h +1 -1
  134. data/vendor/tmp/llama.cpp/ggml-metal.m +99 -35
  135. data/vendor/tmp/llama.cpp/ggml-metal.metal +146 -80
  136. data/vendor/tmp/llama.cpp/ggml-quants.c +101 -11
  137. data/vendor/tmp/llama.cpp/ggml-rpc.cpp +75 -58
  138. data/vendor/tmp/llama.cpp/ggml-sycl.cpp +345 -227
  139. data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +99301 -39793
  140. data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +458 -329
  141. data/vendor/tmp/llama.cpp/ggml.c +301 -409
  142. data/vendor/tmp/llama.cpp/ggml.h +19 -23
  143. data/vendor/tmp/llama.cpp/llama.cpp +855 -651
  144. data/vendor/tmp/llama.cpp/llama.h +28 -48
  145. metadata +121 -6
  146. data/vendor/tmp/llama.cpp/ggml-mpi.c +0 -216
  147. data/vendor/tmp/llama.cpp/ggml-mpi.h +0 -39
  148. data/vendor/tmp/llama.cpp/ggml-opencl.cpp +0 -2305
  149. data/vendor/tmp/llama.cpp/ggml-opencl.h +0 -36
@@ -13,8 +13,6 @@
13
13
 
14
14
  #ifdef GGML_USE_CUDA
15
15
  # include "ggml-cuda.h"
16
- #elif defined(GGML_USE_CLBLAST)
17
- # include "ggml-opencl.h"
18
16
  #elif defined(GGML_USE_VULKAN)
19
17
  # include "ggml-vulkan.h"
20
18
  #elif defined(GGML_USE_SYCL)
@@ -103,14 +101,14 @@
103
101
  #endif
104
102
 
105
103
  #define LLAMA_MAX_NODES 8192
106
- #define LLAMA_MAX_EXPERTS 128
104
+ #define LLAMA_MAX_EXPERTS 160
107
105
 
108
106
  //
109
107
  // logging
110
108
  //
111
109
 
112
110
  LLAMA_ATTRIBUTE_FORMAT(2, 3)
113
- static void llama_log_internal (ggml_log_level level, const char* format, ...);
111
+ static void llama_log_internal (ggml_log_level level, const char * format, ...);
114
112
  static void llama_log_callback_default(ggml_log_level level, const char * text, void * user_data);
115
113
 
116
114
  #define LLAMA_LOG_INFO(...) llama_log_internal(GGML_LOG_LEVEL_INFO , __VA_ARGS__)
@@ -222,6 +220,7 @@ enum llm_arch {
222
220
  LLM_ARCH_DBRX,
223
221
  LLM_ARCH_OLMO,
224
222
  LLM_ARCH_ARCTIC,
223
+ LLM_ARCH_DEEPSEEK2,
225
224
  LLM_ARCH_UNKNOWN,
226
225
  };
227
226
 
@@ -259,6 +258,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
259
258
  { LLM_ARCH_DBRX, "dbrx" },
260
259
  { LLM_ARCH_OLMO, "olmo" },
261
260
  { LLM_ARCH_ARCTIC, "arctic" },
261
+ { LLM_ARCH_DEEPSEEK2, "deepseek2" },
262
262
  { LLM_ARCH_UNKNOWN, "(unknown)" },
263
263
  };
264
264
 
@@ -279,11 +279,15 @@ enum llm_kv {
279
279
  LLM_KV_CONTEXT_LENGTH,
280
280
  LLM_KV_EMBEDDING_LENGTH,
281
281
  LLM_KV_BLOCK_COUNT,
282
+ LLM_KV_LEADING_DENSE_BLOCK_COUNT,
282
283
  LLM_KV_FEED_FORWARD_LENGTH,
284
+ LLM_KV_EXPERT_FEED_FORWARD_LENGTH,
283
285
  LLM_KV_USE_PARALLEL_RESIDUAL,
284
286
  LLM_KV_TENSOR_DATA_LAYOUT,
285
287
  LLM_KV_EXPERT_COUNT,
286
288
  LLM_KV_EXPERT_USED_COUNT,
289
+ LLM_KV_EXPERT_SHARED_COUNT,
290
+ LLM_KV_EXPERT_WEIGHTS_SCALE,
287
291
  LLM_KV_POOLING_TYPE,
288
292
  LLM_KV_LOGIT_SCALE,
289
293
 
@@ -296,6 +300,8 @@ enum llm_kv {
296
300
  LLM_KV_ATTENTION_LAYERNORM_EPS,
297
301
  LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,
298
302
  LLM_KV_ATTENTION_CAUSAL,
303
+ LLM_KV_ATTENTION_Q_LORA_RANK,
304
+ LLM_KV_ATTENTION_KV_LORA_RANK,
299
305
 
300
306
  LLM_KV_ROPE_DIMENSION_COUNT,
301
307
  LLM_KV_ROPE_FREQ_BASE,
@@ -305,6 +311,7 @@ enum llm_kv {
305
311
  LLM_KV_ROPE_SCALING_ATTN_FACTOR,
306
312
  LLM_KV_ROPE_SCALING_ORIG_CTX_LEN,
307
313
  LLM_KV_ROPE_SCALING_FINETUNED,
314
+ LLM_KV_ROPE_SCALING_YARN_LOG_MUL,
308
315
 
309
316
  LLM_KV_SPLIT_NO,
310
317
  LLM_KV_SPLIT_COUNT,
@@ -353,17 +360,21 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
353
360
  { LLM_KV_GENERAL_SOURCE_URL, "general.source.url" },
354
361
  { LLM_KV_GENERAL_SOURCE_HF_REPO, "general.source.huggingface.repository" },
355
362
 
356
- { LLM_KV_VOCAB_SIZE, "%s.vocab_size" },
357
- { LLM_KV_CONTEXT_LENGTH, "%s.context_length" },
358
- { LLM_KV_EMBEDDING_LENGTH, "%s.embedding_length" },
359
- { LLM_KV_BLOCK_COUNT, "%s.block_count" },
360
- { LLM_KV_FEED_FORWARD_LENGTH, "%s.feed_forward_length" },
361
- { LLM_KV_USE_PARALLEL_RESIDUAL, "%s.use_parallel_residual" },
362
- { LLM_KV_TENSOR_DATA_LAYOUT, "%s.tensor_data_layout" },
363
- { LLM_KV_EXPERT_COUNT, "%s.expert_count" },
364
- { LLM_KV_EXPERT_USED_COUNT, "%s.expert_used_count" },
365
- { LLM_KV_POOLING_TYPE , "%s.pooling_type" },
366
- { LLM_KV_LOGIT_SCALE, "%s.logit_scale" },
363
+ { LLM_KV_VOCAB_SIZE, "%s.vocab_size" },
364
+ { LLM_KV_CONTEXT_LENGTH, "%s.context_length" },
365
+ { LLM_KV_EMBEDDING_LENGTH, "%s.embedding_length" },
366
+ { LLM_KV_BLOCK_COUNT, "%s.block_count" },
367
+ { LLM_KV_LEADING_DENSE_BLOCK_COUNT, "%s.leading_dense_block_count" },
368
+ { LLM_KV_FEED_FORWARD_LENGTH, "%s.feed_forward_length" },
369
+ { LLM_KV_EXPERT_FEED_FORWARD_LENGTH, "%s.expert_feed_forward_length" },
370
+ { LLM_KV_USE_PARALLEL_RESIDUAL, "%s.use_parallel_residual" },
371
+ { LLM_KV_TENSOR_DATA_LAYOUT, "%s.tensor_data_layout" },
372
+ { LLM_KV_EXPERT_COUNT, "%s.expert_count" },
373
+ { LLM_KV_EXPERT_USED_COUNT, "%s.expert_used_count" },
374
+ { LLM_KV_EXPERT_SHARED_COUNT, "%s.expert_shared_count" },
375
+ { LLM_KV_EXPERT_WEIGHTS_SCALE, "%s.expert_weights_scale" },
376
+ { LLM_KV_POOLING_TYPE , "%s.pooling_type" },
377
+ { LLM_KV_LOGIT_SCALE, "%s.logit_scale" },
367
378
 
368
379
  { LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count" },
369
380
  { LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" },
@@ -374,6 +385,8 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
374
385
  { LLM_KV_ATTENTION_LAYERNORM_EPS, "%s.attention.layer_norm_epsilon" },
375
386
  { LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, "%s.attention.layer_norm_rms_epsilon" },
376
387
  { LLM_KV_ATTENTION_CAUSAL, "%s.attention.causal" },
388
+ { LLM_KV_ATTENTION_Q_LORA_RANK, "%s.attention.q_lora_rank" },
389
+ { LLM_KV_ATTENTION_KV_LORA_RANK, "%s.attention.kv_lora_rank" },
377
390
 
378
391
  { LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
379
392
  { LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" },
@@ -383,6 +396,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
383
396
  { LLM_KV_ROPE_SCALING_ATTN_FACTOR, "%s.rope.scaling.attn_factor" },
384
397
  { LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, "%s.rope.scaling.original_context_length" },
385
398
  { LLM_KV_ROPE_SCALING_FINETUNED, "%s.rope.scaling.finetuned" },
399
+ { LLM_KV_ROPE_SCALING_YARN_LOG_MUL, "%s.rope.scaling.yarn_log_multiplier" },
386
400
 
387
401
  { LLM_KV_SPLIT_NO, "split.no" },
388
402
  { LLM_KV_SPLIT_COUNT, "split.count" },
@@ -474,6 +488,12 @@ enum llm_tensor {
474
488
  LLM_TENSOR_SSM_A,
475
489
  LLM_TENSOR_SSM_D,
476
490
  LLM_TENSOR_SSM_OUT,
491
+ LLM_TENSOR_ATTN_Q_A,
492
+ LLM_TENSOR_ATTN_Q_B,
493
+ LLM_TENSOR_ATTN_KV_A_MQA,
494
+ LLM_TENSOR_ATTN_KV_B,
495
+ LLM_TENSOR_ATTN_Q_A_NORM,
496
+ LLM_TENSOR_ATTN_KV_A_NORM,
477
497
  };
478
498
 
479
499
  static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES = {
@@ -1057,6 +1077,35 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
1057
1077
  { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
1058
1078
  },
1059
1079
  },
1080
+ {
1081
+ LLM_ARCH_DEEPSEEK2,
1082
+ {
1083
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
1084
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
1085
+ { LLM_TENSOR_OUTPUT, "output" },
1086
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
1087
+ { LLM_TENSOR_ATTN_Q_A_NORM, "blk.%d.attn_q_a_norm" },
1088
+ { LLM_TENSOR_ATTN_KV_A_NORM, "blk.%d.attn_kv_a_norm" },
1089
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
1090
+ { LLM_TENSOR_ATTN_Q_A, "blk.%d.attn_q_a" },
1091
+ { LLM_TENSOR_ATTN_Q_B, "blk.%d.attn_q_b" },
1092
+ { LLM_TENSOR_ATTN_KV_A_MQA, "blk.%d.attn_kv_a_mqa" },
1093
+ { LLM_TENSOR_ATTN_KV_B, "blk.%d.attn_kv_b" },
1094
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
1095
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
1096
+ { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
1097
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
1098
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
1099
+ { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
1100
+ { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
1101
+ { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
1102
+ { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
1103
+ { LLM_TENSOR_FFN_GATE_INP_SHEXP, "blk.%d.ffn_gate_inp_shexp" },
1104
+ { LLM_TENSOR_FFN_GATE_SHEXP, "blk.%d.ffn_gate_shexp" },
1105
+ { LLM_TENSOR_FFN_DOWN_SHEXP, "blk.%d.ffn_down_shexp" },
1106
+ { LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
1107
+ },
1108
+ },
1060
1109
  {
1061
1110
  LLM_ARCH_UNKNOWN,
1062
1111
  {
@@ -1651,12 +1700,13 @@ struct llama_mlock {
1651
1700
  };
1652
1701
  using llama_mlocks = std::vector<std::unique_ptr<llama_mlock>>;
1653
1702
 
1654
- static std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token, bool special) {
1703
+ // NOTE: avoid ever using this except for building the token_to_piece caches
1704
+ static std::string llama_token_to_piece(const struct llama_model * model, llama_token token, bool special) {
1655
1705
  std::vector<char> result(8, 0);
1656
- const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), special);
1706
+ const int n_tokens = llama_token_to_piece(model, token, result.data(), result.size(), special);
1657
1707
  if (n_tokens < 0) {
1658
1708
  result.resize(-n_tokens);
1659
- int check = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), special);
1709
+ int check = llama_token_to_piece(model, token, result.data(), result.size(), special);
1660
1710
  GGML_ASSERT(check == -n_tokens);
1661
1711
  }
1662
1712
  else {
@@ -1741,6 +1791,7 @@ enum e_model {
1741
1791
  MODEL_13B,
1742
1792
  MODEL_14B,
1743
1793
  MODEL_15B,
1794
+ MODEL_16B,
1744
1795
  MODEL_20B,
1745
1796
  MODEL_30B,
1746
1797
  MODEL_34B,
@@ -1748,6 +1799,7 @@ enum e_model {
1748
1799
  MODEL_40B,
1749
1800
  MODEL_65B,
1750
1801
  MODEL_70B,
1802
+ MODEL_236B,
1751
1803
  MODEL_314B,
1752
1804
  MODEL_SMALL,
1753
1805
  MODEL_MEDIUM,
@@ -1783,13 +1835,21 @@ struct llama_hparams {
1783
1835
  uint32_t n_expert_used = 0;
1784
1836
  uint32_t n_vocab_type = 0; // for BERT-style token types
1785
1837
 
1838
+ uint32_t n_layer_dense_lead = 0;
1839
+ uint32_t n_lora_q = 0;
1840
+ uint32_t n_lora_kv = 0;
1841
+ uint32_t n_ff_exp = 0;
1842
+ uint32_t n_expert_shared = 0;
1843
+ float expert_weights_scale = 0.0;
1844
+
1786
1845
  float f_norm_eps;
1787
1846
  float f_norm_rms_eps;
1788
1847
 
1789
1848
  float rope_attn_factor = 1.0f;
1790
1849
  float rope_freq_base_train;
1791
1850
  float rope_freq_scale_train;
1792
- uint32_t n_yarn_orig_ctx;
1851
+ uint32_t n_ctx_orig_yarn;
1852
+ float rope_yarn_log_mul;
1793
1853
 
1794
1854
  // for State Space Models
1795
1855
  uint32_t ssm_d_conv = 0;
@@ -1823,8 +1883,14 @@ struct llama_hparams {
1823
1883
  if (this->n_expert != other.n_expert) return true;
1824
1884
  if (this->n_expert_used != other.n_expert_used) return true;
1825
1885
 
1886
+ if (this->n_layer_dense_lead != other.n_layer_dense_lead) return true;
1887
+ if (this->n_lora_q != other.n_lora_q) return true;
1888
+ if (this->n_lora_kv != other.n_lora_kv) return true;
1889
+ if (this->n_ff_exp != other.n_ff_exp) return true;
1890
+ if (this->n_expert_shared != other.n_expert_shared) return true;
1891
+
1826
1892
  if (this->rope_finetuned != other.rope_finetuned) return true;
1827
- if (this->n_yarn_orig_ctx != other.n_yarn_orig_ctx) return true;
1893
+ if (this->n_ctx_orig_yarn != other.n_ctx_orig_yarn) return true;
1828
1894
 
1829
1895
  if (this->ssm_d_conv != other.ssm_d_conv) return true;
1830
1896
  if (this->ssm_d_inner != other.ssm_d_inner) return true;
@@ -1838,6 +1904,8 @@ struct llama_hparams {
1838
1904
  if (!is_float_close(this->rope_attn_factor, other.rope_attn_factor, EPSILON)) return true;
1839
1905
  if (!is_float_close(this->rope_freq_base_train, other.rope_freq_base_train, EPSILON)) return true;
1840
1906
  if (!is_float_close(this->rope_freq_scale_train, other.rope_freq_scale_train, EPSILON)) return true;
1907
+ if (!is_float_close(this->expert_weights_scale, other.expert_weights_scale, EPSILON)) return true;
1908
+ if (!is_float_close(this->rope_yarn_log_mul, other.rope_yarn_log_mul, EPSILON)) return true;
1841
1909
 
1842
1910
  return false;
1843
1911
  }
@@ -1881,7 +1949,7 @@ struct llama_cparams {
1881
1949
  float rope_freq_base;
1882
1950
  float rope_freq_scale;
1883
1951
 
1884
- uint32_t n_yarn_orig_ctx;
1952
+ uint32_t n_ctx_orig_yarn;
1885
1953
  // These hyperparameters are not exposed in GGUF, because all
1886
1954
  // existing YaRN models use the same values for them.
1887
1955
  float yarn_ext_factor;
@@ -1913,6 +1981,8 @@ struct llama_layer {
1913
1981
  struct ggml_tensor * attn_k_norm_b;
1914
1982
  struct ggml_tensor * attn_out_norm;
1915
1983
  struct ggml_tensor * attn_out_norm_b;
1984
+ struct ggml_tensor * attn_q_a_norm;
1985
+ struct ggml_tensor * attn_kv_a_norm;
1916
1986
 
1917
1987
  // attention
1918
1988
  struct ggml_tensor * wq;
@@ -1920,6 +1990,10 @@ struct llama_layer {
1920
1990
  struct ggml_tensor * wv;
1921
1991
  struct ggml_tensor * wo;
1922
1992
  struct ggml_tensor * wqkv;
1993
+ struct ggml_tensor * wq_a;
1994
+ struct ggml_tensor * wq_b;
1995
+ struct ggml_tensor * wkv_a_mqa;
1996
+ struct ggml_tensor * wkv_b;
1923
1997
 
1924
1998
  // attention bias
1925
1999
  struct ggml_tensor * bq;
@@ -1953,8 +2027,9 @@ struct llama_layer {
1953
2027
  struct ggml_tensor * ffn_up_shexp;
1954
2028
 
1955
2029
  // ff bias
1956
- struct ggml_tensor * ffn_down_b; // b2
1957
- struct ggml_tensor * ffn_up_b; // b3
2030
+ struct ggml_tensor * ffn_gate_b = nullptr;
2031
+ struct ggml_tensor * ffn_down_b = nullptr; // b2
2032
+ struct ggml_tensor * ffn_up_b = nullptr; // b3
1958
2033
  struct ggml_tensor * ffn_act;
1959
2034
 
1960
2035
  // mamba proj
@@ -2072,12 +2147,12 @@ struct llama_control_vector {
2072
2147
  struct llama_vocab {
2073
2148
  using id = int32_t;
2074
2149
  using token = std::string;
2075
- using ttype = llama_token_type;
2150
+ using tattr = llama_token_attr;
2076
2151
 
2077
2152
  struct token_data {
2078
2153
  token text;
2079
2154
  float score;
2080
- ttype type;
2155
+ tattr attr;
2081
2156
  };
2082
2157
 
2083
2158
  enum llama_vocab_type type = LLAMA_VOCAB_TYPE_SPM;
@@ -2086,7 +2161,8 @@ struct llama_vocab {
2086
2161
  std::unordered_map<token, id> token_to_id;
2087
2162
  std::vector<token_data> id_to_token;
2088
2163
 
2089
- std::unordered_map<token, id> special_tokens_cache;
2164
+ std::vector<id> cache_special_tokens;
2165
+ std::vector<token> cache_token_to_piece; // llama_token_to_piece(special = true);
2090
2166
 
2091
2167
  std::map<std::pair<std::string, std::string>, int> bpe_ranks;
2092
2168
 
@@ -2293,13 +2369,34 @@ struct llama_context {
2293
2369
  struct llama_control_vector cvec;
2294
2370
  };
2295
2371
 
2372
+ static size_t llama_get_device_count(const llama_model & model) {
2373
+ size_t count = 1;
2374
+ #if defined(GGML_USE_CUDA)
2375
+ count = ggml_backend_cuda_get_device_count();
2376
+ #elif defined(GGML_USE_SYCL)
2377
+ count = ggml_backend_sycl_get_device_count();
2378
+ #elif defined(GGML_USE_VULKAN)
2379
+ count = ggml_backend_vk_get_device_count();
2380
+ #endif
2381
+ #if defined(GGML_USE_RPC)
2382
+ count += model.rpc_servers.size();
2383
+ #endif
2384
+ return count;
2385
+ GGML_UNUSED(model);
2386
+ }
2387
+
2296
2388
  static ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_model & model, int gpu) {
2297
2389
  ggml_backend_buffer_type_t buft = nullptr;
2298
2390
 
2299
- #ifdef GGML_USE_RPC
2300
- std::string endpoint = model.rpc_servers[gpu];
2301
- buft = ggml_backend_rpc_buffer_type(endpoint.c_str());
2302
- #elif defined(GGML_USE_METAL)
2391
+ #if defined(GGML_USE_RPC)
2392
+ int dev_count = (int)llama_get_device_count(model);
2393
+ int rpc_count = (int)model.rpc_servers.size();
2394
+ if (gpu >= dev_count - rpc_count) {
2395
+ const char * endpoint = model.rpc_servers[gpu - dev_count + rpc_count].c_str();
2396
+ return ggml_backend_rpc_buffer_type(endpoint);
2397
+ }
2398
+ #endif
2399
+ #if defined(GGML_USE_METAL)
2303
2400
  buft = ggml_backend_metal_buffer_type();
2304
2401
  #elif defined(GGML_USE_CUDA)
2305
2402
  buft = ggml_backend_cuda_buffer_type(gpu);
@@ -2307,8 +2404,6 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_
2307
2404
  buft = ggml_backend_vk_buffer_type(gpu);
2308
2405
  #elif defined(GGML_USE_SYCL)
2309
2406
  buft = ggml_backend_sycl_buffer_type(gpu);
2310
- #elif defined(GGML_USE_CLBLAST)
2311
- buft = ggml_backend_opencl_buffer_type();
2312
2407
  #elif defined(GGML_USE_KOMPUTE)
2313
2408
  buft = ggml_backend_kompute_buffer_type(gpu);
2314
2409
  if (buft == nullptr) {
@@ -2347,29 +2442,19 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_split(const llama_mo
2347
2442
  GGML_UNUSED(tensor_split);
2348
2443
  }
2349
2444
 
2350
- static size_t llama_get_device_count(const llama_model & model) {
2351
- #if defined(GGML_USE_RPC)
2352
- return model.rpc_servers.size();
2353
- #elif defined(GGML_USE_CUDA)
2354
- return ggml_backend_cuda_get_device_count();
2355
- #elif defined(GGML_USE_SYCL)
2356
- return ggml_backend_sycl_get_device_count();
2357
- #elif defined(GGML_USE_VULKAN)
2358
- return ggml_backend_vk_get_device_count();
2359
- #else
2360
- return 1;
2361
- #endif
2362
- GGML_UNUSED(model);
2363
- }
2364
-
2365
2445
  static size_t llama_get_device_memory(const llama_model & model, int device) {
2366
2446
  #if defined(GGML_USE_RPC)
2367
- size_t total;
2368
- size_t free;
2369
- std::string endpoint = model.rpc_servers[device];
2370
- ggml_backend_rpc_get_device_memory(endpoint.c_str(), &free, &total);
2371
- return free;
2372
- #elif defined(GGML_USE_CUDA)
2447
+ int dev_count = (int)llama_get_device_count(model);
2448
+ int rpc_count = (int)model.rpc_servers.size();
2449
+ if (device >= dev_count - rpc_count) {
2450
+ size_t total;
2451
+ size_t free;
2452
+ const char * endpoint = model.rpc_servers[device - dev_count + rpc_count].c_str();
2453
+ ggml_backend_rpc_get_device_memory(endpoint, &free, &total);
2454
+ return free;
2455
+ }
2456
+ #endif
2457
+ #if defined(GGML_USE_CUDA)
2373
2458
  size_t total;
2374
2459
  size_t free;
2375
2460
  ggml_backend_cuda_get_device_memory(device, &free, &total);
@@ -2441,10 +2526,6 @@ static bool llama_kv_cache_init(
2441
2526
  }
2442
2527
  }
2443
2528
 
2444
- #ifdef GGML_USE_CLBLAST
2445
- offload = false;
2446
- #endif
2447
-
2448
2529
  // count used buffer types
2449
2530
  std::map<ggml_backend_buffer_type_t, int> buft_layer_count;
2450
2531
  if (offload) {
@@ -3832,6 +3913,7 @@ static const char * llama_model_type_name(e_model type) {
3832
3913
  case MODEL_13B: return "13B";
3833
3914
  case MODEL_14B: return "14B";
3834
3915
  case MODEL_15B: return "15B";
3916
+ case MODEL_16B: return "16B";
3835
3917
  case MODEL_20B: return "20B";
3836
3918
  case MODEL_30B: return "30B";
3837
3919
  case MODEL_34B: return "34B";
@@ -3839,6 +3921,7 @@ static const char * llama_model_type_name(e_model type) {
3839
3921
  case MODEL_40B: return "40B";
3840
3922
  case MODEL_65B: return "65B";
3841
3923
  case MODEL_70B: return "70B";
3924
+ case MODEL_236B: return "236B";
3842
3925
  case MODEL_314B: return "314B";
3843
3926
  case MODEL_SMALL: return "0.1B";
3844
3927
  case MODEL_MEDIUM: return "0.4B";
@@ -3922,8 +4005,8 @@ static void llm_load_hparams(
3922
4005
  ml.get_key(LLM_KV_ROPE_SCALING_FINETUNED, rope_finetuned, false);
3923
4006
  hparams.rope_finetuned = rope_finetuned;
3924
4007
 
3925
- hparams.n_yarn_orig_ctx = hparams.n_ctx_train;
3926
- ml.get_key(LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, hparams.n_yarn_orig_ctx, false);
4008
+ hparams.n_ctx_orig_yarn = hparams.n_ctx_train;
4009
+ ml.get_key(LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, hparams.n_ctx_orig_yarn, false);
3927
4010
 
3928
4011
  // rope_freq_base (optional)
3929
4012
  hparams.rope_freq_base_train = 10000.0f;
@@ -3981,7 +4064,9 @@ static void llm_load_hparams(
3981
4064
  switch (hparams.n_layer) {
3982
4065
  case 22: model.type = e_model::MODEL_1B; break;
3983
4066
  case 26: model.type = e_model::MODEL_3B; break;
3984
- case 32: model.type = hparams.n_vocab < 40000 ? e_model::MODEL_7B : e_model::MODEL_8B; break;
4067
+ // granite uses a vocab with len 49152
4068
+ case 32: model.type = hparams.n_vocab == 49152 ? e_model::MODEL_3B : (hparams.n_vocab < 40000 ? e_model::MODEL_7B : e_model::MODEL_8B); break;
4069
+ case 36: model.type = e_model::MODEL_8B; break; // granite
3985
4070
  case 40: model.type = e_model::MODEL_13B; break;
3986
4071
  case 48: model.type = e_model::MODEL_34B; break;
3987
4072
  case 60: model.type = e_model::MODEL_30B; break;
@@ -4251,6 +4336,8 @@ static void llm_load_hparams(
4251
4336
  case 30: model.type = e_model::MODEL_3B; break;
4252
4337
  case 32: model.type = e_model::MODEL_7B; break;
4253
4338
  case 40: model.type = e_model::MODEL_15B; break;
4339
+ case 52: model.type = e_model::MODEL_20B; break; // granite
4340
+ case 88: model.type = e_model::MODEL_34B; break; // granite
4254
4341
  default: model.type = e_model::MODEL_UNKNOWN;
4255
4342
  }
4256
4343
  } break;
@@ -4384,6 +4471,26 @@ static void llm_load_hparams(
4384
4471
  model.type = e_model::MODEL_UNKNOWN;
4385
4472
  }
4386
4473
  } break;
4474
+ case LLM_ARCH_DEEPSEEK2:
4475
+ {
4476
+ bool is_lite = (hparams.n_layer == 27);
4477
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
4478
+ ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
4479
+ if (!is_lite) {
4480
+ ml.get_key(LLM_KV_ATTENTION_Q_LORA_RANK, hparams.n_lora_q);
4481
+ }
4482
+ ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK, hparams.n_lora_kv);
4483
+ ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
4484
+ ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
4485
+ ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
4486
+ ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul);
4487
+
4488
+ switch (hparams.n_layer) {
4489
+ case 27: model.type = e_model::MODEL_16B; break;
4490
+ case 60: model.type = e_model::MODEL_236B; break;
4491
+ default: model.type = e_model::MODEL_UNKNOWN;
4492
+ }
4493
+ } break;
4387
4494
  default: (void)0;
4388
4495
  }
4389
4496
 
@@ -4490,15 +4597,14 @@ static void llm_load_vocab(
4490
4597
  vocab.special_cls_id = 101;
4491
4598
  vocab.special_mask_id = 103;
4492
4599
  vocab.add_space_prefix = false;
4493
- } else {
4494
- if (tokenizer_model == "gpt2") {
4495
- vocab.type = LLAMA_VOCAB_TYPE_BPE;
4496
- } else {
4497
- LLAMA_LOG_WARN("%s: unknown tokenizer: '%s'", __func__, tokenizer_model.c_str());
4498
- LLAMA_LOG_WARN("%s: using default tokenizer: 'llama'", __func__);
4499
- vocab.type = LLAMA_VOCAB_TYPE_SPM;
4500
- return;
4600
+ } else if (tokenizer_model == "gpt2") {
4601
+ vocab.type = LLAMA_VOCAB_TYPE_BPE;
4602
+
4603
+ const int add_space_prefix_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_ADD_PREFIX).c_str());
4604
+ if (add_space_prefix_keyidx != -1) {
4605
+ vocab.add_space_prefix = gguf_get_val_bool(ctx, add_space_prefix_keyidx);
4501
4606
  }
4607
+
4502
4608
  // read bpe merges and populate bpe ranks
4503
4609
  const int merges_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_MERGES).c_str());
4504
4610
  if (merges_keyidx == -1) {
@@ -4532,6 +4638,8 @@ static void llm_load_vocab(
4532
4638
  vocab.special_pad_id = -1;
4533
4639
  vocab.special_cls_id = -1;
4534
4640
  vocab.special_mask_id = -1;
4641
+ } else {
4642
+ throw std::runtime_error(format("unknown tokenizer: '%s'", tokenizer_model.c_str()));
4535
4643
  }
4536
4644
 
4537
4645
  // for now, only BPE models have pre-tokenizers
@@ -4593,6 +4701,9 @@ static void llm_load_vocab(
4593
4701
  } else if (
4594
4702
  tokenizer_pre == "dbrx") {
4595
4703
  vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DBRX;
4704
+ } else if (
4705
+ tokenizer_pre == "smaug-bpe") {
4706
+ vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_SMAUG;
4596
4707
  } else {
4597
4708
  throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
4598
4709
  }
@@ -4631,7 +4742,20 @@ static void llm_load_vocab(
4631
4742
  auto & token_data = vocab.id_to_token[i];
4632
4743
  token_data.text = std::move(word);
4633
4744
  token_data.score = scores ? scores[i] : 0.0f;
4634
- token_data.type = toktypes ? (llama_token_type) toktypes[i] : LLAMA_TOKEN_TYPE_NORMAL;
4745
+ token_data.attr = LLAMA_TOKEN_ATTR_NORMAL;
4746
+
4747
+ if (toktypes) { //TODO: remove, required until per token attributes are available from GGUF file
4748
+ switch(toktypes[i]) {
4749
+ case LLAMA_TOKEN_TYPE_UNKNOWN: token_data.attr = LLAMA_TOKEN_ATTR_UNKNOWN; break;
4750
+ case LLAMA_TOKEN_TYPE_UNUSED: token_data.attr = LLAMA_TOKEN_ATTR_UNUSED; break;
4751
+ case LLAMA_TOKEN_TYPE_NORMAL: token_data.attr = LLAMA_TOKEN_ATTR_NORMAL; break;
4752
+ case LLAMA_TOKEN_TYPE_CONTROL: token_data.attr = LLAMA_TOKEN_ATTR_CONTROL; break;
4753
+ case LLAMA_TOKEN_TYPE_USER_DEFINED: token_data.attr = LLAMA_TOKEN_ATTR_USER_DEFINED; break;
4754
+ case LLAMA_TOKEN_TYPE_BYTE: token_data.attr = LLAMA_TOKEN_ATTR_BYTE; break;
4755
+ case LLAMA_TOKEN_TYPE_UNDEFINED: token_data.attr = LLAMA_TOKEN_ATTR_UNDEFINED; break;
4756
+ default: token_data.attr = LLAMA_TOKEN_ATTR_UNDEFINED; break;
4757
+ }
4758
+ }
4635
4759
  }
4636
4760
  GGML_ASSERT(vocab.id_to_token.size() == vocab.token_to_id.size());
4637
4761
 
@@ -4721,96 +4845,88 @@ static void llm_load_vocab(
4721
4845
 
4722
4846
  // build special tokens cache
4723
4847
  {
4724
- // TODO: It is unclear (to me) at this point, whether special tokes are guaranteed to be of a deterministic type,
4725
- // and will always be correctly labeled in 'added_tokens.json' etc.
4726
- // The assumption is, since special tokens aren't meant to be exposed to end user, they are designed
4727
- // to be unmatchable by the tokenizer, therefore tokens from the vocab, which are unmatchable by the tokenizer
4728
- // are special tokens.
4729
- // From testing, this appears to correlate 1:1 with special tokens.
4730
- //
4848
+ for (llama_vocab::id id = 0; id < (llama_vocab::id)n_vocab; ++id) {
4849
+ if (!(vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_NORMAL)) {
4850
+ vocab.cache_special_tokens.push_back(id);
4851
+ }
4852
+ }
4731
4853
 
4732
- // Counting special tokens and verifying in only one direction
4733
- // is sufficient to detect difference in those two sets.
4734
- //
4735
- uint32_t special_tokens_count_by_type = 0;
4736
- uint32_t special_tokens_count_from_verification = 0;
4854
+ std::sort( vocab.cache_special_tokens.begin(), vocab.cache_special_tokens.end(),
4855
+ [&] (const llama_vocab::id a, const llama_vocab::id b) {
4856
+ return vocab.id_to_token[a].text.size() > vocab.id_to_token[b].text.size();
4857
+ }
4858
+ );
4737
4859
 
4738
- bool special_tokens_definition_mismatch = false;
4860
+ LLAMA_LOG_INFO("%s: special tokens cache size = %u\n", __func__, (uint32_t)vocab.cache_special_tokens.size());
4861
+ }
4739
4862
 
4740
- for (const auto & t : vocab.token_to_id) {
4741
- const auto & token = t.first;
4742
- const auto & id = t.second;
4863
+ // build token to piece cache
4864
+ {
4865
+ size_t size_cache = 0;
4743
4866
 
4744
- // Count all non-normal tokens in the vocab while iterating
4745
- if (vocab.id_to_token[id].type != LLAMA_TOKEN_TYPE_NORMAL) {
4746
- special_tokens_count_by_type++;
4747
- }
4867
+ std::vector<llama_vocab::token> cache_token_to_piece(n_vocab);
4748
4868
 
4749
- // Skip single character tokens
4750
- if (token.length() > 1) {
4751
- bool is_tokenizable = false;
4869
+ for (uint32_t id = 0; id < n_vocab; ++id) {
4870
+ cache_token_to_piece[id] = llama_token_to_piece(&model, id, true);
4752
4871
 
4753
- // Split token string representation in two, in all possible ways
4754
- // and check if both halves can be matched to a valid token
4755
- for (unsigned i = 1; i < token.length();) {
4756
- const auto left = token.substr(0, i);
4757
- const auto right = token.substr(i);
4872
+ size_cache += cache_token_to_piece[id].size();
4873
+ }
4758
4874
 
4759
- // check if we didnt partition in the middle of a utf sequence
4760
- auto utf = utf8_len(left.at(left.length() - 1));
4875
+ std::swap(vocab.cache_token_to_piece, cache_token_to_piece);
4761
4876
 
4762
- if (utf == 1) {
4763
- if (vocab.token_to_id.find(left) != vocab.token_to_id.end() &&
4764
- vocab.token_to_id.find(right) != vocab.token_to_id.end() ) {
4765
- is_tokenizable = true;
4766
- break;
4767
- }
4768
- i++;
4769
- } else {
4770
- // skip over the rest of multibyte utf sequence
4771
- i += utf - 1;
4772
- }
4877
+ LLAMA_LOG_INFO("%s: token to piece cache size = %.4f MB\n", __func__, size_cache / 1024.0 / 1024.0);
4878
+ }
4879
+
4880
+ // Handle per token attributes
4881
+ //NOTE: Each model customizes per token attributes.
4882
+ //NOTE: Per token attributes are missing from the GGUF file.
4883
+ //TODO: Extract attributes from GGUF file.
4884
+ {
4885
+ auto _contains_any = [] (const std::string &str, const std::vector<std::string> &substrs) -> bool {
4886
+ for (auto substr : substrs) {
4887
+ if (str.find(substr) < std::string::npos) {
4888
+ return true;
4773
4889
  }
4890
+ }
4891
+ return false;
4892
+ };
4774
4893
 
4775
- if (!is_tokenizable) {
4776
- // Some tokens are multibyte, but they are utf sequences with equivalent text length of 1
4777
- // it's faster to re-filter them here, since there are way less candidates now
4894
+ auto _set_tokenid_attr = [&] (const llama_vocab::id id, llama_token_attr attr, bool value) {
4895
+ uint32_t current = vocab.id_to_token.at(id).attr;
4896
+ current = value ? (current | attr) : (current & ~attr);
4897
+ vocab.id_to_token[id].attr = (llama_token_attr) current;
4898
+ };
4778
4899
 
4779
- // Calculate a total "utf" length of a token string representation
4780
- size_t utf8_str_len = 0;
4781
- for (unsigned i = 0; i < token.length();) {
4782
- utf8_str_len++;
4783
- i += utf8_len(token.at(i));
4784
- }
4900
+ auto _set_token_attr = [&] (const std::string & token, llama_token_attr attr, bool value) {
4901
+ _set_tokenid_attr(vocab.token_to_id.at(token), attr, value);
4902
+ };
4785
4903
 
4786
- // And skip the ones which are one character
4787
- if (utf8_str_len > 1) {
4788
- // At this point what we have left are special tokens only
4789
- vocab.special_tokens_cache[token] = id;
4904
+ std::string model_name;
4905
+ std::string tokenizer_pre;
4790
4906
 
4791
- // Count manually found special tokens
4792
- special_tokens_count_from_verification++;
4907
+ ml.get_key(LLM_KV_GENERAL_NAME, model_name, false);
4908
+ ml.get_key(LLM_KV_TOKENIZER_PRE, tokenizer_pre, false);
4793
4909
 
4794
- // If this manually found special token is not marked as such, flag a mismatch
4795
- if (vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_NORMAL) {
4796
- special_tokens_definition_mismatch = true;
4797
- }
4798
- }
4799
- }
4910
+ // model name to lowercase
4911
+ std::transform(model_name.begin(), model_name.end(), model_name.begin(),
4912
+ [] (const std::string::value_type x) {
4913
+ return std::tolower(x);
4800
4914
  }
4801
- }
4915
+ );
4802
4916
 
4803
- if (special_tokens_definition_mismatch || special_tokens_count_from_verification != special_tokens_count_by_type) {
4804
- LLAMA_LOG_WARN("%s: mismatch in special tokens definition ( %u/%zu vs %u/%zu ).\n",
4805
- __func__,
4806
- special_tokens_count_from_verification, vocab.id_to_token.size(),
4807
- special_tokens_count_by_type, vocab.id_to_token.size()
4808
- );
4809
- } else {
4810
- LLAMA_LOG_INFO("%s: special tokens definition check successful ( %u/%zu ).\n",
4811
- __func__,
4812
- special_tokens_count_from_verification, vocab.id_to_token.size()
4813
- );
4917
+ // set attributes by model/tokenizer name
4918
+ if (_contains_any(tokenizer_pre, {"jina-v2-es", "jina-v2-de"})) {
4919
+ _set_token_attr("<mask>", LLAMA_TOKEN_ATTR_LSTRIP, true);
4920
+ } else if (_contains_any(model_name, {"phi-3", "phi3"})) {
4921
+ for (auto id : vocab.cache_special_tokens) {
4922
+ _set_tokenid_attr(id, LLAMA_TOKEN_ATTR_RSTRIP, true);
4923
+ }
4924
+ for (auto token : {"</s>"}) {
4925
+ _set_token_attr(token, LLAMA_TOKEN_ATTR_RSTRIP, true);
4926
+ }
4927
+ for (auto token : {"<unk>", "<s>", "<|endoftext|>"}) {
4928
+ _set_token_attr(token, LLAMA_TOKEN_ATTR_RSTRIP, false);
4929
+ }
4814
4930
  }
4815
4931
  }
4816
4932
  }
@@ -4852,7 +4968,7 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
4852
4968
  LLAMA_LOG_INFO("%s: rope scaling = %s\n", __func__, rope_scaling_type);
4853
4969
  LLAMA_LOG_INFO("%s: freq_base_train = %.1f\n", __func__, hparams.rope_freq_base_train);
4854
4970
  LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train);
4855
- LLAMA_LOG_INFO("%s: n_yarn_orig_ctx = %u\n", __func__, hparams.n_yarn_orig_ctx);
4971
+ LLAMA_LOG_INFO("%s: n_ctx_orig_yarn = %u\n", __func__, hparams.n_ctx_orig_yarn);
4856
4972
  LLAMA_LOG_INFO("%s: rope_finetuned = %s\n", __func__, hparams.rope_finetuned ? "yes" : "unknown");
4857
4973
  LLAMA_LOG_INFO("%s: ssm_d_conv = %u\n", __func__, hparams.ssm_d_conv);
4858
4974
  LLAMA_LOG_INFO("%s: ssm_d_inner = %u\n", __func__, hparams.ssm_d_inner);
@@ -4892,6 +5008,16 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
4892
5008
  if (vocab.special_suffix_id != -1) { LLAMA_LOG_INFO( "%s: SUF token = %d '%s'\n", __func__, vocab.special_suffix_id, vocab.id_to_token[vocab.special_suffix_id].text.c_str() ); }
4893
5009
  if (vocab.special_middle_id != -1) { LLAMA_LOG_INFO( "%s: MID token = %d '%s'\n", __func__, vocab.special_middle_id, vocab.id_to_token[vocab.special_middle_id].text.c_str() ); }
4894
5010
  if (vocab.special_eot_id != -1) { LLAMA_LOG_INFO( "%s: EOT token = %d '%s'\n", __func__, vocab.special_eot_id, vocab.id_to_token[vocab.special_eot_id].text.c_str() ); }
5011
+
5012
+ if (model.arch == LLM_ARCH_DEEPSEEK2) {
5013
+ LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
5014
+ LLAMA_LOG_INFO("%s: n_lora_q = %d\n", __func__, hparams.n_lora_q);
5015
+ LLAMA_LOG_INFO("%s: n_lora_kv = %d\n", __func__, hparams.n_lora_kv);
5016
+ LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
5017
+ LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
5018
+ LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
5019
+ LLAMA_LOG_INFO("%s: rope_yarn_log_mul = %.4f\n", __func__, hparams.rope_yarn_log_mul);
5020
+ }
4895
5021
  }
4896
5022
 
4897
5023
  // Returns false if cancelled by progress_callback
@@ -5048,8 +5174,6 @@ static bool llm_load_tensors(
5048
5174
  throw std::runtime_error("model has expert layers but no expert layers are used");
5049
5175
  }
5050
5176
 
5051
- GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);
5052
-
5053
5177
  ggml_context * ctx_input = ctx_map.at(model.buft_input.buft);
5054
5178
  ggml_context * ctx_output = ctx_map.at(model.buft_output.buft);
5055
5179
  ggml_context * ctx_output_split = ctx_map.at(model.buft_output.buft_matrix);
@@ -5069,12 +5193,10 @@ static bool llm_load_tensors(
5069
5193
  // output
5070
5194
  {
5071
5195
  model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
5072
- if (model.arch != LLM_ARCH_MINICPM){
5073
- model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
5074
- // if output is NULL, init from the input tok embed
5075
- if (model.output == NULL) {
5076
- model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
5077
- }
5196
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
5197
+ // if output is NULL, init from the input tok embed
5198
+ if (model.output == NULL) {
5199
+ model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
5078
5200
  }
5079
5201
  }
5080
5202
 
@@ -5103,6 +5225,11 @@ static bool llm_load_tensors(
5103
5225
  layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
5104
5226
  layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
5105
5227
  layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
5228
+
5229
+ // optional MLP bias
5230
+ layer.ffn_gate_b = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
5231
+ layer.ffn_down_b = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
5232
+ layer.ffn_up_b = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
5106
5233
  } else {
5107
5234
  layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert});
5108
5235
 
@@ -6210,6 +6337,70 @@ static bool llm_load_tensors(
6210
6337
  layer.ffn_up_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert});
6211
6338
  }
6212
6339
  } break;
6340
+ case LLM_ARCH_DEEPSEEK2:
6341
+ {
6342
+ bool is_lite = (hparams.n_layer == 27);
6343
+
6344
+ const uint32_t n_embd_head_qk_rope = hparams.n_rot;
6345
+ const uint32_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
6346
+ const uint32_t q_lora_rank = hparams.n_lora_q;
6347
+ const uint32_t kv_lora_rank = hparams.n_lora_kv;
6348
+ const uint32_t n_ff_exp = hparams.n_ff_exp;
6349
+
6350
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
6351
+
6352
+ // output
6353
+ {
6354
+ model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
6355
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
6356
+ }
6357
+
6358
+ for (int i = 0; i < n_layer; ++i) {
6359
+ ggml_context * ctx_layer = ctx_for_layer(i);
6360
+ ggml_context * ctx_split = ctx_for_layer_split(i);
6361
+
6362
+ auto & layer = model.layers[i];
6363
+
6364
+ layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
6365
+ if (!is_lite) {
6366
+ layer.attn_q_a_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_A_NORM, "weight", i), {q_lora_rank});
6367
+ }
6368
+ layer.attn_kv_a_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_KV_A_NORM, "weight", i), {kv_lora_rank});
6369
+
6370
+ if (!is_lite) {
6371
+ layer.wq_a = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q_A, "weight", i), {n_embd, q_lora_rank});
6372
+ layer.wq_b = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q_B, "weight", i), {q_lora_rank, hparams.n_head * hparams.n_embd_head_k});
6373
+ } else {
6374
+ layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_k_gqa});
6375
+ }
6376
+ layer.wkv_a_mqa = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + n_embd_head_qk_rope});
6377
+ layer.wkv_b = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_KV_B, "weight", i), {kv_lora_rank, hparams.n_head * (n_embd_head_qk_nope + hparams.n_embd_head_v)});
6378
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {hparams.n_head * hparams.n_embd_head_v, n_embd});
6379
+
6380
+ layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
6381
+
6382
+ if ((uint32_t) i < hparams.n_layer_dense_lead) {
6383
+ layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
6384
+ layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
6385
+ layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
6386
+ } else {
6387
+ layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert});
6388
+
6389
+ GGML_ASSERT(hparams.n_expert > 0);
6390
+ GGML_ASSERT(hparams.n_expert_used > 0);
6391
+
6392
+ // MoE branch
6393
+ layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert});
6394
+ layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert});
6395
+ layer.ffn_up_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert});
6396
+
6397
+ // Shared expert branch
6398
+ layer.ffn_gate_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_exp * hparams.n_expert_shared});
6399
+ layer.ffn_down_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff_exp * hparams.n_expert_shared, n_embd});
6400
+ layer.ffn_up_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_exp * hparams.n_expert_shared});
6401
+ }
6402
+ }
6403
+ } break;
6213
6404
  default:
6214
6405
  throw std::runtime_error("unknown architecture");
6215
6406
  }
@@ -6664,6 +6855,8 @@ static struct ggml_tensor * llm_build_moe_ffn(
6664
6855
  int64_t n_expert_used,
6665
6856
  llm_ffn_op_type type_op,
6666
6857
  bool norm_w,
6858
+ bool scale_w,
6859
+ float w_scale,
6667
6860
  const llm_build_cb & cb,
6668
6861
  int il) {
6669
6862
  int64_t n_embd = cur->ne[0];
@@ -6695,6 +6888,10 @@ static struct ggml_tensor * llm_build_moe_ffn(
6695
6888
 
6696
6889
  weights = ggml_reshape_3d(ctx, weights, 1, n_expert_used, n_tokens);
6697
6890
  }
6891
+ if (scale_w) {
6892
+ weights = ggml_scale(ctx, weights, w_scale);
6893
+ cb(weights, "ffn_moe_weights_scaled", il);
6894
+ }
6698
6895
 
6699
6896
  cur = ggml_reshape_3d(ctx, cur, n_embd, 1, n_tokens);
6700
6897
  ggml_tensor * up = ggml_mul_mat_id(ctx, up_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
@@ -6937,7 +7134,7 @@ struct llm_build_context {
6937
7134
  const int32_t n_kv; // size of KV cache to consider (n_kv <= kv_self.size)
6938
7135
  const int32_t n_outputs;
6939
7136
  const int32_t kv_head; // index of where we store new KV data in the cache
6940
- const int32_t n_orig_ctx;
7137
+ const int32_t n_ctx_orig;
6941
7138
 
6942
7139
  const bool flash_attn;
6943
7140
 
@@ -6986,7 +7183,7 @@ struct llm_build_context {
6986
7183
  n_kv (worst_case ? kv_self.size : kv_self.n),
6987
7184
  n_outputs (worst_case ? n_tokens : lctx.n_outputs),
6988
7185
  kv_head (worst_case ? (kv_self.recurrent ? 0 : kv_self.size - n_tokens) : kv_self.head),
6989
- n_orig_ctx (cparams.n_yarn_orig_ctx),
7186
+ n_ctx_orig (cparams.n_ctx_orig_yarn),
6990
7187
  flash_attn (cparams.flash_attn),
6991
7188
  pooling_type (cparams.pooling_type),
6992
7189
  rope_type (hparams.rope_type),
@@ -7044,7 +7241,7 @@ struct llm_build_context {
7044
7241
  ggml_row_size(kv_self.k_l[il]->type, n_embd_head_k),
7045
7242
  ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa),
7046
7243
  0),
7047
- lctx.inp_K_shift, rope_factors, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
7244
+ lctx.inp_K_shift, rope_factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
7048
7245
  ext_factor, attn_factor, beta_fast, beta_slow);
7049
7246
 
7050
7247
  cb(tmp, "K_shifted", il);
@@ -7153,7 +7350,7 @@ struct llm_build_context {
7153
7350
  // choose long/short freq factors based on the context size
7154
7351
  const auto n_ctx_pre_seq = cparams.n_ctx / cparams.n_seq_max;
7155
7352
 
7156
- if (n_ctx_pre_seq > hparams.n_yarn_orig_ctx) {
7353
+ if (n_ctx_pre_seq > hparams.n_ctx_orig_yarn) {
7157
7354
  return model.layers[il].rope_long;
7158
7355
  }
7159
7356
 
@@ -7269,14 +7466,14 @@ struct llm_build_context {
7269
7466
 
7270
7467
  Qcur = ggml_rope_ext(
7271
7468
  ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
7272
- n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
7469
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
7273
7470
  ext_factor, attn_factor, beta_fast, beta_slow
7274
7471
  );
7275
7472
  cb(Qcur, "Qcur", il);
7276
7473
 
7277
7474
  Kcur = ggml_rope_ext(
7278
7475
  ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
7279
- n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
7476
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
7280
7477
  ext_factor, attn_factor, beta_fast, beta_slow
7281
7478
  );
7282
7479
  cb(Kcur, "Kcur", il);
@@ -7305,9 +7502,9 @@ struct llm_build_context {
7305
7502
  cb(cur, "ffn_norm", il);
7306
7503
 
7307
7504
  cur = llm_build_ffn(ctx0, cur,
7308
- model.layers[il].ffn_up, NULL,
7309
- model.layers[il].ffn_gate, NULL,
7310
- model.layers[il].ffn_down, NULL,
7505
+ model.layers[il].ffn_up, model.layers[il].ffn_up_b,
7506
+ model.layers[il].ffn_gate, model.layers[il].ffn_gate_b,
7507
+ model.layers[il].ffn_down, model.layers[il].ffn_down_b,
7311
7508
  NULL,
7312
7509
  LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
7313
7510
  cb(cur, "ffn_out", il);
@@ -7325,6 +7522,7 @@ struct llm_build_context {
7325
7522
  model.layers[il].ffn_down_exps,
7326
7523
  n_expert, n_expert_used,
7327
7524
  LLM_FFN_SILU, true,
7525
+ false, 0.0,
7328
7526
  cb, il);
7329
7527
  cb(cur, "ffn_moe_out", il);
7330
7528
  }
@@ -7399,12 +7597,12 @@ struct llm_build_context {
7399
7597
  case MODEL_7B:
7400
7598
  Qcur = ggml_rope_ext(
7401
7599
  ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
7402
- n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
7600
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
7403
7601
  ext_factor, attn_factor, beta_fast, beta_slow
7404
7602
  );
7405
7603
  Kcur = ggml_rope_ext(
7406
7604
  ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
7407
- n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
7605
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
7408
7606
  ext_factor, attn_factor, beta_fast, beta_slow
7409
7607
  );
7410
7608
  break;
@@ -7511,14 +7709,14 @@ struct llm_build_context {
7511
7709
 
7512
7710
  Qcur = ggml_rope_ext(
7513
7711
  ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
7514
- n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
7712
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
7515
7713
  ext_factor, attn_factor, beta_fast, beta_slow
7516
7714
  );
7517
7715
  cb(Qcur, "Qcur", il);
7518
7716
 
7519
7717
  Kcur = ggml_rope_ext(
7520
7718
  ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
7521
- n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
7719
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
7522
7720
  ext_factor, attn_factor, beta_fast, beta_slow
7523
7721
  );
7524
7722
  cb(Kcur, "Kcur", il);
@@ -7631,13 +7829,13 @@ struct llm_build_context {
7631
7829
 
7632
7830
  // using mode = 2 for neox mode
7633
7831
  Qcur = ggml_rope_ext(
7634
- ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx,
7832
+ ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig,
7635
7833
  freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
7636
7834
  );
7637
7835
  cb(Qcur, "Qcur", il);
7638
7836
 
7639
7837
  Kcur = ggml_rope_ext(
7640
- ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx,
7838
+ ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig,
7641
7839
  freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
7642
7840
  );
7643
7841
  cb(Kcur, "Kcur", il);
@@ -7755,14 +7953,14 @@ struct llm_build_context {
7755
7953
 
7756
7954
  Qcur = ggml_rope_ext(
7757
7955
  ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
7758
- n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
7956
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
7759
7957
  ext_factor, attn_factor, beta_fast, beta_slow
7760
7958
  );
7761
7959
  cb(Qcur, "Qcur", il);
7762
7960
 
7763
7961
  Kcur = ggml_rope_ext(
7764
7962
  ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
7765
- n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
7963
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
7766
7964
  ext_factor, attn_factor, beta_fast, beta_slow
7767
7965
  );
7768
7966
  cb(Kcur, "Kcur", il);
@@ -7806,6 +8004,7 @@ struct llm_build_context {
7806
8004
  model.layers[il].ffn_down_exps,
7807
8005
  n_expert, n_expert_used,
7808
8006
  LLM_FFN_GELU, true,
8007
+ false, 0.0,
7809
8008
  cb, il);
7810
8009
  cb(cur, "ffn_moe_out", il);
7811
8010
 
@@ -7907,14 +8106,14 @@ struct llm_build_context {
7907
8106
 
7908
8107
  Qcur = ggml_rope_ext(
7909
8108
  ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
7910
- n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
8109
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
7911
8110
  ext_factor, attn_factor, beta_fast, beta_slow
7912
8111
  );
7913
8112
  cb(Qcur, "Qcur", il);
7914
8113
 
7915
8114
  Kcur = ggml_rope_ext(
7916
8115
  ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
7917
- n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
8116
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
7918
8117
  ext_factor, attn_factor, beta_fast, beta_slow
7919
8118
  );
7920
8119
  cb(Kcur, "Kcur", il);
@@ -7949,6 +8148,7 @@ struct llm_build_context {
7949
8148
  model.layers[il].ffn_down_exps,
7950
8149
  n_expert, n_expert_used,
7951
8150
  LLM_FFN_SILU, true,
8151
+ false, 0.0,
7952
8152
  cb, il);
7953
8153
  cb(cur, "ffn_moe_out", il);
7954
8154
 
@@ -8260,14 +8460,14 @@ struct llm_build_context {
8260
8460
 
8261
8461
  Qcur = ggml_rope_ext(
8262
8462
  ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
8263
- n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
8463
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
8264
8464
  ext_factor, attn_factor, beta_fast, beta_slow
8265
8465
  );
8266
8466
  cb(Qcur, "Qcur", il);
8267
8467
 
8268
8468
  Kcur = ggml_rope_ext(
8269
8469
  ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
8270
- n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
8470
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
8271
8471
  ext_factor, attn_factor, beta_fast, beta_slow
8272
8472
  );
8273
8473
  cb(Kcur, "Kcur", il);
@@ -8700,14 +8900,14 @@ struct llm_build_context {
8700
8900
 
8701
8901
  Qcur = ggml_rope_ext(
8702
8902
  ctx0, Qcur, inp_pos, nullptr,
8703
- n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
8903
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
8704
8904
  ext_factor, attn_factor, beta_fast, beta_slow
8705
8905
  );
8706
8906
  cb(Qcur, "Qcur", il);
8707
8907
 
8708
8908
  Kcur = ggml_rope_ext(
8709
8909
  ctx0, Kcur, inp_pos, nullptr,
8710
- n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
8910
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
8711
8911
  ext_factor, attn_factor, beta_fast, beta_slow
8712
8912
  );
8713
8913
  cb(Kcur, "Kcur", il);
@@ -8819,13 +9019,13 @@ struct llm_build_context {
8819
9019
 
8820
9020
  // using mode = 2 for neox mode
8821
9021
  Qcur = ggml_rope_ext(
8822
- ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx,
9022
+ ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig,
8823
9023
  freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
8824
9024
  );
8825
9025
  cb(Qcur, "Qcur", il);
8826
9026
 
8827
9027
  Kcur = ggml_rope_ext(
8828
- ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx,
9028
+ ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig,
8829
9029
  freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
8830
9030
  );
8831
9031
  cb(Kcur, "Kcur", il);
@@ -8931,14 +9131,14 @@ struct llm_build_context {
8931
9131
 
8932
9132
  Qcur = ggml_rope_ext(
8933
9133
  ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
8934
- n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
9134
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
8935
9135
  ext_factor, attn_factor, beta_fast, beta_slow
8936
9136
  );
8937
9137
  cb(Qcur, "Qcur", il);
8938
9138
 
8939
9139
  Kcur = ggml_rope_ext(
8940
9140
  ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
8941
- n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
9141
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
8942
9142
  ext_factor, attn_factor, beta_fast, beta_slow
8943
9143
  );
8944
9144
  cb(Kcur, "Kcur", il);
@@ -9045,14 +9245,14 @@ struct llm_build_context {
9045
9245
 
9046
9246
  Qcur = ggml_rope_ext(
9047
9247
  ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
9048
- n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
9248
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
9049
9249
  ext_factor, attn_factor, beta_fast, beta_slow
9050
9250
  );
9051
9251
  cb(Qcur, "Qcur", il);
9052
9252
 
9053
9253
  Kcur = ggml_rope_ext(
9054
9254
  ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
9055
- n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
9255
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
9056
9256
  ext_factor, attn_factor, beta_fast, beta_slow
9057
9257
  );
9058
9258
  cb(Kcur, "Kcur", il);
@@ -9087,6 +9287,7 @@ struct llm_build_context {
9087
9287
  model.layers[il].ffn_down_exps,
9088
9288
  n_expert, n_expert_used,
9089
9289
  LLM_FFN_SILU, false,
9290
+ false, 0.0,
9090
9291
  cb, il);
9091
9292
  cb(cur, "ffn_moe_out", il);
9092
9293
 
@@ -9196,7 +9397,7 @@ struct llm_build_context {
9196
9397
  Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
9197
9398
 
9198
9399
  Qcur = ggml_rope_ext(
9199
- ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx,
9400
+ ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig,
9200
9401
  freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
9201
9402
  );
9202
9403
  cb(Qcur, "Qcur", il);
@@ -9207,7 +9408,7 @@ struct llm_build_context {
9207
9408
  cb(Qcur, "Qcur", il);
9208
9409
 
9209
9410
  Kcur = ggml_rope_ext(
9210
- ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx,
9411
+ ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig,
9211
9412
  freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
9212
9413
  );
9213
9414
  cb(Kcur, "Kcur", il);
@@ -9318,7 +9519,7 @@ struct llm_build_context {
9318
9519
  Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
9319
9520
 
9320
9521
  Qcur = ggml_rope_ext(
9321
- ctx0, Qcur, inp_pos, rope_factors, n_rot, rope_type, 0, n_orig_ctx,
9522
+ ctx0, Qcur, inp_pos, rope_factors, n_rot, rope_type, n_ctx_orig,
9322
9523
  freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
9323
9524
  );
9324
9525
  cb(Qcur, "Qcur", il);
@@ -9327,7 +9528,7 @@ struct llm_build_context {
9327
9528
  cb(Qcur, "Qcur", il);
9328
9529
 
9329
9530
  Kcur = ggml_rope_ext(
9330
- ctx0, Kcur, inp_pos, rope_factors, n_rot, rope_type, 0, n_orig_ctx,
9531
+ ctx0, Kcur, inp_pos, rope_factors, n_rot, rope_type, n_ctx_orig,
9331
9532
  freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
9332
9533
  );
9333
9534
  cb(Kcur, "Kcur", il);
@@ -9435,13 +9636,13 @@ struct llm_build_context {
9435
9636
 
9436
9637
  Qcur = ggml_rope_ext(
9437
9638
  ctx0, ggml_reshape_3d(ctx0, Qcur, n_rot, n_head, n_tokens), inp_pos, nullptr,
9438
- n_embd_head, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
9639
+ n_embd_head, rope_type, n_ctx_orig, freq_base, freq_scale,
9439
9640
  ext_factor, attn_factor, beta_fast, beta_slow);
9440
9641
  cb(Qcur, "Qcur", il);
9441
9642
 
9442
9643
  Kcur = ggml_rope_ext(
9443
9644
  ctx0, ggml_reshape_3d(ctx0, Kcur, n_rot, n_head_kv, n_tokens), inp_pos, nullptr,
9444
- n_embd_head, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
9645
+ n_embd_head, rope_type, n_ctx_orig, freq_base, freq_scale,
9445
9646
  ext_factor, attn_factor, beta_fast, beta_slow);
9446
9647
  cb(Kcur, "Kcur", il);
9447
9648
 
@@ -9643,14 +9844,14 @@ struct llm_build_context {
9643
9844
 
9644
9845
  struct ggml_tensor * Qcur = ggml_rope_ext(
9645
9846
  ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
9646
- n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
9847
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
9647
9848
  ext_factor, attn_factor, beta_fast, beta_slow
9648
9849
  );
9649
9850
  cb(Qcur, "Qcur", il);
9650
9851
 
9651
9852
  struct ggml_tensor * Kcur = ggml_rope_ext(
9652
9853
  ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
9653
- n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
9854
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
9654
9855
  ext_factor, attn_factor, beta_fast, beta_slow
9655
9856
  );
9656
9857
  cb(Kcur, "Kcur", il);
@@ -9759,14 +9960,14 @@ struct llm_build_context {
9759
9960
 
9760
9961
  Qcur = ggml_rope_ext(
9761
9962
  ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
9762
- n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
9963
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
9763
9964
  ext_factor, attn_factor, beta_fast, beta_slow
9764
9965
  );
9765
9966
  cb(Qcur, "Qcur", il);
9766
9967
 
9767
9968
  Kcur = ggml_rope_ext(
9768
9969
  ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
9769
- n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
9970
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
9770
9971
  ext_factor, attn_factor, beta_fast, beta_slow
9771
9972
  );
9772
9973
  cb(Kcur, "Kcur", il);
@@ -9876,14 +10077,14 @@ struct llm_build_context {
9876
10077
 
9877
10078
  Qcur = ggml_rope_ext(
9878
10079
  ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
9879
- n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
10080
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
9880
10081
  ext_factor, attn_factor, beta_fast, beta_slow
9881
10082
  );
9882
10083
  cb(Qcur, "Qcur", il);
9883
10084
 
9884
10085
  Kcur = ggml_rope_ext(
9885
10086
  ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
9886
- n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
10087
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
9887
10088
  ext_factor, attn_factor, beta_fast, beta_slow
9888
10089
  );
9889
10090
  cb(Kcur, "Kcur", il);
@@ -10006,14 +10207,14 @@ struct llm_build_context {
10006
10207
 
10007
10208
  Qcur = ggml_rope_ext(
10008
10209
  ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
10009
- n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
10210
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
10010
10211
  ext_factor, attn_factor, beta_fast, beta_slow
10011
10212
  );
10012
10213
  cb(Qcur, "Qcur", il);
10013
10214
 
10014
10215
  Kcur = ggml_rope_ext(
10015
10216
  ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
10016
- n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
10217
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
10017
10218
  ext_factor, attn_factor, beta_fast, beta_slow
10018
10219
  );
10019
10220
  cb(Kcur, "Kcur", il);
@@ -10078,7 +10279,7 @@ struct llm_build_context {
10078
10279
  cb(cur, "lmhead_scaling", -1);
10079
10280
 
10080
10281
  // lm_head
10081
- cur = ggml_mul_mat(ctx0, model.tok_embd, cur);
10282
+ cur = ggml_mul_mat(ctx0, model.output, cur);
10082
10283
  cb(cur, "result_output", -1);
10083
10284
 
10084
10285
  ggml_build_forward_expand(gf, cur);
@@ -10126,7 +10327,7 @@ struct llm_build_context {
10126
10327
 
10127
10328
  Qcur = ggml_rope_ext(
10128
10329
  ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head_k, n_head, n_tokens), inp_pos, nullptr,
10129
- n_embd_head_k, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
10330
+ n_embd_head_k, rope_type, n_ctx_orig, freq_base, freq_scale,
10130
10331
  ext_factor, attn_factor, beta_fast, beta_slow);
10131
10332
  cb(Qcur, "Qcur", il);
10132
10333
 
@@ -10135,7 +10336,7 @@ struct llm_build_context {
10135
10336
 
10136
10337
  Kcur = ggml_rope_ext(
10137
10338
  ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head_k, n_head_kv, n_tokens), inp_pos, nullptr,
10138
- n_embd_head_k, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
10339
+ n_embd_head_k, rope_type, n_ctx_orig, freq_base, freq_scale,
10139
10340
  ext_factor, attn_factor, beta_fast, beta_slow);
10140
10341
  cb(Kcur, "Kcur", il);
10141
10342
 
@@ -10246,14 +10447,14 @@ struct llm_build_context {
10246
10447
 
10247
10448
  Qcur = ggml_rope_ext(
10248
10449
  ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
10249
- n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
10450
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
10250
10451
  ext_factor, attn_factor, beta_fast, beta_slow
10251
10452
  );
10252
10453
  cb(Qcur, "Qcur", il);
10253
10454
 
10254
10455
  Kcur = ggml_rope_ext(
10255
10456
  ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
10256
- n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
10457
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
10257
10458
  ext_factor, attn_factor, beta_fast, beta_slow
10258
10459
  );
10259
10460
  cb(Kcur, "Kcur", il);
@@ -10536,14 +10737,14 @@ struct llm_build_context {
10536
10737
 
10537
10738
  Qcur = ggml_rope_ext(
10538
10739
  ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
10539
- n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
10740
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
10540
10741
  ext_factor, attn_factor, beta_fast, beta_slow
10541
10742
  );
10542
10743
  cb(Qcur, "Qcur", il);
10543
10744
 
10544
10745
  Kcur = ggml_rope_ext(
10545
10746
  ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
10546
- n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
10747
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
10547
10748
  ext_factor, attn_factor, beta_fast, beta_slow
10548
10749
  );
10549
10750
  cb(Kcur, "Kcur", il);
@@ -10667,14 +10868,14 @@ struct llm_build_context {
10667
10868
 
10668
10869
  Qcur = ggml_rope_ext(
10669
10870
  ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
10670
- n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
10871
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
10671
10872
  ext_factor, attn_factor, beta_fast, beta_slow
10672
10873
  );
10673
10874
  cb(Qcur, "Qcur", il);
10674
10875
 
10675
10876
  Kcur = ggml_rope_ext(
10676
10877
  ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
10677
- n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
10878
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
10678
10879
  ext_factor, attn_factor, beta_fast, beta_slow
10679
10880
  );
10680
10881
  cb(Kcur, "Kcur", il);
@@ -10781,14 +10982,14 @@ struct llm_build_context {
10781
10982
 
10782
10983
  Qcur = ggml_rope_ext(
10783
10984
  ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
10784
- n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
10985
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
10785
10986
  ext_factor, attn_factor, beta_fast, beta_slow
10786
10987
  );
10787
10988
  cb(Qcur, "Qcur", il);
10788
10989
 
10789
10990
  Kcur = ggml_rope_ext(
10790
10991
  ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
10791
- n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
10992
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
10792
10993
  ext_factor, attn_factor, beta_fast, beta_slow
10793
10994
  );
10794
10995
  cb(Kcur, "Kcur", il);
@@ -10916,14 +11117,14 @@ struct llm_build_context {
10916
11117
 
10917
11118
  Qcur = ggml_rope_ext(
10918
11119
  ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
10919
- n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
11120
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
10920
11121
  ext_factor, attn_factor, beta_fast, beta_slow
10921
11122
  );
10922
11123
  cb(Qcur, "Qcur", il);
10923
11124
 
10924
11125
  Kcur = ggml_rope_ext(
10925
11126
  ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
10926
- n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
11127
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
10927
11128
  ext_factor, attn_factor, beta_fast, beta_slow
10928
11129
  );
10929
11130
  cb(Kcur, "Kcur", il);
@@ -10974,6 +11175,7 @@ struct llm_build_context {
10974
11175
  model.layers[il].ffn_down_exps,
10975
11176
  n_expert, n_expert_used,
10976
11177
  LLM_FFN_SILU, true,
11178
+ false, 0.0,
10977
11179
  cb, il);
10978
11180
  cb(cur, "ffn_moe_out", il);
10979
11181
 
@@ -11005,6 +11207,239 @@ struct llm_build_context {
11005
11207
 
11006
11208
  return gf;
11007
11209
  }
11210
+
11211
+ struct ggml_cgraph * build_deepseek2() {
11212
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
11213
+
11214
+ // mutable variable, needed during the last layer of the computation to skip unused tokens
11215
+ int32_t n_tokens = this->n_tokens;
11216
+
11217
+ bool is_lite = (hparams.n_layer == 27);
11218
+
11219
+ // We have to pre-scale kq_scale and attn_factor to make the YaRN RoPE work correctly.
11220
+ // See https://github.com/ggerganov/llama.cpp/discussions/7416 for detailed explanation.
11221
+ const float mscale = attn_factor * (1.0f + hparams.rope_yarn_log_mul * logf(1.0f / freq_scale));
11222
+ const float kq_scale = 1.0f*mscale*mscale/sqrtf(float(hparams.n_embd_head_k));
11223
+ const float attn_factor_scaled = 1.0f / (1.0f + 0.1f * logf(1.0f / freq_scale));
11224
+
11225
+ const uint32_t n_embd_head_qk_rope = hparams.n_rot;
11226
+ const uint32_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
11227
+ const uint32_t kv_lora_rank = hparams.n_lora_kv;
11228
+
11229
+ struct ggml_tensor * cur;
11230
+ struct ggml_tensor * inpL;
11231
+
11232
+ // {n_embd, n_tokens}
11233
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
11234
+
11235
+ // inp_pos - contains the positions
11236
+ struct ggml_tensor * inp_pos = build_inp_pos();
11237
+
11238
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
11239
+ struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
11240
+
11241
+ for (int il = 0; il < n_layer; ++il) {
11242
+ struct ggml_tensor * inpSA = inpL;
11243
+
11244
+ // norm
11245
+ cur = llm_build_norm(ctx0, inpL, hparams,
11246
+ model.layers[il].attn_norm, NULL,
11247
+ LLM_NORM_RMS, cb, il);
11248
+ cb(cur, "attn_norm", il);
11249
+
11250
+ // self_attention
11251
+ {
11252
+ struct ggml_tensor * q = NULL;
11253
+ if (!is_lite) {
11254
+ // {n_embd, q_lora_rank} * {n_embd, n_tokens} -> {q_lora_rank, n_tokens}
11255
+ q = ggml_mul_mat(ctx0, model.layers[il].wq_a, cur);
11256
+ cb(q, "q", il);
11257
+
11258
+ q = llm_build_norm(ctx0, q, hparams,
11259
+ model.layers[il].attn_q_a_norm, NULL,
11260
+ LLM_NORM_RMS, cb, il);
11261
+ cb(q, "q", il);
11262
+
11263
+ // {q_lora_rank, n_head * hparams.n_embd_head_k} * {q_lora_rank, n_tokens} -> {n_head * hparams.n_embd_head_k, n_tokens}
11264
+ q = ggml_mul_mat(ctx0, model.layers[il].wq_b, q);
11265
+ cb(q, "q", il);
11266
+ } else {
11267
+ q = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
11268
+ cb(q, "q", il);
11269
+ }
11270
+
11271
+ // split into {n_head * n_embd_head_qk_nope, n_tokens}
11272
+ struct ggml_tensor * q_nope = ggml_view_3d(ctx0, q, n_embd_head_qk_nope, n_head, n_tokens,
11273
+ ggml_row_size(q->type, hparams.n_embd_head_k),
11274
+ ggml_row_size(q->type, hparams.n_embd_head_k * n_head),
11275
+ 0);
11276
+ cb(q_nope, "q_nope", il);
11277
+
11278
+ // and {n_head * n_embd_head_qk_rope, n_tokens}
11279
+ struct ggml_tensor * q_pe = ggml_view_3d(ctx0, q, n_embd_head_qk_rope, n_head, n_tokens,
11280
+ ggml_row_size(q->type, hparams.n_embd_head_k),
11281
+ ggml_row_size(q->type, hparams.n_embd_head_k * n_head),
11282
+ ggml_row_size(q->type, n_embd_head_qk_nope));
11283
+ cb(q_pe, "q_pe", il);
11284
+
11285
+ // {n_embd, kv_lora_rank + n_embd_head_qk_rope} * {n_embd, n_tokens} -> {kv_lora_rank + n_embd_head_qk_rope, n_tokens}
11286
+ struct ggml_tensor * kv_pe_compresseed = ggml_mul_mat(ctx0, model.layers[il].wkv_a_mqa, cur);
11287
+ cb(kv_pe_compresseed, "kv_pe_compresseed", il);
11288
+
11289
+ // split into {kv_lora_rank, n_tokens}
11290
+ struct ggml_tensor * kv_compressed = ggml_view_2d(ctx0, kv_pe_compresseed, kv_lora_rank, n_tokens,
11291
+ kv_pe_compresseed->nb[1],
11292
+ 0);
11293
+ cb(kv_compressed, "kv_compressed", il);
11294
+
11295
+ // and {n_embd_head_qk_rope, n_tokens}
11296
+ struct ggml_tensor * k_pe = ggml_view_3d(ctx0, kv_pe_compresseed, n_embd_head_qk_rope, 1, n_tokens,
11297
+ kv_pe_compresseed->nb[1],
11298
+ kv_pe_compresseed->nb[1],
11299
+ ggml_row_size(kv_pe_compresseed->type, kv_lora_rank));
11300
+ cb(k_pe, "k_pe", il);
11301
+
11302
+ kv_compressed = ggml_cont(ctx0, kv_compressed); // TODO: the CUDA backend does not support non-contiguous norm
11303
+ kv_compressed = llm_build_norm(ctx0, kv_compressed, hparams,
11304
+ model.layers[il].attn_kv_a_norm, NULL,
11305
+ LLM_NORM_RMS, cb, il);
11306
+ cb(kv_compressed, "kv_compressed", il);
11307
+
11308
+ // {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)} * {kv_lora_rank, n_tokens} -> {n_head * (n_embd_head_qk_nope + n_embd_head_v), n_tokens}
11309
+ struct ggml_tensor * kv = ggml_mul_mat(ctx0, model.layers[il].wkv_b, kv_compressed);
11310
+ cb(kv, "kv", il);
11311
+
11312
+ // split into {n_head * n_embd_head_qk_nope, n_tokens}
11313
+ struct ggml_tensor * k_nope = ggml_view_3d(ctx0, kv, n_embd_head_qk_nope, n_head, n_tokens,
11314
+ ggml_row_size(kv->type, n_embd_head_qk_nope + hparams.n_embd_head_v),
11315
+ ggml_row_size(kv->type, n_head * (n_embd_head_qk_nope + hparams.n_embd_head_v)),
11316
+ 0);
11317
+ cb(k_nope, "k_nope", il);
11318
+
11319
+ // and {n_head * n_embd_head_v, n_tokens}
11320
+ struct ggml_tensor * v_states = ggml_view_3d(ctx0, kv, hparams.n_embd_head_v, n_head, n_tokens,
11321
+ ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)),
11322
+ ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)*n_head),
11323
+ ggml_row_size(kv->type, (n_embd_head_qk_nope)));
11324
+ cb(v_states, "v_states", il);
11325
+
11326
+ v_states = ggml_cont(ctx0, v_states);
11327
+ cb(v_states, "v_states", il);
11328
+
11329
+ v_states = ggml_view_2d(ctx0, v_states, hparams.n_embd_head_v * n_head, n_tokens,
11330
+ ggml_row_size(kv->type, hparams.n_embd_head_v * n_head),
11331
+ 0);
11332
+ cb(v_states, "v_states", il);
11333
+
11334
+ q_pe = ggml_cont(ctx0, q_pe); // TODO: the CUDA backend does not support non-contiguous RoPE
11335
+ q_pe = ggml_rope_ext(
11336
+ ctx0, q_pe, inp_pos, nullptr,
11337
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
11338
+ ext_factor, attn_factor_scaled, beta_fast, beta_slow
11339
+ );
11340
+ cb(q_pe, "q_pe", il);
11341
+
11342
+ // shared RoPE key
11343
+ k_pe = ggml_cont(ctx0, k_pe); // TODO: the CUDA backend does not support non-contiguous RoPE
11344
+ k_pe = ggml_rope_ext(
11345
+ ctx0, k_pe, inp_pos, nullptr,
11346
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
11347
+ ext_factor, attn_factor_scaled, beta_fast, beta_slow
11348
+ );
11349
+ cb(k_pe, "k_pe", il);
11350
+
11351
+ struct ggml_tensor * q_states = ggml_concat(ctx0, q_nope, q_pe, 0);
11352
+ cb(q_states, "q_states", il);
11353
+
11354
+ struct ggml_tensor * k_states = ggml_concat(ctx0, k_nope, ggml_repeat(ctx0, k_pe, q_pe), 0);
11355
+ cb(k_states, "k_states", il);
11356
+
11357
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
11358
+ model.layers[il].wo, NULL,
11359
+ k_states, v_states, q_states, KQ_mask, n_tokens, kv_head, n_kv, kq_scale, cb, il);
11360
+ }
11361
+
11362
+ if (il == n_layer - 1) {
11363
+ // skip computing output for unused tokens
11364
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
11365
+ n_tokens = n_outputs;
11366
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
11367
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
11368
+ }
11369
+
11370
+ struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
11371
+ cb(ffn_inp, "ffn_inp", il);
11372
+
11373
+ if ((uint32_t) il < hparams.n_layer_dense_lead) {
11374
+ cur = llm_build_norm(ctx0, ffn_inp, hparams,
11375
+ model.layers[il].ffn_norm, NULL,
11376
+ LLM_NORM_RMS, cb, il);
11377
+ cb(cur, "ffn_norm", il);
11378
+
11379
+ cur = llm_build_ffn(ctx0, cur,
11380
+ model.layers[il].ffn_up, NULL,
11381
+ model.layers[il].ffn_gate, NULL,
11382
+ model.layers[il].ffn_down, NULL,
11383
+ NULL,
11384
+ LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
11385
+ cb(cur, "ffn_out", il);
11386
+ } else {
11387
+ // MoE branch
11388
+ cur = llm_build_norm(ctx0, ffn_inp, hparams,
11389
+ model.layers[il].ffn_norm, NULL,
11390
+ LLM_NORM_RMS, cb, il);
11391
+ cb(cur, "ffn_norm", il);
11392
+
11393
+ ggml_tensor * moe_out =
11394
+ llm_build_moe_ffn(ctx0, cur,
11395
+ model.layers[il].ffn_gate_inp,
11396
+ model.layers[il].ffn_up_exps,
11397
+ model.layers[il].ffn_gate_exps,
11398
+ model.layers[il].ffn_down_exps,
11399
+ n_expert, n_expert_used,
11400
+ LLM_FFN_SILU, false,
11401
+ true, hparams.expert_weights_scale,
11402
+ cb, il);
11403
+ cb(moe_out, "ffn_moe_out", il);
11404
+
11405
+ // FFN shared expert
11406
+ {
11407
+ ggml_tensor * ffn_shexp = llm_build_ffn(ctx0, cur,
11408
+ model.layers[il].ffn_up_shexp, NULL,
11409
+ model.layers[il].ffn_gate_shexp, NULL,
11410
+ model.layers[il].ffn_down_shexp, NULL,
11411
+ NULL,
11412
+ LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
11413
+ cb(ffn_shexp, "ffn_shexp", il);
11414
+
11415
+ cur = ggml_add(ctx0, moe_out, ffn_shexp);
11416
+ cb(cur, "ffn_out", il);
11417
+ }
11418
+ }
11419
+
11420
+ cur = ggml_add(ctx0, cur, ffn_inp);
11421
+ cb(cur, "l_out", il);
11422
+
11423
+ // input for next layer
11424
+ inpL = cur;
11425
+ }
11426
+
11427
+ cur = inpL;
11428
+
11429
+ cur = llm_build_norm(ctx0, cur, hparams,
11430
+ model.output_norm, NULL,
11431
+ LLM_NORM_RMS, cb, -1);
11432
+ cb(cur, "result_norm", -1);
11433
+
11434
+ // lm_head
11435
+ cur = ggml_mul_mat(ctx0, model.output, cur);
11436
+ cb(cur, "result_output", -1);
11437
+
11438
+ ggml_build_forward_expand(gf, cur);
11439
+
11440
+ return gf;
11441
+ }
11442
+
11008
11443
  };
11009
11444
 
11010
11445
  static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector<uint32_t> & ids) {
@@ -11223,6 +11658,10 @@ static struct ggml_cgraph * llama_build_graph(
11223
11658
  {
11224
11659
  result = llm.build_arctic();
11225
11660
  } break;
11661
+ case LLM_ARCH_DEEPSEEK2:
11662
+ {
11663
+ result = llm.build_deepseek2();
11664
+ } break;
11226
11665
  default:
11227
11666
  GGML_ASSERT(false);
11228
11667
  }
@@ -12239,27 +12678,27 @@ static enum llama_vocab_type llama_vocab_get_type(const llama_vocab & vocab) {
12239
12678
 
12240
12679
  static bool llama_is_normal_token(const llama_vocab & vocab, llama_token id) {
12241
12680
  GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
12242
- return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_NORMAL;
12681
+ return vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_NORMAL;
12243
12682
  }
12244
12683
 
12245
12684
  static bool llama_is_unknown_token(const llama_vocab & vocab, llama_token id) {
12246
12685
  GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
12247
- return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_UNKNOWN;
12686
+ return vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_UNKNOWN;
12248
12687
  }
12249
12688
 
12250
12689
  static bool llama_is_control_token(const llama_vocab & vocab, llama_token id) {
12251
12690
  GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
12252
- return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_CONTROL;
12691
+ return vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_CONTROL;
12253
12692
  }
12254
12693
 
12255
12694
  static bool llama_is_byte_token(const llama_vocab & vocab, llama_token id) {
12256
12695
  GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
12257
- return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_BYTE;
12696
+ return vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_BYTE;
12258
12697
  }
12259
12698
 
12260
12699
  static bool llama_is_user_defined_token(const llama_vocab& vocab, llama_token id) {
12261
12700
  GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
12262
- return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_USER_DEFINED;
12701
+ return vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_USER_DEFINED;
12263
12702
  }
12264
12703
 
12265
12704
  static uint8_t llama_token_to_byte(const llama_vocab& vocab, llama_token id) {
@@ -12512,6 +12951,7 @@ struct llm_tokenizer_bpe {
12512
12951
  });
12513
12952
  break;
12514
12953
  case LLAMA_VOCAB_PRE_TYPE_DBRX:
12954
+ case LLAMA_VOCAB_PRE_TYPE_SMAUG:
12515
12955
  word_collection = unicode_regex_split(text, {
12516
12956
  // same as llama3
12517
12957
  "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
@@ -12734,7 +13174,7 @@ struct llm_tokenizer_wpm {
12734
13174
  llm_tokenizer_wpm(const llama_vocab & vocab): vocab(vocab) {}
12735
13175
 
12736
13176
  void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
12737
- auto * token_map = &vocab.token_to_id;
13177
+ const auto & token_map = vocab.token_to_id;
12738
13178
 
12739
13179
  // normalize and split by whitespace
12740
13180
  std::vector<std::string> words = preprocess(text);
@@ -12749,108 +13189,89 @@ struct llm_tokenizer_wpm {
12749
13189
  }
12750
13190
 
12751
13191
  // prepend phantom space
12752
- std::string word1 = "\xe2\x96\x81" + word;
12753
- int n = word1.size();
13192
+ const std::string word1 = "\xe2\x96\x81" + word;
13193
+ const int n = word1.size();
12754
13194
 
12755
- // we're at the start of a new word
12756
- int i = 0;
12757
- bool match_any = false;
13195
+ const size_t current_tokens = output.size();
12758
13196
 
13197
+ // we're at the start of a new word
12759
13198
  // move through character position in word
12760
- while (i < n) {
13199
+ for (int i = 0; i < n; ++i) {
12761
13200
  // loop through possible match length
12762
13201
  bool match = false;
12763
13202
  for (int j = n; j > i; j--) {
12764
- auto it = token_map->find(word1.substr(i, j - i));
12765
- if (it != token_map->end()) {
13203
+ auto it = token_map.find(word1.substr(i, j - i));
13204
+ if (it != token_map.end()) {
12766
13205
  output.push_back(it->second);
12767
13206
  match = true;
12768
- match_any = true;
12769
- i = j;
13207
+ i = j - 1;
12770
13208
  break;
12771
13209
  }
12772
13210
  }
12773
13211
 
12774
- // must be an unknown character
12775
- if (!match) {
12776
- i++;
13212
+ if (!match) { // discard all
13213
+ output.resize(current_tokens);
13214
+ break; // and discard next tokens
12777
13215
  }
12778
13216
  }
12779
13217
 
12780
13218
  // we didn't find any matches for this word
12781
- if (!match_any) {
13219
+ if (current_tokens == output.size()) {
12782
13220
  output.push_back(vocab.special_unk_id);
12783
13221
  }
12784
13222
  }
12785
13223
  }
12786
13224
 
12787
13225
  std::vector<std::string> preprocess(const std::string & text) {
12788
- std::vector<uint32_t> cpts_nfd = unicode_cpts_normalize_nfd(unicode_cpts_from_utf8(text));
12789
-
12790
- // strip accents, strip control, uniformize whitespace,
12791
- // to lowercase, pad chinese characters, pad punctuation
12792
- std::string new_str = "";
12793
- for (uint32_t code : cpts_nfd) {
12794
- const codepoint_flags flags = unicode_cpt_flags(code);
12795
- if (flags.is_accent_mark || flags.is_control) {
13226
+ const std::vector<uint32_t> cpts_nfd = unicode_cpts_normalize_nfd(unicode_cpts_from_utf8(text));
13227
+ std::vector<std::string> words(1, "");
13228
+
13229
+ for (const char32_t cpt : cpts_nfd) {
13230
+ const auto flags = unicode_cpt_flags(cpt);
13231
+
13232
+ if (flags.is_whitespace) {
13233
+ if (words.back().size()) { // finish previous word if any
13234
+ words.emplace_back();
13235
+ }
12796
13236
  continue;
12797
13237
  }
12798
- code = unicode_tolower(code);
12799
- if (flags.is_separator || flags.is_whitespace) { //####FIXME: is_separator ?
12800
- code = ' ';
12801
- }
12802
- std::string s = unicode_cpt_to_utf8(code);
12803
- if (flags.is_punctuation || is_ascii_punct(code) || is_chinese_char(code)) {
12804
- new_str += " ";
12805
- new_str += s;
12806
- new_str += " ";
12807
- } else {
12808
- new_str += s;
13238
+
13239
+ assert (!flags.is_separator);
13240
+ if (cpt == 0 || cpt == 0xFFFD || flags.is_control) {
13241
+ continue;
12809
13242
  }
12810
- }
12811
13243
 
12812
- // split by whitespace
12813
- uint64_t l = 0;
12814
- uint64_t r = 0;
12815
- std::vector<std::string> words;
12816
- while (r < new_str.size()) {
12817
- // if is whitespace
12818
- if (isspace(new_str[r], std::locale::classic())) {
12819
- if (r > l) words.push_back(new_str.substr(l, (r - l)));
12820
- l = r + 1;
12821
- r = l;
13244
+ const std::string s = unicode_cpt_to_utf8(unicode_tolower(cpt));
13245
+ if (flags.is_punctuation || ( cpt < 0x7F && flags.is_symbol ) || is_chinese_char(cpt)) {
13246
+ if (words.back().size()) { // finish previous word if any
13247
+ words.emplace_back();
13248
+ }
13249
+ words.back() = s; // single char word
13250
+ words.emplace_back(); // start a new word
12822
13251
  } else {
12823
- r += 1;
13252
+ words.back() += s; // append char to word
12824
13253
  }
12825
13254
  }
12826
- if (r > l) {
12827
- words.push_back(new_str.substr(l, (r - l)));
12828
- }
12829
- return words;
12830
- }
12831
13255
 
12832
- bool is_ascii_punct(uint32_t code) {
12833
- if (code > 0xFF) {
12834
- return false;
13256
+ if (!words.back().size()) {
13257
+ words.pop_back();
12835
13258
  }
12836
- auto c = char(static_cast<unsigned char>(code));
12837
- return ispunct(c, std::locale::classic());
13259
+
13260
+ return words;
12838
13261
  }
12839
13262
 
12840
- bool is_chinese_char(uint32_t cpt) {
12841
- if ((cpt >= 0x4E00 && cpt <= 0x9FFF) ||
12842
- (cpt >= 0x3400 && cpt <= 0x4DBF) ||
13263
+ static bool is_chinese_char(uint32_t cpt) {
13264
+ return
13265
+ (cpt >= 0x04E00 && cpt <= 0x09FFF) ||
13266
+ (cpt >= 0x03400 && cpt <= 0x04DBF) ||
12843
13267
  (cpt >= 0x20000 && cpt <= 0x2A6DF) ||
12844
13268
  (cpt >= 0x2A700 && cpt <= 0x2B73F) ||
12845
13269
  (cpt >= 0x2B740 && cpt <= 0x2B81F) ||
12846
13270
  (cpt >= 0x2B920 && cpt <= 0x2CEAF) || // this should be 0x2B820 but in hf rust code it is 0x2B920
12847
- (cpt >= 0xF900 && cpt <= 0xFAFF) ||
12848
- (cpt >= 0x2F800 && cpt <= 0x2FA1F) ||
12849
- (cpt >= 0x3000 && cpt <= 0x303F) ||
12850
- (cpt >= 0xFF00 && cpt <= 0xFFEF)) {
12851
- return true; // NOLINT
12852
- }
12853
- return false;
13271
+ (cpt >= 0x0F900 && cpt <= 0x0FAFF) ||
13272
+ (cpt >= 0x2F800 && cpt <= 0x2FA1F);
13273
+ //(cpt >= 0x3000 && cpt <= 0x303F) ||
13274
+ //(cpt >= 0xFF00 && cpt <= 0xFFEF);
12854
13275
  }
12855
13276
 
12856
13277
  const llama_vocab & vocab;
@@ -12894,9 +13315,9 @@ struct fragment_buffer_variant {
12894
13315
 
12895
13316
  static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<fragment_buffer_variant> & buffer) {
12896
13317
  // for each special token
12897
- for (const auto & st: vocab.special_tokens_cache) {
12898
- const auto & special_token = st.first;
12899
- const auto & special_id = st.second;
13318
+ for (const llama_vocab::id special_id : vocab.cache_special_tokens) {
13319
+ const auto & data = vocab.id_to_token[special_id];
13320
+ const auto & special_token = data.text;
12900
13321
 
12901
13322
  // for each text fragment
12902
13323
  std::forward_list<fragment_buffer_variant>::iterator it = buffer.begin();
@@ -12905,7 +13326,7 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<
12905
13326
 
12906
13327
  // if a fragment is text ( not yet processed )
12907
13328
  if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
12908
- auto * raw_text = &(fragment.raw_text);
13329
+ auto & raw_text = fragment.raw_text;
12909
13330
 
12910
13331
  auto raw_text_base_offset = fragment.offset;
12911
13332
  auto raw_text_base_length = fragment.length;
@@ -12915,7 +13336,7 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<
12915
13336
  // find the first occurrence of a given special token in this fragment
12916
13337
  // passing offset argument only limit the "search area" but match coordinates
12917
13338
  // are still relative to the source full raw_text
12918
- auto match = raw_text->find(special_token, raw_text_base_offset);
13339
+ auto match = raw_text.find(special_token, raw_text_base_offset);
12919
13340
 
12920
13341
  // no occurrences found, stop processing this fragment for a given special token
12921
13342
  if (match == std::string::npos) break;
@@ -12933,13 +13354,22 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<
12933
13354
  if (match > raw_text_base_offset) {
12934
13355
  // left
12935
13356
  const int64_t left_reminder_offset = raw_text_base_offset + 0;
12936
- const int64_t left_reminder_length = match - raw_text_base_offset;
12937
- buffer.emplace_after(it, (*raw_text), left_reminder_offset, left_reminder_length);
13357
+ int64_t left_reminder_length = match - raw_text_base_offset;
13358
+
13359
+ if (data.attr & LLAMA_TOKEN_ATTR_LSTRIP) {
13360
+ while (left_reminder_length > 0 && isspace(raw_text[left_reminder_offset + left_reminder_length - 1])) {
13361
+ left_reminder_length--;
13362
+ }
13363
+ }
13364
+
13365
+ if (left_reminder_length > 0) {
13366
+ buffer.emplace_after(it, raw_text, left_reminder_offset, left_reminder_length);
13367
+ it++;
13368
+ }
12938
13369
 
12939
13370
  #ifdef PRETOKENIZERDEBUG
12940
13371
  LLAMA_LOG_WARN("FL: (%ld %ld) '%s'\n", left_reminder_offset, left_reminder_length, raw_text->substr(left_reminder_offset, left_reminder_length).c_str());
12941
13372
  #endif
12942
- it++;
12943
13373
  }
12944
13374
 
12945
13375
  // special token
@@ -12948,16 +13378,25 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<
12948
13378
 
12949
13379
  // right
12950
13380
  if (match + special_token.length() < raw_text_base_offset + raw_text_base_length) {
12951
- const int64_t right_reminder_offset = match + special_token.length();
12952
- const int64_t right_reminder_length = raw_text_base_length - ((match - raw_text_base_offset) + special_token.length());
12953
- buffer.emplace_after(it, (*raw_text), right_reminder_offset, right_reminder_length);
13381
+ int64_t right_reminder_offset = match + special_token.length();
13382
+ int64_t right_reminder_length = raw_text_base_length - ((match - raw_text_base_offset) + special_token.length());
13383
+
13384
+ if (data.attr & LLAMA_TOKEN_ATTR_RSTRIP) {
13385
+ while (right_reminder_length > 0 && isspace(raw_text[right_reminder_offset])) {
13386
+ right_reminder_offset++;
13387
+ right_reminder_length--;
13388
+ }
13389
+ }
13390
+
13391
+ if (right_reminder_length > 0) {
13392
+ buffer.emplace_after(it, raw_text, right_reminder_offset, right_reminder_length);
13393
+ it++;
13394
+ }
12954
13395
 
12955
13396
  #ifdef PRETOKENIZERDEBUG
12956
13397
  LLAMA_LOG_WARN("FR: (%ld %ld) '%s'\n", right_reminder_offset, right_reminder_length, raw_text->substr(right_reminder_offset, right_reminder_length).c_str());
12957
13398
  #endif
12958
13399
 
12959
- it++;
12960
-
12961
13400
  if (source == 0) {
12962
13401
  buffer.erase_after(buffer.before_begin());
12963
13402
  } else {
@@ -13003,9 +13442,7 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
13003
13442
  // tokenizer.encode('', add_special_tokens=True) returns [1]
13004
13443
  // tokenizer.encode('', add_special_tokens=False) returns []
13005
13444
 
13006
- static const bool rtrim = true; //TODO: as param
13007
13445
  bool is_prev_special = false;
13008
- bool special_token_rtrim = false;
13009
13446
 
13010
13447
  if (add_special && vocab.special_add_bos != 0) {
13011
13448
  GGML_ASSERT(vocab.special_bos_id != -1);
@@ -13015,25 +13452,8 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
13015
13452
 
13016
13453
  for (const auto & fragment : fragment_buffer) {
13017
13454
  if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
13018
- // without adding this leading whitespace, we do not get the same results as the original tokenizer
13019
-
13020
- // TODO: It's likely possible to get rid of this string copy entirely
13021
- // by modifying llm_tokenizer_x to operate with string offsets like pre-tokenizer
13022
- // and passing 'add space prefix' as bool argument
13023
- //
13024
13455
  auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
13025
13456
 
13026
- if (special_token_rtrim) {
13027
- size_t num_whitespaces = 0;
13028
- while (isspace(raw_text[num_whitespaces])) {
13029
- num_whitespaces++;
13030
- }
13031
- if (num_whitespaces == raw_text.size()) {
13032
- continue; // skip if all whitespaces
13033
- }
13034
- raw_text = raw_text.substr(num_whitespaces);
13035
- }
13036
-
13037
13457
  if (vocab.add_space_prefix) {
13038
13458
  if (!output.size() || is_prev_special) { // prefix with space if first token
13039
13459
  raw_text = " " + raw_text;
@@ -13049,11 +13469,6 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
13049
13469
  } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
13050
13470
  output.push_back(fragment.token);
13051
13471
  is_prev_special = true;
13052
- // phi-3 special tokens without rtrim, works fine for llama-spm too
13053
- special_token_rtrim = rtrim
13054
- && fragment.token != vocab.special_bos_id
13055
- && fragment.token != vocab.special_unk_id
13056
- && fragment.token != vocab.special_eos_id;
13057
13472
  }
13058
13473
  }
13059
13474
 
@@ -14054,7 +14469,7 @@ void llama_sample_repetition_penalties(
14054
14469
 
14055
14470
  void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * candidates, const struct llama_grammar * grammar) {
14056
14471
  GGML_ASSERT(ctx);
14057
- const int64_t t_start_sample_us = ggml_time_us();
14472
+ int64_t t_start_sample_us = ggml_time_us();
14058
14473
 
14059
14474
  bool allow_eog = false;
14060
14475
  for (const auto & stack : grammar->stacks) {
@@ -14066,12 +14481,13 @@ void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * c
14066
14481
 
14067
14482
  std::vector<std::pair<std::vector<uint32_t>, llama_partial_utf8>> candidates_decoded;
14068
14483
  candidates_decoded.reserve(candidates->size);
14069
- std::vector<llama_grammar_candidate> candidates_grammar;
14484
+
14485
+ std::vector<llama_grammar_candidate> candidates_grammar;
14070
14486
  candidates_grammar.reserve(candidates->size);
14071
14487
 
14072
14488
  for (size_t i = 0; i < candidates->size; ++i) {
14073
- const llama_token id = candidates->data[i].id;
14074
- const std::string piece = llama_token_to_piece(ctx, id, false);
14489
+ const llama_token id = candidates->data[i].id;
14490
+ const std::string & piece = ctx->model.vocab.cache_token_to_piece.at(id);
14075
14491
 
14076
14492
  if (llama_token_is_eog(&ctx->model, id)) {
14077
14493
  if (!allow_eog) {
@@ -14271,7 +14687,7 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar
14271
14687
  GGML_ASSERT(false);
14272
14688
  }
14273
14689
 
14274
- const std::string piece = llama_token_to_piece(ctx, token, false);
14690
+ const std::string & piece = ctx->model.vocab.cache_token_to_piece.at(token);
14275
14691
 
14276
14692
  // Note terminating 0 in decoded string
14277
14693
  const auto decoded = decode_utf8(piece, grammar->partial_utf8);
@@ -14287,260 +14703,6 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar
14287
14703
  ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
14288
14704
  }
14289
14705
 
14290
- //
14291
- // Beam search
14292
- //
14293
-
14294
- struct llama_beam {
14295
- std::vector<llama_token> tokens;
14296
- float p; // Cumulative beam probability (renormalized relative to all beams)
14297
- bool eob; // Initialize end-of-beam to false. Callback sets this to true.
14298
- // Sort beams by probability. In case of ties, prefer beams at eob.
14299
- bool operator<(const llama_beam & rhs) const {
14300
- return std::make_pair(p, eob) < std::make_pair(rhs.p, rhs.eob);
14301
- }
14302
- // Shift off first n tokens and discard them.
14303
- void shift_tokens(const size_t n) {
14304
- if (n) {
14305
- std::copy(tokens.begin() + n, tokens.end(), tokens.begin());
14306
- tokens.resize(tokens.size() - n);
14307
- }
14308
- }
14309
- llama_beam_view view() const { return {tokens.data(), tokens.size(), p, eob}; }
14310
- };
14311
-
14312
- // A struct for calculating logit-related info.
14313
- struct llama_logit_info {
14314
- const float * const logits;
14315
- const int n_vocab;
14316
- const float max_l;
14317
- const float normalizer;
14318
- struct sum_exp {
14319
- float max_l;
14320
- float operator()(float sum, float l) const { return sum + std::exp(l - max_l); }
14321
- };
14322
- llama_logit_info(llama_context * ctx)
14323
- : logits(llama_get_logits(ctx))
14324
- , n_vocab(llama_n_vocab(llama_get_model(ctx)))
14325
- , max_l(*std::max_element(logits, logits + n_vocab))
14326
- , normalizer(1.0f / std::accumulate(logits, logits + n_vocab, 0.0f, sum_exp{max_l}))
14327
- { }
14328
- llama_token_data get_token_data(const llama_token token_id) const {
14329
- constexpr auto p = std::numeric_limits<float>::quiet_NaN(); // never used
14330
- return {token_id, logits[token_id], p};
14331
- }
14332
- // Return top k token_data by logit.
14333
- std::vector<llama_token_data> top_k(size_t k) {
14334
- std::vector<llama_token_data> min_heap; // min-heap by logit
14335
- const llama_token k_min = std::min(static_cast<llama_token>(k), n_vocab);
14336
- min_heap.reserve(k_min);
14337
- for (llama_token token_id = 0 ; token_id < k_min ; ++token_id) {
14338
- min_heap.push_back(get_token_data(token_id));
14339
- }
14340
- auto comp = [](const llama_token_data & a, const llama_token_data & b) { return a.logit > b.logit; };
14341
- std::make_heap(min_heap.begin(), min_heap.end(), comp);
14342
- for (llama_token token_id = k_min ; token_id < n_vocab ; ++token_id) {
14343
- if (min_heap.front().logit < logits[token_id]) {
14344
- std::pop_heap(min_heap.begin(), min_heap.end(), comp);
14345
- min_heap.back().id = token_id;
14346
- min_heap.back().logit = logits[token_id];
14347
- std::push_heap(min_heap.begin(), min_heap.end(), comp);
14348
- }
14349
- }
14350
- return min_heap;
14351
- }
14352
- float probability_from_logit(float logit) const {
14353
- return normalizer * std::exp(logit - max_l);
14354
- }
14355
- };
14356
-
14357
- struct llama_beam_search_data {
14358
- llama_context * ctx;
14359
- size_t n_beams;
14360
- int n_past;
14361
- int n_predict;
14362
- std::vector<llama_beam> beams;
14363
- std::vector<llama_beam> next_beams;
14364
-
14365
- // Re-calculated on each loop iteration
14366
- size_t common_prefix_length;
14367
-
14368
- // Used to communicate to/from callback on beams state.
14369
- std::vector<llama_beam_view> beam_views;
14370
-
14371
- llama_beam_search_data(llama_context * ctx, size_t n_beams, int n_past, int n_predict)
14372
- : ctx(ctx)
14373
- , n_beams(n_beams)
14374
- , n_past(n_past)
14375
- , n_predict(n_predict)
14376
- , beam_views(n_beams) {
14377
- beams.reserve(n_beams);
14378
- next_beams.reserve(n_beams);
14379
- }
14380
-
14381
- // Collapse beams to a single beam given by index.
14382
- void collapse_beams(const size_t beam_idx) {
14383
- if (0u < beam_idx) {
14384
- std::swap(beams[0], beams[beam_idx]);
14385
- }
14386
- beams.resize(1);
14387
- }
14388
-
14389
- // Min-heaps are used to efficiently collect the top-k elements (k=n_beams).
14390
- // The repetitive patterns below reflect the 2 stages of heaps:
14391
- // * Gather elements until the vector is full, then call std::make_heap() on it.
14392
- // * If the heap is full and a new element is found that should be included, pop the
14393
- // least element to the back(), replace it with the new, then push it into the heap.
14394
- void fill_next_beams_by_top_probabilities(llama_beam & beam) {
14395
- // Min-heaps use a greater-than comparator.
14396
- const auto comp = [](const llama_beam & a, const llama_beam & b) { return a.p > b.p; };
14397
- if (beam.eob) {
14398
- // beam is at end-of-sentence, so just copy it to next_beams if its probability is high enough.
14399
- if (next_beams.size() < n_beams) {
14400
- next_beams.push_back(std::move(beam));
14401
- if (next_beams.size() == n_beams) {
14402
- std::make_heap(next_beams.begin(), next_beams.end(), comp);
14403
- }
14404
- } else if (next_beams.front().p < beam.p) {
14405
- std::pop_heap(next_beams.begin(), next_beams.end(), comp);
14406
- next_beams.back() = std::move(beam);
14407
- std::push_heap(next_beams.begin(), next_beams.end(), comp);
14408
- }
14409
- } else {
14410
- // beam is not at end-of-sentence, so branch with next top_k tokens.
14411
- if (!beam.tokens.empty()) {
14412
- llama_decode(ctx, llama_batch_get_one(beam.tokens.data(), beam.tokens.size(), n_past, 0));
14413
- }
14414
- llama_logit_info logit_info(ctx);
14415
- std::vector<llama_token_data> next_tokens = logit_info.top_k(n_beams);
14416
-
14417
- // Clear the kv slot so that other beams may try different tokens at this position. The llama_decode()
14418
- // call in loop() will conclusively fill in the kv slot once the beams converge at this position.
14419
- llama_kv_cache_seq_rm(ctx, 0, n_past, -1);
14420
-
14421
- size_t i=0;
14422
- if (next_beams.size() < n_beams) {
14423
- for (; next_beams.size() < n_beams ; ++i) {
14424
- llama_beam next_beam = beam;
14425
- next_beam.tokens.push_back(next_tokens[i].id);
14426
- next_beam.p *= logit_info.probability_from_logit(next_tokens[i].logit);
14427
- next_beams.push_back(std::move(next_beam));
14428
- }
14429
- std::make_heap(next_beams.begin(), next_beams.end(), comp);
14430
- } else {
14431
- for (; next_beams.front().p == 0.0f ; ++i) {
14432
- std::pop_heap(next_beams.begin(), next_beams.end(), comp);
14433
- next_beams.back() = beam;
14434
- next_beams.back().tokens.push_back(next_tokens[i].id);
14435
- next_beams.back().p *= logit_info.probability_from_logit(next_tokens[i].logit);
14436
- std::push_heap(next_beams.begin(), next_beams.end(), comp);
14437
- }
14438
- }
14439
- for (; i < n_beams ; ++i) {
14440
- const float next_p = beam.p * logit_info.probability_from_logit(next_tokens[i].logit);
14441
- if (next_beams.front().p < next_p) {
14442
- std::pop_heap(next_beams.begin(), next_beams.end(), comp);
14443
- next_beams.back() = beam;
14444
- next_beams.back().tokens.push_back(next_tokens[i].id);
14445
- next_beams.back().p = next_p;
14446
- std::push_heap(next_beams.begin(), next_beams.end(), comp);
14447
- }
14448
- }
14449
- }
14450
- }
14451
-
14452
- // Find common_prefix_length based on beams.
14453
- // Requires beams is not empty.
14454
- size_t find_common_prefix_length() {
14455
- size_t common_prefix_length = beams[0].tokens.size();
14456
- for (size_t i = 1 ; i < beams.size() ; ++i) {
14457
- common_prefix_length = std::min(common_prefix_length, beams[i].tokens.size());
14458
- for (size_t j = 0 ; j < common_prefix_length ; ++j) {
14459
- if (beams[0].tokens[j] != beams[i].tokens[j]) {
14460
- common_prefix_length = j;
14461
- break;
14462
- }
14463
- }
14464
- }
14465
- return common_prefix_length;
14466
- }
14467
-
14468
- // Construct beams_state to send back to caller via the callback function.
14469
- // Side effect: set common_prefix_length = find_common_prefix_length();
14470
- llama_beams_state get_beams_state(const bool last_call) {
14471
- for (size_t i = 0 ; i < beams.size() ; ++i) {
14472
- beam_views[i] = beams[i].view();
14473
- }
14474
- common_prefix_length = find_common_prefix_length();
14475
- return {beam_views.data(), beams.size(), common_prefix_length, last_call};
14476
- }
14477
-
14478
- // Loop:
14479
- // * while i < n_predict, AND
14480
- // * any of the beams have not yet reached end-of-beam (eob), AND
14481
- // * the highest probability beam(s) (plural in case of ties) are not at end-of-sentence
14482
- // (since all other beam probabilities can only decrease)
14483
- void loop(const llama_beam_search_callback_fn_t callback, void * const callback_data) {
14484
- beams.push_back({{}, 1.0f, false}); // Start with one empty beam w/ probability = 1.0 and !eob.
14485
- const auto not_eob = [](const llama_beam & beam) { return !beam.eob; };
14486
- for (int i = 0 ; i < n_predict && std::any_of(beams.begin(),beams.end(),not_eob) &&
14487
- !beams[top_beam_index()].eob ; ++i) {
14488
- callback(callback_data, get_beams_state(false)); // Sets common_prefix_length
14489
- update_beams_from_beam_views(); // Update values (p,eob) that callback may have changed.
14490
- if (common_prefix_length) {
14491
- llama_decode(ctx, llama_batch_get_one(beams[0].tokens.data(), common_prefix_length, n_past, 0));
14492
- n_past += common_prefix_length;
14493
- }
14494
- // Zero-out next_beam probabilities to place them last in following min-heap.
14495
- std::for_each(next_beams.begin(), next_beams.end(), [](llama_beam & beam) { beam.p = 0.0f; });
14496
- for (llama_beam & beam : beams) {
14497
- beam.shift_tokens(common_prefix_length);
14498
- fill_next_beams_by_top_probabilities(beam);
14499
- }
14500
- // next_beams become the beams of next/final iteration. Swap them to re-use memory.
14501
- beams.swap(next_beams);
14502
- renormalize_beam_probabilities(beams);
14503
- }
14504
- collapse_beams(top_beam_index());
14505
- callback(callback_data, get_beams_state(true));
14506
- }
14507
-
14508
- // As beams grow, the cumulative probabilities decrease.
14509
- // Renormalize them to avoid floating point underflow.
14510
- static void renormalize_beam_probabilities(std::vector<llama_beam> & beams) {
14511
- const auto sum_p = [](float sum, llama_beam & beam) { return sum + beam.p; };
14512
- const float inv_sum = 1.0f / std::accumulate(beams.begin(), beams.end(), 0.0f, sum_p);
14513
- std::for_each(beams.begin(), beams.end(), [=](llama_beam & beam) { beam.p *= inv_sum; });
14514
- }
14515
-
14516
- // Assumes beams is non-empty. Uses llama_beam::operator<() for ordering.
14517
- size_t top_beam_index() {
14518
- return std::max_element(beams.begin(), beams.end()) - beams.begin();
14519
- }
14520
-
14521
- // Copy (p,eob) for each beam which may have been changed by the callback.
14522
- void update_beams_from_beam_views() {
14523
- for (size_t i = 0 ; i < beams.size() ; ++i) {
14524
- beams[i].p = beam_views[i].p;
14525
- beams[i].eob = beam_views[i].eob;
14526
- }
14527
- }
14528
- };
14529
-
14530
- void llama_beam_search(llama_context * ctx,
14531
- llama_beam_search_callback_fn_t callback, void * callback_data,
14532
- size_t n_beams, int n_past, int n_predict) {
14533
- assert(ctx);
14534
- const int64_t t_start_sample_us = ggml_time_us();
14535
-
14536
- llama_beam_search_data beam_search_data(ctx, n_beams, n_past, n_predict);
14537
-
14538
- beam_search_data.loop(callback, callback_data);
14539
-
14540
- ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
14541
- ctx->n_sample++;
14542
- }
14543
-
14544
14706
  //
14545
14707
  // quantization
14546
14708
  //
@@ -15751,7 +15913,7 @@ bool llama_supports_mlock(void) {
15751
15913
  }
15752
15914
 
15753
15915
  bool llama_supports_gpu_offload(void) {
15754
- #if defined(GGML_USE_CUDA) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL) || defined(GGML_USE_VULKAN) || \
15916
+ #if defined(GGML_USE_CUDA) || defined(GGML_USE_METAL) || defined(GGML_USE_VULKAN) || \
15755
15917
  defined(GGML_USE_SYCL) || defined(GGML_USE_KOMPUTE) || defined(GGML_USE_RPC)
15756
15918
  // Defined when llama.cpp is compiled with support for offloading model layers to GPU.
15757
15919
  return true;
@@ -15808,7 +15970,7 @@ struct llama_model * llama_load_model_from_file(
15808
15970
  return true;
15809
15971
  };
15810
15972
  }
15811
- if (params.rpc_servers != nullptr) {
15973
+ if (params.rpc_servers != nullptr && params.rpc_servers[0] != '\0') {
15812
15974
  // split the servers set them into model->rpc_servers
15813
15975
  std::string servers(params.rpc_servers);
15814
15976
  size_t pos = 0;
@@ -15862,6 +16024,11 @@ struct llama_context * llama_new_context_with_model(
15862
16024
  params.flash_attn = false;
15863
16025
  }
15864
16026
 
16027
+ if (params.type_v != GGML_TYPE_F16 && !params.flash_attn) {
16028
+ LLAMA_LOG_ERROR("%s: V cache quantization requires flash_attn\n", __func__);
16029
+ return nullptr;
16030
+ }
16031
+
15865
16032
  llama_context * ctx = new llama_context(*model);
15866
16033
 
15867
16034
  const auto & hparams = model->hparams;
@@ -15900,8 +16067,8 @@ struct llama_context * llama_new_context_with_model(
15900
16067
 
15901
16068
  cparams.n_ubatch = std::min(cparams.n_batch, params.n_ubatch == 0 ? params.n_batch : params.n_ubatch);
15902
16069
 
15903
- cparams.n_yarn_orig_ctx = params.yarn_orig_ctx != 0 ? params.yarn_orig_ctx :
15904
- hparams.n_yarn_orig_ctx != 0 ? hparams.n_yarn_orig_ctx :
16070
+ cparams.n_ctx_orig_yarn = params.yarn_orig_ctx != 0 ? params.yarn_orig_ctx :
16071
+ hparams.n_ctx_orig_yarn != 0 ? hparams.n_ctx_orig_yarn :
15905
16072
  hparams.n_ctx_train;
15906
16073
 
15907
16074
  cparams.cb_eval = params.cb_eval;
@@ -15966,17 +16133,7 @@ struct llama_context * llama_new_context_with_model(
15966
16133
 
15967
16134
  if (!hparams.vocab_only) {
15968
16135
  // initialize backends
15969
- #if defined(GGML_USE_RPC)
15970
- for (auto & server : model->rpc_servers) {
15971
- ggml_backend_t backend = ggml_backend_rpc_init(server.c_str());
15972
- if (backend == nullptr) {
15973
- LLAMA_LOG_ERROR("%s: failed to connect RPC backend to %s\n", __func__, server.c_str());
15974
- llama_free(ctx);
15975
- return nullptr;
15976
- }
15977
- ctx->backends.push_back(backend);
15978
- }
15979
- #elif defined(GGML_USE_METAL)
16136
+ #if defined(GGML_USE_METAL)
15980
16137
  if (model->n_gpu_layers > 0) {
15981
16138
  ctx->backend_metal = ggml_backend_metal_init();
15982
16139
  if (ctx->backend_metal == nullptr) {
@@ -16015,7 +16172,7 @@ struct llama_context * llama_new_context_with_model(
16015
16172
  return nullptr;
16016
16173
  }
16017
16174
  if (model->split_mode == LLAMA_SPLIT_MODE_NONE) {
16018
- ggml_backend_t backend = ggml_backend_vk_init(0);
16175
+ ggml_backend_t backend = ggml_backend_vk_init(model->main_gpu);
16019
16176
  if (backend == nullptr) {
16020
16177
  LLAMA_LOG_ERROR("%s: failed to initialize Vulkan backend\n", __func__);
16021
16178
  llama_free(ctx);
@@ -16068,6 +16225,19 @@ struct llama_context * llama_new_context_with_model(
16068
16225
  }
16069
16226
  ctx->backends.push_back(backend);
16070
16227
  }
16228
+ #endif
16229
+ #if defined(GGML_USE_RPC)
16230
+ if (model->n_gpu_layers > 0) {
16231
+ for (const auto & endpoint : model->rpc_servers) {
16232
+ ggml_backend_t backend = ggml_backend_rpc_init(endpoint.c_str());
16233
+ if (backend == nullptr) {
16234
+ LLAMA_LOG_ERROR("%s: failed to initialize RPC to '%s'\n", __func__, endpoint.c_str());
16235
+ llama_free(ctx);
16236
+ return nullptr;
16237
+ }
16238
+ ctx->backends.push_back(backend);
16239
+ }
16240
+ }
16071
16241
  #endif
16072
16242
  ctx->backend_cpu = ggml_backend_cpu_init();
16073
16243
  if (ctx->backend_cpu == nullptr) {
@@ -16235,6 +16405,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
16235
16405
  case LLM_ARCH_COMMAND_R:
16236
16406
  case LLM_ARCH_OLMO:
16237
16407
  case LLM_ARCH_ARCTIC:
16408
+ case LLM_ARCH_DEEPSEEK2:
16238
16409
  return LLAMA_ROPE_TYPE_NORM;
16239
16410
 
16240
16411
  // the pairs of head values are offset by n_rot/2
@@ -17849,9 +18020,9 @@ float llama_token_get_score(const struct llama_model * model, llama_token token)
17849
18020
  return model->vocab.id_to_token[token].score;
17850
18021
  }
17851
18022
 
17852
- llama_token_type llama_token_get_type(const struct llama_model * model, llama_token token) {
18023
+ llama_token_attr llama_token_get_attr(const struct llama_model * model, llama_token token) {
17853
18024
  GGML_ASSERT(model->vocab.type != LLAMA_VOCAB_TYPE_NONE);
17854
- return model->vocab.id_to_token[token].type;
18025
+ return model->vocab.id_to_token[token].attr;
17855
18026
  }
17856
18027
 
17857
18028
  bool llama_token_is_eog(const struct llama_model * model, llama_token token) {
@@ -17861,6 +18032,10 @@ bool llama_token_is_eog(const struct llama_model * model, llama_token token) {
17861
18032
  );
17862
18033
  }
17863
18034
 
18035
+ bool llama_token_is_control(const struct llama_model * model, llama_token token) {
18036
+ return llama_is_control_token(model->vocab, token);
18037
+ }
18038
+
17864
18039
  llama_token llama_token_bos(const struct llama_model * model) {
17865
18040
  return model->vocab.special_bos_id;
17866
18041
  }
@@ -17932,7 +18107,16 @@ static std::string llama_decode_text(const std::string & text) {
17932
18107
 
17933
18108
  const auto cpts = unicode_cpts_from_utf8(text);
17934
18109
  for (const auto cpt : cpts) {
17935
- decoded_text += unicode_utf8_to_byte(unicode_cpt_to_utf8(cpt));
18110
+ const auto utf8 = unicode_cpt_to_utf8(cpt);
18111
+ try {
18112
+ decoded_text += unicode_utf8_to_byte(utf8);
18113
+ } catch (const std::out_of_range & e) {
18114
+ decoded_text += "[UNK_BYTE_0x";
18115
+ for (const auto c : utf8) {
18116
+ decoded_text += format("%02x", (uint8_t) c);
18117
+ }
18118
+ decoded_text += text + "]";
18119
+ }
17936
18120
  }
17937
18121
 
17938
18122
  return decoded_text;
@@ -17940,69 +18124,88 @@ static std::string llama_decode_text(const std::string & text) {
17940
18124
 
17941
18125
  // does not write null-terminator to buf
17942
18126
  int32_t llama_token_to_piece(const struct llama_model * model, llama_token token, char * buf, int32_t length, bool special) {
18127
+ // ref: https://github.com/ggerganov/llama.cpp/pull/7587#discussion_r1620983843
18128
+ if (!special && llama_is_control_token(model->vocab, token)) {
18129
+ return 0;
18130
+ }
18131
+
18132
+ // if we have a cache - use it
18133
+ {
18134
+ const auto & cache = model->vocab.cache_token_to_piece;
18135
+
18136
+ if (!cache.empty()) {
18137
+ const auto & res = cache.at(token);
18138
+ if (length < (int) res.size()) {
18139
+ return -(int) res.size();
18140
+ }
18141
+ memcpy(buf, res.c_str(), res.size());
18142
+ return res.size();
18143
+ }
18144
+ }
18145
+
17943
18146
  if (0 <= token && token < llama_n_vocab(model)) {
17944
18147
  switch (llama_vocab_get_type(model->vocab)) {
17945
- case LLAMA_VOCAB_TYPE_WPM:
17946
- case LLAMA_VOCAB_TYPE_SPM: {
17947
- // NOTE: we accept all unsupported token types,
17948
- // suppressing them like CONTROL tokens.
17949
- if (llama_is_normal_token(model->vocab, token)) {
17950
- std::string result = model->vocab.id_to_token[token].text;
17951
- llama_unescape_whitespace(result);
17952
- if (length < (int) result.length()) {
17953
- return -(int) result.length();
17954
- }
17955
- memcpy(buf, result.c_str(), result.length());
17956
- return result.length();
17957
- } else if (
17958
- (llama_is_user_defined_token(model->vocab, token)) ||
17959
- (llama_is_control_token (model->vocab, token) && special)) {
17960
- std::string result = model->vocab.id_to_token[token].text;
17961
- if (length < (int) result.length()) {
17962
- return -(int) result.length();
17963
- }
17964
- memcpy(buf, result.c_str(), result.length());
17965
- return result.length();
17966
- } else if (llama_is_unknown_token(model->vocab, token)) { // NOLINT
17967
- if (length < 3) {
17968
- return -3;
17969
- }
17970
- memcpy(buf, "\xe2\x96\x85", 3);
17971
- return 3;
17972
- } else if (llama_is_byte_token(model->vocab, token)) {
17973
- if (length < 1) {
17974
- return -1;
18148
+ case LLAMA_VOCAB_TYPE_WPM:
18149
+ case LLAMA_VOCAB_TYPE_SPM: {
18150
+ // NOTE: we accept all unsupported token types,
18151
+ // suppressing them like CONTROL tokens.
18152
+ if (llama_is_normal_token(model->vocab, token)) {
18153
+ std::string result = model->vocab.id_to_token[token].text;
18154
+ llama_unescape_whitespace(result);
18155
+ if (length < (int) result.length()) {
18156
+ return -(int) result.length();
18157
+ }
18158
+ memcpy(buf, result.c_str(), result.length());
18159
+ return result.length();
18160
+ } else if (
18161
+ (llama_is_user_defined_token(model->vocab, token)) ||
18162
+ (llama_is_control_token (model->vocab, token) && special)) {
18163
+ std::string result = model->vocab.id_to_token[token].text;
18164
+ if (length < (int) result.length()) {
18165
+ return -(int) result.length();
18166
+ }
18167
+ memcpy(buf, result.c_str(), result.length());
18168
+ return result.length();
18169
+ } else if (llama_is_unknown_token(model->vocab, token)) { // NOLINT
18170
+ if (length < 3) {
18171
+ return -3;
18172
+ }
18173
+ memcpy(buf, "\xe2\x96\x85", 3);
18174
+ return 3;
18175
+ } else if (llama_is_byte_token(model->vocab, token)) {
18176
+ if (length < 1) {
18177
+ return -1;
18178
+ }
18179
+ buf[0] = llama_token_to_byte(model->vocab, token);
18180
+ return 1;
17975
18181
  }
17976
- buf[0] = llama_token_to_byte(model->vocab, token);
17977
- return 1;
18182
+ break;
17978
18183
  }
17979
- break;
17980
- }
17981
- case LLAMA_VOCAB_TYPE_BPE: {
17982
- // NOTE: we accept all unsupported token types,
17983
- // suppressing them like CONTROL tokens.
17984
- if (llama_is_normal_token(model->vocab, token)) {
17985
- std::string result = model->vocab.id_to_token[token].text;
17986
- result = llama_decode_text(result);
17987
- if (length < (int) result.length()) {
17988
- return -(int) result.length();
17989
- }
17990
- memcpy(buf, result.c_str(), result.length());
17991
- return result.length();
17992
- } else if (
17993
- (llama_is_user_defined_token(model->vocab, token)) ||
17994
- (llama_is_control_token (model->vocab, token) && special)) {
17995
- std::string result = model->vocab.id_to_token[token].text;
17996
- if (length < (int) result.length()) {
17997
- return -(int) result.length();
18184
+ case LLAMA_VOCAB_TYPE_BPE: {
18185
+ // NOTE: we accept all unsupported token types,
18186
+ // suppressing them like CONTROL tokens.
18187
+ if (llama_is_normal_token(model->vocab, token)) {
18188
+ std::string result = model->vocab.id_to_token[token].text;
18189
+ result = llama_decode_text(result);
18190
+ if (length < (int) result.length()) {
18191
+ return -(int) result.length();
18192
+ }
18193
+ memcpy(buf, result.c_str(), result.length());
18194
+ return result.length();
18195
+ } else if (
18196
+ (llama_is_user_defined_token(model->vocab, token)) ||
18197
+ (llama_is_control_token (model->vocab, token) && special)) {
18198
+ std::string result = model->vocab.id_to_token[token].text;
18199
+ if (length < (int) result.length()) {
18200
+ return -(int) result.length();
18201
+ }
18202
+ memcpy(buf, result.c_str(), result.length());
18203
+ return result.length();
17998
18204
  }
17999
- memcpy(buf, result.c_str(), result.length());
18000
- return result.length();
18205
+ break;
18001
18206
  }
18002
- break;
18003
- }
18004
- default:
18005
- GGML_ASSERT(false);
18207
+ default:
18208
+ GGML_ASSERT(false);
18006
18209
  }
18007
18210
  }
18008
18211
  return 0;
@@ -18337,6 +18540,7 @@ const char * llama_print_system_info(void) {
18337
18540
  s += "AVX512_BF16 = " + std::to_string(ggml_cpu_has_avx512_bf16()) + " | ";
18338
18541
  s += "FMA = " + std::to_string(ggml_cpu_has_fma()) + " | ";
18339
18542
  s += "NEON = " + std::to_string(ggml_cpu_has_neon()) + " | ";
18543
+ s += "SVE = " + std::to_string(ggml_cpu_has_sve()) + " | ";
18340
18544
  s += "ARM_FMA = " + std::to_string(ggml_cpu_has_arm_fma()) + " | ";
18341
18545
  s += "F16C = " + std::to_string(ggml_cpu_has_f16c()) + " | ";
18342
18546
  s += "FP16_VA = " + std::to_string(ggml_cpu_has_fp16_va()) + " | ";