llama_cpp 0.15.3 → 0.16.0

Files changed (149)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +16 -0
  3. data/ext/llama_cpp/extconf.rb +1 -2
  4. data/ext/llama_cpp/llama_cpp.cpp +27 -3
  5. data/lib/llama_cpp/version.rb +2 -2
  6. data/sig/llama_cpp.rbs +15 -1
  7. data/vendor/tmp/llama.cpp/Makefile +66 -36
  8. data/vendor/tmp/llama.cpp/ggml-alloc.c +4 -4
  9. data/vendor/tmp/llama.cpp/ggml-backend.c +5 -5
  10. data/vendor/tmp/llama.cpp/ggml-backend.h +1 -1
  11. data/vendor/tmp/llama.cpp/ggml-cuda/acc.cu +47 -0
  12. data/vendor/tmp/llama.cpp/ggml-cuda/arange.cu +34 -0
  13. data/vendor/tmp/llama.cpp/ggml-cuda/argsort.cu +103 -0
  14. data/vendor/tmp/llama.cpp/ggml-cuda/binbcast.cu +280 -0
  15. data/vendor/tmp/llama.cpp/ggml-cuda/clamp.cu +34 -0
  16. data/vendor/tmp/llama.cpp/ggml-cuda/concat.cu +196 -0
  17. data/vendor/tmp/llama.cpp/ggml-cuda/convert.cu +686 -0
  18. data/vendor/tmp/llama.cpp/ggml-cuda/cpy.cu +490 -0
  19. data/vendor/tmp/llama.cpp/ggml-cuda/diagmask.cu +40 -0
  20. data/vendor/tmp/llama.cpp/ggml-cuda/dmmv.cu +662 -0
  21. data/vendor/tmp/llama.cpp/ggml-cuda/fattn-tile-f16.cu +319 -0
  22. data/vendor/tmp/llama.cpp/ggml-cuda/fattn-tile-f32.cu +312 -0
  23. data/vendor/tmp/llama.cpp/ggml-cuda/fattn.cu +345 -0
  24. data/vendor/tmp/llama.cpp/ggml-cuda/getrows.cu +178 -0
  25. data/vendor/tmp/llama.cpp/ggml-cuda/im2col.cu +104 -0
  26. data/vendor/tmp/llama.cpp/ggml-cuda/mmq.cu +1564 -0
  27. data/vendor/tmp/llama.cpp/ggml-cuda/mmvq.cu +404 -0
  28. data/vendor/tmp/llama.cpp/ggml-cuda/norm.cu +221 -0
  29. data/vendor/tmp/llama.cpp/ggml-cuda/pad.cu +49 -0
  30. data/vendor/tmp/llama.cpp/ggml-cuda/pool2d.cu +94 -0
  31. data/vendor/tmp/llama.cpp/ggml-cuda/quantize.cu +45 -0
  32. data/vendor/tmp/llama.cpp/ggml-cuda/rope.cu +271 -0
  33. data/vendor/tmp/llama.cpp/ggml-cuda/scale.cu +31 -0
  34. data/vendor/tmp/llama.cpp/ggml-cuda/softmax.cu +205 -0
  35. data/vendor/tmp/llama.cpp/ggml-cuda/sumrows.cu +40 -0
  36. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu +5 -0
  37. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu +5 -0
  38. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu +5 -0
  39. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu +5 -0
  40. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu +5 -0
  41. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu +5 -0
  42. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu +5 -0
  43. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu +5 -0
  44. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu +5 -0
  45. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu +5 -0
  46. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu +5 -0
  47. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu +5 -0
  48. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu +5 -0
  49. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu +5 -0
  50. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu +5 -0
  51. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu +5 -0
  52. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu +5 -0
  53. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu +5 -0
  54. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu +5 -0
  55. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu +5 -0
  56. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu +5 -0
  57. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu +5 -0
  58. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu +5 -0
  59. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu +5 -0
  60. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu +5 -0
  61. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu +5 -0
  62. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu +5 -0
  63. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu +5 -0
  64. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu +5 -0
  65. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu +5 -0
  66. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu +5 -0
  67. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu +5 -0
  68. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu +5 -0
  69. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu +5 -0
  70. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu +5 -0
  71. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu +5 -0
  72. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu +5 -0
  73. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu +5 -0
  74. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu +5 -0
  75. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu +5 -0
  76. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu +5 -0
  77. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu +5 -0
  78. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu +5 -0
  79. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu +5 -0
  80. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu +5 -0
  81. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu +5 -0
  82. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu +5 -0
  83. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu +5 -0
  84. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu +5 -0
  85. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu +5 -0
  86. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu +5 -0
  87. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu +5 -0
  88. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu +5 -0
  89. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu +5 -0
  90. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu +5 -0
  91. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu +5 -0
  92. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu +5 -0
  93. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu +5 -0
  94. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu +5 -0
  95. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu +5 -0
  96. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu +5 -0
  97. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu +5 -0
  98. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu +5 -0
  99. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu +5 -0
  100. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu +5 -0
  101. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu +5 -0
  102. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu +5 -0
  103. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu +5 -0
  104. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu +5 -0
  105. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu +5 -0
  106. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu +5 -0
  107. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu +5 -0
  108. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu +5 -0
  109. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu +5 -0
  110. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu +5 -0
  111. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu +5 -0
  112. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu +5 -0
  113. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu +5 -0
  114. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu +5 -0
  115. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu +5 -0
  116. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu +5 -0
  117. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu +5 -0
  118. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu +5 -0
  119. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu +5 -0
  120. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu +5 -0
  121. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu +5 -0
  122. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb16.cu +10 -0
  123. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb32.cu +9 -0
  124. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb16.cu +10 -0
  125. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb32.cu +10 -0
  126. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb8.cu +8 -0
  127. data/vendor/tmp/llama.cpp/ggml-cuda/tsembd.cu +47 -0
  128. data/vendor/tmp/llama.cpp/ggml-cuda/unary.cu +266 -0
  129. data/vendor/tmp/llama.cpp/ggml-cuda/upscale.cu +51 -0
  130. data/vendor/tmp/llama.cpp/ggml-cuda.cu +35 -16
  131. data/vendor/tmp/llama.cpp/ggml-impl.h +4 -0
  132. data/vendor/tmp/llama.cpp/ggml-kompute.cpp +21 -7
  133. data/vendor/tmp/llama.cpp/ggml-metal.h +1 -1
  134. data/vendor/tmp/llama.cpp/ggml-metal.m +99 -35
  135. data/vendor/tmp/llama.cpp/ggml-metal.metal +146 -80
  136. data/vendor/tmp/llama.cpp/ggml-quants.c +101 -11
  137. data/vendor/tmp/llama.cpp/ggml-rpc.cpp +75 -58
  138. data/vendor/tmp/llama.cpp/ggml-sycl.cpp +345 -227
  139. data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +99301 -39793
  140. data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +458 -329
  141. data/vendor/tmp/llama.cpp/ggml.c +301 -409
  142. data/vendor/tmp/llama.cpp/ggml.h +19 -23
  143. data/vendor/tmp/llama.cpp/llama.cpp +855 -651
  144. data/vendor/tmp/llama.cpp/llama.h +28 -48
  145. metadata +121 -6
  146. data/vendor/tmp/llama.cpp/ggml-mpi.c +0 -216
  147. data/vendor/tmp/llama.cpp/ggml-mpi.h +0 -39
  148. data/vendor/tmp/llama.cpp/ggml-opencl.cpp +0 -2305
  149. data/vendor/tmp/llama.cpp/ggml-opencl.h +0 -36
data/vendor/tmp/llama.cpp/llama.cpp
@@ -13,8 +13,6 @@
 
 #ifdef GGML_USE_CUDA
 # include "ggml-cuda.h"
- #elif defined(GGML_USE_CLBLAST)
- # include "ggml-opencl.h"
 #elif defined(GGML_USE_VULKAN)
 # include "ggml-vulkan.h"
 #elif defined(GGML_USE_SYCL)
@@ -103,14 +101,14 @@
 #endif
 
 #define LLAMA_MAX_NODES 8192
- #define LLAMA_MAX_EXPERTS 128
+ #define LLAMA_MAX_EXPERTS 160
 
 //
 // logging
 //
 
 LLAMA_ATTRIBUTE_FORMAT(2, 3)
- static void llama_log_internal (ggml_log_level level, const char* format, ...);
+ static void llama_log_internal (ggml_log_level level, const char * format, ...);
 static void llama_log_callback_default(ggml_log_level level, const char * text, void * user_data);
 
 #define LLAMA_LOG_INFO(...) llama_log_internal(GGML_LOG_LEVEL_INFO , __VA_ARGS__)
@@ -222,6 +220,7 @@ enum llm_arch {
 LLM_ARCH_DBRX,
 LLM_ARCH_OLMO,
 LLM_ARCH_ARCTIC,
+ LLM_ARCH_DEEPSEEK2,
 LLM_ARCH_UNKNOWN,
 };
 
@@ -259,6 +258,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
 { LLM_ARCH_DBRX, "dbrx" },
 { LLM_ARCH_OLMO, "olmo" },
 { LLM_ARCH_ARCTIC, "arctic" },
+ { LLM_ARCH_DEEPSEEK2, "deepseek2" },
 { LLM_ARCH_UNKNOWN, "(unknown)" },
 };
 
@@ -279,11 +279,15 @@ enum llm_kv {
 LLM_KV_CONTEXT_LENGTH,
 LLM_KV_EMBEDDING_LENGTH,
 LLM_KV_BLOCK_COUNT,
+ LLM_KV_LEADING_DENSE_BLOCK_COUNT,
 LLM_KV_FEED_FORWARD_LENGTH,
+ LLM_KV_EXPERT_FEED_FORWARD_LENGTH,
 LLM_KV_USE_PARALLEL_RESIDUAL,
 LLM_KV_TENSOR_DATA_LAYOUT,
 LLM_KV_EXPERT_COUNT,
 LLM_KV_EXPERT_USED_COUNT,
+ LLM_KV_EXPERT_SHARED_COUNT,
+ LLM_KV_EXPERT_WEIGHTS_SCALE,
 LLM_KV_POOLING_TYPE,
 LLM_KV_LOGIT_SCALE,
 
@@ -296,6 +300,8 @@ enum llm_kv {
 LLM_KV_ATTENTION_LAYERNORM_EPS,
 LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,
 LLM_KV_ATTENTION_CAUSAL,
+ LLM_KV_ATTENTION_Q_LORA_RANK,
+ LLM_KV_ATTENTION_KV_LORA_RANK,
 
 LLM_KV_ROPE_DIMENSION_COUNT,
 LLM_KV_ROPE_FREQ_BASE,
@@ -305,6 +311,7 @@ enum llm_kv {
 LLM_KV_ROPE_SCALING_ATTN_FACTOR,
 LLM_KV_ROPE_SCALING_ORIG_CTX_LEN,
 LLM_KV_ROPE_SCALING_FINETUNED,
+ LLM_KV_ROPE_SCALING_YARN_LOG_MUL,
 
 LLM_KV_SPLIT_NO,
 LLM_KV_SPLIT_COUNT,
@@ -353,17 +360,21 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
 { LLM_KV_GENERAL_SOURCE_URL, "general.source.url" },
 { LLM_KV_GENERAL_SOURCE_HF_REPO, "general.source.huggingface.repository" },
 
- { LLM_KV_VOCAB_SIZE, "%s.vocab_size" },
- { LLM_KV_CONTEXT_LENGTH, "%s.context_length" },
- { LLM_KV_EMBEDDING_LENGTH, "%s.embedding_length" },
- { LLM_KV_BLOCK_COUNT, "%s.block_count" },
- { LLM_KV_FEED_FORWARD_LENGTH, "%s.feed_forward_length" },
- { LLM_KV_USE_PARALLEL_RESIDUAL, "%s.use_parallel_residual" },
- { LLM_KV_TENSOR_DATA_LAYOUT, "%s.tensor_data_layout" },
- { LLM_KV_EXPERT_COUNT, "%s.expert_count" },
- { LLM_KV_EXPERT_USED_COUNT, "%s.expert_used_count" },
- { LLM_KV_POOLING_TYPE , "%s.pooling_type" },
- { LLM_KV_LOGIT_SCALE, "%s.logit_scale" },
+ { LLM_KV_VOCAB_SIZE, "%s.vocab_size" },
+ { LLM_KV_CONTEXT_LENGTH, "%s.context_length" },
+ { LLM_KV_EMBEDDING_LENGTH, "%s.embedding_length" },
+ { LLM_KV_BLOCK_COUNT, "%s.block_count" },
+ { LLM_KV_LEADING_DENSE_BLOCK_COUNT, "%s.leading_dense_block_count" },
+ { LLM_KV_FEED_FORWARD_LENGTH, "%s.feed_forward_length" },
+ { LLM_KV_EXPERT_FEED_FORWARD_LENGTH, "%s.expert_feed_forward_length" },
+ { LLM_KV_USE_PARALLEL_RESIDUAL, "%s.use_parallel_residual" },
+ { LLM_KV_TENSOR_DATA_LAYOUT, "%s.tensor_data_layout" },
+ { LLM_KV_EXPERT_COUNT, "%s.expert_count" },
+ { LLM_KV_EXPERT_USED_COUNT, "%s.expert_used_count" },
+ { LLM_KV_EXPERT_SHARED_COUNT, "%s.expert_shared_count" },
+ { LLM_KV_EXPERT_WEIGHTS_SCALE, "%s.expert_weights_scale" },
+ { LLM_KV_POOLING_TYPE , "%s.pooling_type" },
+ { LLM_KV_LOGIT_SCALE, "%s.logit_scale" },
 
 { LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count" },
 { LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" },
@@ -374,6 +385,8 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
 { LLM_KV_ATTENTION_LAYERNORM_EPS, "%s.attention.layer_norm_epsilon" },
 { LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, "%s.attention.layer_norm_rms_epsilon" },
 { LLM_KV_ATTENTION_CAUSAL, "%s.attention.causal" },
+ { LLM_KV_ATTENTION_Q_LORA_RANK, "%s.attention.q_lora_rank" },
+ { LLM_KV_ATTENTION_KV_LORA_RANK, "%s.attention.kv_lora_rank" },
 
 { LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
 { LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" },
@@ -383,6 +396,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
 { LLM_KV_ROPE_SCALING_ATTN_FACTOR, "%s.rope.scaling.attn_factor" },
 { LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, "%s.rope.scaling.original_context_length" },
 { LLM_KV_ROPE_SCALING_FINETUNED, "%s.rope.scaling.finetuned" },
+ { LLM_KV_ROPE_SCALING_YARN_LOG_MUL, "%s.rope.scaling.yarn_log_multiplier" },
 
 { LLM_KV_SPLIT_NO, "split.no" },
 { LLM_KV_SPLIT_COUNT, "split.count" },
@@ -474,6 +488,12 @@ enum llm_tensor {
 LLM_TENSOR_SSM_A,
 LLM_TENSOR_SSM_D,
 LLM_TENSOR_SSM_OUT,
+ LLM_TENSOR_ATTN_Q_A,
+ LLM_TENSOR_ATTN_Q_B,
+ LLM_TENSOR_ATTN_KV_A_MQA,
+ LLM_TENSOR_ATTN_KV_B,
+ LLM_TENSOR_ATTN_Q_A_NORM,
+ LLM_TENSOR_ATTN_KV_A_NORM,
 };
 
 static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES = {
@@ -1057,6 +1077,35 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
 { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
 },
 },
+ {
+ LLM_ARCH_DEEPSEEK2,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+ { LLM_TENSOR_OUTPUT, "output" },
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+ { LLM_TENSOR_ATTN_Q_A_NORM, "blk.%d.attn_q_a_norm" },
+ { LLM_TENSOR_ATTN_KV_A_NORM, "blk.%d.attn_kv_a_norm" },
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+ { LLM_TENSOR_ATTN_Q_A, "blk.%d.attn_q_a" },
+ { LLM_TENSOR_ATTN_Q_B, "blk.%d.attn_q_b" },
+ { LLM_TENSOR_ATTN_KV_A_MQA, "blk.%d.attn_kv_a_mqa" },
+ { LLM_TENSOR_ATTN_KV_B, "blk.%d.attn_kv_b" },
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+ { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+ { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
+ { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
+ { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
+ { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
+ { LLM_TENSOR_FFN_GATE_INP_SHEXP, "blk.%d.ffn_gate_inp_shexp" },
+ { LLM_TENSOR_FFN_GATE_SHEXP, "blk.%d.ffn_gate_shexp" },
+ { LLM_TENSOR_FFN_DOWN_SHEXP, "blk.%d.ffn_down_shexp" },
+ { LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
+ },
+ },
 {
 LLM_ARCH_UNKNOWN,
 {
@@ -1651,12 +1700,13 @@ struct llama_mlock {
 };
 using llama_mlocks = std::vector<std::unique_ptr<llama_mlock>>;
 
- static std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token, bool special) {
+ // NOTE: avoid ever using this except for building the token_to_piece caches
+ static std::string llama_token_to_piece(const struct llama_model * model, llama_token token, bool special) {
 std::vector<char> result(8, 0);
- const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), special);
+ const int n_tokens = llama_token_to_piece(model, token, result.data(), result.size(), special);
 if (n_tokens < 0) {
 result.resize(-n_tokens);
- int check = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), special);
+ int check = llama_token_to_piece(model, token, result.data(), result.size(), special);
 GGML_ASSERT(check == -n_tokens);
 }
 else {
@@ -1741,6 +1791,7 @@ enum e_model {
 MODEL_13B,
 MODEL_14B,
 MODEL_15B,
+ MODEL_16B,
 MODEL_20B,
 MODEL_30B,
 MODEL_34B,
@@ -1748,6 +1799,7 @@ enum e_model {
 MODEL_40B,
 MODEL_65B,
 MODEL_70B,
+ MODEL_236B,
 MODEL_314B,
 MODEL_SMALL,
 MODEL_MEDIUM,
@@ -1783,13 +1835,21 @@ struct llama_hparams {
 uint32_t n_expert_used = 0;
 uint32_t n_vocab_type = 0; // for BERT-style token types
 
+ uint32_t n_layer_dense_lead = 0;
+ uint32_t n_lora_q = 0;
+ uint32_t n_lora_kv = 0;
+ uint32_t n_ff_exp = 0;
+ uint32_t n_expert_shared = 0;
+ float expert_weights_scale = 0.0;
+
 float f_norm_eps;
 float f_norm_rms_eps;
 
 float rope_attn_factor = 1.0f;
 float rope_freq_base_train;
 float rope_freq_scale_train;
- uint32_t n_yarn_orig_ctx;
+ uint32_t n_ctx_orig_yarn;
+ float rope_yarn_log_mul;
 
 // for State Space Models
 uint32_t ssm_d_conv = 0;
@@ -1823,8 +1883,14 @@ struct llama_hparams {
 if (this->n_expert != other.n_expert) return true;
 if (this->n_expert_used != other.n_expert_used) return true;
 
+ if (this->n_layer_dense_lead != other.n_layer_dense_lead) return true;
+ if (this->n_lora_q != other.n_lora_q) return true;
+ if (this->n_lora_kv != other.n_lora_kv) return true;
+ if (this->n_ff_exp != other.n_ff_exp) return true;
+ if (this->n_expert_shared != other.n_expert_shared) return true;
+
 if (this->rope_finetuned != other.rope_finetuned) return true;
- if (this->n_yarn_orig_ctx != other.n_yarn_orig_ctx) return true;
+ if (this->n_ctx_orig_yarn != other.n_ctx_orig_yarn) return true;
 
 if (this->ssm_d_conv != other.ssm_d_conv) return true;
 if (this->ssm_d_inner != other.ssm_d_inner) return true;
@@ -1838,6 +1904,8 @@ struct llama_hparams {
 if (!is_float_close(this->rope_attn_factor, other.rope_attn_factor, EPSILON)) return true;
 if (!is_float_close(this->rope_freq_base_train, other.rope_freq_base_train, EPSILON)) return true;
 if (!is_float_close(this->rope_freq_scale_train, other.rope_freq_scale_train, EPSILON)) return true;
+ if (!is_float_close(this->expert_weights_scale, other.expert_weights_scale, EPSILON)) return true;
+ if (!is_float_close(this->rope_yarn_log_mul, other.rope_yarn_log_mul, EPSILON)) return true;
 
 return false;
 }
@@ -1881,7 +1949,7 @@ struct llama_cparams {
 float rope_freq_base;
 float rope_freq_scale;
 
- uint32_t n_yarn_orig_ctx;
+ uint32_t n_ctx_orig_yarn;
 // These hyperparameters are not exposed in GGUF, because all
 // existing YaRN models use the same values for them.
 float yarn_ext_factor;
@@ -1913,6 +1981,8 @@ struct llama_layer {
 struct ggml_tensor * attn_k_norm_b;
 struct ggml_tensor * attn_out_norm;
 struct ggml_tensor * attn_out_norm_b;
+ struct ggml_tensor * attn_q_a_norm;
+ struct ggml_tensor * attn_kv_a_norm;
 
 // attention
 struct ggml_tensor * wq;
@@ -1920,6 +1990,10 @@ struct llama_layer {
 struct ggml_tensor * wv;
 struct ggml_tensor * wo;
 struct ggml_tensor * wqkv;
+ struct ggml_tensor * wq_a;
+ struct ggml_tensor * wq_b;
+ struct ggml_tensor * wkv_a_mqa;
+ struct ggml_tensor * wkv_b;
 
 // attention bias
 struct ggml_tensor * bq;
@@ -1953,8 +2027,9 @@ struct llama_layer {
 struct ggml_tensor * ffn_up_shexp;
 
 // ff bias
- struct ggml_tensor * ffn_down_b; // b2
- struct ggml_tensor * ffn_up_b; // b3
+ struct ggml_tensor * ffn_gate_b = nullptr;
+ struct ggml_tensor * ffn_down_b = nullptr; // b2
+ struct ggml_tensor * ffn_up_b = nullptr; // b3
 struct ggml_tensor * ffn_act;
 
 // mamba proj
@@ -2072,12 +2147,12 @@ struct llama_control_vector {
 struct llama_vocab {
 using id = int32_t;
 using token = std::string;
- using ttype = llama_token_type;
+ using tattr = llama_token_attr;
 
 struct token_data {
 token text;
 float score;
- ttype type;
+ tattr attr;
 };
 
 enum llama_vocab_type type = LLAMA_VOCAB_TYPE_SPM;
@@ -2086,7 +2161,8 @@ struct llama_vocab {
 std::unordered_map<token, id> token_to_id;
 std::vector<token_data> id_to_token;
 
- std::unordered_map<token, id> special_tokens_cache;
+ std::vector<id> cache_special_tokens;
+ std::vector<token> cache_token_to_piece; // llama_token_to_piece(special = true);
 
 std::map<std::pair<std::string, std::string>, int> bpe_ranks;
 
@@ -2293,13 +2369,34 @@ struct llama_context {
 struct llama_control_vector cvec;
 };
 
+ static size_t llama_get_device_count(const llama_model & model) {
+ size_t count = 1;
+ #if defined(GGML_USE_CUDA)
+ count = ggml_backend_cuda_get_device_count();
+ #elif defined(GGML_USE_SYCL)
+ count = ggml_backend_sycl_get_device_count();
+ #elif defined(GGML_USE_VULKAN)
+ count = ggml_backend_vk_get_device_count();
+ #endif
+ #if defined(GGML_USE_RPC)
+ count += model.rpc_servers.size();
+ #endif
+ return count;
+ GGML_UNUSED(model);
+ }
+
 static ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_model & model, int gpu) {
 ggml_backend_buffer_type_t buft = nullptr;
 
- #ifdef GGML_USE_RPC
- std::string endpoint = model.rpc_servers[gpu];
- buft = ggml_backend_rpc_buffer_type(endpoint.c_str());
- #elif defined(GGML_USE_METAL)
+ #if defined(GGML_USE_RPC)
+ int dev_count = (int)llama_get_device_count(model);
+ int rpc_count = (int)model.rpc_servers.size();
+ if (gpu >= dev_count - rpc_count) {
+ const char * endpoint = model.rpc_servers[gpu - dev_count + rpc_count].c_str();
+ return ggml_backend_rpc_buffer_type(endpoint);
+ }
+ #endif
+ #if defined(GGML_USE_METAL)
 buft = ggml_backend_metal_buffer_type();
 #elif defined(GGML_USE_CUDA)
 buft = ggml_backend_cuda_buffer_type(gpu);
@@ -2307,8 +2404,6 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_
 buft = ggml_backend_vk_buffer_type(gpu);
 #elif defined(GGML_USE_SYCL)
 buft = ggml_backend_sycl_buffer_type(gpu);
- #elif defined(GGML_USE_CLBLAST)
- buft = ggml_backend_opencl_buffer_type();
 #elif defined(GGML_USE_KOMPUTE)
 buft = ggml_backend_kompute_buffer_type(gpu);
 if (buft == nullptr) {
@@ -2347,29 +2442,19 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_split(const llama_mo
 GGML_UNUSED(tensor_split);
 }
 
- static size_t llama_get_device_count(const llama_model & model) {
- #if defined(GGML_USE_RPC)
- return model.rpc_servers.size();
- #elif defined(GGML_USE_CUDA)
- return ggml_backend_cuda_get_device_count();
- #elif defined(GGML_USE_SYCL)
- return ggml_backend_sycl_get_device_count();
- #elif defined(GGML_USE_VULKAN)
- return ggml_backend_vk_get_device_count();
- #else
- return 1;
- #endif
- GGML_UNUSED(model);
- }
-
 static size_t llama_get_device_memory(const llama_model & model, int device) {
 #if defined(GGML_USE_RPC)
- size_t total;
- size_t free;
- std::string endpoint = model.rpc_servers[device];
- ggml_backend_rpc_get_device_memory(endpoint.c_str(), &free, &total);
- return free;
- #elif defined(GGML_USE_CUDA)
+ int dev_count = (int)llama_get_device_count(model);
+ int rpc_count = (int)model.rpc_servers.size();
+ if (device >= dev_count - rpc_count) {
+ size_t total;
+ size_t free;
+ const char * endpoint = model.rpc_servers[device - dev_count + rpc_count].c_str();
+ ggml_backend_rpc_get_device_memory(endpoint, &free, &total);
+ return free;
+ }
+ #endif
+ #if defined(GGML_USE_CUDA)
 size_t total;
 size_t free;
 ggml_backend_cuda_get_device_memory(device, &free, &total);
@@ -2441,10 +2526,6 @@ static bool llama_kv_cache_init(
 }
 }
 
- #ifdef GGML_USE_CLBLAST
- offload = false;
- #endif
-
 // count used buffer types
 std::map<ggml_backend_buffer_type_t, int> buft_layer_count;
 if (offload) {
@@ -3832,6 +3913,7 @@ static const char * llama_model_type_name(e_model type) {
 case MODEL_13B: return "13B";
 case MODEL_14B: return "14B";
 case MODEL_15B: return "15B";
+ case MODEL_16B: return "16B";
 case MODEL_20B: return "20B";
 case MODEL_30B: return "30B";
 case MODEL_34B: return "34B";
@@ -3839,6 +3921,7 @@ static const char * llama_model_type_name(e_model type) {
 case MODEL_40B: return "40B";
 case MODEL_65B: return "65B";
 case MODEL_70B: return "70B";
+ case MODEL_236B: return "236B";
 case MODEL_314B: return "314B";
 case MODEL_SMALL: return "0.1B";
 case MODEL_MEDIUM: return "0.4B";
@@ -3922,8 +4005,8 @@ static void llm_load_hparams(
 ml.get_key(LLM_KV_ROPE_SCALING_FINETUNED, rope_finetuned, false);
 hparams.rope_finetuned = rope_finetuned;
 
- hparams.n_yarn_orig_ctx = hparams.n_ctx_train;
- ml.get_key(LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, hparams.n_yarn_orig_ctx, false);
+ hparams.n_ctx_orig_yarn = hparams.n_ctx_train;
+ ml.get_key(LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, hparams.n_ctx_orig_yarn, false);
 
 // rope_freq_base (optional)
 hparams.rope_freq_base_train = 10000.0f;
@@ -3981,7 +4064,9 @@
 switch (hparams.n_layer) {
 case 22: model.type = e_model::MODEL_1B; break;
 case 26: model.type = e_model::MODEL_3B; break;
- case 32: model.type = hparams.n_vocab < 40000 ? e_model::MODEL_7B : e_model::MODEL_8B; break;
+ // granite uses a vocab with len 49152
+ case 32: model.type = hparams.n_vocab == 49152 ? e_model::MODEL_3B : (hparams.n_vocab < 40000 ? e_model::MODEL_7B : e_model::MODEL_8B); break;
+ case 36: model.type = e_model::MODEL_8B; break; // granite
 case 40: model.type = e_model::MODEL_13B; break;
 case 48: model.type = e_model::MODEL_34B; break;
 case 60: model.type = e_model::MODEL_30B; break;
@@ -4251,6 +4336,8 @@ static void llm_load_hparams(
 case 30: model.type = e_model::MODEL_3B; break;
 case 32: model.type = e_model::MODEL_7B; break;
 case 40: model.type = e_model::MODEL_15B; break;
+ case 52: model.type = e_model::MODEL_20B; break; // granite
+ case 88: model.type = e_model::MODEL_34B; break; // granite
 default: model.type = e_model::MODEL_UNKNOWN;
 }
 } break;
@@ -4384,6 +4471,26 @@ static void llm_load_hparams(
 model.type = e_model::MODEL_UNKNOWN;
 }
 } break;
+ case LLM_ARCH_DEEPSEEK2:
+ {
+ bool is_lite = (hparams.n_layer == 27);
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+ ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
+ if (!is_lite) {
+ ml.get_key(LLM_KV_ATTENTION_Q_LORA_RANK, hparams.n_lora_q);
+ }
+ ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK, hparams.n_lora_kv);
+ ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
+ ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
+ ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
+ ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul);
+
+ switch (hparams.n_layer) {
+ case 27: model.type = e_model::MODEL_16B; break;
+ case 60: model.type = e_model::MODEL_236B; break;
+ default: model.type = e_model::MODEL_UNKNOWN;
+ }
+ } break;
 default: (void)0;
 }
 
@@ -4490,15 +4597,14 @@ static void llm_load_vocab(
 vocab.special_cls_id = 101;
 vocab.special_mask_id = 103;
 vocab.add_space_prefix = false;
- } else {
- if (tokenizer_model == "gpt2") {
- vocab.type = LLAMA_VOCAB_TYPE_BPE;
- } else {
- LLAMA_LOG_WARN("%s: unknown tokenizer: '%s'", __func__, tokenizer_model.c_str());
- LLAMA_LOG_WARN("%s: using default tokenizer: 'llama'", __func__);
- vocab.type = LLAMA_VOCAB_TYPE_SPM;
- return;
+ } else if (tokenizer_model == "gpt2") {
+ vocab.type = LLAMA_VOCAB_TYPE_BPE;
+
+ const int add_space_prefix_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_ADD_PREFIX).c_str());
+ if (add_space_prefix_keyidx != -1) {
+ vocab.add_space_prefix = gguf_get_val_bool(ctx, add_space_prefix_keyidx);
 }
+
 // read bpe merges and populate bpe ranks
 const int merges_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_MERGES).c_str());
 if (merges_keyidx == -1) {
@@ -4532,6 +4638,8 @@ static void llm_load_vocab(
 vocab.special_pad_id = -1;
 vocab.special_cls_id = -1;
 vocab.special_mask_id = -1;
+ } else {
+ throw std::runtime_error(format("unknown tokenizer: '%s'", tokenizer_model.c_str()));
 }
 
 // for now, only BPE models have pre-tokenizers
@@ -4593,6 +4701,9 @@ static void llm_load_vocab(
 } else if (
 tokenizer_pre == "dbrx") {
 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DBRX;
+ } else if (
+ tokenizer_pre == "smaug-bpe") {
+ vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_SMAUG;
 } else {
 throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
 }
@@ -4631,7 +4742,20 @@ static void llm_load_vocab(
 auto & token_data = vocab.id_to_token[i];
 token_data.text = std::move(word);
 token_data.score = scores ? scores[i] : 0.0f;
- token_data.type = toktypes ? (llama_token_type) toktypes[i] : LLAMA_TOKEN_TYPE_NORMAL;
+ token_data.attr = LLAMA_TOKEN_ATTR_NORMAL;
+
+ if (toktypes) { //TODO: remove, required until per token attributes are available from GGUF file
+ switch(toktypes[i]) {
+ case LLAMA_TOKEN_TYPE_UNKNOWN: token_data.attr = LLAMA_TOKEN_ATTR_UNKNOWN; break;
+ case LLAMA_TOKEN_TYPE_UNUSED: token_data.attr = LLAMA_TOKEN_ATTR_UNUSED; break;
+ case LLAMA_TOKEN_TYPE_NORMAL: token_data.attr = LLAMA_TOKEN_ATTR_NORMAL; break;
+ case LLAMA_TOKEN_TYPE_CONTROL: token_data.attr = LLAMA_TOKEN_ATTR_CONTROL; break;
+ case LLAMA_TOKEN_TYPE_USER_DEFINED: token_data.attr = LLAMA_TOKEN_ATTR_USER_DEFINED; break;
+ case LLAMA_TOKEN_TYPE_BYTE: token_data.attr = LLAMA_TOKEN_ATTR_BYTE; break;
+ case LLAMA_TOKEN_TYPE_UNDEFINED: token_data.attr = LLAMA_TOKEN_ATTR_UNDEFINED; break;
+ default: token_data.attr = LLAMA_TOKEN_ATTR_UNDEFINED; break;
+ }
+ }
 }
 GGML_ASSERT(vocab.id_to_token.size() == vocab.token_to_id.size());
 
@@ -4721,96 +4845,88 @@ static void llm_load_vocab(
 
 // build special tokens cache
 {
- // TODO: It is unclear (to me) at this point, whether special tokes are guaranteed to be of a deterministic type,
- // and will always be correctly labeled in 'added_tokens.json' etc.
- // The assumption is, since special tokens aren't meant to be exposed to end user, they are designed
- // to be unmatchable by the tokenizer, therefore tokens from the vocab, which are unmatchable by the tokenizer
- // are special tokens.
- // From testing, this appears to correlate 1:1 with special tokens.
- //
+ for (llama_vocab::id id = 0; id < (llama_vocab::id)n_vocab; ++id) {
+ if (!(vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_NORMAL)) {
+ vocab.cache_special_tokens.push_back(id);
+ }
+ }
 
- // Counting special tokens and verifying in only one direction
- // is sufficient to detect difference in those two sets.
- //
- uint32_t special_tokens_count_by_type = 0;
- uint32_t special_tokens_count_from_verification = 0;
+ std::sort( vocab.cache_special_tokens.begin(), vocab.cache_special_tokens.end(),
+ [&] (const llama_vocab::id a, const llama_vocab::id b) {
+ return vocab.id_to_token[a].text.size() > vocab.id_to_token[b].text.size();
+ }
+ );
 
- bool special_tokens_definition_mismatch = false;
+ LLAMA_LOG_INFO("%s: special tokens cache size = %u\n", __func__, (uint32_t)vocab.cache_special_tokens.size());
+ }
 
- for (const auto & t : vocab.token_to_id) {
- const auto & token = t.first;
- const auto & id = t.second;
+ // build token to piece cache
+ {
+ size_t size_cache = 0;
 
- // Count all non-normal tokens in the vocab while iterating
- if (vocab.id_to_token[id].type != LLAMA_TOKEN_TYPE_NORMAL) {
- special_tokens_count_by_type++;
- }
+ std::vector<llama_vocab::token> cache_token_to_piece(n_vocab);
 
- // Skip single character tokens
- if (token.length() > 1) {
- bool is_tokenizable = false;
+ for (uint32_t id = 0; id < n_vocab; ++id) {
+ cache_token_to_piece[id] = llama_token_to_piece(&model, id, true);
 
- // Split token string representation in two, in all possible ways
- // and check if both halves can be matched to a valid token
- for (unsigned i = 1; i < token.length();) {
- const auto left = token.substr(0, i);
- const auto right = token.substr(i);
+ size_cache += cache_token_to_piece[id].size();
+ }
 
- // check if we didnt partition in the middle of a utf sequence
- auto utf = utf8_len(left.at(left.length() - 1));
+ std::swap(vocab.cache_token_to_piece, cache_token_to_piece);
 
- if (utf == 1) {
- if (vocab.token_to_id.find(left) != vocab.token_to_id.end() &&
- vocab.token_to_id.find(right) != vocab.token_to_id.end() ) {
- is_tokenizable = true;
- break;
- }
- i++;
- } else {
- // skip over the rest of multibyte utf sequence
- i += utf - 1;
- }
+ LLAMA_LOG_INFO("%s: token to piece cache size = %.4f MB\n", __func__, size_cache / 1024.0 / 1024.0);
+ }
+
+ // Handle per token attributes
+ //NOTE: Each model customizes per token attributes.
+ //NOTE: Per token attributes are missing from the GGUF file.
+ //TODO: Extract attributes from GGUF file.
+ {
+ auto _contains_any = [] (const std::string &str, const std::vector<std::string> &substrs) -> bool {
+ for (auto substr : substrs) {
+ if (str.find(substr) < std::string::npos) {
+ return true;
 }
+ }
+ return false;
+ };
 
- if (!is_tokenizable) {
- // Some tokens are multibyte, but they are utf sequences with equivalent text length of 1
- // it's faster to re-filter them here, since there are way less candidates now
+ auto _set_tokenid_attr = [&] (const llama_vocab::id id, llama_token_attr attr, bool value) {
+ uint32_t current = vocab.id_to_token.at(id).attr;
+ current = value ? (current | attr) : (current & ~attr);
+ vocab.id_to_token[id].attr = (llama_token_attr) current;
+ };
 
- // Calculate a total "utf" length of a token string representation
- size_t utf8_str_len = 0;
- for (unsigned i = 0; i < token.length();) {
- utf8_str_len++;
- i += utf8_len(token.at(i));
- }
+ auto _set_token_attr = [&] (const std::string & token, llama_token_attr attr, bool value) {
+ _set_tokenid_attr(vocab.token_to_id.at(token), attr, value);
+ };
 
- // And skip the ones which are one character
- if (utf8_str_len > 1) {
- // At this point what we have left are special tokens only
- vocab.special_tokens_cache[token] = id;
+ std::string model_name;
+ std::string tokenizer_pre;
 
- // Count manually found special tokens
- special_tokens_count_from_verification++;
+ ml.get_key(LLM_KV_GENERAL_NAME, model_name, false);
+ ml.get_key(LLM_KV_TOKENIZER_PRE, tokenizer_pre, false);
 
- // If this manually found special token is not marked as such, flag a mismatch
- if (vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_NORMAL) {
- special_tokens_definition_mismatch = true;
- }
- }
- }
+ // model name to lowercase
+ std::transform(model_name.begin(), model_name.end(), model_name.begin(),
+ [] (const std::string::value_type x) {
+ return std::tolower(x);
 }
- }
+ );
 
- if (special_tokens_definition_mismatch || special_tokens_count_from_verification != special_tokens_count_by_type) {
- LLAMA_LOG_WARN("%s: mismatch in special tokens definition ( %u/%zu vs %u/%zu ).\n",
- __func__,
- special_tokens_count_from_verification, vocab.id_to_token.size(),
- special_tokens_count_by_type, vocab.id_to_token.size()
- );
- } else {
- LLAMA_LOG_INFO("%s: special tokens definition check successful ( %u/%zu ).\n",
- __func__,
- special_tokens_count_from_verification, vocab.id_to_token.size()
- );
+ // set attributes by model/tokenizer name
+ if (_contains_any(tokenizer_pre, {"jina-v2-es", "jina-v2-de"})) {
+ _set_token_attr("<mask>", LLAMA_TOKEN_ATTR_LSTRIP, true);
+ } else if (_contains_any(model_name, {"phi-3", "phi3"})) {
+ for (auto id : vocab.cache_special_tokens) {
+ _set_tokenid_attr(id, LLAMA_TOKEN_ATTR_RSTRIP, true);
+ }
+ for (auto token : {"</s>"}) {
+ _set_token_attr(token, LLAMA_TOKEN_ATTR_RSTRIP, true);
+ }
+ for (auto token : {"<unk>", "<s>", "<|endoftext|>"}) {
+ _set_token_attr(token, LLAMA_TOKEN_ATTR_RSTRIP, false);
+ }
 }
 }
 }
@@ -4852,7 +4968,7 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
 LLAMA_LOG_INFO("%s: rope scaling = %s\n", __func__, rope_scaling_type);
 LLAMA_LOG_INFO("%s: freq_base_train = %.1f\n", __func__, hparams.rope_freq_base_train);
 LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train);
- LLAMA_LOG_INFO("%s: n_yarn_orig_ctx = %u\n", __func__, hparams.n_yarn_orig_ctx);
+ LLAMA_LOG_INFO("%s: n_ctx_orig_yarn = %u\n", __func__, hparams.n_ctx_orig_yarn);
 LLAMA_LOG_INFO("%s: rope_finetuned = %s\n", __func__, hparams.rope_finetuned ? "yes" : "unknown");
 LLAMA_LOG_INFO("%s: ssm_d_conv = %u\n", __func__, hparams.ssm_d_conv);
 LLAMA_LOG_INFO("%s: ssm_d_inner = %u\n", __func__, hparams.ssm_d_inner);
@@ -4892,6 +5008,16 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
 if (vocab.special_suffix_id != -1) { LLAMA_LOG_INFO( "%s: SUF token = %d '%s'\n", __func__, vocab.special_suffix_id, vocab.id_to_token[vocab.special_suffix_id].text.c_str() ); }
 if (vocab.special_middle_id != -1) { LLAMA_LOG_INFO( "%s: MID token = %d '%s'\n", __func__, vocab.special_middle_id, vocab.id_to_token[vocab.special_middle_id].text.c_str() ); }
 if (vocab.special_eot_id != -1) { LLAMA_LOG_INFO( "%s: EOT token = %d '%s'\n", __func__, vocab.special_eot_id, vocab.id_to_token[vocab.special_eot_id].text.c_str() ); }
+
+ if (model.arch == LLM_ARCH_DEEPSEEK2) {
+ LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
+ LLAMA_LOG_INFO("%s: n_lora_q = %d\n", __func__, hparams.n_lora_q);
+ LLAMA_LOG_INFO("%s: n_lora_kv = %d\n", __func__, hparams.n_lora_kv);
+ LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
+ LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
+ LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
+ LLAMA_LOG_INFO("%s: rope_yarn_log_mul = %.4f\n", __func__, hparams.rope_yarn_log_mul);
+ }
 }
 
 // Returns false if cancelled by progress_callback
@@ -5048,8 +5174,6 @@ static bool llm_load_tensors(
 throw std::runtime_error("model has expert layers but no expert layers are used");
 }
 
- GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);
-
 ggml_context * ctx_input = ctx_map.at(model.buft_input.buft);
 ggml_context * ctx_output = ctx_map.at(model.buft_output.buft);
 ggml_context * ctx_output_split = ctx_map.at(model.buft_output.buft_matrix);
@@ -5069,12 +5193,10 @@ static bool llm_load_tensors(
 // output
 {
 model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
- if (model.arch != LLM_ARCH_MINICPM){
- model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
- // if output is NULL, init from the input tok embed
- if (model.output == NULL) {
- model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
- }
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
+ // if output is NULL, init from the input tok embed
+ if (model.output == NULL) {
+ model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
 }
 }
 
@@ -5103,6 +5225,11 @@ static bool llm_load_tensors(
 layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
 layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
 layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
+
+ // optional MLP bias
+ layer.ffn_gate_b = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
+ layer.ffn_down_b = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+ layer.ffn_up_b = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
 } else {
 layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert});
 
@@ -6210,6 +6337,70 @@ static bool llm_load_tensors(
 layer.ffn_up_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert});
 }
 } break;
+ case LLM_ARCH_DEEPSEEK2:
+ {
+ bool is_lite = (hparams.n_layer == 27);
+
+ const uint32_t n_embd_head_qk_rope = hparams.n_rot;
+ const uint32_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
+ const uint32_t q_lora_rank = hparams.n_lora_q;
+ const uint32_t kv_lora_rank = hparams.n_lora_kv;
+ const uint32_t n_ff_exp = hparams.n_ff_exp;
+
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+
+ // output
+ {
+ model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
+ }
+
+ for (int i = 0; i < n_layer; ++i) {
+ ggml_context * ctx_layer = ctx_for_layer(i);
+ ggml_context * ctx_split = ctx_for_layer_split(i);
+
+ auto & layer = model.layers[i];
+
+ layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+ if (!is_lite) {
+ layer.attn_q_a_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_A_NORM, "weight", i), {q_lora_rank});
+ }
+ layer.attn_kv_a_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_KV_A_NORM, "weight", i), {kv_lora_rank});
+
+ if (!is_lite) {
+ layer.wq_a = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q_A, "weight", i), {n_embd, q_lora_rank});
+ layer.wq_b = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q_B, "weight", i), {q_lora_rank, hparams.n_head * hparams.n_embd_head_k});
+ } else {
+ layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_k_gqa});
+ }
+ layer.wkv_a_mqa = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + n_embd_head_qk_rope});
+ layer.wkv_b = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_KV_B, "weight", i), {kv_lora_rank, hparams.n_head * (n_embd_head_qk_nope + hparams.n_embd_head_v)});
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {hparams.n_head * hparams.n_embd_head_v, n_embd});
+
+ layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
+
+ if ((uint32_t) i < hparams.n_layer_dense_lead) {
+ layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
+ layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
+ layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
+ } else {
+ layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert});
+
+ GGML_ASSERT(hparams.n_expert > 0);
+ GGML_ASSERT(hparams.n_expert_used > 0);
+
+ // MoE branch
+ layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert});
+ layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert});
+ layer.ffn_up_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert});
+
+ // Shared expert branch
+ layer.ffn_gate_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_exp * hparams.n_expert_shared});
+ layer.ffn_down_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff_exp * hparams.n_expert_shared, n_embd});
+ layer.ffn_up_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_exp * hparams.n_expert_shared});
+ }
+ }
+ } break;
 default:
 throw std::runtime_error("unknown architecture");
 }
@@ -6664,6 +6855,8 @@ static struct ggml_tensor * llm_build_moe_ffn(
 int64_t n_expert_used,
 llm_ffn_op_type type_op,
 bool norm_w,
+ bool scale_w,
+ float w_scale,
 const llm_build_cb & cb,
 int il) {
 int64_t n_embd = cur->ne[0];
@@ -6695,6 +6888,10 @@ static struct ggml_tensor * llm_build_moe_ffn(
 
 weights = ggml_reshape_3d(ctx, weights, 1, n_expert_used, n_tokens);
 }
+ if (scale_w) {
+ weights = ggml_scale(ctx, weights, w_scale);
+ cb(weights, "ffn_moe_weights_scaled", il);
+ }
 
 cur = ggml_reshape_3d(ctx, cur, n_embd, 1, n_tokens);
 ggml_tensor * up = ggml_mul_mat_id(ctx, up_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
@@ -6937,7 +7134,7 @@ struct llm_build_context {
 const int32_t n_kv; // size of KV cache to consider (n_kv <= kv_self.size)
 const int32_t n_outputs;
 const int32_t kv_head; // index of where we store new KV data in the cache
- const int32_t n_orig_ctx;
+ const int32_t n_ctx_orig;
 
 const bool flash_attn;
 
@@ -6986,7 +7183,7 @@ struct llm_build_context {
 n_kv (worst_case ? kv_self.size : kv_self.n),
 n_outputs (worst_case ? n_tokens : lctx.n_outputs),
 kv_head (worst_case ? (kv_self.recurrent ? 0 : kv_self.size - n_tokens) : kv_self.head),
- n_orig_ctx (cparams.n_yarn_orig_ctx),
+ n_ctx_orig (cparams.n_ctx_orig_yarn),
 flash_attn (cparams.flash_attn),
 pooling_type (cparams.pooling_type),
 rope_type (hparams.rope_type),
@@ -7044,7 +7241,7 @@ struct llm_build_context {
 ggml_row_size(kv_self.k_l[il]->type, n_embd_head_k),
 ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa),
 0),
- lctx.inp_K_shift, rope_factors, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+ lctx.inp_K_shift, rope_factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
 ext_factor, attn_factor, beta_fast, beta_slow);
 
 cb(tmp, "K_shifted", il);
@@ -7153,7 +7350,7 @@ struct llm_build_context {
 // choose long/short freq factors based on the context size
 const auto n_ctx_pre_seq = cparams.n_ctx / cparams.n_seq_max;
 
- if (n_ctx_pre_seq > hparams.n_yarn_orig_ctx) {
+ if (n_ctx_pre_seq > hparams.n_ctx_orig_yarn) {
 return model.layers[il].rope_long;
 }
 
@@ -7269,14 +7466,14 @@ struct llm_build_context {
 
 Qcur = ggml_rope_ext(
 ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
- n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
 ext_factor, attn_factor, beta_fast, beta_slow
 );
 cb(Qcur, "Qcur", il);
 
 Kcur = ggml_rope_ext(
 ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
- n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
 ext_factor, attn_factor, beta_fast, beta_slow
 );
 cb(Kcur, "Kcur", il);
@@ -7305,9 +7502,9 @@ struct llm_build_context {
7305
7502
  cb(cur, "ffn_norm", il);
7306
7503
 
7307
7504
  cur = llm_build_ffn(ctx0, cur,
7308
- model.layers[il].ffn_up, NULL,
7309
- model.layers[il].ffn_gate, NULL,
7310
- model.layers[il].ffn_down, NULL,
7505
+ model.layers[il].ffn_up, model.layers[il].ffn_up_b,
7506
+ model.layers[il].ffn_gate, model.layers[il].ffn_gate_b,
7507
+ model.layers[il].ffn_down, model.layers[il].ffn_down_b,
7311
7508
  NULL,
7312
7509
  LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
7313
7510
  cb(cur, "ffn_out", il);
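This call site now forwards the optional FFN bias tensors instead of `NULL`, so models that ship `ffn_up_b` / `ffn_gate_b` / `ffn_down_b` actually apply them. A standalone sketch of the SiLU-gated, parallel (`LLM_FFN_PAR`) FFN with biases on toy data; sizes and weights are made up and the real code does this with ggml mat-muls:

```cpp
// Toy SiLU-gated FFN with per-projection biases: down(silu(gate(x)) * up(x)).
#include <cmath>
#include <cstdio>
#include <vector>

static float silu(float x) { return x / (1.0f + expf(-x)); }

int main() {
    // n_embd = 2, n_ff = 3
    std::vector<float> x = {1.0f, -0.5f};
    std::vector<std::vector<float>> W_up   = {{0.1f, 0.2f}, {0.3f, 0.4f}, {0.5f, 0.6f}};
    std::vector<std::vector<float>> W_gate = {{0.2f, 0.1f}, {0.4f, 0.3f}, {0.6f, 0.5f}};
    std::vector<std::vector<float>> W_down = {{0.1f, 0.2f, 0.3f}, {0.4f, 0.5f, 0.6f}};
    std::vector<float> b_up   = {0.01f, 0.02f, 0.03f}; // ffn_up_b
    std::vector<float> b_gate = {0.0f, 0.0f, 0.0f};    // ffn_gate_b
    std::vector<float> b_down = {0.1f, -0.1f};         // ffn_down_b

    std::vector<float> h(3), out(2);
    for (int i = 0; i < 3; ++i) {
        float up = b_up[i], gate = b_gate[i];
        for (int j = 0; j < 2; ++j) { up += W_up[i][j] * x[j]; gate += W_gate[i][j] * x[j]; }
        h[i] = silu(gate) * up; // LLM_FFN_SILU with parallel gating
    }
    for (int i = 0; i < 2; ++i) {
        out[i] = b_down[i];
        for (int j = 0; j < 3; ++j) { out[i] += W_down[i][j] * h[j]; }
        printf("% .4f ", out[i]);
    }
    printf("\n");
    return 0;
}
```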
@@ -7325,6 +7522,7 @@ struct llm_build_context {
7325
7522
  model.layers[il].ffn_down_exps,
7326
7523
  n_expert, n_expert_used,
7327
7524
  LLM_FFN_SILU, true,
7525
+ false, 0.0,
7328
7526
  cb, il);
7329
7527
  cb(cur, "ffn_moe_out", il);
7330
7528
  }
@@ -7399,12 +7597,12 @@ struct llm_build_context {
7399
7597
  case MODEL_7B:
7400
7598
  Qcur = ggml_rope_ext(
7401
7599
  ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
7402
- n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
7600
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
7403
7601
  ext_factor, attn_factor, beta_fast, beta_slow
7404
7602
  );
7405
7603
  Kcur = ggml_rope_ext(
7406
7604
  ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
7407
- n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
7605
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
7408
7606
  ext_factor, attn_factor, beta_fast, beta_slow
7409
7607
  );
7410
7608
  break;
@@ -7511,14 +7709,14 @@ struct llm_build_context {
7511
7709
 
7512
7710
  Qcur = ggml_rope_ext(
7513
7711
  ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
7514
- n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
7712
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
7515
7713
  ext_factor, attn_factor, beta_fast, beta_slow
7516
7714
  );
7517
7715
  cb(Qcur, "Qcur", il);
7518
7716
 
7519
7717
  Kcur = ggml_rope_ext(
7520
7718
  ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
7521
- n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
7719
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
7522
7720
  ext_factor, attn_factor, beta_fast, beta_slow
7523
7721
  );
7524
7722
  cb(Kcur, "Kcur", il);
@@ -7631,13 +7829,13 @@ struct llm_build_context {
7631
7829
 
7632
7830
  // using mode = 2 for neox mode
7633
7831
  Qcur = ggml_rope_ext(
7634
- ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx,
7832
+ ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig,
7635
7833
  freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
7636
7834
  );
7637
7835
  cb(Qcur, "Qcur", il);
7638
7836
 
7639
7837
  Kcur = ggml_rope_ext(
7640
- ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx,
7838
+ ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig,
7641
7839
  freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
7642
7840
  );
7643
7841
  cb(Kcur, "Kcur", il);
@@ -7755,14 +7953,14 @@ struct llm_build_context {
7755
7953
 
7756
7954
  Qcur = ggml_rope_ext(
7757
7955
  ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
7758
- n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
7956
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
7759
7957
  ext_factor, attn_factor, beta_fast, beta_slow
7760
7958
  );
7761
7959
  cb(Qcur, "Qcur", il);
7762
7960
 
7763
7961
  Kcur = ggml_rope_ext(
7764
7962
  ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
7765
- n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
7963
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
7766
7964
  ext_factor, attn_factor, beta_fast, beta_slow
7767
7965
  );
7768
7966
  cb(Kcur, "Kcur", il);
@@ -7806,6 +8004,7 @@ struct llm_build_context {
7806
8004
  model.layers[il].ffn_down_exps,
7807
8005
  n_expert, n_expert_used,
7808
8006
  LLM_FFN_GELU, true,
8007
+ false, 0.0,
7809
8008
  cb, il);
7810
8009
  cb(cur, "ffn_moe_out", il);
7811
8010
 
@@ -7907,14 +8106,14 @@ struct llm_build_context {
7907
8106
 
7908
8107
  Qcur = ggml_rope_ext(
7909
8108
  ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
7910
- n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
8109
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
7911
8110
  ext_factor, attn_factor, beta_fast, beta_slow
7912
8111
  );
7913
8112
  cb(Qcur, "Qcur", il);
7914
8113
 
7915
8114
  Kcur = ggml_rope_ext(
7916
8115
  ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
7917
- n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
8116
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
7918
8117
  ext_factor, attn_factor, beta_fast, beta_slow
7919
8118
  );
7920
8119
  cb(Kcur, "Kcur", il);
@@ -7949,6 +8148,7 @@ struct llm_build_context {
7949
8148
  model.layers[il].ffn_down_exps,
7950
8149
  n_expert, n_expert_used,
7951
8150
  LLM_FFN_SILU, true,
8151
+ false, 0.0,
7952
8152
  cb, il);
7953
8153
  cb(cur, "ffn_moe_out", il);
7954
8154
 
@@ -8260,14 +8460,14 @@ struct llm_build_context {
8260
8460
 
8261
8461
  Qcur = ggml_rope_ext(
8262
8462
  ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
8263
- n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
8463
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
8264
8464
  ext_factor, attn_factor, beta_fast, beta_slow
8265
8465
  );
8266
8466
  cb(Qcur, "Qcur", il);
8267
8467
 
8268
8468
  Kcur = ggml_rope_ext(
8269
8469
  ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
8270
- n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
8470
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
8271
8471
  ext_factor, attn_factor, beta_fast, beta_slow
8272
8472
  );
8273
8473
  cb(Kcur, "Kcur", il);
@@ -8700,14 +8900,14 @@ struct llm_build_context {
8700
8900
 
8701
8901
  Qcur = ggml_rope_ext(
8702
8902
  ctx0, Qcur, inp_pos, nullptr,
8703
- n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
8903
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
8704
8904
  ext_factor, attn_factor, beta_fast, beta_slow
8705
8905
  );
8706
8906
  cb(Qcur, "Qcur", il);
8707
8907
 
8708
8908
  Kcur = ggml_rope_ext(
8709
8909
  ctx0, Kcur, inp_pos, nullptr,
8710
- n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
8910
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
8711
8911
  ext_factor, attn_factor, beta_fast, beta_slow
8712
8912
  );
8713
8913
  cb(Kcur, "Kcur", il);
@@ -8819,13 +9019,13 @@ struct llm_build_context {
8819
9019
 
8820
9020
  // using mode = 2 for neox mode
8821
9021
  Qcur = ggml_rope_ext(
8822
- ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx,
9022
+ ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig,
8823
9023
  freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
8824
9024
  );
8825
9025
  cb(Qcur, "Qcur", il);
8826
9026
 
8827
9027
  Kcur = ggml_rope_ext(
8828
- ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx,
9028
+ ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig,
8829
9029
  freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
8830
9030
  );
8831
9031
  cb(Kcur, "Kcur", il);
@@ -8931,14 +9131,14 @@ struct llm_build_context {
8931
9131
 
8932
9132
  Qcur = ggml_rope_ext(
8933
9133
  ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
8934
- n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
9134
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
8935
9135
  ext_factor, attn_factor, beta_fast, beta_slow
8936
9136
  );
8937
9137
  cb(Qcur, "Qcur", il);
8938
9138
 
8939
9139
  Kcur = ggml_rope_ext(
8940
9140
  ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
8941
- n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
9141
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
8942
9142
  ext_factor, attn_factor, beta_fast, beta_slow
8943
9143
  );
8944
9144
  cb(Kcur, "Kcur", il);
@@ -9045,14 +9245,14 @@ struct llm_build_context {
9045
9245
 
9046
9246
  Qcur = ggml_rope_ext(
9047
9247
  ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
9048
- n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
9248
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
9049
9249
  ext_factor, attn_factor, beta_fast, beta_slow
9050
9250
  );
9051
9251
  cb(Qcur, "Qcur", il);
9052
9252
 
9053
9253
  Kcur = ggml_rope_ext(
9054
9254
  ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
9055
- n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
9255
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
9056
9256
  ext_factor, attn_factor, beta_fast, beta_slow
9057
9257
  );
9058
9258
  cb(Kcur, "Kcur", il);
@@ -9087,6 +9287,7 @@ struct llm_build_context {
9087
9287
  model.layers[il].ffn_down_exps,
9088
9288
  n_expert, n_expert_used,
9089
9289
  LLM_FFN_SILU, false,
9290
+ false, 0.0,
9090
9291
  cb, il);
9091
9292
  cb(cur, "ffn_moe_out", il);
9092
9293
 
@@ -9196,7 +9397,7 @@ struct llm_build_context {
9196
9397
  Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
9197
9398
 
9198
9399
  Qcur = ggml_rope_ext(
9199
- ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx,
9400
+ ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig,
9200
9401
  freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
9201
9402
  );
9202
9403
  cb(Qcur, "Qcur", il);
@@ -9207,7 +9408,7 @@ struct llm_build_context {
9207
9408
  cb(Qcur, "Qcur", il);
9208
9409
 
9209
9410
  Kcur = ggml_rope_ext(
9210
- ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx,
9411
+ ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig,
9211
9412
  freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
9212
9413
  );
9213
9414
  cb(Kcur, "Kcur", il);
@@ -9318,7 +9519,7 @@ struct llm_build_context {
9318
9519
  Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
9319
9520
 
9320
9521
  Qcur = ggml_rope_ext(
9321
- ctx0, Qcur, inp_pos, rope_factors, n_rot, rope_type, 0, n_orig_ctx,
9522
+ ctx0, Qcur, inp_pos, rope_factors, n_rot, rope_type, n_ctx_orig,
9322
9523
  freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
9323
9524
  );
9324
9525
  cb(Qcur, "Qcur", il);
@@ -9327,7 +9528,7 @@ struct llm_build_context {
9327
9528
  cb(Qcur, "Qcur", il);
9328
9529
 
9329
9530
  Kcur = ggml_rope_ext(
9330
- ctx0, Kcur, inp_pos, rope_factors, n_rot, rope_type, 0, n_orig_ctx,
9531
+ ctx0, Kcur, inp_pos, rope_factors, n_rot, rope_type, n_ctx_orig,
9331
9532
  freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
9332
9533
  );
9333
9534
  cb(Kcur, "Kcur", il);
@@ -9435,13 +9636,13 @@ struct llm_build_context {
9435
9636
 
9436
9637
  Qcur = ggml_rope_ext(
9437
9638
  ctx0, ggml_reshape_3d(ctx0, Qcur, n_rot, n_head, n_tokens), inp_pos, nullptr,
9438
- n_embd_head, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
9639
+ n_embd_head, rope_type, n_ctx_orig, freq_base, freq_scale,
9439
9640
  ext_factor, attn_factor, beta_fast, beta_slow);
9440
9641
  cb(Qcur, "Qcur", il);
9441
9642
 
9442
9643
  Kcur = ggml_rope_ext(
9443
9644
  ctx0, ggml_reshape_3d(ctx0, Kcur, n_rot, n_head_kv, n_tokens), inp_pos, nullptr,
9444
- n_embd_head, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
9645
+ n_embd_head, rope_type, n_ctx_orig, freq_base, freq_scale,
9445
9646
  ext_factor, attn_factor, beta_fast, beta_slow);
9446
9647
  cb(Kcur, "Kcur", il);
9447
9648
 
@@ -9643,14 +9844,14 @@ struct llm_build_context {
9643
9844
 
9644
9845
  struct ggml_tensor * Qcur = ggml_rope_ext(
9645
9846
  ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
9646
- n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
9847
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
9647
9848
  ext_factor, attn_factor, beta_fast, beta_slow
9648
9849
  );
9649
9850
  cb(Qcur, "Qcur", il);
9650
9851
 
9651
9852
  struct ggml_tensor * Kcur = ggml_rope_ext(
9652
9853
  ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
9653
- n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
9854
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
9654
9855
  ext_factor, attn_factor, beta_fast, beta_slow
9655
9856
  );
9656
9857
  cb(Kcur, "Kcur", il);
@@ -9759,14 +9960,14 @@ struct llm_build_context {
9759
9960
 
9760
9961
  Qcur = ggml_rope_ext(
9761
9962
  ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
9762
- n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
9963
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
9763
9964
  ext_factor, attn_factor, beta_fast, beta_slow
9764
9965
  );
9765
9966
  cb(Qcur, "Qcur", il);
9766
9967
 
9767
9968
  Kcur = ggml_rope_ext(
9768
9969
  ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
9769
- n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
9970
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
9770
9971
  ext_factor, attn_factor, beta_fast, beta_slow
9771
9972
  );
9772
9973
  cb(Kcur, "Kcur", il);
@@ -9876,14 +10077,14 @@ struct llm_build_context {
9876
10077
 
9877
10078
  Qcur = ggml_rope_ext(
9878
10079
  ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
9879
- n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
10080
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
9880
10081
  ext_factor, attn_factor, beta_fast, beta_slow
9881
10082
  );
9882
10083
  cb(Qcur, "Qcur", il);
9883
10084
 
9884
10085
  Kcur = ggml_rope_ext(
9885
10086
  ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
9886
- n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
10087
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
9887
10088
  ext_factor, attn_factor, beta_fast, beta_slow
9888
10089
  );
9889
10090
  cb(Kcur, "Kcur", il);
@@ -10006,14 +10207,14 @@ struct llm_build_context {
10006
10207
 
10007
10208
  Qcur = ggml_rope_ext(
10008
10209
  ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
10009
- n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
10210
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
10010
10211
  ext_factor, attn_factor, beta_fast, beta_slow
10011
10212
  );
10012
10213
  cb(Qcur, "Qcur", il);
10013
10214
 
10014
10215
  Kcur = ggml_rope_ext(
10015
10216
  ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
10016
- n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
10217
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
10017
10218
  ext_factor, attn_factor, beta_fast, beta_slow
10018
10219
  );
10019
10220
  cb(Kcur, "Kcur", il);
@@ -10078,7 +10279,7 @@ struct llm_build_context {
10078
10279
  cb(cur, "lmhead_scaling", -1);
10079
10280
 
10080
10281
  // lm_head
10081
- cur = ggml_mul_mat(ctx0, model.tok_embd, cur);
10282
+ cur = ggml_mul_mat(ctx0, model.output, cur);
10082
10283
  cb(cur, "result_output", -1);
10083
10284
 
10084
10285
  ggml_build_forward_expand(gf, cur);
@@ -10126,7 +10327,7 @@ struct llm_build_context {
10126
10327
 
10127
10328
  Qcur = ggml_rope_ext(
10128
10329
  ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head_k, n_head, n_tokens), inp_pos, nullptr,
10129
- n_embd_head_k, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
10330
+ n_embd_head_k, rope_type, n_ctx_orig, freq_base, freq_scale,
10130
10331
  ext_factor, attn_factor, beta_fast, beta_slow);
10131
10332
  cb(Qcur, "Qcur", il);
10132
10333
 
@@ -10135,7 +10336,7 @@ struct llm_build_context {
10135
10336
 
10136
10337
  Kcur = ggml_rope_ext(
10137
10338
  ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head_k, n_head_kv, n_tokens), inp_pos, nullptr,
10138
- n_embd_head_k, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
10339
+ n_embd_head_k, rope_type, n_ctx_orig, freq_base, freq_scale,
10139
10340
  ext_factor, attn_factor, beta_fast, beta_slow);
10140
10341
  cb(Kcur, "Kcur", il);
10141
10342
 
@@ -10246,14 +10447,14 @@ struct llm_build_context {
10246
10447
 
10247
10448
  Qcur = ggml_rope_ext(
10248
10449
  ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
10249
- n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
10450
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
10250
10451
  ext_factor, attn_factor, beta_fast, beta_slow
10251
10452
  );
10252
10453
  cb(Qcur, "Qcur", il);
10253
10454
 
10254
10455
  Kcur = ggml_rope_ext(
10255
10456
  ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
10256
- n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
10457
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
10257
10458
  ext_factor, attn_factor, beta_fast, beta_slow
10258
10459
  );
10259
10460
  cb(Kcur, "Kcur", il);
@@ -10536,14 +10737,14 @@ struct llm_build_context {
10536
10737
 
10537
10738
  Qcur = ggml_rope_ext(
10538
10739
  ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
10539
- n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
10740
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
10540
10741
  ext_factor, attn_factor, beta_fast, beta_slow
10541
10742
  );
10542
10743
  cb(Qcur, "Qcur", il);
10543
10744
 
10544
10745
  Kcur = ggml_rope_ext(
10545
10746
  ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
10546
- n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
10747
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
10547
10748
  ext_factor, attn_factor, beta_fast, beta_slow
10548
10749
  );
10549
10750
  cb(Kcur, "Kcur", il);
@@ -10667,14 +10868,14 @@ struct llm_build_context {
10667
10868
 
10668
10869
  Qcur = ggml_rope_ext(
10669
10870
  ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
10670
- n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
10871
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
10671
10872
  ext_factor, attn_factor, beta_fast, beta_slow
10672
10873
  );
10673
10874
  cb(Qcur, "Qcur", il);
10674
10875
 
10675
10876
  Kcur = ggml_rope_ext(
10676
10877
  ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
10677
- n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
10878
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
10678
10879
  ext_factor, attn_factor, beta_fast, beta_slow
10679
10880
  );
10680
10881
  cb(Kcur, "Kcur", il);
@@ -10781,14 +10982,14 @@ struct llm_build_context {
10781
10982
 
10782
10983
  Qcur = ggml_rope_ext(
10783
10984
  ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
10784
- n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
10985
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
10785
10986
  ext_factor, attn_factor, beta_fast, beta_slow
10786
10987
  );
10787
10988
  cb(Qcur, "Qcur", il);
10788
10989
 
10789
10990
  Kcur = ggml_rope_ext(
10790
10991
  ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
10791
- n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
10992
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
10792
10993
  ext_factor, attn_factor, beta_fast, beta_slow
10793
10994
  );
10794
10995
  cb(Kcur, "Kcur", il);
@@ -10916,14 +11117,14 @@ struct llm_build_context {
10916
11117
 
10917
11118
  Qcur = ggml_rope_ext(
10918
11119
  ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
10919
- n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
11120
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
10920
11121
  ext_factor, attn_factor, beta_fast, beta_slow
10921
11122
  );
10922
11123
  cb(Qcur, "Qcur", il);
10923
11124
 
10924
11125
  Kcur = ggml_rope_ext(
10925
11126
  ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
10926
- n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
11127
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
10927
11128
  ext_factor, attn_factor, beta_fast, beta_slow
10928
11129
  );
10929
11130
  cb(Kcur, "Kcur", il);
@@ -10974,6 +11175,7 @@ struct llm_build_context {
10974
11175
  model.layers[il].ffn_down_exps,
10975
11176
  n_expert, n_expert_used,
10976
11177
  LLM_FFN_SILU, true,
11178
+ false, 0.0,
10977
11179
  cb, il);
10978
11180
  cb(cur, "ffn_moe_out", il);
10979
11181
 
@@ -11005,6 +11207,239 @@ struct llm_build_context {
11005
11207
 
11006
11208
  return gf;
11007
11209
  }
11210
+
11211
+ struct ggml_cgraph * build_deepseek2() {
11212
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
11213
+
11214
+ // mutable variable, needed during the last layer of the computation to skip unused tokens
11215
+ int32_t n_tokens = this->n_tokens;
11216
+
11217
+ bool is_lite = (hparams.n_layer == 27);
11218
+
11219
+ // We have to pre-scale kq_scale and attn_factor to make the YaRN RoPE work correctly.
11220
+ // See https://github.com/ggerganov/llama.cpp/discussions/7416 for detailed explanation.
11221
+ const float mscale = attn_factor * (1.0f + hparams.rope_yarn_log_mul * logf(1.0f / freq_scale));
11222
+ const float kq_scale = 1.0f*mscale*mscale/sqrtf(float(hparams.n_embd_head_k));
11223
+ const float attn_factor_scaled = 1.0f / (1.0f + 0.1f * logf(1.0f / freq_scale));
11224
+
11225
+ const uint32_t n_embd_head_qk_rope = hparams.n_rot;
11226
+ const uint32_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
11227
+ const uint32_t kv_lora_rank = hparams.n_lora_kv;
11228
+
11229
+ struct ggml_tensor * cur;
11230
+ struct ggml_tensor * inpL;
11231
+
11232
+ // {n_embd, n_tokens}
11233
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
11234
+
11235
+ // inp_pos - contains the positions
11236
+ struct ggml_tensor * inp_pos = build_inp_pos();
11237
+
11238
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
11239
+ struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
11240
+
11241
+ for (int il = 0; il < n_layer; ++il) {
11242
+ struct ggml_tensor * inpSA = inpL;
11243
+
11244
+ // norm
11245
+ cur = llm_build_norm(ctx0, inpL, hparams,
11246
+ model.layers[il].attn_norm, NULL,
11247
+ LLM_NORM_RMS, cb, il);
11248
+ cb(cur, "attn_norm", il);
11249
+
11250
+ // self_attention
11251
+ {
11252
+ struct ggml_tensor * q = NULL;
11253
+ if (!is_lite) {
11254
+ // {n_embd, q_lora_rank} * {n_embd, n_tokens} -> {q_lora_rank, n_tokens}
11255
+ q = ggml_mul_mat(ctx0, model.layers[il].wq_a, cur);
11256
+ cb(q, "q", il);
11257
+
11258
+ q = llm_build_norm(ctx0, q, hparams,
11259
+ model.layers[il].attn_q_a_norm, NULL,
11260
+ LLM_NORM_RMS, cb, il);
11261
+ cb(q, "q", il);
11262
+
11263
+ // {q_lora_rank, n_head * hparams.n_embd_head_k} * {q_lora_rank, n_tokens} -> {n_head * hparams.n_embd_head_k, n_tokens}
11264
+ q = ggml_mul_mat(ctx0, model.layers[il].wq_b, q);
11265
+ cb(q, "q", il);
11266
+ } else {
11267
+ q = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
11268
+ cb(q, "q", il);
11269
+ }
11270
+
11271
+ // split into {n_head * n_embd_head_qk_nope, n_tokens}
11272
+ struct ggml_tensor * q_nope = ggml_view_3d(ctx0, q, n_embd_head_qk_nope, n_head, n_tokens,
11273
+ ggml_row_size(q->type, hparams.n_embd_head_k),
11274
+ ggml_row_size(q->type, hparams.n_embd_head_k * n_head),
11275
+ 0);
11276
+ cb(q_nope, "q_nope", il);
11277
+
11278
+ // and {n_head * n_embd_head_qk_rope, n_tokens}
11279
+ struct ggml_tensor * q_pe = ggml_view_3d(ctx0, q, n_embd_head_qk_rope, n_head, n_tokens,
11280
+ ggml_row_size(q->type, hparams.n_embd_head_k),
11281
+ ggml_row_size(q->type, hparams.n_embd_head_k * n_head),
11282
+ ggml_row_size(q->type, n_embd_head_qk_nope));
11283
+ cb(q_pe, "q_pe", il);
11284
+
11285
+ // {n_embd, kv_lora_rank + n_embd_head_qk_rope} * {n_embd, n_tokens} -> {kv_lora_rank + n_embd_head_qk_rope, n_tokens}
11286
+ struct ggml_tensor * kv_pe_compresseed = ggml_mul_mat(ctx0, model.layers[il].wkv_a_mqa, cur);
11287
+ cb(kv_pe_compresseed, "kv_pe_compresseed", il);
11288
+
11289
+ // split into {kv_lora_rank, n_tokens}
11290
+ struct ggml_tensor * kv_compressed = ggml_view_2d(ctx0, kv_pe_compresseed, kv_lora_rank, n_tokens,
11291
+ kv_pe_compresseed->nb[1],
11292
+ 0);
11293
+ cb(kv_compressed, "kv_compressed", il);
11294
+
11295
+ // and {n_embd_head_qk_rope, n_tokens}
11296
+ struct ggml_tensor * k_pe = ggml_view_3d(ctx0, kv_pe_compresseed, n_embd_head_qk_rope, 1, n_tokens,
11297
+ kv_pe_compresseed->nb[1],
11298
+ kv_pe_compresseed->nb[1],
11299
+ ggml_row_size(kv_pe_compresseed->type, kv_lora_rank));
11300
+ cb(k_pe, "k_pe", il);
11301
+
11302
+ kv_compressed = ggml_cont(ctx0, kv_compressed); // TODO: the CUDA backend does not support non-contiguous norm
11303
+ kv_compressed = llm_build_norm(ctx0, kv_compressed, hparams,
11304
+ model.layers[il].attn_kv_a_norm, NULL,
11305
+ LLM_NORM_RMS, cb, il);
11306
+ cb(kv_compressed, "kv_compressed", il);
11307
+
11308
+ // {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)} * {kv_lora_rank, n_tokens} -> {n_head * (n_embd_head_qk_nope + n_embd_head_v), n_tokens}
11309
+ struct ggml_tensor * kv = ggml_mul_mat(ctx0, model.layers[il].wkv_b, kv_compressed);
11310
+ cb(kv, "kv", il);
11311
+
11312
+ // split into {n_head * n_embd_head_qk_nope, n_tokens}
11313
+ struct ggml_tensor * k_nope = ggml_view_3d(ctx0, kv, n_embd_head_qk_nope, n_head, n_tokens,
11314
+ ggml_row_size(kv->type, n_embd_head_qk_nope + hparams.n_embd_head_v),
11315
+ ggml_row_size(kv->type, n_head * (n_embd_head_qk_nope + hparams.n_embd_head_v)),
11316
+ 0);
11317
+ cb(k_nope, "k_nope", il);
11318
+
11319
+ // and {n_head * n_embd_head_v, n_tokens}
11320
+ struct ggml_tensor * v_states = ggml_view_3d(ctx0, kv, hparams.n_embd_head_v, n_head, n_tokens,
11321
+ ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)),
11322
+ ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)*n_head),
11323
+ ggml_row_size(kv->type, (n_embd_head_qk_nope)));
11324
+ cb(v_states, "v_states", il);
11325
+
11326
+ v_states = ggml_cont(ctx0, v_states);
11327
+ cb(v_states, "v_states", il);
11328
+
11329
+ v_states = ggml_view_2d(ctx0, v_states, hparams.n_embd_head_v * n_head, n_tokens,
11330
+ ggml_row_size(kv->type, hparams.n_embd_head_v * n_head),
11331
+ 0);
11332
+ cb(v_states, "v_states", il);
11333
+
11334
+ q_pe = ggml_cont(ctx0, q_pe); // TODO: the CUDA backend does not support non-contiguous RoPE
11335
+ q_pe = ggml_rope_ext(
11336
+ ctx0, q_pe, inp_pos, nullptr,
11337
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
11338
+ ext_factor, attn_factor_scaled, beta_fast, beta_slow
11339
+ );
11340
+ cb(q_pe, "q_pe", il);
11341
+
11342
+ // shared RoPE key
11343
+ k_pe = ggml_cont(ctx0, k_pe); // TODO: the CUDA backend does not support non-contiguous RoPE
11344
+ k_pe = ggml_rope_ext(
11345
+ ctx0, k_pe, inp_pos, nullptr,
11346
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
11347
+ ext_factor, attn_factor_scaled, beta_fast, beta_slow
11348
+ );
11349
+ cb(k_pe, "k_pe", il);
11350
+
11351
+ struct ggml_tensor * q_states = ggml_concat(ctx0, q_nope, q_pe, 0);
11352
+ cb(q_states, "q_states", il);
11353
+
11354
+ struct ggml_tensor * k_states = ggml_concat(ctx0, k_nope, ggml_repeat(ctx0, k_pe, q_pe), 0);
11355
+ cb(k_states, "k_states", il);
11356
+
11357
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
11358
+ model.layers[il].wo, NULL,
11359
+ k_states, v_states, q_states, KQ_mask, n_tokens, kv_head, n_kv, kq_scale, cb, il);
11360
+ }
11361
+
11362
+ if (il == n_layer - 1) {
11363
+ // skip computing output for unused tokens
11364
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
11365
+ n_tokens = n_outputs;
11366
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
11367
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
11368
+ }
11369
+
11370
+ struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
11371
+ cb(ffn_inp, "ffn_inp", il);
11372
+
11373
+ if ((uint32_t) il < hparams.n_layer_dense_lead) {
11374
+ cur = llm_build_norm(ctx0, ffn_inp, hparams,
11375
+ model.layers[il].ffn_norm, NULL,
11376
+ LLM_NORM_RMS, cb, il);
11377
+ cb(cur, "ffn_norm", il);
11378
+
11379
+ cur = llm_build_ffn(ctx0, cur,
11380
+ model.layers[il].ffn_up, NULL,
11381
+ model.layers[il].ffn_gate, NULL,
11382
+ model.layers[il].ffn_down, NULL,
11383
+ NULL,
11384
+ LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
11385
+ cb(cur, "ffn_out", il);
11386
+ } else {
11387
+ // MoE branch
11388
+ cur = llm_build_norm(ctx0, ffn_inp, hparams,
11389
+ model.layers[il].ffn_norm, NULL,
11390
+ LLM_NORM_RMS, cb, il);
11391
+ cb(cur, "ffn_norm", il);
11392
+
11393
+ ggml_tensor * moe_out =
11394
+ llm_build_moe_ffn(ctx0, cur,
11395
+ model.layers[il].ffn_gate_inp,
11396
+ model.layers[il].ffn_up_exps,
11397
+ model.layers[il].ffn_gate_exps,
11398
+ model.layers[il].ffn_down_exps,
11399
+ n_expert, n_expert_used,
11400
+ LLM_FFN_SILU, false,
11401
+ true, hparams.expert_weights_scale,
11402
+ cb, il);
11403
+ cb(moe_out, "ffn_moe_out", il);
11404
+
11405
+ // FFN shared expert
11406
+ {
11407
+ ggml_tensor * ffn_shexp = llm_build_ffn(ctx0, cur,
11408
+ model.layers[il].ffn_up_shexp, NULL,
11409
+ model.layers[il].ffn_gate_shexp, NULL,
11410
+ model.layers[il].ffn_down_shexp, NULL,
11411
+ NULL,
11412
+ LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
11413
+ cb(ffn_shexp, "ffn_shexp", il);
11414
+
11415
+ cur = ggml_add(ctx0, moe_out, ffn_shexp);
11416
+ cb(cur, "ffn_out", il);
11417
+ }
11418
+ }
11419
+
11420
+ cur = ggml_add(ctx0, cur, ffn_inp);
11421
+ cb(cur, "l_out", il);
11422
+
11423
+ // input for next layer
11424
+ inpL = cur;
11425
+ }
11426
+
11427
+ cur = inpL;
11428
+
11429
+ cur = llm_build_norm(ctx0, cur, hparams,
11430
+ model.output_norm, NULL,
11431
+ LLM_NORM_RMS, cb, -1);
11432
+ cb(cur, "result_norm", -1);
11433
+
11434
+ // lm_head
11435
+ cur = ggml_mul_mat(ctx0, model.output, cur);
11436
+ cb(cur, "result_output", -1);
11437
+
11438
+ ggml_build_forward_expand(gf, cur);
11439
+
11440
+ return gf;
11441
+ }
11442
+
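The new DeepSeek-V2 graph builder above follows the model's multi-head latent attention layout: queries and the KV stream go through low-rank bottlenecks (`wq_a`/`wq_b`, `wkv_a_mqa`/`wkv_b`), each head's query/key is split into a non-rotary part and an `n_rot`-wide rotary part, only the rotary slice is RoPE'd (with a single shared rotary key per token), and dense layers, routed experts and a shared expert are combined in the FFN. The attention scale and `attn_factor` are pre-scaled for YaRN exactly as the in-code comment describes. Below is a standalone sketch of those scaling constants and the per-head split, using illustrative hyperparameters rather than values read from a real GGUF:

```cpp
// DeepSeek-V2 attention scaling constants and per-head q/k split (sketch).
#include <cmath>
#include <cstdio>

int main() {
    // illustrative hparams (not from a real model file)
    const float attn_factor       = 1.0f;
    const float rope_yarn_log_mul = 0.1f;   // hparams.rope_yarn_log_mul
    const float freq_scale        = 0.025f; // original ctx stretched 40x
    const int   n_embd_head_k     = 192;    // per-head q/k width
    const int   n_rot             = 64;     // rotary slice (n_embd_head_qk_rope)

    // pre-scaled factors, mirroring the comment in build_deepseek2()
    const float mscale   = attn_factor * (1.0f + rope_yarn_log_mul * logf(1.0f / freq_scale));
    const float kq_scale = 1.0f * mscale * mscale / sqrtf((float) n_embd_head_k);
    const float attn_factor_scaled = 1.0f / (1.0f + 0.1f * logf(1.0f / freq_scale));

    const int n_embd_head_qk_nope = n_embd_head_k - n_rot; // non-rotary part

    printf("mscale             = %.4f\n", mscale);
    printf("kq_scale           = %.6f\n", kq_scale);
    printf("attn_factor_scaled = %.4f\n", attn_factor_scaled);
    printf("per-head split     = %d nope + %d rope dims\n", n_embd_head_qk_nope, n_rot);
    return 0;
}
```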
11008
11443
  };
11009
11444
 
11010
11445
  static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector<uint32_t> & ids) {
@@ -11223,6 +11658,10 @@ static struct ggml_cgraph * llama_build_graph(
11223
11658
  {
11224
11659
  result = llm.build_arctic();
11225
11660
  } break;
11661
+ case LLM_ARCH_DEEPSEEK2:
11662
+ {
11663
+ result = llm.build_deepseek2();
11664
+ } break;
11226
11665
  default:
11227
11666
  GGML_ASSERT(false);
11228
11667
  }
@@ -12239,27 +12678,27 @@ static enum llama_vocab_type llama_vocab_get_type(const llama_vocab & vocab) {
12239
12678
 
12240
12679
  static bool llama_is_normal_token(const llama_vocab & vocab, llama_token id) {
12241
12680
  GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
12242
- return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_NORMAL;
12681
+ return vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_NORMAL;
12243
12682
  }
12244
12683
 
12245
12684
  static bool llama_is_unknown_token(const llama_vocab & vocab, llama_token id) {
12246
12685
  GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
12247
- return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_UNKNOWN;
12686
+ return vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_UNKNOWN;
12248
12687
  }
12249
12688
 
12250
12689
  static bool llama_is_control_token(const llama_vocab & vocab, llama_token id) {
12251
12690
  GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
12252
- return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_CONTROL;
12691
+ return vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_CONTROL;
12253
12692
  }
12254
12693
 
12255
12694
  static bool llama_is_byte_token(const llama_vocab & vocab, llama_token id) {
12256
12695
  GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
12257
- return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_BYTE;
12696
+ return vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_BYTE;
12258
12697
  }
12259
12698
 
12260
12699
  static bool llama_is_user_defined_token(const llama_vocab& vocab, llama_token id) {
12261
12700
  GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
12262
- return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_USER_DEFINED;
12701
+ return vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_USER_DEFINED;
12263
12702
  }
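The vocabulary helpers now test a bitmask `attr` field instead of comparing a single token `type`, so one token can carry several attributes at once (for example a control token that also requests whitespace stripping, which is used further down in this diff). A minimal sketch of the pattern; the enum values here are hypothetical stand-ins, only the "test with `&`" idiom matters:

```cpp
// Bitflag token attributes: a token may carry several at once.
#include <cstdint>
#include <cstdio>

enum token_attr : uint32_t {
    ATTR_UNDEFINED    = 0,
    ATTR_NORMAL       = 1u << 0,
    ATTR_CONTROL      = 1u << 1,
    ATTR_USER_DEFINED = 1u << 2,
    ATTR_RSTRIP       = 1u << 3,
};

int main() {
    // a special token that is a control token and also strips trailing space
    const uint32_t attr = ATTR_CONTROL | ATTR_RSTRIP;

    printf("is_control: %d\n", (attr & ATTR_CONTROL) != 0);
    printf("is_normal : %d\n", (attr & ATTR_NORMAL)  != 0);
    printf("rstrip    : %d\n", (attr & ATTR_RSTRIP)  != 0);
    return 0;
}
```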
12264
12703
 
12265
12704
  static uint8_t llama_token_to_byte(const llama_vocab& vocab, llama_token id) {
@@ -12512,6 +12951,7 @@ struct llm_tokenizer_bpe {
12512
12951
  });
12513
12952
  break;
12514
12953
  case LLAMA_VOCAB_PRE_TYPE_DBRX:
12954
+ case LLAMA_VOCAB_PRE_TYPE_SMAUG:
12515
12955
  word_collection = unicode_regex_split(text, {
12516
12956
  // same as llama3
12517
12957
  "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
@@ -12734,7 +13174,7 @@ struct llm_tokenizer_wpm {
12734
13174
  llm_tokenizer_wpm(const llama_vocab & vocab): vocab(vocab) {}
12735
13175
 
12736
13176
  void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
12737
- auto * token_map = &vocab.token_to_id;
13177
+ const auto & token_map = vocab.token_to_id;
12738
13178
 
12739
13179
  // normalize and split by whitespace
12740
13180
  std::vector<std::string> words = preprocess(text);
@@ -12749,108 +13189,89 @@ struct llm_tokenizer_wpm {
12749
13189
  }
12750
13190
 
12751
13191
  // prepend phantom space
12752
- std::string word1 = "\xe2\x96\x81" + word;
12753
- int n = word1.size();
13192
+ const std::string word1 = "\xe2\x96\x81" + word;
13193
+ const int n = word1.size();
12754
13194
 
12755
- // we're at the start of a new word
12756
- int i = 0;
12757
- bool match_any = false;
13195
+ const size_t current_tokens = output.size();
12758
13196
 
13197
+ // we're at the start of a new word
12759
13198
  // move through character position in word
12760
- while (i < n) {
13199
+ for (int i = 0; i < n; ++i) {
12761
13200
  // loop through possible match length
12762
13201
  bool match = false;
12763
13202
  for (int j = n; j > i; j--) {
12764
- auto it = token_map->find(word1.substr(i, j - i));
12765
- if (it != token_map->end()) {
13203
+ auto it = token_map.find(word1.substr(i, j - i));
13204
+ if (it != token_map.end()) {
12766
13205
  output.push_back(it->second);
12767
13206
  match = true;
12768
- match_any = true;
12769
- i = j;
13207
+ i = j - 1;
12770
13208
  break;
12771
13209
  }
12772
13210
  }
12773
13211
 
12774
- // must be an unknown character
12775
- if (!match) {
12776
- i++;
13212
+ if (!match) { // discard all
13213
+ output.resize(current_tokens);
13214
+ break; // and discard next tokens
12777
13215
  }
12778
13216
  }
12779
13217
 
12780
13218
  // we didn't find any matches for this word
12781
- if (!match_any) {
13219
+ if (current_tokens == output.size()) {
12782
13220
  output.push_back(vocab.special_unk_id);
12783
13221
  }
12784
13222
  }
12785
13223
  }
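The rewritten WPM loop greedily takes the longest vocabulary match at each position and, if any position fails to match, discards every sub-token already emitted for that word (`output.resize(current_tokens)`) so the whole word falls back to a single unknown token. A standalone sketch of the same control flow over a made-up toy vocabulary:

```cpp
// Greedy longest-match WordPiece with whole-word [UNK] fallback (sketch).
#include <cstdio>
#include <map>
#include <string>
#include <vector>

int main() {
    const std::map<std::string, int> token_to_id = {
        {"\xe2\x96\x81hel", 10}, {"lo", 11}, {"\xe2\x96\x81", 12},
    };
    const int unk_id = 0;

    std::vector<int> output;
    for (const std::string & word : {std::string("hello"), std::string("xyz")}) {
        const std::string word1 = "\xe2\x96\x81" + word; // phantom space prefix
        const int n = (int) word1.size();
        const size_t current_tokens = output.size();

        for (int i = 0; i < n; ++i) {
            bool match = false;
            for (int j = n; j > i; j--) {              // longest match first
                auto it = token_to_id.find(word1.substr(i, j - i));
                if (it != token_to_id.end()) {
                    output.push_back(it->second);
                    match = true;
                    i = j - 1;                         // continue after the match
                    break;
                }
            }
            if (!match) {                              // unknown piece:
                output.resize(current_tokens);         // discard partial tokens
                break;                                 // and give up on the word
            }
        }
        if (current_tokens == output.size()) {
            output.push_back(unk_id);                  // whole word -> [UNK]
        }
    }
    for (int id : output) printf("%d ", id);
    printf("\n");
    return 0;
}
```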
12786
13224
 
12787
13225
  std::vector<std::string> preprocess(const std::string & text) {
12788
- std::vector<uint32_t> cpts_nfd = unicode_cpts_normalize_nfd(unicode_cpts_from_utf8(text));
12789
-
12790
- // strip accents, strip control, uniformize whitespace,
12791
- // to lowercase, pad chinese characters, pad punctuation
12792
- std::string new_str = "";
12793
- for (uint32_t code : cpts_nfd) {
12794
- const codepoint_flags flags = unicode_cpt_flags(code);
12795
- if (flags.is_accent_mark || flags.is_control) {
13226
+ const std::vector<uint32_t> cpts_nfd = unicode_cpts_normalize_nfd(unicode_cpts_from_utf8(text));
13227
+ std::vector<std::string> words(1, "");
13228
+
13229
+ for (const char32_t cpt : cpts_nfd) {
13230
+ const auto flags = unicode_cpt_flags(cpt);
13231
+
13232
+ if (flags.is_whitespace) {
13233
+ if (words.back().size()) { // finish previous word if any
13234
+ words.emplace_back();
13235
+ }
12796
13236
  continue;
12797
13237
  }
12798
- code = unicode_tolower(code);
12799
- if (flags.is_separator || flags.is_whitespace) { //####FIXME: is_separator ?
12800
- code = ' ';
12801
- }
12802
- std::string s = unicode_cpt_to_utf8(code);
12803
- if (flags.is_punctuation || is_ascii_punct(code) || is_chinese_char(code)) {
12804
- new_str += " ";
12805
- new_str += s;
12806
- new_str += " ";
12807
- } else {
12808
- new_str += s;
13238
+
13239
+ assert (!flags.is_separator);
13240
+ if (cpt == 0 || cpt == 0xFFFD || flags.is_control) {
13241
+ continue;
12809
13242
  }
12810
- }
12811
13243
 
12812
- // split by whitespace
12813
- uint64_t l = 0;
12814
- uint64_t r = 0;
12815
- std::vector<std::string> words;
12816
- while (r < new_str.size()) {
12817
- // if is whitespace
12818
- if (isspace(new_str[r], std::locale::classic())) {
12819
- if (r > l) words.push_back(new_str.substr(l, (r - l)));
12820
- l = r + 1;
12821
- r = l;
13244
+ const std::string s = unicode_cpt_to_utf8(unicode_tolower(cpt));
13245
+ if (flags.is_punctuation || ( cpt < 0x7F && flags.is_symbol ) || is_chinese_char(cpt)) {
13246
+ if (words.back().size()) { // finish previous word if any
13247
+ words.emplace_back();
13248
+ }
13249
+ words.back() = s; // single char word
13250
+ words.emplace_back(); // start a new word
12822
13251
  } else {
12823
- r += 1;
13252
+ words.back() += s; // append char to word
12824
13253
  }
12825
13254
  }
12826
- if (r > l) {
12827
- words.push_back(new_str.substr(l, (r - l)));
12828
- }
12829
- return words;
12830
- }
12831
13255
 
12832
- bool is_ascii_punct(uint32_t code) {
12833
- if (code > 0xFF) {
12834
- return false;
13256
+ if (!words.back().size()) {
13257
+ words.pop_back();
12835
13258
  }
12836
- auto c = char(static_cast<unsigned char>(code));
12837
- return ispunct(c, std::locale::classic());
13259
+
13260
+ return words;
12838
13261
  }
12839
13262
 
12840
- bool is_chinese_char(uint32_t cpt) {
12841
- if ((cpt >= 0x4E00 && cpt <= 0x9FFF) ||
12842
- (cpt >= 0x3400 && cpt <= 0x4DBF) ||
13263
+ static bool is_chinese_char(uint32_t cpt) {
13264
+ return
13265
+ (cpt >= 0x04E00 && cpt <= 0x09FFF) ||
13266
+ (cpt >= 0x03400 && cpt <= 0x04DBF) ||
12843
13267
  (cpt >= 0x20000 && cpt <= 0x2A6DF) ||
12844
13268
  (cpt >= 0x2A700 && cpt <= 0x2B73F) ||
12845
13269
  (cpt >= 0x2B740 && cpt <= 0x2B81F) ||
12846
13270
  (cpt >= 0x2B920 && cpt <= 0x2CEAF) || // this should be 0x2B820 but in hf rust code it is 0x2B920
12847
- (cpt >= 0xF900 && cpt <= 0xFAFF) ||
12848
- (cpt >= 0x2F800 && cpt <= 0x2FA1F) ||
12849
- (cpt >= 0x3000 && cpt <= 0x303F) ||
12850
- (cpt >= 0xFF00 && cpt <= 0xFFEF)) {
12851
- return true; // NOLINT
12852
- }
12853
- return false;
13271
+ (cpt >= 0x0F900 && cpt <= 0x0FAFF) ||
13272
+ (cpt >= 0x2F800 && cpt <= 0x2FA1F);
13273
+ //(cpt >= 0x3000 && cpt <= 0x303F) ||
13274
+ //(cpt >= 0xFF00 && cpt <= 0xFFEF);
12854
13275
  }
12855
13276
 
12856
13277
  const llama_vocab & vocab;
@@ -12894,9 +13315,9 @@ struct fragment_buffer_variant {
12894
13315
 
12895
13316
  static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<fragment_buffer_variant> & buffer) {
12896
13317
  // for each special token
12897
- for (const auto & st: vocab.special_tokens_cache) {
12898
- const auto & special_token = st.first;
12899
- const auto & special_id = st.second;
13318
+ for (const llama_vocab::id special_id : vocab.cache_special_tokens) {
13319
+ const auto & data = vocab.id_to_token[special_id];
13320
+ const auto & special_token = data.text;
12900
13321
 
12901
13322
  // for each text fragment
12902
13323
  std::forward_list<fragment_buffer_variant>::iterator it = buffer.begin();
@@ -12905,7 +13326,7 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<
12905
13326
 
12906
13327
  // if a fragment is text ( not yet processed )
12907
13328
  if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
12908
- auto * raw_text = &(fragment.raw_text);
13329
+ auto & raw_text = fragment.raw_text;
12909
13330
 
12910
13331
  auto raw_text_base_offset = fragment.offset;
12911
13332
  auto raw_text_base_length = fragment.length;
@@ -12915,7 +13336,7 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<
12915
13336
  // find the first occurrence of a given special token in this fragment
12916
13337
  // passing offset argument only limit the "search area" but match coordinates
12917
13338
  // are still relative to the source full raw_text
12918
- auto match = raw_text->find(special_token, raw_text_base_offset);
13339
+ auto match = raw_text.find(special_token, raw_text_base_offset);
12919
13340
 
12920
13341
  // no occurrences found, stop processing this fragment for a given special token
12921
13342
  if (match == std::string::npos) break;
@@ -12933,13 +13354,22 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<
12933
13354
  if (match > raw_text_base_offset) {
12934
13355
  // left
12935
13356
  const int64_t left_reminder_offset = raw_text_base_offset + 0;
12936
- const int64_t left_reminder_length = match - raw_text_base_offset;
12937
- buffer.emplace_after(it, (*raw_text), left_reminder_offset, left_reminder_length);
13357
+ int64_t left_reminder_length = match - raw_text_base_offset;
13358
+
13359
+ if (data.attr & LLAMA_TOKEN_ATTR_LSTRIP) {
13360
+ while (left_reminder_length > 0 && isspace(raw_text[left_reminder_offset + left_reminder_length - 1])) {
13361
+ left_reminder_length--;
13362
+ }
13363
+ }
13364
+
13365
+ if (left_reminder_length > 0) {
13366
+ buffer.emplace_after(it, raw_text, left_reminder_offset, left_reminder_length);
13367
+ it++;
13368
+ }
12938
13369
 
12939
13370
  #ifdef PRETOKENIZERDEBUG
12940
13371
  LLAMA_LOG_WARN("FL: (%ld %ld) '%s'\n", left_reminder_offset, left_reminder_length, raw_text->substr(left_reminder_offset, left_reminder_length).c_str());
12941
13372
  #endif
12942
- it++;
12943
13373
  }
12944
13374
 
12945
13375
  // special token
@@ -12948,16 +13378,25 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<
12948
13378
 
12949
13379
  // right
12950
13380
  if (match + special_token.length() < raw_text_base_offset + raw_text_base_length) {
12951
- const int64_t right_reminder_offset = match + special_token.length();
12952
- const int64_t right_reminder_length = raw_text_base_length - ((match - raw_text_base_offset) + special_token.length());
12953
- buffer.emplace_after(it, (*raw_text), right_reminder_offset, right_reminder_length);
13381
+ int64_t right_reminder_offset = match + special_token.length();
13382
+ int64_t right_reminder_length = raw_text_base_length - ((match - raw_text_base_offset) + special_token.length());
13383
+
13384
+ if (data.attr & LLAMA_TOKEN_ATTR_RSTRIP) {
13385
+ while (right_reminder_length > 0 && isspace(raw_text[right_reminder_offset])) {
13386
+ right_reminder_offset++;
13387
+ right_reminder_length--;
13388
+ }
13389
+ }
13390
+
13391
+ if (right_reminder_length > 0) {
13392
+ buffer.emplace_after(it, raw_text, right_reminder_offset, right_reminder_length);
13393
+ it++;
13394
+ }
12954
13395
 
12955
13396
  #ifdef PRETOKENIZERDEBUG
12956
13397
  LLAMA_LOG_WARN("FR: (%ld %ld) '%s'\n", right_reminder_offset, right_reminder_length, raw_text->substr(right_reminder_offset, right_reminder_length).c_str());
12957
13398
  #endif
12958
13399
 
12959
- it++;
12960
-
12961
13400
  if (source == 0) {
12962
13401
  buffer.erase_after(buffer.before_begin());
12963
13402
  } else {
@@ -13003,9 +13442,7 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
13003
13442
  // tokenizer.encode('', add_special_tokens=True) returns [1]
13004
13443
  // tokenizer.encode('', add_special_tokens=False) returns []
13005
13444
 
13006
- static const bool rtrim = true; //TODO: as param
13007
13445
  bool is_prev_special = false;
13008
- bool special_token_rtrim = false;
13009
13446
 
13010
13447
  if (add_special && vocab.special_add_bos != 0) {
13011
13448
  GGML_ASSERT(vocab.special_bos_id != -1);
@@ -13015,25 +13452,8 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
13015
13452
 
13016
13453
  for (const auto & fragment : fragment_buffer) {
13017
13454
  if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
13018
- // without adding this leading whitespace, we do not get the same results as the original tokenizer
13019
-
13020
- // TODO: It's likely possible to get rid of this string copy entirely
13021
- // by modifying llm_tokenizer_x to operate with string offsets like pre-tokenizer
13022
- // and passing 'add space prefix' as bool argument
13023
- //
13024
13455
  auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
13025
13456
 
13026
- if (special_token_rtrim) {
13027
- size_t num_whitespaces = 0;
13028
- while (isspace(raw_text[num_whitespaces])) {
13029
- num_whitespaces++;
13030
- }
13031
- if (num_whitespaces == raw_text.size()) {
13032
- continue; // skip if all whitespaces
13033
- }
13034
- raw_text = raw_text.substr(num_whitespaces);
13035
- }
13036
-
13037
13457
  if (vocab.add_space_prefix) {
13038
13458
  if (!output.size() || is_prev_special) { // prefix with space if first token
13039
13459
  raw_text = " " + raw_text;
@@ -13049,11 +13469,6 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
13049
13469
  } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
13050
13470
  output.push_back(fragment.token);
13051
13471
  is_prev_special = true;
13052
- // phi-3 special tokens without rtrim, works fine for llama-spm too
13053
- special_token_rtrim = rtrim
13054
- && fragment.token != vocab.special_bos_id
13055
- && fragment.token != vocab.special_unk_id
13056
- && fragment.token != vocab.special_eos_id;
13057
13472
  }
13058
13473
  }
13059
13474
 
@@ -14054,7 +14469,7 @@ void llama_sample_repetition_penalties(
14054
14469
 
14055
14470
  void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * candidates, const struct llama_grammar * grammar) {
14056
14471
  GGML_ASSERT(ctx);
14057
- const int64_t t_start_sample_us = ggml_time_us();
14472
+ int64_t t_start_sample_us = ggml_time_us();
14058
14473
 
14059
14474
  bool allow_eog = false;
14060
14475
  for (const auto & stack : grammar->stacks) {
@@ -14066,12 +14481,13 @@ void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * c
14066
14481
 
14067
14482
  std::vector<std::pair<std::vector<uint32_t>, llama_partial_utf8>> candidates_decoded;
14068
14483
  candidates_decoded.reserve(candidates->size);
14069
- std::vector<llama_grammar_candidate> candidates_grammar;
14484
+
14485
+ std::vector<llama_grammar_candidate> candidates_grammar;
14070
14486
  candidates_grammar.reserve(candidates->size);
14071
14487
 
14072
14488
  for (size_t i = 0; i < candidates->size; ++i) {
14073
- const llama_token id = candidates->data[i].id;
14074
- const std::string piece = llama_token_to_piece(ctx, id, false);
14489
+ const llama_token id = candidates->data[i].id;
14490
+ const std::string & piece = ctx->model.vocab.cache_token_to_piece.at(id);
14075
14491
 
14076
14492
  if (llama_token_is_eog(&ctx->model, id)) {
14077
14493
  if (!allow_eog) {
@@ -14271,7 +14687,7 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar
14271
14687
  GGML_ASSERT(false);
14272
14688
  }
14273
14689
 
14274
- const std::string piece = llama_token_to_piece(ctx, token, false);
14690
+ const std::string & piece = ctx->model.vocab.cache_token_to_piece.at(token);
14275
14691
 
14276
14692
  // Note terminating 0 in decoded string
14277
14693
  const auto decoded = decode_utf8(piece, grammar->partial_utf8);
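Grammar sampling and token acceptance now read the token text from a `cache_token_to_piece` table on the vocab instead of calling `llama_token_to_piece` for every candidate, which keeps repeated string construction out of the per-token sampling loop. A standalone sketch of the same build-once / reference-later pattern, with a stand-in detokenizer in place of the real one:

```cpp
// Cache token -> piece strings once, then reuse const references in hot loops.
#include <cstdio>
#include <string>
#include <vector>

static std::string token_to_piece(int id) {
    // stand-in for an expensive detokenization call
    return "piece_" + std::to_string(id);
}

int main() {
    const int n_vocab = 5;

    // built once, e.g. when the model is loaded
    std::vector<std::string> cache_token_to_piece;
    cache_token_to_piece.reserve(n_vocab);
    for (int id = 0; id < n_vocab; ++id) {
        cache_token_to_piece.push_back(token_to_piece(id));
    }

    // hot loop: take a const reference instead of rebuilding the string
    for (int id = 0; id < n_vocab; ++id) {
        const std::string & piece = cache_token_to_piece.at(id);
        printf("%s\n", piece.c_str());
    }
    return 0;
}
```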
@@ -14287,260 +14703,6 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar
14287
14703
  ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
14288
14704
  }
14289
14705
 
14290
- //
14291
- // Beam search
14292
- //
14293
-
14294
- struct llama_beam {
14295
- std::vector<llama_token> tokens;
14296
- float p; // Cumulative beam probability (renormalized relative to all beams)
14297
- bool eob; // Initialize end-of-beam to false. Callback sets this to true.
14298
- // Sort beams by probability. In case of ties, prefer beams at eob.
14299
- bool operator<(const llama_beam & rhs) const {
14300
- return std::make_pair(p, eob) < std::make_pair(rhs.p, rhs.eob);
14301
- }
14302
- // Shift off first n tokens and discard them.
14303
- void shift_tokens(const size_t n) {
14304
- if (n) {
14305
- std::copy(tokens.begin() + n, tokens.end(), tokens.begin());
14306
- tokens.resize(tokens.size() - n);
14307
- }
14308
- }
14309
- llama_beam_view view() const { return {tokens.data(), tokens.size(), p, eob}; }
14310
- };
14311
-
14312
- // A struct for calculating logit-related info.
14313
- struct llama_logit_info {
14314
- const float * const logits;
14315
- const int n_vocab;
14316
- const float max_l;
14317
- const float normalizer;
14318
- struct sum_exp {
14319
- float max_l;
14320
- float operator()(float sum, float l) const { return sum + std::exp(l - max_l); }
14321
- };
14322
- llama_logit_info(llama_context * ctx)
14323
- : logits(llama_get_logits(ctx))
14324
- , n_vocab(llama_n_vocab(llama_get_model(ctx)))
14325
- , max_l(*std::max_element(logits, logits + n_vocab))
14326
- , normalizer(1.0f / std::accumulate(logits, logits + n_vocab, 0.0f, sum_exp{max_l}))
14327
- { }
14328
- llama_token_data get_token_data(const llama_token token_id) const {
14329
- constexpr auto p = std::numeric_limits<float>::quiet_NaN(); // never used
14330
- return {token_id, logits[token_id], p};
14331
- }
14332
- // Return top k token_data by logit.
14333
- std::vector<llama_token_data> top_k(size_t k) {
14334
- std::vector<llama_token_data> min_heap; // min-heap by logit
14335
- const llama_token k_min = std::min(static_cast<llama_token>(k), n_vocab);
14336
- min_heap.reserve(k_min);
14337
- for (llama_token token_id = 0 ; token_id < k_min ; ++token_id) {
14338
- min_heap.push_back(get_token_data(token_id));
14339
- }
14340
- auto comp = [](const llama_token_data & a, const llama_token_data & b) { return a.logit > b.logit; };
14341
- std::make_heap(min_heap.begin(), min_heap.end(), comp);
14342
- for (llama_token token_id = k_min ; token_id < n_vocab ; ++token_id) {
14343
- if (min_heap.front().logit < logits[token_id]) {
14344
- std::pop_heap(min_heap.begin(), min_heap.end(), comp);
14345
- min_heap.back().id = token_id;
14346
- min_heap.back().logit = logits[token_id];
14347
- std::push_heap(min_heap.begin(), min_heap.end(), comp);
14348
- }
14349
- }
14350
- return min_heap;
14351
- }
14352
- float probability_from_logit(float logit) const {
14353
- return normalizer * std::exp(logit - max_l);
14354
- }
14355
- };
14356
-
14357
- struct llama_beam_search_data {
14358
- llama_context * ctx;
14359
- size_t n_beams;
14360
- int n_past;
14361
- int n_predict;
14362
- std::vector<llama_beam> beams;
14363
- std::vector<llama_beam> next_beams;
14364
-
14365
- // Re-calculated on each loop iteration
14366
- size_t common_prefix_length;
14367
-
14368
- // Used to communicate to/from callback on beams state.
14369
- std::vector<llama_beam_view> beam_views;
14370
-
14371
- llama_beam_search_data(llama_context * ctx, size_t n_beams, int n_past, int n_predict)
14372
- : ctx(ctx)
14373
- , n_beams(n_beams)
14374
- , n_past(n_past)
14375
- , n_predict(n_predict)
14376
- , beam_views(n_beams) {
14377
- beams.reserve(n_beams);
14378
- next_beams.reserve(n_beams);
14379
- }
14380
-
14381
- // Collapse beams to a single beam given by index.
14382
- void collapse_beams(const size_t beam_idx) {
14383
- if (0u < beam_idx) {
14384
- std::swap(beams[0], beams[beam_idx]);
14385
- }
14386
- beams.resize(1);
14387
- }
14388
-
14389
- // Min-heaps are used to efficiently collect the top-k elements (k=n_beams).
14390
- // The repetitive patterns below reflect the 2 stages of heaps:
14391
- // * Gather elements until the vector is full, then call std::make_heap() on it.
14392
- // * If the heap is full and a new element is found that should be included, pop the
14393
- // least element to the back(), replace it with the new, then push it into the heap.
14394
- void fill_next_beams_by_top_probabilities(llama_beam & beam) {
14395
- // Min-heaps use a greater-than comparator.
14396
- const auto comp = [](const llama_beam & a, const llama_beam & b) { return a.p > b.p; };
14397
- if (beam.eob) {
14398
- // beam is at end-of-sentence, so just copy it to next_beams if its probability is high enough.
14399
- if (next_beams.size() < n_beams) {
14400
- next_beams.push_back(std::move(beam));
14401
- if (next_beams.size() == n_beams) {
14402
- std::make_heap(next_beams.begin(), next_beams.end(), comp);
14403
- }
14404
- } else if (next_beams.front().p < beam.p) {
14405
- std::pop_heap(next_beams.begin(), next_beams.end(), comp);
14406
- next_beams.back() = std::move(beam);
14407
- std::push_heap(next_beams.begin(), next_beams.end(), comp);
- }
- } else {
- // beam is not at end-of-sentence, so branch with next top_k tokens.
- if (!beam.tokens.empty()) {
- llama_decode(ctx, llama_batch_get_one(beam.tokens.data(), beam.tokens.size(), n_past, 0));
- }
- llama_logit_info logit_info(ctx);
- std::vector<llama_token_data> next_tokens = logit_info.top_k(n_beams);
-
- // Clear the kv slot so that other beams may try different tokens at this position. The llama_decode()
- // call in loop() will conclusively fill in the kv slot once the beams converge at this position.
- llama_kv_cache_seq_rm(ctx, 0, n_past, -1);
-
- size_t i=0;
- if (next_beams.size() < n_beams) {
- for (; next_beams.size() < n_beams ; ++i) {
- llama_beam next_beam = beam;
- next_beam.tokens.push_back(next_tokens[i].id);
- next_beam.p *= logit_info.probability_from_logit(next_tokens[i].logit);
- next_beams.push_back(std::move(next_beam));
- }
- std::make_heap(next_beams.begin(), next_beams.end(), comp);
- } else {
- for (; next_beams.front().p == 0.0f ; ++i) {
- std::pop_heap(next_beams.begin(), next_beams.end(), comp);
- next_beams.back() = beam;
- next_beams.back().tokens.push_back(next_tokens[i].id);
- next_beams.back().p *= logit_info.probability_from_logit(next_tokens[i].logit);
- std::push_heap(next_beams.begin(), next_beams.end(), comp);
- }
- }
- for (; i < n_beams ; ++i) {
- const float next_p = beam.p * logit_info.probability_from_logit(next_tokens[i].logit);
- if (next_beams.front().p < next_p) {
- std::pop_heap(next_beams.begin(), next_beams.end(), comp);
- next_beams.back() = beam;
- next_beams.back().tokens.push_back(next_tokens[i].id);
- next_beams.back().p = next_p;
- std::push_heap(next_beams.begin(), next_beams.end(), comp);
- }
- }
- }
- }
-
- // Find common_prefix_length based on beams.
- // Requires beams is not empty.
- size_t find_common_prefix_length() {
- size_t common_prefix_length = beams[0].tokens.size();
- for (size_t i = 1 ; i < beams.size() ; ++i) {
- common_prefix_length = std::min(common_prefix_length, beams[i].tokens.size());
- for (size_t j = 0 ; j < common_prefix_length ; ++j) {
- if (beams[0].tokens[j] != beams[i].tokens[j]) {
- common_prefix_length = j;
- break;
- }
- }
- }
- return common_prefix_length;
- }
-
- // Construct beams_state to send back to caller via the callback function.
- // Side effect: set common_prefix_length = find_common_prefix_length();
- llama_beams_state get_beams_state(const bool last_call) {
- for (size_t i = 0 ; i < beams.size() ; ++i) {
- beam_views[i] = beams[i].view();
- }
- common_prefix_length = find_common_prefix_length();
- return {beam_views.data(), beams.size(), common_prefix_length, last_call};
- }
-
- // Loop:
- // * while i < n_predict, AND
- // * any of the beams have not yet reached end-of-beam (eob), AND
- // * the highest probability beam(s) (plural in case of ties) are not at end-of-sentence
- // (since all other beam probabilities can only decrease)
- void loop(const llama_beam_search_callback_fn_t callback, void * const callback_data) {
- beams.push_back({{}, 1.0f, false}); // Start with one empty beam w/ probability = 1.0 and !eob.
- const auto not_eob = [](const llama_beam & beam) { return !beam.eob; };
- for (int i = 0 ; i < n_predict && std::any_of(beams.begin(),beams.end(),not_eob) &&
- !beams[top_beam_index()].eob ; ++i) {
- callback(callback_data, get_beams_state(false)); // Sets common_prefix_length
- update_beams_from_beam_views(); // Update values (p,eob) that callback may have changed.
- if (common_prefix_length) {
- llama_decode(ctx, llama_batch_get_one(beams[0].tokens.data(), common_prefix_length, n_past, 0));
- n_past += common_prefix_length;
- }
- // Zero-out next_beam probabilities to place them last in following min-heap.
- std::for_each(next_beams.begin(), next_beams.end(), [](llama_beam & beam) { beam.p = 0.0f; });
- for (llama_beam & beam : beams) {
- beam.shift_tokens(common_prefix_length);
- fill_next_beams_by_top_probabilities(beam);
- }
- // next_beams become the beams of next/final iteration. Swap them to re-use memory.
- beams.swap(next_beams);
- renormalize_beam_probabilities(beams);
- }
- collapse_beams(top_beam_index());
- callback(callback_data, get_beams_state(true));
- }
-
- // As beams grow, the cumulative probabilities decrease.
- // Renormalize them to avoid floating point underflow.
- static void renormalize_beam_probabilities(std::vector<llama_beam> & beams) {
- const auto sum_p = [](float sum, llama_beam & beam) { return sum + beam.p; };
- const float inv_sum = 1.0f / std::accumulate(beams.begin(), beams.end(), 0.0f, sum_p);
- std::for_each(beams.begin(), beams.end(), [=](llama_beam & beam) { beam.p *= inv_sum; });
- }
-
- // Assumes beams is non-empty. Uses llama_beam::operator<() for ordering.
- size_t top_beam_index() {
- return std::max_element(beams.begin(), beams.end()) - beams.begin();
- }
-
- // Copy (p,eob) for each beam which may have been changed by the callback.
- void update_beams_from_beam_views() {
- for (size_t i = 0 ; i < beams.size() ; ++i) {
- beams[i].p = beam_views[i].p;
- beams[i].eob = beam_views[i].eob;
- }
- }
- };
-
- void llama_beam_search(llama_context * ctx,
- llama_beam_search_callback_fn_t callback, void * callback_data,
- size_t n_beams, int n_past, int n_predict) {
- assert(ctx);
- const int64_t t_start_sample_us = ggml_time_us();
-
- llama_beam_search_data beam_search_data(ctx, n_beams, n_past, n_predict);
-
- beam_search_data.loop(callback, callback_data);
-
- ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
- ctx->n_sample++;
- }
-
 //
 // quantization
 //
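Context for the removal above: the deleted llama_beam_search machinery kept one cumulative probability per beam and, as its own comment states, renormalized those probabilities every iteration so that repeated multiplication by per-token probabilities would not underflow. A minimal standalone sketch of that renormalization step, using a hypothetical Beam struct in place of the removed llama_beam (illustration only, not part of this changeset):

#include <numeric>
#include <vector>

struct Beam { std::vector<int> tokens; float p; }; // hypothetical stand-in for the removed llama_beam

// Scale every beam probability by 1 / sum(p), as the removed
// renormalize_beam_probabilities() did, keeping the values in float range.
static void renormalize(std::vector<Beam> & beams) {
    const float sum = std::accumulate(beams.begin(), beams.end(), 0.0f,
                                      [](float s, const Beam & b) { return s + b.p; });
    if (sum > 0.0f) {
        const float inv_sum = 1.0f / sum;
        for (Beam & b : beams) { b.p *= inv_sum; }
    }
}
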
@@ -15751,7 +15913,7 @@ bool llama_supports_mlock(void) {
 }

 bool llama_supports_gpu_offload(void) {
- #if defined(GGML_USE_CUDA) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL) || defined(GGML_USE_VULKAN) || \
+ #if defined(GGML_USE_CUDA) || defined(GGML_USE_METAL) || defined(GGML_USE_VULKAN) || \
 defined(GGML_USE_SYCL) || defined(GGML_USE_KOMPUTE) || defined(GGML_USE_RPC)
 // Defined when llama.cpp is compiled with support for offloading model layers to GPU.
 return true;
@@ -15808,7 +15970,7 @@ struct llama_model * llama_load_model_from_file(
 return true;
 };
 }
- if (params.rpc_servers != nullptr) {
+ if (params.rpc_servers != nullptr && params.rpc_servers[0] != '\0') {
 // split the servers set them into model->rpc_servers
 std::string servers(params.rpc_servers);
 size_t pos = 0;
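The tightened guard in this hunk skips the RPC path when rpc_servers is non-null but empty. For callers of the C API, a hedged sketch of how the field is filled in; the endpoint addresses are placeholders, and the comma-separated format is an assumption based on the splitting loop this hunk leads into:

#include "llama.h"

// Illustration only: load a model with two placeholder RPC endpoints.
// With the new check, an empty string disables the RPC path entirely.
static llama_model * load_with_rpc(const char * path) {
    llama_model_params mparams = llama_model_default_params();
    mparams.rpc_servers = "192.0.2.10:50052,192.0.2.11:50052"; // assumed comma-separated list
    return llama_load_model_from_file(path, mparams);
}
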
@@ -15862,6 +16024,11 @@ struct llama_context * llama_new_context_with_model(
 params.flash_attn = false;
 }

+ if (params.type_v != GGML_TYPE_F16 && !params.flash_attn) {
+ LLAMA_LOG_ERROR("%s: V cache quantization requires flash_attn\n", __func__);
+ return nullptr;
+ }
+
 llama_context * ctx = new llama_context(*model);

 const auto & hparams = model->hparams;
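The new early return above rejects a quantized (non-F16) V cache unless flash attention is enabled. A minimal sketch of context parameters that satisfy the check; the field names are taken from llama.h of this version, and the chosen cache type is illustrative:

#include "llama.h"

// Illustration only: a quantized KV cache now requires flash_attn = true,
// otherwise llama_new_context_with_model() logs the error above and returns nullptr.
static llama_context * context_with_quantized_kv(llama_model * model) {
    llama_context_params cparams = llama_context_default_params();
    cparams.flash_attn = true;           // required whenever type_v != GGML_TYPE_F16
    cparams.type_k     = GGML_TYPE_Q8_0; // illustrative cache quantization
    cparams.type_v     = GGML_TYPE_Q8_0;
    return llama_new_context_with_model(model, cparams);
}
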
@@ -15900,8 +16067,8 @@ struct llama_context * llama_new_context_with_model(

 cparams.n_ubatch = std::min(cparams.n_batch, params.n_ubatch == 0 ? params.n_batch : params.n_ubatch);

- cparams.n_yarn_orig_ctx = params.yarn_orig_ctx != 0 ? params.yarn_orig_ctx :
- hparams.n_yarn_orig_ctx != 0 ? hparams.n_yarn_orig_ctx :
+ cparams.n_ctx_orig_yarn = params.yarn_orig_ctx != 0 ? params.yarn_orig_ctx :
+ hparams.n_ctx_orig_yarn != 0 ? hparams.n_ctx_orig_yarn :
 hparams.n_ctx_train;

 cparams.cb_eval = params.cb_eval;
@@ -15966,17 +16133,7 @@ struct llama_context * llama_new_context_with_model(

 if (!hparams.vocab_only) {
 // initialize backends
- #if defined(GGML_USE_RPC)
- for (auto & server : model->rpc_servers) {
- ggml_backend_t backend = ggml_backend_rpc_init(server.c_str());
- if (backend == nullptr) {
- LLAMA_LOG_ERROR("%s: failed to connect RPC backend to %s\n", __func__, server.c_str());
- llama_free(ctx);
- return nullptr;
- }
- ctx->backends.push_back(backend);
- }
- #elif defined(GGML_USE_METAL)
+ #if defined(GGML_USE_METAL)
 if (model->n_gpu_layers > 0) {
 ctx->backend_metal = ggml_backend_metal_init();
 if (ctx->backend_metal == nullptr) {
@@ -16015,7 +16172,7 @@ struct llama_context * llama_new_context_with_model(
 return nullptr;
 }
 if (model->split_mode == LLAMA_SPLIT_MODE_NONE) {
- ggml_backend_t backend = ggml_backend_vk_init(0);
+ ggml_backend_t backend = ggml_backend_vk_init(model->main_gpu);
 if (backend == nullptr) {
 LLAMA_LOG_ERROR("%s: failed to initialize Vulkan backend\n", __func__);
 llama_free(ctx);
@@ -16068,6 +16225,19 @@ struct llama_context * llama_new_context_with_model(
 }
 ctx->backends.push_back(backend);
 }
+ #endif
+ #if defined(GGML_USE_RPC)
+ if (model->n_gpu_layers > 0) {
+ for (const auto & endpoint : model->rpc_servers) {
+ ggml_backend_t backend = ggml_backend_rpc_init(endpoint.c_str());
+ if (backend == nullptr) {
+ LLAMA_LOG_ERROR("%s: failed to initialize RPC to '%s'\n", __func__, endpoint.c_str());
+ llama_free(ctx);
+ return nullptr;
+ }
+ ctx->backends.push_back(backend);
+ }
+ }
 #endif
 ctx->backend_cpu = ggml_backend_cpu_init();
 if (ctx->backend_cpu == nullptr) {
@@ -16235,6 +16405,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
 case LLM_ARCH_COMMAND_R:
 case LLM_ARCH_OLMO:
 case LLM_ARCH_ARCTIC:
+ case LLM_ARCH_DEEPSEEK2:
 return LLAMA_ROPE_TYPE_NORM;

 // the pairs of head values are offset by n_rot/2
@@ -17849,9 +18020,9 @@ float llama_token_get_score(const struct llama_model * model, llama_token token)
 return model->vocab.id_to_token[token].score;
 }

- llama_token_type llama_token_get_type(const struct llama_model * model, llama_token token) {
+ llama_token_attr llama_token_get_attr(const struct llama_model * model, llama_token token) {
 GGML_ASSERT(model->vocab.type != LLAMA_VOCAB_TYPE_NONE);
- return model->vocab.id_to_token[token].type;
+ return model->vocab.id_to_token[token].attr;
 }

 bool llama_token_is_eog(const struct llama_model * model, llama_token token) {
@@ -17861,6 +18032,10 @@ bool llama_token_is_eog(const struct llama_model * model, llama_token token) {
 );
 }

+ bool llama_token_is_control(const struct llama_model * model, llama_token token) {
+ return llama_is_control_token(model->vocab, token);
+ }
+
 llama_token llama_token_bos(const struct llama_model * model) {
 return model->vocab.special_bos_id;
 }
@@ -17932,7 +18107,16 @@ static std::string llama_decode_text(const std::string & text) {

 const auto cpts = unicode_cpts_from_utf8(text);
 for (const auto cpt : cpts) {
- decoded_text += unicode_utf8_to_byte(unicode_cpt_to_utf8(cpt));
+ const auto utf8 = unicode_cpt_to_utf8(cpt);
+ try {
+ decoded_text += unicode_utf8_to_byte(utf8);
+ } catch (const std::out_of_range & e) {
+ decoded_text += "[UNK_BYTE_0x";
+ for (const auto c : utf8) {
+ decoded_text += format("%02x", (uint8_t) c);
+ }
+ decoded_text += text + "]";
+ }
 }

 return decoded_text;
@@ -17940,69 +18124,88 @@ static std::string llama_decode_text(const std::string & text) {

 // does not write null-terminator to buf
 int32_t llama_token_to_piece(const struct llama_model * model, llama_token token, char * buf, int32_t length, bool special) {
+ // ref: https://github.com/ggerganov/llama.cpp/pull/7587#discussion_r1620983843
+ if (!special && llama_is_control_token(model->vocab, token)) {
+ return 0;
+ }
+
+ // if we have a cache - use it
+ {
+ const auto & cache = model->vocab.cache_token_to_piece;
+
+ if (!cache.empty()) {
+ const auto & res = cache.at(token);
+ if (length < (int) res.size()) {
+ return -(int) res.size();
+ }
+ memcpy(buf, res.c_str(), res.size());
+ return res.size();
+ }
+ }
+
 if (0 <= token && token < llama_n_vocab(model)) {
 switch (llama_vocab_get_type(model->vocab)) {
- case LLAMA_VOCAB_TYPE_WPM:
- case LLAMA_VOCAB_TYPE_SPM: {
- // NOTE: we accept all unsupported token types,
- // suppressing them like CONTROL tokens.
- if (llama_is_normal_token(model->vocab, token)) {
- std::string result = model->vocab.id_to_token[token].text;
- llama_unescape_whitespace(result);
- if (length < (int) result.length()) {
- return -(int) result.length();
- }
- memcpy(buf, result.c_str(), result.length());
- return result.length();
- } else if (
- (llama_is_user_defined_token(model->vocab, token)) ||
- (llama_is_control_token (model->vocab, token) && special)) {
- std::string result = model->vocab.id_to_token[token].text;
- if (length < (int) result.length()) {
- return -(int) result.length();
- }
- memcpy(buf, result.c_str(), result.length());
- return result.length();
- } else if (llama_is_unknown_token(model->vocab, token)) { // NOLINT
- if (length < 3) {
- return -3;
- }
- memcpy(buf, "\xe2\x96\x85", 3);
- return 3;
- } else if (llama_is_byte_token(model->vocab, token)) {
- if (length < 1) {
- return -1;
+ case LLAMA_VOCAB_TYPE_WPM:
+ case LLAMA_VOCAB_TYPE_SPM: {
+ // NOTE: we accept all unsupported token types,
+ // suppressing them like CONTROL tokens.
+ if (llama_is_normal_token(model->vocab, token)) {
+ std::string result = model->vocab.id_to_token[token].text;
+ llama_unescape_whitespace(result);
+ if (length < (int) result.length()) {
+ return -(int) result.length();
+ }
+ memcpy(buf, result.c_str(), result.length());
+ return result.length();
+ } else if (
+ (llama_is_user_defined_token(model->vocab, token)) ||
+ (llama_is_control_token (model->vocab, token) && special)) {
+ std::string result = model->vocab.id_to_token[token].text;
+ if (length < (int) result.length()) {
+ return -(int) result.length();
+ }
+ memcpy(buf, result.c_str(), result.length());
+ return result.length();
+ } else if (llama_is_unknown_token(model->vocab, token)) { // NOLINT
+ if (length < 3) {
+ return -3;
+ }
+ memcpy(buf, "\xe2\x96\x85", 3);
+ return 3;
+ } else if (llama_is_byte_token(model->vocab, token)) {
+ if (length < 1) {
+ return -1;
+ }
+ buf[0] = llama_token_to_byte(model->vocab, token);
+ return 1;
 }
- buf[0] = llama_token_to_byte(model->vocab, token);
- return 1;
+ break;
 }
- break;
- }
- case LLAMA_VOCAB_TYPE_BPE: {
- // NOTE: we accept all unsupported token types,
- // suppressing them like CONTROL tokens.
- if (llama_is_normal_token(model->vocab, token)) {
- std::string result = model->vocab.id_to_token[token].text;
- result = llama_decode_text(result);
- if (length < (int) result.length()) {
- return -(int) result.length();
- }
- memcpy(buf, result.c_str(), result.length());
- return result.length();
- } else if (
- (llama_is_user_defined_token(model->vocab, token)) ||
- (llama_is_control_token (model->vocab, token) && special)) {
- std::string result = model->vocab.id_to_token[token].text;
- if (length < (int) result.length()) {
- return -(int) result.length();
+ case LLAMA_VOCAB_TYPE_BPE: {
+ // NOTE: we accept all unsupported token types,
+ // suppressing them like CONTROL tokens.
+ if (llama_is_normal_token(model->vocab, token)) {
+ std::string result = model->vocab.id_to_token[token].text;
+ result = llama_decode_text(result);
+ if (length < (int) result.length()) {
+ return -(int) result.length();
+ }
+ memcpy(buf, result.c_str(), result.length());
+ return result.length();
+ } else if (
+ (llama_is_user_defined_token(model->vocab, token)) ||
+ (llama_is_control_token (model->vocab, token) && special)) {
+ std::string result = model->vocab.id_to_token[token].text;
+ if (length < (int) result.length()) {
+ return -(int) result.length();
+ }
+ memcpy(buf, result.c_str(), result.length());
+ return result.length();
 }
- memcpy(buf, result.c_str(), result.length());
- return result.length();
+ break;
 }
- break;
- }
- default:
- GGML_ASSERT(false);
+ default:
+ GGML_ASSERT(false);
 }
 }
 return 0;
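As the hunk above shows, llama_token_to_piece still reports an undersized buffer by returning the needed length as a negative value, and with the new early return a control token yields zero bytes when special is false. A brief caller-side sketch of the resize-and-retry convention (illustration only):

#include <string>
#include <vector>
#include "llama.h"

// Convert a single token to text, growing the buffer if the first call
// reports (as a negative value) that more space is needed.
static std::string token_to_piece(const llama_model * model, llama_token token, bool special) {
    std::vector<char> buf(8);
    int32_t n = llama_token_to_piece(model, token, buf.data(), (int32_t) buf.size(), special);
    if (n < 0) {
        buf.resize((size_t) -n);
        n = llama_token_to_piece(model, token, buf.data(), (int32_t) buf.size(), special);
    }
    return std::string(buf.data(), n > 0 ? (size_t) n : 0);
}
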
@@ -18337,6 +18540,7 @@ const char * llama_print_system_info(void) {
 s += "AVX512_BF16 = " + std::to_string(ggml_cpu_has_avx512_bf16()) + " | ";
 s += "FMA = " + std::to_string(ggml_cpu_has_fma()) + " | ";
 s += "NEON = " + std::to_string(ggml_cpu_has_neon()) + " | ";
+ s += "SVE = " + std::to_string(ggml_cpu_has_sve()) + " | ";
 s += "ARM_FMA = " + std::to_string(ggml_cpu_has_arm_fma()) + " | ";
 s += "F16C = " + std::to_string(ggml_cpu_has_f16c()) + " | ";
 s += "FP16_VA = " + std::to_string(ggml_cpu_has_fp16_va()) + " | ";