llama_cpp 0.16.0 → 0.16.2

Files changed (142)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +13 -0
  3. data/ext/llama_cpp/extconf.rb +3 -0
  4. data/ext/llama_cpp/llama_cpp.cpp +14 -0
  5. data/lib/llama_cpp/version.rb +2 -2
  6. data/sig/llama_cpp.rbs +4 -0
  7. data/vendor/tmp/llama.cpp/Makefile +119 -54
  8. data/vendor/tmp/llama.cpp/ggml-alloc.c +78 -22
  9. data/vendor/tmp/llama.cpp/ggml-backend-impl.h +20 -8
  10. data/vendor/tmp/llama.cpp/ggml-backend.c +190 -65
  11. data/vendor/tmp/llama.cpp/ggml-backend.h +6 -3
  12. data/vendor/tmp/llama.cpp/ggml-blas.cpp +363 -0
  13. data/vendor/tmp/llama.cpp/ggml-blas.h +23 -0
  14. data/vendor/tmp/llama.cpp/ggml-common.h +6 -0
  15. data/vendor/tmp/llama.cpp/ggml-cuda/argsort.cu +1 -0
  16. data/vendor/tmp/llama.cpp/ggml-cuda/dmmv.cu +21 -9
  17. data/vendor/tmp/llama.cpp/ggml-cuda/fattn-tile-f16.cu +1 -1
  18. data/vendor/tmp/llama.cpp/ggml-cuda/mmq.cu +15 -1491
  19. data/vendor/tmp/llama.cpp/ggml-cuda/mmvq.cu +77 -62
  20. data/vendor/tmp/llama.cpp/ggml-cuda/quantize.cu +77 -10
  21. data/vendor/tmp/llama.cpp/ggml-cuda/softmax.cu +1 -0
  22. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu +1 -1
  23. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu +1 -1
  24. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu +1 -1
  25. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu +1 -1
  26. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu +1 -1
  27. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu +1 -1
  28. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu +1 -1
  29. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu +1 -1
  30. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu +1 -1
  31. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu +1 -1
  32. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu +1 -1
  33. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu +1 -1
  34. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu +1 -1
  35. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu +1 -1
  36. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu +1 -1
  37. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu +1 -1
  38. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu +1 -1
  39. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu +1 -1
  40. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu +1 -1
  41. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu +1 -1
  42. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu +1 -1
  43. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu +1 -1
  44. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu +1 -1
  45. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu +1 -1
  46. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu +1 -1
  47. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu +1 -1
  48. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu +1 -1
  49. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu +1 -1
  50. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu +1 -1
  51. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu +1 -1
  52. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu +1 -1
  53. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu +1 -1
  54. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu +1 -1
  55. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu +1 -1
  56. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu +1 -1
  57. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu +1 -1
  58. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu +1 -1
  59. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu +1 -1
  60. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu +1 -1
  61. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu +1 -1
  62. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu +1 -1
  63. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu +1 -1
  64. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu +1 -1
  65. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu +1 -1
  66. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu +1 -1
  67. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu +1 -1
  68. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu +1 -1
  69. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu +1 -1
  70. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu +1 -1
  71. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu +1 -1
  72. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu +1 -1
  73. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu +1 -1
  74. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu +1 -1
  75. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu +1 -1
  76. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu +1 -1
  77. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu +1 -1
  78. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu +1 -1
  79. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu +1 -1
  80. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu +1 -1
  81. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu +1 -1
  82. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu +1 -1
  83. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu +1 -1
  84. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu +1 -1
  85. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu +1 -1
  86. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu +1 -1
  87. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu +1 -1
  88. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu +1 -1
  89. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu +1 -1
  90. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu +1 -1
  91. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu +1 -1
  92. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu +1 -1
  93. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu +1 -1
  94. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu +1 -1
  95. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu +1 -1
  96. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu +1 -1
  97. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu +1 -1
  98. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu +1 -1
  99. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu +1 -1
  100. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu +1 -1
  101. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu +1 -1
  102. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu +1 -1
  103. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu +1 -1
  104. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu +1 -1
  105. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu +1 -1
  106. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu +1 -1
  107. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu +1 -1
  108. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb16.cu +1 -1
  109. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb32.cu +1 -1
  110. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb16.cu +1 -1
  111. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb32.cu +1 -1
  112. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb8.cu +1 -1
  113. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q2_k.cu +5 -0
  114. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q3_k.cu +5 -0
  115. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_0.cu +5 -0
  116. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_1.cu +5 -0
  117. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_k.cu +5 -0
  118. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_0.cu +5 -0
  119. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_1.cu +5 -0
  120. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_k.cu +5 -0
  121. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q6_k.cu +5 -0
  122. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q8_0.cu +5 -0
  123. data/vendor/tmp/llama.cpp/ggml-cuda/unary.cu +48 -0
  124. data/vendor/tmp/llama.cpp/ggml-cuda.cu +95 -129
  125. data/vendor/tmp/llama.cpp/ggml-impl.h +1 -1
  126. data/vendor/tmp/llama.cpp/ggml-kompute.cpp +8 -7
  127. data/vendor/tmp/llama.cpp/ggml-metal.m +17 -9
  128. data/vendor/tmp/llama.cpp/ggml-quants.c +982 -368
  129. data/vendor/tmp/llama.cpp/ggml-rpc.cpp +21 -15
  130. data/vendor/tmp/llama.cpp/ggml-sycl.cpp +2133 -13215
  131. data/vendor/tmp/llama.cpp/ggml-sycl.h +1 -10
  132. data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +28826 -25037
  133. data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +438 -493
  134. data/vendor/tmp/llama.cpp/ggml.c +158 -414
  135. data/vendor/tmp/llama.cpp/ggml.h +6 -0
  136. data/vendor/tmp/llama.cpp/llama.cpp +628 -279
  137. data/vendor/tmp/llama.cpp/llama.h +9 -1
  138. data/vendor/tmp/llama.cpp/sgemm.cpp +2 -0
  139. data/vendor/tmp/llama.cpp/unicode-data.cpp +851 -801
  140. data/vendor/tmp/llama.cpp/unicode.cpp +33 -19
  141. data/vendor/tmp/llama.cpp/unicode.h +1 -1
  142. metadata +15 -3
@@ -21,6 +21,10 @@
21
21
  # include "ggml-kompute.h"
22
22
  #endif
23
23
 
24
+ #ifdef GGML_USE_BLAS
25
+ # include "ggml-blas.h"
26
+ #endif
27
+
24
28
  #ifdef GGML_USE_METAL
25
29
  # include "ggml-metal.h"
26
30
  #endif
@@ -282,6 +286,7 @@ enum llm_kv {
282
286
  LLM_KV_LEADING_DENSE_BLOCK_COUNT,
283
287
  LLM_KV_FEED_FORWARD_LENGTH,
284
288
  LLM_KV_EXPERT_FEED_FORWARD_LENGTH,
289
+ LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH,
285
290
  LLM_KV_USE_PARALLEL_RESIDUAL,
286
291
  LLM_KV_TENSOR_DATA_LAYOUT,
287
292
  LLM_KV_EXPERT_COUNT,
@@ -360,21 +365,22 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
360
365
  { LLM_KV_GENERAL_SOURCE_URL, "general.source.url" },
361
366
  { LLM_KV_GENERAL_SOURCE_HF_REPO, "general.source.huggingface.repository" },
362
367
 
363
- { LLM_KV_VOCAB_SIZE, "%s.vocab_size" },
364
- { LLM_KV_CONTEXT_LENGTH, "%s.context_length" },
365
- { LLM_KV_EMBEDDING_LENGTH, "%s.embedding_length" },
366
- { LLM_KV_BLOCK_COUNT, "%s.block_count" },
367
- { LLM_KV_LEADING_DENSE_BLOCK_COUNT, "%s.leading_dense_block_count" },
368
- { LLM_KV_FEED_FORWARD_LENGTH, "%s.feed_forward_length" },
369
- { LLM_KV_EXPERT_FEED_FORWARD_LENGTH, "%s.expert_feed_forward_length" },
370
- { LLM_KV_USE_PARALLEL_RESIDUAL, "%s.use_parallel_residual" },
371
- { LLM_KV_TENSOR_DATA_LAYOUT, "%s.tensor_data_layout" },
372
- { LLM_KV_EXPERT_COUNT, "%s.expert_count" },
373
- { LLM_KV_EXPERT_USED_COUNT, "%s.expert_used_count" },
374
- { LLM_KV_EXPERT_SHARED_COUNT, "%s.expert_shared_count" },
375
- { LLM_KV_EXPERT_WEIGHTS_SCALE, "%s.expert_weights_scale" },
376
- { LLM_KV_POOLING_TYPE , "%s.pooling_type" },
377
- { LLM_KV_LOGIT_SCALE, "%s.logit_scale" },
368
+ { LLM_KV_VOCAB_SIZE, "%s.vocab_size" },
369
+ { LLM_KV_CONTEXT_LENGTH, "%s.context_length" },
370
+ { LLM_KV_EMBEDDING_LENGTH, "%s.embedding_length" },
371
+ { LLM_KV_BLOCK_COUNT, "%s.block_count" },
372
+ { LLM_KV_LEADING_DENSE_BLOCK_COUNT, "%s.leading_dense_block_count" },
373
+ { LLM_KV_FEED_FORWARD_LENGTH, "%s.feed_forward_length" },
374
+ { LLM_KV_EXPERT_FEED_FORWARD_LENGTH, "%s.expert_feed_forward_length" },
375
+ { LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, "%s.expert_shared_feed_forward_length" },
376
+ { LLM_KV_USE_PARALLEL_RESIDUAL, "%s.use_parallel_residual" },
377
+ { LLM_KV_TENSOR_DATA_LAYOUT, "%s.tensor_data_layout" },
378
+ { LLM_KV_EXPERT_COUNT, "%s.expert_count" },
379
+ { LLM_KV_EXPERT_USED_COUNT, "%s.expert_used_count" },
380
+ { LLM_KV_EXPERT_SHARED_COUNT, "%s.expert_shared_count" },
381
+ { LLM_KV_EXPERT_WEIGHTS_SCALE, "%s.expert_weights_scale" },
382
+ { LLM_KV_POOLING_TYPE , "%s.pooling_type" },
383
+ { LLM_KV_LOGIT_SCALE, "%s.logit_scale" },
378
384
 
379
385
  { LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count" },
380
386
  { LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" },
@@ -704,6 +710,7 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
704
710
  { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
705
711
  { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
706
712
  { LLM_TENSOR_TOKEN_TYPES, "token_types" },
713
+ { LLM_TENSOR_ATTN_NORM_2, "blk.%d.attn_norm_2" },
707
714
  { LLM_TENSOR_ATTN_OUT_NORM, "blk.%d.attn_output_norm" },
708
715
  { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
709
716
  { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
@@ -1273,6 +1280,126 @@ struct no_init {
1273
1280
  };
1274
1281
 
1275
1282
  struct llama_file {
1283
+
1284
+ #if defined(_WIN32)
1285
+ // use FILE * so we don't have to re-open the file to mmap
1286
+ FILE * fp;
1287
+ HANDLE fp_win32;
1288
+ size_t size;
1289
+
1290
+ private:
1291
+ std::string GetErrorMessageWin32(DWORD error_code) const {
1292
+ std::string ret;
1293
+ LPSTR lpMsgBuf = NULL;
1294
+ DWORD bufLen = FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS,
1295
+ NULL, error_code, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), (LPSTR)&lpMsgBuf, 0, NULL);
1296
+ if (!bufLen) {
1297
+ ret = format("Win32 error code: %s", error_code);
1298
+ } else {
1299
+ ret = lpMsgBuf;
1300
+ LocalFree(lpMsgBuf);
1301
+ }
1302
+
1303
+ return ret;
1304
+ }
1305
+
1306
+ public:
1307
+
1308
+ llama_file(const char * fname, const char * mode) {
1309
+ fp = ggml_fopen(fname, mode);
1310
+ if (fp == NULL) {
1311
+ throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno)));
1312
+ }
1313
+ fp_win32 = (HANDLE) _get_osfhandle(_fileno(fp));
1314
+ seek(0, SEEK_END);
1315
+ size = tell();
1316
+ seek(0, SEEK_SET);
1317
+ }
1318
+
1319
+ size_t tell() const {
1320
+ // SetFilePointerEx returns the current position when seeking relative 0 bytes
1321
+ LARGE_INTEGER li;
1322
+ li.QuadPart = 0;
1323
+ BOOL ret = SetFilePointerEx(fp_win32, li, &li, FILE_CURRENT);
1324
+ if (!ret) {
1325
+ throw std::runtime_error(format("read error: %s", GetErrorMessageWin32(GetLastError()).c_str()));
1326
+ }
1327
+
1328
+ return li.QuadPart;
1329
+ }
1330
+
1331
+ void seek(size_t offset, int whence) const {
1332
+ // no need to convert SEEK_* to FILE_*. The enums are the same.
1333
+ // Still, keep static asserts to avoid failures in the future.
1334
+ static_assert(SEEK_SET == FILE_BEGIN, "SEEK_SET != FILE_BEGIN");
1335
+ static_assert(SEEK_CUR == FILE_CURRENT, "SEEK_CUR != FILE_CURRENT");
1336
+ static_assert(SEEK_END == FILE_END, "SEEK_END != FILE_END");
1337
+
1338
+ LARGE_INTEGER li;
1339
+ li.QuadPart = offset;
1340
+ BOOL ret = SetFilePointerEx(fp_win32, li, NULL, whence);
1341
+ if (!ret) {
1342
+ throw std::runtime_error(format("read error: %s", GetErrorMessageWin32(GetLastError()).c_str()));
1343
+ }
1344
+ }
1345
+
1346
+ void read_raw(void * ptr, size_t len) const {
1347
+ // On Win32 ReadFile is significant faster than fread which is again significant faster than std::fstream. Thus
1348
+ // use the Win32 API to do file io instead of the C/C++ library functions.
1349
+
1350
+ // There are conditions under which ReadFile cannot read chunks >64MB.
1351
+ // Thus split the operation into smaller chunks if len exceeds this limit.
1352
+ size_t bytes_read = 0;
1353
+ while (bytes_read < len) {
1354
+ size_t chunk_size = std::min<size_t>(len - bytes_read, 64*1024*1024);
1355
+ DWORD chunk_read = 0;
1356
+ BOOL result = ReadFile(fp_win32, reinterpret_cast<char*>(ptr) + bytes_read, chunk_size, &chunk_read, NULL);
1357
+ if (!result) {
1358
+ throw std::runtime_error(format("read error: %s", GetErrorMessageWin32(GetLastError()).c_str()));
1359
+ }
1360
+ if (chunk_read < chunk_size || chunk_read == 0) {
1361
+ throw std::runtime_error("unexpectedly reached end of file");
1362
+ }
1363
+
1364
+ bytes_read += chunk_read;
1365
+ } ;
1366
+ }
1367
+
1368
+ uint32_t read_u32() const {
1369
+ uint32_t val;
1370
+ read_raw(&val, sizeof(val));
1371
+ return val;
1372
+ }
1373
+
1374
+ void write_raw(const void * ptr, size_t len) const {
1375
+ // There are conditions under which WriteFile cannot write chunks >64MB.
1376
+ // Thus split the operation into smaller chunks if len exceeds this limit.
1377
+ size_t bytes_written = 0;
1378
+ while (bytes_written < len) {
1379
+ size_t chunk_size = std::min<size_t>(len - bytes_written, 64*1024*1024);
1380
+ DWORD chunk_written = 0;
1381
+ BOOL result = WriteFile(fp_win32, reinterpret_cast<char const*>(ptr) + bytes_written, chunk_size, &chunk_written, NULL);
1382
+ if (!result) {
1383
+ throw std::runtime_error(format("write error: %s", GetErrorMessageWin32(GetLastError()).c_str()));
1384
+ }
1385
+ if (chunk_written < chunk_size || chunk_written == 0) {
1386
+ throw std::runtime_error("unexpectedly failed to write bytes");
1387
+ }
1388
+
1389
+ bytes_written += chunk_written;
1390
+ }
1391
+ }
1392
+
1393
+ void write_u32(std::uint32_t val) const {
1394
+ write_raw(&val, sizeof(val));
1395
+ }
1396
+
1397
+ ~llama_file() {
1398
+ if (fp) {
1399
+ std::fclose(fp);
1400
+ }
1401
+ }
1402
+ #else
1276
1403
  // use FILE * so we don't have to re-open the file to mmap
1277
1404
  FILE * fp;
1278
1405
  size_t size;
@@ -1293,7 +1420,10 @@ struct llama_file {
1293
1420
  #else
1294
1421
  long ret = std::ftell(fp);
1295
1422
  #endif
1296
- GGML_ASSERT(ret != -1); // this really shouldn't fail
1423
+ if (ret == -1) {
1424
+ throw std::runtime_error(format("ftell error: %s", strerror(errno)));
1425
+ }
1426
+
1297
1427
  return (size_t) ret;
1298
1428
  }
1299
1429
 
@@ -1303,7 +1433,9 @@ struct llama_file {
1303
1433
  #else
1304
1434
  int ret = std::fseek(fp, (long) offset, whence);
1305
1435
  #endif
1306
- GGML_ASSERT(ret == 0); // same
1436
+ if (ret != 0) {
1437
+ throw std::runtime_error(format("seek error: %s", strerror(errno)));
1438
+ }
1307
1439
  }
1308
1440
 
1309
1441
  void read_raw(void * ptr, size_t len) const {
@@ -1346,6 +1478,7 @@ struct llama_file {
1346
1478
  std::fclose(fp);
1347
1479
  }
1348
1480
  }
1481
+ #endif
1349
1482
  };
1350
1483
  using llama_files = std::vector<std::unique_ptr<llama_file>>;
1351
1484
 
@@ -1839,6 +1972,7 @@ struct llama_hparams {
1839
1972
  uint32_t n_lora_q = 0;
1840
1973
  uint32_t n_lora_kv = 0;
1841
1974
  uint32_t n_ff_exp = 0;
1975
+ uint32_t n_ff_shexp = 0;
1842
1976
  uint32_t n_expert_shared = 0;
1843
1977
  float expert_weights_scale = 0.0;
1844
1978
 
@@ -1887,6 +2021,7 @@ struct llama_hparams {
1887
2021
  if (this->n_lora_q != other.n_lora_q) return true;
1888
2022
  if (this->n_lora_kv != other.n_lora_kv) return true;
1889
2023
  if (this->n_ff_exp != other.n_ff_exp) return true;
2024
+ if (this->n_ff_shexp != other.n_ff_shexp) return true;
1890
2025
  if (this->n_expert_shared != other.n_expert_shared) return true;
1891
2026
 
1892
2027
  if (this->rope_finetuned != other.rope_finetuned) return true;
@@ -2158,6 +2293,8 @@ struct llama_vocab {
2158
2293
  enum llama_vocab_type type = LLAMA_VOCAB_TYPE_SPM;
2159
2294
  enum llama_vocab_pre_type type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
2160
2295
 
2296
+ int max_token_len = 0; // used for optimizing longest token search
2297
+
2161
2298
  std::unordered_map<token, id> token_to_id;
2162
2299
  std::vector<token_data> id_to_token;
2163
2300
 
@@ -2175,16 +2312,17 @@ struct llama_vocab {
2175
2312
  id special_cls_id = -1;
2176
2313
  id special_mask_id = -1;
2177
2314
 
2178
- int special_add_bos = -1; // -1 unknown, 1 add, 0 don't add.
2179
- int special_add_eos = -1; // -1 unknown, 1 add, 0 don't add.
2180
-
2181
2315
  id linefeed_id = 13;
2182
2316
  id special_prefix_id = -1;
2183
2317
  id special_suffix_id = -1;
2184
2318
  id special_middle_id = -1;
2185
2319
  id special_eot_id = -1; // TODO: move above after "eos_id", and here add "file separator" token
2186
2320
 
2187
- bool add_space_prefix = true;
2321
+ // tokenizer flags
2322
+ bool tokenizer_add_space_prefix = true;
2323
+ bool tokenizer_add_bos = false;
2324
+ bool tokenizer_add_eos = false;
2325
+ bool tokenizer_ignore_merges = false;
2188
2326
 
2189
2327
  int find_bpe_rank(const std::string & token_left, const std::string & token_right) const {
2190
2328
  GGML_ASSERT(token_left.find(' ') == std::string::npos);
@@ -2298,9 +2436,13 @@ struct llama_context {
2298
2436
  std::vector<ggml_backend_t> backends;
2299
2437
  #ifdef GGML_USE_METAL
2300
2438
  ggml_backend_t backend_metal = nullptr;
2439
+ #endif
2440
+ #ifdef GGML_USE_BLAS
2441
+ ggml_backend_t backend_blas = nullptr;
2301
2442
  #endif
2302
2443
  ggml_backend_t backend_cpu = nullptr;
2303
2444
 
2445
+
2304
2446
  const llama_model & model;
2305
2447
 
2306
2448
  // key + value cache for the self attention
@@ -3712,6 +3854,44 @@ struct llama_model_loader {
3712
3854
  std::vector<no_init<uint8_t>> read_buf;
3713
3855
  std::vector<std::future<std::pair<ggml_tensor *, bool>>> validation_result;
3714
3856
 
3857
+ #if defined(GGML_USE_CUDA)
3858
+ // 4 staging buffers for async uploads, each sized 1MB seems to be a good default for single NVMe drives.
3859
+ // NVMe raid configurations might require more / larger buffers.
3860
+ constexpr size_t num_buffers = 4;
3861
+ constexpr size_t buffer_size = 1 * 1024 * 1024; // 1MB
3862
+
3863
+ std::vector<ggml_backend_buffer_t> host_buffers;
3864
+ std::vector<void*> host_ptrs;
3865
+ std::vector<ggml_backend_event_t> events;
3866
+ size_t buffer_idx = 0; // buffer to use for async loads
3867
+
3868
+ ggml_backend_t cuda_backend = nullptr;
3869
+ if (!use_mmap && !check_tensors) {
3870
+ // When not using mmaped io use async uploads from pinned memory to GPU memory.
3871
+ // First determine if the CUDA backend is active, and if so, determine the device ID.
3872
+ ggml_backend_buffer_t buf = bufs_mmap.count(0) ? bufs_mmap.at(0) : nullptr;
3873
+ if (buf) {
3874
+ ggml_backend_buffer_type_t buffer_type = ggml_backend_buffer_get_type(buf);
3875
+ for (int i = 0; i < ggml_backend_cuda_get_device_count(); ++i) {
3876
+ auto * cuda_buffer_type = ggml_backend_cuda_buffer_type(i);
3877
+ if (buffer_type == cuda_buffer_type) {
3878
+ cuda_backend = ggml_backend_cuda_init(i);
3879
+ break;
3880
+ }
3881
+ }
3882
+ }
3883
+
3884
+ // If the cuda backend is active create pinned memory buffers and events for synchronisation.
3885
+ if (cuda_backend) {
3886
+ for (size_t idx = 0; idx < num_buffers; ++idx) {
3887
+ host_buffers.emplace_back(ggml_backend_buft_alloc_buffer(llama_default_buffer_type_cpu(true), buffer_size));
3888
+ host_ptrs.emplace_back(ggml_backend_buffer_get_base(host_buffers[idx]));
3889
+ events.emplace_back(ggml_backend_event_new(cuda_backend));
3890
+ }
3891
+ }
3892
+ }
3893
+ #endif
3894
+
3715
3895
  for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx); cur != NULL; cur = ggml_get_next_tensor(ctx, cur)) {
3716
3896
  const auto * weight = get_weight(ggml_get_name(cur));
3717
3897
  if (weight == nullptr) {
@@ -3767,12 +3947,36 @@ struct llama_model_loader {
3767
3947
  }));
3768
3948
  }
3769
3949
  } else {
3770
- read_buf.resize(n_size);
3771
- file->seek(weight->offs, SEEK_SET);
3772
- file->read_raw(read_buf.data(), n_size);
3773
- ggml_backend_tensor_set(cur, read_buf.data(), 0, n_size);
3774
- if (check_tensors && !ggml_validate_row_data(cur->type, read_buf.data(), n_size)) {
3775
- throw std::runtime_error(format("tensor '%s' has invalid data", ggml_get_name(cur)));
3950
+ #if defined(GGML_USE_CUDA)
3951
+ // If cuda_backend is valid load the tensor in chunks to pinned memory and upload the buffers asynchronously to the GPU.
3952
+ if (cuda_backend) {
3953
+ file->seek(weight->offs, SEEK_SET);
3954
+
3955
+ size_t bytes_read = 0;
3956
+
3957
+ while (bytes_read < n_size) {
3958
+ size_t read_iteration = std::min<size_t>(buffer_size, n_size - bytes_read);
3959
+
3960
+ ggml_backend_event_synchronize(events[buffer_idx]);
3961
+ file->read_raw(host_ptrs[buffer_idx], read_iteration);
3962
+ ggml_backend_tensor_set_async(cuda_backend, cur, host_ptrs[buffer_idx], bytes_read, read_iteration);
3963
+ ggml_backend_event_record(events[buffer_idx]);
3964
+
3965
+ bytes_read += read_iteration;
3966
+ ++buffer_idx;
3967
+ buffer_idx %= num_buffers;
3968
+ }
3969
+ }
3970
+ else
3971
+ #endif
3972
+ {
3973
+ read_buf.resize(n_size);
3974
+ file->seek(weight->offs, SEEK_SET);
3975
+ file->read_raw(read_buf.data(), n_size);
3976
+ ggml_backend_tensor_set(cur, read_buf.data(), 0, n_size);
3977
+ if (check_tensors && !ggml_validate_row_data(cur->type, read_buf.data(), n_size)) {
3978
+ throw std::runtime_error(format("tensor '%s' has invalid data", ggml_get_name(cur)));
3979
+ }
3776
3980
  }
3777
3981
  }
3778
3982
  }
@@ -3780,6 +3984,18 @@ struct llama_model_loader {
3780
3984
  size_done += n_size;
3781
3985
  }
3782
3986
 
3987
+ #if defined(GGML_USE_CUDA)
3988
+ // free temporary resources used for async cuda uploads
3989
+ if (cuda_backend) {
3990
+ for (size_t idx = 0; idx < num_buffers;++idx) {
3991
+ ggml_backend_event_synchronize(events[idx]);
3992
+ ggml_backend_event_free(events[idx]);
3993
+ ggml_backend_buffer_free(host_buffers[idx]);
3994
+ }
3995
+ ggml_backend_free(cuda_backend);
3996
+ }
3997
+ #endif
3998
+
3783
3999
  // check validation results
3784
4000
  bool validation_failed = false;
3785
4001
  for (auto & future : validation_result) {
@@ -4246,6 +4462,9 @@ static void llm_load_hparams(
4246
4462
  } break;
4247
4463
  case LLM_ARCH_QWEN2MOE:
4248
4464
  {
4465
+ ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
4466
+ ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false);
4467
+
4249
4468
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
4250
4469
  switch (hparams.n_layer) {
4251
4470
  case 24: model.type = e_model::MODEL_A2_7B; break;
@@ -4552,38 +4771,9 @@ static void llm_load_vocab(
4552
4771
  vocab.special_cls_id = -1;
4553
4772
  vocab.special_mask_id = -1;
4554
4773
 
4555
- // For Fill-In-the-Middle (FIM)/infill models which where converted
4556
- // prior to support of FIM special tokens in GGUF, the following
4557
- // will allow those models to continue to work. The general names
4558
- // of the known models are currently CodeLlama (LLM_ARCH_LLAMA) and
4559
- // CodeGemma (LLM_ARCH_GEMMA). This can potentially be removed once
4560
- // new versions of these models have been published.
4561
- std::string gen_name;
4562
- ml.get_key(LLM_KV_GENERAL_NAME, gen_name, false);
4563
-
4564
- std::transform(gen_name.begin(), gen_name.end(), gen_name.begin(),
4565
- [](unsigned char c){ return std::tolower(c); });
4566
-
4567
- if (gen_name.find("code") != std::string::npos) {
4568
- if (model.arch == LLM_ARCH_LLAMA) {
4569
- vocab.special_prefix_id = 32007;
4570
- vocab.special_suffix_id = 32008;
4571
- vocab.special_middle_id = 32009;
4572
- vocab.special_eot_id = 32010;
4573
- } else if (model.arch == LLM_ARCH_GEMMA) {
4574
- vocab.special_prefix_id = 67;
4575
- vocab.special_suffix_id = 69;
4576
- vocab.special_middle_id = 68;
4577
- // TODO: this is not EOT, it is "file separator" token, needs fix
4578
- // https://huggingface.co/google/codegemma-7b-it/blob/9b1d9231388358c04d90bd003458f5070d97db44/tokenizer_config.json#L565-L572
4579
- //vocab.special_eot_id = 70;
4580
- vocab.special_eot_id = 107;
4581
- }
4582
- }
4583
-
4584
4774
  const int add_space_prefix_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_ADD_PREFIX).c_str());
4585
4775
  if (add_space_prefix_keyidx != -1) {
4586
- vocab.add_space_prefix = gguf_get_val_bool(ctx, add_space_prefix_keyidx);
4776
+ vocab.tokenizer_add_space_prefix = gguf_get_val_bool(ctx, add_space_prefix_keyidx);
4587
4777
  } // The default value of add_space_prefix is true.
4588
4778
  } else if (tokenizer_model == "bert") {
4589
4779
  vocab.type = LLAMA_VOCAB_TYPE_WPM;
@@ -4596,13 +4786,13 @@ static void llm_load_vocab(
4596
4786
  vocab.special_pad_id = 0;
4597
4787
  vocab.special_cls_id = 101;
4598
4788
  vocab.special_mask_id = 103;
4599
- vocab.add_space_prefix = false;
4789
+ vocab.tokenizer_add_space_prefix = false;
4600
4790
  } else if (tokenizer_model == "gpt2") {
4601
4791
  vocab.type = LLAMA_VOCAB_TYPE_BPE;
4602
4792
 
4603
4793
  const int add_space_prefix_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_ADD_PREFIX).c_str());
4604
4794
  if (add_space_prefix_keyidx != -1) {
4605
- vocab.add_space_prefix = gguf_get_val_bool(ctx, add_space_prefix_keyidx);
4795
+ vocab.tokenizer_add_space_prefix = gguf_get_val_bool(ctx, add_space_prefix_keyidx);
4606
4796
  }
4607
4797
 
4608
4798
  // read bpe merges and populate bpe ranks
@@ -4653,14 +4843,15 @@ static void llm_load_vocab(
4653
4843
  LLAMA_LOG_WARN("%s: ************************************ \n", __func__);
4654
4844
  LLAMA_LOG_WARN("%s: \n", __func__);
4655
4845
  vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
4656
- } else if (
4657
- tokenizer_pre == "default") {
4846
+ } else if (tokenizer_pre == "default") {
4658
4847
  vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
4659
4848
  } else if (
4660
4849
  tokenizer_pre == "llama3" ||
4661
4850
  tokenizer_pre == "llama-v3" ||
4662
4851
  tokenizer_pre == "llama-bpe") {
4663
4852
  vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_LLAMA3;
4853
+ vocab.tokenizer_ignore_merges = true;
4854
+ vocab.tokenizer_add_bos = true;
4664
4855
  } else if (
4665
4856
  tokenizer_pre == "deepseek-llm") {
4666
4857
  vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM;
@@ -4681,7 +4872,8 @@ static void llm_load_vocab(
4681
4872
  tokenizer_pre == "jina-es" ||
4682
4873
  tokenizer_pre == "jina-de" ||
4683
4874
  tokenizer_pre == "jina-v2-es" ||
4684
- tokenizer_pre == "jina-v2-de") {
4875
+ tokenizer_pre == "jina-v2-de" ||
4876
+ tokenizer_pre == "jina-v2-code") {
4685
4877
  vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_GPT2;
4686
4878
  } else if (
4687
4879
  tokenizer_pre == "refact") {
@@ -4704,9 +4896,20 @@ static void llm_load_vocab(
4704
4896
  } else if (
4705
4897
  tokenizer_pre == "smaug-bpe") {
4706
4898
  vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_SMAUG;
4899
+ } else if (
4900
+ tokenizer_pre == "poro-chat") {
4901
+ vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_PORO;
4707
4902
  } else {
4708
4903
  throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
4709
4904
  }
4905
+ } else if (vocab.type == LLAMA_VOCAB_TYPE_SPM) {
4906
+ vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
4907
+ vocab.tokenizer_add_bos = true;
4908
+ vocab.tokenizer_add_eos = false;
4909
+ } else if (vocab.type == LLAMA_VOCAB_TYPE_WPM) {
4910
+ vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
4911
+ vocab.tokenizer_add_bos = true;
4912
+ vocab.tokenizer_add_eos = false;
4710
4913
  } else {
4711
4914
  vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
4712
4915
  }
@@ -4738,6 +4941,7 @@ static void llm_load_vocab(
4738
4941
  GGML_ASSERT(unicode_cpts_from_utf8(word).size() > 0);
4739
4942
 
4740
4943
  vocab.token_to_id[word] = i;
4944
+ vocab.max_token_len = std::max(vocab.max_token_len, (int) word.size());
4741
4945
 
4742
4946
  auto & token_data = vocab.id_to_token[i];
4743
4947
  token_data.text = std::move(word);
@@ -4761,6 +4965,45 @@ static void llm_load_vocab(
4761
4965
 
4762
4966
  // determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n'
4763
4967
  if (vocab.type == LLAMA_VOCAB_TYPE_SPM) {
4968
+ // For Fill-In-the-Middle (FIM)/infill models which where converted
4969
+ // prior to support of FIM special tokens in GGUF, the following
4970
+ // will allow those models to continue to work. The general names
4971
+ // of the known models are currently CodeLlama (LLM_ARCH_LLAMA) and
4972
+ // CodeGemma (LLM_ARCH_GEMMA). This can potentially be removed once
4973
+ // new versions of these models have been published.
4974
+ std::string gen_name;
4975
+ ml.get_key(LLM_KV_GENERAL_NAME, gen_name, false);
4976
+
4977
+ std::transform(gen_name.begin(), gen_name.end(), gen_name.begin(),
4978
+ [](unsigned char c){ return std::tolower(c); });
4979
+
4980
+ if (gen_name.find("code") != std::string::npos) {
4981
+ if (model.arch == LLM_ARCH_LLAMA
4982
+ && 32010 < vocab.id_to_token.size()
4983
+ && vocab.id_to_token[32007].text == "<PRE>"
4984
+ && vocab.id_to_token[32008].text == "<SUF>"
4985
+ && vocab.id_to_token[32009].text == "<MID>"
4986
+ && vocab.id_to_token[32010].text == "<EOT>") {
4987
+ vocab.special_prefix_id = 32007;
4988
+ vocab.special_suffix_id = 32008;
4989
+ vocab.special_middle_id = 32009;
4990
+ vocab.special_eot_id = 32010;
4991
+ } else if (model.arch == LLM_ARCH_GEMMA
4992
+ && 107 < vocab.id_to_token.size()
4993
+ && vocab.id_to_token[67].text == "<|fim_prefix|>"
4994
+ && vocab.id_to_token[69].text == "<|fim_suffix|>"
4995
+ && vocab.id_to_token[68].text == "<|fim_middle|>"
4996
+ && vocab.id_to_token[107].text == "<end_of_turn>") {
4997
+ vocab.special_prefix_id = 67;
4998
+ vocab.special_suffix_id = 69;
4999
+ vocab.special_middle_id = 68;
5000
+ // TODO: this is not EOT, it is "file separator" token, needs fix
5001
+ // https://huggingface.co/google/codegemma-7b-it/blob/9b1d9231388358c04d90bd003458f5070d97db44/tokenizer_config.json#L565-L572
5002
+ //vocab.special_eot_id = 70;
5003
+ vocab.special_eot_id = 107;
5004
+ }
5005
+ }
5006
+
4764
5007
  try {
4765
5008
  vocab.linefeed_id = llama_byte_to_token(vocab, '\n');
4766
5009
  } catch (const std::exception & e) {
@@ -4812,10 +5055,10 @@ static void llm_load_vocab(
4812
5055
  bool temp = true;
4813
5056
 
4814
5057
  if (ml.get_key(LLM_KV_TOKENIZER_ADD_BOS, temp, false)) {
4815
- vocab.special_add_bos = int(temp);
5058
+ vocab.tokenizer_add_bos = temp;
4816
5059
  }
4817
5060
  if (ml.get_key(LLM_KV_TOKENIZER_ADD_EOS, temp, false)) {
4818
- vocab.special_add_eos = int(temp);
5061
+ vocab.tokenizer_add_eos = temp;
4819
5062
  }
4820
5063
  }
4821
5064
 
@@ -4915,7 +5158,7 @@ static void llm_load_vocab(
4915
5158
  );
4916
5159
 
4917
5160
  // set attributes by model/tokenizer name
4918
- if (_contains_any(tokenizer_pre, {"jina-v2-es", "jina-v2-de"})) {
5161
+ if (_contains_any(tokenizer_pre, {"jina-v2-de", "jina-v2-es", "jina-v2-code"})) {
4919
5162
  _set_token_attr("<mask>", LLAMA_TOKEN_ATTR_LSTRIP, true);
4920
5163
  } else if (_contains_any(model_name, {"phi-3", "phi3"})) {
4921
5164
  for (auto id : vocab.cache_special_tokens) {
@@ -5009,6 +5252,8 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
5009
5252
  if (vocab.special_middle_id != -1) { LLAMA_LOG_INFO( "%s: MID token = %d '%s'\n", __func__, vocab.special_middle_id, vocab.id_to_token[vocab.special_middle_id].text.c_str() ); }
5010
5253
  if (vocab.special_eot_id != -1) { LLAMA_LOG_INFO( "%s: EOT token = %d '%s'\n", __func__, vocab.special_eot_id, vocab.id_to_token[vocab.special_eot_id].text.c_str() ); }
5011
5254
 
5255
+ LLAMA_LOG_INFO("%s: max token length = %d\n", __func__, vocab.max_token_len);
5256
+
5012
5257
  if (model.arch == LLM_ARCH_DEEPSEEK2) {
5013
5258
  LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
5014
5259
  LLAMA_LOG_INFO("%s: n_lora_q = %d\n", __func__, hparams.n_lora_q);
@@ -5018,6 +5263,11 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
5018
5263
  LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
5019
5264
  LLAMA_LOG_INFO("%s: rope_yarn_log_mul = %.4f\n", __func__, hparams.rope_yarn_log_mul);
5020
5265
  }
5266
+
5267
+ if (model.arch == LLM_ARCH_QWEN2MOE) {
5268
+ LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
5269
+ LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
5270
+ }
5021
5271
  }
5022
5272
 
5023
5273
  // Returns false if cancelled by progress_callback
@@ -5161,7 +5411,7 @@ static bool llm_load_tensors(
5161
5411
  // create tensors for the weights
5162
5412
  {
5163
5413
  const int64_t n_embd = hparams.n_embd;
5164
- const int64_t n_embd_head = n_embd / hparams.n_head;
5414
+ const int64_t n_embd_head = (hparams.n_head == 0) ? 0 : n_embd / hparams.n_head;
5165
5415
  const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
5166
5416
  const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa();
5167
5417
  const int64_t n_embd_gqa = n_embd_v_gqa;
@@ -5515,7 +5765,7 @@ static bool llm_load_tensors(
5515
5765
 
5516
5766
  layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd});
5517
5767
  } else {
5518
- layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
5768
+ layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
5519
5769
  }
5520
5770
 
5521
5771
  layer.layer_out_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd});
@@ -5556,6 +5806,9 @@ static bool llm_load_tensors(
5556
5806
  layer.attn_out_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}); //output_norm
5557
5807
  layer.attn_out_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT_NORM, "bias", i), {n_embd});
5558
5808
 
5809
+ layer.attn_norm_2 = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
5810
+ layer.attn_norm_2_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
5811
+
5559
5812
  layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
5560
5813
  layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
5561
5814
 
@@ -5801,16 +6054,17 @@ static bool llm_load_tensors(
5801
6054
  GGML_ASSERT(hparams.n_expert_used > 0);
5802
6055
 
5803
6056
  // MoE branch
5804
- auto n_ff_exp = n_ff / hparams.n_expert_used;
6057
+ auto n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / hparams.n_expert_used;
5805
6058
  layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert});
5806
6059
  layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert});
5807
6060
  layer.ffn_up_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert});
5808
6061
 
5809
6062
  // Shared expert branch
6063
+ auto n_ff_shexp = hparams.n_ff_shexp ? hparams.n_ff_shexp : n_ff;
5810
6064
  layer.ffn_gate_inp_shexp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP_SHEXP, "weight", i), {n_embd});
5811
- layer.ffn_gate_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff});
5812
- layer.ffn_down_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff, n_embd});
5813
- layer.ffn_up_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff});
6065
+ layer.ffn_gate_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_shexp});
6066
+ layer.ffn_down_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp, n_embd});
6067
+ layer.ffn_up_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_shexp});
5814
6068
  }
5815
6069
  } break;
5816
6070
  case LLM_ARCH_PHI2:
@@ -6600,16 +6854,6 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
6600
6854
  }
6601
6855
  #endif
6602
6856
 
6603
- #ifdef GGML_USE_SYCL
6604
- if (params.split_mode == LLAMA_SPLIT_MODE_NONE) {
6605
- ggml_backend_sycl_set_single_device_mode(params.main_gpu);
6606
- //SYCL use device index (0, 1, 2) directly, uer input device id, then convert to device index.
6607
- params.main_gpu = ggml_backend_sycl_get_device_index(params.main_gpu);
6608
- } else {
6609
- ggml_backend_sycl_set_mul_device_mode();
6610
- }
6611
- #endif
6612
-
6613
6857
  if (!llm_load_tensors(
6614
6858
  ml, model, params.n_gpu_layers, params.split_mode, params.main_gpu, params.tensor_split, params.use_mlock,
6615
6859
  params.progress_callback, params.progress_callback_user_data
@@ -7410,6 +7654,50 @@ struct llm_build_context {
7410
7654
  return lctx.inp_s_seq;
7411
7655
  }
7412
7656
 
7657
+ struct ggml_cgraph * append_pooling(struct ggml_cgraph * gf) {
7658
+ // find result_norm tensor for input
7659
+ struct ggml_tensor * inp = nullptr;
7660
+ for (int i = gf->n_nodes - 1; i >= 0; --i) {
7661
+ inp = gf->nodes[i];
7662
+ if (strcmp(inp->name, "result_norm") == 0 || strcmp(inp->name, "result_embd") == 0) {
7663
+ break;
7664
+ } else {
7665
+ inp = nullptr;
7666
+ }
7667
+ }
7668
+ GGML_ASSERT(inp != nullptr && "missing result_norm/result_embd tensor");
7669
+
7670
+ struct ggml_tensor * cur;
7671
+
7672
+ switch (pooling_type) {
7673
+ case LLAMA_POOLING_TYPE_MEAN:
7674
+ {
7675
+ struct ggml_tensor * inp_mean = build_inp_mean();
7676
+ cur = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, inp)), inp_mean);
7677
+ } break;
7678
+ case LLAMA_POOLING_TYPE_CLS:
7679
+ case LLAMA_POOLING_TYPE_LAST:
7680
+ {
7681
+ struct ggml_tensor * inp_cls = build_inp_cls();
7682
+ cur = ggml_get_rows(ctx0, inp, inp_cls);
7683
+ } break;
7684
+ case LLAMA_POOLING_TYPE_NONE:
7685
+ {
7686
+ cur = inp;
7687
+ } break;
7688
+ default:
7689
+ {
7690
+ GGML_ASSERT(false && "unknown pooling type");
7691
+ } break;
7692
+ }
7693
+
7694
+ cb(cur, "result_embd_pooled", -1);
7695
+
7696
+ ggml_build_forward_expand(gf, cur);
7697
+
7698
+ return gf;
7699
+ }
7700
+
7413
7701
  struct ggml_cgraph * build_llama() {
7414
7702
  struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
7415
7703
 
@@ -8390,8 +8678,6 @@ struct llm_build_context {
8390
8678
  if (model.arch != LLM_ARCH_JINA_BERT_V2) {
8391
8679
  inp_pos = build_inp_pos();
8392
8680
  }
8393
- struct ggml_tensor * inp_mean = build_inp_mean();
8394
- struct ggml_tensor * inp_cls = build_inp_cls();
8395
8681
 
8396
8682
  // construct input embeddings (token, type, position)
8397
8683
  inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
@@ -8519,6 +8805,11 @@ struct llm_build_context {
8519
8805
  // attention layer norm
8520
8806
  cur = llm_build_norm(ctx0, cur, hparams, model.layers[il].attn_out_norm, model.layers[il].attn_out_norm_b, LLM_NORM, cb, il);
8521
8807
 
8808
+ if (model.layers[il].attn_norm_2 != nullptr) {
8809
+ cur = ggml_add(ctx0, cur, inpL); // re-add the layer input
8810
+ cur = llm_build_norm(ctx0, cur, hparams, model.layers[il].attn_norm_2, model.layers[il].attn_norm_2_b, LLM_NORM, cb, il);
8811
+ }
8812
+
8522
8813
  struct ggml_tensor * ffn_inp = cur;
8523
8814
  cb(ffn_inp, "ffn_inp", il);
8524
8815
 
@@ -8561,28 +8852,6 @@ struct llm_build_context {
8561
8852
  cur = inpL;
8562
8853
  cb(cur, "result_embd", -1);
8563
8854
 
8564
- // pooling layer
8565
- switch (pooling_type) {
8566
- case LLAMA_POOLING_TYPE_NONE:
8567
- {
8568
- // nop
8569
- } break;
8570
- case LLAMA_POOLING_TYPE_MEAN:
8571
- {
8572
- cur = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, cur)), inp_mean);
8573
- cb(cur, "result_embd_pooled", -1);
8574
- } break;
8575
- case LLAMA_POOLING_TYPE_CLS:
8576
- {
8577
- cur = ggml_get_rows(ctx0, cur, inp_cls);
8578
- cb(cur, "result_embd_pooled", -1);
8579
- } break;
8580
- case LLAMA_POOLING_TYPE_UNSPECIFIED:
8581
- {
8582
- GGML_ASSERT(false && "Invalid pooling type");
8583
- } break;
8584
- }
8585
-
8586
8855
  ggml_build_forward_expand(gf, cur);
8587
8856
 
8588
8857
  return gf;
@@ -11520,7 +11789,8 @@ static struct ggml_cgraph * llama_build_graph(
11520
11789
  if (batch.n_tokens < 32 || full_offload) {
11521
11790
  if (il != -1 && strcmp(name, "norm") == 0) {
11522
11791
  for (auto * backend : lctx.backends) {
11523
- if (ggml_backend_buft_supports_backend(lctx.model.buft_layer[il].buft, backend)) {
11792
+ if (ggml_backend_supports_buft(backend, lctx.model.buft_layer[il].buft) &&
11793
+ (ggml_backend_supports_op(backend, cur) || ggml_backend_offload_op(backend, cur))) {
11524
11794
  ggml_backend_sched_set_tensor_backend(lctx.sched, cur, backend);
11525
11795
  break;
11526
11796
  }
@@ -11666,6 +11936,11 @@ static struct ggml_cgraph * llama_build_graph(
11666
11936
  GGML_ASSERT(false);
11667
11937
  }
11668
11938
 
11939
+ // add on pooling layer
11940
+ if (lctx.cparams.embeddings) {
11941
+ result = llm.append_pooling(result);
11942
+ }
11943
+
11669
11944
  llm.free();
11670
11945
 
11671
11946
  return result;
@@ -11755,7 +12030,7 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
11755
12030
  // (!a || b) is a logical implication (a -> b)
11756
12031
  // !hparams.causal_attn -> !cparams.causal_attn
11757
12032
  (hparams.causal_attn || !cparams.causal_attn) &&
11758
- "causal attention with embedding models is not supported"
12033
+ "causal attention is not supported by this model"
11759
12034
  );
11760
12035
 
11761
12036
  if (lctx.inp_KQ_mask) {
@@ -11887,6 +12162,37 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
11887
12162
  }
11888
12163
  }
11889
12164
 
12165
+ if (cparams.pooling_type == LLAMA_POOLING_TYPE_LAST) {
12166
+ const int64_t n_tokens = batch.n_tokens;
12167
+
12168
+ GGML_ASSERT(lctx.inp_cls);
12169
+ GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_cls->buffer));
12170
+
12171
+ uint32_t * data = (uint32_t *) lctx.inp_cls->data;
12172
+ memset(lctx.inp_cls->data, 0, n_tokens * ggml_element_size(lctx.inp_cls));
12173
+
12174
+ std::vector<int> last_pos(n_tokens, -1);
12175
+ std::vector<int> last_row(n_tokens, -1);
12176
+
12177
+ for (int i = 0; i < n_tokens; ++i) {
12178
+ const llama_seq_id seq_id = batch.seq_id[i][0];
12179
+ const llama_pos pos = batch.pos[i];
12180
+
12181
+ GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == LAST");
12182
+
12183
+ if (pos >= last_pos[seq_id]) {
12184
+ last_pos[seq_id] = pos;
12185
+ last_row[seq_id] = i;
12186
+ }
12187
+ }
12188
+
12189
+ for (int i = 0; i < n_tokens; ++i) {
12190
+ if (last_row[i] >= 0) {
12191
+ data[i] = last_row[i];
12192
+ }
12193
+ }
12194
+ }
12195
+
11890
12196
  if (kv_self.recurrent) {
11891
12197
  const int64_t n_kv = kv_self.n;
11892
12198
 
@@ -11948,8 +12254,8 @@ static size_t llama_output_reserve(llama_context & lctx, size_t n_outputs) {
11948
12254
  const auto n_embd = hparams.n_embd;
11949
12255
 
11950
12256
  // TODO: use a per-batch flag for logits presence instead
11951
- const bool has_logits = cparams.causal_attn;
11952
- const bool has_embd = cparams.embeddings && (hparams.causal_attn || cparams.pooling_type == LLAMA_POOLING_TYPE_NONE);
12257
+ const bool has_logits = !cparams.embeddings;
12258
+ const bool has_embd = cparams.embeddings && (cparams.pooling_type == LLAMA_POOLING_TYPE_NONE);
11953
12259
 
11954
12260
  const size_t logits_size = has_logits ? n_vocab*n_outputs_max : 0;
11955
12261
  const size_t embd_size = has_embd ? n_embd*n_outputs_max : 0;
@@ -12017,6 +12323,11 @@ static void llama_graph_compute(
12017
12323
  ggml_backend_cpu_set_n_threads(lctx.backend_cpu, n_threads);
12018
12324
  ggml_backend_cpu_set_abort_callback(lctx.backend_cpu, lctx.abort_callback, lctx.abort_callback_data);
12019
12325
  }
12326
+ #ifdef GGML_USE_BLAS
12327
+ if (lctx.backend_blas != nullptr) {
12328
+ ggml_backend_blas_set_n_threads(lctx.backend_blas, n_threads);
12329
+ }
12330
+ #endif
12020
12331
 
12021
12332
  ggml_backend_sched_graph_compute_async(lctx.sched, gf);
12022
12333
 
@@ -12074,11 +12385,13 @@ static int llama_decode_internal(
12074
12385
  std::vector<std::vector<llama_seq_id>> seq_id;
12075
12386
 
12076
12387
  // count outputs
12077
- if (batch_all.logits) {
12388
+ if (cparams.embeddings && cparams.pooling_type != LLAMA_POOLING_TYPE_NONE) {
12389
+ n_outputs = n_tokens_all;
12390
+ } else if (batch_all.logits) {
12078
12391
  for (uint32_t i = 0; i < n_tokens_all; ++i) {
12079
12392
  n_outputs += batch_all.logits[i] != 0;
12080
12393
  }
12081
- } else if (lctx.logits_all || (cparams.embeddings && cparams.pooling_type != LLAMA_POOLING_TYPE_NONE)) {
12394
+ } else if (lctx.logits_all) {
12082
12395
  n_outputs = n_tokens_all;
12083
12396
  } else {
12084
12397
  // keep last output only
@@ -12209,47 +12522,19 @@ static int llama_decode_internal(
12209
12522
  // no output
12210
12523
  res = nullptr;
12211
12524
  embd = nullptr;
12212
- } else if (!hparams.causal_attn) {
12213
- res = nullptr; // do not extract logits for embedding models such as BERT
12214
-
12215
- // token or sequence embeddings
12216
- embd = gf->nodes[gf->n_nodes - 1];
12217
-
12218
- GGML_ASSERT(strcmp(embd->name, "result_embd") == 0 || strcmp(embd->name, "result_embd_pooled") == 0);
12219
12525
  } else if (cparams.embeddings) {
12220
- // the embeddings could be in the second to last tensor, or any of the previous tensors
12221
- int i_embd = gf->n_nodes - 2;
12222
- for (int i = 3; strcmp(embd->name, "result_norm") != 0; ++i) {
12223
- i_embd = gf->n_nodes - i;
12224
- if (i_embd < 0) { break; }
12225
- embd = gf->nodes[i_embd];
12226
- }
12227
- GGML_ASSERT(i_embd >= 0 && "missing result_norm tensor");
12228
-
12229
- // TODO: use a per-batch flag to know when to skip logits while keeping embeddings
12230
- if (!cparams.causal_attn) {
12231
- res = nullptr; // do not extract logits when not needed
12232
- // skip computing logits
12233
- // TODO: is this safe?
12234
- gf->n_nodes = i_embd + 1;
12526
+ res = nullptr; // do not extract logits for embedding case
12527
+ embd = gf->nodes[gf->n_nodes - 1];
12528
+ if (strcmp(embd->name, "result_embd_pooled") != 0) {
12529
+ embd = gf->nodes[gf->n_nodes - 2];
12235
12530
  }
12531
+ GGML_ASSERT(strcmp(embd->name, "result_embd_pooled") == 0 && "missing embeddings tensor");
12236
12532
  } else {
12237
12533
  embd = nullptr; // do not extract embeddings when not needed
12238
12534
  GGML_ASSERT(strcmp(res->name, "result_output") == 0 && "missing result_output tensor");
12239
12535
  }
12240
12536
  // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);
12241
12537
 
12242
- // for big prompts, if BLAS is enabled, it is better to use only one thread
12243
- // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
12244
- // TODO: this is mostly important for Apple Silicon where CBLAS is still performing very well
12245
- // we still need some threads to process all non-mul_mat ops, but not too much to avoid interfering
12246
- // with the BLAS calls. need a better solution
12247
- // MoE Special Case: This logic applies when hparams.n_expert == 0, i.e. the model is NOT an MoE model. When an MoE is
12248
- // being processed then Accelerate/BLAS will not be involved, so capping would limit performance.
12249
- if (n_tokens >= 32 && hparams.n_expert == 0 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas()) {
12250
- n_threads = std::min(4, n_threads);
12251
- }
12252
-
12253
12538
  ggml_backend_sched_alloc_graph(lctx.sched, gf);
12254
12539
 
12255
12540
  llama_set_inputs(lctx, u_batch);
@@ -12312,11 +12597,10 @@ static int llama_decode_internal(
12312
12597
  ggml_backend_tensor_get_async(backend_embd, embd, embd_out, 0, n_outputs_new*n_embd*sizeof(float));
12313
12598
  }
12314
12599
  } break;
12315
- case LLAMA_POOLING_TYPE_CLS:
12316
12600
  case LLAMA_POOLING_TYPE_MEAN:
12601
+ case LLAMA_POOLING_TYPE_CLS:
12602
+ case LLAMA_POOLING_TYPE_LAST:
12317
12603
  {
12318
- GGML_ASSERT(strcmp(embd->name, "result_embd_pooled") == 0);
12319
-
12320
12604
  // extract sequence embeddings
12321
12605
  auto & embd_seq_out = lctx.embd_seq;
12322
12606
  embd_seq_out.clear();
@@ -12930,107 +13214,142 @@ struct llm_bigram_bpe {
  };

  struct llm_tokenizer_bpe {
- llm_tokenizer_bpe(const llama_vocab & vocab): vocab(vocab) {}
-
- void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
- int final_prev_index = -1;
- bool ignore_merges = false;
-
- std::vector<std::string> word_collection;
- switch (vocab.type) {
- case LLAMA_VOCAB_TYPE_BPE:
- switch (vocab.type_pre) {
- case LLAMA_VOCAB_PRE_TYPE_LLAMA3:
- ignore_merges = true;
- word_collection = unicode_regex_split(text, {
- // original regex from tokenizer.json
- //"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
-
- // adapted: https://github.com/ggerganov/llama.cpp/pull/6920#issuecomment-2080233989
- "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
- });
- break;
- case LLAMA_VOCAB_PRE_TYPE_DBRX:
- case LLAMA_VOCAB_PRE_TYPE_SMAUG:
- word_collection = unicode_regex_split(text, {
- // same as llama3
- "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
- });
- break;
- case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM:
- word_collection = unicode_regex_split(text, {
- "[\r\n]",
- "\\s?[A-Za-zµÀ-ÖØ-öø-ƺƼ-ƿDŽ-ʓʕ-ʯͰ-ͳͶͷͻ-ͽͿΆΈ-ΊΌΎ-ΡΣ-ϵϷ-ҁҊ-ԯԱ-ՖႠ-ჅᎠ-Ᏽᏸ-ᏽᲐ-ᲺᲽ-Ჿᴀ-ᴫᵫ-ᵷᵹ-ᶚḀ-ἕἘ-Ἕἠ-ὅὈ-Ὅὐ-ὗὙὛὝὟ-ώᾀ-ᾴᾶ-ᾼιῂ-ῄῆ-ῌῐ-ΐῖ-Ίῠ-Ῥῲ-ῴῶ-ῼℂℇℊ-ℓℕℙ-ℝℤΩℨK-ℭℯ-ℴℹℼ-ℿⅅ-ⅉⅎↃↄⰀ-ⱻⱾ-ⳤⳫ-ⳮⳲⳳꙀ-ꙭꚀ-ꚛꜢ-ꝯꝱ-ꞇꞋ-ꞎꭰ-ꮿff-stﬓ-ﬗA-Za-z𐐀-𐑏𐒰-𐓓𐓘-𐓻𐲀-𐲲𐳀-𐳲𑢠-𑣟𞤀-𞥃]+",
- "\\s?[!-/:-~!-/:-~‘-‟ -。]+",
- "\\s+$",
- "[一-龥ࠀ-一가-퟿]+",
- "\\p{N}+",
- });
- break;
- case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER:
- word_collection = unicode_regex_split(text, {
- "[\r\n]",
- "\\s?\\p{L}+",
- "\\s?\\p{P}+",
- "[一-龥ࠀ-一가-퟿]+",
- "\\p{N}",
- });
- break;
- case LLAMA_VOCAB_PRE_TYPE_FALCON:
- word_collection = unicode_regex_split(text, {
- "[\\p{P}\\$\\+<=>\\^~\\|]+",
- "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
- "[0-9][0-9][0-9]",
- });
- break;
- case LLAMA_VOCAB_PRE_TYPE_MPT:
- // TODO: MPT pre-tokenization regexes are unknown
- // the following are close, but not exact. run the following:
- // ./bin/test-tokenizer-0 ../models/ggml-vocab-mpt.gguf
- GGML_ASSERT("MPT pre-tokenization regexes are unknown - fixes needed");
- word_collection = unicode_regex_split(text, {
- "\\s?\\p{L}+",
- "\\s?\\p{P}+",
- "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
- });
- break;
- case LLAMA_VOCAB_PRE_TYPE_STARCODER:
- case LLAMA_VOCAB_PRE_TYPE_REFACT:
- case LLAMA_VOCAB_PRE_TYPE_COMMAND_R:
- word_collection = unicode_regex_split(text, {
- "\\p{N}",
- "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
- });
- break;
- case LLAMA_VOCAB_PRE_TYPE_GPT2:
- case LLAMA_VOCAB_PRE_TYPE_OLMO:
- word_collection = unicode_regex_split(text, {
- "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
- });
- break;
- case LLAMA_VOCAB_PRE_TYPE_STABLELM2:
- case LLAMA_VOCAB_PRE_TYPE_QWEN2:
- word_collection = unicode_regex_split(text, {
- // original regex from tokenizer.json
- // "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
- "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
- });
- break;
- default:
- // default regex for BPE tokenization pre-processing
- word_collection = unicode_regex_split(text, {
- "[\\p{P}\\$\\+<=>\\^~\\|]+",
- "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
- "\\p{N}+",
- "[0-9][0-9][0-9]",
- });
- break;
- }
+ llm_tokenizer_bpe(const llama_vocab & vocab): vocab(vocab) {
+ GGML_ASSERT(vocab.type == LLAMA_VOCAB_TYPE_BPE);
+ switch (vocab.type_pre) {
+ case LLAMA_VOCAB_PRE_TYPE_LLAMA3:
+ regex_exprs = {
+ // original regex from tokenizer.json
+ //"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
+
+ // adapted: https://github.com/ggerganov/llama.cpp/pull/6920#issuecomment-2080233989
+ "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
+ };
+ break;
+ case LLAMA_VOCAB_PRE_TYPE_DBRX:
+ case LLAMA_VOCAB_PRE_TYPE_SMAUG:
+ regex_exprs = {
+ // same as llama3
+ "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
+ };
+ break;
+ case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM:
+ regex_exprs = {
+ "[\r\n]",
+ "\\s?[A-Za-zµÀ-ÖØ-öø-ƺƼ-ƿDŽ-ʓʕ-ʯͰ-ͳͶͷͻ-ͽͿΆΈ-ΊΌΎ-ΡΣ-ϵϷ-ҁҊ-ԯԱ-ՖႠ-ჅᎠ-Ᏽᏸ-ᏽᲐ-ᲺᲽ-Ჿᴀ-ᴫᵫ-ᵷᵹ-ᶚḀ-ἕἘ-Ἕἠ-ὅὈ-Ὅὐ-ὗὙὛὝὟ-ώᾀ-ᾴᾶ-ᾼιῂ-ῄῆ-ῌῐ-ΐῖ-Ίῠ-Ῥῲ-ῴῶ-ῼℂℇℊ-ℓℕℙ-ℝℤΩℨK-ℭℯ-ℴℹℼ-ℿⅅ-ⅉⅎↃↄⰀ-ⱻⱾ-ⳤⳫ-ⳮⳲⳳꙀ-ꙭꚀ-ꚛꜢ-ꝯꝱ-ꞇꞋ-ꞎꭰ-ꮿff-stﬓ-ﬗA-Za-z𐐀-𐑏𐒰-𐓓𐓘-𐓻𐲀-𐲲𐳀-𐳲𑢠-𑣟𞤀-𞥃]+",
+ "\\s?[!-/:-~!-/:-~‘-‟ -。]+",
+ "\\s+$",
+ "[一-龥ࠀ-一가-퟿]+",
+ "\\p{N}+",
+ };
+ break;
+ case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER:
+ regex_exprs = {
+ "[\r\n]",
+ "\\s?\\p{L}+",
+ "\\s?\\p{P}+",
+ "[一-龥ࠀ-一가-퟿]+",
+ "\\p{N}",
+ };
+ break;
+ case LLAMA_VOCAB_PRE_TYPE_FALCON:
+ regex_exprs = {
+ "[\\p{P}\\$\\+<=>\\^~\\|`]+",
+ "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
+ "[0-9][0-9][0-9]",
+ };
+ break;
+ case LLAMA_VOCAB_PRE_TYPE_MPT:
+ // TODO: MPT pre-tokenization regexes are unknown
+ // the following are close, but not exact. run the following:
+ // ./bin/test-tokenizer-0 ../models/ggml-vocab-mpt.gguf
+ GGML_ASSERT("MPT pre-tokenization regexes are unknown - fixes needed");
+ regex_exprs = {
+ "\\s?\\p{L}+",
+ "\\s?\\p{P}+",
+ "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
+ };
+ break;
+ case LLAMA_VOCAB_PRE_TYPE_STARCODER:
+ case LLAMA_VOCAB_PRE_TYPE_REFACT:
+ case LLAMA_VOCAB_PRE_TYPE_COMMAND_R:
+ regex_exprs = {
+ "\\p{N}",
+ "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
+ };
+ break;
+ case LLAMA_VOCAB_PRE_TYPE_GPT2:
+ case LLAMA_VOCAB_PRE_TYPE_OLMO:
+ regex_exprs = {
+ "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
+ };
+ break;
+ case LLAMA_VOCAB_PRE_TYPE_STABLELM2:
+ case LLAMA_VOCAB_PRE_TYPE_QWEN2:
+ regex_exprs = {
+ // original regex from tokenizer.json
+ // "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
+ "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
+ };
+ break;
+ case LLAMA_VOCAB_PRE_TYPE_PORO:
+ regex_exprs = {
+ " ?[^(\\s|.,!?…。,、।۔،)]+",
+ };
  break;
  default:
- GGML_ASSERT(false);
+ // default regex for BPE tokenization pre-processing
+ regex_exprs = {
+ "[\\p{P}\\$\\+<=>\\^~\\|]+",
+ "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
+ "\\p{N}+",
+ "[0-9][0-9][0-9]",
+ };
  break;
  }
+ }
+
+ void append(const llama_vocab::id token_id, std::vector<llama_vocab::id> & output) const {
+ output.push_back(token_id);
+ }
+
+ bool append_bos(std::vector<llama_vocab::id> & output) const {
+ if (vocab.tokenizer_add_bos) {
+ GGML_ASSERT(vocab.special_bos_id != -1);
+ output.push_back(vocab.special_bos_id);
+ return true;
+ }
+ return false;
+ }
+
+ bool append_eos(std::vector<llama_vocab::id> & output) const {
+ if (vocab.tokenizer_add_eos) {
+ GGML_ASSERT(vocab.special_eos_id != -1);
+ output.push_back(vocab.special_eos_id);
+ return true;
+ }
+ return false;
+ }
+
+ void check_double_bos_eos(const std::vector<llama_vocab::id> & output) const {
+ if (vocab.tokenizer_add_bos && output.size() >= 2 && output[1] == vocab.special_bos_id) {
+ LLAMA_LOG_WARN(
+ "%s: Added a BOS token to the prompt as specified by the model but the prompt "
+ "also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. "
+ "Are you sure this is what you want?\n", __FUNCTION__);
+ }
+ if (vocab.tokenizer_add_eos && output.size() >= 2 && *(output.end()-2) == vocab.special_eos_id) {
+ LLAMA_LOG_WARN(
+ "%s: Added a EOS token to the prompt as specified by the model but the prompt "
+ "also ends with a EOS token. So now the final prompt ends with 2 EOS tokens. "
+ "Are you sure this is what you want?\n", __FUNCTION__);
+ }
+ }
+
+ void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
+ int final_prev_index = -1;
+
+ const auto word_collection = unicode_regex_split(text, regex_exprs);

  symbols_final.clear();

@@ -13041,7 +13360,7 @@ struct llm_tokenizer_bpe {
  int index = 0;
  size_t offset = 0;

- if (ignore_merges && vocab.token_to_id.find(word) != vocab.token_to_id.end()) {
+ if (vocab.tokenizer_ignore_merges && vocab.token_to_id.find(word) != vocab.token_to_id.end()) {
  symbols.emplace_back(llm_symbol{-1, -1, word.c_str(), word.size()});
  offset = word.size();
  }
@@ -13122,10 +13441,9 @@ struct llm_tokenizer_bpe {
  for (auto j = str.begin(); j != str.end(); ++j) {
  std::string byte_str(1, *j);
  auto token_multibyte = vocab.token_to_id.find(byte_str);
- if (token_multibyte == vocab.token_to_id.end()) {
- throw std::runtime_error("ERROR: byte not found in vocab");
+ if (token_multibyte != vocab.token_to_id.end()) {
+ output.push_back(token_multibyte->second);
  }
- output.push_back((*token_multibyte).second);
  }
  } else {
  output.push_back((*token).second);
@@ -13164,6 +13482,8 @@ private:

  const llama_vocab & vocab;

+ std::vector<std::string> regex_exprs;
+
  std::vector<llm_symbol> symbols;
  std::vector<llm_symbol> symbols_final;
 
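With the pre-tokenizer regexes now stored in the per-vocab regex_exprs member above, the tokenizer object is built once per tokenization call and driven through the helpers added in this hunk (append_bos, tokenize, append, append_eos, check_double_bos_eos). A condensed sketch of that driving loop, mirroring the llama_tokenize_internal hunk further down (internal API, shown for orientation only):

    llm_tokenizer_bpe tokenizer(vocab);          // selects regex_exprs from vocab.type_pre
    if (add_special) {
        tokenizer.append_bos(output);
    }
    for (const auto & fragment : fragment_buffer) {
        if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
            tokenizer.tokenize(raw_text, output);     // raw_text taken from the fragment
        } else {
            tokenizer.append(fragment.token, output); // pre-resolved special token
        }
    }
    if (add_special) {
        tokenizer.append_eos(output);
        tokenizer.check_double_bos_eos(output);
    }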
@@ -13173,7 +13493,7 @@ private:
  struct llm_tokenizer_wpm {
  llm_tokenizer_wpm(const llama_vocab & vocab): vocab(vocab) {}

- void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
+ void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) const {
  const auto & token_map = vocab.token_to_id;

  // normalize and split by whitespace
@@ -13182,7 +13502,7 @@ struct llm_tokenizer_wpm {
  // bos token prepended already

  // find the longest tokens that form the words
- for (const std::string &word : words) {
+ for (const std::string & word : words) {
  // skip empty words
  if (word.size() == 0) {
  continue;
@@ -13199,7 +13519,7 @@ struct llm_tokenizer_wpm {
  for (int i = 0; i < n; ++i) {
  // loop through possible match length
  bool match = false;
- for (int j = n; j > i; j--) {
+ for (int j = std::min(n, i + vocab.max_token_len + 1); j > i; j--) {
  auto it = token_map.find(word1.substr(i, j - i));
  if (it != token_map.end()) {
  output.push_back(it->second);
@@ -13222,11 +13542,12 @@ struct llm_tokenizer_wpm {
  }
  }

- std::vector<std::string> preprocess(const std::string & text) {
+ // TODO: reduce string copies by using cpts_offs array
+ std::vector<std::string> preprocess(const std::string & text) const {
  const std::vector<uint32_t> cpts_nfd = unicode_cpts_normalize_nfd(unicode_cpts_from_utf8(text));
  std::vector<std::string> words(1, "");

- for (const char32_t cpt : cpts_nfd) {
+ for (const uint32_t cpt : cpts_nfd) {
  const auto flags = unicode_cpt_flags(cpt);

  if (flags.is_whitespace) {
@@ -13444,7 +13765,7 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &

  bool is_prev_special = false;

- if (add_special && vocab.special_add_bos != 0) {
+ if (add_special && vocab.tokenizer_add_bos) {
  GGML_ASSERT(vocab.special_bos_id != -1);
  output.push_back(vocab.special_bos_id);
  is_prev_special = true;
@@ -13454,7 +13775,7 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
  if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
  auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);

- if (vocab.add_space_prefix) {
+ if (vocab.tokenizer_add_space_prefix) {
  if (!output.size() || is_prev_special) { // prefix with space if first token
  raw_text = " " + raw_text;
  }
@@ -13472,23 +13793,24 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
  }
  }

- if (add_special && vocab.special_add_bos != 0 && output.size() >= 2 && output[1] == vocab.special_bos_id) {
+ if (add_special && vocab.tokenizer_add_bos && output.size() >= 2 && output[1] == vocab.special_bos_id) {
  LLAMA_LOG_WARN(
  "%s: Added a BOS token to the prompt as specified by the model but the prompt "
  "also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. "
  "Are you sure this is what you want?\n", __FUNCTION__);
  }

- if (add_special && vocab.special_add_eos == 1) {
+ if (add_special && vocab.tokenizer_add_eos) {
  GGML_ASSERT(vocab.special_eos_id != -1);
  output.push_back(vocab.special_eos_id);
  }
  } break;
  case LLAMA_VOCAB_TYPE_BPE:
  {
- if (add_special && vocab.special_add_bos != 0) {
- GGML_ASSERT(vocab.special_bos_id != -1);
- output.push_back(vocab.special_bos_id);
+ llm_tokenizer_bpe tokenizer(vocab);
+
+ if (add_special) {
+ tokenizer.append_bos(output);
  }

  for (const auto & fragment : fragment_buffer) {
@@ -13498,23 +13820,15 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
  #ifdef PRETOKENIZERDEBUG
  LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str());
  #endif
- llm_tokenizer_bpe tokenizer(vocab);
  tokenizer.tokenize(raw_text, output);
  } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
- output.push_back(fragment.token);
+ tokenizer.append(fragment.token, output);
  }
  }

- if (add_special && vocab.special_add_bos != 0 && output.size() >= 2 && output[1] == vocab.special_bos_id) {
- LLAMA_LOG_WARN(
- "%s: Added a BOS token to the prompt as specified by the model but the prompt "
- "also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. "
- "Are you sure this is what you want?\n", __FUNCTION__);
- }
-
- if (add_special && vocab.special_add_eos == 1) {
- GGML_ASSERT(vocab.special_add_eos != -1);
- output.push_back(vocab.special_eos_id);
+ if (add_special) {
+ tokenizer.append_eos(output);
+ tokenizer.check_double_bos_eos(output);
  }
  } break;
  case LLAMA_VOCAB_TYPE_WPM:
@@ -13524,6 +13838,8 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
  output.push_back(vocab.special_cls_id);
  }

+ llm_tokenizer_wpm tokenizer(vocab);
+
  for (const auto & fragment : fragment_buffer) {
  if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
  auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
@@ -13531,7 +13847,6 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
  #ifdef PRETOKENIZERDEBUG
  LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str());
  #endif
- llm_tokenizer_wpm tokenizer(vocab);
  tokenizer.tokenize(raw_text, output);
  } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
  output.push_back(fragment.token);
@@ -13631,7 +13946,7 @@ static std::pair<bool, const llama_grammar_element *> llama_grammar_match_char(
  const uint32_t chr) {

  bool found = false;
- bool is_positive_char = pos->type == LLAMA_GRETYPE_CHAR;
+ bool is_positive_char = pos->type == LLAMA_GRETYPE_CHAR || pos->type == LLAMA_GRETYPE_CHAR_ANY;

  GGML_ASSERT(is_positive_char || pos->type == LLAMA_GRETYPE_CHAR_NOT); // NOLINT

@@ -13640,6 +13955,10 @@ static std::pair<bool, const llama_grammar_element *> llama_grammar_match_char(
  // inclusive range, e.g. [a-z]
  found = found || (pos->value <= chr && chr <= pos[1].value);
  pos += 2;
+ } else if (pos->type == LLAMA_GRETYPE_CHAR_ANY) {
+ // Any character matches "."
+ found = true;
+ pos += 1;
  } else {
  // exact char match, e.g. [a] or "a"
  found = found || pos->value == chr;
@@ -13657,7 +13976,7 @@ static bool llama_grammar_match_partial_char(
  const llama_grammar_element * pos,
  const llama_partial_utf8 partial_utf8) {

- bool is_positive_char = pos->type == LLAMA_GRETYPE_CHAR;
+ bool is_positive_char = pos->type == LLAMA_GRETYPE_CHAR || pos->type == LLAMA_GRETYPE_CHAR_ANY;
  GGML_ASSERT(is_positive_char || pos->type == LLAMA_GRETYPE_CHAR_NOT);

  uint32_t partial_value = partial_utf8.value;
@@ -13687,6 +14006,9 @@ static bool llama_grammar_match_partial_char(
  return is_positive_char;
  }
  pos += 2;
+ } else if (pos->type == LLAMA_GRETYPE_CHAR_ANY) {
+ // Any character matches "."
+ return true;
  } else {
  // exact char match, e.g. [a] or "a"
  if (low <= pos->value && pos->value <= high) {
@@ -13747,6 +14069,7 @@ static void llama_grammar_advance_stack(
  }
  case LLAMA_GRETYPE_CHAR:
  case LLAMA_GRETYPE_CHAR_NOT:
+ case LLAMA_GRETYPE_CHAR_ANY:
  if (std::find(new_stacks.begin(), new_stacks.end(), stack) == new_stacks.end()) {
  // only add the stack if it's not a duplicate of one we already have
  new_stacks.emplace_back(stack);
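The new LLAMA_GRETYPE_CHAR_ANY element gives grammars a single-character wildcard (the GBNF "." element): it matches any character in llama_grammar_match_char and llama_grammar_match_partial_char, and is handled like the other char elements when advancing a stack. An illustrative sketch of a hand-built rule equivalent to root ::= "a" . "c", assuming the llama_grammar_element / llama_grammar_init API declared in llama.h:

    const llama_grammar_element rule_root[] = {
        { LLAMA_GRETYPE_CHAR,     'a' },
        { LLAMA_GRETYPE_CHAR_ANY,  0  },  // "." - matches any single character
        { LLAMA_GRETYPE_CHAR,     'c' },
        { LLAMA_GRETYPE_END,       0  },
    };
    const llama_grammar_element * rules[] = { rule_root };
    struct llama_grammar * grammar = llama_grammar_init(rules, 1, 0); // 1 rule, start at rule 0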
@@ -15220,6 +15543,14 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
  if (imatrix_data) {
  LLAMA_LOG_INFO("================================ Have weights data with %d entries\n",int(imatrix_data->size()));
  qs.has_imatrix = true;
+ // check imatrix for nans or infs
+ for (const auto & kv : *imatrix_data) {
+ for (float f : kv.second) {
+ if (!std::isfinite(f)) {
+ throw std::runtime_error(format("imatrix contains non-finite value %f\n", f));
+ }
+ }
+ }
  }
  }

@@ -16024,6 +16355,11 @@ struct llama_context * llama_new_context_with_model(
  params.flash_attn = false;
  }

+ if (params.flash_attn && model->hparams.n_embd_head_k != model->hparams.n_embd_head_v) {
+ LLAMA_LOG_WARN("%s: flash_attn requires n_embd_head_k == n_embd_head_v - forcing off\n", __func__);
+ params.flash_attn = false;
+ }
+
  if (params.type_v != GGML_TYPE_F16 && !params.flash_attn) {
  LLAMA_LOG_ERROR("%s: V cache quantization requires flash_attn\n", __func__);
  return nullptr;
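Taken together with the existing check right below it, the new guard constrains how a context can be configured: a quantized V cache is only accepted with flash attention, and flash attention is now silently disabled when the K and V head sizes differ. A caller-side sketch of a configuration that satisfies both checks (illustrative only; the fields are the public llama_context_params members, and model/ctx setup is assumed elsewhere):

    llama_context_params cparams = llama_context_default_params();
    cparams.flash_attn = true;            // required for a quantized V cache
    cparams.type_k     = GGML_TYPE_Q8_0;  // quantized K cache
    cparams.type_v     = GGML_TYPE_Q8_0;  // rejected if flash_attn ends up disabled
    llama_context * ctx = llama_new_context_with_model(model, cparams);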
@@ -16195,8 +16531,7 @@ struct llama_context * llama_new_context_with_model(
  if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
  ggml_backend_t backend = ggml_backend_sycl_init(model->main_gpu);
  if (backend == nullptr) {
- int main_gpu_id = ggml_backend_sycl_get_device_id(model->main_gpu);
- LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d (index %d) backend\n", __func__, main_gpu_id, model->main_gpu);
+ LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d backend\n", __func__, model->main_gpu);
  llama_free(ctx);
  return nullptr;
  }
@@ -16226,6 +16561,16 @@ struct llama_context * llama_new_context_with_model(
  ctx->backends.push_back(backend);
  }
  #endif
+
+ #ifdef GGML_USE_BLAS
+ ctx->backend_blas = ggml_backend_blas_init();
+ if (ctx->backend_blas == nullptr) {
+ LLAMA_LOG_WARN("%s: failed to initialize BLAS backend\n", __func__);
+ } else {
+ ctx->backends.push_back(ctx->backend_blas);
+ }
+ #endif
+
  #if defined(GGML_USE_RPC)
  if (model->n_gpu_layers > 0) {
  for (const auto & endpoint : model->rpc_servers) {
@@ -17814,6 +18159,10 @@ void llama_set_abort_callback(struct llama_context * ctx, bool (*abort_callback)
  ctx->abort_callback_data = abort_callback_data;
  }

+ void llama_set_embeddings(struct llama_context * ctx, bool embeddings) {
+ ctx->cparams.embeddings = embeddings;
+ }
+
  void llama_set_causal_attn(struct llama_context * ctx, bool causal_attn) {
  ctx->cparams.causal_attn = causal_attn;
  }
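llama_set_embeddings lets a caller toggle embedding extraction on an existing context instead of creating a separate one. A minimal usage sketch (illustrative only; llama_decode and llama_get_embeddings_seq are the usual llama.cpp calls and are not part of this hunk):

    llama_set_embeddings(ctx, true);       // subsequent decodes also produce embeddings
    // ... llama_decode(ctx, batch) with sequence 0 ...
    const float * emb = llama_get_embeddings_seq(ctx, 0);
    llama_set_embeddings(ctx, false);      // back to plain generation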
@@ -18057,11 +18406,11 @@ llama_token llama_token_nl(const struct llama_model * model) {
  }

  int32_t llama_add_bos_token(const struct llama_model * model) {
- return model->vocab.special_add_bos;
+ return model->vocab.tokenizer_add_bos;
  }

  int32_t llama_add_eos_token(const struct llama_model * model) {
- return model->vocab.special_add_eos;
+ return model->vocab.tokenizer_add_eos;
  }

  llama_token llama_token_prefix(const struct llama_model * model) {
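llama_add_bos_token and llama_add_eos_token now report the renamed tokenizer_add_bos / tokenizer_add_eos vocab flags. A caller that assembles prompts by hand can consult them before adding special tokens itself (illustrative sketch; llama_token_bos is the usual accessor and not part of this hunk):

    std::vector<llama_token> tokens;
    if (llama_add_bos_token(model) != 0) {   // the vocab asks for a leading BOS
        tokens.push_back(llama_token_bos(model));
    }
    // ... append the tokenized prompt ...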